setup.py fix for python2, README.md fix, Mac osx build failure, small…

… fasttext.py fix Summary: See title. Reviewed By: EdouardGrave Differential Revision: D6220933 fbshipit-source-id: 1bc95ea0c3751c9fce6da6ec0f151eb79aadb3a2
jattenberg · Nov 2, 2017 · f4bced0 · f4bced0
1 parent f10ec1f
commit f4bced0
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 10 deletions.
diff --git a/python/README.md b/python/README.md
@@ -91,8 +91,8 @@ for w in words:
 Training a model is easy. For example
 
 ```
-from fastTextpy import train_supervised
-from fastTextpy import train_unsupervised
+from fastText import train_supervised
+from fastText import train_unsupervised
 
 model_unsup = train_unsupervised(
     input=<data>,
@@ -116,9 +116,20 @@ To get extended help on these functions use the python help functions.
 For example
 
 ```
-Help on function train_unsupervised in module fastTextpy.FastText:
+Help on function train_unsupervised in module fastText.FastText:
 
-train_unsupervised(input, output=u'model', model=model_name.skipgram, lr=0.05, dim=100, ws=5, epoch=5, minCount=5, minCountLabel=0, minn=3, maxn=6, neg=5, wordNgrams=1, loss=loss_name.ns, bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label=u'__label__', verbose=2, pretrainedVectors=u'', saveOutput=0)
+train_unsupervised(input, model=u'skipgram', lr=0.05, dim=100, ws=5, epoch=5, minCount=5, minCountLabel=0, minn=3, maxn=6, neg=5, wordNgrams=1, loss=u'ns', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label=u'__label__', verbose=2, pretrainedVectors=u'', saveOutput=0)
+    Train an unsupervised model and return a model object.
+
+    input must be a filepath. The input text does not need to be tokenized
+    as per the tokenize function, but it must be preprocessed and encoded
+    as UTF-8. You might want to consult standard preprocessing scripts such
+    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
+
+    The input file must not contain any labels or use the specified label prefix
+    unless it is ok for those words to be ignored. For an example consult the
+    dataset pulled by the example script word-vector-example.sh, which is
+    part of the fastText repository.
 ```
 
 ## Processing data
@@ -127,14 +138,14 @@ You can tokenize using the fastText Dictionary method readWord.
 
 This will give you a list of tokens split on the same whitespace characters that fastText splits on.
 
-It will also add the EOS character as necessary, which is exposed via fastTextpy.EOS
+It will also add the EOS character as necessary, which is exposed via fastText.EOS
 
 The resulting text is then stored entirely in memory.
 
 For example:
 
 ```
-from fastTextpy import tokenize
+from fastText import tokenize
 with open(<PATH>, 'r') as f:
     tokens = tokenize(f.read())
 ```
diff --git a/python/fastText/FastText.py b/python/fastText/FastText.py
@@ -63,7 +63,7 @@ def get_sentence_vector(self, text):
                 "predict processes one line at a time (remove \'\\n\')"
             )
         text += "\n"
-        dim = self.f.get_dimension()
+        dim = self.get_dimension()
         b = fasttext.Vector(dim)
         self.f.getSentenceVector(b, text)
         return np.array(b)

diff --git a/python/fastText/pybind/fasttext_pybind.cc b/python/fastText/pybind/fasttext_pybind.cc
@@ -92,7 +92,7 @@ PYBIND11_MODULE(fasttext_pybind, m) {
             py::format_descriptor<fasttext::real>::format(),
             2,
             {m.m_, m.n_},
-            {sizeof(fasttext::real) * m.n_, sizeof(fasttext::real)});
+            {sizeof(fasttext::real) * m.n_, sizeof(fasttext::real) * (int64_t)1});
       });
 
   py::class_<fasttext::FastText>(m, "fasttext")

diff --git a/python/setup.py b/python/setup.py
@@ -41,7 +41,7 @@ def __str__(self):
 fasttext_src_cc = list(filter(lambda x: x.endswith('.cc'), fasttext_src_files))
 
 fasttext_src_cc = list(
-    map(lambda x: os.path.join(FASTTEXT_SRC, x), fasttext_src_cc)
+    map(lambda x: str(os.path.join(FASTTEXT_SRC, x)), fasttext_src_cc)
 )
 
 ext_modules = [
@@ -133,6 +133,6 @@ def build_extensions(self):
     license='BSD',
     install_requires=['pybind11>=2.2'],
     cmdclass={'build_ext': BuildExt},
-    packages=['fastText'],
+    packages=[str('fastText')],
     zip_safe=False
 )