Skip to content

Commit

Permalink
Merge pull request explosion#11 from elyase/master
Browse files Browse the repository at this point in the history
Add gensim to sense2vec format convert script
  • Loading branch information
honnibal committed Mar 29, 2016
2 parents 1e753c7 + 0186d26 commit 8c27151
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 1 deletion.
30 changes: 30 additions & 0 deletions bin/gensim2sense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from sense2vec.vectors import VectorMap
from gensim.models import Word2Vec
import plac

@plac.annotations(
    gensim_model_path=("Location of gensim's .bin file"),
    out_dir=("Location of output directory"),
    min_count=("Min count", "option", "m", int),
)
def main(gensim_model_path, out_dir, min_count=None):
    """Convert a gensim.models.Word2Vec file to VectorMap format.

    Args:
        gensim_model_path: Path handed to ``Word2Vec.load``.
        out_dir: Directory handed to ``VectorMap.save``.
        min_count: Entries whose corpus count is below this value are
            skipped. When ``None``, the model's own ``min_count`` is used.
    """
    gensim_model = Word2Vec.load(gensim_model_path)

    # Size the map from the model's embedding matrix rather than assuming
    # 128 dimensions, so models trained with any vector size convert
    # correctly.
    vectors = gensim_model.syn0
    vector_map = VectorMap(vectors.shape[1])

    if min_count is None:
        min_count = gensim_model.min_count

    for string, entry in gensim_model.vocab.items():
        freq = entry.count
        if freq < min_count:
            continue
        # NOTE(review): `borrow` presumably keeps a reference to the row
        # instead of copying it, so `gensim_model` must stay alive until
        # `save` has run -- confirm against sense2vec.vectors.
        vector_map.borrow(string, freq, vectors[entry.index])

    vector_map.save(out_dir)

# Script entry point: let plac build the command-line interface from the
# annotations on `main` and invoke it.
if __name__ == "__main__":
    plac.call(main)
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ numpy
 ujson>=1.34
 spacy>=0.100,<0.101
 preshed>=0.46,<0.47
-murmurhash>=0.26,<0.27
+murmurhash==0.26.1
 cymem>=1.30,<1.32
 sputnik>=0.9.0,<0.10.0
 pytest
+joblib
+toolz
+gensim

0 comments on commit 8c27151

Please sign in to comment.