
Commit

now importable in Python 3.3 (though most tests fail in 3.x)
larsmans committed Feb 11, 2014
1 parent 21099e3 commit 2f20816
Showing 31 changed files with 114 additions and 89 deletions.
5 changes: 2 additions & 3 deletions README.rst
@@ -10,8 +10,6 @@ gensim -- Python Framework for Topic Modelling
Gensim is a Python library for *topic modelling*, *document indexing* and *similarity retrieval* with large corpora.
Target audience is the *natural language processing* (NLP) and *information retrieval* (IR) community.

For a Python3 port of gensim by Parikshit Samant, visit `this fork <https://github.com/samantp/gensimPy3>`_.

Features
---------

@@ -53,7 +51,8 @@ you'll need to run::
For alternative modes of installation (without root privileges, development
installation, optional install features), see the `documentation <http://radimrehurek.com/gensim/install.html>`_.

This version has been tested under Python 2.5, 2.6 and 2.7, and should run on any 2.5 <= Python < 3.0.
This version has been tested under Python 2.6 and 2.7.
Python 3 support is work in progress.

Documentation
-------------
2 changes: 1 addition & 1 deletion gensim/__init__.py
@@ -4,7 +4,7 @@
"""

# for IPython tab-completion
import utils, matutils, interfaces, corpora, models, similarities
from . import utils, matutils, interfaces, corpora, models, similarities
import logging


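The `from . import ...` form above is the explicit relative import from PEP 328; Python 3 dropped the old implicit behaviour, so a bare `import utils` inside a package is resolved as an absolute import and fails. A minimal sketch with a hypothetical two-file package `pkg` (not part of gensim), for illustration only:

    # pkg/utils.py -- toy helper module so the package has something to import
    def shout(text):
        return text.upper()

    # pkg/__init__.py
    from . import utils            # explicit relative import: works on Python 2.5+ and 3.x
    # import utils                 # implicit relative import: Python 2 only, ImportError on 3.x
    print(utils.shout("imported"))

Saving the two files under a `pkg/` directory and running `python -c "import pkg"` prints IMPORTED on either major version; switching to the bare `import utils` breaks it on Python 3.
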
File renamed without changes.
20 changes: 10 additions & 10 deletions gensim/corpora/__init__.py
@@ -3,14 +3,14 @@
"""

# bring corpus classes directly into package namespace, to save some typing
from indexedcorpus import IndexedCorpus # must appear before the other classes
from .indexedcorpus import IndexedCorpus # must appear before the other classes

from mmcorpus import MmCorpus
from bleicorpus import BleiCorpus
from svmlightcorpus import SvmLightCorpus
from lowcorpus import LowCorpus
from dictionary import Dictionary
from hashdictionary import HashDictionary
from wikicorpus import WikiCorpus
from textcorpus import TextCorpus
from ucicorpus import UciCorpus
from .mmcorpus import MmCorpus
from .bleicorpus import BleiCorpus
from .svmlightcorpus import SvmLightCorpus
from .lowcorpus import LowCorpus
from .dictionary import Dictionary
from .hashdictionary import HashDictionary
from .wikicorpus import WikiCorpus
from .textcorpus import TextCorpus
from .ucicorpus import UciCorpus
29 changes: 19 additions & 10 deletions gensim/corpora/dictionary.py
@@ -19,15 +19,16 @@

import logging
import itertools
import UserDict
import sys

from gensim import utils
from .. import utils
from .._six import iteritems, iterkeys


logger = logging.getLogger('gensim.corpora.dictionary')


class Dictionary(utils.SaveLoad, UserDict.DictMixin):
class Dictionary(utils.SaveLoad, dict):
"""
Dictionary encapsulates the mapping between normalized words and their integer ids.
@@ -51,7 +52,7 @@ def __getitem__(self, tokenid):
if len(self.id2token) != len(self.token2id):
# the word->id mapping has changed (presumably via add_documents);
# recompute id->word accordingly
self.id2token = dict((v, k) for k, v in self.token2id.iteritems())
self.id2token = dict((v, k) for k, v in iteritems(self.token2id))
return self.id2token[tokenid] # will throw for non-existent ids


@@ -138,11 +139,11 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
self.num_pos += len(document)
self.num_nnz += len(result)
# increase document count for each unique token that appeared in the document
for tokenid in result.iterkeys():
for tokenid in iterkeys(result):
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

# return tokenids, in ascending id order
result = sorted(result.iteritems())
result = sorted(iteritems(result))
if return_missing:
return result, missing
else:
@@ -189,12 +190,20 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
"""
if bad_ids is not None:
bad_ids = set(bad_ids)
self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.iteritems() if tokenid not in bad_ids)
self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid not in bad_ids)
self.token2id = dict((token, tokenid)
for token, tokenid in iteritems(self.token2id)
if tokenid not in bad_ids)
self.dfs = dict((tokenid, freq)
for tokenid, freq in iteritems(self.dfs)
if tokenid not in bad_ids)
if good_ids is not None:
good_ids = set(good_ids)
self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.iteritems() if tokenid in good_ids)
self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid in good_ids)
self.token2id = dict((token, tokenid)
for token, tokenid in iteritems(self.token2id)
if tokenid in good_ids)
self.dfs = dict((tokenid, freq)
for tokenid, freq in self.dfs.iteritems()
if tokenid in good_ids)


def compactify(self):
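
Dictionary now funnels dict iteration through `gensim._six` (`iteritems`, `iterkeys`), since `dict.iteritems()` no longer exists on Python 3. The `_six` module itself is not shown on this page; helpers in the style of the `six` library would look roughly like the following sketch (an assumption about its contents, not the actual file):

    # six-style compatibility helpers -- illustrative sketch only
    import sys

    PY3 = sys.version_info[0] == 3

    if PY3:
        def iteritems(d, **kw):
            # items() is already a lazy view object on Python 3
            return iter(d.items(**kw))

        def iterkeys(d, **kw):
            return iter(d.keys(**kw))
    else:
        def iteritems(d, **kw):
            return d.iteritems(**kw)

        def iterkeys(d, **kw):
            return d.iterkeys(**kw)

Call sites such as `sorted(iteritems(result))` in `doc2bow` then behave identically on both interpreter lines.
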
5 changes: 2 additions & 3 deletions gensim/corpora/hashdictionary.py
@@ -20,17 +20,16 @@

import logging
import itertools
import UserDict
import zlib

from gensim import utils
from .. import utils


logger = logging.getLogger('gensim.corpora.hashdictionary')



class HashDictionary(utils.SaveLoad, UserDict.DictMixin):
class HashDictionary(utils.SaveLoad, dict):
"""
HashDictionary encapsulates the mapping between normalized words and their
integer ids.
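
Both `Dictionary` and `HashDictionary` drop `UserDict.DictMixin` as a base class because the `UserDict` module was folded into `collections` in Python 3 and `DictMixin` disappeared with it; inheriting from `dict` directly is the simplest replacement. A sketch of the two usual 3.x-friendly options (class names here are illustrative, not gensim's):

    # Option 1: subclass dict directly, as this commit does.
    class TokenMap(dict):
        def add(self, token):
            # give an unseen token the next free integer id
            return self.setdefault(token, len(self))

    # Option 2: implement the Mapping ABC, which derives the read-only dict API
    # (get, __contains__, keys, items, ...) from three required methods.
    from collections.abc import Mapping   # `collections.Mapping` on Python 2

    class FrozenTokenMap(Mapping):
        def __init__(self, pairs):
            self._data = dict(pairs)

        def __getitem__(self, key):
            return self._data[key]

        def __iter__(self):
            return iter(self._data)

        def __len__(self):
            return len(self._data)

Subclassing `dict` keeps the C-level speed, but `dict`'s own internals do not consult overridden methods such as `__getitem__`; the ABC route is slower but more predictable.
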
4 changes: 2 additions & 2 deletions gensim/corpora/textcorpus.py
@@ -31,8 +31,8 @@

import logging

from gensim import interfaces, utils
from dictionary import Dictionary
from .. import interfaces, utils
from .dictionary import Dictionary

logger = logging.getLogger('gensim.corpora.textcorpus')

2 changes: 1 addition & 1 deletion gensim/corpora/wikicorpus.py
@@ -277,7 +277,7 @@ def get_texts(self):
the standard corpus interface instead of this function::
>>> for vec in wiki_corpus:
>>> print vec
>>> print(vec)
"""
articles, articles_all = 0, 0
positions, positions_all = 0, 0
2 changes: 1 addition & 1 deletion gensim/examples/dmlcz/gensim_build.py
@@ -69,7 +69,7 @@ def buildDmlCorpus(config):

# check and process input arguments
if len(sys.argv) < 2:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)
language = sys.argv[1]

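
The remaining changes in the `examples/`, `scripts/` and `test/` entry points are all the same fix: the `print x` statement is a syntax error on Python 3, while `print(x)` with a single parenthesised argument parses on both 2.x and 3.x. The snippet below is general Python background, not code from this commit; the `__future__` import only becomes necessary once multiple arguments or keyword arguments are used:

    from __future__ import print_function   # harmless no-op on Python 3
    import sys

    print("hello")                          # fine on 2.x and 3.x even without the future import
    print("a", "b", sep=", ")               # needs print_function on Python 2
    print("something went wrong", file=sys.stderr)

Without the `__future__` import, Python 2 would parse `print("a", "b", sep=", ")` as a print statement followed by an invalid tuple display and refuse to compile the module.
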
2 changes: 1 addition & 1 deletion gensim/examples/dmlcz/gensim_genmodel.py
@@ -40,7 +40,7 @@

# check and process input arguments
if len(sys.argv) < 3:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)
language = sys.argv[1]
method = sys.argv[2].strip().lower()
2 changes: 1 addition & 1 deletion gensim/examples/dmlcz/gensim_xml.py
@@ -95,7 +95,7 @@ def generateSimilar(corpus, index, method):

# check and process input arguments
if len(sys.argv) < 3:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)
language = sys.argv[1]
method = sys.argv[2].strip().lower()
2 changes: 1 addition & 1 deletion gensim/interfaces.py
@@ -16,7 +16,7 @@
import logging
import itertools

import utils, matutils
from . import utils, matutils


logger = logging.getLogger('gensim.interfaces')
2 changes: 1 addition & 1 deletion gensim/matutils.py
@@ -82,7 +82,7 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n
num_docs = corpus.num_docs
if num_nnz is None:
num_nnz = corpus.num_nnz
except AttributeError, e:
except AttributeError as e:
pass # not a MmCorpus...
if printprogress:
logger.info("creating sparse matrix from corpus")
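
`except AttributeError, e` is the Python 2-only spelling; the comma form was removed in Python 3, and `except ... as e` (available since Python 2.6) is the one spelling both interpreters accept. A small illustrative example, not taken from gensim:

    def safe_len(obj):
        try:
            return len(obj)
        except TypeError as err:          # Python 2.6+ and 3.x
            # except TypeError, err:      # Python 2 only -- SyntaxError on 3.x
            return "no length: %s" % err

    print(safe_len([1, 2, 3]))   # 3
    print(safe_len(42))          # no length: object of type 'int' has no len()

One behavioural difference worth knowing: on Python 3 the bound name (`err`) is deleted when the except block ends, so it cannot be used afterwards.
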
18 changes: 9 additions & 9 deletions gensim/models/__init__.py
@@ -4,15 +4,15 @@
"""

# bring model classes directly into package namespace, to save some typing
from hdpmodel import HdpModel
from ldamodel import LdaModel
from lsimodel import LsiModel
from tfidfmodel import TfidfModel
from rpmodel import RpModel
from logentropy_model import LogEntropyModel
from word2vec import Word2Vec

from gensim import interfaces, utils
from .hdpmodel import HdpModel
from .ldamodel import LdaModel
from .lsimodel import LsiModel
from .tfidfmodel import TfidfModel
from .rpmodel import RpModel
from .logentropy_model import LogEntropyModel
from .word2vec import Word2Vec

from .. import interfaces, utils


class VocabTransform(interfaces.TransformationABC):
2 changes: 1 addition & 1 deletion gensim/models/lda_dispatcher.py
@@ -177,7 +177,7 @@ def main():
program = os.path.basename(sys.argv[0])
# make sure we have enough cmd line parameters
if len(sys.argv) < 1:
print globals()["__doc__"] % locals()
print(globals()["__doc__"] % locals())
sys.exit(1)

if len(sys.argv) < 2:
2 changes: 1 addition & 1 deletion gensim/models/lda_worker.py
@@ -116,7 +116,7 @@ def main():
program = os.path.basename(sys.argv[0])
# make sure we have enough cmd line parameters
if len(sys.argv) < 1:
print globals()["__doc__"] % locals()
print(globals()["__doc__"] % locals())
sys.exit(1)

utils.pyro_daemon('gensim.lda_worker', Worker(), random_suffix=True)
6 changes: 3 additions & 3 deletions gensim/models/ldamodel.py
@@ -212,9 +212,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
Example:
>>> lda = LdaModel(corpus, num_topics=100) # train model
>>> print lda[doc_bow] # get topic probability distribution for a document
>>> print(lda[doc_bow]) # get topic probability distribution for a document
>>> lda.update(corpus2) # update the LDA model with additional documents
>>> print lda[doc_bow]
>>> print(lda[doc_bow])
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data
@@ -289,7 +289,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
self.dispatcher = dispatcher
self.numworkers = len(dispatcher.getworkers())
logger.info("using distributed version with %i workers" % self.numworkers)
except Exception, err:
except Exception as err:
logger.error("failed to initialize distributed LDA (%s)" % err)
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

2 changes: 1 addition & 1 deletion gensim/models/logentropy_model.py
@@ -39,7 +39,7 @@ class LogEntropyModel(interfaces.TransformationABC):
log entropy normalized space.
>>> log_ent = LogEntropyModel(corpus)
>>> print = log_ent[some_doc]
>>> print(log_ent[some_doc])
>>> log_ent.save('/tmp/foo.log_ent_model')
Model persistency is achieved via its load/save methods.
6 changes: 3 additions & 3 deletions gensim/models/lsimodel.py
@@ -270,9 +270,9 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
Example:
>>> lsi = LsiModel(corpus, num_topics=10)
>>> print lsi[doc_tfidf] # project some document into LSI space
>>> print(lsi[doc_tfidf]) # project some document into LSI space
>>> lsi.add_documents(corpus2) # update LSI on additional documents
>>> print lsi[doc_tfidf]
>>> print(lsi[doc_tfidf])
.. [3] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf
@@ -321,7 +321,7 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
self.dispatcher = dispatcher
self.numworkers = len(dispatcher.getworkers())
logger.info("using distributed version with %i workers" % self.numworkers)
except Exception, err:
except Exception as err:
# distributed version was specifically requested, so this is an error state
logger.error("failed to initialize distributed LSI (%s)" % err)
raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
2 changes: 1 addition & 1 deletion gensim/models/rpmodel.py
@@ -31,7 +31,7 @@ class RpModel(interfaces.TransformationABC):
space.
>>> rp = RpModel(corpus)
>>> print rp[some_doc]
>>> print(rp[some_doc])
>>> rp.save('/tmp/foo.rp_model')
Model persistency is achieved via its load/save methods.
2 changes: 1 addition & 1 deletion gensim/models/tfidfmodel.py
@@ -44,7 +44,7 @@ class TfidfModel(interfaces.TransformationABC):
space.
>>> tfidf = TfidfModel(corpus)
>>> print = tfidf[some_doc]
>>> print(tfidf[some_doc])
>>> tfidf.save('/tmp/foo.tfidf_model')
Model persistency is achieved via its load/save methods.
7 changes: 5 additions & 2 deletions gensim/models/word2vec.py
@@ -63,7 +63,10 @@
import time
import itertools
import threading
from Queue import Queue
try:
from queue import Queue
except ImportError:
from Queue import Queue

from numpy import exp, dot, zeros, outer, random, dtype, get_include, float32 as REAL,\
uint32, seterr, array, uint8, vstack, argsort, fromstring, sqrt, newaxis, ndarray, empty
@@ -673,7 +676,7 @@ def __iter__(self):
# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)
infile = sys.argv[1]
from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle
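
The guarded import added at the top of `word2vec.py` is the standard fallback idiom for standard-library modules that were renamed in Python 3 (`Queue` became `queue`). The same pattern covers other renames; the extra examples below are stdlib facts, not further changes made in this commit:

    try:
        import queue                       # Python 3 name
    except ImportError:
        import Queue as queue              # Python 2 name

    try:
        import cPickle as pickle           # fast C pickler on Python 2
    except ImportError:
        import pickle                      # Python 3 picks the C accelerator itself

    try:
        from urllib.parse import urlparse  # Python 3
    except ImportError:
        from urlparse import urlparse      # Python 2

    q = queue.Queue()
    q.put(pickle.dumps(urlparse("http://radimrehurek.com/gensim/")))
    print(pickle.loads(q.get()).netloc)    # prints: radimrehurek.com

Aliasing everything to the Python 3 name keeps the rest of the module free of version checks.
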
6 changes: 3 additions & 3 deletions gensim/nosy.py
@@ -47,8 +47,8 @@ def checkSum():
val = checkSum()
os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS,
' '.join(sys.argv[1:])))
print datetime.datetime.now().__str__()
print '=' * 77
print(datetime.datetime.now().__str__())
print('=' * 77)
time.sleep(1)
except KeyboardInterrupt:
print 'Goodbye'
print('Goodbye')
4 changes: 2 additions & 2 deletions gensim/parsing/__init__.py
@@ -3,5 +3,5 @@
"""

# bring model classes directly into package namespace, to save some typing
from porter import PorterStemmer
from preprocessing import *
from .porter import PorterStemmer
from .preprocessing import *
2 changes: 1 addition & 1 deletion gensim/scripts/make_wikicorpus.py
@@ -58,7 +58,7 @@

# check and process input arguments
if len(sys.argv) < 3:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)
inp, outp = sys.argv[1:3]
if len(sys.argv) > 3:
2 changes: 1 addition & 1 deletion gensim/similarities/__init__.py
@@ -3,4 +3,4 @@
"""

# bring classes directly into package namespace, to save some typing
from docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity
from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity
2 changes: 1 addition & 1 deletion gensim/test/simspeed.py
@@ -33,7 +33,7 @@
# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 3:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)

corpus_dense = gensim.corpora.MmCorpus(sys.argv[1])
2 changes: 1 addition & 1 deletion gensim/test/simspeed2.py
@@ -33,7 +33,7 @@
# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 3:
print globals()['__doc__'] % locals()
print(globals()['__doc__'] % locals())
sys.exit(1)

corpus_dense = gensim.corpora.MmCorpus(sys.argv[1])