more Py3 fixes

Some Py3 tests pass now; many still fail because of str/unicode differences.
lizhangzhan · Feb 14, 2014 · 7465723
1 parent 447da27
commit 7465723
Show file tree

Hide file tree

Showing 17 changed files with 39 additions and 28 deletions.
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
@@ -14,8 +14,9 @@
 from os import path
 import logging
 
-from gensim import interfaces, utils
-from gensim.corpora import IndexedCorpus
+from .. import interfaces, utils
+from ..corpora import IndexedCorpus
+from .._six.moves import xrange
 
 
 logger = logging.getLogger('gensim.corpora.bleicorpus')

diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py
@@ -55,7 +55,7 @@ def __iter__(self):
         """
         reader = csv.reader(open(self.fname), self.dialect)
         if self.headers:
-            reader.next()  # skip the headers
+            next(reader)    # skip the headers
 
         line_no = -1
         for line_no, line in enumerate(reader):

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
@@ -32,6 +32,7 @@
 import logging
 
 from .. import interfaces, utils
+from .._six import string_types
 from .dictionary import Dictionary
 
 logger = logging.getLogger('gensim.corpora.textcorpus')
@@ -43,7 +44,7 @@ def getstream(input):
     If input is a file-like object, reset it to the beginning with `input.seek(0)`.
     """
     assert input is not None
-    if isinstance(input, basestring):
+    if isinstance(input, string_types):
         # input was a filename: open as text file
         result = open(input)
     else:

diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
@@ -45,9 +45,9 @@ def __init__(self, input):
             input = open(input)
 
         self.num_docs = self.num_terms = self.num_nnz = 0
-        self.num_docs = int(input.next().strip())
-        self.num_terms = int(input.next().strip())
-        self.num_nnz = int(input.next().strip())
+        self.num_docs = int(next(input).strip())
+        self.num_terms = int(next(input).strip())
+        self.num_nnz = int(next(input).strip())
 
         logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
             (self.num_docs, self.num_terms, self.num_nnz))

diff --git a/gensim/interfaces.py b/gensim/interfaces.py
@@ -17,6 +17,7 @@
 import itertools
 
 from . import utils, matutils
+from ._six.moves import xrange
 
 
 logger = logging.getLogger('gensim.interfaces')

diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -19,8 +19,8 @@
 import scipy.linalg
 from scipy.linalg.lapack import get_lapack_funcs
 
-from ._six import iteritems, itervalues
-from ._six.moves import zip as izip
+from ._six import iteritems, itervalues, string_types
+from ._six.moves import xrange, zip as izip
 
 # scipy is not a stable package yet, locations change, so try to work
 # around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8)
@@ -318,7 +318,7 @@ def unitvec(vec):
             return vec
 
     try:
-        first = iter(vec).next() # is there at least one element?
+        first = next(iter(vec))     # is there at least one element?
     except:
         return vec
 
@@ -526,9 +526,9 @@ def __init__(self, input, transposed=True):
         """
         logger.info("initializing corpus reader from %s" % input)
         self.input, self.transposed = input, transposed
-        if isinstance(input, basestring):
+        if isinstance(input, string_types):
             input = open(input)
-        header = input.next().strip()
+        header = next(input).strip()
         if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
             raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                              (self.input, header))
@@ -568,7 +568,7 @@ def __iter__(self):
         yielded where appropriate, even if they are not explicitly stored in the
         Matrix Market file.
         """
-        if isinstance(self.input, basestring):
+        if isinstance(self.input, string_types):
             fin = open(self.input)
         else:
             fin = self.input
@@ -614,7 +614,7 @@ def docbyoffset(self, offset):
         # them with a special offset, -1.
         if offset == -1:
             return []
-        if isinstance(self.input, basestring):
+        if isinstance(self.input, string_types):
             fin = open(self.input)
         else:
             fin = self.input

diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
@@ -40,7 +40,8 @@
 import numpy as np
 import scipy.special as sp
 
-from gensim import interfaces, utils
+from .. import interfaces, utils
+from .._six.moves import xrange
 
 logger = logging.getLogger(__name__)
 

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -47,9 +47,10 @@
     from scipy.maxentropy import logsumexp # log(sum(exp(x))) that tries to avoid overflow
 except ImportError: # maxentropy has been removed for next release
     from scipy.misc import logsumexp
-from gensim import interfaces, utils
 
 
+from .. import interfaces, utils
+from .._six.moves import xrange
 
 
 def dirichlet_expectation(alpha):

diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
@@ -61,6 +61,7 @@
 
 from .. import interfaces, matutils, utils
 from .._six import iterkeys
+from .._six.moves import xrange
 
 
 logger = logging.getLogger('gensim.models.lsimodel')

diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py
@@ -31,6 +31,10 @@
 Optimizations and cleanup of the code by Lars Buitinck, July 2012.
 """
 
+
+from .._six.moves import xrange
+
+
 class PorterStemmer(object):
     def __init__(self):
         """The main part of the stemming algorithm starts here.

diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py
@@ -93,12 +93,12 @@ def preprocess_string(s, filters=DEFAULT_FILTERS):
 
 
 def preprocess_documents(docs):
-    return map(preprocess_string, docs)
+    return [preprocess_string(d) for d in docs]
 
 
 def read_file(path):
     return open(path).read()
 
 
 def read_files(pattern):
-    return map(read_file, glob.glob(pattern))
+    return [read_file(fname) for fname in glob.glob(pattern)]
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
@@ -60,7 +60,7 @@
 import scipy.sparse
 
 from .. import interfaces, utils, matutils
-from .._six.moves import map as imap, zip as izip
+from .._six.moves import map as imap, xrange, zip as izip
 
 
 logger = logging.getLogger('gensim.similarities.docsim')

diff --git a/gensim/test/test_big.py b/gensim/test/test_big.py
@@ -34,7 +34,7 @@ def __init__(self, words_only=False, num_terms=200000, num_docs=1000000, doc_len
         self.doc_len = doc_len
 
     def __iter__(self):
-        for _ in xrange(self.num_docs):
+        for _ in range(self.num_docs):
             doc_len = numpy.random.poisson(self.doc_len)
             ids = numpy.random.randint(0, len(self.dictionary), doc_len)
             if self.words_only:

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -59,7 +59,7 @@ def test_serialize(self, corpus=[[(1, 1.0)], [], [(0, 0.5), (2, 1.0)], []]):
         self.assertEqual(corpus, list(corpus2))
 
         # make sure the indexing corpus[i] works
-        for i in xrange(len(corpus)):
+        for i in range(len(corpus)):
             self.assertEqual(corpus[i], corpus2[i])
 
         # delete the temporary file

diff --git a/gensim/test/test_models.py b/gensim/test/test_models.py
@@ -191,7 +191,7 @@ def testTransform(self):
         # sometimes, LDA training gets stuck at a local minimum
         # in that case try re-training the model from scratch, hoping for a
         # better random initialization
-        for i in xrange(5): # restart at most 5 times
+        for i in range(5): # restart at most 5 times
             # create the transformation model
             model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
             model.update(corpus)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -140,7 +140,7 @@ def testParallel(self):
             sims = model.most_similar('israeli')
             # the exact vectors and therefore similarities may differ, due to different thread collisions
             # so let's test only for top3
-            self.assertTrue('palestinian' in [sims[i][0] for i in xrange(3)])
+            self.assertTrue('palestinian' in [sims[i][0] for i in range(3)])
 
 
     def testRNG(self):

diff --git a/gensim/utils.py b/gensim/utils.py
@@ -41,6 +41,7 @@
     unicode = str
 
 from ._six import iteritems, u
+from ._six.moves import xrange
 
 try:
     from pattern.en import parse
@@ -370,13 +371,13 @@ def is_corpus(obj):
             # the input is an iterator object, meaning once we call next()
             # that element could be gone forever. we must be careful to put
             # whatever we retrieve back again
-            doc1 = obj.next()
+            doc1 = next(obj)
             obj = itertools.chain([doc1], obj)
         else:
-            doc1 = iter(obj).next() # empty corpus is resolved to False here
+            doc1 = next(iter(obj)) # empty corpus is resolved to False here
         if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...)
             return True, obj # the first document is empty=>assume this is a corpus
-        id1, val1 = iter(doc1).next() # if obj is a numpy array, it resolves to False here
+        id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here
         id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float)
     except:
         return False, obj
@@ -484,7 +485,7 @@ def chunkize_serial(iterable, chunksize, as_numpy=False):
     Return elements from the iterable in `chunksize`-ed lists. The last returned
     element may be smaller (if length of collection is not divisible by `chunksize`).
 
-    >>> print(list(grouper(xrange(10), 3)))
+    >>> print(list(grouper(range(10), 3)))
     [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
     """
     import numpy
@@ -565,7 +566,7 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
         If `maxsize==0`, don't fool around with parallelism and simply yield the chunksize
         via `chunkize_serial()` (no I/O optimizations).
 
-        >>> for chunk in chunkize(xrange(10), 4): print(chunk)
+        >>> for chunk in chunkize(range(10), 4): print(chunk)
         [0, 1, 2, 3]
         [4, 5, 6, 7]
         [8, 9]