Skip to content

Commit

Permalink
more Py3 fixes
Browse files (browse the repository at this point in the history)
Some Py3 tests pass now; many choke on str/unicode differences.
  • Loading branch information
larsmans committed Feb 14, 2014
1 parent 447da27 commit 7465723
Show file tree
Hide file tree
Showing 17 changed files with 39 additions and 28 deletions.
5 changes: 3 additions & 2 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from os import path
import logging

from gensim import interfaces, utils
from gensim.corpora import IndexedCorpus
from .. import interfaces, utils
from ..corpora import IndexedCorpus
from .._six.moves import xrange


logger = logging.getLogger('gensim.corpora.bleicorpus')
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/csvcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def __iter__(self):
"""
reader = csv.reader(open(self.fname), self.dialect)
if self.headers:
reader.next() # skip the headers
next(reader) # skip the headers

line_no = -1
for line_no, line in enumerate(reader):
Expand Down
3 changes: 2 additions & 1 deletion gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import logging

from .. import interfaces, utils
from .._six import string_types
from .dictionary import Dictionary

logger = logging.getLogger('gensim.corpora.textcorpus')
Expand All @@ -43,7 +44,7 @@ def getstream(input):
If input is a file-like object, reset it to the beginning with `input.seek(0)`.
"""
assert input is not None
if isinstance(input, basestring):
if isinstance(input, string_types):
# input was a filename: open as text file
result = open(input)
else:
Expand Down
6 changes: 3 additions & 3 deletions gensim/corpora/ucicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ def __init__(self, input):
input = open(input)

self.num_docs = self.num_terms = self.num_nnz = 0
self.num_docs = int(input.next().strip())
self.num_terms = int(input.next().strip())
self.num_nnz = int(input.next().strip())
self.num_docs = int(next(input).strip())
self.num_terms = int(next(input).strip())
self.num_nnz = int(next(input).strip())

logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
(self.num_docs, self.num_terms, self.num_nnz))
Expand Down
1 change: 1 addition & 0 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import itertools

from . import utils, matutils
from ._six.moves import xrange


logger = logging.getLogger('gensim.interfaces')
Expand Down
14 changes: 7 additions & 7 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import scipy.linalg
from scipy.linalg.lapack import get_lapack_funcs

from ._six import iteritems, itervalues
from ._six.moves import zip as izip
from ._six import iteritems, itervalues, string_types
from ._six.moves import xrange, zip as izip

# scipy is not a stable package yet, locations change, so try to work
# around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8)
Expand Down Expand Up @@ -318,7 +318,7 @@ def unitvec(vec):
return vec

try:
first = iter(vec).next() # is there at least one element?
first = next(iter(vec)) # is there at least one element?
except:
return vec

Expand Down Expand Up @@ -526,9 +526,9 @@ def __init__(self, input, transposed=True):
"""
logger.info("initializing corpus reader from %s" % input)
self.input, self.transposed = input, transposed
if isinstance(input, basestring):
if isinstance(input, string_types):
input = open(input)
header = input.next().strip()
header = next(input).strip()
if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
(self.input, header))
Expand Down Expand Up @@ -568,7 +568,7 @@ def __iter__(self):
yielded where appropriate, even if they are not explicitly stored in the
Matrix Market file.
"""
if isinstance(self.input, basestring):
if isinstance(self.input, string_types):
fin = open(self.input)
else:
fin = self.input
Expand Down Expand Up @@ -614,7 +614,7 @@ def docbyoffset(self, offset):
# them with a special offset, -1.
if offset == -1:
return []
if isinstance(self.input, basestring):
if isinstance(self.input, string_types):
fin = open(self.input)
else:
fin = self.input
Expand Down
3 changes: 2 additions & 1 deletion gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
import numpy as np
import scipy.special as sp

from gensim import interfaces, utils
from .. import interfaces, utils
from .._six.moves import xrange

logger = logging.getLogger(__name__)

Expand Down
3 changes: 2 additions & 1 deletion gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@
from scipy.maxentropy import logsumexp # log(sum(exp(x))) that tries to avoid overflow
except ImportError: # maxentropy has been removed for next release
from scipy.misc import logsumexp
from gensim import interfaces, utils


from .. import interfaces, utils
from .._six.moves import xrange


def dirichlet_expectation(alpha):
Expand Down
1 change: 1 addition & 0 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

from .. import interfaces, matutils, utils
from .._six import iterkeys
from .._six.moves import xrange


logger = logging.getLogger('gensim.models.lsimodel')
Expand Down
4 changes: 4 additions & 0 deletions gensim/parsing/porter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
Optimizations and cleanup of the code by Lars Buitinck, July 2012.
"""


from .._six.moves import xrange


class PorterStemmer(object):
def __init__(self):
"""The main part of the stemming algorithm starts here.
Expand Down
4 changes: 2 additions & 2 deletions gensim/parsing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ def preprocess_string(s, filters=DEFAULT_FILTERS):


def preprocess_documents(docs):
return map(preprocess_string, docs)
return [preprocess_string(d) for d in docs]


def read_file(path):
return open(path).read()


def read_files(pattern):
return map(read_file, glob.glob(pattern))
return [read_file(fname) for fname in glob.glob(pattern)]
2 changes: 1 addition & 1 deletion gensim/similarities/docsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
import scipy.sparse

from .. import interfaces, utils, matutils
from .._six.moves import map as imap, zip as izip
from .._six.moves import map as imap, xrange, zip as izip


logger = logging.getLogger('gensim.similarities.docsim')
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_big.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self, words_only=False, num_terms=200000, num_docs=1000000, doc_len
self.doc_len = doc_len

def __iter__(self):
for _ in xrange(self.num_docs):
for _ in range(self.num_docs):
doc_len = numpy.random.poisson(self.doc_len)
ids = numpy.random.randint(0, len(self.dictionary), doc_len)
if self.words_only:
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_serialize(self, corpus=[[(1, 1.0)], [], [(0, 0.5), (2, 1.0)], []]):
self.assertEqual(corpus, list(corpus2))

# make sure the indexing corpus[i] works
for i in xrange(len(corpus)):
for i in range(len(corpus)):
self.assertEqual(corpus[i], corpus2[i])

# delete the temporary file
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def testTransform(self):
# sometimes, LDA training gets stuck at a local minimum
# in that case try re-training the model from scratch, hoping for a
# better random initialization
for i in xrange(5): # restart at most 5 times
for i in range(5): # restart at most 5 times
# create the transformation model
model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
model.update(corpus)
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def testParallel(self):
sims = model.most_similar('israeli')
# the exact vectors and therefore similarities may differ, due to different thread collisions
# so let's test only for top3
self.assertTrue('palestinian' in [sims[i][0] for i in xrange(3)])
self.assertTrue('palestinian' in [sims[i][0] for i in range(3)])


def testRNG(self):
Expand Down
11 changes: 6 additions & 5 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
unicode = str

from ._six import iteritems, u
from ._six.moves import xrange

try:
from pattern.en import parse
Expand Down Expand Up @@ -370,13 +371,13 @@ def is_corpus(obj):
# the input is an iterator object, meaning once we call next()
# that element could be gone forever. we must be careful to put
# whatever we retrieve back again
doc1 = obj.next()
doc1 = next(obj)
obj = itertools.chain([doc1], obj)
else:
doc1 = iter(obj).next() # empty corpus is resolved to False here
doc1 = next(iter(obj)) # empty corpus is resolved to False here
if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...)
return True, obj # the first document is empty=>assume this is a corpus
id1, val1 = iter(doc1).next() # if obj is a numpy array, it resolves to False here
id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here
id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float)
except:
return False, obj
Expand Down Expand Up @@ -484,7 +485,7 @@ def chunkize_serial(iterable, chunksize, as_numpy=False):
Return elements from the iterable in `chunksize`-ed lists. The last returned
element may be smaller (if length of collection is not divisible by `chunksize`).
>>> print(list(grouper(xrange(10), 3)))
>>> print(list(grouper(range(10), 3)))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
"""
import numpy
Expand Down Expand Up @@ -565,7 +566,7 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
If `maxsize==0`, don't fool around with parallelism and simply yield the chunksize
via `chunkize_serial()` (no I/O optimizations).
>>> for chunk in chunkize(xrange(10), 4): print(chunk)
>>> for chunk in chunkize(range(10), 4): print(chunk)
[0, 1, 2, 3]
[4, 5, 6, 7]
[8, 9]
Expand Down

0 comments on commit 7465723

Please sign in to comment.