Commit 99dec82

Merge branch 'release-0.11.1-1'
piskvorky committed Apr 11, 2015
2 parents c49cc3b + d60fab8 commit 99dec82
Showing 3 changed files with 23 additions and 23 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.txt
@@ -1,7 +1,7 @@
Changes
=======

-0.11.0 = 0.11.1, 10/04/2015
+0.11.0 = 0.11.1 = 0.11.1-1, 10/04/2015

* added "topic ranking" to sort topics in LdaModel (jtmcmc, #311)
* new fast ShardedCorpus out-of-core corpus (Jan Hajic jr., #284)
@@ -13,7 +13,7 @@ Changes
* save/load methods now accept file handles, in addition to file names (macks22, #292)
* fixes to LdaMulticore on Windows (Feng Mai, #305)
* lots of small fixes & py3k compatibility improvements (Chyi-Kwei Yau, Daniel Nouri, Timothy Emerick, Juarez Bochi, Christopher Corley, Chirag Nagpal, Jan Hajic jr., Flávio Codeço Coelho)
-* 0.11.0 had a packaging bug (and PyPI doesn't allow re-upload) => 0.11.1 is an identical re-release
+* re-released as 0.11.1 and 0.11.1-1 because of a packaging bug

0.10.3, 17/11/2014

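The changelog entry above for #292 notes that save/load methods now accept file handles in addition to file names. A minimal sketch of that usage, assuming the 0.11.1 API; the Dictionary object, toy texts, and path are illustrative and not part of this commit:

# Minimal sketch (assumed usage, not from this commit): save/load through an
# open file handle instead of a file name, per the #292 changelog entry.
from gensim.corpora import Dictionary

texts = [['human', 'machine', 'interface'], ['graph', 'trees', 'minors']]
dictionary = Dictionary(texts)

with open('/tmp/dictionary.bin', 'wb') as fout:  # illustrative path
    dictionary.save(fout)
with open('/tmp/dictionary.bin', 'rb') as fin:
    loaded = Dictionary.load(fin)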
40 changes: 20 additions & 20 deletions gensim/corpora/sharded_corpus.py
@@ -26,15 +26,15 @@
import scipy.sparse as sparse
import time

+logger = logging.getLogger(__name__)

#: Specifies which dtype should be used for serializing the shards.
_default_dtype = float
try:
import theano
_default_dtype = theano.config.floatX
except ImportError:
-logging.info('Could not import Theano, will use standard float'
-'for default ShardedCorpus dtype.')
-pass
+logger.info('Could not import Theano, will use standard float for default ShardedCorpus dtype.')


from six.moves import xrange
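The hunk above shows the pattern this commit applies throughout the file: messages go through a module-level logger created with logging.getLogger(__name__) rather than the root logging module. A minimal illustrative sketch of that pattern, not taken from the diff:

import logging

# Named after the defining module (e.g. 'gensim.corpora.sharded_corpus'),
# so applications can set per-module log levels without touching the root logger.
logger = logging.getLogger(__name__)

def do_work():
    logger.info('working inside %s', __name__)

Note that the rewritten single-line logger.info call also fixes a missing space in the old concatenated message ('standard float' 'for default ...' rendered as 'standard floatfor default ...').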
@@ -235,20 +235,20 @@ def __init__(self, output_prefix, corpus, dim=None,
self.current_offset = None # The index into the dataset which
# corresponds to index 0 of current shard

-logging.info('Initializing sharded corpus with prefix '
+logger.info('Initializing sharded corpus with prefix '
'{0}'.format(output_prefix))
if (not os.path.isfile(output_prefix)) or overwrite:
-logging.info('Building from corpus...')
+logger.info('Building from corpus...')
self.init_shards(output_prefix, corpus, shardsize)

# Save automatically, to facilitate re-loading
# and retain information about how the corpus
# was serialized.
-logging.info('Saving ShardedCorpus object to '
+logger.info('Saving ShardedCorpus object to '
'{0}'.format(self.output_prefix))
self.save()
else:
-logging.info('Cloning existing...')
+logger.info('Cloning existing...')
self.init_by_clone()

def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype):
@@ -261,10 +261,10 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype):
proposed_dim = self._guess_n_features(corpus)
if proposed_dim != self.dim:
if self.dim is None:
-logging.info('Deriving dataset dimension from corpus: '
+logger.info('Deriving dataset dimension from corpus: '
'{0}'.format(proposed_dim))
else:
-logging.warn('Dataset dimension derived from input corpus diffe'
+logger.warn('Dataset dimension derived from input corpus diffe'
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))
@@ -274,13 +274,13 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype):

start_time = time.clock()

-logging.info('Running init from corpus.')
+logger.info('Running init from corpus.')

for n, doc_chunk in enumerate(gensim.utils.grouper(corpus, chunksize=shardsize)):
-logging.info('Chunk no. {0} at {1} s'.format(n, time.clock() - start_time))
+logger.info('Chunk no. {0} at {1} s'.format(n, time.clock() - start_time))

current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype)
-logging.debug('Current chunk dimension: '
+logger.debug('Current chunk dimension: '
'{0} x {1}'.format(len(doc_chunk), self.dim))

for i, doc in enumerate(doc_chunk):
@@ -294,7 +294,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype):
self.save_shard(current_shard)

end_time = time.clock()
-logging.info('Built {0} shards in {1} s.'.format(self.n_shards, end_time - start_time))
+logger.info('Built {0} shards in {1} s.'.format(self.n_shards, end_time - start_time))

def init_by_clone(self):
"""
@@ -309,9 +309,9 @@ def init_by_clone(self):

if temp.dim != self.dim:
if self.dim is None:
-logging.info('Loaded dataset dimension: {0}'.format(temp.dim))
+logger.info('Loaded dataset dimension: {0}'.format(temp.dim))
else:
-logging.warn('Loaded dataset dimension differs from init arg '
+logger.warn('Loaded dataset dimension differs from init arg '
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))

@@ -344,7 +344,7 @@ def load_shard(self, n):
"""
Load (unpickle) the n-th shard as the "live" part of the dataset
into the Dataset object."""
-#logging.debug('ShardedCorpus loading shard {0}, '
+#logger.debug('ShardedCorpus loading shard {0}, '
# 'current shard: {1}'.format(n, self.current_shard_n))

# No-op if the shard is already open.
@@ -470,7 +470,7 @@ def resize_shards(self, shardsize):
for old_shard_n, old_shard_name in enumerate(old_shard_names):
os.remove(old_shard_name)
except Exception as e:
-logging.error('Exception occurred during old shard no. {0} '
+logger.error('Exception occurred during old shard no. {0} '
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
finally:
@@ -531,12 +531,12 @@ def _guess_n_features(self, corpus):
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
else:
-logging.warn('Couldn\'t find number of features, trusting '
+logger.warn('Couldn\'t find number of features, trusting '
'supplied dimension ({0})'.format(self.dim))
n_features = self.dim

if self.dim and n_features != self.dim:
-logging.warn('Discovered inconsistent dataset dim ({0}) and '
+logger.warn('Discovered inconsistent dataset dim ({0}) and '
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))

@@ -604,7 +604,7 @@ def __getitem__(self, offset):
# This fails on one-past
# slice indexing; that's why there's a code branch here.

-#logging.debug('ShardedCorpus: Retrieving slice {0}: '
+#logger.debug('ShardedCorpus: Retrieving slice {0}: '
# 'shard {1}'.format((offset.start, offset.stop),
# (first_shard, last_shard)))

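For context, a hedged usage sketch of the ShardedCorpus class edited above, based only on the __init__ logic visible in this diff (output_prefix, corpus, dim, shardsize, overwrite); the toy corpus, dimension, and output prefix are assumptions:

# Hedged sketch; parameter values are illustrative, not from the commit.
from gensim.corpora.sharded_corpus import ShardedCorpus

# Two documents as gensim bag-of-words vectors: (feature_id, value) pairs.
corpus = [[(0, 1.0), (3, 2.0)], [(1, 1.0), (2, 3.0)]]

# Serializes the corpus into numpy shards under the given prefix; if a
# serialized dataset already exists there (and overwrite is False), it is
# cloned instead of rebuilt.
sharded = ShardedCorpus('/tmp/shard_corpus', corpus, dim=4, shardsize=4096)

for doc in sharded:  # documents are read back shard by shard, out of core
    print(doc)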
2 changes: 1 addition & 1 deletion setup.py
@@ -104,7 +104,7 @@ def readfile(fname):

setup(
name='gensim',
-version='0.11.1',
+version='0.11.1-1',
description='Python framework for fast Vector Space Modelling',
long_description=readfile('README.rst'),

