Commit

initial commit

kzhai committed Aug 26, 2016
1 parent f7f78b5 commit 47cad69

Showing 9 changed files with 2,677 additions and 2 deletions.
43 changes: 41 additions & 2 deletions README.md
@@ -1,2 +1,41 @@
# PyCTM
A Correlated Topic Model implementation in Python.
PyCTM
==========

PyCTM is a Correlated Topic Modeling package. Please download the latest version from our [GitHub repository](https://github.com/kzhai/PyCTM).

Please report any bugs or problems to Ke Zhai ([email protected]).

Install and Build
----------

This package depends on several external Python libraries, such as numpy, scipy, and nltk.
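
Assuming ```pip``` is available, they can typically be installed with

    pip install numpy scipy nltk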

Launch and Execute
----------

Assume the PyCTM package is downloaded under the directory ```$PROJECT_SPACE/src/```, i.e.,

$PROJECT_SPACE/src/PyCTM

To prepare the example dataset, which unpacks into a ```nips-abstract``` directory containing ```train.dat``` (one document per line) and ```voc.dat``` (the vocabulary, one word per line),

tar zxvf nips-abstract.tar.gz

To launch PyCTM, first change to the directory containing the PyCTM source code,

cd $PROJECT_SPACE/src/PyCTM

and run the following command on the example dataset,

python -m launch_train --input_directory=./nips-abstract --output_directory=./ --number_of_topics=10 --training_iterations=50

The generic command to run PyCTM is

python -m launch_train --input_directory=$INPUT_DIRECTORY/$CORPUS_NAME --output_directory=$OUTPUT_DIRECTORY --number_of_topics=$NUMBER_OF_TOPICS --training_iterations=$NUMBER_OF_ITERATIONS

You should be able to find the output under the directory ```$OUTPUT_DIRECTORY/$CORPUS_NAME```, including periodic snapshot files such as ```model-$ITERATION``` and ```exp_beta-$ITERATION```.

At any time, you may also get help information and usage hints by running the following command

python -m launch_train --help
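
Trained models are pickled periodically to ```model-$ITERATION``` files in the output directory (see ```launch_resume.py```). As a minimal sketch, assuming a hypothetical output directory name from an earlier run, a snapshot can be reloaded for inspection as follows:

    import cPickle
    import variational_bayes

    # the path below is only an example -- substitute the directory created by your own run
    model_snapshot_path = "./nips-abstract/160826-143000-ctm-I50-S10-K10-am0-as1-ab0.0005/model-50"
    ctm_inferencer = cPickle.load(open(model_snapshot_path, "rb"))
    print ctm_inferencer._number_of_topics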

Empty file added __init__.py
Empty file.
87 changes: 87 additions & 0 deletions inferencer.py
@@ -0,0 +1,87 @@
"""
@author: Ke Zhai ([email protected])
"""

import time
import numpy
import scipy
import scipy.special
import nltk;

def compute_dirichlet_expectation(dirichlet_parameter):
if (len(dirichlet_parameter.shape) == 1):
return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter))
return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter, 1))[:, numpy.newaxis]
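# compute_dirichlet_expectation() returns E[log theta] under a Dirichlet(alpha) distribution:
# E[log theta_k] = psi(alpha_k) - psi(sum_j alpha_j), where psi is the digamma function;
# the second branch applies this row-wise when the parameters form a matrix.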

def parse_vocabulary(vocab):
type_to_index = {};
index_to_type = {};
for word in set(vocab):
index_to_type[len(index_to_type)] = word;
type_to_index[word] = len(type_to_index);

return type_to_index, index_to_type;

class Inferencer():
"""
"""
def __init__(self,
hyper_parameter_optimize_interval=10,
):

self._hyper_parameter_optimize_interval = hyper_parameter_optimize_interval;
# assert(self._hyper_parameter_optimize_interval>0);

# self._local_parameter_iterations = local_parameter_iterations
# assert(self._local_maximum_iteration>0)

"""
"""
def _initialize(self, vocab, number_of_topics, alpha_mu, alpha_sigma, alpha_beta):
self.parse_vocabulary(vocab);

# initialize the size of the vocabulary, i.e. total number of distinct tokens.
self._number_of_types = len(self._type_to_index)

self._counter = 0;

# initialize the total number of topics.
self._number_of_topics = number_of_topics

        # initialize the logistic-normal prior: mean vector alpha_mu and covariance matrix alpha_sigma (diagonal or full).
if self._diagonal_covariance_matrix:
self._alpha_mu = numpy.zeros(self._number_of_topics) + alpha_mu;
self._alpha_sigma = numpy.zeros(self._number_of_topics) + alpha_sigma;
else:
self._alpha_mu = numpy.zeros((1, self._number_of_topics)) + alpha_mu;
self._alpha_sigma = numpy.eye(self._number_of_topics) * alpha_sigma;
self._alpha_sigma_inv = numpy.linalg.pinv(self._alpha_sigma);

self._alpha_beta = numpy.zeros(self._number_of_types) + alpha_beta;

def parse_vocabulary(self, vocab):
self._type_to_index = {};
self._index_to_type = {};
for word in set(vocab):
self._index_to_type[len(self._index_to_type)] = word;
self._type_to_index[word] = len(self._type_to_index);

self._vocab = self._type_to_index.keys();

def parse_data(self):
raise NotImplementedError;

"""
"""
def learning(self):
raise NotImplementedError;

"""
"""
def inference(self):
raise NotImplementedError;

def export_beta(self, exp_beta_path, top_display=-1):
raise NotImplementedError;

if __name__ == "__main__":
raise NotImplementedError;
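
# Note: this module is not meant to be run directly. A subclass such as
# variational_bayes.VariationalBayes (used by launch_profiler.py and launch_resume.py)
# calls _initialize(...) once, then learning() each iteration, and exports topics via export_beta().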
69 changes: 69 additions & 0 deletions launch_profiler.py
@@ -0,0 +1,69 @@
import cPickle, string, numpy, getopt, sys, random, time, re, pprint
import datetime, os;

import nltk;
import numpy;
import cProfile

def main():
# parameter set 1
input_directory = "./nips-abstract"

input_directory = input_directory.rstrip("/");
# corpus_name = os.path.basename(input_directory);

'''
output_directory = options.output_directory;
if not os.path.exists(output_directory):
os.mkdir(output_directory);
output_directory = os.path.join(output_directory, corpus_name);
if not os.path.exists(output_directory):
os.mkdir(output_directory);
'''

# Document
train_docs_path = os.path.join(input_directory, 'train.dat')
input_doc_stream = open(train_docs_path, 'r');
train_docs = [];
for line in input_doc_stream:
train_docs.append(line.strip().lower());
print "successfully load all training docs from %s..." % (os.path.abspath(train_docs_path));

# Vocabulary
vocabulary_path = os.path.join(input_directory, 'voc.dat');
input_voc_stream = open(vocabulary_path, 'r');
vocab = [];
for line in input_voc_stream:
vocab.append(line.strip().lower().split()[0]);
vocab = list(set(vocab));
print "successfully load all the words from %s..." % (os.path.abspath(vocabulary_path));

    # parameter set 2
number_of_topics = 10;
alpha_mu = 0;
alpha_sigma = 1;
alpha_beta = 1.0 / len(vocab);
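    # alpha_mu and alpha_sigma parameterize the logistic-normal prior over topic proportions;
    # alpha_beta is the symmetric Dirichlet prior over each topic's word distribution.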

# parameter set 3
training_iterations = 1;

import variational_bayes
ctm_inferencer = variational_bayes.VariationalBayes();

ctm_inferencer._initialize(train_docs, vocab, number_of_topics, alpha_mu, alpha_sigma, alpha_beta);

for iteration in xrange(training_iterations):
clock = time.time();
log_likelihood = ctm_inferencer.learning();
clock = time.time() - clock;

# print 'training iteration %d finished in %f seconds: number-of-topics = %d, log-likelihood = %f' % (hdp._iteration_counter, clock, hdp._K, log_likelihood);

# gamma_path = os.path.join(output_directory, 'gamma.txt');
# numpy.savetxt(gamma_path, hdp._document_topic_distribution);

# topic_inactive_counts_path = os.path.join(output_directory, "topic_inactive_counts.txt");
# numpy.savetxt(topic_inactive_counts_path, hdp._topic_inactive_counts);

if __name__ == '__main__':
    # profile a full training run with cProfile
    cProfile.run('main()')
130 changes: 130 additions & 0 deletions launch_resume.py
@@ -0,0 +1,130 @@
import cPickle;
import optparse
import string, numpy, getopt, sys, random, time, re, pprint
import datetime, os;

import numpy;
import shutil

# model_settings_pattern = re.compile('\d+-\d+-ctm_inferencer-I(?P<iteration>\d+)-S(?P<snapshot>\d+)-aa(?P<alpha>[\d\.]+)(-smh(?P<smh>[\d]+))?(-sp(?P<sp>[\d]+)-mp(?P<mp>[\d]+))?');
model_settings_pattern = re.compile('\d+-\d+-ctm-I(?P<iteration>\d+)-S(?P<snapshot>\d+)-K(?P<topic>\d+)-am(?P<alpha_mu>[\d\.]+)-as(?P<alpha_sigma>[\d\.]+)-ab(?P<alpha_beta>[\d\.]+)');
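# a matching model directory name looks like, e.g., "160826-143000-ctm-I50-S10-K10-am0-as1-ab0.0005"
# (the timestamp and hyperparameter values are only illustrative; see the suffix construction in main() below)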

def parse_args():
parser = optparse.OptionParser()
parser.set_defaults(# parameter set 1
# input_file=None,
model_directory=None,
snapshot_index=-1,

# parameter set 2
output_directory=None,
training_iterations=-1,
snapshot_interval=-1,
)
# parameter set 1
# parser.add_option("--input_file", type="string", dest="input_file",
# help="input directory [None]");
# parser.add_option("--input_directory", type="string", dest="input_directory",
# help="input directory [None]");
parser.add_option("--model_directory", type="string", dest="model_directory",
help="model directory [None]");
parser.add_option("--snapshot_index", type="int", dest="snapshot_index",
help="snapshot index [-1]");
# parser.add_option("--training_iterations", type="int", dest="training_iterations",
# help="number of training iterations [1000]");
# parser.add_option("--dataset_name", type="string", dest="dataset_name",
# help="the corpus name [None]");

# parameter set 2
parser.add_option("--output_directory", type="string", dest="output_directory",
help="output directory [None]");
# parser.add_option("--alpha_alpha", type="float", dest="alpha_alpha",
# help="hyper-parameter for Dirichlet process of cluster [1]")
# parser.add_option("--alpha_kappa", type="float", dest="alpha_kappa",
# help="hyper-parameter for top level Dirichlet process of distribution over topics [1]")
# parser.add_option("--alpha_nu", type="float", dest="alpha_nu",
# help="hyper-parameter for bottom level Dirichlet process of distribution over topics [1]")
parser.add_option("--training_iterations", type="int", dest="training_iterations",
help="number of training iterations [-1]");
parser.add_option("--snapshot_interval", type="int", dest="snapshot_interval",
help="snapshot interval [-1 (default): remain unchanged]");

(options, args) = parser.parse_args();
return options;

def main():
options = parse_args();

assert(options.model_directory != None);
model_directory = options.model_directory;

if not os.path.exists(model_directory):
sys.stderr.write("model directory %s not exists...\n" % (model_directory));
return;
model_directory = model_directory.rstrip("/");
model_settings = os.path.basename(model_directory);

assert options.snapshot_index > 0
snapshot_index = options.snapshot_index;

# load the existing model
model_snapshot_file_path = os.path.join(model_directory, "model-%d" % snapshot_index);
if not os.path.exists(model_snapshot_file_path):
sys.stderr.write("error: model snapshot file unfound %s...\n" % (model_snapshot_file_path));
return;

import variational_bayes;
ctm_inferencer = cPickle.load(open(model_snapshot_file_path, "rb"));
    print 'successfully loaded model snapshot %s...' % (os.path.join(model_directory, "model-%d" % snapshot_index));

# set the resume options
matches = re.match(model_settings_pattern, model_settings);

# training_iterations = int(matches.group('iteration'));
training_iterations = options.training_iterations;
assert training_iterations > snapshot_index;
if options.snapshot_interval == -1:
snapshot_interval = int(matches.group('snapshot'));
else:
snapshot_interval = options.snapshot_interval;
number_of_topics = int(matches.group('topic'));
alpha_mu = float(matches.group('alpha_mu'));
alpha_sigma = float(matches.group('alpha_sigma'));
alpha_beta = float(matches.group('alpha_beta'));

now = datetime.datetime.now();
suffix = now.strftime("%y%m%d-%H%M%S") + "";
suffix += "-%s" % ("ctm");
suffix += "-I%d" % (training_iterations);
suffix += "-S%d" % (snapshot_interval);
suffix += "-K%g" % (number_of_topics);
suffix += "-am%g" % (alpha_mu);
suffix += "-as%g" % (alpha_sigma);
suffix += "-ab%g" % (alpha_beta);

assert options.output_directory != None;
output_directory = options.output_directory;
output_directory = output_directory.rstrip("/");
output_directory = os.path.join(output_directory, suffix);
assert (not os.path.exists(os.path.abspath(output_directory)));
os.mkdir(os.path.abspath(output_directory));

    # copy the corresponding snapshots over so the resumed run's output directory is self-contained
    shutil.copy(model_snapshot_file_path, os.path.join(output_directory, "model-" + str(snapshot_index)));
    shutil.copy(os.path.join(model_directory, "exp_beta-" + str(snapshot_index)), os.path.join(output_directory, "exp_beta-" + str(snapshot_index)));

for iteration in xrange(snapshot_index, training_iterations):
# clock = time.time();
log_likelihood = ctm_inferencer.learning();
# clock = time.time()-clock;
# print 'training iteration %d finished in %f seconds: number-of-clusters = %d, log-likelihood = %f' % (dpgm._iteration_counter, clock, dpgm._K, log_likelihood);

if ((ctm_inferencer._counter) % snapshot_interval == 0):
ctm_inferencer.export_beta(os.path.join(output_directory, 'exp_beta-' + str(ctm_inferencer._counter)));
model_snapshot_path = os.path.join(output_directory, 'model-' + str(ctm_inferencer._counter));
cPickle.dump(ctm_inferencer, open(model_snapshot_path, 'wb'));

model_snapshot_path = os.path.join(output_directory, 'model-' + str(ctm_inferencer._counter));
cPickle.dump(ctm_inferencer, open(model_snapshot_path, 'wb'));

if __name__ == '__main__':
main()