PyCTM
==========

PyCTM is a Correlated Topic Model package. Please download the latest version from our [GitHub repository](https://github.com/kzhai/PyCTM).

Please send any bug reports or problems to Ke Zhai ([email protected]).
Install and Build
----------

This package depends on several external Python libraries, such as numpy, scipy, and nltk.
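
A minimal sketch of installing them with pip (the README does not prescribe an installation method, so this is an assumption):

    pip install numpy scipy nltk
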
Launch and Execute
----------

Assume the PyCTM package is downloaded under the directory ```$PROJECT_SPACE/src/```, i.e.,

    $PROJECT_SPACE/src/PyCTM
To prepare the example dataset,

    tar zxvf nips-abstract.tar.gz
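
After extraction, the corpus directory should contain the two files the training script reads (a sketch inferred from the loading code in this commit):

    nips-abstract/train.dat    # one document per line
    nips-abstract/voc.dat      # one vocabulary entry per line
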
To launch PyCTM, first change to the directory of the PyCTM source code,

    cd $PROJECT_SPACE/src/PyCTM

and run the following command on the example dataset,

    python -m launch_train --input_directory=./nips-abstract --output_directory=./ --number_of_topics=10 --training_iterations=50
The generic command to run PyCTM is

    python -m launch_train --input_directory=$INPUT_DIRECTORY/$CORPUS_NAME --output_directory=$OUTPUT_DIRECTORY --number_of_topics=$NUMBER_OF_TOPICS --training_iterations=$NUMBER_OF_ITERATIONS

You should be able to find the output in the directory ```$OUTPUT_DIRECTORY/$CORPUS_NAME```.
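
Judging from the resume script included in this commit (the README itself does not document the output layout), the output directory holds periodic snapshots, where $N is a training iteration number:

    $OUTPUT_DIRECTORY/$CORPUS_NAME/model-$N       # pickled inferencer state at iteration $N
    $OUTPUT_DIRECTORY/$CORPUS_NAME/exp_beta-$N    # exported topic-word distributions at iteration $N
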
You can always get help information and usage hints by running

    python -m launch_train --help
----------

Empty file.

----------
""" | ||
@author: Ke Zhai ([email protected]) | ||
""" | ||
|
||
import time | ||
import numpy | ||
import scipy | ||
import nltk; | ||
|
||

def compute_dirichlet_expectation(dirichlet_parameter):
    # E[log X] for X ~ Dirichlet(alpha): psi(alpha_k) - psi(sum_j alpha_j),
    # computed row-wise when a matrix of parameters is given.
    if len(dirichlet_parameter.shape) == 1:
        return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter))
    return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter, 1))[:, numpy.newaxis]
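
# A quick sanity check (hypothetical usage, not part of the original file):
# for a symmetric Dirichlet(1, 1), psi(1) - psi(2) = -1 exactly, so
#     compute_dirichlet_expectation(numpy.ones(2))  # -> array([-1., -1.])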

def parse_vocabulary(vocab):
    # Build a bijection between distinct word types and integer indices.
    type_to_index = {}
    index_to_type = {}
    for word in set(vocab):
        index_to_type[len(index_to_type)] = word
        type_to_index[word] = len(type_to_index)

    return type_to_index, index_to_type
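
# Hypothetical usage: parse_vocabulary(["the", "cat", "the"]) returns
# ({'the': 0, 'cat': 1}, {0: 'the', 1: 'cat'}), with index order depending
# on set iteration.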

class Inferencer(object):
    """
    Base class for CTM inferencers; subclasses implement the learning,
    inference, and export methods declared below.
    """
    def __init__(self,
                 hyper_parameter_optimize_interval=10,
                 diagonal_covariance_matrix=False,
                 ):

        self._hyper_parameter_optimize_interval = hyper_parameter_optimize_interval
        # assert(self._hyper_parameter_optimize_interval > 0)

        # _initialize() reads this flag, so it is given a default here rather
        # than left for the subclass to define (the original code never
        # assigned it in this class).
        self._diagonal_covariance_matrix = diagonal_covariance_matrix

        # self._local_parameter_iterations = local_parameter_iterations
        # assert(self._local_maximum_iteration > 0)
    def _initialize(self, vocab, number_of_topics, alpha_mu, alpha_sigma, alpha_beta):
        self.parse_vocabulary(vocab)

        # initialize the size of the vocabulary, i.e. the total number of distinct tokens.
        self._number_of_types = len(self._type_to_index)

        self._counter = 0

        # initialize the total number of topics.
        self._number_of_topics = number_of_topics

        # initialize the logistic-normal prior over topic proportions:
        # mean mu and covariance sigma, either diagonal or full.
        if self._diagonal_covariance_matrix:
            self._alpha_mu = numpy.zeros(self._number_of_topics) + alpha_mu
            self._alpha_sigma = numpy.zeros(self._number_of_topics) + alpha_sigma
        else:
            self._alpha_mu = numpy.zeros((1, self._number_of_topics)) + alpha_mu
            self._alpha_sigma = numpy.eye(self._number_of_topics) * alpha_sigma
            self._alpha_sigma_inv = numpy.linalg.pinv(self._alpha_sigma)

        # symmetric Dirichlet prior on the topic-word distributions.
        self._alpha_beta = numpy.zeros(self._number_of_types) + alpha_beta
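
    # Shapes after _initialize() with K topics and V word types, in the
    # full-covariance case (a summary of the assignments above):
    #     _alpha_mu:    (1, K)
    #     _alpha_sigma: (K, K)
    #     _alpha_beta:  (V,)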

    def parse_vocabulary(self, vocab):
        self._type_to_index = {}
        self._index_to_type = {}
        for word in set(vocab):
            self._index_to_type[len(self._index_to_type)] = word
            self._type_to_index[word] = len(self._type_to_index)

        self._vocab = self._type_to_index.keys()

    def parse_data(self):
        raise NotImplementedError
""" | ||
""" | ||
def learning(self): | ||
raise NotImplementedError; | ||
|
||
""" | ||
""" | ||
def inference(self): | ||
raise NotImplementedError; | ||
|
||
def export_beta(self, exp_beta_path, top_display=-1): | ||
raise NotImplementedError; | ||
|
||
if __name__ == "__main__": | ||
raise NotImplementedError; |
----------
# NOTE: this script targets Python 2 (print statements, xrange).
import cPickle, string, numpy, getopt, sys, random, time, re, pprint
import datetime, os

import nltk
import cProfile

def main():
    # parameter set 1
    input_directory = "./nips-abstract"

    input_directory = input_directory.rstrip("/")
    # corpus_name = os.path.basename(input_directory)

    '''
    output_directory = options.output_directory;
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    output_directory = os.path.join(output_directory, corpus_name);
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    '''
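
    # Expected input format (inferred from the loops below; not documented
    # elsewhere in this commit): train.dat holds one document per line, and
    # voc.dat holds one vocabulary entry per line, of which only the first
    # whitespace-separated token is used.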
    # Document
    train_docs_path = os.path.join(input_directory, 'train.dat')
    input_doc_stream = open(train_docs_path, 'r')
    train_docs = []
    for line in input_doc_stream:
        train_docs.append(line.strip().lower())
    print "successfully loaded all training docs from %s..." % (os.path.abspath(train_docs_path))

    # Vocabulary
    vocabulary_path = os.path.join(input_directory, 'voc.dat')
    input_voc_stream = open(vocabulary_path, 'r')
    vocab = []
    for line in input_voc_stream:
        vocab.append(line.strip().lower().split()[0])
    vocab = list(set(vocab))
    print "successfully loaded all the words from %s..." % (os.path.abspath(vocabulary_path))

    # parameter set 2
    number_of_topics = 10
    alpha_mu = 0
    alpha_sigma = 1
    alpha_beta = 1.0 / len(vocab)  # symmetric prior scaled by vocabulary size

    # parameter set 3
    training_iterations = 1

    import variational_bayes
    ctm_inferencer = variational_bayes.VariationalBayes()

    ctm_inferencer._initialize(train_docs, vocab, number_of_topics, alpha_mu, alpha_sigma, alpha_beta)

    for iteration in xrange(training_iterations):
        clock = time.time()
        log_likelihood = ctm_inferencer.learning()
        clock = time.time() - clock

        # print 'training iteration %d finished in %f seconds: log-likelihood = %f' % (iteration, clock, log_likelihood)

    # gamma_path = os.path.join(output_directory, 'gamma.txt')
    # numpy.savetxt(gamma_path, ctm_inferencer._document_topic_distribution)

    # topic_inactive_counts_path = os.path.join(output_directory, "topic_inactive_counts.txt")
    # numpy.savetxt(topic_inactive_counts_path, ctm_inferencer._topic_inactive_counts)

if __name__ == '__main__':
    main()
----------
import cPickle
import optparse
import string, numpy, getopt, sys, random, time, re, pprint
import datetime, os

import shutil

# model_settings_pattern = re.compile('\d+-\d+-ctm_inferencer-I(?P<iteration>\d+)-S(?P<snapshot>\d+)-aa(?P<alpha>[\d\.]+)(-smh(?P<smh>[\d]+))?(-sp(?P<sp>[\d]+)-mp(?P<mp>[\d]+))?')
model_settings_pattern = re.compile(r'\d+-\d+-ctm-I(?P<iteration>\d+)-S(?P<snapshot>\d+)-K(?P<topic>\d+)-am(?P<alpha_mu>[\d\.]+)-as(?P<alpha_sigma>[\d\.]+)-ab(?P<alpha_beta>[\d\.]+)')
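
# Example of a model directory name this pattern matches (constructed to
# mirror the suffix built in main() below; the timestamp is hypothetical):
#     150101-120000-ctm-I50-S10-K10-am0-as1-ab0.001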

def parse_args():
    parser = optparse.OptionParser()
    parser.set_defaults(# parameter set 1
                        # input_file=None,
                        model_directory=None,
                        snapshot_index=-1,

                        # parameter set 2
                        output_directory=None,
                        training_iterations=-1,
                        snapshot_interval=-1,
                        )
    # parameter set 1
    # parser.add_option("--input_file", type="string", dest="input_file",
    #                   help="input directory [None]")
    # parser.add_option("--input_directory", type="string", dest="input_directory",
    #                   help="input directory [None]")
    parser.add_option("--model_directory", type="string", dest="model_directory",
                      help="model directory [None]")
    parser.add_option("--snapshot_index", type="int", dest="snapshot_index",
                      help="snapshot index [-1]")
    # parser.add_option("--dataset_name", type="string", dest="dataset_name",
    #                   help="the corpus name [None]")

    # parameter set 2
    parser.add_option("--output_directory", type="string", dest="output_directory",
                      help="output directory [None]")
    parser.add_option("--training_iterations", type="int", dest="training_iterations",
                      help="number of training iterations [-1]")
    parser.add_option("--snapshot_interval", type="int", dest="snapshot_interval",
                      help="snapshot interval [-1 (default): remain unchanged]")

    (options, args) = parser.parse_args()
    return options
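
# Hypothetical invocation (the script's file name is not shown in this diff,
# so launch_resume.py is a guess; paths are placeholders):
#     python launch_resume.py --model_directory=./$MODEL_DIRECTORY \
#         --snapshot_index=10 --output_directory=./ --training_iterations=50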

def main():
    options = parse_args()

    assert options.model_directory is not None
    model_directory = options.model_directory

    if not os.path.exists(model_directory):
        sys.stderr.write("model directory %s does not exist...\n" % (model_directory))
        return
    model_directory = model_directory.rstrip("/")
    model_settings = os.path.basename(model_directory)

    assert options.snapshot_index > 0
    snapshot_index = options.snapshot_index
    # load the existing model
    model_snapshot_file_path = os.path.join(model_directory, "model-%d" % snapshot_index)
    if not os.path.exists(model_snapshot_file_path):
        sys.stderr.write("error: model snapshot file %s not found...\n" % (model_snapshot_file_path))
        return

    # variational_bayes must be importable so that cPickle can reconstruct the inferencer.
    import variational_bayes
    ctm_inferencer = cPickle.load(open(model_snapshot_file_path, "rb"))
    print 'successfully loaded model snapshot %s...' % (os.path.join(model_directory, "model-%d" % snapshot_index))

    # set the resume options
    matches = re.match(model_settings_pattern, model_settings)

    # training_iterations = int(matches.group('iteration'))
    training_iterations = options.training_iterations
    assert training_iterations > snapshot_index
    if options.snapshot_interval == -1:
        snapshot_interval = int(matches.group('snapshot'))
    else:
        snapshot_interval = options.snapshot_interval
    number_of_topics = int(matches.group('topic'))
    alpha_mu = float(matches.group('alpha_mu'))
    alpha_sigma = float(matches.group('alpha_sigma'))
    alpha_beta = float(matches.group('alpha_beta'))
    # rebuild the output directory suffix in the same format the pattern above expects
    now = datetime.datetime.now()
    suffix = now.strftime("%y%m%d-%H%M%S")
    suffix += "-%s" % ("ctm")
    suffix += "-I%d" % (training_iterations)
    suffix += "-S%d" % (snapshot_interval)
    suffix += "-K%g" % (number_of_topics)
    suffix += "-am%g" % (alpha_mu)
    suffix += "-as%g" % (alpha_sigma)
    suffix += "-ab%g" % (alpha_beta)
    assert options.output_directory is not None
    output_directory = options.output_directory
    output_directory = output_directory.rstrip("/")
    output_directory = os.path.join(output_directory, suffix)
    assert not os.path.exists(os.path.abspath(output_directory))
    os.mkdir(os.path.abspath(output_directory))

    # seed the new output directory with the snapshot we are resuming from
    shutil.copy(model_snapshot_file_path, os.path.join(output_directory, "model-" + str(snapshot_index)))
    shutil.copy(model_snapshot_file_path, os.path.join(output_directory, "exp_beta-" + str(snapshot_index)))

    for iteration in xrange(snapshot_index, training_iterations):
        # clock = time.time()
        log_likelihood = ctm_inferencer.learning()
        # clock = time.time() - clock
        # print 'training iteration %d finished in %f seconds: log-likelihood = %f' % (ctm_inferencer._counter, clock, log_likelihood)

        # periodically export the topic-word distributions and pickle the model
        if (ctm_inferencer._counter % snapshot_interval == 0):
            ctm_inferencer.export_beta(os.path.join(output_directory, 'exp_beta-' + str(ctm_inferencer._counter)))
            model_snapshot_path = os.path.join(output_directory, 'model-' + str(ctm_inferencer._counter))
            cPickle.dump(ctm_inferencer, open(model_snapshot_path, 'wb'))

    # always dump a final snapshot after the last iteration
    model_snapshot_path = os.path.join(output_directory, 'model-' + str(ctm_inferencer._counter))
    cPickle.dump(ctm_inferencer, open(model_snapshot_path, 'wb'))

if __name__ == '__main__':
    main()