Commit

initial commit

kzhai committed Aug 26, 2016
1 parent f7f78b5 commit 47cad69

Showing 9 changed files with 2,677 additions and 2 deletions.
43 changes: 41 additions & 2 deletions README.md
@@ -1,2 +1,41 @@
# PyCTM
A Correlated Topic Model implementation in Python.
PyCTM
==========

PyCTM is a Correlated Topic Modeling package. Please download the latest version from our [GitHub repository](https://github.com/kzhai/PyCTM).

Please report any bugs or problems to Ke Zhai ([email protected]).

Install and Build
----------

This package depends on several external Python libraries, such as numpy, scipy, and nltk.
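
Assuming ```pip``` is available, they can typically be installed with

    pip install numpy scipy nltk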

Launch and Execute
----------

Assume the PyCTM package is downloaded under the directory ```$PROJECT_SPACE/src/```, i.e.,

$PROJECT_SPACE/src/PyCTM

To prepare the example dataset, which unpacks into a ```nips-abstract``` directory containing ```train.dat``` (one document per line) and ```voc.dat``` (the vocabulary, one word per line),

tar zxvf nips-abstract.tar.gz

To launch PyCTM, first change to the directory containing the PyCTM source code,

cd $PROJECT_SPACE/src/PyCTM

and run the following command on the example dataset,

python -m launch_train --input_directory=./nips-abstract --output_directory=./ --number_of_topics=10 --training_iterations=50

The generic command to run PyCTM is

python -m launch_train --input_directory=$INPUT_DIRECTORY/$CORPUS_NAME --output_directory=$OUTPUT_DIRECTORY --number_of_topics=$NUMBER_OF_TOPICS --training_iterations=$NUMBER_OF_ITERATIONS

You should be able to find the output under the directory ```$OUTPUT_DIRECTORY/$CORPUS_NAME```, including periodic snapshot files such as ```model-$ITERATION``` and ```exp_beta-$ITERATION```.

At any time, you may also get help information and usage hints by running the following command

python -m launch_train --help
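
Trained models are pickled periodically to ```model-$ITERATION``` files in the output directory (see ```launch_resume.py```). As a minimal sketch, assuming a hypothetical output directory name from an earlier run, a snapshot can be reloaded for inspection as follows:

    import cPickle
    import variational_bayes

    # the path below is only an example -- substitute the directory created by your own run
    model_snapshot_path = "./nips-abstract/160826-143000-ctm-I50-S10-K10-am0-as1-ab0.0005/model-50"
    ctm_inferencer = cPickle.load(open(model_snapshot_path, "rb"))
    print ctm_inferencer._number_of_topics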

Empty file added __init__.py
Empty file.
87 changes: 87 additions & 0 deletions inferencer.py
@@ -0,0 +1,87 @@
"""
@author: Ke Zhai ([email protected])
"""

import time
import numpy
import scipy
import scipy.special
import nltk;

def compute_dirichlet_expectation(dirichlet_parameter):
if (len(dirichlet_parameter.shape) == 1):
return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter))
return scipy.special.psi(dirichlet_parameter) - scipy.special.psi(numpy.sum(dirichlet_parameter, 1))[:, numpy.newaxis]
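# compute_dirichlet_expectation() returns E[log theta] under a Dirichlet(alpha) distribution:
# E[log theta_k] = psi(alpha_k) - psi(sum_j alpha_j), where psi is the digamma function;
# the second branch applies this row-wise when the parameters form a matrix.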

def parse_vocabulary(vocab):
type_to_index = {};
index_to_type = {};
for word in set(vocab):
index_to_type[len(index_to_type)] = word;
type_to_index[word] = len(type_to_index);

return type_to_index, index_to_type;

class Inferencer():
"""
"""
def __init__(self,
hyper_parameter_optimize_interval=10,
):

self._hyper_parameter_optimize_interval = hyper_parameter_optimize_interval;
# assert(self._hyper_parameter_optimize_interval>0);

# self._local_parameter_iterations = local_parameter_iterations
# assert(self._local_maximum_iteration>0)

"""
"""
def _initialize(self, vocab, number_of_topics, alpha_mu, alpha_sigma, alpha_beta):
self.parse_vocabulary(vocab);

# initialize the size of the vocabulary, i.e. total number of distinct tokens.
self._number_of_types = len(self._type_to_index)

self._counter = 0;

# initialize the total number of topics.
self._number_of_topics = number_of_topics

        # initialize the logistic-normal prior: mean vector alpha_mu and covariance matrix alpha_sigma (diagonal or full).
if self._diagonal_covariance_matrix:
self._alpha_mu = numpy.zeros(self._number_of_topics) + alpha_mu;
self._alpha_sigma = numpy.zeros(self._number_of_topics) + alpha_sigma;
else:
self._alpha_mu = numpy.zeros((1, self._number_of_topics)) + alpha_mu;
self._alpha_sigma = numpy.eye(self._number_of_topics) * alpha_sigma;
self._alpha_sigma_inv = numpy.linalg.pinv(self._alpha_sigma);

self._alpha_beta = numpy.zeros(self._number_of_types) + alpha_beta;

def parse_vocabulary(self, vocab):
self._type_to_index = {};
self._index_to_type = {};
for word in set(vocab):
self._index_to_type[len(self._index_to_type)] = word;
self._type_to_index[word] = len(self._type_to_index);

self._vocab = self._type_to_index.keys();

def parse_data(self):
raise NotImplementedError;

"""
"""
def learning(self):
raise NotImplementedError;

"""
"""
def inference(self):
raise NotImplementedError;

def export_beta(self, exp_beta_path, top_display=-1):
raise NotImplementedError;

if __name__ == "__main__":
raise NotImplementedError;
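
# Note: this module is not meant to be run directly. A subclass such as
# variational_bayes.VariationalBayes (used by launch_profiler.py and launch_resume.py)
# calls _initialize(...) once, then learning() each iteration, and exports topics via export_beta().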
69 changes: 69 additions & 0 deletions launch_profiler.py
@@ -0,0 +1,69 @@
import cPickle, string, numpy, getopt, sys, random, time, re, pprint
import datetime, os;

import nltk;
import numpy;
import cProfile

def main():
# parameter set 1
input_directory = "./nips-abstract"

input_directory = input_directory.rstrip("/");
# corpus_name = os.path.basename(input_directory);

'''
output_directory = options.output_directory;
if not os.path.exists(output_directory):
os.mkdir(output_directory);
output_directory = os.path.join(output_directory, corpus_name);
if not os.path.exists(output_directory):
os.mkdir(output_directory);
'''

# Document
train_docs_path = os.path.join(input_directory, 'train.dat')
input_doc_stream = open(train_docs_path, 'r');
train_docs = [];
for line in input_doc_stream:
train_docs.append(line.strip().lower());
print "successfully load all training docs from %s..." % (os.path.abspath(train_docs_path));

# Vocabulary
vocabulary_path = os.path.join(input_directory, 'voc.dat');
input_voc_stream = open(vocabulary_path, 'r');
vocab = [];
for line in input_voc_stream:
vocab.append(line.strip().lower().split()[0]);
vocab = list(set(vocab));
print "successfully load all the words from %s..." % (os.path.abspath(vocabulary_path));

    # parameter set 2
number_of_topics = 10;
alpha_mu = 0;
alpha_sigma = 1;
alpha_beta = 1.0 / len(vocab);
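    # alpha_mu and alpha_sigma parameterize the logistic-normal prior over topic proportions;
    # alpha_beta is the symmetric Dirichlet prior over each topic's word distribution.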

# parameter set 3
training_iterations = 1;

import variational_bayes
ctm_inferencer = variational_bayes.VariationalBayes();

ctm_inferencer._initialize(train_docs, vocab, number_of_topics, alpha_mu, alpha_sigma, alpha_beta);

for iteration in xrange(training_iterations):
clock = time.time();
log_likelihood = ctm_inferencer.learning();
clock = time.time() - clock;

# print 'training iteration %d finished in %f seconds: number-of-topics = %d, log-likelihood = %f' % (hdp._iteration_counter, clock, hdp._K, log_likelihood);

# gamma_path = os.path.join(output_directory, 'gamma.txt');
# numpy.savetxt(gamma_path, hdp._document_topic_distribution);

# topic_inactive_counts_path = os.path.join(output_directory, "topic_inactive_counts.txt");
# numpy.savetxt(topic_inactive_counts_path, hdp._topic_inactive_counts);

if __name__ == '__main__':
    # profile a full training run with cProfile
    cProfile.run('main()')
130 changes: 130 additions & 0 deletions launch_resume.py
@@ -0,0 +1,130 @@
import cPickle;
import optparse
import string, numpy, getopt, sys, random, time, re, pprint
import datetime, os;

import numpy;
import shutil

# model_settings_pattern = re.compile('\d+-\d+-ctm_inferencer-I(?P<iteration>\d+)-S(?P<snapshot>\d+)-aa(?P<alpha>[\d\.]+)(-smh(?P<smh>[\d]+))?(-sp(?P<sp>[\d]+)-mp(?P<mp>[\d]+))?');
model_settings_pattern = re.compile('\d+-\d+-ctm-I(?P<iteration>\d+)-S(?P<snapshot>\d+)-K(?P<topic>\d+)-am(?P<alpha_mu>[\d\.]+)-as(?P<alpha_sigma>[\d\.]+)-ab(?P<alpha_beta>[\d\.]+)');
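# a matching model directory name looks like, e.g., "160826-143000-ctm-I50-S10-K10-am0-as1-ab0.0005"
# (the timestamp and hyperparameter values are only illustrative; see the suffix construction in main() below)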

def parse_args():
parser = optparse.OptionParser()
parser.set_defaults(# parameter set 1
# input_file=None,
model_directory=None,
snapshot_index=-1,

# parameter set 2
output_directory=None,
training_iterations=-1,
snapshot_interval=-1,
)
# parameter set 1
# parser.add_option("--input_file", type="string", dest="input_file",
# help="input directory [None]");
# parser.add_option("--input_directory", type="string", dest="input_directory",
# help="input directory [None]");
parser.add_option("--model_directory", type="string", dest="model_directory",
help="model directory [None]");
parser.add_option("--snapshot_index", type="int", dest="snapshot_index",
help="snapshot index [-1]");
# parser.add_option("--training_iterations", type="int", dest="training_iterations",
# help="number of training iterations [1000]");
# parser.add_option("--dataset_name", type="string", dest="dataset_name",
# help="the corpus name [None]");

# parameter set 2
parser.add_option("--output_directory", type="string", dest="output_directory",
help="output directory [None]");
# parser.add_option("--alpha_alpha", type="float", dest="alpha_alpha",
# help="hyper-parameter for Dirichlet process of cluster [1]")
# parser.add_option("--alpha_kappa", type="float", dest="alpha_kappa",
# help="hyper-parameter for top level Dirichlet process of distribution over topics [1]")
# parser.add_option("--alpha_nu", type="float", dest="alpha_nu",
# help="hyper-parameter for bottom level Dirichlet process of distribution over topics [1]")
parser.add_option("--training_iterations", type="int", dest="training_iterations",
help="number of training iterations [-1]");
parser.add_option("--snapshot_interval", type="int", dest="snapshot_interval",
help="snapshot interval [-1 (default): remain unchanged]");

(options, args) = parser.parse_args();
return options;

def main():
options = parse_args();

assert(options.model_directory != None);
model_directory = options.model_directory;

if not os.path.exists(model_directory):
sys.stderr.write("model directory %s not exists...\n" % (model_directory));
return;
model_directory = model_directory.rstrip("/");
model_settings = os.path.basename(model_directory);

assert options.snapshot_index > 0
snapshot_index = options.snapshot_index;

# load the existing model
model_snapshot_file_path = os.path.join(model_directory, "model-%d" % snapshot_index);
if not os.path.exists(model_snapshot_file_path):
sys.stderr.write("error: model snapshot file unfound %s...\n" % (model_snapshot_file_path));
return;

import variational_bayes;
ctm_inferencer = cPickle.load(open(model_snapshot_file_path, "rb"));
    print 'successfully loaded model snapshot %s...' % (os.path.join(model_directory, "model-%d" % snapshot_index));

# set the resume options
matches = re.match(model_settings_pattern, model_settings);

# training_iterations = int(matches.group('iteration'));
training_iterations = options.training_iterations;
assert training_iterations > snapshot_index;
if options.snapshot_interval == -1:
snapshot_interval = int(matches.group('snapshot'));
else:
snapshot_interval = options.snapshot_interval;
number_of_topics = int(matches.group('topic'));
alpha_mu = float(matches.group('alpha_mu'));
alpha_sigma = float(matches.group('alpha_sigma'));
alpha_beta = float(matches.group('alpha_beta'));

now = datetime.datetime.now();
suffix = now.strftime("%y%m%d-%H%M%S") + "";
suffix += "-%s" % ("ctm");
suffix += "-I%d" % (training_iterations);
suffix += "-S%d" % (snapshot_interval);
suffix += "-K%g" % (number_of_topics);
suffix += "-am%g" % (alpha_mu);
suffix += "-as%g" % (alpha_sigma);
suffix += "-ab%g" % (alpha_beta);

assert options.output_directory != None;
output_directory = options.output_directory;
output_directory = output_directory.rstrip("/");
output_directory = os.path.join(output_directory, suffix);
assert (not os.path.exists(os.path.abspath(output_directory)));
os.mkdir(os.path.abspath(output_directory));

    # copy the corresponding snapshots over so the resumed run's output directory is self-contained
    shutil.copy(model_snapshot_file_path, os.path.join(output_directory, "model-" + str(snapshot_index)));
    shutil.copy(os.path.join(model_directory, "exp_beta-" + str(snapshot_index)), os.path.join(output_directory, "exp_beta-" + str(snapshot_index)));

for iteration in xrange(snapshot_index, training_iterations):
# clock = time.time();
log_likelihood = ctm_inferencer.learning();
# clock = time.time()-clock;
# print 'training iteration %d finished in %f seconds: number-of-clusters = %d, log-likelihood = %f' % (dpgm._iteration_counter, clock, dpgm._K, log_likelihood);

if ((ctm_inferencer._counter) % snapshot_interval == 0):
ctm_inferencer.export_beta(os.path.join(output_directory, 'exp_beta-' + str(ctm_inferencer._counter)));
model_snapshot_path = os.path.join(output_directory, 'model-' + str(ctm_inferencer._counter));
cPickle.dump(ctm_inferencer, open(model_snapshot_path, 'wb'));

model_snapshot_path = os.path.join(output_directory, 'model-' + str(ctm_inferencer._counter));
cPickle.dump(ctm_inferencer, open(model_snapshot_path, 'wb'));

if __name__ == '__main__':
main()