Showing 67 changed files with 754 additions and 5,106 deletions.
Binary files added:
bin1/org/gesis/promoss/tools/probabilistic/ArmSampler$Envelope.class (+834 Bytes)
bin1/org/gesis/promoss/tools/probabilistic/ArmSampler$Metropolis.class (+645 Bytes)
bin1/org/gesis/promoss/tools/probabilistic/DirichletEstimation$GammaPolyaArms.class (+1.85 KB)
bin1/org/gesis/promoss/tools/probabilistic/DirichletEstimation$GammaPolyaParams.class (+553 Bytes)
.../org/gesis/promoss/tools/probabilistic/DirichletEstimation$LBFGSDirichletEstimation.class (+2.98 KB)
.../gesis/promoss/tools/probabilistic/DirichletEstimation$LBFGSDirichletEstimationDCTM.class (+3.26 KB)
...gesis/promoss/tools/probabilistic/DirichletEstimation$LBFGSDirichletEstimationDCTM2.class (+3.12 KB)
bin1/org/gesis/promoss/tools/probabilistic/RandomSamplers$CrpData.class (+792 Bytes)
@@ -30,7 +30,7 @@ First steps
***************************
Building the jar file
***************************
You can build the promoss.jar using Ant. Go to the directory of the extracted promoss.tar.gz file (in which the build.xml is located) and enter the command:
-ant || ant build-jar
+ant; ant build-jar

(The Ant build might yield errors for classes under development; these can be ignored.)
@@ -39,6 +39,77 @@ Demo files
***************************
If you would like demo files to play around with, just send an email to [email protected]
###########################
Latent Dirichlet Allocation (LDA)
###########################
Collapsed stochastic variational inference for LDA with an asymmetric document-topic prior.
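As background, a common generative formulation of LDA with an asymmetric document-topic prior is sketched below (standard textbook notation, assumed here rather than taken from the code; the -alpha parameter described later sets the initial scale alpha_0):

  theta_d ~ Dirichlet(alpha_0 * m)    (document-topic distribution; asymmetric base measure m)
  phi_k   ~ Dirichlet(beta)           (topic-word distribution for each of the T topics)
  z_dn    ~ Categorical(theta_d)      (topic assignment of word n in document d)
  w_dn    ~ Categorical(phi_{z_dn})   (observed word)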
***************************
Example command line usage
***************************
java -Xmx11000M -jar promoss.jar -directory demo/ml_demo/ -method "LDA" -MIN_DICT_WORDS 0 -T 5
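In this example, -Xmx11000M is the standard JVM flag raising the heap limit to roughly 11 GB (adjust to your machine), -method "LDA" selects this model, -MIN_DICT_WORDS 0 keeps every word in the dictionary, and -T 5 truncates the model at five topics; the parameters are described in detail below.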
***************************
Input files
***************************
The simplest way to feed your documents into the topic model is the corpus.txt file, which can contain raw documents (each line corresponds to one document). From this corpus.txt, a file called wordsets is created, holding the processed documents in SVMlight format. Alternatively, you can directly provide the wordsets file and a words.txt dictionary, in which the line number (starting with 0) corresponds to the word ID in the SVMlight file.
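For illustration only, assuming the common SVMlight-style convention of word_id:count pairs with one document per line (check a generated wordsets file for the exact layout), a wordsets file for two short documents might look like:

#Example file (hypothetical):#
0:2 3:1 7:4
1:1 3:2 5:1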
---------------------------
corpus.txt
---------------------------
Each line corresponds to a document; the words of a document are separated by spaces. (Alternatively, one can input raw text and set the -processed parameter to false in order to use library-specific code for splitting words.)
#Example file:#
exist distribut origin softwar distributor agre gpl
gpl establish term distribut origin softwar even goe unmodifi word distribut gpl softwar one agre
dynam link constitut make deriv work allow dynam link long rule follow code make deriv work rule
gpl also deal deriv work link creat deriv work gpl affect gpl defin scope copyright law gpl section
---------------------------
words.txt
---------------------------
This optional file gives the vocabulary, one word per row. The line numbers (starting at 0) correspond to the word indices in the topic-word matrix.
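For example, given the following hypothetical words.txt, "exist" receives word ID 0, "distribut" ID 1 and "origin" ID 2, matching both the IDs in the wordsets file and the row indices of the topic-word matrix:

#Example file (hypothetical):#
exist
distribut
origin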
***************************
Output files
***************************
Cluster descriptions (e.g. the means of geographical clusters, bins of timestamps etc.) are saved in the cluster_desc/ folder.
After every SAVE_STEP iterations (10 by default), the learned parameters are stored in the output_Promoss/ subfolder, with the iteration number as folder name. The clusters_X file contains the topic loadings of each cluster of the Xth metadata. The topktopics file contains the top words of each topic (the number of returned top words can be set via the -topk parameter).
***************************
Mandatory parameter
***************************
-directory String. Gives the directory containing the input files (corpus.txt, or wordsets and words.txt).
***************************
Optional parameters
***************************
-T Integer. Number of topics (truncation level).
-RUNS Integer. Number of iterations the sampler will run. Default: 200
-SAVE_STEP Integer. Number of iterations after which the learned parameters are saved. Default: 10
-TRAINING_SHARE Double. Gives the share of documents which are used for training (0 to 1). Default: 1
-BATCHSIZE Integer. Batch size for topic estimation. Default: 128
-BURNIN Integer. Number of iterations until the topics are updated. Default: 200
-INIT_RAND Double. Topic-word counts are initialised as INIT_RAND * RANDOM(). Default: 0
-MIN_DICT_WORDS Integer. If the words.txt file is missing, words.txt is created from the words which occur at least MIN_DICT_WORDS times in the corpus. Default: 100
-save_prefix String. If given, this string is used as a prefix for all output files.
-alpha Double. Initial value of alpha_0. Default: 1
-rhokappa Double. Initial value of kappa, a parameter for the learning rate of topics (see the step-size sketch after this list). Default: 0.5
-rhotau Integer. Initial value of tau, a parameter for the learning rate of topics. Default: 64
-rhos Integer. Initial value of s, a parameter for the learning rate of topics. Default: 1
-rhokappa_document Double. Initial value of kappa, a parameter for the learning rate of the document-topic distribution. Default: rhokappa
-rhotau_document Integer. Initial value of tau, a parameter for the learning rate of the document-topic distribution. Default: rhotau
-rhos_document Integer. Initial value of s, a parameter for the learning rate of the document-topic distribution. Default: rhos
-processed Boolean. Tells whether the text is already processed; if false, words are split with complex regular expressions instead of by spaces. Default: true
-stemming Boolean. Activates word stemming in case no words.txt/wordsets file is given.
-stopwords Boolean. Activates stopword removal in case no words.txt/wordsets file is given.
-language String. Currently "en" and "de" are available languages for stemming.
-store_empty Boolean. Determines whether empty documents are omitted from the final document-topic matrix or their topic distribution is predicted using the context. Default: true
-topk Integer. Number of top words returned in the topktopics output file.
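The rhokappa/rhotau/rhos values jointly define a step size of the usual stochastic variational inference form (a standard formulation assumed here; the implementation may differ in detail):

  rho_t = s / (tau + t)^kappa,   where t is the current iteration

and the *_document variants play the same role for the document-topic updates. For illustration, a run combining several of the optional parameters above might look like this (all values are arbitrary examples, not recommendations):

java -Xmx11000M -jar promoss.jar -directory demo/ml_demo/ -method "LDA" -T 50 -RUNS 500 -SAVE_STEP 50 -BATCHSIZE 256 -MIN_DICT_WORDS 10 -stemming true -stopwords true -language en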
###########################
Hierarchical Multi-Dirichlet Process Topic Model (Promoss)
###########################