Use new halite cache size limiting, and implement digifying the corpus

ChenxiCui · Feb 26, 2016 · af74bfd · af74bfd
1 parent 3f7b37c
commit af74bfd
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 64 deletions.
diff --git a/cclustercontexts.cxx b/cclustercontexts.cxx
@@ -49,7 +49,7 @@ namespace po=boost::program_options;
 namespace km=mlpack::kmeans;
 
 
-int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::string& clusterdir, size_t numclust, int vecdim) {
+int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::string& clusterdir, size_t numclust, size_t memlimitmb, int vecdim) {
   for (boost::filesystem::directory_iterator itr(contextdir); itr!=boost::filesystem::directory_iterator(); ++itr) {
     std::string path=itr->path().string();
     if(!boost::algorithm::ends_with(path,".vectors")) {
@@ -92,7 +92,15 @@ int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::
       exit(1);
 #else
     hl::PackedArrayPointSource<float> pts(data.memptr(), vecdim, numpoints);
-    hl::HaliteClustering<float> h(pts, hl::NormalizationMode::Independent, (2*vecdim), -1, 1e-10, 4, 1, 1, DB_HASH, 0);
+
+    int numLevels = 4;  // handed to HaliteClustering below and reused in the size estimate
+    uint64_t memlimit=static_cast<uint64_t>(memlimitmb)*1024*1024;  // MB -> bytes; widened first so a 32-bit size_t cannot wrap
+
+    if(static_cast<uint64_t>(numLevels) * numpoints * vecdim * sizeof(float) > memlimit) {  // estimate only drives the warning; 64-bit math avoids overflow in narrower operand types
+      std::cout<< "Database is potentially larger than the memory cache size.\n";
+      std::cout<< "Expect some temporary files to be created in the current directory.\n";
+    }
+    hl::HaliteClustering<float> h(pts, hl::NormalizationMode::Independent, (2*vecdim), -1, 1e-10, numLevels, 1, 1, DB_HASH, memlimit);
     h.findCorrelationClusters();
     shared_ptr<hl::Classifier<float> > classifier=h.getClassifier();
     classifier->denormalize();
@@ -129,6 +137,7 @@ int main(int argc, char** argv) {
   std::string contextdir;
   std::string clusterdir;
   size_t numclust;
+  size_t memlimitmb;
   unsigned int dim;
 
   po::options_description desc("CClusterContexts Options");
@@ -142,7 +151,8 @@ int main(int argc, char** argv) {
 #endif
     ("contexts,i", po::value<std::string>(&contextdir)->value_name("<directory>")->required(), "directory of contexts to cluster")
     ("clusters,o", po::value<std::string>(&clusterdir)->value_name("<directory>")->required(), "directory to output clusters")
-    ("numclust,n", po::value<size_t>(&numclust)->value_name("<number>")->default_value(10),"number of clusters")
+    ("numclust,n", po::value<size_t>(&numclust)->value_name("<number>")->default_value(10),"number of clusters (kmeans only)")
+    ("memlimit,m", po::value<size_t>(&memlimitmb)->value_name("<megabytes>")->default_value(4000),"approximate cutoff for storing database in memory vs disk (halite only)")
     ("dim,d", po::value<unsigned int>(&dim)->value_name("<number>")->default_value(50),"word vector dimension")
     ;
 
@@ -180,5 +190,7 @@ int main(int argc, char** argv) {
     std::cerr << "Cluster directory does not exist" <<std::endl;
     return 4;
   }
-  return cluster_contexts(algorithm, contextdir, clusterdir, numclust, dim);
+
+
+  return cluster_contexts(algorithm, contextdir, clusterdir, numclust, memlimitmb, dim);
 }
diff --git a/cextractcontexts.cxx b/cextractcontexts.cxx
@@ -142,7 +142,7 @@ int compute_and_output_context(const boost::circular_buffer<int>& context, const
 	return 0;
 }
 
-int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std::ifstream& vectorstream, std::string indir, std::string outdir, int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed, unsigned int prune, unsigned int fcachesize) {
+int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std::ifstream& vectorstream, std::string indir, std::string outdir, int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed, boost::optional<const std::string&> digit_rep, unsigned int prune, unsigned int fcachesize) {
 	boost::unordered_map<std::string, int> vocabmap;
 	std::vector<std::string> vocab;
 	std::vector<float> idfs;
@@ -173,8 +173,8 @@ int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std
 	vectorstream.close();
 
 
-	int startdoci=lookup_word(vocabmap,"<s>",false);
-	int enddoci=lookup_word(vocabmap,"<\\s>",false);
+	int startdoci=lookup_word(vocabmap,"<s>",false, boost::optional<const std::string&>());
+	int enddoci=lookup_word(vocabmap,"<\\s>",false, boost::optional<const std::string&>());
 
 	unsigned int vsize=vocab.size();
 
@@ -223,7 +223,7 @@ int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std
 			for(unsigned int i=0; i<contextsize+1; i++) {
 				if(getline(corpusreader,word)) {
 					if(word==eodmarker) goto EOD;
-					int wind=lookup_word(vocabmap,word,indexed);
+					int wind=lookup_word(vocabmap, word, indexed, digit_rep);
 					context.push_back(wind);
 				}
 			}
@@ -234,7 +234,7 @@ int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std
 				if(retcode) return retcode;
 
 				context.pop_front();
-				int newind=lookup_word(vocabmap,word,indexed);
+				int newind=lookup_word(vocabmap, word, indexed, digit_rep);
 				context.push_back(newind);
 			}
 			EOD:
@@ -271,6 +271,8 @@ int main(int argc, char** argv) {
   int dim;
   unsigned int contextsize;
   std::string eod;
+
+  std::string digit_rep;
   unsigned int prune=0;
   unsigned int fcachesize=0;
   po::options_description desc("CExtractContexts Options");
@@ -285,57 +287,61 @@ int main(int argc, char** argv) {
     ("contextsize,s", po::value<unsigned int>(&contextsize)->value_name("<number>")->default_value(5),"size of context (# of words before and after)")
     ("eodmarker,e",po::value<std::string>(&eod)->value_name("<string>")->default_value("eeeoddd"),"end of document marker")
     ("preindexed","indicates the corpus is pre-indexed with the vocab file")
+    ("digify",po::value<std::string>(&digit_rep)->value_name("<string>"),"Digify number tokens by replacing all digits with the given string")
     ("prune,p",po::value<unsigned int>(&prune)->value_name("<number>"),"only output contexts for the first N words in the vocab")
     ("fcachesize,f", po::value<unsigned int>(&fcachesize)->value_name("<number>"), "maximum number of files to open at once");
 
-
 
     po::variables_map vm;
     po::store(po::parse_command_line(argc, argv, desc), vm);
 
-	if (vm.count("help")) {
-            std::cout << desc << "\n";
-            return 0;
-   }
-
-	try {
-		po::notify(vm);
-	} catch(po::required_option& exception) {
-		std::cerr << "Error: " << exception.what() << "\n";
-		std::cout << desc << "\n";
-        return 1;
-	}
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 0;
+    }
+
+    try {
+      po::notify(vm);
+    } catch(po::required_option& exception) {
+      std::cerr << "Error: " << exception.what() << "\n";
+      std::cout << desc << "\n";
+      return 1;
+    }
 
-	std::ifstream vocab(vocabf);
-	if(!vocab.good()) {
-		std::cerr << "Vocab file no good" <<std::endl;
-		return 2;
-	}
+    std::ifstream vocab(vocabf);
+    if(!vocab.good()) {
+      std::cerr << "Vocab file no good" <<std::endl;
+      return 2;
+    }
 
-	std::ifstream frequencies(idff);
-	if(!frequencies.good()) {
-		std::cerr << "Frequencies file no good" <<std::endl;
-		return 3;
-	}
+    std::ifstream frequencies(idff);
+    if(!frequencies.good()) {
+      std::cerr << "Frequencies file no good" <<std::endl;
+      return 3;
+    }
 
-	std::ifstream vectors(vecf);
-	if(!vectors.good()) {
-		std::cerr << "Vectors file no good" <<std::endl;
-		return 4;
-	}
+    std::ifstream vectors(vecf);
+    if(!vectors.good()) {
+      std::cerr << "Vectors file no good" <<std::endl;
+      return 4;
+    }
 
 
-	if(!boost::filesystem::is_directory(corpusd)) {
-		std::cerr << "Input directory does not exist" <<std::endl;
-		return 5;
-	}
+    if(!boost::filesystem::is_directory(corpusd)) {
+      std::cerr << "Input directory does not exist" <<std::endl;
+      return 5;
+    }
 
-	if(!boost::filesystem::is_directory(outd)) {
-		std::cerr << "Input directory does not exist" <<std::endl;
-		return 6;
-	}
+    if(!boost::filesystem::is_directory(outd)) {  // outd is passed as extract_contexts' outdir, so report it as the output directory
+      std::cerr << "Output directory does not exist" <<std::endl;
+      return 6;
+    }
 
-	return extract_contexts(vocab, frequencies, vectors, corpusd,outd,dim,contextsize,eod,vm.count("preindexed")>0,prune, fcachesize);
+    boost::optional<const std::string&> digit_rep_arg;
+    if(!digit_rep.empty()) {
+      digit_rep_arg=digit_rep;
+    }
+    return extract_contexts(vocab, frequencies, vectors, corpusd,outd,dim,contextsize,eod,vm.count("preindexed")>0, digit_rep_arg, prune, fcachesize);
 }
 
 
diff --git a/common.hpp b/common.hpp
@@ -27,6 +27,8 @@
 #define COMMON_H
 #include "boost/circular_buffer.hpp"
 #include "boost/unordered_map.hpp"
+#include "boost/optional.hpp"
+#include <regex>
 
 #include <armadillo>
 
@@ -35,15 +37,28 @@ enum ClusterAlgos {
   HaliteAlgo
 };
 
-int lookup_word(const boost::unordered_map<std::string, int>& vocabmap, const std::string& word, bool indexed) {
-	if(indexed) {
-		return std::stoi(word)-1;
-	} else {
-		boost::unordered_map<std::string,int>::const_iterator index=vocabmap.find(word);
-		if(index==vocabmap.end()) {
-			return 0;  //Unknown words are mapped to 0, so the first word in your vocab better be unknown
-		}
-		return index->second;}
+// Whole-token number shape (optional sign, digits, optional '.') and the single-digit matcher used for digifying.
+static std::regex numregex("[-+]?\\d*\\.?\\d+", std::regex::ECMAScript | std::regex::optimize);
+static std::regex digitregex("\\d", std::regex::ECMAScript | std::regex::optimize);
+// Returns the vocab index of word: exact match first, then (when digit_rep is set) its digified form, else 0.
+// Declared inline because a function defined in a header would otherwise clash when several translation units include it.
+inline int lookup_word(const boost::unordered_map<std::string, int>& vocabmap, const std::string& word, bool indexed, boost::optional<const std::string&> digit_rep) {
+  if(indexed) {
+    return std::stoi(word)-1;  // token is parsed as a number; NOTE(review): presumably a 1-based vocab index, confirm
+  }
+  boost::unordered_map<std::string,int>::const_iterator index = vocabmap.find(word);
+  if(index != vocabmap.end()) {
+    return index->second;
+  }
+
+  // Only number-like tokens are digified; *digit_rep is a regex_replace format string, so '$' in it is special.
+  if(digit_rep.is_initialized() && std::regex_match(word,numregex)) {
+    index=vocabmap.find(std::regex_replace(word, digitregex, *digit_rep));
+    if(index != vocabmap.end()) {
+      return index->second;
+    }
+  }
+  return 0;  //Unknown words are mapped to 0, so the first word in your vocab better be unknown
 }
 
 void compute_context(const boost::circular_buffer<int>& context, const std::vector<float>& idfs, const arma::fmat&  origvects, arma::fvec& outvec, unsigned int vecdim, unsigned int contextsize) {

diff --git a/crelabelcorpus.cxx b/crelabelcorpus.cxx
@@ -144,7 +144,7 @@ class HaliteClassifier {
 
 
 
-int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream& newvocabstream,fs::ifstream& idfstream, fs::ifstream& vecstream, fs::ifstream& centerstream, fs::path& icorpus, fs::path& ocorpus, unsigned int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed) {
+int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream& newvocabstream,fs::ifstream& idfstream, fs::ifstream& vecstream, fs::ifstream& centerstream, fs::path& icorpus, fs::path& ocorpus, unsigned int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed, boost::optional<const std::string&> digit_rep) {
 
 	boost::unordered_map<std::string, int> vocabmap;
 	std::vector<std::string> vocab;
@@ -199,8 +199,8 @@ int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream&
 		index++;
 	}
 
-	int startdoci=lookup_word(vocabmap,"<s>",false);
-	int enddoci=lookup_word(vocabmap,"<\\s>",false);
+	int startdoci=lookup_word(vocabmap,"<s>", false, boost::optional<const std::string&>());
+	int enddoci=lookup_word(vocabmap,"<\\s>", false, boost::optional<const std::string&>());
 
 	for (boost::filesystem::directory_iterator itr(icorpus); itr!=boost::filesystem::directory_iterator(); ++itr) {
 		if(itr->path().extension()!=".txt") {
@@ -229,7 +229,7 @@ int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream&
 			for(unsigned int i=0; i<contextsize+1; i++) {
 				if(getline(corpusreader,word)) {
 					if(word==eodmarker) goto EOD;
-					int wind=lookup_word(vocabmap,word,indexed);
+					int wind=lookup_word(vocabmap,word,indexed, digit_rep);
 					context.push_back(wind);
 				}
 			}
@@ -250,7 +250,7 @@ int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream&
 
 				corpuswriter <<  std::setfill ('0') << std::setw (2) << meaning << vocab[wid]<<'\n';
 				context.pop_front();
-				int nextind=lookup_word(vocabmap,word,indexed);
+				int nextind=lookup_word(vocabmap, word, indexed, digit_rep);
 				context.push_back(nextind);
 			}
 			EOD:
@@ -292,7 +292,7 @@ int main(int argc, char** argv) {
   unsigned int vecdim;
   unsigned int contextsize;
   std::string eod;
-	
+  std::string digit_rep;
   po::options_description desc("CRelabelCorpus Options");
   desc.add_options()
     ("help,h", "produce help message")
@@ -309,7 +309,7 @@ int main(int argc, char** argv) {
     ("contextsize,s", po::value<unsigned int>(&contextsize)->value_name("<number>")->default_value(5),"size of context (# of words before and after)")
     ("eodmarker",po::value<std::string>(&eod)->value_name("<string>")->default_value("eeeoddd"),"end of document marker")
     ("preindexed","indicates the corpus is pre-indexed with the vocab file")
-
+    ("digify", po::value<std::string>(&digit_rep)->value_name("<string>"), "Digify number tokens by replacing all digits with the given string")
     ;
 
 
@@ -376,5 +376,9 @@ int main(int argc, char** argv) {
     std::cerr << "Output corpus directory does not exist" <<std::endl;
     return 8;
   }
-  return relabel_corpus(format, oldvocab,newvocab,idf,vectors,centers,icorpus,ocorpus,vecdim,contextsize, eod, vm.count("preindexed")>0);
+  boost::optional<const std::string&> digit_rep_arg;  // left empty unless a non-empty --digify string was supplied
+  if(!digit_rep.empty()) {
+    digit_rep_arg=digit_rep;
+  }
+  return relabel_corpus(format, oldvocab,newvocab,idf,vectors,centers,icorpus,ocorpus,vecdim,contextsize, eod, vm.count("preindexed")>0, digit_rep_arg);  // not digit_rep: the raw string would convert to an always-set optional
 }
diff --git a/halite b/halite
+1 −1		Makefile
+16 −10		demo/Halite.cpp
+2 −2		include/HaliteClustering.h
+1 −1		include/Normalization.h
+1 −1		include/PointSource.h
+24 −24		include/arboretum/stCountingTree.h
+8 −14		src/HaliteClustering.cpp
+1 −1		src/Normalization.cpp
+9 −2		src/Utile.cpp