Skip to content

Commit

Permalink
Use new halite cache size limiting, and implement digifying the corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremysalwen committed Feb 26, 2016
1 parent 3f7b37c commit af74bfd
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 64 deletions.
20 changes: 16 additions & 4 deletions cclustercontexts.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace po=boost::program_options;
namespace km=mlpack::kmeans;


int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::string& clusterdir, size_t numclust, int vecdim) {
int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::string& clusterdir, size_t numclust, size_t memlimitmb, int vecdim) {
for (boost::filesystem::directory_iterator itr(contextdir); itr!=boost::filesystem::directory_iterator(); ++itr) {
std::string path=itr->path().string();
if(!boost::algorithm::ends_with(path,".vectors")) {
Expand Down Expand Up @@ -92,7 +92,15 @@ int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::
exit(1);
#else
hl::PackedArrayPointSource<float> pts(data.memptr(), vecdim, numpoints);
hl::HaliteClustering<float> h(pts, hl::NormalizationMode::Independent, (2*vecdim), -1, 1e-10, 4, 1, 1, DB_HASH, 0);

int numLevels = 4;
uint64_t memlimit=memlimitmb*1024*1024;

if(numLevels * numpoints * vecdim * sizeof(float) > memlimit) {
std::cout<< "Database is potentially larger than the memory cache size.\n";
std::cout<< "Expect some temporary files to be created in the current directory.\n";
}
hl::HaliteClustering<float> h(pts, hl::NormalizationMode::Independent, (2*vecdim), -1, 1e-10, numLevels, 1, 1, DB_HASH, memlimit);
h.findCorrelationClusters();
shared_ptr<hl::Classifier<float> > classifier=h.getClassifier();
classifier->denormalize();
Expand Down Expand Up @@ -129,6 +137,7 @@ int main(int argc, char** argv) {
std::string contextdir;
std::string clusterdir;
size_t numclust;
size_t memlimitmb;
unsigned int dim;

po::options_description desc("CClusterContexts Options");
Expand All @@ -142,7 +151,8 @@ int main(int argc, char** argv) {
#endif
("contexts,i", po::value<std::string>(&contextdir)->value_name("<directory>")->required(), "directory of contexts to cluster")
("clusters,o", po::value<std::string>(&clusterdir)->value_name("<directory>")->required(), "directory to output clusters")
("numclust,n", po::value<size_t>(&numclust)->value_name("<number>")->default_value(10),"number of clusters")
("numclust,n", po::value<size_t>(&numclust)->value_name("<number>")->default_value(10),"number of clusters (kmeans only)")
("memlimit,m", po::value<size_t>(&memlimitmb)->value_name("<megabytes>")->default_value(4000),"approximate cutoff for storing database in memory vs disk (halite only)")
("dim,d", po::value<unsigned int>(&dim)->value_name("<number>")->default_value(50),"word vector dimension")
;

Expand Down Expand Up @@ -180,5 +190,7 @@ int main(int argc, char** argv) {
std::cerr << "Cluster directory does not exist" <<std::endl;
return 4;
}
return cluster_contexts(algorithm, contextdir, clusterdir, numclust, dim);


return cluster_contexts(algorithm, contextdir, clusterdir, numclust, memlimitmb, dim);
}
90 changes: 48 additions & 42 deletions cextractcontexts.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ int compute_and_output_context(const boost::circular_buffer<int>& context, const
return 0;
}

int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std::ifstream& vectorstream, std::string indir, std::string outdir, int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed, unsigned int prune, unsigned int fcachesize) {
int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std::ifstream& vectorstream, std::string indir, std::string outdir, int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed, boost::optional<const std::string&> digit_rep, unsigned int prune, unsigned int fcachesize) {
boost::unordered_map<std::string, int> vocabmap;
std::vector<std::string> vocab;
std::vector<float> idfs;
Expand Down Expand Up @@ -173,8 +173,8 @@ int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std
vectorstream.close();


int startdoci=lookup_word(vocabmap,"<s>",false);
int enddoci=lookup_word(vocabmap,"<\\s>",false);
int startdoci=lookup_word(vocabmap,"<s>",false, boost::optional<const std::string&>());
int enddoci=lookup_word(vocabmap,"<\\s>",false, boost::optional<const std::string&>());

unsigned int vsize=vocab.size();

Expand Down Expand Up @@ -223,7 +223,7 @@ int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std
for(unsigned int i=0; i<contextsize+1; i++) {
if(getline(corpusreader,word)) {
if(word==eodmarker) goto EOD;
int wind=lookup_word(vocabmap,word,indexed);
int wind=lookup_word(vocabmap, word, indexed, digit_rep);
context.push_back(wind);
}
}
Expand All @@ -234,7 +234,7 @@ int extract_contexts(std::ifstream& vocabstream, std::ifstream& tfidfstream, std
if(retcode) return retcode;

context.pop_front();
int newind=lookup_word(vocabmap,word,indexed);
int newind=lookup_word(vocabmap, word, indexed, digit_rep);
context.push_back(newind);
}
EOD:
Expand Down Expand Up @@ -271,6 +271,8 @@ int main(int argc, char** argv) {
int dim;
unsigned int contextsize;
std::string eod;

std::string digit_rep;
unsigned int prune=0;
unsigned int fcachesize=0;
po::options_description desc("CExtractContexts Options");
Expand All @@ -285,57 +287,61 @@ int main(int argc, char** argv) {
("contextsize,s", po::value<unsigned int>(&contextsize)->value_name("<number>")->default_value(5),"size of context (# of words before and after)")
("eodmarker,e",po::value<std::string>(&eod)->value_name("<string>")->default_value("eeeoddd"),"end of document marker")
("preindexed","indicates the corpus is pre-indexed with the vocab file")
("digify",po::value<std::string>(&digit_rep)->value_name("<string>"),"Digify number tokens by replacing all digits with the given string")
("prune,p",po::value<unsigned int>(&prune)->value_name("<number>"),"only output contexts for the first N words in the vocab")
("fcachesize,f", po::value<unsigned int>(&fcachesize)->value_name("<number>"), "maximum number of files to open at once");



po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);

if (vm.count("help")) {
std::cout << desc << "\n";
return 0;
}

try {
po::notify(vm);
} catch(po::required_option& exception) {
std::cerr << "Error: " << exception.what() << "\n";
std::cout << desc << "\n";
return 1;
}
if (vm.count("help")) {
std::cout << desc << "\n";
return 0;
}

try {
po::notify(vm);
} catch(po::required_option& exception) {
std::cerr << "Error: " << exception.what() << "\n";
std::cout << desc << "\n";
return 1;
}

std::ifstream vocab(vocabf);
if(!vocab.good()) {
std::cerr << "Vocab file no good" <<std::endl;
return 2;
}
std::ifstream vocab(vocabf);
if(!vocab.good()) {
std::cerr << "Vocab file no good" <<std::endl;
return 2;
}

std::ifstream frequencies(idff);
if(!frequencies.good()) {
std::cerr << "Frequencies file no good" <<std::endl;
return 3;
}
std::ifstream frequencies(idff);
if(!frequencies.good()) {
std::cerr << "Frequencies file no good" <<std::endl;
return 3;
}

std::ifstream vectors(vecf);
if(!vectors.good()) {
std::cerr << "Vectors file no good" <<std::endl;
return 4;
}
std::ifstream vectors(vecf);
if(!vectors.good()) {
std::cerr << "Vectors file no good" <<std::endl;
return 4;
}


if(!boost::filesystem::is_directory(corpusd)) {
std::cerr << "Input directory does not exist" <<std::endl;
return 5;
}
if(!boost::filesystem::is_directory(corpusd)) {
std::cerr << "Input directory does not exist" <<std::endl;
return 5;
}

if(!boost::filesystem::is_directory(outd)) {
std::cerr << "Input directory does not exist" <<std::endl;
return 6;
}
// outd is the directory the extracted contexts are written to, so report it
// as the output directory (the corpusd check already reports the input one).
if(!boost::filesystem::is_directory(outd)) {
    std::cerr << "Output directory does not exist" <<std::endl;
    return 6;
}

return extract_contexts(vocab, frequencies, vectors, corpusd,outd,dim,contextsize,eod,vm.count("preindexed")>0,prune, fcachesize);
boost::optional<const std::string&> digit_rep_arg;
if(!digit_rep.empty()) {
digit_rep_arg=digit_rep;
}
return extract_contexts(vocab, frequencies, vectors, corpusd,outd,dim,contextsize,eod,vm.count("preindexed")>0, digit_rep_arg, prune, fcachesize);
}


33 changes: 24 additions & 9 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#define COMMON_H
#include "boost/circular_buffer.hpp"
#include "boost/unordered_map.hpp"
#include "boost/optional.hpp"
#include <regex>

#include <armadillo>

Expand All @@ -35,15 +37,28 @@ enum ClusterAlgos {
HaliteAlgo
};

int lookup_word(const boost::unordered_map<std::string, int>& vocabmap, const std::string& word, bool indexed) {
if(indexed) {
return std::stoi(word)-1;
} else {
boost::unordered_map<std::string,int>::const_iterator index=vocabmap.find(word);
if(index==vocabmap.end()) {
return 0; //Unknown words are mapped to 0, so the first word in your vocab better be unknown
}
return index->second;}
static std::regex numregex("[-+]?\\d*\\.?\\d+", std::regex::ECMAScript | std::regex::optimize);
static std::regex digitregex("\\d", std::regex::ECMAScript | std::regex::optimize);
/**
 * Map a corpus token to its vocabulary index.
 *
 * @param vocabmap  word -> index table, consulted only when the corpus is not
 *                  pre-indexed.
 * @param word      token read from the corpus.
 * @param indexed   when true, word is parsed as an integer and that value
 *                  minus one is returned; vocabmap is not consulted.
 * @param digit_rep when set, a token that is missing from vocabmap and matches
 *                  numregex in full is looked up a second time with every
 *                  digit replaced by this string.
 * @return the vocabulary index, or 0 when no lookup succeeds.
 *
 * std::stoi reports malformed or out-of-range tokens by throwing
 * (std::invalid_argument / std::out_of_range); nothing is caught here.
 *
 * Declared static inline because this definition lives in a header: a plain
 * external-linkage definition would be a multiple-definition error as soon as
 * two translation units of one program include it. Internal linkage also
 * matches the static regex objects used below.
 */
static inline int lookup_word(const boost::unordered_map<std::string, int>& vocabmap, const std::string& word, bool indexed, boost::optional<const std::string&> digit_rep) {
    if(indexed) {
        return std::stoi(word)-1;
    }

    // The exact token always wins over its digified form.
    auto index = vocabmap.find(word);
    if(index != vocabmap.end()) {
        return index->second;
    }

    // Fall back to the digified spelling only for tokens that are numbers.
    if(digit_rep && std::regex_match(word, numregex)) {
        // NOTE(review): the replacement is used as a regex_replace format
        // string, so '$' sequences in it are presumably expanded - confirm
        // if arbitrary replacement strings are expected.
        const std::string digified = std::regex_replace(word, digitregex, *digit_rep);
        index = vocabmap.find(digified);
        if(index != vocabmap.end()) {
            return index->second;
        }
    }
    return 0; //Unknown words are mapped to 0, so the first word in your vocab better be unknown
}

void compute_context(const boost::circular_buffer<int>& context, const std::vector<float>& idfs, const arma::fmat& origvects, arma::fvec& outvec, unsigned int vecdim, unsigned int contextsize) {
Expand Down
20 changes: 12 additions & 8 deletions crelabelcorpus.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ class HaliteClassifier {



int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream& newvocabstream,fs::ifstream& idfstream, fs::ifstream& vecstream, fs::ifstream& centerstream, fs::path& icorpus, fs::path& ocorpus, unsigned int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed) {
int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream& newvocabstream,fs::ifstream& idfstream, fs::ifstream& vecstream, fs::ifstream& centerstream, fs::path& icorpus, fs::path& ocorpus, unsigned int vecdim, unsigned int contextsize, std::string eodmarker, bool indexed, boost::optional<const std::string&> digit_rep) {

boost::unordered_map<std::string, int> vocabmap;
std::vector<std::string> vocab;
Expand Down Expand Up @@ -199,8 +199,8 @@ int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream&
index++;
}

int startdoci=lookup_word(vocabmap,"<s>",false);
int enddoci=lookup_word(vocabmap,"<\\s>",false);
int startdoci=lookup_word(vocabmap,"<s>", false, boost::optional<const std::string&>());
int enddoci=lookup_word(vocabmap,"<\\s>", false, boost::optional<const std::string&>());

for (boost::filesystem::directory_iterator itr(icorpus); itr!=boost::filesystem::directory_iterator(); ++itr) {
if(itr->path().extension()!=".txt") {
Expand Down Expand Up @@ -229,7 +229,7 @@ int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream&
for(unsigned int i=0; i<contextsize+1; i++) {
if(getline(corpusreader,word)) {
if(word==eodmarker) goto EOD;
int wind=lookup_word(vocabmap,word,indexed);
int wind=lookup_word(vocabmap,word,indexed, digit_rep);
context.push_back(wind);
}
}
Expand All @@ -250,7 +250,7 @@ int relabel_corpus(ClusterAlgos format, fs::ifstream& vocabstream,fs::ifstream&

corpuswriter << std::setfill ('0') << std::setw (2) << meaning << vocab[wid]<<'\n';
context.pop_front();
int nextind=lookup_word(vocabmap,word,indexed);
int nextind=lookup_word(vocabmap, word, indexed, digit_rep);
context.push_back(nextind);
}
EOD:
Expand Down Expand Up @@ -292,7 +292,7 @@ int main(int argc, char** argv) {
unsigned int vecdim;
unsigned int contextsize;
std::string eod;
std::string digit_rep;
po::options_description desc("CRelabelCorpus Options");
desc.add_options()
("help,h", "produce help message")
Expand All @@ -309,7 +309,7 @@ int main(int argc, char** argv) {
("contextsize,s", po::value<unsigned int>(&contextsize)->value_name("<number>")->default_value(5),"size of context (# of words before and after)")
("eodmarker",po::value<std::string>(&eod)->value_name("<string>")->default_value("eeeoddd"),"end of document marker")
("preindexed","indicates the corpus is pre-indexed with the vocab file")

("digify", po::value<std::string>(&digit_rep)->value_name("<string>"), "Digify numbers tokens by replacing all digits with the given string")
;


Expand Down Expand Up @@ -376,5 +376,9 @@ int main(int argc, char** argv) {
std::cerr << "Output corpus directory does not exist" <<std::endl;
return 8;
}
return relabel_corpus(format, oldvocab,newvocab,idf,vectors,centers,icorpus,ocorpus,vecdim,contextsize, eod, vm.count("preindexed")>0);
// Forward a digit replacement only when --digify supplied a non-empty string.
// The optional refers to digit_rep, a local of main, which outlives the
// relabel_corpus call below.
boost::optional<const std::string&> digit_rep_arg;
if(!digit_rep.empty()) {
    digit_rep_arg=digit_rep;
}
// Pass the optional, not the raw string: a std::string argument would convert
// to an engaged optional even when empty, turning digifying on unconditionally
// with an empty replacement.
return relabel_corpus(format, oldvocab,newvocab,idf,vectors,centers,icorpus,ocorpus,vecdim,contextsize, eod, vm.count("preindexed")>0, digit_rep_arg);
}

0 comments on commit af74bfd

Please sign in to comment.