Skip to content

Commit

Permalink
Implemented halite clustering, only output not input
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremysalwen committed Feb 21, 2016
1 parent dbdbbb4 commit e66df3c
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 76 deletions.
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@

ENABLE_HALITE ?= 1
ifeq ($(ENABLE_HALITE), 1)
CFLAGS = -DENABLE_HALITE `pkg-config --cflags opencv`
CFLAGS = -DENABLE_HALITE -Ihalite/include `pkg-config --cflags opencv`
LDFLAGS = halite/libhalite.a `pkg-config --libs opencv` -ldb_cxx
endif


CC = g++
CFLAGS += -O2 -Wall -std=c++11 `pkg-config --cflags libxml-2.0`
EOBJECTS = cextractcontexts.o
COBJECTS = cclustercontexts.o
COBJECTS += cclustercontexts.o
VOBJECTS = cexpandvocab.o
ROBJECTS = crelabelcorpus.o
INCFLAGS =
LDFLAGS = -lboost_filesystem -lboost_system -lboost_program_options -lboost_iostreams -lmlpack -larmadillo
LDFLAGS += -lboost_filesystem -lboost_system -lboost_program_options -lboost_iostreams -lmlpack -larmadillo
LIBS =

all: CExtractContexts CClusterContexts CExpandVocab CRelabelCorpus
Expand Down
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,25 @@ vector is D IEEE-754 floats. The vectors are just concatenated and there
is no padding.

## Clusters Directory
Directory containing text files N.txt which contain the clusters generated from the contexts of the Nth word in the vocabulary. Each line is a whitespace separated vector, representing the center of one of the clusters.
Directory containing text files N.*.txt which contain the clusters
generated from the contexts of the Nth word in the vocabulary.

Depending on the clustering mode, they will be in different formats.

If using the kmeans clustering mode: N.centers.txt will have on each
line a whitespace separated vector, representing the center of one of
the clusters.

If using the halite clustering mode: N.halite.txt will be a sequence
of "Beta Clusters". Each Beta Cluster is written as four lines (values
within a line are whitespace separated):

Correlation Cluster number\n
Vector of relevance (one 0/1 flag per dimension)\n
Vector of lower bounds\n
Vector of upper bounds\n

Where each correlation cluster may be composed of multiple beta clusters.

## Expanded Vocab file

Expand Down
207 changes: 137 additions & 70 deletions cclustercontexts.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -30,90 +30,157 @@
#include <boost/program_options.hpp>
#include <boost/iostreams/device/mapped_file.hpp>

#include <armadillo>

#include <mlpack/core.hpp>
#include <mlpack/methods/kmeans/kmeans.hpp>

#include "mlpack/cosinesqrkernel.hpp"

#ifdef ENABLE_HALITE
#include "halite/include/HaliteClustering.h"
#include "halite/include/PointSource.h"
#endif

// Short aliases for the library namespaces used in this file.
namespace po=boost::program_options;
namespace km=mlpack::kmeans;
#ifdef ENABLE_HALITE
// The Halite headers are only included when ENABLE_HALITE is defined, so the
// alias has to be guarded as well; otherwise a build without Halite fails to
// compile on an undeclared namespace.
namespace hl=Halite;
#endif

int cluster_contexts(std::string& contextdir,const std::string& clusterdir, size_t numclust, int vecdim) {
for (boost::filesystem::directory_iterator itr(contextdir); itr!=boost::filesystem::directory_iterator(); ++itr) {
std::string path=itr->path().string();
if(!boost::algorithm::ends_with(path,".vectors")) {
continue;
}
if(boost::filesystem::file_size(itr->path())==0) {
continue;
}
std::cout << path << '\n';
boost::iostreams::mapped_file_source file(itr->path());
size_t numpoints=file.size()/(vecdim*sizeof(float));
std::cout << numpoints << " points" <<std::endl;
const arma::fmat data((float*)file.data(), vecdim, numpoints, false,true);

numclust=std::min(numpoints,numclust);

arma::Col<size_t> assignments(numpoints);

arma::fmat centroids(vecdim,numclust);
km::KMeans<CosineSqrKernel> k;
/// Clustering back-ends selectable on the command line.
enum ClusterAlgos {
    /// K-means using mlpack's KMeans with the CosineSqrKernel metric (the default).
    SphericalKMeans = 0,
    /// Halite correlation clustering; only usable when built with ENABLE_HALITE.
    HaliteAlgo = 1
};

/**
 * Cluster every non-empty "*.vectors" file found directly inside contextdir
 * and write one result file per input file into clusterdir.
 *
 * Each input file is memory-mapped and viewed (without copying) as a
 * vecdim x numpoints float matrix, i.e. vecdim consecutive floats per point.
 *
 * Output, named after the input file with its extension replaced:
 *  - SphericalKMeans: "<name>.centers.txt", one cluster centre per line,
 *    components separated by spaces.
 *  - HaliteAlgo: "<name>.halite.txt", four lines per beta cluster:
 *    correlation cluster number, 0/1 relevance flags, denormalized lower
 *    bounds, denormalized upper bounds.
 *
 * @param algorithm  clustering back-end to run on every file
 * @param contextdir directory that is scanned (non-recursively) for inputs
 * @param clusterdir directory that receives the output files
 * @param numclust   upper bound on the number of k-means clusters; for each
 *                   file it is limited to that file's number of points
 * @param vecdim     number of floats per vector; must be positive
 * @return 0 on success, 1 if vecdim is not positive, if Halite was requested
 *         but compiled out, or if an output file cannot be opened
 */
int cluster_contexts(ClusterAlgos algorithm, std::string& contextdir,const std::string& clusterdir, size_t numclust, int vecdim) {
    namespace fs=boost::filesystem;

    // A non-positive dimension would make the points-per-file division below
    // meaningless (division by zero for 0).
    if(vecdim <= 0) {
        std::cerr << "Error: vector dimension must be positive\n";
        return 1;
    }
#ifndef ENABLE_HALITE
    // Fail before touching any file instead of exiting in the middle of the loop.
    if(algorithm == HaliteAlgo) {
        std::cerr<<"Error: Attempted to use Halite clustering when it was disabled at compile time\n";
        return 1;
    }
#endif
    const size_t vecbytes=static_cast<size_t>(vecdim)*sizeof(float);

    for (fs::directory_iterator itr(contextdir); itr!=fs::directory_iterator(); ++itr) {
        const std::string path=itr->path().string();
        if(!boost::algorithm::ends_with(path,".vectors")) {
            continue;
        }
        // Empty files hold no points; skip them before trying to map them.
        if(fs::file_size(itr->path())==0) {
            continue;
        }
        std::cout << path << '\n';
        boost::iostreams::mapped_file_source file(itr->path());
        const size_t numpoints=file.size()/vecbytes;
        std::cout << numpoints << " points" <<std::endl;
        if(numpoints==0) {
            std::cerr << "Warning: " << path << " is smaller than one vector, skipping\n";
            continue;
        }
        if(file.size()%vecbytes!=0) {
            std::cerr << "Warning: " << path << " has trailing bytes that do not form a whole vector\n";
        }

        // Wrap the mapping in place (copy_aux_mem=false, strict=true). The cast
        // drops const only because this constructor takes a non-const pointer;
        // the matrix itself is const.
        const arma::fmat data(const_cast<float*>(reinterpret_cast<const float*>(file.data())), vecdim, numpoints, false,true);

        fs::path outpath=fs::path(clusterdir) / itr->path().filename();

        if(algorithm == SphericalKMeans) {
            // Limit the cluster count for THIS file only; numclust itself must
            // stay untouched so one small file does not shrink all later runs.
            const size_t k=std::min(numpoints,numclust);

            arma::Col<size_t> assignments(numpoints);
            arma::fmat centroids(vecdim,k);
            km::KMeans<CosineSqrKernel> kmeans;
            kmeans.Cluster(data, k, assignments,centroids);

            outpath.replace_extension(".centers.txt");
            std::ofstream clusterfile(outpath.string());
            if(!clusterfile) {
                std::cerr << "Error: could not open " << outpath.string() << " for writing\n";
                return 1;
            }
            for(size_t i=0; i<k; i++) {
                for(int j=0; j<vecdim; j++) {
                    clusterfile << centroids(j,i) << " ";
                }
                clusterfile << '\n';
            }
        }
#ifdef ENABLE_HALITE
        else if(algorithm == HaliteAlgo) {
            hl::PackedArrayPointSource<float> pts(data.memptr(), vecdim, numpoints);
            // NOTE(review): the tuning constants are passed through unchanged;
            // their meaning is defined by the HaliteClustering constructor.
            hl::HaliteClustering<float> h(pts, hl::NormalizationMode::Independent, (2*vecdim), -1, 1e-10, 4, 1, 1, DB_HASH, 0);
            h.findCorrelationClusters();
            std::shared_ptr<hl::Classifier<float> > classifier=h.getClassifier();
            std::shared_ptr<hl::Normalization<float> > normalization=classifier->normalization;
            std::vector<float> denormMin(vecdim), denormMax(vecdim);

            outpath.replace_extension(".halite.txt");
            std::ofstream clusterfile(outpath.string());
            if(!clusterfile) {
                std::cerr << "Error: could not open " << outpath.string() << " for writing\n";
                return 1;
            }
            for(const hl::BetaCluster<float>& b: classifier->betaClusters) {
                // Bounds are written after passing them through the
                // classifier's denormalize().
                normalization->denormalize(b.min.begin(), denormMin.begin());
                normalization->denormalize(b.max.begin(), denormMax.begin());

                clusterfile << b.correlationCluster<<"\n";
                for(unsigned char c: b.relevantDimension) {
                    clusterfile << (c?"1 ":"0 ");
                }
                clusterfile<<"\n";
                for(float f:denormMin) {
                    clusterfile << f << " ";
                }
                clusterfile <<"\n";
                for(float f:denormMax) {
                    clusterfile << f << " ";
                }
                clusterfile<<"\n";
            }
        }
#endif
    }
    return 0;
}


int main(int argc, char** argv) {
std::string contextdir;
std::string clusterdir;
size_t numclust;
unsigned int dim;
std::string contextdir;
std::string clusterdir;
size_t numclust;
unsigned int dim;

po::options_description desc("CClusterContexts Options");
desc.add_options()
po::options_description desc("CClusterContexts Options");
desc.add_options()
("help,h", "produce help message")
("contexts,i", po::value<std::string>(&contextdir)->value_name("<directory>")->required(), "directory of contexts to cluster")
("clusters,o", po::value<std::string>(&clusterdir)->value_name("<directory>")->required(), "directory to output clusters")
("numclust,n", po::value<size_t>(&numclust)->value_name("<number>")->default_value(10),"number of clusters")
("dim,d", po::value<unsigned int>(&dim)->value_name("<number>")->default_value(50),"word vector dimension")
;

po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);

if (vm.count("help")) {
std::cout << desc << "\n";
return 0;
}
try {
po::notify(vm);
} catch(po::required_option& exception) {
std::cerr << "Error: " << exception.what() << "\n";
std::cout << desc << "\n";
return 1;
}
if(!boost::filesystem::is_directory(contextdir)) {
std::cerr << "Context directory does not exist" <<std::endl;
return 2;
}
if(!boost::filesystem::is_directory(clusterdir)) {
std::cerr << "Cluster directory does not exist" <<std::endl;
return 3;
}
return cluster_contexts(contextdir, clusterdir,numclust, dim);
}
("kmeans,k", "use spherical k-means clustering (default)")
#ifdef ENABLE_HALITE
("halite,l", "use Halite clustering")
#else
("halite,l", "use Halite clustering [DISABLED AT COMPILE TIME]")
#endif
("contexts,i", po::value<std::string>(&contextdir)->value_name("<directory>")->required(), "directory of contexts to cluster")
("clusters,o", po::value<std::string>(&clusterdir)->value_name("<directory>")->required(), "directory to output clusters")
("numclust,n", po::value<size_t>(&numclust)->value_name("<number>")->default_value(10),"number of clusters")
("dim,d", po::value<unsigned int>(&dim)->value_name("<number>")->default_value(50),"word vector dimension")
;

po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);

if (vm.count("help")) {
std::cout << desc << "\n";
return 0;
}
if(vm.count("kmeans") && vm.count("halite")) {
std::cerr << "Error: Only one clustering algorithm can be selected\n";
return 0;
}

ClusterAlgos algorithm=SphericalKMeans;
if(vm.count("halite")) {
algorithm=HaliteAlgo;
}

try {
po::notify(vm);
} catch(po::required_option& exception) {
std::cerr << "Error: " << exception.what() << "\n";
std::cout << desc << "\n";
return 1;
}
if(!boost::filesystem::is_directory(contextdir)) {
std::cerr << "Context directory does not exist" <<std::endl;
return 2;
}
if(!boost::filesystem::is_directory(clusterdir)) {
std::cerr << "Cluster directory does not exist" <<std::endl;
return 3;
}
return cluster_contexts(algorithm, contextdir, clusterdir, numclust, dim);
}

0 comments on commit e66df3c

Please sign in to comment.