Turn the C++ version into a form that can be called through an API
gzp9595 committed Mar 10, 2017
1 parent 2de1b65 commit 2582693
Showing 22 changed files with 763 additions and 486 deletions.
17 changes: 11 additions & 6 deletions Makefile
@@ -1,20 +1,25 @@
dst_dir=.
include_dir=include
src_dir=src
bin_dir=.
thulac=g++ -O3 -std=c++0x -march=native -I $(src_dir)
thulac=g++ -O3 -march=native -I $(include_dir)

#all: $(dst_dir)/dat_builder $(dst_dir)/thulac $(dst_dir)/thulac_time $(dst_dir)/predict_c
# all: $(bin_dir)/thulac_test $(bin_dir)/train_c $(bin_dir)/thulac
all: $(bin_dir)/thulac $(bin_dir)/train_c

$(bin_dir)/thulac: $(src_dir)/thulac.cc $(src_dir)/*.h
$(bin_dir)/thulac: $(src_dir)/thulac.cc $(include_dir)/*.h
$(thulac) $(src_dir)/thulac.cc -o $(bin_dir)/thulac

$(bin_dir)/train_c: $(src_dir)/train_c.cc $(src_dir)/*.h
$(bin_dir)/train_c: $(src_dir)/train_c.cc $(include_dir)/*.h
$(thulac) -o $(bin_dir)/train_c $(src_dir)/train_c.cc

$(bin_dir)/thulac_test: $(src_dir)/thulac_test.cc $(include_dir)/*.h
$(thulac) -o $(bin_dir)/thulac_test $(src_dir)/thulac_test.cc

clean:
rm $(bin_dir)/thulac
rm $(bin_dir)/train_c
rm -f $(bin_dir)/thulac
rm -f $(bin_dir)/train_c
rm -f $(bin_dir)/thulac_test

pack:
tar -czvf THULAC_lite_c++_v1.tar.gz src Makefile doc README.md
28 changes: 26 additions & 2 deletions README.md
@@ -33,7 +33,7 @@ THULAC (THU Lexical Analyzer for Chinese), developed by the Tsinghua University Natural Language Processing and
####1.1. Command format
* C++ version
* ./thulac [-t2s] [-seg_only] [-deli delimiter] [-user userword.txt] Read input from and write output to the command line
* ./thulac [-t2s] [-seg_only] [-deli delimiter] [-user userword.txt] <inputfile >outputfile Read input from and write output to text files via redirection (note: both must be UTF-8 text)
* ./thulac [-t2s] [-seg_only] [-deli delimiter] [-user userword.txt] [-input inputfile] [-output outputfile] Read input from and write output to text files (note: both must be UTF-8 text)

####1.2. General parameters
-t2s Convert sentences from Traditional to Simplified Chinese
@@ -42,8 +42,32 @@ THULAC (THU Lexical Analyzer for Chinese), developed by the Tsinghua University Natural Language Processing and
-filter Use a filter to remove words that carry little meaning, such as "可以" ("can").
-user userword.txt Set a user dictionary; words from the user dictionary are tagged uw. One word per line, UTF-8 encoded (not yet available in the Python version)
-model_dir dir Set the folder containing the model files; defaults to models/
-input inputfile Set the input file path
-output outputfile Set the output file path

####1.3. Using the segmentation and POS tagging models
####1.3. API usage example

The new version of THULAC provides segmentation and POS tagging interfaces. Copy the `include` folder into the `include` directory of your own project, then add `#include "thulac.h"` to your program to call the functionality THULAC provides.

See the `src/thulac.cc` file for a concrete usage example.

####1.4. API parameters

First instantiate the `THULAC` class; the following interfaces can then be called:

* `int init(const char* model_path = NULL, const char* user_path = NULL, int just_seg = 0, int t2s = 0, int ufilter = 0, char separator = '_');` Initializes the class and applies custom settings.

user_path Set a user dictionary; words from the user dictionary are tagged uw. One word per line, UTF-8 encoded
t2s Default False; whether to convert sentences from Traditional to Simplified Chinese
just_seg Default False; whether to perform segmentation only, without POS tagging
ufilter Default False; whether to use a filter to remove words that carry little meaning, such as "可以" ("can")
model_path Set the folder containing the model files; defaults to models/
separator Defaults to '_'; the separator between a word and its POS tag
* `int cut(const std::string&, THULAC_result& result);` Takes a string to be segmented and POS tagged, plus a variable of type THULAC_result; the result is stored in `result`.

`THULAC_result` is a typedef of `std::vector<std::pair<std::string, std::string> >`, i.e. the `cut` function returns its result as `std::vector<std::pair<word, POS tag> >`. If only segmentation is performed, the POS tag is '' (the empty string).
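
To make the interface description above concrete, here is a minimal calling sketch. It is an illustration only, not the official example (see `src/thulac.cc` for that): it assumes the models have been downloaded into `models/`, that the `include` folder is on the compiler's include path, and the input sentence is arbitrary.

```cpp
// Minimal sketch of calling the interfaces documented above.
// Assumption: models were downloaded into "models/".
#include <cstddef>
#include <iostream>
#include "thulac.h"

int main() {
    THULAC lac;
    // Arguments: model_path, user_path, just_seg, t2s, ufilter, separator
    lac.init("models/", NULL, 0, 0, 0, '_');

    THULAC_result result;              // std::vector<std::pair<word, POS tag> >
    lac.cut("今天天气不错", result);    // any UTF-8 input string works here

    for (std::size_t i = 0; i < result.size(); ++i) {
        std::cout << result[i].first << "_" << result[i].second << std::endl;
    }
    return 0;
}
```

Compiled with flags mirroring the Makefile above (e.g. `g++ -O3 -I include example.cc -o example`), this prints one word/tag pair per line.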

####1.5. Using the segmentation and POS tagging models

THULAC requires segmentation and POS tagging models. Users can register at [thulac.thunlp.org](http://thulac.thunlp.org) by filling in personal information, download the models, and place them in THULAC's root directory, or use the parameter `-model_dir dir` to specify where the models are located.

File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions src/cb_ngram_feature.h → include/cb_ngram_feature.h
@@ -108,7 +108,7 @@ class NGramFeature{
key.push_back(right);key.push_back(right2);key.push_back(SEPERATOR);key.push_back('4');
indexer.get_index(key);
}
};
}

int put_values(int*sequence,int len){
if(len>=this->max_length){
@@ -142,9 +142,10 @@ class NGramFeature{
if((base=bi_bases[i+3])!=-1)
add_values(value_offset,base,52,NULL);
}
return 0;
}

};
int update_weights(int*sequence,int len,int* results,int delta,long steps){
void update_weights(int*sequence,int len,int* results,int delta,long steps){
find_bases(dat_size,SENTENCE_BOUNDARY,SENTENCE_BOUNDARY,uni_bases[0],bi_bases[0]);
find_bases(dat_size,SENTENCE_BOUNDARY,sequence[0],uni_bases[0],bi_bases[1]);
for(int i=0;i+1<len;i++)
13 changes: 2 additions & 11 deletions src/cb_tagging_decoder.h → include/cb_tagging_decoder.h
@@ -52,10 +52,6 @@ class TaggingDecoder{
int**label_trans_pre;
int**label_trans_post;

///* used for post-processing */
int threshold;
int* allow_com;

///* used for post-processing _ tagging */
int tag_size;//number of POS tags
int** label_looking_for;
@@ -117,10 +113,7 @@ TaggingDecoder::TaggingDecoder(){
this->label_trans=NULL;
label_trans_pre=NULL;
label_trans_post=NULL;
this->threshold=0;

// this->allow_sep=new int[this->max_length];
this->allow_com=new int[this->max_length];


this->tag_size=0;
//this->is_good_choice=NULL;
@@ -162,14 +155,11 @@ TaggingDecoder::~TaggingDecoder(){
free(label_trans_pre);
free(label_trans_post);

// delete[](allow_sep);
delete[](allow_com);

if(model!=NULL)for(int i=0;i<model->l_size;i++){
if(label_looking_for)delete[](label_looking_for[i]);
};
delete[](label_looking_for);
delete[](is_good_choice);

if(pocs_to_tags){
for(int i=1;i<16;i++){
@@ -454,6 +444,7 @@ int TaggingDecoder::segment(RawSentence& raw, POCGraph& graph, TaggedSentence& t
//if((i+1)<len)putchar(' ');//print a space at each segmentation position
}
}
return 1;
}
void TaggingDecoder::get_seg_result(SegmentedSentence& ss){
ss.clear();
6 changes: 3 additions & 3 deletions src/cb_tagging_learner.h → include/cb_tagging_learner.h
@@ -212,8 +212,8 @@ void TaggingLearner::train(const char*training_file,
//model_file
int l_size=tag_indexer.list.size();
int f_size=kv.size();
fprintf(stderr,"number of labels: %d\n",l_size);
fprintf(stderr,"number of features: %d\n",f_size);
printf("number of labels: %d\n",l_size);
printf("number of features: %d\n",f_size);
permm::Model* model=new permm::Model(l_size,f_size);
model->save(model_file);
delete model;
Expand Down Expand Up @@ -334,4 +334,4 @@ void TaggingLearner::train(const char*training_file,



}//end of thulac
}//end of thulac
2 changes: 1 addition & 1 deletion src/dat.h → include/dat.h
@@ -128,7 +128,7 @@ class DAT{
}
return -1;
}
int update(const Word& word,int value,int post=0){
void update(const Word& word,int value,int post=0){
int base=match(word,post);
if(base>=0){
dat[base].base=value;
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
