Turn the C++ version into a form that can be called through an API
gzp9595 committed Mar 10, 2017
1 parent 2de1b65 commit 2582693
Showing 22 changed files with 763 additions and 486 deletions.
17 changes: 11 additions & 6 deletions Makefile
@@ -1,20 +1,25 @@
dst_dir=.
include_dir=include
src_dir=src
bin_dir=.
thulac=g++ -O3 -std=c++0x -march=native -I $(src_dir)
thulac=g++ -O3 -march=native -I $(include_dir)

#all: $(dst_dir)/dat_builder $(dst_dir)/thulac $(dst_dir)/thulac_time $(dst_dir)/predict_c
# all: $(bin_dir)/thulac_test $(bin_dir)/train_c $(bin_dir)/thulac
all: $(bin_dir)/thulac $(bin_dir)/train_c

$(bin_dir)/thulac: $(src_dir)/thulac.cc $(src_dir)/*.h
$(bin_dir)/thulac: $(src_dir)/thulac.cc $(include_dir)/*.h
$(thulac) $(src_dir)/thulac.cc -o $(bin_dir)/thulac

$(bin_dir)/train_c: $(src_dir)/train_c.cc $(src_dir)/*.h
$(bin_dir)/train_c: $(src_dir)/train_c.cc $(include_dir)/*.h
$(thulac) -o $(bin_dir)/train_c $(src_dir)/train_c.cc

$(bin_dir)/thulac_test: $(src_dir)/thulac_test.cc $(include_dir)/*.h
$(thulac) -o $(bin_dir)/thulac_test $(src_dir)/thulac_test.cc

clean:
rm $(bin_dir)/thulac
rm $(bin_dir)/train_c
rm -f $(bin_dir)/thulac
rm -f $(bin_dir)/train_c
rm -f $(bin_dir)/thulac_test

pack:
tar -czvf THULAC_lite_c++_v1.tar.gz src Makefile doc README.md
28 changes: 26 additions & 2 deletions README.md
@@ -33,7 +33,7 @@ THULAC (THU Lexical Analyzer for Chinese), developed by the Tsinghua University Natural Language Processing and
####1.1. Command format
* C++ version
* ./thulac [-t2s] [-seg_only] [-deli delimiter] [-user userword.txt] Read input from and write output to the command line
* ./thulac [-t2s] [-seg_only] [-deli delimiter] [-user userword.txt] <inputfile >outputfile Read input from and write output to text files via redirection (note: both must be UTF-8 text)
* ./thulac [-t2s] [-seg_only] [-deli delimiter] [-user userword.txt] [-input inputfile] [-output outputfile] Read input from and write output to text files (note: both must be UTF-8 text)

####1.2. General parameters
-t2s Convert sentences from Traditional to Simplified Chinese
@@ -42,8 +42,32 @@ THULAC (THU Lexical Analyzer for Chinese), developed by the Tsinghua University Natural Language Processing and
-filter Use a filter to remove words that carry little meaning, such as "可以" ("can").
-user userword.txt Set a user dictionary; words from the user dictionary are tagged uw. One word per line, UTF-8 encoded (not yet available in the Python version)
-model_dir dir Set the folder containing the model files; defaults to models/
-input inputfile Set the input file path
-output outputfile Set the output file path

####1.3. Using the segmentation and POS tagging models
####1.3. API usage example

The new version of THULAC provides segmentation and POS tagging interfaces. Copy the `include` folder into the `include` directory of your own project, then add `#include "thulac.h"` to your program to call the functionality THULAC provides.

See the `src/thulac.cc` file for a concrete usage example.

####1.4. API parameters

First instantiate the `THULAC` class; the following interfaces can then be called:

* `int init(const char* model_path = NULL, const char* user_path = NULL, int just_seg = 0, int t2s = 0, int ufilter = 0, char separator = '_');` Initializes the class and applies custom settings.

user_path Set a user dictionary; words from the user dictionary are tagged uw. One word per line, UTF-8 encoded
t2s Default False; whether to convert sentences from Traditional to Simplified Chinese
just_seg Default False; whether to perform segmentation only, without POS tagging
ufilter Default False; whether to use a filter to remove words that carry little meaning, such as "可以" ("can")
model_path Set the folder containing the model files; defaults to models/
separator Defaults to '_'; the separator between a word and its POS tag
* `int cut(const std::string&, THULAC_result& result);` Takes a string to be segmented and POS tagged, plus a variable of type THULAC_result; the result is stored in `result`.

`THULAC_result` is a typedef of `std::vector<std::pair<std::string, std::string> >`, i.e. the `cut` function returns its result as `std::vector<std::pair<word, POS tag> >`. If only segmentation is performed, the POS tag is '' (the empty string).
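
To make the interface description above concrete, here is a minimal calling sketch. It is an illustration only, not the official example (see `src/thulac.cc` for that): it assumes the models have been downloaded into `models/`, that the `include` folder is on the compiler's include path, and the input sentence is arbitrary.

```cpp
// Minimal sketch of calling the interfaces documented above.
// Assumption: models were downloaded into "models/".
#include <cstddef>
#include <iostream>
#include "thulac.h"

int main() {
    THULAC lac;
    // Arguments: model_path, user_path, just_seg, t2s, ufilter, separator
    lac.init("models/", NULL, 0, 0, 0, '_');

    THULAC_result result;              // std::vector<std::pair<word, POS tag> >
    lac.cut("今天天气不错", result);    // any UTF-8 input string works here

    for (std::size_t i = 0; i < result.size(); ++i) {
        std::cout << result[i].first << "_" << result[i].second << std::endl;
    }
    return 0;
}
```

Compiled with flags mirroring the Makefile above (e.g. `g++ -O3 -I include example.cc -o example`), this prints one word/tag pair per line.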

####1.5. Using the segmentation and POS tagging models

THULAC requires segmentation and POS tagging models. Users can register at [thulac.thunlp.org](http://thulac.thunlp.org) by filling in personal information, download the models, and place them in THULAC's root directory, or use the parameter `-model_dir dir` to specify where the models are located.

File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions src/cb_ngram_feature.h → include/cb_ngram_feature.h
@@ -108,7 +108,7 @@ class NGramFeature{
key.push_back(right);key.push_back(right2);key.push_back(SEPERATOR);key.push_back('4');
indexer.get_index(key);
}
};
}

int put_values(int*sequence,int len){
if(len>=this->max_length){
@@ -142,9 +142,10 @@ class NGramFeature{
if((base=bi_bases[i+3])!=-1)
add_values(value_offset,base,52,NULL);
}
return 0;
}

};
int update_weights(int*sequence,int len,int* results,int delta,long steps){
void update_weights(int*sequence,int len,int* results,int delta,long steps){
find_bases(dat_size,SENTENCE_BOUNDARY,SENTENCE_BOUNDARY,uni_bases[0],bi_bases[0]);
find_bases(dat_size,SENTENCE_BOUNDARY,sequence[0],uni_bases[0],bi_bases[1]);
for(int i=0;i+1<len;i++)
13 changes: 2 additions & 11 deletions src/cb_tagging_decoder.h → include/cb_tagging_decoder.h
@@ -52,10 +52,6 @@ class TaggingDecoder{
int**label_trans_pre;
int**label_trans_post;

///* used for post-processing */
int threshold;
int* allow_com;

///* used for post-processing _ tagging */
int tag_size;//number of POS tags
int** label_looking_for;
@@ -117,10 +113,7 @@ TaggingDecoder::TaggingDecoder(){
this->label_trans=NULL;
label_trans_pre=NULL;
label_trans_post=NULL;
this->threshold=0;

// this->allow_sep=new int[this->max_length];
this->allow_com=new int[this->max_length];


this->tag_size=0;
//this->is_good_choice=NULL;
@@ -162,14 +155,11 @@ TaggingDecoder::~TaggingDecoder(){
free(label_trans_pre);
free(label_trans_post);

// delete[](allow_sep);
delete[](allow_com);

if(model!=NULL)for(int i=0;i<model->l_size;i++){
if(label_looking_for)delete[](label_looking_for[i]);
};
delete[](label_looking_for);
delete[](is_good_choice);

if(pocs_to_tags){
for(int i=1;i<16;i++){
@@ -454,6 +444,7 @@ int TaggingDecoder::segment(RawSentence& raw, POCGraph& graph, TaggedSentence& t
//if((i+1)<len)putchar(' ');//print a space at each segmentation position
}
}
return 1;
}
void TaggingDecoder::get_seg_result(SegmentedSentence& ss){
ss.clear();
6 changes: 3 additions & 3 deletions src/cb_tagging_learner.h → include/cb_tagging_learner.h
@@ -212,8 +212,8 @@ void TaggingLearner::train(const char*training_file,
//model_file
int l_size=tag_indexer.list.size();
int f_size=kv.size();
fprintf(stderr,"number of labels: %d\n",l_size);
fprintf(stderr,"number of features: %d\n",f_size);
printf("number of labels: %d\n",l_size);
printf("number of features: %d\n",f_size);
permm::Model* model=new permm::Model(l_size,f_size);
model->save(model_file);
delete model;
Expand Down Expand Up @@ -334,4 +334,4 @@ void TaggingLearner::train(const char*training_file,



}//end of thulac
}//end of thulac
2 changes: 1 addition & 1 deletion src/dat.h → include/dat.h
@@ -128,7 +128,7 @@ class DAT{
}
return -1;
}
int update(const Word& word,int value,int post=0){
void update(const Word& word,int value,int post=0){
int base=match(word,post);
if(base>=0){
dat[base].base=value;
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
