Skip to content

Commit

Permalink
1、提高了用户词典的优先级,现在用户词典中的词作为后处理的最高位置
Browse files Browse the repository at this point in the history
2、增加了自动分句功能,如果一段话(读入的一行)超过了50000字,会按照常见句子结束符(。!?;!?;  7种)作为分隔符进行切分
3、现在在分词的时候,中文和英文不会被分到一起,会严格分开
4、修正了时间后处理的bug
  • Loading branch information
gzp9595 committed Jan 4, 2017
1 parent 64b301d commit d683983
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 80 deletions.
2 changes: 1 addition & 1 deletion src/cb_ngram_feature.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class NGramFeature{
this->dat=dat->dat;
this->dat_size=dat->dat_size;
this->model=model;
max_length=10000;
max_length=50000;
this->uni_bases=new int[this->max_length+2];
this->bi_bases=new int[this->max_length+4];
this->values=values;
Expand Down
2 changes: 1 addition & 1 deletion src/cb_tagging_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ class TaggingDecoder{

TaggingDecoder::TaggingDecoder(){
this->separator='_';
this->max_length=10000; //就是这里!
this->max_length=50000; //就是这里!
this->len=0;
this->sequence=new int[this->max_length];
this->allowed_label_lists=new int*[this->max_length];
Expand Down
2 changes: 1 addition & 1 deletion src/preprocess.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ class Preprocesser{
hasSinglePun = false;
}else{
senClean.push_back(c);
graph.push_back(15);
graph.push_back(9);
}
}else{
senClean.push_back(c);
Expand Down
96 changes: 51 additions & 45 deletions src/thulac.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ int main (int argc,char **argv) {
bool seg_only = false;
bool useFilter = false;
bool use_second = false;
int max_length = 50000;

int c = 1;
while(c < argc){
Expand Down Expand Up @@ -164,62 +165,67 @@ int main (int argc,char **argv) {
std::vector<thulac::RawSentence> vec;
while(1){
rtn=thulac::get_raw(oiraw);//读入生句子
// std::cout << oiraw << std::endl;

if(useT2S) {
preprocesser->clean(oiraw,traw,poc_cands);
preprocesser->T2S(traw, raw);
if(oiraw.size() > max_length) {
thulac::cut_raw(oiraw, vec, max_length);
}
else {
preprocesser -> clean(oiraw,raw,poc_cands);
vec.clear();
vec.push_back(oiraw);
}
if(raw.size()){

if(!seg_only) {
tagging_decoder->segment(raw,poc_cands,tagged);

//后处理
ns_dict->adjust(tagged);
idiom_dict->adjust(tagged);

if(user_dict){
user_dict->adjust(tagged);
}

punctuation->adjust(tagged);
timeword->adjustDouble(tagged);
negword->adjust(tagged);
if(useFilter){
filter->adjust(tagged);
}

// 输出
std::cout<<tagged;//输出
for(int vec_num = 0; vec_num < vec.size(); vec_num++) {
if(useT2S) {
preprocesser->clean(vec[vec_num],traw,poc_cands);
preprocesser->T2S(traw, raw);
}
else {
preprocesser -> clean(vec[vec_num],raw,poc_cands);
}
if(raw.size()){

if(!seg_only) {
tagging_decoder->segment(raw,poc_cands,tagged);

//后处理
ns_dict->adjust(tagged);
idiom_dict->adjust(tagged);
punctuation->adjust(tagged);
timeword->adjustDouble(tagged);
negword->adjust(tagged);
if(user_dict){
user_dict->adjust(tagged);
}
if(useFilter){
filter->adjust(tagged);
}

cws_decoder->segment(raw, poc_cands, tagged);
cws_decoder->get_seg_result(segged);
ns_dict->adjust(segged);
idiom_dict->adjust(segged);

if(user_dict){
user_dict->adjust(segged);
if(vec_num != 0) std::cout << " ";// 输出
std::cout<<tagged;//输出
}
else {

cws_decoder->segment(raw, poc_cands, tagged);
cws_decoder->get_seg_result(segged);
ns_dict->adjust(segged);
idiom_dict->adjust(segged);
punctuation->adjust(segged);
timeword->adjust(segged);
negword->adjust(segged);
if(user_dict){
user_dict->adjust(segged);
}
if(useFilter){
filter->adjust(segged);
}
if(vec_num != 0) std::cout << " ";
for(int j = 0; j < segged.size(); j++){
if(j!=0) std::cout<<" ";
std::cout<<segged[j];
}

punctuation->adjust(segged);
timeword->adjust(segged);
negword->adjust(segged);
if(useFilter){
filter->adjust(segged);
}
for(int j = 0; j < segged.size(); j++){
if(j!=0) std::cout<<" ";
std::cout<<segged[j];
}
}
}
if(rtn==-1)break;//如果到了文件末尾,退出
if(rtn==-1) break;//如果到了文件末尾,退出
putchar(rtn);//否则打印结尾符号
std::cout.flush();
//并继续
Expand Down
91 changes: 62 additions & 29 deletions src/thulac_raw.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,16 +200,17 @@ inline int get_raw_vector(std::vector<Raw>& vec,FILE* pFile=stdin,int min_char=3
std::set<int>::iterator it;
//int punInts[] = {46,63,33,12290,65311,65281};
int punInts[] = {63,33,59,12290,65311,65281,65307};
for(int i = 0; i < 9; i ++){
for(int i = 0; i < 7; i ++){
pun_set.insert(punInts[i]);
}
int current_character=-1;
int c;
while(1){//反复读取输入流
c=fgetc(pFile);
std::cout << c;
// std::cout << c;
if(c==EOF){
if(current_character!=-1)sent.push_back(current_character);
if(sent.size()) vec.push_back(sent);
return c;//end of file
}
if(!(c&0x80)){//1个byte的utf-8编码
Expand All @@ -233,8 +234,10 @@ inline int get_raw_vector(std::vector<Raw>& vec,FILE* pFile=stdin,int min_char=3
}
}
if(c<min_char){//非打印字符及空格
//return c;
current_character=32;
if(sent.size()) vec.push_back(sent);
// break;
return c;
// current_character=32;
}else{//一般ascii字符
current_character=c;//+65248;//半角转全角,放入缓存
}
Expand Down Expand Up @@ -330,33 +333,63 @@ inline int get_raw_vector(std::vector<Raw>& vec,FILE* pFile=stdin,int min_char=3
}
}
//if(current_character>0 && len != 9999)sent.push_back(current_character);
if(current_character > 0){
sent.push_back(current_character);
vec.push_back(sent);
sent.clear();
}else if(current_character > 0){
vec.push_back(sent);
sent.clear();
}
return -1;
/*
for(int i = 0; i < vec.size(); i ++){
std::cout<<"get_raw_vec:"<<vec[i]<<std::endl;
}
/*
int startIndex = 0;
int endIndex = 0;
Raw tmpRaw;
for(int i = 0; i < pun_vec.size(); i ++){
startIndex = (i == 0) ? 0 : pun_vec[i - 1];
endIndex = pun_vec[i];
if(endIndex > 1 )
std::cout<<"get_raw_vec:"<<pun_vec[i]<<std::endl;
}
*/
// std::cout << sent << std::endl;
// if(current_character > 0){
// sent.push_back(current_character);
// vec.push_back(sent);
// sent.clear();
// }else if(current_character > 0){
// vec.push_back(sent);
// sent.clear();
// }
// return -1;
// for(int i = 0; i < vec.size(); i ++){
// std::cout<<"get_raw_vec:"<<vec[i]<<std::endl;
// }
// return c;
// int startIndex = 0;
// int endIndex = 0;
// Raw tmpRaw;
// for(int i = 0; i < pun_vec.size(); i ++){
// startIndex = (i == 0) ? 0 : pun_vec[i - 1];
// endIndex = pun_vec[i];
// if(endIndex > 1 )
// std::cout<<"get_raw_vec:"<<pun_vec[i]<<std::endl;
// }
//if(!(c&0x80))sent.push_back(current_character);
//return 0;
}


// Splits an over-long input `sent` into consecutive chunks stored in `vec`.
//
// The input is first divided into sentences: a sentence ends at one of the
// seven terminator code points below, or at the last character of `sent`.
// Sentences are then packed greedily, in order, into chunks of at most
// `max_len` characters. Concatenating the chunks reproduces `sent` exactly.
//
// sent    : the characters to split (not modified).
// vec     : output; cleared first, then filled with the chunks. Never
//           receives an empty chunk.
// max_len : maximum chunk length. A single sentence longer than `max_len`
//           is cut at fixed `max_len` intervals so the bound always holds
//           (NOTE(review): callers presumably rely on this bound to fit
//           fixed-size decoder buffers -- confirm). If `max_len` is not
//           positive no bound can be honoured, so every sentence simply
//           becomes its own chunk.
inline void cut_raw(Raw& sent, std::vector<Raw>& vec, int max_len){
    vec.clear();

    // Sentence terminators as code points: ? ! ; and the ideographic full
    // stop (12290) followed by the full-width forms of ? ! ;
    const int terminator_codes[] = {63, 33, 59, 12290, 65311, 65281, 65307};
    const std::size_t terminator_count =
        sizeof(terminator_codes) / sizeof(terminator_codes[0]);
    const std::set<int> terminators(terminator_codes,
                                    terminator_codes + terminator_count);

    const std::size_t total = sent.size();
    // 0 means "no usable limit": disables hard splitting below.
    const std::size_t limit = max_len > 0 ? static_cast<std::size_t>(max_len) : 0;

    Raw chunk;                  // chunk currently being filled
    std::size_t seg_begin = 0;  // index of the first character of the open sentence

    for(std::size_t i = 0; i < total; ++i){
        const bool at_boundary =
            terminators.count(sent[i]) != 0 || i + 1 == total;
        if(!at_boundary) continue;

        std::size_t seg_len = i + 1 - seg_begin;

        // The sentence does not fit behind what is already collected:
        // emit the collected chunk first. Skipped when nothing has been
        // collected, so no empty chunk is ever produced.
        if(chunk.size() + seg_len > limit && chunk.size() != 0){
            vec.push_back(chunk);
            chunk.clear();
        }

        // A sentence that is by itself longer than the limit has no
        // terminator to cut at, so cut it at fixed intervals. `chunk` is
        // guaranteed to be empty here because the flush above fired.
        while(limit > 0 && seg_len > limit){
            for(std::size_t j = 0; j < limit; ++j){
                chunk.push_back(sent[seg_begin + j]);
            }
            vec.push_back(chunk);
            chunk.clear();
            seg_begin += limit;
            seg_len -= limit;
        }

        // Append the (remaining part of the) sentence, terminator included.
        for(std::size_t j = seg_begin; j <= i; ++j){
            chunk.push_back(sent[j]);
        }
        seg_begin = i + 1;
    }

    if(chunk.size() != 0) vec.push_back(chunk);
}

}//for thulac
12 changes: 9 additions & 3 deletions src/timeword.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,10 @@ class TimeWord{
sentence[i] += sentence[i+1];
sentence.erase(sentence.begin() + i + 1);
}
else {
hasTimeWord = false;
}
}
hasTimeWord = false;
}
}

Expand Down Expand Up @@ -179,8 +181,10 @@ class TimeWord{
sentence.erase(sentence.begin() + i + 1);
sentence[i].tag = "t";
}
else {
hasTimeWord = false;
}
}
hasTimeWord = false;
}
}

Expand Down Expand Up @@ -226,8 +230,10 @@ class TimeWord{
sentence.erase(sentence.begin() + i + 1);
sentence[i].tag = "t";
}
else {
hasTimeWord = false;
}
}
hasTimeWord = false;
}
}

Expand Down

0 comments on commit d683983

Please sign in to comment.