Skip to content

Commit

Permalink
保留英文大小写
Browse files Browse the repository at this point in the history
  • Loading branch information
ysc committed Jul 21, 2017
1 parent 201f5f3 commit f522de7
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public abstract class AbstractSegmentation implements DictionaryBasedSegmentati

// Segmentation settings, each resolved a single time through WordConfTools in a static initializer.
// NOTE(review): the second argument of every call is presumably the fallback used when the key is not configured — confirm against WordConfTools.
// "person.name.recognize": judging by the key, toggles person-name recognition; its use is not visible here — TODO confirm.
private static final boolean PERSON_NAME_RECOGNIZE = WordConfTools.getBoolean("person.name.recognize", true);
// "keep.whitespace": when true, segSentence returns a one-character sentence as a word even if that character is whitespace;
// when false, that branch only produces a word if the character is not whitespace.
private static final boolean KEEP_WHITESPACE = WordConfTools.getBoolean("keep.whitespace", false);
// "keep.case": when true, word text keeps its original letter case; when false, the text is passed through
// String.toLowerCase() before the Word is built (segSentence's one-character path and getWord).
private static final boolean KEEP_CASE = WordConfTools.getBoolean("keep.case", false);
// "keep.punctuation": judging by the key, controls whether punctuation is kept in the output; its use is not visible here — TODO confirm.
private static final boolean KEEP_PUNCTUATION = WordConfTools.getBoolean("keep.punctuation", false);
// "parallel.seg": judging by the key, toggles parallel segmentation; its use is not visible here — TODO confirm.
private static final boolean PARALLEL_SEG = WordConfTools.getBoolean("parallel.seg", true);
// "intercept.length": an integer length limit; what it bounds is not visible here — TODO confirm.
private static final int INTERCEPT_LENGTH = WordConfTools.getInt("intercept.length", 16);
Expand Down Expand Up @@ -167,12 +168,12 @@ private List<Word> segSentence(final String sentence){
if(sentence.length() == 1){
if(KEEP_WHITESPACE){
List<Word> result = new ArrayList<>(1);
result.add(new Word(sentence));
result.add(new Word(KEEP_CASE ? sentence : sentence.toLowerCase()));
return result;
}else{
if(!Character.isWhitespace(sentence.charAt(0))){
List<Word> result = new ArrayList<>(1);
result.add(new Word(sentence));
result.add(new Word(KEEP_CASE ? sentence : sentence.toLowerCase()));
return result;
}
}
Expand Down Expand Up @@ -224,7 +225,25 @@ protected void addWord(Stack<Word> result, String text, int start, int len){
* @return 词或空
*/
protected Word getWord(String text, int start, int len){
Word word = new Word(text.substring(start, start+len).toLowerCase());
if(len < 1){
return null;
}
if(start < 0){
return null;
}
if(text == null){
return null;
}
if(start + len > text.length()){
return null;
}
String wordText = null;
if(KEEP_CASE){
wordText = text.substring(start, start+len);
}else{
wordText = text.substring(start, start+len).toLowerCase();
}
Word word = new Word(wordText);
//方便编译器优化
if(KEEP_WHITESPACE){
//保留空白字符
Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/word.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#是否启用自动检测功能,如:用户自定义词典、停用词词典
auto.detect=true
#保留英文大小写
keep.case=false
#词典机制实现类,词首字索引式前缀树
#dic.class=org.apdplat.word.dictionary.impl.DictionaryTrie
#前缀树词首字索引分配空间大小,如过小则会导致碰撞增加,减小查询性能
Expand Down

0 comments on commit f522de7

Please sign in to comment.