Skip to content

Commit

Permalink
增加了.根据不同的用户自定义词典来进行分词
Browse files Browse the repository at this point in the history
  • Loading branch information
ansjsun committed Apr 1, 2013
1 parent 33fba32 commit 8853b57
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 30 deletions.
Binary file added dist/ans_seg-20130401.jar
Binary file not shown.
2 changes: 2 additions & 0 deletions library/user1.dic
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
java学习 j 100
java不学习 j 100
2 changes: 2 additions & 0 deletions library/user2.dic
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
php学习 j 100
php不学习 j 100
67 changes: 47 additions & 20 deletions src/org/ansj/library/UserDefineLibrary.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,29 +43,16 @@ public static void main(String[] args) throws Exception {
}

static {
String temp = null;
BufferedReader br = null;
try {
long start = System.currentTimeMillis();
FOREST = new Forest();
// 先加载系统内置补充词典
br = MyStaticValue.getSystemLibraryReader();

while ((temp = br.readLine()) != null) {
if (StringUtil.isBlank(temp)) {
continue;
} else {
Library.insertWord(FOREST, temp);
}
}
loadLibrary(MyStaticValue.userDefinePath);
initSystemLibrary(FOREST);
loadLibrary(FOREST, MyStaticValue.userDefinePath);
System.out.println("init user library ok use time :" + (System.currentTimeMillis() - start));
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.err.println("init user library error :" + temp);
} finally {
IOUtil.close(br);
}
}

Expand All @@ -87,8 +74,31 @@ public static void insertWord(String keyword, String nature, int freq) {
Library.insertWord(FOREST, value);
}

/**
 * Loads the built-in system dictionary into the given forest.
 *
 * Every non-blank line read from {@code MyStaticValue.getSystemLibraryReader()} is
 * inserted via {@code Library.insertWord}. An {@link IOException} raised while reading
 * is printed and not rethrown; the reader is closed in all cases.
 *
 * @param FOREST the forest that receives the system dictionary entries
 */
private static void initSystemLibrary(Forest FOREST) {
	BufferedReader reader = MyStaticValue.getSystemLibraryReader();
	try {
		for (String line = reader.readLine(); line != null; line = reader.readLine()) {
			// Blank lines carry no dictionary entry.
			if (!StringUtil.isBlank(line)) {
				Library.insertWord(FOREST, line);
			}
		}
	} catch (IOException e) {
		e.printStackTrace();
	} finally {
		IOUtil.close(reader);
	}
}

// 单个文件价值词典
private static void loadFile(File file) {
private static void loadFile(Forest forest, File file) {
// TODO Auto-generated method stub
if (!file.canRead()) {
System.err.println("file in path " + file.getAbsolutePath() + " can not to read!");
Expand All @@ -110,7 +120,7 @@ private static void loadFile(File file) {
} else {
value = new Value(strs[0], strs[1], strs[2]);
}
Library.insertWord(FOREST, value);
Library.insertWord(forest, value);
}
}
} catch (UnsupportedEncodingException e) {
Expand All @@ -125,21 +135,38 @@ private static void loadFile(File file) {
}
}

/**
 * Builds a standalone dictionary forest from caller-supplied dictionary paths.
 *
 * @param isSystem whether the system library is loaded into the new forest first
 * @param libraryPaths dictionary paths, passed one by one (in order) to
 *            {@code loadLibrary}; each may name a single dictionary file or a
 *            directory (for directories only files with the dic suffix are meant
 *            to be loaded)
 * @return the newly built forest
 */
public static Forest makeUserDefineForest(boolean isSystem, String... libraryPaths) {
	final Forest result = new Forest();
	if (isSystem) {
		initSystemLibrary(result);
	}
	for (int i = 0; i < libraryPaths.length; i++) {
		loadLibrary(result, libraryPaths[i]);
	}
	return result;
}

/**
* 加载词典,传入一本词典的路径.或者目录.词典后缀必须为.dic
*/
public static void loadLibrary(String temp) {
public static void loadLibrary(Forest forest, String temp) {
// 加载用户自定义词典
File file = null;
if ((temp != null || (temp = MyStaticValue.rb.getString("userLibrary")) != null)) {
file = new File(temp);
if (file.isFile()) {
loadFile(file);
loadFile(forest,file);
} else if (file.isDirectory()) {
File[] files = file.listFiles();
for (int i = 0; i < files.length; i++) {
if (file.getName().trim().endsWith(".dic")) {
loadFile(files[i]);
loadFile(forest, files[i]);
}
}
} else {
Expand Down
31 changes: 25 additions & 6 deletions src/org/ansj/splitWord/analysis/ToAnalysis.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import java.util.ArrayList;
import java.util.List;

import love.cq.domain.Forest;

import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.util.Graph;
Expand All @@ -20,9 +22,15 @@
*/
public class ToAnalysis extends Analysis {

private Forest forest = null;

/**
 * Creates an analyzer that reads its input from the given reader.
 * The {@code forest} field keeps its initial {@code null} value, so no
 * caller-specific dictionary is handed to the user-define recognition step.
 *
 * @param reader source of the text, passed straight to the superclass constructor
 */
public ToAnalysis(Reader reader) {
super(reader);
}

/**
 * Creates an analyzer that reads from the given reader and remembers a
 * caller-supplied dictionary forest for the user-define recognition step.
 *
 * @param reader source of the text, passed straight to the superclass constructor
 * @param forest user-defined dictionary stored on this instance; may be null
 */
public ToAnalysis(Reader reader, Forest forest) {
super(reader);
this.forest = forest;
}

@Override
Expand All @@ -48,11 +56,9 @@ public List<Term> merger() {
new ForeignPersonRecognition(graph.terms).recognition();
graph.walkPathByScore();
}




// 用户自定义词典的识别
new UserDefineRecognition(graph.terms).recognition();
new UserDefineRecognition(graph.terms, forest).recognition();
graph.rmLittlePath();
graph.walkPathByScore();

Expand All @@ -77,8 +83,21 @@ private List<Term> getResult() {
// Private no-arg constructor used by the static paser(String) helper; the forest field stays null.
private ToAnalysis() {
};

/**
 * Creates an analyzer bound to a user-defined dictionary.
 *
 * @param forest the user's own dictionary forest, stored on this instance
 */
public ToAnalysis(Forest forest) {
this.forest = forest;
}

/**
 * Segments the given string with a fresh analyzer that has no
 * caller-supplied dictionary forest.
 *
 * @param str the text to segment
 * @return whatever {@code paserStr(str)} yields for a new {@code ToAnalysis}
 */
public static List<Term> paser(String str) {
return new ToAnalysis().paserStr(str);
}


/**
 * Segments the given string with a fresh analyzer bound to the supplied
 * user-defined dictionary forest.
 *
 * @param str the text to segment
 * @param forest the user-defined dictionary handed to the new analyzer
 * @return whatever {@code paserStr(str)} yields for that analyzer
 */
public static List<Term> paser(String str, Forest forest) {
return new ToAnalysis(forest).paserStr(str);
}
}
20 changes: 16 additions & 4 deletions src/org/ansj/util/recognition/UserDefineRecognition.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
package org.ansj.util.recognition;

import static org.ansj.library.UserDefineLibrary.FOREST;
import love.cq.domain.Forest;
import love.cq.domain.WoodInterface;
import love.cq.util.ObjectBean;

import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.UserDefineLibrary;
import org.ansj.util.TermUtil;

/**
Expand All @@ -19,7 +20,9 @@ public class UserDefineRecognition {

private Term[] terms = null;

private WoodInterface branch = FOREST;
private WoodInterface forest = UserDefineLibrary.FOREST;

private WoodInterface branch = forest;

private int offe = -1;
private int endOffe = -1;
Expand All @@ -30,6 +33,15 @@ public UserDefineRecognition(Term[] terms) {
this.terms = terms;
}

/**
 * Creates a recognizer over the given terms using a caller-supplied dictionary.
 *
 * @param terms the term array to run recognition on
 * @param forest dictionary to match against; when {@code null} the field defaults
 *            (initialised from {@code UserDefineLibrary.FOREST}) are left untouched
 */
public UserDefineRecognition(Term[] terms, Forest forest) {
	this.terms = terms;
	if (forest == null) {
		return;
	}
	// Matching starts at the root of the supplied dictionary.
	this.forest = forest;
	this.branch = this.forest;
}

public void recognition() {
if (branch == null) {
return;
Expand All @@ -40,7 +52,7 @@ public void recognition() {
for (int i = 0; i < length; i++) {
if (terms[i] == null)
continue;
if (branch == FOREST) {
if (branch == forest) {
flag = false;
} else {
flag = true;
Expand Down Expand Up @@ -112,7 +124,7 @@ private void reset() {
endOffe = -1;
tempFreq = 50;
tempNature = null;
branch = FOREST;
branch = forest;
}

/**
Expand Down
36 changes: 36 additions & 0 deletions test/org/ansj/demo/UserDefineAnalysisDemo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package org.ansj.demo;

import java.util.HashMap;
import java.util.List;

import org.ansj.domain.Term;
import org.ansj.library.UserDefineLibrary;
import org.ansj.splitWord.analysis.ToAnalysis;

import love.cq.domain.Forest;

/**
 * Demo: segment text with different analyzers, each backed by its own
 * user-defined dictionary.
 *
 * @author ansj
 *
 */
public class UserDefineAnalysisDemo {
	private static HashMap<String, Forest> userForestMap = new HashMap<String, Forest>();

	public static void main(String[] args) {
		// Build one dictionary forest per user (system library not included).
		userForestMap.put("user1", UserDefineLibrary.makeUserDefineForest(false, "library/user1.dic"));
		userForestMap.put("user2", UserDefineLibrary.makeUserDefineForest(false, "library/user2.dic"));

		// Each row: { text to segment, key of the dictionary to use }.
		String[][] cases = {
			{ "java学习是一个很难的过程.", "user1" },
			{ "java学习是一个很难的过程.", "user2" },
			{ "php学习是一个很难的过程.", "user2" },
		};
		for (String[] sample : cases) {
			List<Term> terms = ToAnalysis.paser(sample[0], userForestMap.get(sample[1]));
			System.out.println(terms);
		}
	}
}

0 comments on commit 8853b57

Please sign in to comment.