From 337dae970d67c8cfaa6b0285909c1149571c23f5 Mon Sep 17 00:00:00 2001 From: niexiaolong <26566145@qq.com> Date: Tue, 29 Aug 2017 14:30:59 +0800 Subject: [PATCH] =?UTF-8?q?=E5=85=A8=E5=88=87=E5=88=86=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 全切分算法,返回所有分词结果的合集。 例如:牛奶好喝。切分为:“牛”,“牛奶”,“奶”,“好”,“好喝”,“喝” 在索引的时候,以该种全切分算法存储索引,查询的时候,再用最大Ngram算法进行查询分词。配合使用,达到更优的效果。 --- .../word/segmentation/impl/FullSegmentation.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apdplat/word/segmentation/impl/FullSegmentation.java b/src/main/java/org/apdplat/word/segmentation/impl/FullSegmentation.java index ce8fb143..ca567e5e 100644 --- a/src/main/java/org/apdplat/word/segmentation/impl/FullSegmentation.java +++ b/src/main/java/org/apdplat/word/segmentation/impl/FullSegmentation.java @@ -56,10 +56,11 @@ public List segImpl(String text) { } //获取全切分结果 List[] array = fullSeg(text); - //利用ngram计算分值 - Map, Float> words = ngram(array); - //歧义消解(ngram分值优先、词个数少优先) - List result = disambiguity(words); + Set words = new HashSet(); + for(List wordList : array){ + words.addAll(wordList); + } + List result = new ArrayList(words); return result; } private List disambiguity(Map, Float> words){ @@ -284,4 +285,4 @@ public static void main(String[] args){ String text = "蝶舞打扮得漂漂亮亮出现在张公公面前"; System.out.println(m.seg(text)); } -} \ No newline at end of file +}