Skip to content

Commit b375683

Browse files
committed
中文分词查找
1 parent 0ced19f commit b375683

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed

chinesetokenizer.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#! /usr/bin/env python
2+
#coding:utf-8
3+
4+
#本代码来自网络:http://ashin.sinaapp.com/article/118/
5+
6+
import jieba
7+
from whoosh.analysis import Tokenizer,Token
8+
from whoosh.compat import text_type
9+
10+
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with the jieba library.

    Adapted from http://ashin.sinaapp.com/article/118/ (original source).
    """

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
        """Yield a whoosh ``Token`` for every word jieba finds in *value*.

        :param value: the unicode text to tokenize (must be ``text_type``).
        :param positions: whether to record a position on each token.
        :param chars: whether to record start/end character offsets.
        :param keeporiginal: accepted for interface compatibility; the
            original text of each token is always stored in ``t.original``.
        :param removestops: passed through to the ``Token`` constructor.
        :param start_pos: base offset added to every recorded position.
        :param start_char: base offset added to every character offset.
        :param mode: passed through to the ``Token`` constructor.
        :raises AssertionError: if *value* is not a unicode string.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # BUG FIX: the original code located each word with value.find(w),
        # which always returns the offset of the FIRST occurrence — every
        # repeated word in the input got the same (wrong) position and
        # character offsets.  jieba.tokenize() reports each word together
        # with its true start/end offsets, so repeats are handled correctly
        # (mode='search' segments the same way as jieba.cut_for_search).
        for w, start, end in jieba.tokenize(value, mode='search'):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + start
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t  # generator: one Token per segmented word
24+
25+
def ChineseAnalyzer():
    """Factory returning a fresh :class:`ChineseTokenizer` for whoosh schemas."""
    tokenizer = ChineseTokenizer()
    return tokenizer
27+
28+
"""
29+
测试脚本:
30+
31+
#!/usr/bin/env python
32+
# -*- coding: UTF-8 -*-
33+
34+
from whoosh.index import create_in
35+
from whoosh.fields import *
36+
from chinesetokenizer import ChineseAnalyzer
37+
#from whoosh.analysis import RegexAnalyzer
38+
#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")
39+
40+
analyzer = ChineseAnalyzer()
41+
42+
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
43+
ix = create_in("schema", schema)
44+
writer = ix.writer()
45+
46+
writer.add_document(title=u"First document", path=u"/a", content=u"先生说我们都是好学生")
47+
writer.add_document(title=u"Second document", path=u"/b", content=u"我们要树立科学发展观")
48+
writer.commit()
49+
50+
with ix.searcher() as searcher:
51+
results = searcher.find("content", u"发展")
52+
if 0 != len(results):
53+
for hit in results:
54+
print hit.highlights("content")
55+
56+
57+
运行结果:
58+
59+
60+
先<b class="match term0">生</b>说我们都是好<b class="match term1">学</b><b class="match term0">生</b>
61+
62+
先生说我们都是好<b class="match term0">学生</b>
63+
64+
我们要树立科学<b class="match term0">发</b><b class="match term1">展</b>观
65+
66+
我们要树立科学<b class="match term0">发展</b>观
67+
68+
"""

0 commit comments

Comments
 (0)