Skip to content

Commit b375683

Browse files
committed
中文分词查找
1 parent 0ced19f commit b375683

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed

chinesetokenizer.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#! /usr/bin/env python
2+
#coding:utf-8
3+
4+
#本代码来自网络:http://ashin.sinaapp.com/article/118/
5+
6+
import jieba
7+
from whoosh.analysis import Tokenizer,Token
8+
from whoosh.compat import text_type
9+
10+
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with the jieba library.

    Adapted from http://ashin.sinaapp.com/article/118/ (original source).
    """

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
        """Yield a whoosh ``Token`` for every word jieba finds in *value*.

        :param value: the unicode text to tokenize (must be ``text_type``).
        :param positions: whether to record a position on each token.
        :param chars: whether to record start/end character offsets.
        :param keeporiginal: accepted for interface compatibility; the
            original text of each token is always stored in ``t.original``.
        :param removestops: passed through to the ``Token`` constructor.
        :param start_pos: base offset added to every recorded position.
        :param start_char: base offset added to every character offset.
        :param mode: passed through to the ``Token`` constructor.
        :raises AssertionError: if *value* is not a unicode string.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # BUG FIX: the original code located each word with value.find(w),
        # which always returns the offset of the FIRST occurrence — every
        # repeated word in the input got the same (wrong) position and
        # character offsets.  jieba.tokenize() reports each word together
        # with its true start/end offsets, so repeats are handled correctly
        # (mode='search' segments the same way as jieba.cut_for_search).
        for w, start, end in jieba.tokenize(value, mode='search'):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + start
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t  # generator: one Token per segmented word
24+
25+
def ChineseAnalyzer():
    """Factory returning a fresh :class:`ChineseTokenizer` for whoosh schemas."""
    tokenizer = ChineseTokenizer()
    return tokenizer
27+
28+
"""
29+
测试脚本:
30+
31+
#!/usr/bin/env python
32+
# -*- coding: UTF-8 -*-
33+
34+
from whoosh.index import create_in
35+
from whoosh.fields import *
36+
from chinesetokenizer import ChineseAnalyzer
37+
#from whoosh.analysis import RegexAnalyzer
38+
#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")
39+
40+
analyzer = ChineseAnalyzer()
41+
42+
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
43+
ix = create_in("schema", schema)
44+
writer = ix.writer()
45+
46+
writer.add_document(title=u"First document", path=u"/a", content=u"先生说我们都是好学生")
47+
writer.add_document(title=u"Second document", path=u"/b", content=u"我们要树立科学发展观")
48+
writer.commit()
49+
50+
with ix.searcher() as searcher:
51+
results = searcher.find("content", u"发展")
52+
if 0 != len(results):
53+
for hit in results:
54+
print hit.highlights("content")
55+
56+
57+
运行结果:
58+
59+
60+
先<b class="match term0">生</b>说我们都是好<b class="match term1">学</b><b class="match term0">生</b>
61+
62+
先生说我们都是好<b class="match term0">学生</b>
63+
64+
我们要树立科学<b class="match term0">发</b><b class="match term1">展</b>观
65+
66+
我们要树立科学<b class="match term0">发展</b>观
67+
68+
"""

0 commit comments

Comments
 (0)