|
| 1 | +#! /usr/bin/env python |
| 2 | +#coding:utf-8 |
| 3 | + |
#This code was adapted from the web: http://ashin.sinaapp.com/article/118/
| 5 | + |
| 6 | +import jieba |
| 7 | +from whoosh.analysis import Tokenizer,Token |
| 8 | +from whoosh.compat import text_type |
| 9 | + |
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with the jieba library.

    Plugged into a whoosh ``Schema`` (via :func:`ChineseAnalyzer`) it lets
    whoosh index and highlight Chinese content, which whoosh's default
    whitespace-based tokenizers cannot split.
    """

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        """Yield a whoosh ``Token`` for every segment jieba finds in *value*.

        :param value: the unicode text to tokenize.
        :param positions: if True, record the token's ordinal position.
        :param chars: if True, record the token's character offsets.
        :param start_pos: position offset added to every token position.
        :param start_char: character offset added to every char offset.

        Yields the SAME ``Token`` instance repeatedly with its fields
        rewritten, which is the protocol whoosh tokenizers follow.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # jieba.tokenize(mode='search') uses the same segmentation as
        # cut_for_search but also reports each segment's true (start, end)
        # character offsets.  This fixes the original value.find(w) logic,
        # which always returned the offset of the FIRST occurrence and so
        # produced wrong offsets for repeated words.
        for ordinal, (w, seg_start, seg_end) in enumerate(
                jieba.tokenize(value, mode='search')):
            t.original = t.text = w
            t.boost = 1.0
            t.stopped = False  # reset per token; the instance is reused
            if positions:
                # whoosh expects an ordinal token position here, not a
                # character offset (char offsets go in startchar/endchar).
                t.pos = start_pos + ordinal
            if chars:
                t.startchar = start_char + seg_start
                t.endchar = start_char + seg_end
            yield t
| 24 | + |
def ChineseAnalyzer():
    """Factory for a jieba-backed analyzer usable in a whoosh ``Schema``.

    Mirrors whoosh's convention of analyzer factory functions (e.g.
    ``StandardAnalyzer()``) returning a callable tokenizer pipeline.
    """
    tokenizer = ChineseTokenizer()
    return tokenizer
| 27 | + |
| 28 | +""" |
| 29 | +测试脚本: |
| 30 | +
|
| 31 | +#!/usr/bin/env python |
| 32 | +# -*- coding: UTF-8 -*- |
| 33 | +
|
| 34 | +from whoosh.index import create_in |
| 35 | +from whoosh.fields import * |
| 36 | +from chinesetokenizer import ChineseAnalyzer |
| 37 | +#from whoosh.analysis import RegexAnalyzer |
| 38 | +#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)") |
| 39 | +
|
| 40 | +analyzer = ChineseAnalyzer() |
| 41 | +
|
| 42 | +schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) |
| 43 | +ix = create_in("schema", schema) |
| 44 | +writer = ix.writer() |
| 45 | +
|
| 46 | +writer.add_document(title=u"First document", path=u"/a", content=u"先生说我们都是好学生") |
| 47 | +writer.add_document(title=u"Second document", path=u"/b", content=u"我们要树立科学发展观") |
| 48 | +writer.commit() |
| 49 | +
|
| 50 | +with ix.searcher() as searcher: |
| 51 | + results = searcher.find("content", u"发展") |
| 52 | + if 0 != len(results): |
| 53 | + for hit in results: |
| 54 | + print hit.highlights("content") |
| 55 | +
|
| 56 | +
|
| 57 | +运行结果: |
| 58 | +
|
| 59 | +
|
| 60 | +先<b class="match term0">生</b>说我们都是好<b class="match term1">学</b><b class="match term0">生</b> |
| 61 | +
|
| 62 | +先生说我们都是好<b class="match term0">学生</b> |
| 63 | +
|
| 64 | +我们要树立科学<b class="match term0">发</b><b class="match term1">展</b>观 |
| 65 | +
|
| 66 | +我们要树立科学<b class="match term0">发展</b>观 |
| 67 | +
|
| 68 | +""" |
0 commit comments