Skip to content

Commit

Permalink
modify readme
Browse files Browse the repository at this point in the history
  • Loading branch information
letiantian committed Dec 1, 2014
1 parent daec540 commit 3722b98
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 20 deletions.
70 changes: 69 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ TextRank4ZH暂不支持使用easy_install、pip来安装,使用者可以将`te

##原理

Text的详细原理请参考
TextRank的详细原理请参考

> Mihalcea R, Tarau P. TextRank: Bringing order into texts[C]. Association for Computational Linguistics, 2004.
Expand Down Expand Up @@ -145,6 +145,74 @@ print '\n'.join(tr4s.get_key_sentences(num=3))

##使用说明

类TextRank4Keyword、TextRank4Sentence在处理一段文本时会将文本拆分成4种格式:

sentences:由句子组成的列表。
words_no_filter:对sentences中每个句子分词而得到的两级列表。
words_no_stop_words:去掉words_no_filter中的停止词而得到的两级列表。
words_all_filters:保留words_no_stop_words中指定词性的单词而得到的两级列表。

例如,对于:
```
这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。
```
`speech_tag_filter=True, lower=True, source = 'all_filters'`时,
sentences:
```
['这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足',
'答谢宴于晚上8点开始']
```
words_no_filter:
```
[
[ '这', '间', '酒店', '位于', '北京', '东三环', '里面', '摆放', '很多', '雕塑', '文艺', '气息', '十足'],
[ '答谢', '宴于', '晚上', '8', '点', '开始' ]
]
```
words_no_stop_words:
```
[
[ '间', '酒店', '位于', '北京', '东三环', '里面', '摆放', '很多', '雕塑', '文艺', '气息', '十足' ],
[ '答谢', '宴于', '晚上', '8', '点' ]
]
```
words_all_filters:
```
[
[ '酒店', '位于', '北京', '东三环', '摆放', '雕塑', '文艺', '气息' ],
[ '答谢', '宴于', '晚上' ]
]
```

###class TextRank4Keyword
位于`textrank4zh/TextRank4Keyword.py`中,

**构造函数:**

`stop_words_file`:默认值为None,此时内部停止词表为空;可以设置为文件路径(字符串),将从停止词文件中提取停止词。

`delimiters`:默认值是`'?!;?!。;…\n'`,用来将文本拆分为句子。

**函数train(...):**

`text`:文本内容,字符串。

`window`:窗口大小,int,用来构造单词之间的边。默认值为2。

`lower`:是否将文本转换为小写。默认为False。

`speech_tag_filter`:若值为True,将调用内部的词性列表来过滤生成words_all_filters。若值为False,words_all_filters与words_no_stop_words相同。

`vertex_source`:选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来构造pagerank对应的图中的节点。默认值为`'all_filters'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。关键词也来自`vertex_source`

`edge_source`:选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来构造pagerank对应的图中的节点之间的边。默认值为`'no_stop_words'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。边的构造要结合`window`参数。










32 changes: 16 additions & 16 deletions textrank4zh/TextRank4Keyword.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

class TextRank4Keyword(object):

def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
def __init__(self, stop_words_file = None, delimiters = '?!;?!。;…\n'):
''' '''
self.text = ''
self.keywords = []
Expand All @@ -26,10 +26,10 @@ def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
self.graph = None

def train(self, text, window = 2, lower = False, speech_tag_filter=True,
candidate_words_source = 'all_filters',
pagerank_words_source = 'no_stop_words'):
vertex_source = 'all_filters',
edge_source = 'no_stop_words'):
'''
candidate_words_source, pagerank_words_source: no_filter, no_stop_words, all_filters这三个值
vertex_source, edge_source: no_filter, no_stop_words, all_filters这三个值
'''

self.text = text
Expand All @@ -42,24 +42,24 @@ def train(self, text, window = 2, lower = False, speech_tag_filter=True,
lower=lower,
speech_tag_filter=speech_tag_filter)

if candidate_words_source == 'no_filter':
candidate_words_source = self.words_no_filter
elif candidate_words_source == 'no_stop_words':
candidate_words_source = self.words_no_stop_words
if vertex_source == 'no_filter':
vertex_source = self.words_no_filter
elif vertex_source == 'no_stop_words':
vertex_source = self.words_no_stop_words
else:
candidate_words_source = self.words_all_filters
vertex_source = self.words_all_filters

if pagerank_words_source == 'no_filter':
pagerank_words_source = self.words_no_filter
elif candidate_words_source == 'all_filters':
pagerank_words_source = self.words_all_filters
if edge_source == 'no_filter':
edge_source = self.words_no_filter
elif vertex_source == 'all_filters':
edge_source = self.words_all_filters
else:
pagerank_words_source = self.words_no_stop_words
edge_source = self.words_no_stop_words



index = 0
for words in candidate_words_source:
for words in vertex_source:
for word in words:
if not self.word_index.has_key(word):
self.word_index[word] = index
Expand All @@ -69,7 +69,7 @@ def train(self, text, window = 2, lower = False, speech_tag_filter=True,
words_number = index # 单词数量
self.graph = np.zeros((words_number, words_number))

for word_list in pagerank_words_source:
for word_list in edge_source:
for w1, w2 in self.combine(word_list, window):
if not self.word_index.has_key(w1):
continue
Expand Down
15 changes: 12 additions & 3 deletions textrank4zh/TextRank4Sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,18 @@ def get_key_sentences(self, num = 6, sentence_min_len = 6):
if __name__ == '__main__':

import codecs
text = codecs.open('../text/03.txt', 'r', 'utf-8').read()
# text = "坏人坏人坏人坏人坏人。你好"
# text = codecs.open('../text/03.txt', 'r', 'utf-8').read()
text = "这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。"
tr4s = TextRank4Sentence(stop_words_file='../stopword.data')
tr4s.train(text=text, speech_tag_filter=True, lower=True, source = 'all_filters')

print '\n'.join(tr4s.get_key_sentences(num=1))

print '\n'.join(tr4s.sentences)
for wl in tr4s.words_no_filter:
print '[', ', \''.join(wl), ']'
print
for wl in tr4s.words_no_stop_words:
print '[', ', \''.join(wl), ']'
print
for wl in tr4s.words_all_filters:
print '[', ', \''.join(wl), ']'

0 comments on commit 3722b98

Please sign in to comment.