# -*- coding: utf-8 -*-
"""
@author: letian
@homepage: http://www.letiantian.me
@github: https://github.com/someus/
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import math
import sys

import networkx as nx
import numpy as np
try:
    # Python 2 only: force UTF-8 as the default codec for implicit conversions.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # Python 3: reload() is not a builtin, and the default encoding is already UTF-8.
    pass
sentence_delimiters = ['?', '!', ';', '?', '!', '。', ';', '……', '…', '\n']
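# Part-of-speech tags (as produced by jieba's posseg tokenizer) whose words are
# kept as keyword candidates -- roughly: nouns, verbs, adjectives, idioms, time
# words, and English tokens.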
allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns', 'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
PY2 = sys.version_info[0] == 2
if not PY2:
    # Python 3.x and up
    text_type = str
    string_types = (str,)
    xrange = range

    def as_text(v):  # coerce the value to a unicode string
        if v is None:
            return None
        elif isinstance(v, bytes):
            return v.decode('utf-8', errors='ignore')
        elif isinstance(v, str):
            return v
        else:
            raise ValueError('Unknown type %r' % type(v))

    def is_text(v):
        return isinstance(v, text_type)
else:
    # Python 2.x
    text_type = unicode
    string_types = (str, unicode)
    xrange = xrange

    def as_text(v):
        if v is None:
            return None
        elif isinstance(v, unicode):
            return v
        elif isinstance(v, str):
            return v.decode('utf-8', errors='ignore')
        else:
            raise ValueError('Invalid type %r' % type(v))

    def is_text(v):
        return isinstance(v, text_type)
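
# For example (illustrative): on Python 3, as_text(b'\xe4\xb8\xad\xe6\x96\x87')
# returns '中文', as_text(None) returns None, and other types raise ValueError.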
__DEBUG = None

def debug(*args):
    """Print *args only when the DEBUG environment variable is set to '1'."""
    global __DEBUG
    if __DEBUG is None:
        __DEBUG = os.environ.get('DEBUG') == '1'
    if __DEBUG:
        print(' '.join(str(arg) for arg in args))
class AttrDict(dict):
    """Dict whose keys can also be read as attributes (d.key == d['key'])."""
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
def combine(word_list, window=2):
    """Yield the word pairs that co-occur within a sliding window; these pairs
    are used to build the edges between words.

    Keyword arguments:
    word_list -- list of str, the words of one sentence.
    window    -- int, the window size.
    """
    if window < 2:
        window = 2
    for x in xrange(1, window):
        if x >= len(word_list):
            break
        word_list2 = word_list[x:]
        res = zip(word_list, word_list2)
        for r in res:
            yield r
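
# Illustrative example: for word_list = ['w1', 'w2', 'w3', 'w4'] and window=3,
# combine() yields the offset-1 pairs ('w1', 'w2'), ('w2', 'w3'), ('w3', 'w4')
# followed by the offset-2 pairs ('w1', 'w3'), ('w2', 'w4').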
def get_similarity(word_list1, word_list2):
    """Default function for computing the similarity of two sentences: the
    number of distinct shared words, normalized by the log lengths of the two
    sentences (the measure used in the original TextRank paper).

    Keyword arguments:
    word_list1, word_list2 -- the two sentences, each given as a list of words.
    """
    words = list(set(word_list1 + word_list2))
    vector1 = [float(word_list1.count(word)) for word in words]
    vector2 = [float(word_list2.count(word)) for word in words]

    vector3 = [vector1[x] * vector2[x] for x in xrange(len(vector1))]
    vector4 = [1 for num in vector3 if num > 0.]
    co_occur_num = sum(vector4)

    if abs(co_occur_num) <= 1e-12:
        return 0.

    denominator = math.log(float(len(word_list1))) + math.log(float(len(word_list2)))

    if abs(denominator) < 1e-12:
        return 0.

    return co_occur_num / denominator
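
# Worked example (illustrative): with word_list1 = ['a', 'b', 'c'] and
# word_list2 = ['b', 'c', 'd'], two distinct words are shared, so the score is
# 2 / (log(3) + log(3)) ≈ 0.91.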
def sort_words(vertex_source, edge_source, window=2, pagerank_config={'alpha': 0.85,}):
    """Rank words by importance, in descending order.

    Keyword arguments:
    vertex_source   -- list of lists; each sublist is a sentence and its
                       elements are words; these words become the PageRank
                       vertices.
    edge_source     -- list of lists; each sublist is a sentence and its
                       elements are words; the positions of these words define
                       the PageRank edges.
    window          -- within a sentence, any two words at most `window`
                       positions apart are considered connected by an edge.
    pagerank_config -- settings passed through to networkx's pagerank.
    """
    sorted_words = []
    word_index = {}
    index_word = {}
    _vertex_source = vertex_source
    _edge_source = edge_source
    words_number = 0
    for word_list in _vertex_source:
        for word in word_list:
            if word not in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1
    graph = np.zeros((words_number, words_number))

    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] = 1.0
                graph[index2][index1] = 1.0

    debug('graph:\n', graph)

    nx_graph = nx.from_numpy_matrix(graph)  # renamed to from_numpy_array in networkx >= 3.0
    scores = nx.pagerank(nx_graph, **pagerank_config)  # this is a dict
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)

    return sorted_words
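
# Usage sketch (illustrative): with tokenized, POS-filtered sentences such as
# sents = [['tiger', 'forest'], ['forest', 'fire', 'tiger']], calling
# sort_words(sents, sents, window=2) returns AttrDicts with .word and .weight
# attributes, ordered by descending PageRank weight.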
def sort_sentences(sentences, words, sim_func=get_similarity, pagerank_config={'alpha': 0.85,}):
    """Rank sentences by importance, in descending order.

    Keyword arguments:
    sentences       -- list of sentences.
    words           -- list of lists; the i-th sublist holds the words of the
                       i-th sentence in `sentences`.
    sim_func        -- computes the similarity of two sentences; its arguments
                       are two lists of words.
    pagerank_config -- settings passed through to networkx's pagerank.
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func(_source[x], _source[y])
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)  # renamed to from_numpy_array in networkx >= 3.0
    scores = nx.pagerank(nx_graph, **pagerank_config)  # this is a dict
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
if __name__ == '__main__':
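    # Minimal smoke test -- a sketch that assumes already-tokenized English
    # input (the real pipeline in this project tokenizes Chinese text with
    # jieba before calling these functions).
    demo_sentences = ['the tiger lives in the forest',
                      'a fire broke out in the forest',
                      'the tiger fled from the fire']
    demo_words = [s.split() for s in demo_sentences]

    for w in sort_words(demo_words, demo_words, window=2)[:5]:
        print(w.word, w.weight)

    for s in sort_sentences(demo_sentences, demo_words):
        print(s.index, s.weight, s.sentence)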