# bootstrapping.py
####################################################################
# Licence: Creative Commons (see COPYRIGHT) #
# Authors: Nikolaos Pappas, Georgios Katsimpras #
# {nik0spapp, gkatsimpras}@gmail.com #
# Supervisor: Efstathios Stamatatos #
# University of the Aegean #
# Department of Information and Communication Systems Engineering #
# Information Management Track (MSc) #
# Karlovasi, Samos #
# Greece #
####################################################################
import re
import sys
import pickle
from terminal_colors import Tcolors
from pb_classifiers import PbSubj
class Bootstrapping:
    """
    Bootstrapping: Class performing the bootstrapping process for
    subjectivity and objectivity classification of sentences. The
    method learns linguistically rich extraction patterns for subjective
    (opinionated) expressions from unannotated data. The learned
    patterns are used to identify more subjective sentences that simple
    high precision classifiers can't recall.

    Related paper:
    E. Riloff and J. Wiebe. Learning extraction patterns for subjective
    expressions. In Proceedings of the 2003 conference on Empirical methods
    in natural language processing, EMNLP '03, pages 105--112, 2003. ACL.

    Learned patterns structure
    e.g. {"<subj> was killed" : {'type' : 'subj',
                                 'display' : 'was killed',
                                 'subj_freq' : 10,
                                 'freq' : 20,
                                 'prob' : 0.5}}

    All print calls pass a single pre-formatted string so that the output
    is the same whether the module runs under Python 2 or Python 3.
    """

    def __init__(self, hp_obj, hp_subj, tagger, debug=False):
        """
        Set up the classifiers and load previously learned patterns.

        hp_obj  -- high precision objective classifier; its
                   classify(sentence, previous, next) result is used
                   as a truth value.
        hp_subj -- high precision subjective classifier; its
                   classify(sentence) result is used as a truth value.
        tagger  -- part of speech tagger; tag(sentence) must yield
                   (word, tag) pairs.
        debug   -- when true, progress messages are printed.
        """
        # Syntactic forms for pattern extraction
        self.syntactic_forms = {
            "subj": [["BE", "VBN*|VBD*"],
                     ["HAVE", "BE", "VB*"],
                     ["VB*"],
                     ["VB*", "*", "NN*|NP*|NC*"],
                     ["VB*", "TO", "VB*"],
                     ["HAVE", "TO", "BE"],
                     ["HAVE", "NN*"]],
            "dobj": [["VB*"],
                     ["TO", "VB*"],
                     ["VB*", "TO", "VB*"]],
            "np": [["NN", "IN"],
                   ["VB*", "NN", "IN"],
                   ["BE", "VBN", "IN"],
                   ["TO", "VB", "TO"]],
        }
        self.filename = "stored/learned_patterns"
        # NOTE: pickle can execute arbitrary code while loading; the stored
        # file must only ever come from a trusted source.
        try:
            # Binary mode matches the 'wb' used by store_knowledge() and the
            # context manager guarantees the handle is closed.
            with open(self.filename, 'rb') as stored:
                self.learned_patterns = pickle.load(stored)
        except Exception:
            # Best effort: a missing or unreadable file simply means we start
            # without prior knowledge. Unlike a bare except, this does not
            # swallow KeyboardInterrupt or SystemExit.
            print(Tcolors.ACT + Tcolors.RED
                  + " Existing pattern knowledge not found." + Tcolors.ENDC)
            self.learned_patterns = {}
        else:
            print(Tcolors.ADD + Tcolors.OKBLUE
                  + " Loaded existing pattern knowledge!" + Tcolors.ENDC)
        # Part Of Speech Sequential Tagger (Unigram->Bigram->Trigram)
        self.tagger = tagger
        # Verdict flags for the sentence currently being classified
        self.subjective = False
        self.objective = False
        # High precision objective classifier
        self.hp_obj = hp_obj
        # High precision subjective classifier
        self.hp_subj = hp_subj
        # Pattern-Based Subjective Classifier
        self.pb_subj = PbSubj(self.tagger, debug=debug)
        # Learned patterns
        self.patterns = {}
        self.debug = debug

    def classify(self, sentence, previous="", next=""):
        """
        Subjectivity classification using bootstrapping method.

        sentence -- the sentence to classify.
        previous -- the preceding sentence, forwarded to the objective
                    classifier.
        next     -- the following sentence, forwarded to the objective
                    classifier.

        Returns 'subjective', 'objective' or None when no classifier
        could decide.
        """
        # The verdict flags describe a single sentence. Without this reset an
        # 'objective' verdict from an earlier call would leak into this one
        # and skip the objective classifier entirely.
        self.objective = False

        # STEP 1: Classify sentence with HP Subjective classifier
        self.subjective = self.hp_subj.classify(sentence)

        # STEP 1: Get help from learned patterns
        if not self.subjective:
            if self.debug:
                print(Tcolors.ACT + " Training pattern based classifier...\n")
            self.pb_subj.train(self.learned_patterns)
            found, self.subjective, obj = self.pb_subj.classify(sentence)

        if not self.subjective and not self.objective:
            # STEP 2: Classify sentence with HP Objective classifier
            self.objective = self.hp_obj.classify(sentence, previous, next)

        if self.subjective or self.objective:
            # STEP 3: Learn
            self.learn_patterns_from(sentence)
        else:
            # STEP 4: Classify based on learned patterns
            found, self.subjective, self.objective = \
                self.pb_subj.classify(sentence)
            # To bootstrap further from the sentences the pattern-based
            # classifier marks as subjective, call
            # self.learn_patterns_from(sentence) here when self.subjective.

        if self.subjective:
            return 'subjective'
        elif self.objective:
            return 'objective'
        else:
            return None

    def learn_patterns_from(self, sentence):
        """
        Learns extraction patterns associated with subjectivity
        from a given sentence.

        The sentence is POS tagged, split into parallel tag and word
        lists (a missing tag becomes the empty string) and handed to
        trigger_patterns().
        """
        tagged_sentence = self.tagger.tag(sentence)
        if self.debug:
            print(Tcolors.ACT + " Performing part of speech (POS) tagging..."
                  + Tcolors.WARNING)
            print(tagged_sentence)
            print(Tcolors.ENDC)
        tags = []
        words = []
        for word, tag in tagged_sentence:
            # Later matching relies on substring tests, so a tag of None is
            # normalised to "".
            tags.append("" if tag is None else tag)
            words.append(word)
        self.trigger_patterns(tags, words)

    def match_until_next_nn(self, i, tags, words, form, key):
        """
        The hard job for triggering the syntactic forms :-)

        Tries to match the elements of `form` against the tokens that
        follow position `i`.

        i     -- index of the anchor token; matching starts at i + 1.
        tags  -- POS tags of the sentence (strings).
        words -- words of the sentence, parallel to `tags`.
        form  -- list of form elements: "BE" / "HAVE" (auxiliary word
                 lists), "*" (wildcard), tag prefixes ending in "*"
                 optionally joined with "|", or a plain tag.
        key   -- form group: "subj", "dobj" or "np".

        Returns (True, pattern) when the match count equals len(form),
        otherwise (False, None). The pattern is the matched words with
        "<subj>" prepended, or "<dobj>" / "<np>" appended.
        """
        BE = ('was', 'were', 'be', 'being', 'am', 'been', 'are', 'is')
        HAVE = ('has', 'have', 'had')
        matched = 0
        positions_matched = []
        star = False
        for j, ctag in enumerate(form):
            pos = i + j + 1
            inner = 0
            found = False
            while not found and pos < len(tags):
                # The probe step grows on every retry (+0, +1, +2, ...).
                pos += inner
                # A generic verb slot skips over leading auxiliaries
                # ("have", "have been", "be").
                if ctag == "VB*" and pos < len(words):
                    if words[pos] in HAVE:
                        pos += 1
                        if pos < len(words) and words[pos] in BE:
                            pos += 1
                    elif words[pos] in BE:
                        pos += 1

                if ctag == "*":
                    # The wildcard never matches a token itself; it only
                    # arms the double count below.
                    star = True
                elif "*" in ctag:
                    for ortag in ctag.split("|"):
                        if (pos < len(tags)
                                and ortag.replace("*", "") in tags[pos]
                                and pos not in positions_matched):
                            # After a wildcard, a hit on one of the first two
                            # probes also accounts for the wildcard slot.
                            if star and inner < 2:
                                matched += 1
                            matched += 1
                            positions_matched.append(pos)
                            found = True
                elif ctag == "BE":
                    if (pos < len(tags)
                            and ("VB" in tags[pos] or "BE" in tags[pos])
                            and words[pos] in BE
                            and pos not in positions_matched):
                        matched += 1
                        positions_matched.append(pos)
                        found = True
                elif ctag == "HAVE":
                    if (pos < len(tags)
                            and ("VB" in tags[pos] or "HV" in tags[pos])
                            and words[pos] in HAVE
                            and pos not in positions_matched):
                        matched += 1
                        positions_matched.append(pos)
                        found = True
                elif (pos < len(tags) and ctag in tags[pos]
                        and pos not in positions_matched):
                    matched += 1
                    positions_matched.append(pos)
                    found = True
                else:
                    # A plain tag must match on this probe; give up on this
                    # form element otherwise.
                    found = True
                inner += 1

        learned_pattern = ["<subj>"] if key == "subj" else []
        learned_pattern.extend(words[pos] for pos in positions_matched)
        if key != "subj":
            learned_pattern.append("<" + key + ">")
        learned_pattern = " ".join(learned_pattern)

        if matched == len(form):
            if self.debug:
                print("%s %s %s" % (Tcolors.ACT + Tcolors.RED
                                    + " Form triggered: ", form, Tcolors.ENDC))
                print("Pattern learned: %s" % learned_pattern)
            return True, learned_pattern
        else:
            return False, None

    def proccess_learned_pattern(self, pattern):
        """
        Add pattern to learned patterns if it doesn't exist else
        update its probability.

        The pattern counts as a subjective occurrence when the sentence
        being processed is flagged subjective (self.subjective); 'prob'
        is subj_freq / freq.
        """
        # Decide the type from the placeholder itself. A bare substring test
        # for "subj" would misfile patterns whose words merely contain it
        # (e.g. "subject of <np>").
        if pattern.startswith("<subj>"):
            key = "subj"
        elif pattern.endswith("<dobj>"):
            key = "dobj"
        else:
            key = "np"

        cur_subj_freq = 1 if self.subjective else 0

        if pattern in self.learned_patterns:
            entry = self.learned_patterns[pattern]
            subj_freq = entry['subj_freq'] + cur_subj_freq
            freq = entry['freq'] + 1
            entry['prob'] = float(subj_freq) / float(freq)
            entry['subj_freq'] = subj_freq
            entry['freq'] = freq
            if self.debug:
                print("%s %s %s" % (Tcolors.ADD + Tcolors.HEADER
                                    + " Updating pattern:", pattern,
                                    Tcolors.ENDC))
        else:
            # Human readable form: the pattern without its placeholder.
            pkey = re.sub(r"<subj> | <np>| <dobj>", "", pattern)
            subj_freq = cur_subj_freq
            freq = 1
            self.learned_patterns[pattern] = {
                'type': key,
                'display': pkey,
                'freq': freq,
                'subj_freq': subj_freq,
                'prob': float(subj_freq) / float(freq),
            }
            if self.debug:
                print("%s %s %s" % (Tcolors.ADD + Tcolors.CYAN
                                    + " Learning pattern:", pattern,
                                    Tcolors.ENDC))

    def store_knowledge(self):
        """
        Store learned patterns for future usage.
        """
        # The context manager closes (and therefore flushes) the file even if
        # pickling fails part-way.
        with open(self.filename, 'wb') as output:
            pickle.dump(self.learned_patterns, output)

    def trigger_patterns(self, tags, words):
        """
        Method that triggers syntactic forms and records the learned
        patterns from the triggering.

        Every token whose tag contains "NN", "NP" or "PR" is used as an
        anchor for every syntactic form. Each distinct pattern found is
        passed once to proccess_learned_pattern(), after which the
        knowledge is written to disk.
        """
        patterns = []
        if self.debug:
            print(Tcolors.ACT + " Triggering subjective syntactic forms...")
        for key in self.syntactic_forms:
            if self.debug:
                print(Tcolors.PROC + Tcolors.GRAY + " Checking form group "
                      + key + "..." + Tcolors.ENDC)
            for form in self.syntactic_forms[key]:
                for i, tag in enumerate(tags):
                    if "NN" in tag or "NP" in tag or "PR" in tag:
                        triggered, pattern = self.match_until_next_nn(
                            i, tags, words, form, key)
                        if pattern is not None and pattern not in patterns:
                            if self.debug:
                                print("%s %s %s" % (
                                    Tcolors.ACT + Tcolors.RED
                                    + " Form triggered: ", form,
                                    Tcolors.ENDC))
                            patterns.append(pattern)

        for pattern in patterns:
            self.proccess_learned_pattern(pattern)

        if self.debug:
            print(Tcolors.OKBLUE)
            print(self.learned_patterns)
            print(Tcolors.ENDC)

        self.store_knowledge()

    def train(self, data):
        """
        Method to train the pattern-based classifier: every sentence in
        `data` is run through classify().
        """
        for sentence in data:
            self.classify(sentence)

    def clear_learned_data(self):
        """
        Forget every learned pattern held in memory (the stored file is
        left untouched).
        """
        self.learned_patterns = {}
if __name__ == "__main__":
    # Command line entry point: classify the sentence given as the first
    # argument and print the verdict ('subjective', 'objective' or None).
    from hp_classifiers import HpObj, HpSubj
    from pos import SequentialTagger

    if len(sys.argv) < 2:
        sys.exit("usage: python bootstrapping.py <sentence>")

    hp_obj = HpObj()
    hp_subj = HpSubj()
    tagger = SequentialTagger()
    bootstrapping = Bootstrapping(hp_obj, hp_subj, tagger)
    # The original guarded this print with `self.debug`, but there is no
    # `self` at module level, so the script always died with a NameError.
    print(bootstrapping.classify(sys.argv[1]))