forked from qolina/NNED
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheventExtFromRaw.py
184 lines (165 loc) · 7.25 KB
/
eventExtFromRaw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os
import re
import sys
import cPickle
from aceEventUtil import event2str
from xml.etree.ElementTree import ElementTree
from sgmllib import SGMLParser
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize
import nltk.data
class aceParser(SGMLParser):
content = ""
line_content = []
def init(self, path):
self.path = path
def handle_data(self, text):
text = text.replace("&", "&----")
self.line_content.append(text)
self.content += text
def parseSGML(filename):
debug = True
sgm_parser = aceParser()
file_content = open(filename, "r").read()
sgm_parser.content = ""
sgm_parser.line_content = []
sgm_parser.feed(file_content)
content = sgm_parser.content
line_content = sgm_parser.line_content
content = content.replace("\n", " ")
if filename.find("FLOPPINGACES_20041114.1240.03") >= 0 or filename.find("CNN_CF_20030304.1900.04") >= 0 or filename.find("BACONSREBELLION_20050226.1317") >= 0 or filename.find("CNN_ENG_20030616_130059.25") >= 0 or filename.find("FLOPPINGACES_20050217.1237.014") >= 0:
content = content.replace("&----", "&")
line_content = [line.replace("&----", "&") for line in line_content]
sentences = []
sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
# test output
if debug:
#idx = (1527, 1536)
idx = (718, 909)
print "# direct from new content:", content[idx[0]:idx[1]+1]
sent_id = 0
line_id = 0
while line_id < len(line_content):
line = line_content[line_id]
pre_content = "".join(line_content[:line_id])
char_st = len(pre_content)
while line_id < len(line_content)-2 and line_content[line_id+1] == "&----":
line = line + line_content[line_id+1] + line_content[line_id+2]
line_id += 2
line_id += 1
line = line.replace("\n", " ")
char_ed = char_st + len(line)
if debug:
print "-----------------------------", line_id, (char_st, char_ed)
print "S-"+line+"-E"
if len(line.strip())<1: continue
#sents_in_line = line.split("\n\n")
#print line
sents_in_line = sent_tokenize(line)
last_end = 0
#sents_in_line = sent_detector.tokenize(line)
for sent in sents_in_line:
sent = sent.replace("\n", " ").strip()
sent_st_in_line = line.find(sent, last_end)
sent_ed_in_line = sent_st_in_line + len(sent) - 1
last_end = sent_ed_in_line
sent_st = char_st + sent_st_in_line
sent_ed = sent_st + len(sent) - 1
sent_id += 1
if debug:
print "------##", sent_id, (sent_st_in_line, sent_ed_in_line), (sent_st, sent_ed)
print sent
sentences.append(((sent_st, sent_ed), sent))
for sent_id, (sent_span, sent) in enumerate(sentences[:]):
print "##",sent_id, sent_span, sent
return sentences[3:], content
#return line_content
# root: sourcefile *.sgm eventfilename *.apf.xml
# second_layer: document
# third_layer: entity, timex, relation, event
def extractEvents(filename):
    """Collect all events annotated for one document.

    ``filename`` is the path of the source file; its last three characters
    are replaced by ``apf.xml`` to obtain the annotation file that is
    parsed. Every ``event`` element found in that file is passed to
    ``extractEvent`` and the resulting lists are concatenated.

    Returns:
        A flat list holding the events of all ``event`` elements, in
        document order.
    """
    annotation_path = filename[:-3] + "apf.xml"
    apf_tree = ElementTree(file=annotation_path)
    return [
        event
        for event_node in apf_tree.iter(tag="event")
        for event in extractEvent(event_node)
    ]
# fourth_layer(event): [optional]event_argument, event_mention
# fifth_layer(event_mention): extent, ldc_scope, anchor, [optional]event_mention_argument
# sixth_layer(event_mention_argument): extent
# event_v1 = [sentence_ldc_scope, eventType, eventSubType, anchorText, (argText, role), (argText, role), ...]
# event_v2 = [(sentence_ldc_scope, index), eventType, eventSubType, (anchorText, index), (argText, role, index), (argText, role, index), ...]
def extractEvent(eventEle):
    """Turn one ``event`` element into a list of event-mention records.

    Each ``event_mention`` child yields one record of the form
    ``[(sentence_text, (start, end)), type, subtype,
    (anchor_text, (start, end)), (arg_text, role, (start, end)), ...]``.
    The sentence is read from the first child of the mention's second
    child and the anchor from the first child of its third child. Newlines
    in all texts become spaces; only the sentence text is stripped.
    Offsets are the integer ``START``/``END`` attributes, unchanged.

    Returns:
        A list with one record per ``event_mention`` child (empty when
        there is none).
    """
    def one_line(text):
        # Same normalisation for sentence, anchor and argument texts.
        return re.sub(r"\n", " ", text)

    def char_span(charseq):
        return (int(charseq.attrib["START"]), int(charseq.attrib["END"]))

    def build_argument(arg_node):
        role = arg_node.attrib["ROLE"]
        charseq = arg_node[0][0]
        return (one_line(charseq.text), role, char_span(charseq))

    event_type = eventEle.attrib["TYPE"]
    event_subtype = eventEle.attrib["SUBTYPE"]

    mentions = (child for child in eventEle if child.tag == "event_mention")
    records = []
    for mention in mentions:
        scope_seq = mention[1][0]
        sentence = (one_line(scope_seq.text).strip(), char_span(scope_seq))

        anchor_seq = mention[2][0]
        anchor = (one_line(anchor_seq.text), char_span(anchor_seq))

        arguments = [
            build_argument(child)
            for child in mention
            if child.tag == "event_mention_argument"
        ]
        records.append([sentence, event_type, event_subtype, anchor] + arguments)
    return records
def main():
    """Convert every ``.sgm`` document in a directory to a ``.ee`` file.

    The directory is taken from the first command-line argument. For each
    regular file ending in ``.sgm`` (in sorted order) the document is
    parsed with ``parseSGML`` and its events are read with
    ``extractEvents``. Documents without events are skipped. For the
    others, three objects are pickled one after another into a file with
    the same name but the extension ``ee``: the document text, the
    sentence list and the event list.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: python %s dataDir" % sys.argv[0])
    dataDir = sys.argv[1]

    for filename in sorted(os.listdir(dataDir)):
        # os.path.join works whether or not dataDir ends with a separator;
        # plain concatenation silently matched nothing without one.
        sgm_path = os.path.join(dataDir, filename)
        if not (os.path.isfile(sgm_path) and filename.endswith(".sgm")):
            continue

        sentences_in_doc, content = parseSGML(sgm_path)
        eventArrOneDoc = extractEvents(sgm_path)
        if not eventArrOneDoc:
            continue

        outfilename = sgm_path[:-3] + "ee"
        # The with-block closes the file even if a dump fails. Text mode is
        # kept so existing readers of the .ee files stay consistent with it.
        with open(outfilename, "w") as outfile:
            cPickle.dump(content, outfile)
            cPickle.dump(sentences_in_doc, outfile)
            cPickle.dump(eventArrOneDoc, outfile)
if __name__ == "__main__":
    # Run as a script: python eventExtFromRaw.py dataDir
    main()