forked from qolina/NNED
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
preprocessing data, extract gold event tag
- Loading branch information
Showing
12 changed files
with
3,073 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
## format | ||
#event type | ||
#event sub type: argument roles | ||
|
||
life | ||
be-born: person time place | ||
marry: person time place | ||
divorce: person time place | ||
injure: agent victim instrument time place | ||
die: agent victim instrument time place | ||
|
||
movement | ||
transport: agent artifact vehicle price origin destination time | ||
|
||
transaction | ||
transfer-ownership: buyer seller beneficiary artifact price time place | ||
transfer-money: giver recipient beneficiary money time place | ||
|
||
business | ||
start-org: agent org time place | ||
merge-org: org time place | ||
declare-bankruptcy: org time place | ||
end: org time place | ||
|
||
conflict | ||
attack: attacker target instrument time place | ||
demonstrate: entity time place | ||
|
||
contact | ||
meet: entity time place | ||
phone-write: entity time | ||
|
||
personnel | ||
start-position: person entity position time place | ||
end-position: person entity position time place | ||
nominate: person entity position time place | ||
elect: person entity position time place | ||
|
||
justice | ||
arrest-jail: person agent crime time place | ||
RELEASE-PAROLE: person entity crime time place | ||
TRIAL-HEARING: defendant prosecutor adjudicator crime time place | ||
charge-indict: defendant prosecutor adjudicator crime time place | ||
sue: plaintiff defendant adjudicator crime time place | ||
convict: defendant adjudicator crime time place | ||
sentence: defendant adjudicator crime sentence time place | ||
fine: entity adjudicator money crime time place | ||
execute: person agent crime time place | ||
EXTRADITE: agent person destination origin crime time | ||
acquit: defendant adjudicator crime time place | ||
pardon: defendant adjudicator crime time place | ||
appeal: defendant prosecutor adjudicator crime time place |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# This file mainly implements a basic attention model for neural event extraction | ||
# input file consists of sentences which contain one-event. | ||
import os | ||
import re | ||
import sys | ||
from collections import Counter | ||
from readEventTag import event2str | ||
from statisticCorpus import str2event | ||
|
||
import torch | ||
from torch.autograd import Variable | ||
|
||
def loadWord2Vec(modelPath):
    """Load word vectors from a word2vec-style text file.

    The first line of the file is a header made of two whitespace-separated
    fields; it is validated and then skipped.  Every following non-blank
    line has the form ``word v1 v2 ...``.

    Args:
        modelPath: path of the text embedding file.

    Returns:
        dict mapping each word to its vector (a list of floats).  When a
        word occurs more than once, the last occurrence wins.  An empty
        file yields an empty dict.

    Raises:
        ValueError: if the header does not hold exactly two fields, or a
            vector component cannot be converted to float.
    """
    word2vecModel = {}  # word -> vector
    # The context manager closes the file even when parsing fails.
    with open(modelPath, "r") as modelFile:
        headerLine = modelFile.readline()
        if not headerLine:
            # Completely empty file: nothing to load.
            return word2vecModel
        if len(headerLine.split()) != 2:
            raise ValueError(
                "malformed embedding header in %s: %r" % (modelPath, headerLine))
        for line in modelFile:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word2vecModel[fields[0]] = [float(val) for val in fields[1:]]
    return word2vecModel
|
||
def prepData(eventArr, Vocab):
    """Placeholder for turning events into model input (not implemented).

    The original definition stopped after the ``for`` header, which is a
    syntax error and kept the whole module from being parsed.  The intended
    transformation is not visible in this file, so the loop body is left as
    an explicit no-op until it is written.

    Args:
        eventArr: iterable of events.
        Vocab: vocabulary object; currently unused.

    Returns:
        None.
    """
    # TODO: implement the per-event preparation.
    for event in eventArr:
        pass
|
||
############## | ||
def getArg(args, flag):
    """Return the token that follows *flag* in *args*.

    Args:
        args: sequence of command-line tokens.
        flag: the option name to look for, e.g. ``"-train"``.

    Returns:
        The element right after the first occurrence of *flag*, or None
        when *flag* is absent or is the last token (no value follows it).
    """
    try:
        return args[args.index(flag) + 1]
    except (ValueError, IndexError):
        # ValueError: flag not present; IndexError: flag has no value.
        return None
|
||
# parse the -train and -embed values out of the command-line arguments | ||
def parseArgs(args):
    """Look up the ``-train`` and ``-embed`` options in *args*.

    Returns a two-element list ``[train, embed]`` where each entry is
    whatever ``getArg`` returns for the corresponding flag.
    """
    return [getArg(args, flag) for flag in ("-train", "-embed")]
|
||
|
||
# event = [eventtype, eventsubtype, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...] | ||
# eventstring: sentence[sep]eventtype[sep]eventsubtype[sep]anchor[sep]arg[sep]role[sep]arg[sep]role...... | ||
if __name__ == "__main__":
    # Report the script actually being run instead of a hard-coded
    # (copy-pasted) file name.
    print("Usage: python %s -train trainFile -embed embeddingFile" % sys.argv[0])
    print(sys.argv)

    # Honour the advertised flags.  The original read sys.argv[1] as the
    # training file and sys.argv[2] as the embedding file, which picks the
    # wrong tokens when the documented "-train X -embed Y" form is used.
    trainPath, embedPath = parseArgs(sys.argv)
    if trainPath is None or embedPath is None:
        usesFlags = "-train" in sys.argv or "-embed" in sys.argv
        if usesFlags or len(sys.argv) < 3:
            sys.exit("missing -train and/or -embed argument")
        # Legacy positional form: <trainFile> <embeddingFile>.
        trainPath, embedPath = sys.argv[1], sys.argv[2]

    Vocab = loadWord2Vec(embedPath)
    # One event string per line; the handle is closed once parsing is done.
    with open(trainPath, "r") as trainFile:
        eventArr = [str2event(line.strip(), "\t") for line in trainFile]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import os | ||
import re | ||
import sys | ||
from xml.etree.ElementTree import ElementTree | ||
|
||
# root: sourcefile | ||
# second_layer: document | ||
# third_layer: entity, timex, relation, event | ||
def extractEvents(filename):
    """Collect the event mentions annotated in one XML annotation file.

    Every ``event`` element found anywhere in the tree is handed to
    ``extractEvent`` and the resulting events are concatenated.

    Args:
        filename: path of the XML file to parse.

    Returns:
        A list of events as produced by ``extractEvent``; empty when the
        document contains no ``event`` element.
    """
    xmlTree = ElementTree(file=filename)

    eventArrOneDoc = []
    for eventEle in xmlTree.iter(tag="event"):
        eventArrOneDoc.extend(extractEvent(eventEle))

    # Debug output: show the first event.  Guarded because documents without
    # any event are legitimate (the caller checks for an empty result) and
    # indexing [0] unconditionally raised IndexError on them.
    if eventArrOneDoc:
        print(event2str(eventArrOneDoc[0], "\t"))
    return eventArrOneDoc
|
||
# event = [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...] | ||
# output: sentence[sep]eventType[sep]eventSubtype[sep]anchor[sep]arg[sep]role[sep]arg[sep]role...... | ||
def event2str(event, separator):
    """Serialise one event into a single *separator*-joined string.

    *event* is ``[eventType, eventSubType, sentence, anchor, (arg, role), ...]``.
    The output puts the sentence first:
    ``sentence, eventType, eventSubType, anchor, arg, role, arg, role, ...``.
    """
    eventType, eventSubType, sentence, anchor = event[:4]
    fields = [sentence, eventType, eventSubType, anchor]
    for argument in event[4:]:
        # Each argument contributes its text and its role, in that order.
        fields.append(argument[0] + separator + argument[1])
    return separator.join(fields)
|
||
# fourth_layer(event): [optional]event_argument, event_mention | ||
# fifth_layer(event_mention): extent, ldc_scope, anchor, [optional]event_mention_argument | ||
# sixth_layer(event_mention_argument): extent | ||
# event = [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...] | ||
def extractEvent(eventEle):
    """Turn one ``event`` XML element into a list of events, one per mention.

    Each returned event is
    ``[eventType, eventSubType, sentence, anchor, (argText, argRole), ...]``.
    Type and subtype come from the element's ``TYPE`` / ``SUBTYPE``
    attributes.  For every ``event_mention`` child, the sentence is the text
    of the first grandchild under the mention's second child and the anchor
    is the text of the first grandchild under its third child (presumably
    ``ldc_scope`` and ``anchor``, going by the layout comment above this
    function).  Newlines are replaced by spaces; the sentence is additionally
    stripped and given a trailing ".".
    """
    def flatten(text):
        # Annotated spans may wrap across lines; keep them on one line.
        return re.sub(r"\n", " ", text)

    attributes = eventEle.attrib
    eventType = attributes["TYPE"]
    eventSubType = attributes["SUBTYPE"]

    events = []
    mentions = [child for child in eventEle if child.tag == "event_mention"]
    for mention in mentions:
        sentence = flatten(mention[1][0].text).strip() + "."
        anchor = flatten(mention[2][0].text)
        record = [eventType, eventSubType, sentence, anchor]

        for child in mention:
            if child.tag == "event_mention_argument":
                role = child.attrib["ROLE"]
                text = flatten(child[0][0].text)
                record.append((text, role))
        events.append(record)
    return events
|
||
if __name__ == "__main__":
    print("Usage: python readEventTag.py dataDir")
    if len(sys.argv) < 2:
        # Without a data directory there is nothing to do; exit instead of
        # failing with an IndexError.
        sys.exit(1)

    dataDir = sys.argv[1]
    for filename in sorted(os.listdir(dataDir)):
        # os.path.join works whether or not dataDir ends with a separator;
        # plain string concatenation silently matched nothing without it.
        filePath = os.path.join(dataDir, filename)
        if not (os.path.isfile(filePath) and filename.endswith(".apf.xml")):
            continue
        print("## Processing  " + filename)
        eventArrOneDoc = extractEvents(filePath)
        if not eventArrOneDoc:
            continue
        # Writing the per-document ".ee" file is currently disabled.
        # NOTE: the suffix is removed by slicing; str.strip(".apf.xml") would
        # strip a *set of characters* from both ends of the name.
        #outfilename = os.path.join(dataDir, filename[:-len(".apf.xml")] + ".ee")
        #outfile = open(outfilename, "w")
        #for event in eventArrOneDoc:
        #    eventString = event2str(event, "\t")
        #    outfile.write(eventString + "\n")
        #outfile.close()
        #print "## Events writen to ", outfilename
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
|
||
import os | ||
import sys | ||
|
||
def removeSuffix(name, suffix):
    """Return *name* without a trailing *suffix* (unchanged if not present).

    Unlike ``str.strip``, which removes any of the given *characters* from
    both ends, this removes the exact suffix once from the end only.
    """
    if suffix and name.endswith(suffix):
        return name[:-len(suffix)]
    return name


if __name__ == "__main__":
    print("Usage: python splitTrainTest.py dataDir testFilenames")
    if len(sys.argv) < 3:
        sys.exit(1)

    dataPath = sys.argv[1]
    # Document names of the test split, one per line, optionally with ".sgm".
    # A set gives constant-time membership checks; blank lines are ignored.
    with open(sys.argv[2], "r") as testListFile:
        testFiles = set(
            removeSuffix(line.strip(), ".sgm")
            for line in testListFile if line.strip())

    testDir = os.path.join(dataPath, "test")
    trainDir = os.path.join(dataPath, "train")
    for targetDir in (testDir, trainDir):
        # os.rename fails when the destination directory does not exist.
        if not os.path.isdir(targetDir):
            os.makedirs(targetDir)

    for fileItem in os.listdir(dataPath):
        srcPath = os.path.join(dataPath, fileItem)
        if os.path.isdir(srcPath):
            continue
        print("## Processing  " + fileItem)
        # Reduce "<doc>.sgm" / "<doc>.apf.xml" to "<doc>".  The original used
        # str.strip, which also ate leading/trailing characters belonging to
        # the name itself (e.g. "alt.*", "misc.*"), so such documents never
        # matched the test list.
        filename = removeSuffix(removeSuffix(fileItem, ".sgm"), ".apf.xml")
        targetDir = testDir if filename in testFiles else trainDir
        os.rename(srcPath, os.path.join(targetDir, fileItem))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import os | ||
import re | ||
import sys | ||
from collections import Counter | ||
from readEventTag import event2str | ||
|
||
# event = [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...] | ||
# eventString: sentence[sep]eventType[sep]eventSubType[sep]anchor[sep]arg[sep]role[sep]arg[sep]role...... | ||
def str2event(eventString, separator):
    """Parse one serialised event line back into a list.

    Args:
        eventString: fields joined by *separator*, laid out as
            ``sentence, eventType, eventSubType, anchor, arg, role, ...``.
        separator: the field delimiter.

    Returns:
        A list whose first (up to) four items are the leading fields in the
        same order as they appear in the string, followed by one
        ``(arg, role)`` tuple per argument pair.  An unpaired trailing field
        is ignored.
    """
    arr = eventString.split(separator)
    event = arr[:4]
    arguments = arr[4:]
    # Pair consecutive fields.  zip() stops at the shorter slice, which drops
    # an odd trailing field just like the original index arithmetic did, but
    # without relying on "/" yielding an int (it does not on Python 3).
    event.extend(zip(arguments[0::2], arguments[1::2]))
    return event
|
||
def indexEventBySentence(eventArr):
    """Group events by sentence and keep the sentences with exactly one event.

    The sentence is taken from ``event[0]``, i.e. events are expected in the
    field order produced by ``str2event``.  Corpus statistics (number of
    events, number of distinct sentences, and the distribution of
    events-per-sentence with percentages) are printed as a side effect.

    Args:
        eventArr: list of events.

    Returns:
        dict mapping sentence -> its single event, for every sentence that
        has exactly one event.
    """
    sentHash = {}  # sentence -> list of its events
    for event in eventArr:
        sentHash.setdefault(event[0], []).append(event)

    oneEventSentence = {
        sent: events[0] for sent, events in sentHash.items() if len(events) == 1
    }

    sentNumAll = len(sentHash)
    # Tally once and reuse it for both the summary line and the table.
    eventNumCounts = Counter(len(events) for events in sentHash.values()).most_common()

    # Single-argument print(...) parses on both Python 2 and Python 3.
    print("## ori #event %d" % len(eventArr))
    print("## #sentence %d" % sentNumAll)
    print("## #events-in-sents: %s" % (eventNumCounts,))
    for eventNum, sentNum in eventNumCounts:
        # The loop only runs when there is at least one sentence, so the
        # division below cannot hit zero.
        print("%d \t%d \t%s" % (eventNum, sentNum, sentNum * 100.0 / sentNumAll))
    return oneEventSentence
|
||
if __name__ == "__main__":
    print("Usage: python statisticCorpus.py dataDir")
    print(sys.argv)
    if len(sys.argv) < 2:
        sys.exit(1)

    dataDir = sys.argv[1]
    allEvents = []
    for filename in sorted(os.listdir(dataDir)):
        # os.path.join works with or without a trailing separator on dataDir.
        filePath = os.path.join(dataDir, filename)
        if not (os.path.isfile(filePath) and filename.endswith(".ee")):
            continue
        with open(filePath, "r") as eventFile:
            # Blank lines carry no event and would later fail to serialise.
            allEvents.extend(
                str2event(line.strip(), "\t") for line in eventFile if line.strip())

    oneEventSentence = indexEventBySentence(allEvents)
    outPath = os.path.join(dataDir, "..", "train.oneEventSent.txt")
    with open(outPath, "w") as outfile:
        for event in oneEventSentence.values():
            # str2event yields [sentence, type, subtype, anchor, ...] while
            # event2str expects [type, subtype, sentence, anchor, ...].
            # Passing the parsed event straight through wrote the columns as
            # subtype/sentence/type/anchor, so reorder first.
            sentence, eventType, eventSubType, anchor = event[:4]
            canonical = [eventType, eventSubType, sentence, anchor] + event[4:]
            outfile.write(event2str(canonical, "\t") + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# This file mainly implements a basic attention model for neural event extraction | ||
# input file consists of sentences which contain one-event. | ||
import os | ||
import re | ||
import sys | ||
from collections import Counter | ||
from readEventTag import event2str | ||
from statisticCorpus import str2event | ||
|
||
import torch | ||
from torch.autograd import Variable | ||
|
||
def loadWord2Vec(modelPath):
    """Load word vectors from a word2vec-style text file.

    The first line of the file is a header made of two whitespace-separated
    fields; it is validated and then skipped.  Every following non-blank
    line has the form ``word v1 v2 ...``.

    Args:
        modelPath: path of the text embedding file.

    Returns:
        dict mapping each word to its vector (a list of floats).  When a
        word occurs more than once, the last occurrence wins.  An empty
        file yields an empty dict.

    Raises:
        ValueError: if the header does not hold exactly two fields, or a
            vector component cannot be converted to float.
    """
    word2vecModel = {}  # word -> vector
    # The context manager closes the file even when parsing fails.
    with open(modelPath, "r") as modelFile:
        headerLine = modelFile.readline()
        if not headerLine:
            # Completely empty file: nothing to load.
            return word2vecModel
        if len(headerLine.split()) != 2:
            raise ValueError(
                "malformed embedding header in %s: %r" % (modelPath, headerLine))
        for line in modelFile:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word2vecModel[fields[0]] = [float(val) for val in fields[1:]]
    return word2vecModel
|
||
def prepData(eventArr, Vocab):
    """Placeholder for turning events into model input (not implemented).

    The original definition stopped after the ``for`` header, which is a
    syntax error and kept the whole module from being parsed.  The intended
    transformation is not visible in this file, so the loop body is left as
    an explicit no-op until it is written.

    Args:
        eventArr: iterable of events.
        Vocab: vocabulary object; currently unused.

    Returns:
        None.
    """
    # TODO: implement the per-event preparation.
    for event in eventArr:
        pass
|
||
############## | ||
def getArg(args, flag):
    """Return the token that follows *flag* in *args*.

    Args:
        args: sequence of command-line tokens.
        flag: the option name to look for, e.g. ``"-train"``.

    Returns:
        The element right after the first occurrence of *flag*, or None
        when *flag* is absent or is the last token (no value follows it).
    """
    try:
        return args[args.index(flag) + 1]
    except (ValueError, IndexError):
        # ValueError: flag not present; IndexError: flag has no value.
        return None
|
||
# parse the -train and -embed values out of the command-line arguments | ||
def parseArgs(args):
    """Look up the ``-train`` and ``-embed`` options in *args*.

    Returns a two-element list ``[train, embed]`` where each entry is
    whatever ``getArg`` returns for the corresponding flag.
    """
    return [getArg(args, flag) for flag in ("-train", "-embed")]
|
||
|
||
# event = [eventtype, eventsubtype, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...] | ||
# eventstring: sentence[sep]eventtype[sep]eventsubtype[sep]anchor[sep]arg[sep]role[sep]arg[sep]role...... | ||
if __name__ == "__main__":
    # Report the script actually being run instead of a hard-coded
    # (copy-pasted) file name.
    print("Usage: python %s -train trainFile -embed embeddingFile" % sys.argv[0])
    print(sys.argv)

    # Honour the advertised flags.  The original read sys.argv[1] as the
    # training file and sys.argv[2] as the embedding file, which picks the
    # wrong tokens when the documented "-train X -embed Y" form is used.
    trainPath, embedPath = parseArgs(sys.argv)
    if trainPath is None or embedPath is None:
        usesFlags = "-train" in sys.argv or "-embed" in sys.argv
        if usesFlags or len(sys.argv) < 3:
            sys.exit("missing -train and/or -embed argument")
        # Legacy positional form: <trainFile> <embeddingFile>.
        trainPath, embedPath = sys.argv[1], sys.argv[2]

    Vocab = loadWord2Vec(embedPath)
    # One event string per line; the handle is closed once parsing is done.
    with open(trainPath, "r") as trainFile:
        eventArr = [str2event(line.strip(), "\t") for line in trainFile]
Oops, something went wrong.