Skip to content

Commit

Permalink
preprocessing data, extract gold event tag
Browse files Browse the repository at this point in the history
  • Loading branch information
qolina committed Jun 29, 2017
1 parent adc38fa commit a668719
Show file tree
Hide file tree
Showing 12 changed files with 3,073 additions and 0 deletions.
52 changes: 52 additions & 0 deletions data/ACE-event-argument.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
## format
#event type
#event sub type: argument roles

life
be-born: person time place
marry: person time place
divorce: person time place
injure: agent victim instrument time place
die: agent victim instrument time place

movement
transport: agent artifact vehicle price origin destination time

transaction
transfer-ownership: buyer seller beneficiary artifact price time place
transfer-money: giver recipient beneficiary money time place

business
start-org: agent org time place
merge-org: org time place
declare-bankruptcy: org time place
end-org: org time place

conflict
attack: attacker target instrument time place
demonstrate: entity time place

contact
meet: entity time place
phone-write: entity time

personnel
start-position: person entity position time place
end-position: person entity position time place
nominate: person entity position time place
elect: person entity position time place

justice
arrest-jail: person agent crime time place
release-parole: person entity crime time place
trial-hearing: defendant prosecutor adjudicator crime time place
charge-indict: defendant prosecutor adjudicator crime time place
sue: plaintiff defendant adjudicator crime time place
convict: defendant adjudicator crime time place
sentence: defendant adjudicator crime sentence time place
fine: entity adjudicator money crime time place
execute: person agent crime time place
extradite: agent person destination origin crime time
acquit: defendant adjudicator crime time place
pardon: defendant adjudicator crime time place
appeal: defendant prosecutor adjudicator crime time place
2,593 changes: 2,593 additions & 0 deletions data/train.oneEventSent.txt

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions src/bs_attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# This file mainly implements a basic attention model for neural event extraction
# input file consists of sentences which contain one-event.
import os
import re
import sys
from collections import Counter
from readEventTag import event2str
from statisticCorpus import str2event

import torch
from torch.autograd import Variable

def loadWord2Vec(modelPath):
    """Load a text-format word2vec model into a dict.

    The first line of the file is treated as the ``<wordNum> <dim>`` header
    and skipped. Every following non-blank line is split on whitespace: the
    first token is the word, the remaining tokens are parsed as floats.

    Args:
        modelPath: path of the embedding text file.

    Returns:
        dict mapping word -> list of floats. A word that appears more than
        once keeps its last vector. An empty file yields an empty dict.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    word2vecModel = {}  # word: vector
    # `with` closes the handle even when parsing fails; iterating the file
    # avoids keeping the raw text in memory next to the parsed vectors.
    with open(modelPath, "r") as modelFile:
        next(modelFile, None)  # header line, not an embedding
        for line in modelFile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of raising IndexError
            word2vecModel[fields[0]] = [float(val) for val in fields[1:]]
    return word2vecModel

def prepData(eventArr, Vocab):
    """Stub for preparing model input from the events (not implemented yet).

    The loop currently visits every event without doing any work, so the
    function always returns None. It has a body only so that the module can
    be parsed and imported.

    Args:
        eventArr: iterable of events.
        Vocab: presumably the word -> vector dict built by loadWord2Vec
            (unused so far) -- TODO confirm once implemented.
    """
    # TODO: convert each event into model input using Vocab.
    for event in eventArr:
        pass

##############
def getArg(args, flag):
    """Return the token that follows `flag` in `args`, or None.

    Args:
        args: list of command-line tokens (e.g. sys.argv).
        flag: option name to look for, e.g. "-train".

    Returns:
        The token right after the first occurrence of `flag`. None when the
        flag is absent, or when it is the last token and so has no value.
    """
    try:
        valueIndex = args.index(flag) + 1
    except ValueError:
        return None
    # A trailing flag has no value; report it as missing rather than
    # letting the lookup raise IndexError.
    if valueIndex >= len(args):
        return None
    return args[valueIndex]

# parse the -train and -embed options from the command-line arguments
def parseArgs(args):
    """Look up the training-file and embedding-file options.

    Args:
        args: list of command-line tokens.

    Returns:
        Two-element list: the getArg result for "-train" followed by the
        getArg result for "-embed".
    """
    return [getArg(args, flag) for flag in ("-train", "-embed")]


# event = [eventtype, eventsubtype, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...]
# eventstring: sentence[sep]eventtype[sep]eventsubtype[sep]anchor[sep]arg[sep]role[sep]arg[sep]role......
if __name__ == "__main__":
    # The usage banner and the raw argv are always echoed.
    print("Usage: python bs_attention.py -train trainFile -embed embeddingFile")
    print(sys.argv)

    # Resolve the paths through the advertised flags. Positional access
    # (sys.argv[1] / sys.argv[2]) would pick up the literal "-train" token
    # and load the training file as the embedding model.
    trainPath, embedPath = parseArgs(sys.argv)
    if trainPath is None or embedPath is None:
        sys.exit("Error: both -train and -embed are required")

    Vocab = loadWord2Vec(embedPath)
    # One event per line, tab-separated.
    with open(trainPath, "r") as trainFile:
        eventArr = [str2event(line.strip(), "\t") for line in trainFile]
91 changes: 91 additions & 0 deletions src/readEventTag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
import re
import sys
from xml.etree.ElementTree import ElementTree

# root: sourcefile
# second_layer: document
# third_layer: entity, timex, relation, event
def extractEvents(filename):
    """Collect every event mention annotated in one XML annotation file.

    Every <event> element found anywhere in the tree is handed to
    extractEvent, and the per-event results are concatenated.

    Args:
        filename: path of the XML file to parse.

    Returns:
        List of events, one per event mention, each laid out as
        [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), ...].
        The list is empty when the file holds no <event> element.
    """
    xmlTree = ElementTree(file=filename)

    eventArrOneDoc = []
    for eventEle in xmlTree.iter(tag="event"):
        eventArrOneDoc.extend(extractEvent(eventEle))

    # Echo the first event as a sample. Guarded because a document without
    # events would otherwise raise IndexError before the caller can skip it.
    if eventArrOneDoc:
        print(event2str(eventArrOneDoc[0], "\t"))
    return eventArrOneDoc

# event = [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...]
# output: sentence[sep]eventType[sep]eventSubtype[sep]anchor[sep]arg[sep]role[sep]arg[sep]role......
def event2str(event, separator):
    """Serialize one event into a single separator-joined string.

    Args:
        event: [eventType, eventSubType, sentence_ldc_scope, anchor,
            (arg, role), (arg, role), ...].
        separator: string placed between all output fields.

    Returns:
        sentence, eventType, eventSubType, anchor, then arg and role of each
        argument in order, all joined with `separator`.
    """
    eventType, eventSubType, sentence_ldc_scope, anchor = event[:4]
    # The sentence moves to the front of the serialized form.
    fields = [sentence_ldc_scope, eventType, eventSubType, anchor]
    for argument in event[4:]:
        fields.append(argument[0] + separator + argument[1])
    return separator.join(fields)

# fourth_layer(event): [optional]event_argument, event_mention
# fifth_layer(event_mention): extent, ldc_scope, anchor, [optional]event_mention_argument
# sixth_layer(event_mention_argument): extent
# event = [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...]
def extractEvent(eventEle):
    """Turn one <event> element into a list of events, one per mention.

    Only direct children tagged "event_mention" are used. Within a mention
    the sentence is read from the first child of its second child element
    (ldc_scope) and the anchor from the first child of its third child
    element. Each direct "event_mention_argument" child contributes its ROLE
    attribute and the text of its first grandchild.

    Args:
        eventEle: <event> element carrying TYPE and SUBTYPE attributes.

    Returns:
        List of [eventType, eventSubType, sentence_ldc_scope, anchor,
        (arg, role), ...]. Newlines in every text are replaced by spaces;
        the sentence is additionally stripped and gets a trailing ".".
    """
    def oneLine(text):
        # Texts may span several lines in the XML; flatten them.
        return re.sub(r"\n", " ", text)

    eventType = eventEle.attrib["TYPE"]
    eventSubType = eventEle.attrib["SUBTYPE"]

    eventArr = []
    for mention in eventEle.findall("event_mention"):
        sentence_ldc_scope = oneLine(mention[1][0].text).strip() + "."
        anchor = oneLine(mention[2][0].text)
        event = [eventType, eventSubType, sentence_ldc_scope, anchor]

        for argumentEle in mention.findall("event_mention_argument"):
            argRole = argumentEle.attrib["ROLE"]
            event.append((oneLine(argumentEle[0][0].text), argRole))
        eventArr.append(event)
    return eventArr

if __name__ == "__main__":
    print("Usage: python readEventTag.py dataDir")
    if len(sys.argv) < 2:
        sys.exit(1)

    dataDir = sys.argv[1]
    for filename in sorted(os.listdir(dataDir)):
        # os.path.join works whether or not dataDir ends with a separator.
        filePath = os.path.join(dataDir, filename)
        if not (os.path.isfile(filePath) and filename.endswith(".apf.xml")):
            continue
        print("## Processing  %s" % filename)
        eventArrOneDoc = extractEvents(filePath)
        if not eventArrOneDoc:
            continue
        # Writing the per-document ".ee" file is currently disabled.
        # NOTE: the suffix is removed by slicing; str.strip(".apf.xml") would
        # strip a set of characters from both ends and can mangle the name.
        #outfilename = os.path.join(dataDir, filename[:-len(".apf.xml")] + ".ee")
        #with open(outfilename, "w") as outfile:
        #    for event in eventArrOneDoc:
        #        outfile.write(event2str(event, "\t") + "\n")
        #print("## Events written to  %s" % outfilename)
Binary file added src/readEventTag.pyc
Binary file not shown.
19 changes: 19 additions & 0 deletions src/splitTrainTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

import os
import sys

def docIdFromFilename(name):
    """Return `name` without a trailing ".apf.xml" or ".sgm" suffix.

    str.strip(".sgm") must not be used for this: it removes any of the
    characters '.', 's', 'g', 'm' from BOTH ends, so the ".sgm" and
    ".apf.xml" files of one document can end up with different ids
    (e.g. names starting with "alt." or "misc.").

    Args:
        name: a file name, with or without one of the two suffixes.

    Returns:
        The name with the suffix cut off, or `name` unchanged when it has
        neither suffix.
    """
    for suffix in (".apf.xml", ".sgm"):
        if name.endswith(suffix):
            return name[:-len(suffix)]
    return name


if __name__ == "__main__":
    print("Usage: python splitTrainTest.py dataDir testFilenames")
    # One test document name per line; a set gives O(1) membership tests.
    with open(sys.argv[2], "r") as testList:
        testFiles = set(docIdFromFilename(line.strip()) for line in testList)

    dataPath = sys.argv[1]
    for fileItem in os.listdir(dataPath):
        srcPath = os.path.join(dataPath, fileItem)
        if os.path.isdir(srcPath):
            continue
        print("## Processing  %s" % fileItem)
        # Files of listed documents go to test/, everything else to train/.
        subDir = "test" if docIdFromFilename(fileItem) in testFiles else "train"
        os.rename(srcPath, os.path.join(dataPath, subDir, fileItem))
57 changes: 57 additions & 0 deletions src/statisticCorpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
import re
import sys
from collections import Counter
from readEventTag import event2str

# event = [eventType, eventSubType, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...]
# eventString: sentence[sep]eventType[sep]eventSubType[sep]anchor[sep]arg[sep]role[sep]arg[sep]role......
def str2event(eventString, separator):
    """Parse one separator-joined event line.

    Expected input layout:
        sentence[sep]eventType[sep]eventSubType[sep]anchor[sep]arg[sep]role...

    Args:
        eventString: the serialized event.
        separator: the field separator used in `eventString`.

    Returns:
        [sentence, eventType, eventSubType, anchor, (arg, role), ...].
        The first four fields keep the order they have in the string, so the
        sentence comes first. This is NOT the type-first layout that
        event2str takes as input. Fields after the fourth are paired up in
        order; an unpaired trailing field is ignored.
    """
    arr = eventString.split(separator)
    event = arr[:4]
    arguments = arr[4:]
    # zip over the even/odd slices pairs (arg, role) and stops at the shorter
    # slice. It replaces range(len(arguments)/2), whose float argument raises
    # TypeError on Python 3.
    event.extend(zip(arguments[0::2], arguments[1::2]))
    return event

def indexEventBySentence(eventArr):
    """Group events by sentence and keep the sentences with exactly one event.

    Also prints corpus statistics: the number of events, the number of
    distinct sentences, and a histogram of events per sentence with the
    percentage of sentences in each bucket.

    Args:
        eventArr: list of events whose first element (event[0]) is the
            sentence text, i.e. the layout returned by str2event.

    Returns:
        dict mapping sentence -> its only event, for every sentence that
        occurs in exactly one event.
    """
    sentHash = {}
    for event in eventArr:
        sentHash.setdefault(event[0], []).append(event)

    oneEventSentence = dict(
        (sent, events[0]) for sent, events in sentHash.items() if len(events) == 1
    )
    sentNumAll = len(sentHash)
    # Computed once and reused for both the summary line and the table.
    histogram = Counter(len(events) for events in sentHash.values()).most_common()

    # Single-argument print calls behave the same on Python 2 and 3.
    print("## ori #event %d" % len(eventArr))
    print("## #sentence %d" % sentNumAll)
    print("## #events-in-sents: %s" % (histogram,))
    for eventNum, sentNum in histogram:
        print("%s \t%s \t%s" % (eventNum, sentNum, sentNum * 100.0 / sentNumAll))
    return oneEventSentence

if __name__ == "__main__":
    print("Usage: python statisticCorpus.py dataDir")
    print(sys.argv)
    if len(sys.argv) < 2:
        sys.exit(1)

    dataDir = sys.argv[1]
    allEvents = []
    for filename in sorted(os.listdir(dataDir)):
        # os.path.join works whether or not dataDir ends with a separator.
        filePath = os.path.join(dataDir, filename)
        if not (os.path.isfile(filePath) and filename.endswith(".ee")):
            continue
        with open(filePath, "r") as eeFile:
            allEvents.extend(str2event(line.strip(), "\t") for line in eeFile)

    oneEventSentence = indexEventBySentence(allEvents)
    outPath = os.path.join(dataDir, os.pardir, "train.oneEventSent.txt")
    with open(outPath, "w") as outfile:
        for event in oneEventSentence.values():
            # Events from str2event are already sentence-first. event2str
            # assumes a type-first event and would shuffle the first columns
            # (subType, sentence, type, anchor), so the fields are written
            # in their existing order instead.
            fields = list(event[:4])
            for arg, role in event[4:]:
                fields.extend((arg, role))
            outfile.write("\t".join(fields) + "\n")
47 changes: 47 additions & 0 deletions srcData/bs_attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# This file mainly implements a basic attention model for neural event extraction
# input file consists of sentences which contain one-event.
import os
import re
import sys
from collections import Counter
from readEventTag import event2str
from statisticCorpus import str2event

import torch
from torch.autograd import Variable

def loadWord2Vec(modelPath):
    """Load a text-format word2vec model into a dict.

    The first line of the file is treated as the ``<wordNum> <dim>`` header
    and skipped. Every following non-blank line is split on whitespace: the
    first token is the word, the remaining tokens are parsed as floats.

    Args:
        modelPath: path of the embedding text file.

    Returns:
        dict mapping word -> list of floats. A word that appears more than
        once keeps its last vector. An empty file yields an empty dict.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    word2vecModel = {}  # word: vector
    # `with` closes the handle even when parsing fails; iterating the file
    # avoids keeping the raw text in memory next to the parsed vectors.
    with open(modelPath, "r") as modelFile:
        next(modelFile, None)  # header line, not an embedding
        for line in modelFile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of raising IndexError
            word2vecModel[fields[0]] = [float(val) for val in fields[1:]]
    return word2vecModel

def prepData(eventArr, Vocab):
    """Stub for preparing model input from the events (not implemented yet).

    The loop currently visits every event without doing any work, so the
    function always returns None. It has a body only so that the module can
    be parsed and imported.

    Args:
        eventArr: iterable of events.
        Vocab: presumably the word -> vector dict built by loadWord2Vec
            (unused so far) -- TODO confirm once implemented.
    """
    # TODO: convert each event into model input using Vocab.
    for event in eventArr:
        pass

##############
def getArg(args, flag):
    """Return the token that follows `flag` in `args`, or None.

    Args:
        args: list of command-line tokens (e.g. sys.argv).
        flag: option name to look for, e.g. "-train".

    Returns:
        The token right after the first occurrence of `flag`. None when the
        flag is absent, or when it is the last token and so has no value.
    """
    try:
        valueIndex = args.index(flag) + 1
    except ValueError:
        return None
    # A trailing flag has no value; report it as missing rather than
    # letting the lookup raise IndexError.
    if valueIndex >= len(args):
        return None
    return args[valueIndex]

# parse the -train and -embed options from the command-line arguments
def parseArgs(args):
    """Look up the training-file and embedding-file options.

    Args:
        args: list of command-line tokens.

    Returns:
        Two-element list: the getArg result for "-train" followed by the
        getArg result for "-embed".
    """
    return [getArg(args, flag) for flag in ("-train", "-embed")]


# event = [eventtype, eventsubtype, sentence_ldc_scope, anchor, (arg, role), (arg, role), ...]
# eventstring: sentence[sep]eventtype[sep]eventsubtype[sep]anchor[sep]arg[sep]role[sep]arg[sep]role......
if __name__ == "__main__":
    # The usage banner and the raw argv are always echoed.
    print("Usage: python bs_attention.py -train trainFile -embed embeddingFile")
    print(sys.argv)

    # Resolve the paths through the advertised flags. Positional access
    # (sys.argv[1] / sys.argv[2]) would pick up the literal "-train" token
    # and load the training file as the embedding model.
    trainPath, embedPath = parseArgs(sys.argv)
    if trainPath is None or embedPath is None:
        sys.exit("Error: both -train and -embed are required")

    Vocab = loadWord2Vec(embedPath)
    # One event per line, tab-separated.
    with open(trainPath, "r") as trainFile:
        eventArr = [str2event(line.strip(), "\t") for line in trainFile]
Loading

0 comments on commit a668719

Please sign in to comment.