Commit a9235eb: upd 0815
qolina committed Aug 15, 2017
1 parent f9d4cc8
Showing 6 changed files with 593 additions and 198 deletions.
2 changes: 1 addition & 1 deletion src/cmd.trigger
@@ -1 +1 @@
python trigger_lstm.py -train ../ni_data/pre_processed_feng/tmp.train -test ../ni_data/pre_processed_feng/tmp.test -tag ../ni_data/pre_processed_feng/labellist -embed ../ni_data/pre_processed_feng/wordvector -vocab ../ni_data/pre_processed_feng/wordlist
python trigger_lstm.py -train ../ni_data/pre_processed_feng/tmp.train -test ../ni_data/pre_processed_feng/tmp.test -tag ../ni_data/pre_processed_feng/labellist -embed ../ni_data/pre_processed_feng/wordvector -vocab ../ni_data/pre_processed_feng/wordlist -model ../ni_data/models/model.trigger
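
The only change here is the new -model flag, which points the training script at a path for saving the trained model. For orientation, below is a minimal, hypothetical sketch of how such flags could be consumed with argparse; the real option handling lives in trigger_lstm.py (not shown in this commit) and may differ.

    # Hypothetical sketch only: mirrors the flags used in cmd.trigger, not the
    # actual parser in trigger_lstm.py.
    import argparse

    parser = argparse.ArgumentParser(description="LSTM trigger labeling")
    parser.add_argument("-train", help="training data file")
    parser.add_argument("-test", help="test data file")
    parser.add_argument("-tag", help="label list file")
    parser.add_argument("-embed", help="pretrained word vector file")
    parser.add_argument("-vocab", help="word list file")
    parser.add_argument("-model", help="path to save the trained model (new in this commit)")
    args = parser.parse_args()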
115 changes: 101 additions & 14 deletions src/eventExtFromRaw.py
@@ -4,12 +4,90 @@
import cPickle
from aceEventUtil import event2str
from xml.etree.ElementTree import ElementTree
from sgmllib import SGMLParser
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize
import nltk.data

# root: sourcefile
class aceParser(SGMLParser):
    content = ""
    line_content = []
    def init(self, path):
        self.path = path
    def handle_data(self, text):
        text = text.replace("&", "&----")
        self.line_content.append(text)
        self.content += text

def parseSGML(filename):
    debug = True
    sgm_parser = aceParser()

    file_content = open(filename, "r").read()
    sgm_parser.content = ""
    sgm_parser.line_content = []
    sgm_parser.feed(file_content)
    content = sgm_parser.content
    line_content = sgm_parser.line_content
    content = content.replace("\n", " ")
    if filename.find("FLOPPINGACES_20041114.1240.03") >= 0:
        content = content.replace("&", "&----")
    line_content = [line.replace("&----", "&") for line in line_content]
    sentences = []

    sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
    # test output
    if debug:
        #idx = (1527, 1536)
        idx = (718, 909)
        print "# direct from new content:", content[idx[0]:idx[1]+1]

    sent_id = 0
    line_id = 0
    while line_id < len(line_content):
        line = line_content[line_id]
        pre_content = "".join(line_content[:line_id])
        char_st = len(pre_content)

        while line_id < len(line_content)-2 and line_content[line_id+1] == "&----":
            line = line + line_content[line_id+1] + line_content[line_id+2]
            line_id += 2
        line_id += 1
        line = line.replace("\n", " ")
        char_ed = char_st + len(line)
        if debug:
            print "-----------------------------", line_id, (char_st, char_ed)
            print "S-"+line+"-E"

        if len(line.strip())<1: continue

        #sents_in_line = line.split("\n\n")
        #print line
        sents_in_line = sent_tokenize(line)
        last_end = 0
        #sents_in_line = sent_detector.tokenize(line)
        for sent in sents_in_line:
            sent = sent.replace("\n", " ").strip()
            sent_st_in_line = line.find(sent, last_end)
            sent_ed_in_line = sent_st_in_line + len(sent) - 1
            last_end = sent_ed_in_line
            sent_st = char_st + sent_st_in_line
            sent_ed = sent_st + len(sent) - 1
            sent_id += 1
            if debug:
                print "------##", sent_id, (sent_st_in_line, sent_ed_in_line), (sent_st, sent_ed)
                print sent
            sentences.append(((sent_st, sent_ed), sent))
    #for sent_id, (sent_span, sent) in enumerate(sentences[:]):
    #    print "##", sent_id, sent_span, sent
    return sentences[3:], content
    #return line_content

# root: sourcefile *.sgm eventfilename *.apf.xml
# second_layer: document
# third_layer: entity, timex, relation, event
def extractEvents(filename):
    xmlTree = ElementTree(file=filename)

    xmlTree = ElementTree(file=filename[:-3]+"apf.xml")
    #root = xmlTree.getroot()

    eventArrOneDoc = []
@@ -48,12 +126,14 @@ def extractEvent(eventEle):
        sentence_ldc_scope = sentenceElement.text
        sentence_ldc_scope = re.sub(r"\n", " ", sentence_ldc_scope).strip()
        sentence_index = (int(sentenceElement.attrib["START"]), int(sentenceElement.attrib["END"]))
        sentence = (sentence_ldc_scope, (0, sentence_index[1]-sentence_index[0]))
        #sentence = (sentence_ldc_scope, (0, sentence_index[1]-sentence_index[0]))
        sentence = (sentence_ldc_scope, sentence_index)

        anchorEle = eventMention[2][0]
        anchorText = anchorEle.text
        anchorText = re.sub(r"\n", " ", anchorText)
        anchor_index = (int(anchorEle.attrib["START"])-sentence_index[0], int(anchorEle.attrib["END"])-sentence_index[0])
        #anchor_index = (int(anchorEle.attrib["START"])-sentence_index[0], int(anchorEle.attrib["END"])-sentence_index[0])
        anchor_index = (int(anchorEle.attrib["START"]), int(anchorEle.attrib["END"]))
        anchor = (anchorText, anchor_index)
        #print "----Sentence", sentence
        #print "----Anchor", anchor
@@ -66,30 +146,37 @@ def extractEvent(eventEle):
            argElement = eventMentionArgument[0][0]
            argText = argElement.text
            argText = re.sub(r"\n", " ", argText)
            arg_index = (int(argElement.attrib["START"])-sentence_index[0], int(argElement.attrib["END"])-sentence_index[0])
            #arg_index = (int(argElement.attrib["START"])-sentence_index[0], int(argElement.attrib["END"])-sentence_index[0])
            arg_index = (int(argElement.attrib["START"]), int(argElement.attrib["END"]))
            arg = (argText, argRole, arg_index)
            event.append(arg)
            #print arg
        eventArr.append(event)
    return eventArr

if __name__ == "__main__":
    print "Usage: python readEventTag.py dataDir"

def main():
    dataDir = sys.argv[1]
    fileList = sorted(os.listdir(dataDir))
    for filename in fileList:
        if not (os.path.isfile(dataDir + filename) and filename.endswith(".apf.xml")): continue
        print "## Processing ", filename
    line_num = 0
    for filename in fileList[:]:
        if not (os.path.isfile(dataDir + filename) and filename.endswith(".sgm")): continue
        sentences_in_doc, content = parseSGML(dataDir+filename)
        line_num += len(sentences_in_doc)
        eventArrOneDoc = extractEvents(dataDir+filename)
        if len(eventArrOneDoc) == 0: continue
        outfilename = dataDir + filename.replace("apf.xml", "ee")
        #if len(eventArrOneDoc) == 0: continue
        outfilename = dataDir + filename[:-3]+"ee"
        outfile = open(outfilename, "w")
        cPickle.dump(content, outfile)
        cPickle.dump(sentences_in_doc, outfile)
        cPickle.dump(eventArrOneDoc, outfile)
        ## to string version for output
        #for event in eventArrOneDoc:
        #    print event
        #    eventString = event2str(event, "|||")
        #    outfile.write(eventString + "\n")
        outfile.close()
        print "## Events writen to ", outfilename
        #print "## Events writen to ", outfilename
    #print line_num
if __name__ == "__main__":
    #print "Usage: python readEventTag.py dataDir"
    main()
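
Note on the new output format: main() now pickles three objects per document into a .ee file, in this order: the flattened document content, the sentence list with character spans from parseSGML, and the event array from extractEvents (with anchor and argument offsets now kept in document coordinates). A minimal sketch of reading one such file back, assuming Python 2 and a purely illustrative file name:

    # Sketch under assumptions: loads the three objects in the same order they
    # were dumped above; the path is illustrative only.
    import cPickle

    infile = open("../ni_data/pre_processed_feng/example.ee", "rb")
    content = cPickle.load(infile)            # full document text, newlines replaced by spaces
    sentences_in_doc = cPickle.load(infile)   # list of ((sent_st, sent_ed), sentence) pairs
    eventArrOneDoc = cPickle.load(infile)     # events with document-level character offsets
    infile.close()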
93 changes: 80 additions & 13 deletions src/lstm_trigger.py
@@ -4,15 +4,16 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

class LSTMTrigger(nn.Module):
    def __init__(self, pretrain_embedding, pretrain_embed_dim, hidden_dim, vocab_size, tagset_size, dropout, bilstm, num_layers, random_dim, gpu):
    def __init__(self, pretrain_embedding, pretrain_embed_dim, lstm_hidden_dim, vocab_size, tagset_size, dropout, bilstm, num_layers, random_dim, gpu, conv_width1=2, conv_width2=3, conv_filter_num=0, hidden_dim_snd=0):
        super(LSTMTrigger, self).__init__()

        embedding_dim = pretrain_embed_dim
        self.hidden_dim = hidden_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.random_embed = False
        if random_dim >= 50:
            self.word_embeddings = nn.Embedding(vocab_size, random_dim)
@@ -24,36 +25,75 @@ def __init__(self, pretrain_embedding, pretrain_embed_dim, hidden_dim, vocab_siz
        if pretrain_embedding is not None:
            self.word_embeddings.weight.data.copy_(torch.from_numpy(pretrain_embedding))

        #self.word_embeddings.weight.requires_grad = False
        self.drop = nn.Dropout(dropout)
        self.bilstm_flag = bilstm
        self.lstm_layer = num_layers

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=self.lstm_layer, bidirectional=self.bilstm_flag)
        if self.bilstm_flag:
            self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        # conv layer
        self.cnn_flag = True
        self.in_channels = embedding_dim
        self.out_channels = conv_filter_num
        self.kernal_size1 = conv_width1
        self.kernal_size2 = conv_width2
        self.conv1 = nn.Conv1d(self.in_channels, self.out_channels, self.kernal_size1)
        self.conv2 = nn.Conv1d(self.in_channels, self.out_channels, self.kernal_size2)

        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, num_layers=self.lstm_layer, bidirectional=self.bilstm_flag)
        self.hidden_dim_fst = lstm_hidden_dim
        if self.bilstm_flag: self.hidden_dim_fst *= 2
        if self.cnn_flag: self.hidden_dim_fst += self.out_channels*2

        if hidden_dim_snd == 0:
            self.hidden_dim_snd = self.hidden_dim_fst
        else:
            self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
            self.hidden_dim_snd = hidden_dim_snd

        self.fst_hidden = nn.Linear(self.hidden_dim_fst, self.hidden_dim_snd)
        self.hidden2tag = nn.Linear(self.hidden_dim_snd, tagset_size)
        self.hidden2tag_iden = nn.Linear(self.hidden_dim_snd, 2)
        if gpu:
            self.drop = self.drop.cuda()
            self.word_embeddings = self.word_embeddings.cuda()
            self.lstm = self.lstm.cuda()
            self.conv1 = self.conv1.cuda()
            self.conv2 = self.conv2.cuda()
            self.fst_hidden = self.fst_hidden.cuda()
            self.hidden2tag = self.hidden2tag.cuda()
            self.hidden2tag_iden = self.hidden2tag_iden.cuda()

        self.hidden = self.init_hidden(gpu)

    def init_hidden(self, gpu):
        if self.bilstm_flag:
            h0 = autograd.Variable(torch.zeros(2*self.lstm_layer, 1, self.hidden_dim))
            c0 = autograd.Variable(torch.zeros(2*self.lstm_layer, 1, self.hidden_dim))
            dims = (2*self.lstm_layer, 1, self.lstm_hidden_dim)
        else:
            h0 = autograd.Variable(torch.zeros(self.lstm_layer, 1, self.hidden_dim))
            c0 = autograd.Variable(torch.zeros(self.lstm_layer, 1, self.hidden_dim))

            dims = (self.lstm_layer, 1, self.lstm_hidden_dim)
        init_value = np.random.uniform(-0.01, 0.01, dims)
        h0 = autograd.Variable(torch.Tensor(init_value))
        c0 = autograd.Variable(torch.Tensor(init_value))

        if gpu:
            h0 = h0.cuda()
            c0 = c0.cuda()
        return (h0,c0)

    # from: Variable of sent_length*embedding_dim
    # to: Variable of batch_size*embedding_dim*sent_length
    def lstmformat2cnn(self, inputs, gpu):
        sent_length = inputs.size()[0]
        batch_size = 1
        inputs = inputs.view(sent_length, batch_size, -1) # sent_length*batch_size*embedding_dim
        inputs = inputs.transpose(0, 1).transpose(1, 2) # batch_size*embedding_dim*sent_length
        return inputs

    # from: batch_size*out_channels*1
    # to: 1*out_channels
    def cnnformat2lstm(self, outputs, gpu):
        outputs = outputs.transpose(1, 2).transpose(0, 1) # 1*batch_size*out_channels
        outputs = outputs.view(1, self.out_channels)
        return outputs

    def forward(self, sentence, gpu):
        self.hidden = self.init_hidden(gpu)

@@ -70,12 +110,39 @@ def forward(self, sentence, gpu):
        embeds = Variable(embeds.float())
        if gpu: embeds = embeds.cuda()
        #print embeds
        # conv forward
        if self.cnn_flag:
            inputs = self.lstmformat2cnn(embeds, gpu)
            self.maxp1 = nn.MaxPool1d(len(sentence)-self.kernal_size1+1)
            self.maxp2 = nn.MaxPool1d(len(sentence)-self.kernal_size2+1)
            c1 = self.conv1(inputs) # batch_size*out_channels*(sent_length-conv_width+1)
            p1 = self.maxp1(c1) # batch_size * out_channels * 1
            c2 = self.conv2(inputs)
            p2 = self.maxp2(c2)
            c1_embed = self.cnnformat2lstm(p1, gpu)
            c2_embed = self.cnnformat2lstm(p2, gpu)

        embeds = self.drop(embeds)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        lstm_out = lstm_out.view(len(sentence), -1)
        hidden_in = lstm_out
        if self.cnn_flag:
            #c1_embed.data = c1_embed.data.expand(len(sentence), c1_embed.size()[1])
            #c2_embed.data = c2_embed.data.expand(len(sentence), c2_embed.size()[1])
            #hidden_in = torch.cat((lstm_out.data, c1_embed.data, c2_embed.data), 1)
            c1_embed = c1_embed.expand(len(sentence), c1_embed.size()[1])
            c2_embed = c2_embed.expand(len(sentence), c2_embed.size()[1])
            hidden_in = torch.cat((lstm_out, c1_embed, c2_embed), 1)
            #hidden_in = Variable(hidden_in)
        #print hidden_in
        #print type(hidden_in)

        hidden_snd = self.fst_hidden(hidden_in)
        hidden_snd = F.relu(hidden_snd)
        tag_space = self.hidden2tag(hidden_snd)
        tag_scores = F.log_softmax(tag_space)
        return tag_scores
        tag_space_iden = self.hidden2tag_iden(hidden_snd)
        return tag_space, tag_scores, tag_space_iden
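
The forward pass now returns three values instead of one: tag_space (pre-softmax scores per token), tag_scores (log-probabilities), and tag_space_iden (scores from the new 2-way hidden2tag_iden head, presumably trigger identification). The training code that consumes them is in trigger_lstm.py and is not shown in this commit; the snippet below is a self-contained, hypothetical sketch of combining the two heads into a joint loss, with random tensors standing in for a 5-token sentence.

    # Hypothetical sketch only: fake tensors play the role of the three outputs of
    # LSTMTrigger.forward so the example runs on its own.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch import autograd

    sent_len, tagset_size = 5, 10
    tag_space = autograd.Variable(torch.randn(sent_len, tagset_size), requires_grad=True)
    tag_scores = F.log_softmax(tag_space)          # same transform as in forward()
    tag_space_iden = autograd.Variable(torch.randn(sent_len, 2), requires_grad=True)

    targets_t = torch.LongTensor([0, 0, 3, 0, 0])  # toy per-token trigger labels
    targets = autograd.Variable(targets_t)
    iden_targets = autograd.Variable((targets_t > 0).long())  # assumed 0/1 "is a trigger" labels

    loss = nn.NLLLoss()(tag_scores, targets)                          # classification head
    loss_iden = nn.CrossEntropyLoss()(tag_space_iden, iden_targets)   # identification head
    (loss + loss_iden).backward()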


