Extension 2

Aditya Kashyap committed Apr 24, 2018
1 parent 1843d32 commit 6f4c6c3
Showing 12 changed files with 66,092 additions and 42,002 deletions.
Binary file added .DS_Store
Binary file not shown.
30,153 changes: 30,153 additions & 0 deletions Extension-2/Extension_2_Features.csv

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions Extension-2/Extension_Part_1.py
@@ -104,25 +104,25 @@ def compute_cosine_similarity(vector1, vector2):
 # X_data.append(line)
 
 y_data = []
-with open("supervised_y_data_test.txt","r") as f:
+with open("supervised_y_data_train.txt","r") as f:
     data = f.read().split("\n")
     for line in data:
         y_data.append(line)
 
 entity_scores = []
 entity_sentence = []
 article_num = []
-with open("../entity_score_ranks_test.txt","r") as f:
+with open("../entity_scores_train.txt","r") as f:
     data = f.read().split("\n")
     for line in data:
         article_num.append(int(line.split("@@@")[0].strip()))
-        entity_scores.append(float(line.split("@@@")[2].strip()))
-        entity_sentence.append(line.split("@@@")[1].strip())
+        entity_scores.append(float(line.split("@@@")[1].strip()))
+        entity_sentence.append(line.split("@@@")[2].strip())
 
 article_set = set(article_num)
 
-nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat', 'tokenizer'])
-
+# nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat', 'tokenizer'])
+nlp = spacy.load('en')
 features_labels = []
 
 for article in article_set:
@@ -195,10 +195,10 @@ def compute_cosine_similarity(vector1, vector2):
         event_features_length = len('{0}'.format(text.ents).split())
         current_sentence_vect = np.transpose(np.asarray(tf_matrix_uni[j].todense()))
         FirstRel_Doc = compute_cosine_similarity(first_sentence_vect,current_sentence_vect)
-        features_labels.append({"article-sentence":str(article) + "-" + str(j),"position":position,"doc_first":doc_first,"length":length,"quote":quote,"Centroid_Uni":Centroid_uni,"Centroid_Bi":Centroid_bi,"SigTerm_Uni":SigTerm_Uni,"SigTerm_Bi":SigTerm_Bi,"FreqWord_Uni":FreqWord_Uni,"FreqWord_Bi":FreqWord_Bi,"Event_Features":event_features_length,"FirstRel_Doc":FirstRel_Doc,"Label":sentence_label,"entity_score":X_data_entity_score[j]})
+        features_labels.append({"position":position,"doc_first":doc_first,"length":length,"quote":quote,"Centroid_Uni":Centroid_uni,"Centroid_Bi":Centroid_bi,"SigTerm_Uni":SigTerm_Uni,"SigTerm_Bi":SigTerm_Bi,"FreqWord_Uni":FreqWord_Uni,"FreqWord_Bi":FreqWord_Bi,"Event_Features":event_features_length,"FirstRel_Doc":FirstRel_Doc,"Label":sentence_label,"entity_score":X_data_entity_score[j]})
 
 features_label_df = pd.DataFrame(features_labels)
-features_label_df.to_csv("Test_Data_Extension_3.csv",index=False)
+features_label_df.to_csv("Extension_2_Features.csv",index=False)
 
 end = datetime.datetime.now()
 duration = end - start
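
The index swap above is the substance of this file's change: each line of the entity-score file is split on "@@@", and the old code read the score from field 2 and the sentence from field 1, the reverse of what the rewritten writer in milestone3.py (below) actually emits. A minimal reader sketch under that assumption — fields ordered article number, score, sentence — with a guard for the blank lines the original loop would crash on:

# Sketch: parse "article @@@ score @@@ sentence" records, the order the
# rewritten writer in milestone3.py produces. Blank lines (e.g. the file's
# trailing newline) are skipped so int()/float() never see an empty string.
article_num, entity_scores, entity_sentence = [], [], []
with open("../entity_scores_train.txt", "r") as f:
    for line in f.read().split("\n"):
        if not line.strip():
            continue
        fields = [part.strip() for part in line.split("@@@")]
        article_num.append(int(fields[0]))
        entity_scores.append(float(fields[1]))   # score is field 1, not field 2
        entity_sentence.append(fields[2])
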
6 changes: 3 additions & 3 deletions Extension-2/Extension_Part_2.py
@@ -43,7 +43,7 @@ def compute_cosine_similarity(vector1, vector2):

 X_data_train = []
 y_data_train = []
-with open("Training_data.csv","r") as f:
+with open("Extension_2_Features.csv","r") as f:
     data = csv.reader(f)
     for row in data:
         X_data_train.append(row[:6] + row[7:])
@@ -78,7 +78,7 @@ def compute_cosine_similarity(vector1, vector2):
 entity_scores = []
 entity_sentence = []
 article_num = []
-with open("entity_score_ranks_test.txt","r") as f:
+with open("../entity_scores_test.txt","r") as f:
     data = f.read().split("\n")
     for line in data:
         article_num.append(int(line.split("@@@")[0].strip()))
@@ -134,7 +134,7 @@ def compute_cosine_similarity(vector1, vector2):
     SigTerm_Bi = len(np.where(tf_matrix_bi[i].todense() > 0)[0])
     FreqWord_Uni = np.sum(tf_matrix_uni_1[i].todense())
     FreqWord_Bi = np.sum(tf_matrix_bi_1[i].todense())
-    text = nlp(sentences[i])
+    text = nlp(sentences[i].decode("utf8"))
     event_features_length = len('{0}'.format(text.ents).split())
     current_sentence_vect = np.transpose(np.asarray(tf_matrix_uni[i].todense()))
     FirstRel_Doc = compute_cosine_similarity(first_sentence_vect,current_sentence_vect)
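
The added .decode("utf8") suggests these scripts run on Python 2, where a plain open() read returns byte strings and spaCy 2.x raises an error unless it receives unicode. A sketch of the same fix applied once at read time, using io.open (available on both Python 2 and 3); the file name here is illustrative, not from the commit:

# Sketch: decode at the file boundary so every sentence is already unicode
# by the time it reaches nlp(). "test_sentences.txt" is a hypothetical name.
import io

with io.open("test_sentences.txt", "r", encoding="utf8") as f:
    sentences = [line.strip() for line in f if line.strip()]
# nlp(sentences[i]) now works without a per-sentence .decode("utf8").
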
994 changes: 994 additions & 0 deletions Extension-2/y_pred_summary.txt

Large diffs are not rendered by default.

21,307 changes: 9,344 additions & 11,963 deletions entity_scores_test.txt

Large diffs are not rendered by default.

49,400 changes: 23,592 additions & 25,808 deletions entity_scores.txt → entity_scores_train.txt

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions milestone3.py
@@ -10,7 +10,8 @@
 #To run spacy, in command line: pip install spacy
 #python -m spacy download en
 
-nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat', 'tokenizer'])
+# nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat', 'tokenizer'])
+nlp = spacy.load('en')
 nlp.add_pipe(nlp.create_pipe('sentencizer'))
 
 parser.add_argument('--test_file', type=str, required=True, dest = 'test_file')
@@ -234,7 +235,6 @@ def get_character_positions(ent1, ent2):

     #print('sentence_scores')
     #print(sentence_scores)
-
     all_scores += sentence_scores
 
     sentence_scores = np.array(sentence_scores)
@@ -264,16 +264,16 @@ def get_character_positions(ent1, ent2):
 end_time = datetime.datetime.now()
 total_time = end_time - start_time
 
-with open("entity_score_ranks_test.txt","w") as f:
-    for line in entity_score_extension:
-        f.write(str(line[0]) + " @@@ " + str(line[1]) + " @@@ " + str(line[2]))
-        f.write("\n")
+# with open("entity_score_ranks_test.txt","w") as f:
+#     for line in entity_score_extension:
+#         f.write(str(line[0]) + " @@@ " + str(line[1]) + " @@@ " + str(line[2]))
+#         f.write("\n")
 
-# with open("entity_scores_test.txt","w") as f:
-#     for article, line, sentence in zip(all_articles,all_scores,all_sentences):
-#         f.write(str(int(article)) + " @@@ " + str(line) + " @@@ " + str(sentence))
+with open("entity_scores_test.txt","w") as f:
+    for article, line, sentence in zip(all_articles,all_scores,all_sentences):
+        f.write(str(int(article)) + " @@@ " + str(line) + " @@@ " + str(sentence))
+        f.write("\n")
 
 print('total running time for '+str(number_articles)+" articles is "+str(total_time))
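
Two points in this final change are worth spelling out. First, the newly active writer emits article @@@ score @@@ sentence, exactly the field order the corrected readers in Extension_Part_1.py and Extension_Part_2.py now expect, so the round trip is consistent again. Second, the spacy.load change repeated across these files is load-bearing: text.ents is populated by the 'ner' pipe, so the old disable list (which included 'ner') presumably left every entity list empty and the Event_Features count at zero, which would explain switching to the full pipeline. A lighter alternative, assuming spaCy 2.x and the 'en' shortcut model, is to disable only the pipes entity extraction does not need:

# Sketch: keep NER (required for doc.ents) but drop the parser and tagger
# for speed. Assumes spaCy 2.x with the 'en' shortcut model installed.
import spacy

nlp = spacy.load('en', disable=['parser', 'tagger'])
doc = nlp(u"Barack Obama visited Paris on Monday.")
print(doc.ents)   # entities still found, e.g. (Barack Obama, Paris, Monday)

A script that also iterates doc.sents would then need sentence boundaries from somewhere else; keeping the parser, or adding the sentencizer as milestone3.py does, covers that case.
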


(4 more changed files not shown)