-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_kb_bio101_terms.py
89 lines (70 loc) · 3.31 KB
/
preprocess_kb_bio101_terms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Extracts biology terms from the Bio101 knowledge base.
#
# Author: Matthew Boggess
# Version: 6/08/20
#
# Data Source: Raw dump of Bio101 KB lexicon.
#
#===================================================================================
# Libraries
import spacy
import pandas as pd
from io import StringIO
from tqdm import tqdm
#===================================================================================
# Parameters

# Input: directory holding the raw Bio101 knowledge-base dump.
life_bio_data_dir = '../data/raw_data/life_bio/bio101_kb'
# Output directory for preprocessed term lists.
term_dir = '../data/preprocessed/terms'
# Root of the preprocessed data tree.
# NOTE(review): kb_dir is not referenced anywhere in this script — confirm it
# is used by a sibling script before removing.
kb_dir = '../data/preprocessed'
# One biology-specific concept name per line; used to filter the lexicon.
bio_concepts_file = f"{life_bio_data_dir}/kb_biology_concepts.txt"
# Raw pipe-delimited lexicon dump: concept | relation | text | pos.
lexicon_input_file = f"{life_bio_data_dir}/kb_lexicon.txt"
# Output: extracted biology terms ("term -- type -- concept" per line).
terms_output_file = f"{term_dir}/kb_bio101_terms.txt"
# Output: relation terms, written to an exclude list for downstream use.
rel_terms_output_file = f"{term_dir}/kb_bio101_relations_exclude.txt"
# concepts that have too general representations
exclude_concepts = ['Aggregate', 'Center', 'Grouping-Activity', 'Normal', 'Person',
                    'Unstable-System', 'Sequence', 'Region']
#===================================================================================
if __name__ == '__main__':
    # Extract biology terms from the Bio101 KB lexicon dump and write:
    #   - terms_output_file:     "term -- entity|event -- Concept" lines
    #   - rel_terms_output_file: relation surface forms to exclude downstream
    #
    # NOTE(review): the original loaded a spaCy pipeline here
    # (spacy.load('en_core_web_sm')) but never used it — the load is removed as
    # dead code; re-add it if lemmatization is reintroduced.
    print("Extracting Life Biology KB terms")

    with open(lexicon_input_file, "r") as f:
        lexicon = f.read()
    with open(bio_concepts_file, "r") as f:
        bio_concepts = {line.strip() for line in f}

    # Parse the pipe-delimited dump; raw string avoids invalid-escape
    # SyntaxWarnings, and the regex separator requires the python engine.
    lexicon = pd.read_csv(StringIO(lexicon), sep=r"\s*\|\s*", header=None, engine='python',
                          names=['concept', 'relation', 'text', 'pos'])

    # Rows whose text is Entity/Event/Relation label the concept's type;
    # separate them from the actual term rows.
    concept_types = lexicon.query("text in ['Entity', 'Event', 'Relation']")
    lexicon = lexicon.query("text not in ['Entity', 'Event', 'Relation']")

    # Surface texts of upper-ontology (non-biology) concepts; biology terms
    # colliding with these are skipped below. Then restrict the lexicon to
    # biology concepts that are not flagged as too general.
    upper_ontology = set(lexicon[~lexicon.concept.isin(bio_concepts)].text)
    lexicon = lexicon[(lexicon.concept.isin(bio_concepts)) & (~lexicon.concept.isin(exclude_concepts))]

    terms = []
    relation_terms = set()
    seen_terms = set()
    for _, row in tqdm(list(lexicon.iterrows())):
        # Entity/Event/Relation label for this row's concept.
        term_type = concept_types.loc[concept_types.concept == row.concept, 'text'].iloc[0]
        # Normalize the surface form: drop quotes, trim, dehyphenate.
        term = row.text.replace('"', '').strip().replace('-', ' ')
        if term_type == 'Relation':
            relation_terms.add(term)
            continue
        if term in upper_ontology:
            # Diagnostic: report terms skipped for naming upper-ontology concepts.
            print(term)
        elif term not in seen_terms:
            terms.append(f"{term} -- {term_type.lower()} -- {row.concept}")
            seen_terms.add(term)
        # Also emit a human-readable form of the concept name itself.
        concept_text = row.concept.lower().replace('-', ' ').strip()
        if concept_text not in seen_terms:
            terms.append(f"{concept_text} -- {term_type.lower()} -- {row.concept}")
            seen_terms.add(concept_text)

    with open(terms_output_file, 'w') as fid:
        fid.write('\n'.join(sorted(terms)))
    with open(rel_terms_output_file, 'w') as fid:
        fid.write('\n'.join(sorted(relation_terms)))