-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtext_processing.py
44 lines (33 loc) · 1.24 KB
/
text_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import stanza
import spacy
# Loaded NLP pipelines keyed by backend name. Building a pipeline loads model
# files, so each backend is constructed at most once per process and reused.
_PIPELINE_CACHE = {}

# Zero-argument factories for the supported backends. The lambdas defer the
# (expensive) model loading until a backend is actually requested.
_PIPELINE_LOADERS = {
    "stanza": lambda: stanza.Pipeline('en'),
    "spacy": lambda: spacy.load("en_core_web_sm"),
}


def _get_pipeline(backend):
    """Return the pipeline for *backend*, loading and caching it on first use."""
    if backend not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[backend] = _PIPELINE_LOADERS[backend]()
    return _PIPELINE_CACHE[backend]


def preprocessing(_input_text, _nlp_pipeline):
    """Tokenize and dependency-parse *_input_text* with the chosen backend.

    Args:
        _input_text: The raw text to analyse.
        _nlp_pipeline: Backend selector, either ``"stanza"`` or ``"spacy"``.

    Returns:
        A tuple ``(tokens, dependencies)``:

        * ``tokens`` is a list of token strings whose first element is the
          artificial ``"ROOT"`` entry, so the token with id ``k`` sits at
          index ``k``.
        * ``dependencies`` is a list of ``(id, relation, head)`` triples, one
          per token, where ``id`` and ``head`` are 1-based positions in
          ``tokens`` and a ``head`` of ``0`` marks a sentence root.

        For any other value of ``_nlp_pipeline`` nothing is parsed and
        ``(["ROOT"], [])`` is returned.
    """
    input_tk_list = ["ROOT"]
    input_dep_list = []

    if _nlp_pipeline == "stanza":
        doc = _get_pipeline("stanza")(_input_text)
        for sentence in doc.sentences:
            # Stanza numbers words (and heads) from 1 within each sentence.
            # Shift them by the number of words already emitted so that ids
            # keep indexing into the flat token list, as in the spaCy branch.
            offset = len(input_dep_list)
            # Iterate syntactic words rather than surface tokens: a
            # multi-word token carries no head/deprel of its own, only its
            # constituent words do.
            for word in sentence.words:
                # Head 0 is the artificial root and must not be shifted.
                head = 0 if word.head == 0 else offset + word.head
                input_tk_list.append(word.text)
                input_dep_list.append((offset + word.id, word.deprel, head))
    elif _nlp_pipeline == "spacy":
        doc = _get_pipeline("spacy")(_input_text)
        for position, token in enumerate(doc, start=1):
            relation = token.dep_
            # spaCy makes a root token its own head; report head 0 instead
            # so both backends mark roots the same way.
            head = 0 if relation == "ROOT" else token.head.i + 1
            input_tk_list.append(token.text)
            input_dep_list.append((position, relation, head))

    return input_tk_list, input_dep_list