forked from preetyverma20/Amazing-Python-Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
32 lines (24 loc) · 918 Bytes
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import spacy
import neologdn
class EnglishCorpus:
    """Split English text into sentences and space-joined token strings.

    Intended call order: ``preprocessing`` -> ``make_sentence_list`` ->
    ``make_corpus``. ``make_corpus`` works on the sentences produced by the
    most recent ``make_sentence_list`` call.
    """

    # Preparation of morphological analyzer
    def __init__(self, model_name: str = "en_core_web_sm"):
        """Load the spaCy pipeline used for sentence and token analysis.

        Args:
            model_name: Name handed to ``spacy.load``. The default is the
                model this class originally hard-coded.
        """
        self.nlp = spacy.load(model_name)
        # Sentences from the latest make_sentence_list() call. Starts empty
        # so make_corpus() returns [] instead of raising AttributeError when
        # it is called before any text has been analysed.
        self.ginza_sents_object = []

    # Pre-processing of line breaks and special characters
    def preprocessing(self, text: str) -> str:
        """Remove line breaks from ``text`` and normalise it with neologdn.

        Args:
            text: Raw input text.

        Returns:
            The text with every ``"\\n"`` removed, passed through
            ``neologdn.normalize``.
        """
        # NOTE(review): newlines are deleted rather than replaced by a space,
        # so two words separated only by a line break end up joined. Kept
        # as-is for compatibility -- confirm this is intended for English.
        without_newlines = text.replace("\n", "")
        return neologdn.normalize(without_newlines)

    # Divide text into sentences while retaining the analysis results
    def make_sentence_list(self, sentences: str) -> list:
        """Run the pipeline on ``sentences`` and return its sentence spans.

        The sentences are also remembered on the instance so that
        ``make_corpus`` can be called afterwards.

        Args:
            sentences: Text to analyse (may contain several sentences).

        Returns:
            A list with one item per sentence yielded by ``doc.sents``.
        """
        doc = self.nlp(sentences)
        # doc.sents is a one-shot iterator: materialise it once so that
        # make_corpus() can be called repeatedly without coming back empty.
        self.ginza_sents_object = list(doc.sents)
        # Hand back a copy so callers mutating the result cannot corrupt the
        # sentences stored for make_corpus().
        return list(self.ginza_sents_object)

    # Put a space between words
    def make_corpus(self) -> list:
        """Return each stored sentence as its tokens joined by single spaces.

        Returns:
            One string per sentence from the latest ``make_sentence_list``
            call; an empty list if no text has been analysed yet.
        """
        return [
            " ".join(str(token) for token in sentence)
            for sentence in self.ginza_sents_object
        ]