forked from foamliu/Speech-Transformer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram_lm.py
50 lines (39 loc) · 1.23 KB
/
ngram_lm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import collections
import pickle
import nltk
from tqdm import tqdm
from config import pickle_file
# Load the pickled dataset from the path configured in `config`.
# NOTE: unpickling can execute arbitrary code, so `pickle_file` must come
# from a trusted source.
with open(pickle_file, 'rb') as fin:
    data = pickle.load(fin)

# The loaded object is indexed by the keys 'IVOCAB' and 'train'.
# Presumably 'IVOCAB' maps token ids back to characters -- TODO confirm.
char_list = data['IVOCAB']
vocab_size = len(char_list)

# Training samples; each one is later read through its 'trn' entry.
samples = data['train']
# Tally how often each pair of adjacent tokens occurs across the training set.
bigram_counter = collections.Counter()
for sample in tqdm(samples):
    # list() turns the transcript into a token sequence. The smoothing step
    # later in this file looks bigrams up by integer pairs, so 'trn'
    # presumably holds token ids rather than characters -- TODO confirm.
    tokens = list(sample['trn'])
    bigram_counter.update(nltk.bigrams(tokens))

# Sanity check: show the ten most frequent bigrams in the training data.
print(bigram_counter.most_common(10))
# Copy the observed counts into a plain dict that is filled in and
# normalized below.
bigram_freq = dict(bigram_counter)

print('smoothing')
# Give every (i, j) id pair that never occurred a count of 1, so that no
# pair ends up with zero probability once the counts are normalized.
# NOTE(review): observed counts are left untouched (this is a floor at 1,
# not add-one smoothing), so a bigram seen exactly once gets the same weight
# as an unseen one -- confirm this is intended.
for i in tqdm(range(vocab_size)):
    for j in range(vocab_size):
        # setdefault only inserts when the key is missing.
        bigram_freq.setdefault((i, j), 1)
print('freq -> prob')
# Turn each row of counts into a conditional distribution: for a fixed first
# id i, divide every (i, j) count by the row total so the row sums to 1
# (up to floating-point rounding).
for i in tqdm(range(vocab_size)):
    # Build the row's keys once; they are needed for both the sum and the
    # in-place division.
    row_keys = [(i, j) for j in range(vocab_size)]
    total = sum(bigram_freq[key] for key in row_keys)
    for key in row_keys:
        bigram_freq[key] /= total

# NOTE(review): this prints the whole table (vocab_size ** 2 entries);
# it looks like leftover debug output -- consider removing.
print(bigram_freq)

# Persist the probability table to disk.
with open('bigram_freq.pkl', 'wb') as file:
    pickle.dump(bigram_freq, file)