forked from foamliu/Speech-Transformer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram_lm.py
52 lines (41 loc) · 1.29 KB
/
ngram_lm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import collections
import pickle
import nltk
import numpy as np
from tqdm import tqdm
from config import pickle_file
# Load the pickled dataset bundle. Two entries are read from it: the
# vocabulary under 'IVOCAB' (its length defines vocab_size) and the training
# samples under 'train'.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point pickle_file at data generated by a trusted source.
with open(pickle_file, 'rb') as fin:
    data = pickle.load(fin)

char_list, samples = data['IVOCAB'], data['train']
vocab_size = len(char_list)
# Tally how often each pair of adjacent tokens occurs across every sample's
# 'trn' sequence.
# NOTE(review): the smoothing step later in this file looks bigrams up by
# integer pairs, so 'trn' presumably holds vocabulary indices -- confirm.
bigram_counter = collections.Counter()
for sample in tqdm(samples):
    # Materialise the sequence as a list before pairing up neighbours.
    tokens = list(sample['trn'])
    bigram_counter.update(nltk.bigrams(tokens))

# Quick sanity check: show the ten most frequent bigrams in the corpus.
print(bigram_counter.most_common(10))
# Snapshot the bigram counts into a plain dict (same keys, values and order)
# for the lookups performed by the code that follows.
temp_dict = dict(bigram_counter.items())
print('smoothing and freq -> prob')

# For every first token index, build a row of length vocab_size holding the
# relative frequency of each possible second token index. The result maps
# first-token index -> 1-D numpy array that sums to 1.
# NOTE(review): a bigram that was observed keeps its raw count, while an
# unobserved one is given a count of 1. This is a floor of 1 rather than
# textbook add-one smoothing (a bigram seen once weighs the same as one never
# seen) -- confirm this is intended before changing it, since consumers of the
# saved table depend on these values.
bigram_freq = {}
for first in tqdm(range(vocab_size)):
    row_counts = np.array(
        [temp_dict.get((first, second), 1) for second in range(vocab_size)]
    )
    # Normalise the counts so the row is a probability distribution.
    row_probs = row_counts / row_counts.sum()
    assert len(row_probs) == vocab_size
    bigram_freq[first] = row_probs
# Report the width of the row for index 0 as a quick shape check.
print(len(bigram_freq[0]))

# Persist the bigram probability table to disk.
with open('bigram_freq.pkl', 'wb') as fout:
    pickle.dump(bigram_freq, fout)