-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathnb_vis1.py
30 lines (28 loc) · 1.21 KB
/
nb_vis1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import codecs
import json
from collections import Counter

import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora, models, similarities
# Train a 20-topic LDA model on the documents in the input file and render
# the result with pyLDAvis inside a notebook.

# Each line of the input file is treated as one document.
with codecs.open("../input/hafez_Train3cls_cls3.txt", "r", "UTF-8") as myfile:
    documents = myfile.readlines()

# Split the stop-word file into a set of whole words. Testing membership
# against the raw file contents would be a *substring* test, which also drops
# any token that merely occurs inside a longer stop word.
with codecs.open("../../stop-words_persian_1_fa.txt", "r", "UTF-8") as myfile:
    stoplist = set(myfile.read().split())

# Tokenize on whitespace, lower-case, and drop stop words.
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Remove words that appear only once in the whole corpus. Counting with a
# Counter is a single linear pass instead of one list.count() scan per word.
all_tokens = [word for text in texts for word in text]
token_counts = Counter(all_tokens)
tokens_once = {word for word, count in token_counts.items() if count == 1}
texts = [[word for word in text if word not in tokens_once] for text in texts]

# Map tokens to integer ids and convert every document to bag-of-words form.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
# Bare expression on the last line so the notebook displays the visualization.
vis