import numpy as np


class SmoothedLDA(object):
    """
    A smoothed LDA model trained using collapsed Gibbs sampling. Generates
    posterior mean estimates for the model parameters `phi` and `theta`.

    Model Parameters
    ----------------
    T : int
        Number of topics
    D : int
        Number of documents
    N : int
        Total number of words across all documents
    V : int
        Number of unique word tokens across all documents
    phi : numpy array of shape (V, T)
        The word-topic distribution
    theta : numpy array of shape (D, T)
        The document-topic distribution
    alpha : numpy array of shape (T,)
        Parameter for the Dirichlet prior on the document-topic distribution
    beta : numpy array of shape (V,)
        Parameter for the Dirichlet prior on the topic-word distribution
    """

    def __init__(self, T, **kwargs):
        self.T = T

        self.alpha = (50.0 / self.T) * np.ones(self.T)
        if "alpha" in kwargs:
            self.alpha = kwargs["alpha"] * np.ones(self.T)

        self.beta = 0.01
        if "beta" in kwargs:
            self.beta = kwargs["beta"]

    def _init_params(self, texts, tokens):
        self.tokens = np.asarray(tokens)
        self.D = len(texts)
        self.V = len(np.unique(self.tokens))
        self.N = np.sum(np.array([len(doc) for doc in texts]))

        # map each word position in the corpus to the document it came from
        # (stored as ints so it can be used directly as an index)
        self.word_document = np.zeros(self.N, dtype=int)

        # now that we know the vocabulary size, expand beta into a full
        # Dirichlet parameter vector
        self.beta = self.beta * np.ones(self.V)

        count = 0
        for doc_idx, doc in enumerate(texts):
            for word_idx, word in enumerate(doc):
                self.word_document[count + word_idx] = doc_idx
            count = count + len(doc)

    def train(self, texts, tokens, n_gibbs=2000):
        """
        Trains a topic model on the documents in `texts`. Assumes `texts` is
        a list of subarrays, where each subarray corresponds to a separate
        document and contains the integer IDs of that document's words.
        """
        self._init_params(texts, tokens)
        C_wt, C_dt, assignments = self._gibbs_sampler(n_gibbs, texts)
        self.fit_params(C_wt, C_dt)
        return C_wt, C_dt, assignments

    def what_did_you_learn(self, top_n=10):
        """
        Prints the `top_n` most probable words for each topic
        """
        for tt in range(self.T):
            top_idx = np.argsort(self.phi[:, tt])[::-1][:top_n]
            top_tokens = self.tokens[top_idx]
            print("\nTop Words for Topic %s:\n" % (str(tt)))
            for token in top_tokens:
                print("\t%s\n" % (str(token)))

    def fit_params(self, C_wt, C_dt):
        """
        Estimate `phi`, the word-topic distribution, and `theta`, the
        document-topic distribution, from the current count matrices
        """
        self.phi = np.zeros([self.V, self.T])
        self.theta = np.zeros([self.D, self.T])

        b, a = self.beta[0], self.alpha[0]
        for ii in range(self.V):
            for jj in range(self.T):
                self.phi[ii, jj] = (C_wt[ii, jj] + b) / (
                    np.sum(C_wt[:, jj]) + self.V * b
                )

        for dd in range(self.D):
            for jj in range(self.T):
                self.theta[dd, jj] = (C_dt[dd, jj] + a) / (
                    np.sum(C_dt[dd, :]) + self.T * a
                )
        return self.phi, self.theta

    def _estimate_topic_prob(self, ii, d, C_wt, C_dt):
        """
        Compute an approximation of the conditional probability that token ii
        is assigned to topic jj given all other topic assignments and the
        current document d: p(t_i = j | t_{-i}, w_i, d_i)
        """
        p_vec = np.zeros(self.T)
        b, a = self.beta[0], self.alpha[0]
        for jj in range(self.T):
            # probability of word ii under topic jj
            frac1 = (C_wt[ii, jj] + b) / (np.sum(C_wt[:, jj]) + self.V * b)
            # probability of topic jj under document d
            frac2 = (C_dt[d, jj] + a) / (np.sum(C_dt[d, :]) + self.T * a)
            p_vec[jj] = frac1 * frac2
        return p_vec / np.sum(p_vec)
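
    # For reference, the (unnormalized) full conditional computed above is
    #   p(t_i = j | t_{-i}, w_i, d_i) ∝
    #       (C_wt[w_i, j] + beta) / (sum_w C_wt[w, j] + V * beta)
    #     * (C_dt[d_i, j] + alpha) / (sum_k C_dt[d_i, k] + T * alpha)
    # i.e. the product of the two smoothed count ratios in
    # `_estimate_topic_prob`, renormalized over topics.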

    def _gibbs_sampler(self, n_gibbs, texts):
        """
        Collapsed Gibbs sampler for estimating the posterior distribution over
        topic assignments.
        """
        # initialize count matrices
        C_wt = np.zeros([self.V, self.T])
        C_dt = np.zeros([self.D, self.T])
        assignments = np.zeros([self.N, n_gibbs + 1], dtype=int)

        # flatten the corpus once so each word position maps to its token ID
        word_ids = np.concatenate(texts).astype(int)

        # randomly initialize the topic assignment for each word
        for ii in range(self.N):
            token_idx = word_ids[ii]
            assignments[ii, 0] = np.random.randint(0, self.T)

            doc = self.word_document[ii]
            C_dt[doc, assignments[ii, 0]] += 1
            C_wt[token_idx, assignments[ii, 0]] += 1

        # run the collapsed Gibbs sampler
        for gg in range(n_gibbs):
            print("Gibbs iteration {} of {}".format(gg + 1, n_gibbs))

            for jj in range(self.N):
                token_idx = word_ids[jj]

                # decrement the count matrices to exclude word jj's current
                # assignment
                doc = self.word_document[jj]
                C_wt[token_idx, assignments[jj, gg]] -= 1
                C_dt[doc, assignments[jj, gg]] -= 1

                # draw a new topic from our approximation of the conditional
                # distribution
                p_topics = self._estimate_topic_prob(token_idx, doc, C_wt, C_dt)
                sampled_topic = np.nonzero(np.random.multinomial(1, p_topics))[0][0]

                # update the count matrices
                C_wt[token_idx, sampled_topic] += 1
                C_dt[doc, sampled_topic] += 1
                assignments[jj, gg + 1] = sampled_topic
        return C_wt, C_dt, assignments
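

# A minimal usage sketch (not part of the original module). The toy
# vocabulary, documents, and hyperparameter settings below are illustrative
# assumptions only; `texts` must contain integer word IDs that index into
# `tokens`.
if __name__ == "__main__":
    np.random.seed(12345)

    tokens = np.array(["cat", "dog", "fish", "run", "swim"])
    texts = [
        np.array([0, 1, 0, 3]),  # document 0
        np.array([2, 2, 4, 1]),  # document 1
        np.array([3, 4, 3, 2]),  # document 2
    ]

    model = SmoothedLDA(T=2, alpha=0.1, beta=0.01)
    C_wt, C_dt, assignments = model.train(texts, tokens, n_gibbs=50)
    model.what_did_you_learn(top_n=3)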