Skip to content

Commit 0ffb01d

Browse files
author
李闯
committed
chatbotv3
1 parent 1557268 commit 0ffb01d

File tree

1 file changed

+248
-0
lines changed

1 file changed

+248
-0
lines changed

chatbotv3/encoder_decoder_seq2seq.py

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
# coding: utf-8
2+
# 自动编解码器实现自动问答
3+
4+
import os
import struct
import sys

import jieba
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
10+
11+
12+
class MyLSTM(object):
    """Encoder-decoder (seq2seq) LSTM that maps a question to an answer.

    Sentences are segmented with jieba, each word is replaced by its
    pre-trained word vector, and the network is trained to regress the answer
    word vectors with a mean-squared-error loss.

    The graph is built in TensorFlow's default graph, so one process should
    call either ``train`` or ``predict``, not both.
    """

    def __init__(self):
        # Divisor applied to every word vector to normalise its components.
        self.max_abs_weight = 32
        # Maximum sentence length, in words.
        self.max_seq_len = 8
        # Word-vector dimension; set from the header of the vectors file.
        self.word_vec_dim = 0
        # Number of passes over the training samples.
        self.epoch = 10000
        # word (unicode) -> list of floats, filled by load_word_vectors().
        self.word_vector_dict = {}
        # Binary word-vector file.
        self.vectors_bin_file = './vectors.bin'
        # Checkpoint path prefix used by tf.train.Saver.
        self.model_dir = './model/model'
        # Number of hidden units in each LSTM cell.
        self.n_hidden = 1000
        # Learning rate of the Adam optimizer.
        self.learning_rate = 0.01

    @staticmethod
    def _read_word(input_file):
        """Read bytes up to the next space (or EOF) and return them stripped."""
        chars = []
        # iter() with a b'' sentinel stops at end of file instead of spinning
        # forever on the empty reads a truncated file produces.
        for char in iter(lambda: input_file.read(1), b''):
            if char == b' ':
                break
            chars.append(char)
        # A newline may separate the previous vector from this word.
        return b''.join(chars).strip()

    def load_word_vectors(self):
        """Load the binary word-vector file into ``self.word_vector_dict``.

        The file starts with a text header line ``"<word count> <dimension>"``.
        Each entry is the word's bytes, one space, then ``dimension``
        native-order 4-byte floats. Words that are not valid UTF-8 are skipped.

        Raises:
            ValueError: if the header line does not hold two integers.
            struct.error: if the file ends before all vectors were read.
        """
        with open(self.vectors_bin_file, 'rb') as input_file:
            # Header: vocabulary size and vector dimension.
            header = input_file.readline().split()
            if len(header) < 2:
                raise ValueError(
                    'bad header in %s: expected "<words> <dim>"'
                    % self.vectors_bin_file)
            words = int(header[0])
            self.word_vec_dim = int(header[1])
            print("词表总词数:%d" % words)
            print("词向量维度:%d" % self.word_vec_dim)

            # Decode a whole vector with one unpack call.
            vector_format = '%df' % self.word_vec_dim
            vector_size = struct.calcsize(vector_format)

            for _ in range(words):
                word = self._read_word(input_file)
                # A short read makes unpack raise struct.error.
                weights = struct.unpack(vector_format,
                                        input_file.read(vector_size))
                try:
                    key = word.decode('utf-8')
                except UnicodeDecodeError:
                    # Drop words whose bytes are not valid UTF-8.
                    continue
                self.word_vector_dict[key] = [float(w) for w in weights]

        print("finish")

    def _sentence_to_vectors(self, sentence, reverse=False):
        """Turn a sentence into ``max_seq_len`` normalised word vectors.

        Words beyond ``max_seq_len`` are ignored, and positions whose word is
        not in the vocabulary stay zero. With ``reverse=True`` the first word
        is stored in the last slot, so short sentences are zero-padded at the
        front and stored back to front.
        """
        seq = [np.zeros(self.word_vec_dim) for _ in range(self.max_seq_len)]
        for index, word in enumerate(jieba.cut(sentence)):
            if index >= self.max_seq_len:
                # Guard against sentences longer than the fixed window.
                break
            if word in self.word_vector_dict:
                slot = self.max_seq_len - index - 1 if reverse else index
                seq[slot] = (np.array(self.word_vector_dict[word])
                             / self.max_abs_weight)
        return seq

    def next_batch(self, sample_file='./samples/1'):
        """Read the training samples.

        Each usable line of ``sample_file`` has the form ``question|answer``;
        lines that do not split into exactly two fields are ignored.

        Returns:
            (XY, Y): ``XY[k]`` is the reversed question, an all-ones EOS
            vector and the answer without its last word
            (``2 * max_seq_len`` vectors). ``Y[k]`` is the full answer
            (``max_seq_len`` vectors).
        """
        XY = []  # network inputs
        Y = []   # regression targets
        EOS = [np.ones(self.word_vec_dim)]
        with open(sample_file, 'r') as sample_file_object:
            for line in sample_file_object:
                split = line.strip().split('|')
                if len(split) != 2:
                    continue
                question, answer = split
                print('question:[%s] answer:[%s]' % (question, answer))

                question_seq = self._sentence_to_vectors(question, reverse=True)
                answer_seq = self._sentence_to_vectors(answer)

                XY.append(question_seq + EOS + answer_seq[0:-1])
                Y.append(answer_seq)

        return XY, Y

    def model(self, x, y, weights, biases, predict=False):
        """Build the encoder-decoder graph.

        Only the first sample of the fed batch is used (every slice has batch
        size 1).

        Args:
            x: placeholder ``[None, 2 * max_seq_len, word_vec_dim]``; the
                first half is the question, the second half the decoder input.
            y: placeholder ``[None, max_seq_len, word_vec_dim]`` with targets.
            weights: dict whose ``'out'`` entry projects hidden states to
                word-vector space.
            biases: dict whose ``'out'`` entry holds one bias row per output
                position.
            predict: when True, only the EOS vector is read from ``x`` and
                each later decoder input is the previous projected output.

        Returns:
            ``(optimizer, cost, decoder_outputs, target_outputs,
            encoder_inputs, decoder_inputs)``. ``optimizer`` and ``cost`` are
            ``None`` when ``predict`` is True.
        """
        encoder_inputs = tf.slice(x, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])
        encoder_inputs = tf.unstack(encoder_inputs, self.max_seq_len, 1)

        # At inference time only the EOS step is known up front.
        decoder_steps = 1 if predict else self.max_seq_len
        decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, decoder_steps, self.word_vec_dim])
        decoder_inputs = tf.unstack(decoder_inputs, decoder_steps, 1)

        target_outputs = tf.slice(y, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])
        target_outputs = tf.unstack(target_outputs, self.max_seq_len, 1)

        encoder = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
        decoder = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)

        encoder_outputs, states = rnn.static_rnn(encoder, encoder_inputs, dtype=tf.float32, scope='encoder')
        # The decoder starts from the encoder's final state.
        step_outputs, states = rnn.static_rnn(decoder, decoder_inputs, initial_state=states, dtype=tf.float32, scope='decoder')

        optimizer = None
        cost = None

        if predict:
            decoder_output = step_outputs
            decoder_outputs = [decoder_output]

            # Feed each projected output back in as the next decoder input.
            for i in range(self.max_seq_len - 1):
                decoder_output = tf.unstack(decoder_output, axis=1)[0]
                decoder_output = tf.matmul(decoder_output, weights['out']) + tf.slice(biases['out'], [i, 0],
                                                                                      [1, self.word_vec_dim])
                decoder_output, states = rnn.static_rnn(decoder, [decoder_output], initial_state=states,
                                                        dtype=tf.float32,
                                                        scope='decoder')
                decoder_outputs.append(decoder_output)
            # Drop the two singleton axes, then project every step at once.
            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
            decoder_outputs = tf.matmul(decoder_outputs, weights['out']) + biases['out']
        else:
            decoder_outputs = tf.unstack(step_outputs, axis=1)[0]
            decoder_outputs = tf.matmul(decoder_outputs, weights['out']) + biases['out']
            target_outputs = tf.unstack(target_outputs, axis=1)[0]

            # The loss and optimizer only exist for training: in predict mode
            # target_outputs is still a list of per-step tensors.
            cost = tf.losses.mean_squared_error(labels=target_outputs, predictions=decoder_outputs)
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
        return optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs

    def _build_inputs(self):
        """Create the placeholders and output-projection variables.

        ``train`` and ``predict`` share this so both create the variables in
        the same order.
        """
        x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
        y = tf.placeholder("float", [None, self.max_seq_len, self.word_vec_dim])
        weights = {
            'out': tf.Variable(tf.random_normal([self.n_hidden, self.word_vec_dim]))
        }
        biases = {
            'out': tf.Variable(tf.random_normal([self.max_seq_len, self.word_vec_dim]))
        }
        return x, y, weights, biases

    def train(self):
        """Train on the samples from ``next_batch`` and save a checkpoint.

        The model sees one sample per step. The loss is printed on the first
        step of every 100th epoch, and the checkpoint is written to
        ``self.model_dir`` once all epochs have run.
        """
        # Saver.save does not create missing parent directories, so create
        # it now rather than fail after training.
        model_parent = os.path.dirname(self.model_dir)
        if model_parent and not os.path.isdir(model_parent):
            os.makedirs(model_parent)

        x, y, weights, biases = self._build_inputs()
        optimizer, cost, _, _, _, _ = self.model(x, y, weights, biases)
        init = tf.global_variables_initializer()

        XY, Y = self.next_batch()
        # Convert once here instead of on every sess.run call.
        XY = np.asarray(XY, dtype=np.float32)
        Y = np.asarray(Y, dtype=np.float32)
        n_steps = len(XY)

        with tf.Session() as sess:
            sess.run(init)

            for i in range(self.epoch):
                for step in range(n_steps):
                    # The graph reads only the first sample of the batch, so
                    # feed exactly one.
                    feed = {x: XY[step:step + 1], y: Y[step:step + 1]}
                    sess.run(optimizer, feed_dict=feed)
                    if i % 100 == 0 and step == 0:
                        # Evaluate the loss only when it is reported.
                        loss = sess.run(cost, feed_dict=feed)
                        print('i=%d, loss=%f' % (i, loss))

            saver = tf.train.Saver()
            saver.save(sess, self.model_dir)

    def predict(self, question='你是谁'):
        """Restore the checkpoint and decode an answer for ``question``.

        Args:
            question: sentence to answer; defaults to the original
                hard-coded sample question.

        Returns:
            The decoder outputs from ``sess.run``: one predicted word vector
            per output position. They are also printed.
        """
        x, y, weights, biases = self._build_inputs()
        _, _, decoder_outputs, _, _, _ = self.model(x, y, weights, biases, predict=True)
        init = tf.global_variables_initializer()

        EOS = [np.ones(self.word_vec_dim)]
        question_seq = self._sentence_to_vectors(question, reverse=True)
        # Only the EOS step of the decoder half is read in predict mode; the
        # rest is zero padding to match the placeholder shape.
        padding = [np.zeros(self.word_vec_dim)] * (self.max_seq_len - 1)
        XY = [question_seq + EOS + padding]
        Y = [[np.zeros(self.word_vec_dim)] * self.max_seq_len]

        with tf.Session() as sess:
            sess.run(init)
            saver = tf.train.Saver()
            saver.restore(sess, self.model_dir)
            outputs = sess.run(decoder_outputs, feed_dict={x: XY, y: Y})

        print(outputs)
        return outputs
233+
234+
def main(op):
    """Run the operation named by ``op``.

    Args:
        op: ``'train'`` to train and save the model, or ``'predict'`` to
            restore it and answer the built-in sample question. Any other
            value prints a usage message.
    """
    # Validate first so a typo does not cost a full word-vector load.
    if op not in ('train', 'predict'):
        print('Usage: python %s train|predict' % sys.argv[0])
        return

    lstm = MyLSTM()
    lstm.load_word_vectors()
    if op == 'train':
        lstm.train()
    else:
        lstm.predict()
243+
244+
# Script entry point: expects exactly one argument naming the operation.
if __name__ == '__main__':
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        # Name the accepted operations instead of printing a bare "Usage:".
        print('Usage: python %s train|predict' % sys.argv[0])

0 commit comments

Comments
 (0)