# coding: utf-8
# Automatic question answering with an LSTM encoder-decoder (autoencoder-style seq2seq)

import sys
import struct

import jieba
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
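
# NOTE: this script assumes TensorFlow 1.x (tf.contrib.rnn, used below, was
# removed in TensorFlow 2.x) and Python 2 string handling when decoding the
# binary word-vector file.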


class MyLSTM(object):
    def __init__(self):
        self.max_abs_weight = 32                 # max absolute weight, used to normalize word vectors
        self.max_seq_len = 8                     # max sentence length (in words)
        self.word_vec_dim = 0                    # word vector dimension, set while reading vectors.bin
        self.epoch = 10000                       # number of training epochs
        self.word_vector_dict = {}               # word -> vector dict, filled by load_word_vectors()
        self.vectors_bin_file = './vectors.bin'  # binary word-vector file (word2vec format)
        self.model_dir = './model/model'         # checkpoint path prefix
        self.n_hidden = 1000                     # number of LSTM hidden units
        self.learning_rate = 0.01                # learning rate

    def load_word_vectors(self):
        """Load the binary word-vector file into memory."""
        float_size = 4  # a float32 takes 4 bytes
        max_w = 50      # max number of characters per word
        input_file = open(self.vectors_bin_file, "rb")
        # the header line holds the vocabulary size and the vector dimension
        words_and_size = input_file.readline()
        words_and_size = words_and_size.strip()
        words = int(words_and_size.split(' ')[0])
        self.word_vec_dim = int(words_and_size.split(' ')[1])
        print("vocabulary size: %d" % words)
        print("word vector dimension: %d" % self.word_vec_dim)

        for b in range(0, words):
            a = 0
            word = ''
            # read one word, byte by byte, up to the separating space
            while True:
                c = input_file.read(1)
                word = word + c
                if not c or c == ' ':  # stop at EOF or at the separator
                    break
                if a < max_w and c != '\n':
                    a = a + 1
            word = word.strip()
            vector = []

            # read word_vec_dim float32 weights for this word
            for index in range(0, self.word_vec_dim):
                m = input_file.read(float_size)
                (weight,) = struct.unpack('f', m)
                f_weight = float(weight)
                vector.append(f_weight)

            # store the word and its vector in the dict
            try:
                self.word_vector_dict[word.decode('utf-8')] = vector[0:self.word_vec_dim]
            except:
                # drop words that fail to decode
                # print('bad word:' + word)
                pass

        input_file.close()
        print("finished loading word vectors")

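    # next_batch() below reads a plain-text sample file in which every line is a
    # question and its answer separated by '|'. An illustrative example (not part
    # of the repository) of what ./samples/1 might contain:
    #   你好|你好，我是机器人
    #   今天天气怎么样|今天天气不错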
    def next_batch(self):
        """Build the training samples."""
        XY = []  # LSTM training inputs
        Y = []   # LSTM training targets
        EOS = [np.ones(self.word_vec_dim)]  # all-ones vector used as the EOS / decoder-start symbol
        sample_file_object = open('./samples/1', 'r')
        lines = sample_file_object.readlines()
        for line in lines:
            line = line.strip()
            split = line.split('|')
            if len(split) == 2:
                question = split[0]
                answer = split[1]
                print('question:[%s] answer:[%s]' % (question, answer))

                question_seq = [np.zeros(self.word_vec_dim)] * self.max_seq_len
                answer_seq = [np.zeros(self.word_vec_dim)] * self.max_seq_len
                segments = jieba.cut(question)
                for index, word in enumerate(segments):
                    if word in self.word_vector_dict:
                        vec = np.array(self.word_vector_dict[word]) / self.max_abs_weight
                        # guard against questions longer than max_seq_len
                        if self.max_seq_len - index - 1 < 0:
                            break
                        # questions shorter than max_seq_len are left-padded with zeros,
                        # and the words are stored in reverse order
                        question_seq[self.max_seq_len - index - 1] = vec

                segments = jieba.cut(answer)
                for index, word in enumerate(segments):
                    if word in self.word_vector_dict:
                        vec = np.array(self.word_vector_dict[word]) / self.max_abs_weight
                        # guard against answers longer than max_seq_len
                        if index >= self.max_seq_len:
                            break
                        answer_seq[index] = vec

                # encoder/decoder input: reversed question + EOS + answer shifted right by one
                xy = question_seq + EOS + answer_seq[0:-1]
                # decoder target: the full answer
                y = answer_seq
                XY.append(xy)
                Y.append(y)

        sample_file_object.close()

        return XY, Y

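    # model() wires up a seq2seq graph: the encoder LSTM reads the (reversed,
    # zero-padded) question and its final state initializes the decoder LSTM.
    # During training the decoder sees the ground-truth answer shifted right by
    # one step (teacher forcing), with the EOS vector as the start symbol; during
    # prediction only the EOS step is fed and each decoder output, projected by
    # weights['out'] / biases['out'], is fed back as the next input.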
    def model(self, x, y, weights, biases, predict=False):
        # encoder input: the first max_seq_len steps of x (the reversed question)
        encoder_inputs = tf.slice(x, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])
        encoder_inputs = tf.unstack(encoder_inputs, self.max_seq_len, 1)

        if predict:
            # at prediction time only the EOS step is fed to the decoder
            decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, 1, self.word_vec_dim])
            decoder_inputs = tf.unstack(decoder_inputs, 1, 1)
        else:
            # at training time the decoder sees EOS + the answer shifted right (teacher forcing)
            decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, self.max_seq_len, self.word_vec_dim])
            decoder_inputs = tf.unstack(decoder_inputs, self.max_seq_len, 1)

        target_outputs = tf.slice(y, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])
        target_outputs = tf.unstack(target_outputs, self.max_seq_len, 1)

        encoder = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
        decoder = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)

        # the encoder's final state initializes the decoder
        encoder_outputs, states = rnn.static_rnn(encoder, encoder_inputs, dtype=tf.float32, scope='encoder')
        if predict:
            decoder_output, states = rnn.static_rnn(decoder, decoder_inputs, initial_state=states, dtype=tf.float32, scope='decoder')
        else:
            decoder_outputs, states = rnn.static_rnn(decoder, decoder_inputs, initial_state=states, dtype=tf.float32, scope='decoder')

        if predict:
            # feed the decoder's own (projected) output back as the next input
            decoder_outputs = []
            decoder_outputs.append(decoder_output)

            for i in range(self.max_seq_len - 1):
                decoder_output = tf.unstack(decoder_output, axis=1)[0]
                decoder_output = tf.matmul(decoder_output, weights['out']) + tf.slice(biases['out'], [i, 0], [1, self.word_vec_dim])
                decoder_output, states = rnn.static_rnn(decoder, [decoder_output], initial_state=states, dtype=tf.float32, scope='decoder')
                decoder_outputs.append(decoder_output)
            # project the collected hidden states to word-vector space
            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
            decoder_outputs = tf.matmul(decoder_outputs, weights['out']) + biases['out']
        else:
            # project the decoder hidden states to word-vector space
            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
            decoder_outputs = tf.matmul(decoder_outputs, weights['out']) + biases['out']
            target_outputs = tf.unstack(target_outputs, axis=1)[0]

        cost = tf.losses.mean_squared_error(decoder_outputs, target_outputs)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
        return optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs

    def train(self):
        x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
        y = tf.placeholder("float", [None, self.max_seq_len, self.word_vec_dim])

        # output projection from the LSTM hidden state to word-vector space
        weights = {
            'out': tf.Variable(tf.random_normal([self.n_hidden, self.word_vec_dim]))
        }
        biases = {
            'out': tf.Variable(tf.random_normal([self.max_seq_len, self.word_vec_dim]))
        }

        optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs = self.model(x, y, weights, biases)

        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        XY, Y = self.next_batch()
        n_steps = len(XY)

        for i in range(self.epoch):
            for step in range(n_steps):
                # feed the samples from position `step` onward as one batch
                train_XY = XY[step:]
                train_Y = Y[step:]
                sess.run(optimizer, feed_dict={x: train_XY, y: train_Y})
                loss = sess.run(cost, feed_dict={x: train_XY, y: train_Y})
                if i % 100 == 0 and step == 0:
                    print('i=%d, loss=%f' % (i, loss))

        saver = tf.train.Saver()
        saver.save(sess, self.model_dir)

    def predict(self):
        x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
        y = tf.placeholder("float", [None, self.max_seq_len, self.word_vec_dim])

        weights = {
            'out': tf.Variable(tf.random_normal([self.n_hidden, self.word_vec_dim]))
        }
        biases = {
            'out': tf.Variable(tf.random_normal([self.max_seq_len, self.word_vec_dim]))
        }

        optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs = self.model(x, y, weights, biases, predict=True)

        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)
        saver = tf.train.Saver()
        saver.restore(sess, self.model_dir)

        question = '你是谁'  # test question: "who are you"
        XY = []  # LSTM inputs
        Y = []
        EOS = [np.ones(self.word_vec_dim)]
        question_seq = [np.zeros(self.word_vec_dim)] * self.max_seq_len
        segments = jieba.cut(question)
        for index, word in enumerate(segments):
            if word in self.word_vector_dict:
                vec = np.array(self.word_vector_dict[word]) / self.max_abs_weight
                # guard against questions longer than max_seq_len
                if self.max_seq_len - index - 1 < 0:
                    break
                question_seq[self.max_seq_len - index - 1] = vec

        # input: reversed question + EOS + zero padding for the decoder steps
        xy = question_seq + EOS + [np.zeros(self.word_vec_dim)] * (self.max_seq_len - 1)
        XY.append(xy)
        Y.append([np.zeros(self.word_vec_dim)] * self.max_seq_len)
        print(sess.run(decoder_outputs, feed_dict={x: XY, y: Y}))
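
    # Illustrative helper, not part of the original script: map one predicted
    # vector back to the closest vocabulary word by cosine similarity, so the raw
    # float matrix printed above can be read as text. Cosine similarity is
    # scale-invariant, so the 1/max_abs_weight scaling of the outputs does not
    # need to be undone first.
    # Usage sketch: answer_words = [self.nearest_word(v) for v in predicted_matrix]
    def nearest_word(self, vec):
        """Return the vocabulary word whose vector is most similar to vec."""
        vec = np.array(vec)
        vec_norm = np.linalg.norm(vec)
        if vec_norm == 0:
            return None
        best_word, best_sim = None, -1.0
        for word, wv in self.word_vector_dict.items():
            wv = np.array(wv)
            wv_norm = np.linalg.norm(wv)
            if wv_norm == 0:
                continue
            sim = np.dot(vec, wv) / (vec_norm * wv_norm)
            if sim > best_sim:
                best_word, best_sim = word, sim
        return best_word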


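# Usage: run this file with "train" to build and save the model (assumes
# ./vectors.bin and ./samples/1 exist and ./model/ is writable), then run it
# again with "predict" to restore the checkpoint and answer the test question.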
def main(op):
    lstm = MyLSTM()
    lstm.load_word_vectors()
    if op == 'train':
        lstm.train()
    elif op == 'predict':
        lstm.predict()
    else:
        print('Usage: %s train|predict' % sys.argv[0])

if __name__ == '__main__':
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        print('Usage: %s train|predict' % sys.argv[0])