Update all
wuhuxiao committed May 11, 2022
1 parent b85a846 commit fbb2d60
Showing 12 changed files with 482 additions and 97 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
cloud
data
glove
-word2vec
+word2vec
+cnn_weibo_output
82 changes: 0 additions & 82 deletions Flask.py

This file was deleted.

184 changes: 184 additions & 0 deletions Web_model.py
@@ -0,0 +1,184 @@
# Utilities for stopping a running thread
import ctypes
import inspect
import sys
import threading

from flask import Flask
from flask import request
from flask_cors import CORS
# Socket.IO support
from flask_socketio import SocketIO, emit

from cnn_model_test import queryComment
from cnn_train import trainModel


def _async_raise(tid, exctype):
    """Raise exctype inside the thread with id tid (CPython-only; uses the interpreter C-API)."""
try:
tid = ctypes.c_long(tid)
if not inspect.isclass(exctype):
exctype = type(exctype)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
elif res != 1:
# """if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect"""
ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
raise SystemError("PyThreadState_SetAsyncExc failed")
except Exception as err:
print(err)


def stop_thread(thread):
"""终止线程"""
_async_raise(thread.ident, SystemExit)
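
# Usage (hedged sketch): stop a previously started worker thread, e.g.
#   worker = threading.Thread(target=trainModel, args=(...)); worker.start()
#   stop_thread(worker)  # asynchronously raises SystemExit inside the worker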


#
# class trainJob(threading.Thread):
#     def __init__(self, *args, **kwargs):
#         super(threading.Thread, self).__init__(self)
#         # Flag used to pause the thread
#         self.__flag = threading.Event()
#         self.__flag.set()  # set to True
#         # Flag used to stop the thread
#         self.__running = threading.Event()
#         self.__running.set()  # set running to True
#
#     def run(self):
#         while self.__running.isSet():
#             self.__flag.wait()  # returns immediately when True, blocks until the flag becomes True otherwise
#             time.sleep(1)
#
#     def pause(self):
#         self.__flag.clear()  # set to False to block the thread
#
#     def resume(self):
#         self.__flag.set()  # set to True to unblock the thread
#
#     def stop(self):
#         self.__flag.set()  # resume the thread first, in case it is paused
#         self.__running.clear()  # set to False

class Logger(object):
def __init__(self, logFile="Default.log", emit=None):
self.terminal = sys.stdout
self.emit = emit
self.log = open(logFile, 'a', encoding='utf-8')

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        if self.emit is not None:  # emit is optional; skip the socket push when absent
            self.emit('train cnn', {'data': message}, broadcast=True)

def flush(self):
pass


# Web service
app = Flask(__name__)
# Allow cross-origin requests
app.config['SECRET_KEY'] = 'secret!'
socketio = SocketIO(app, cors_allowed_origins='*', threaded=True)
CORS(app, supports_credentials=True)
thread1 = None
cnn_training = False


# @socketio.on('my event', namespace='/test')
# def test_message(message):
# emit('my response', {'data': message['data']})
#
# @socketio.on('my broadcast event', namespace='/test')
# def test_message(message):
# emit('my response', {'data': message['data']}, broadcast=True)

@socketio.on('connect', namespace='/test')
def test_connect():
    emit('connect response', {'data': 'Backend connected successfully'})


@socketio.on('disconnect', namespace='/test')
def test_disconnect():
print('Client disconnected')


@app.route("/")
def hello():
global cnn_trainning
cnn_trainning = False
stop_thread(thread1)
return "Hello World!"


def emit_wrap(f):
    """Pass-through wrapper around an emit function (currently a no-op decorator)."""
    def decorator(event, *args, **kwargs):
        return f(event, *args, **kwargs)

    return decorator


@socketio.on('train cnn', namespace='/test')
def socket_trainCNNModel(message):
    print(message['data'])
    data = message['data']
    global cnn_training
    global thread1
    # Parse numeric hyperparameters with int() rather than eval() for safety
    MaxLength = int(data[0])
    WordVectorType = data[1]
    EmbeddingSize = int(data[2])
    BatchSize = int(data[3])
    Epochs = int(data[4])

    if cnn_training:
        emit('train cnn', {'data': 'The model is already training'})
    else:
        # return 'Training finished'
        cnn_training = True
        emit('train cnn',
             {'data': 'Training started with parameters MaxLength %s WordVectorType %s EmbeddingSize %s BatchSize %s Epochs %s' % (
                 MaxLength, WordVectorType, EmbeddingSize, BatchSize, Epochs)})
        # thread1 = socketio.start_background_task(trainModel, MaxLength, WordVectorType, EmbeddingSize, BatchSize,
        #                                          Epochs, socketio)
        thread1 = threading.Thread(target=trainModel,
                                   args=(MaxLength, WordVectorType, EmbeddingSize, BatchSize, Epochs, socketio))
        thread1.start()
    return 'Training finished'


# trainCNNModel?MaxLength=128&WordVectorType=word2vec&EmbeddingSize=300&BatchSize=32&Epochs=5
# @app.route("/trainCNNModel")
# def trainCNNModel():
# MaxLength = request.args.get('MaxLength')
# WordVectorType = request.args.get('WordVectorType')
# EmbeddingSize = request.args.get('EmbeddingSize')
# BatchSize = request.args.get('BatchSize')
# Epochs = request.args.get('Epochs')
#     try:
#         # return 'Training finished'
#         emit('terminal', {'data': 'Training started with parameters MaxLength %s WordVectorType %s EmbeddingSize %s BatchSize %s Epochs %s' % (
#             MaxLength, WordVectorType, EmbeddingSize, BatchSize, Epochs)})
#         # trainModel(MaxLength, WordVectorType, EmbeddingSize, BatchSize, Epochs)
#         return 'Training finished'
#     except Exception as e:
#         return str(e)


@app.route("/getCommentSenti")
def getCommentSenti():
comment = request.args.get('message')
logits = queryComment(comment)
label = logits.argmax()
    if label == 1:
        return 'Positive, confidence: %.2f' % (logits[0, 1])
    else:
        return 'Negative, confidence: %.2f' % (logits[0, 0])


if __name__ == "__main__":
# app.run('0.0.0.0', 8080, True, )
socketio.run(app, host='0.0.0.0', port=8080)
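
For reference, a minimal client sketch for driving this server (an assumption, not part of the commit; it presumes the python-socketio client package and the payload layout [MaxLength, WordVectorType, EmbeddingSize, BatchSize, Epochs] expected by socket_trainCNNModel above):

import socketio  # pip install "python-socketio[client]"

sio = socketio.Client()


@sio.on('train cnn', namespace='/test')
def on_progress(message):
    # Print each training progress message pushed by the server
    print(message['data'])


sio.connect('http://localhost:8080', namespaces=['/test'])
# Hyperparameters travel as strings, mirroring the parsing in socket_trainCNNModel
sio.emit('train cnn', {'data': ['128', 'word2vec', '300', '32', '5']}, namespace='/test')
sio.wait()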
4 changes: 2 additions & 2 deletions cnn_model_test.py
@@ -43,8 +43,8 @@ def queryComment(comment):
    words = get_segment_words([comment], stopwords)
    review_ids = tokenizer.texts_to_sequences(words)
    review_ids = preprocessing.sequence.pad_sequences(review_ids, max_len)
-   label = model.predict_classes(review_ids)
-   return label[0]
+   logits = model.predict(review_ids)
+   return logits
    # if label[0] == 1:
    #     polarity = 'positive'
    # else:
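For reference, a minimal sketch of consuming the new return value (an assumption for illustration: a 2-class softmax output of shape (1, 2), matching getCommentSenti in Web_model.py above):

import numpy as np

logits = np.array([[0.13, 0.87]])  # stand-in for model.predict(review_ids)
label = logits.argmax()            # 0 = negative, 1 = positive
print('positive' if label == 1 else 'negative', 'confidence: %.2f' % logits[0, label])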
26 changes: 22 additions & 4 deletions cnn_train.py
@@ -14,19 +14,20 @@


class Logger(object):
def __init__(self, logFile="Default.log"):
def __init__(self, logFile="Default.log", emit=None):
self.terminal = sys.stdout
+        self.emit = emit
self.log = open(logFile, 'a', encoding='utf-8')

def write(self, message):
self.terminal.write(message)
self.log.write(message)
+        self.emit('train cnn', {'data': message}, broadcast=True, namespace='/test')

def flush(self):
pass



os.environ['CUDA_VISIBLE_DEVICES'] = '/gpu:1'
keras = tf.keras
preprocessing = keras.preprocessing
@@ -119,9 +120,13 @@ def print_history(history):
plt.show()


-def trainModel(max_len, word_vector_type, embedding_size, batch_size, epochs):
+def trainModel(max_len=128, word_vector_type='word2vec', embedding_size=300, batch_size=32, epochs=5, socketio=None):
+    # Data preprocessing
+    emit = socketio.emit
+    sys.stdout = Logger("./log/web_train.txt", emit)
    print('Loading the datasets')
+    emit('train cnn', {'data': 'Loading the datasets'}, broadcast=True, namespace='/test')
train_data = pd.read_csv(os.path.join(path, './train.csv'))
dev_data = pd.read_csv(os.path.join(path, './dev.csv'))
test_data = pd.read_csv(os.path.join(path, './test.csv'))
@@ -138,22 +143,27 @@ def trainModel(max_len, word_vector_type, embedding_size, batch_size, epochs):
sentences.extend(x_test)
    # Build the vocab from the segmented words: count frequencies and assign ids (higher frequency, smaller id)
    print('Building the vocab from segmented samples')
+    emit('train cnn', {'data': 'Building the vocab from segmented samples'}, broadcast=True, namespace='/test')
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab = tokenizer.word_index
    # Use the vocab to turn each sample's word sequence into an id sequence
    print('Converting word sequences to id sequences')
+    emit('train cnn', {'data': 'Converting word sequences to id sequences'}, broadcast=True, namespace='/test')
x_train = tokenizer.texts_to_sequences(x_train)
x_dev = tokenizer.texts_to_sequences(x_dev)
x_test = tokenizer.texts_to_sequences(x_test)
    # Samples differ in length; fix each to a constant length (truncate the excess, left-pad with zeros)
    print('padding sequence')
+    emit('train cnn', {'data': 'padding sequence'}, broadcast=True, namespace='/test')
    # Given each word's position in the vocab and the position-aligned word-vector matrix, the Embedding can be realized
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_dev = preprocessing.sequence.pad_sequences(x_dev, maxlen=max_len)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=max_len)
    # Load the word vectors
    print('Loading word vectors...')
+    emit('train cnn', {'data': 'Loading word vectors...'}, broadcast=True, namespace='/test')

# wv=KeyedVectors.load_word2vec_format('/home/ydwang/word_vector/news_12g_baidubaike_20g_novel_90g_embedding_64.bin',binary=True)
if word_vector_type == 'word2vec':
wv = KeyedVectors.load_word2vec_format('./word2vec/weibo_zh_word2vec_format_' + str(embedding_size) + '.txt',
@@ -164,6 +174,8 @@ def trainModel(max_len, word_vector_type, embedding_size, batch_size, epochs):
    # Why does the embedding matrix have len(vocab) + 1 rows?
    # Because vocab ids start at 1; the +1 keeps vocab ids aligned with matrix row indices
    print('Building the embedding matrix')
+    emit('train cnn', {'data': 'Building the embedding matrix'}, broadcast=True, namespace='/test')

embedding_matrix = np.zeros((len(vocab) + 1, embedding_size))
for word, i in vocab.items():
try:
@@ -172,10 +184,12 @@ def trainModel(max_len, word_vector_type, embedding_size, batch_size, epochs):
continue
    # Initialize the network model
    print('Initializing the network model')
+    emit('train cnn', {'data': 'Initializing the network model'}, broadcast=True, namespace='/test')
model = CNN_model(max_len, len(vocab) + 1, embedding_size, embedding_matrix)
metrics = keras.metrics
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
+    emit('train cnn', {'data': 'Train...'}, broadcast=True, namespace='/test')
    # TensorBoard
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="log/", histogram_freq=1)

@@ -196,17 +210,21 @@ def trainModel(max_len, word_vector_type, embedding_size, batch_size, epochs):
report = classification_report(y_test, y_predict, digits=4)
result = str(report)
print(result)
+    emit('train cnn', {'data': result}, broadcast=True, namespace='/test')

with open(output_path + 'train_cnn_result_' + word_vector_type + '_' + str(embedding_size) + '.txt', 'w',
encoding='utf-8') as f:
f.write(result)
    # Save the network model
    model.save(output_path + 'weibo_cnn_model_' + word_vector_type + '_' + str(embedding_size) + '.h5')
    print('Model saved successfully')
+    emit('train cnn', {'data': 'Model saved successfully'}, broadcast=True, namespace='/test')

return 1


if __name__ == '__main__':
-    sys.stdout = Logger("./log/cnn_train.txt")
+    # sys.stdout = Logger("./log/cnn_train.txt")

    # Hyperparameters
max_len = 128
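Side note on the design: this commit streams training progress by routing print output through Logger plus raw socketio.emit calls. A structured alternative (a hedged sketch, not part of the commit) is a Keras callback that emits per-epoch metrics over the same 'train cnn' event:

import tensorflow as tf


class SocketIOProgress(tf.keras.callbacks.Callback):
    """Push per-epoch metrics over Socket.IO (assumes the 'train cnn' event and /test namespace used above)."""

    def __init__(self, socketio):
        super().__init__()
        self.socketio = socketio

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.socketio.emit('train cnn',
                           {'data': 'epoch %d: loss %.4f accuracy %.4f' % (
                               epoch + 1, logs.get('loss', 0.0), logs.get('accuracy', 0.0))},
                           namespace='/test')


# Usage: model.fit(..., callbacks=[SocketIOProgress(socketio)])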
File renamed without changes.
