Commit bd3e40a

updata

tantao258 committed Jul 10, 2018 (0 parents)
Showing 7 changed files with 346,637 additions and 0 deletions.
1 change: 1 addition & 0 deletions generation_by_word2vector_replace/edit_01/category.json
@@ -0,0 +1 @@
{"0": "\u519c\u5bb6\u79cd\u7530", "1": "\u53e4\u8a00\u7a7f\u8d8a", "2": "\u5973\u5c0a\u5929\u4e0b", "3": "\u5e7b\u60f3\u8a00\u60c5", "4": "\u603b\u88c1\u9738\u7231", "5": "\u60c5\u6df1\u8650\u604b", "6": "\u6d6a\u6f2b\u9752\u6625", "7": "\u73b0\u4ee3\u8a00\u60c5", "8": "\u7eaf\u7231\u540c\u4eba", "9": "\u871c\u604b\u5ba0\u6587", "10": "\u91cd\u751f\u8650\u6e23", "11": "\u4fee\u771f\u5f02\u80fd", "12": "\u4fee\u771f\u72c2\u4eba", "13": "\u5175\u738b\u4f20\u5947", "14": "\u5f02\u4e16\u5947\u9047", "15": "\u60ac\u7591\u7075\u5f02", "16": "\u60ac\u7591\u7075\u5f02", "17": "\u67b6\u7a7a\u5386\u53f2", "18": "\u6b66\u4fa0\u4ed9\u4fa0", "19": "\u6e38\u620f\u7ade\u6280", "20": "\u70ed\u8840\u723d\u6587", "21": "\u7384\u5e7b\u9b54\u5e7b", "22": "\u79d1\u5e7b\u672b\u65e5", "23": "\u90fd\u5e02\u5c0f\u8bf4"}
67 changes: 67 additions & 0 deletions generation_by_word2vector_replace/edit_01/data_process.py
@@ -0,0 +1,67 @@
import os
import re
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


process_data = False
train_word2vector = True

# ============================== Text batch preprocessing ==============================
if process_data:
    rootDir = "corpus"
    filepath = "/data/books/"
    for genre in os.listdir(filepath):
        if genre == '女频':
            continue
        if not os.path.exists(os.path.join(rootDir, genre)):
            os.makedirs(os.path.join(rootDir, genre))
        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join(rootDir, genre, item)):
                os.makedirs(os.path.join(rootDir, genre, item))
            with open(os.path.join(rootDir, genre, item, item + ".txt"), 'w', encoding='utf-8') as ff:
                counter = 0
                for fpath, dirs, fs in os.walk(os.path.join(filepath, genre, item)):
                    for fname in fs:
                        try:
                            with open(os.path.join(fpath, fname), "r", encoding="utf-8") as fin:
                                for line in fin:
                                    # keep only lines of a reasonable length
                                    if len(line) <= 15 or len(line) > 120:
                                        continue
                                    line = re.sub(r"\s", "", line)  # strip whitespace (this also removes the newline)
                                    line = " ".join(jieba.cut(line))  # segment with jieba
                                    ff.write(line)
                                    ff.write("\n")
                                    counter += 1
                                    if counter % 10000 == 0:
                                        print(genre, item, counter)
                        except Exception:
                            print(fpath, fname)

# ============================== Batch word2vec training ==============================
if train_word2vector:
    filepath = "./corpus"
    for genre in os.listdir(filepath):
        if genre == "total":
            continue
        if not os.path.exists(os.path.join("word2vector", genre)):
            os.makedirs(os.path.join("word2vector", genre))

        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join("word2vector", genre, item)):
                os.makedirs(os.path.join("word2vector", genre, item))
            else:
                continue  # a model for this category already exists, skip it
            print(item, "training word vectors......")
            model = Word2Vec(LineSentence(os.path.join(filepath, genre, item, item + ".txt")),
                             size=256,
                             window=5,
                             min_count=1,
                             workers=50,
                             iter=10)
            model.save(os.path.join("word2vector", genre, item, "word2vector.model"))
            print(item, "word vectors trained, model saved to:", os.path.join("word2vector", genre, item, "word2vector.model"))
145 changes: 145 additions & 0 deletions generation_by_word2vector_replace/edit_01/main.py
@@ -0,0 +1,145 @@
import os
import json
import itertools
import argparse
import numpy as np
from gensim.models import Word2Vec


def parse_args():
    # build the command-line argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_keywords', type=int, default=2, help='the number of keywords')
    parser.add_argument('--most_similarity', type=int, default=5, help='number of similar words kept per keyword')
    parser.add_argument('--file_path', type=str, default="./corpus/total/total_segment.txt", help='filepath of corpus')
    return parser.parse_args()


def main(args):
    # choose the genre of text to generate
    print("Please choose the genre of text to generate:")
    print("女频:")
    print("0.农家种田 1.古言穿越 2.女尊天下 3.幻想言情 4.总裁霸爱 5.情深虐恋 6.浪漫青春 7.现代言情 8.纯爱同人 9.蜜恋宠文 10.重生虐渣")
    print("男频:")
    print(
        "11.修真异能 12.修真狂人 13.兵王传奇 14.异世奇遇 15.悬疑灵异 16.摸骨神医 17.架空历史 18.武侠仙侠 19.游戏竞技 20.热血爽文 21.玄幻魔幻 22.科幻末日 23.都市小说")

    # ---------------------------------------------------------------------
    with open("./category.json", "r", encoding="utf-8") as f:
        category = json.load(f)
    category_choose = input("Enter the category number: ")

    # collect word2vec model paths (skip the cached .npy arrays)
    word2vector_dir = []
    for root, dirs, files in os.walk("./word2vector"):
        for name in files:
            if ".npy" not in os.path.join(root, name):
                word2vector_dir.append(os.path.join(root, name))

    # collect corpus paths
    corpus_dir = []
    for root, dirs, files in os.walk("./corpus"):
        for name in files:
            corpus_dir.append(os.path.join(root, name))

    for item in word2vector_dir:
        if category[category_choose] in item:
            word2vector_path = item

    for item in corpus_dir:
        if category[category_choose] in item:
            corpus_path = item

    # load the word2vec model for the chosen category
    model = Word2Vec.load(word2vector_path)

    # ----------------------------------------------------------------------
    # read the keywords from the user
    string = []
    for i in range(args.num_keywords):
        temp = input("Enter keyword " + str(i + 1) + ": ")
        while temp not in model.wv.vocab:
            print("Please try another keyword.....")
            temp = input("Enter keyword " + str(i + 1) + ": ")
        string.append(temp)

    # expand each keyword with its nearest neighbours from word2vec
    def find_similarity_words(string):
        index_words = []
        for i in range(len(string)):
            try:
                similarity_words = model.wv.most_similar([string[i]])[:args.most_similarity]
                temp = [item[0] for item in similarity_words]
                temp.append(string[i])
                index_words.append(temp)
            except KeyError:
                print("Sorry, no such keyword:", string[i])
        return index_words

    index_words = find_similarity_words(string)

    # print("Expanded query terms:", index_words)

    # retrieve sentences that contain the expanded query terms,
    # relaxing the match from all keyword groups down to fewer ones
    def find_similarity_sentence(corpus_path, index_words):
        for ii in range(len(index_words)):
            for temp in itertools.combinations(index_words, len(index_words) - ii):
                print("------------------------------------------------------------------------------------")
                print("Expanded query terms:", temp)
                print("------------------------------------------------------------------------------------")
                # sentence retrieval
                matched_sentence = []
                with open(corpus_path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = list(line.strip().split(" "))
                        if len(line) <= 100:  # skip sentences longer than 100 tokens
                            match_num = 0
                            for item in temp:
                                if len(set(item) & set(line)) > 0:
                                    match_num += 1
                            if match_num == len(index_words) - ii:
                                # replace each matched neighbour with its original keyword
                                for i in index_words:
                                    for j in i:
                                        if j in line:
                                            line[line.index(j)] = i[-1]
                                print("".join(line))
                                if line not in matched_sentence:
                                    matched_sentence.append(line)
                if len(matched_sentence) != 0:
                    return matched_sentence

    matched_sentence = find_similarity_sentence(corpus_path, index_words)

    # rank the matches by similarity to the query
    print("-------------------------------------------------------------------------------------------")
    print("Ranking by relevance......")
    print("-------------------------------------------------------------------------------------------")

    # keep the top_n highest-scoring sentences
    def top_n(matched_sentence, n=5):
        matched_similarity = []
        for i in range(len(matched_sentence)):
            temp = []
            for w in matched_sentence[i]:
                if w in model.wv.vocab:
                    temp.append(w)

            matched_similarity.append(model.wv.n_similarity(string, temp))
        ordered_matched = [matched_sentence[i] for i in np.argsort(-np.array(matched_similarity))]
        if len(ordered_matched) > n:
            ordered_matched_top_n = ordered_matched[0:n]
        else:
            ordered_matched_top_n = ordered_matched
        return ordered_matched_top_n

    ordered_matched_top_n = top_n(matched_sentence, 5)
    for item in ordered_matched_top_n:
        print("".join(item))
        print("===============================================================================================")


# ===========================================================================================================

if __name__ == '__main__':
    main(parse_args())
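For reference, the ranking primitive behind top_n is n_similarity, the cosine similarity between the mean vectors of two word lists. A minimal sketch under the same gensim 3.x API, with a hypothetical model path and hypothetical in-vocabulary tokens:

from gensim.models import Word2Vec

model = Word2Vec.load("word2vector/男频/玄幻魔幻/word2vector.model")  # hypothetical path

query = ["主角", "秘籍"]                       # hypothetical user keywords
candidate = ["主角", "得到", "一本", "秘籍"]   # a tokenized candidate sentence

# Filter to in-vocabulary tokens, as top_n does, then score the pair.
tokens = [w for w in candidate if w in model.wv.vocab]
print(model.wv.n_similarity(query, tokens))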
1 change: 1 addition & 0 deletions generation_by_word2vector_replace/edit_02/category.json
@@ -0,0 +1 @@
{"0": "\u519c\u5bb6\u79cd\u7530", "1": "\u53e4\u8a00\u7a7f\u8d8a", "2": "\u5973\u5c0a\u5929\u4e0b", "3": "\u5e7b\u60f3\u8a00\u60c5", "4": "\u603b\u88c1\u9738\u7231", "5": "\u60c5\u6df1\u8650\u604b", "6": "\u6d6a\u6f2b\u9752\u6625", "7": "\u73b0\u4ee3\u8a00\u60c5", "8": "\u7eaf\u7231\u540c\u4eba", "9": "\u871c\u604b\u5ba0\u6587", "10": "\u91cd\u751f\u8650\u6e23", "11": "\u4fee\u771f\u5f02\u80fd", "12": "\u4fee\u771f\u72c2\u4eba", "13": "\u5175\u738b\u4f20\u5947", "14": "\u5f02\u4e16\u5947\u9047", "15": "\u60ac\u7591\u7075\u5f02", "16": "\u60ac\u7591\u7075\u5f02", "17": "\u67b6\u7a7a\u5386\u53f2", "18": "\u6b66\u4fa0\u4ed9\u4fa0", "19": "\u6e38\u620f\u7ade\u6280", "20": "\u70ed\u8840\u723d\u6587", "21": "\u7384\u5e7b\u9b54\u5e7b", "22": "\u79d1\u5e7b\u672b\u65e5", "23": "\u90fd\u5e02\u5c0f\u8bf4"}
67 changes: 67 additions & 0 deletions generation_by_word2vector_replace/edit_02/data_process.py
@@ -0,0 +1,67 @@
import os
import re
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


process_data = False
train_word2vector = True

# ============================== Text batch preprocessing ==============================
if process_data:
    rootDir = "corpus"
    filepath = "/data/books/"
    for genre in os.listdir(filepath):
        if genre == '女频':
            continue
        if not os.path.exists(os.path.join(rootDir, genre)):
            os.makedirs(os.path.join(rootDir, genre))
        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join(rootDir, genre, item)):
                os.makedirs(os.path.join(rootDir, genre, item))
            with open(os.path.join(rootDir, genre, item, item + ".txt"), 'w', encoding='utf-8') as ff:
                counter = 0
                for fpath, dirs, fs in os.walk(os.path.join(filepath, genre, item)):
                    for fname in fs:
                        try:
                            with open(os.path.join(fpath, fname), "r", encoding="utf-8") as fin:
                                for line in fin:
                                    # keep only lines of a reasonable length
                                    if len(line) <= 15 or len(line) > 120:
                                        continue
                                    line = re.sub(r"\s", "", line)  # strip whitespace (this also removes the newline)
                                    line = " ".join(jieba.cut(line))  # segment with jieba
                                    ff.write(line)
                                    ff.write("\n")
                                    counter += 1
                                    if counter % 10000 == 0:
                                        print(genre, item, counter)
                        except Exception:
                            print(fpath, fname)

# ============================== Batch word2vec training ==============================
if train_word2vector:
    filepath = "./corpus"
    for genre in os.listdir(filepath):
        if genre == "total":
            continue
        if not os.path.exists(os.path.join("word2vector", genre)):
            os.makedirs(os.path.join("word2vector", genre))

        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join("word2vector", genre, item)):
                os.makedirs(os.path.join("word2vector", genre, item))
            else:
                continue  # a model for this category already exists, skip it
            print(item, "training word vectors......")
            model = Word2Vec(LineSentence(os.path.join(filepath, genre, item, item + ".txt")),
                             size=256,
                             window=5,
                             min_count=1,
                             workers=50,
                             iter=10)
            model.save(os.path.join("word2vector", genre, item, "word2vector.model"))
            print(item, "word vectors trained, model saved to:", os.path.join("word2vector", genre, item, "word2vector.model"))