Commit bd3e40a (0 parents)
Showing 7 changed files with 346,637 additions and 0 deletions.
@@ -0,0 +1 @@
{"0": "\u519c\u5bb6\u79cd\u7530", "1": "\u53e4\u8a00\u7a7f\u8d8a", "2": "\u5973\u5c0a\u5929\u4e0b", "3": "\u5e7b\u60f3\u8a00\u60c5", "4": "\u603b\u88c1\u9738\u7231", "5": "\u60c5\u6df1\u8650\u604b", "6": "\u6d6a\u6f2b\u9752\u6625", "7": "\u73b0\u4ee3\u8a00\u60c5", "8": "\u7eaf\u7231\u540c\u4eba", "9": "\u871c\u604b\u5ba0\u6587", "10": "\u91cd\u751f\u8650\u6e23", "11": "\u4fee\u771f\u5f02\u80fd", "12": "\u4fee\u771f\u72c2\u4eba", "13": "\u5175\u738b\u4f20\u5947", "14": "\u5f02\u4e16\u5947\u9047", "15": "\u60ac\u7591\u7075\u5f02", "16": "\u60ac\u7591\u7075\u5f02", "17": "\u67b6\u7a7a\u5386\u53f2", "18": "\u6b66\u4fa0\u4ed9\u4fa0", "19": "\u6e38\u620f\u7ade\u6280", "20": "\u70ed\u8840\u723d\u6587", "21": "\u7384\u5e7b\u9b54\u5e7b", "22": "\u79d1\u5e7b\u672b\u65e5", "23": "\u90fd\u5e02\u5c0f\u8bf4"} |
@@ -0,0 +1,67 @@
import os
import re
import multiprocessing

import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

process_data = False
train_word2vector = True

# ================================ Batch text preprocessing ================================
if process_data:
    rootDir = "corpus"
    filepath = "/data/books/"
    for dir in os.listdir(filepath):
        if not os.path.exists(os.path.join(rootDir, dir)):
            os.makedirs(os.path.join(rootDir, dir))
        if dir == '女频':  # skip this top-level directory
            continue
        for item in os.listdir(os.path.join(filepath, dir)):
            if not os.path.exists(os.path.join(rootDir, dir, item)):
                os.makedirs(os.path.join(rootDir, dir, item))
            with open(os.path.join(rootDir, dir, item, item + ".txt"), 'w', encoding='utf-8') as ff:
                counter = 0
                for fpath, dirs, fs in os.walk(os.path.join(filepath, dir, item)):
                    for fname in fs:
                        try:
                            with open(os.path.join(fpath, fname), "r", encoding="utf-8") as fin:
                                for line in fin:
                                    if len(line) <= 15 or len(line) > 120:
                                        continue
                                    line = re.sub(r"\s", "", line)  # drop all whitespace (including the newline)
                                    line = " ".join(jieba.cut(line))  # segment into space-separated words
                                    ff.write(line)
                                    ff.write("\n")
                                    counter += 1
                                    if counter % 10000 == 0:
                                        print(dir, item, counter)
                        except Exception as e:
                            print(fpath, fname, e)

# ================================ Batch word-vector training ================================
if train_word2vector:
    filepath = "./corpus"
    for dir in os.listdir(filepath):
        if dir == "total":
            continue
        if not os.path.exists(os.path.join("word2vector", dir)):
            os.makedirs(os.path.join("word2vector", dir))

        for item in os.listdir(os.path.join(filepath, dir)):
            if not os.path.exists(os.path.join("word2vector", dir, item)):
                os.makedirs(os.path.join("word2vector", dir, item))
            else:
                continue  # model directory already exists: assume trained, skip
            print(item, "training word vectors......")
            # gensim < 4.0 keyword names; gensim >= 4.0 renames size -> vector_size, iter -> epochs
            model = Word2Vec(LineSentence(os.path.join(filepath, dir, item, item + ".txt")),
                             size=256,
                             window=5,
                             min_count=1,
                             workers=multiprocessing.cpu_count(),  # one worker per CPU core
                             iter=10)
            model.save(os.path.join("word2vector", dir, item, "word2vector.model"))
            print(item, "training done, model saved to:", os.path.join("word2vector", dir, item, "word2vector.model"))
@@ -0,0 +1,145 @@
import os
import json
import itertools
import argparse

import numpy as np
from gensim.models import Word2Vec


def parse_args():
    # command-line options
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_keywords', type=int, default=2, help='the number of keywords')
    parser.add_argument('--most_similarity', type=int, default=5, help='how many similar words to expand each keyword with')
    parser.add_argument('--file_path', type=str, default="./corpus/total/total_segment.txt", help='filepath of corpus')
    return parser.parse_args()


def main(args):
    # choose which category of text to generate
    print("Please choose the category of text to generate:")
    print("女频 (female-oriented):")
    print("0.农家种田 1.古言穿越 2.女尊天下 3.幻想言情 4.总裁霸爱 5.情深虐恋 6.浪漫青春 7.现代言情 8.存爱同人 9.蜜恋宠文 10.重生虐渣")
    print("男频 (male-oriented):")
    print("11.修真异能 12.修真狂人 13.兵王传奇 14.异世奇遇 15.悬疑灵异 16.摸骨神医 17.架空历史 18.武侠仙侠 19.游戏竞技 20.热血爽文 21.玄幻魔幻 22.科幻末日 23.都市小说")

    with open("./category.json", "r", encoding="utf-8") as f:
        category = json.load(f)
    category_choose = input("Enter the category number: ")

    # collect word2vec model paths, skipping the auxiliary .npy vector files
    word2vector_dir = []
    for root, dirs, files in os.walk("./word2vector"):
        for name in files:
            if ".npy" not in os.path.join(root, name):
                word2vector_dir.append(os.path.join(root, name))

    # collect corpus file paths
    corpus_dir = []
    for root, dirs, files in os.walk("./corpus"):
        for name in files:
            corpus_dir.append(os.path.join(root, name))

    # pick the model and corpus whose path contains the chosen category name
    for item in word2vector_dir:
        if category[category_choose] in item:
            word2vector_path = item

    for item in corpus_dir:
        if category[category_choose] in item:
            corpus_path = item

    # load the word2vec model for the chosen category
    model = Word2Vec.load(word2vector_path)

    # read the keywords, re-prompting on out-of-vocabulary words
    string = []
    for i in range(args.num_keywords):
        temp = input("Enter keyword " + str(i + 1) + ": ")
        while temp not in model.wv.vocab:
            print("Please try a different keyword.....")
            temp = input("Enter keyword " + str(i + 1) + ": ")
        string.append(temp)

    # expand each keyword with its most similar words from word2vec
    def find_similarity_words(string):
        index_words = []
        for word in string:
            try:
                similarity_words = model.wv.most_similar(word, topn=args.most_similarity)
                temp = [pair[0] for pair in similarity_words]
                temp.append(word)  # keep the original keyword as the last element
                index_words.append(temp)
            except KeyError:
                print("Sorry, no such keyword:", word)
        return index_words

    index_words = find_similarity_words(string)

    # retrieve matching sentences, relaxing from "every keyword group must
    # match" down to fewer groups until some combination yields matches
    def find_similarity_sentence(corpus_path, index_words):
        for ii in range(len(index_words)):
            for temp in itertools.combinations(index_words, len(index_words) - ii):
                print("-" * 84)
                print("Expanded query words: ", temp)
                print("-" * 84)
                matched_sentence = []
                with open(corpus_path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = list(line.strip().split(" "))
                        if len(line) <= 100:  # skip sentences longer than 100 tokens
                            match_num = 0
                            for group in temp:
                                if len(set(group) & set(line)) > 0:
                                    match_num += 1
                            if match_num == len(index_words) - ii:
                                # map any matched similar word back to its original keyword
                                for group in index_words:
                                    for word in group:
                                        if word in line:
                                            line[line.index(word)] = group[-1]
                                print("".join(line))
                                if line not in matched_sentence:
                                    matched_sentence.append(line)
                if len(matched_sentence) != 0:
                    return matched_sentence
        return []  # no combination matched

    matched_sentence = find_similarity_sentence(corpus_path, index_words)
    if not matched_sentence:
        print("No matching sentences found.")
        return

    print("-" * 91)
    print("Ranking by relevance......")
    print("-" * 91)

    # keep the n sentences most similar to the query keywords
    def top_n(matched_sentence, n=5):
        matched_similarity = []
        for sentence in matched_sentence:
            # keep only in-vocabulary words, then score the sentence against the keywords
            in_vocab = [w for w in sentence if w in model.wv]
            matched_similarity.append(model.wv.n_similarity(string, in_vocab))
        ordered_matched = [matched_sentence[i] for i in np.argsort(-np.array(matched_similarity))]
        return ordered_matched[:n]

    ordered_matched_top_n = top_n(matched_sentence, 5)
    for item in ordered_matched_top_n:
        print("".join(item))
    print("=" * 95)


if __name__ == '__main__':
    main(parse_args())
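The top_n ranking above leans on gensim's n_similarity, which is the cosine similarity between the mean vectors of two word sets. A minimal re-implementation for clarity, not the author's code, just the underlying arithmetic:

import numpy as np

def n_similarity_sketch(model, words1, words2):
    # Mean-pool the embeddings of each word set, then take the cosine
    # of the two pooled vectors -- what KeyedVectors.n_similarity computes.
    v1 = np.mean([model.wv[w] for w in words1], axis=0)
    v2 = np.mean([model.wv[w] for w in words2], axis=0)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))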