Commit bd3e40a

updata

tantao258 committed Jul 10, 2018 (0 parents)
Showing 7 changed files with 346,637 additions and 0 deletions.
1 change: 1 addition & 0 deletions generation_by_word2vector_replace/edit_01/category.json
@@ -0,0 +1 @@
{"0": "\u519c\u5bb6\u79cd\u7530", "1": "\u53e4\u8a00\u7a7f\u8d8a", "2": "\u5973\u5c0a\u5929\u4e0b", "3": "\u5e7b\u60f3\u8a00\u60c5", "4": "\u603b\u88c1\u9738\u7231", "5": "\u60c5\u6df1\u8650\u604b", "6": "\u6d6a\u6f2b\u9752\u6625", "7": "\u73b0\u4ee3\u8a00\u60c5", "8": "\u7eaf\u7231\u540c\u4eba", "9": "\u871c\u604b\u5ba0\u6587", "10": "\u91cd\u751f\u8650\u6e23", "11": "\u4fee\u771f\u5f02\u80fd", "12": "\u4fee\u771f\u72c2\u4eba", "13": "\u5175\u738b\u4f20\u5947", "14": "\u5f02\u4e16\u5947\u9047", "15": "\u60ac\u7591\u7075\u5f02", "16": "\u60ac\u7591\u7075\u5f02", "17": "\u67b6\u7a7a\u5386\u53f2", "18": "\u6b66\u4fa0\u4ed9\u4fa0", "19": "\u6e38\u620f\u7ade\u6280", "20": "\u70ed\u8840\u723d\u6587", "21": "\u7384\u5e7b\u9b54\u5e7b", "22": "\u79d1\u5e7b\u672b\u65e5", "23": "\u90fd\u5e02\u5c0f\u8bf4"}
67 changes: 67 additions & 0 deletions generation_by_word2vector_replace/edit_01/data_process.py
@@ -0,0 +1,67 @@
import os
import re
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


process_data = False
train_word2vector = True

# ============================== Text batch preprocessing ==============================
if process_data:
    rootDir = "corpus"
    filepath = "/data/books/"
    for genre in os.listdir(filepath):
        if genre == '女频':
            continue
        if not os.path.exists(os.path.join(rootDir, genre)):
            os.makedirs(os.path.join(rootDir, genre))
        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join(rootDir, genre, item)):
                os.makedirs(os.path.join(rootDir, genre, item))
            with open(os.path.join(rootDir, genre, item, item + ".txt"), 'w', encoding='utf-8') as ff:
                counter = 0
                for fpath, dirs, fs in os.walk(os.path.join(filepath, genre, item)):
                    for fname in fs:
                        try:
                            with open(os.path.join(fpath, fname), "r", encoding="utf-8") as fin:
                                for line in fin:
                                    # keep only lines of a reasonable length
                                    if len(line) <= 15 or len(line) > 120:
                                        continue
                                    line = re.sub(r"\s", "", line)  # strip whitespace (this also removes the newline)
                                    line = " ".join(jieba.cut(line))  # segment with jieba
                                    ff.write(line)
                                    ff.write("\n")
                                    counter += 1
                                    if counter % 10000 == 0:
                                        print(genre, item, counter)
                        except Exception:
                            print(fpath, fname)

# ============================== Batch word2vec training ==============================
if train_word2vector:
    filepath = "./corpus"
    for genre in os.listdir(filepath):
        if genre == "total":
            continue
        if not os.path.exists(os.path.join("word2vector", genre)):
            os.makedirs(os.path.join("word2vector", genre))

        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join("word2vector", genre, item)):
                os.makedirs(os.path.join("word2vector", genre, item))
            else:
                continue  # a model for this category already exists, skip it
            print(item, "training word vectors......")
            model = Word2Vec(LineSentence(os.path.join(filepath, genre, item, item + ".txt")),
                             size=256,
                             window=5,
                             min_count=1,
                             workers=50,
                             iter=10)
            model.save(os.path.join("word2vector", genre, item, "word2vector.model"))
            print(item, "word vectors trained, model saved to:", os.path.join("word2vector", genre, item, "word2vector.model"))
145 changes: 145 additions & 0 deletions generation_by_word2vector_replace/edit_01/main.py
@@ -0,0 +1,145 @@
import os
import json
import itertools
import argparse
import numpy as np
from gensim.models import Word2Vec


def parse_args():
    # build the command-line argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_keywords', type=int, default=2, help='the number of keywords')
    parser.add_argument('--most_similarity', type=int, default=5, help='number of similar words kept per keyword')
    parser.add_argument('--file_path', type=str, default="./corpus/total/total_segment.txt", help='filepath of corpus')
    return parser.parse_args()


def main(args):
    # choose the genre of text to generate
    print("Please choose the genre of text to generate:")
    print("女频:")
    print("0.农家种田 1.古言穿越 2.女尊天下 3.幻想言情 4.总裁霸爱 5.情深虐恋 6.浪漫青春 7.现代言情 8.纯爱同人 9.蜜恋宠文 10.重生虐渣")
    print("男频:")
    print(
        "11.修真异能 12.修真狂人 13.兵王传奇 14.异世奇遇 15.悬疑灵异 16.摸骨神医 17.架空历史 18.武侠仙侠 19.游戏竞技 20.热血爽文 21.玄幻魔幻 22.科幻末日 23.都市小说")

    # ---------------------------------------------------------------------
    with open("./category.json", "r", encoding="utf-8") as f:
        category = json.load(f)
    category_choose = input("Enter the category number: ")

    # collect word2vec model paths (skip the cached .npy arrays)
    word2vector_dir = []
    for root, dirs, files in os.walk("./word2vector"):
        for name in files:
            if ".npy" not in os.path.join(root, name):
                word2vector_dir.append(os.path.join(root, name))

    # collect corpus paths
    corpus_dir = []
    for root, dirs, files in os.walk("./corpus"):
        for name in files:
            corpus_dir.append(os.path.join(root, name))

    for item in word2vector_dir:
        if category[category_choose] in item:
            word2vector_path = item

    for item in corpus_dir:
        if category[category_choose] in item:
            corpus_path = item

    # load the word2vec model for the chosen category
    model = Word2Vec.load(word2vector_path)

    # ----------------------------------------------------------------------
    # read the keywords from the user
    string = []
    for i in range(args.num_keywords):
        temp = input("Enter keyword " + str(i + 1) + ": ")
        while temp not in model.wv.vocab:
            print("Please try another keyword.....")
            temp = input("Enter keyword " + str(i + 1) + ": ")
        string.append(temp)

    # expand each keyword with its nearest neighbours from word2vec
    def find_similarity_words(string):
        index_words = []
        for i in range(len(string)):
            try:
                similarity_words = model.wv.most_similar([string[i]])[:args.most_similarity]
                temp = [item[0] for item in similarity_words]
                temp.append(string[i])
                index_words.append(temp)
            except KeyError:
                print("Sorry, no such keyword:", string[i])
        return index_words

    index_words = find_similarity_words(string)

    # print("Expanded query terms:", index_words)

    # retrieve sentences that contain the expanded query terms,
    # relaxing the match from all keyword groups down to fewer ones
    def find_similarity_sentence(corpus_path, index_words):
        for ii in range(len(index_words)):
            for temp in itertools.combinations(index_words, len(index_words) - ii):
                print("------------------------------------------------------------------------------------")
                print("Expanded query terms:", temp)
                print("------------------------------------------------------------------------------------")
                # sentence retrieval
                matched_sentence = []
                with open(corpus_path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = list(line.strip().split(" "))
                        if len(line) <= 100:  # skip sentences longer than 100 tokens
                            match_num = 0
                            for item in temp:
                                if len(set(item) & set(line)) > 0:
                                    match_num += 1
                            if match_num == len(index_words) - ii:
                                # replace each matched neighbour with its original keyword
                                for i in index_words:
                                    for j in i:
                                        if j in line:
                                            line[line.index(j)] = i[-1]
                                print("".join(line))
                                if line not in matched_sentence:
                                    matched_sentence.append(line)
                if len(matched_sentence) != 0:
                    return matched_sentence

    matched_sentence = find_similarity_sentence(corpus_path, index_words)

    # rank the matches by similarity to the query
    print("-------------------------------------------------------------------------------------------")
    print("Ranking by relevance......")
    print("-------------------------------------------------------------------------------------------")

    # keep the top_n highest-scoring sentences
    def top_n(matched_sentence, n=5):
        matched_similarity = []
        for i in range(len(matched_sentence)):
            temp = []
            for w in matched_sentence[i]:
                if w in model.wv.vocab:
                    temp.append(w)

            matched_similarity.append(model.wv.n_similarity(string, temp))
        ordered_matched = [matched_sentence[i] for i in np.argsort(-np.array(matched_similarity))]
        if len(ordered_matched) > n:
            ordered_matched_top_n = ordered_matched[0:n]
        else:
            ordered_matched_top_n = ordered_matched
        return ordered_matched_top_n

    ordered_matched_top_n = top_n(matched_sentence, 5)
    for item in ordered_matched_top_n:
        print("".join(item))
        print("===============================================================================================")


# ===========================================================================================================

if __name__ == '__main__':
    main(parse_args())
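For reference, the ranking primitive behind top_n is n_similarity, the cosine similarity between the mean vectors of two word lists. A minimal sketch under the same gensim 3.x API, with a hypothetical model path and hypothetical in-vocabulary tokens:

from gensim.models import Word2Vec

model = Word2Vec.load("word2vector/男频/玄幻魔幻/word2vector.model")  # hypothetical path

query = ["主角", "秘籍"]                       # hypothetical user keywords
candidate = ["主角", "得到", "一本", "秘籍"]   # a tokenized candidate sentence

# Filter to in-vocabulary tokens, as top_n does, then score the pair.
tokens = [w for w in candidate if w in model.wv.vocab]
print(model.wv.n_similarity(query, tokens))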
1 change: 1 addition & 0 deletions generation_by_word2vector_replace/edit_02/category.json
@@ -0,0 +1 @@
{"0": "\u519c\u5bb6\u79cd\u7530", "1": "\u53e4\u8a00\u7a7f\u8d8a", "2": "\u5973\u5c0a\u5929\u4e0b", "3": "\u5e7b\u60f3\u8a00\u60c5", "4": "\u603b\u88c1\u9738\u7231", "5": "\u60c5\u6df1\u8650\u604b", "6": "\u6d6a\u6f2b\u9752\u6625", "7": "\u73b0\u4ee3\u8a00\u60c5", "8": "\u7eaf\u7231\u540c\u4eba", "9": "\u871c\u604b\u5ba0\u6587", "10": "\u91cd\u751f\u8650\u6e23", "11": "\u4fee\u771f\u5f02\u80fd", "12": "\u4fee\u771f\u72c2\u4eba", "13": "\u5175\u738b\u4f20\u5947", "14": "\u5f02\u4e16\u5947\u9047", "15": "\u60ac\u7591\u7075\u5f02", "16": "\u60ac\u7591\u7075\u5f02", "17": "\u67b6\u7a7a\u5386\u53f2", "18": "\u6b66\u4fa0\u4ed9\u4fa0", "19": "\u6e38\u620f\u7ade\u6280", "20": "\u70ed\u8840\u723d\u6587", "21": "\u7384\u5e7b\u9b54\u5e7b", "22": "\u79d1\u5e7b\u672b\u65e5", "23": "\u90fd\u5e02\u5c0f\u8bf4"}
67 changes: 67 additions & 0 deletions generation_by_word2vector_replace/edit_02/data_process.py
@@ -0,0 +1,67 @@
import os
import re
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


process_data = False
train_word2vector = True

# ============================== Text batch preprocessing ==============================
if process_data:
    rootDir = "corpus"
    filepath = "/data/books/"
    for genre in os.listdir(filepath):
        if genre == '女频':
            continue
        if not os.path.exists(os.path.join(rootDir, genre)):
            os.makedirs(os.path.join(rootDir, genre))
        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join(rootDir, genre, item)):
                os.makedirs(os.path.join(rootDir, genre, item))
            with open(os.path.join(rootDir, genre, item, item + ".txt"), 'w', encoding='utf-8') as ff:
                counter = 0
                for fpath, dirs, fs in os.walk(os.path.join(filepath, genre, item)):
                    for fname in fs:
                        try:
                            with open(os.path.join(fpath, fname), "r", encoding="utf-8") as fin:
                                for line in fin:
                                    # keep only lines of a reasonable length
                                    if len(line) <= 15 or len(line) > 120:
                                        continue
                                    line = re.sub(r"\s", "", line)  # strip whitespace (this also removes the newline)
                                    line = " ".join(jieba.cut(line))  # segment with jieba
                                    ff.write(line)
                                    ff.write("\n")
                                    counter += 1
                                    if counter % 10000 == 0:
                                        print(genre, item, counter)
                        except Exception:
                            print(fpath, fname)

# ============================== Batch word2vec training ==============================
if train_word2vector:
    filepath = "./corpus"
    for genre in os.listdir(filepath):
        if genre == "total":
            continue
        if not os.path.exists(os.path.join("word2vector", genre)):
            os.makedirs(os.path.join("word2vector", genre))

        for item in os.listdir(os.path.join(filepath, genre)):
            if not os.path.exists(os.path.join("word2vector", genre, item)):
                os.makedirs(os.path.join("word2vector", genre, item))
            else:
                continue  # a model for this category already exists, skip it
            print(item, "training word vectors......")
            model = Word2Vec(LineSentence(os.path.join(filepath, genre, item, item + ".txt")),
                             size=256,
                             window=5,
                             min_count=1,
                             workers=50,
                             iter=10)
            model.save(os.path.join("word2vector", genre, item, "word2vector.model"))
            print(item, "word vectors trained, model saved to:", os.path.join("word2vector", genre, item, "word2vector.model"))