初始化数据

hlooc · Feb 5, 2018 · b3f6e70 · b3f6e70
1 parent a3d71d6
commit b3f6e70
Show file tree

Hide file tree

Showing 10 changed files with 645,064 additions and 0 deletions.
diff --git a/Untitled.ipynb b/Untitled.ipynb
diff --git a/addAbbreviation.py b/addAbbreviation.py
@@ -0,0 +1,14 @@
+import pandas as pd
+
+# Load the records stored in chengyu.json into a DataFrame.
+chengyu = pd.read_json('chengyu.json')
+
+# Lookup table mapping each tone-marked a/o/e to its plain ASCII letter.
+replace = {
+    accented: plain
+    for plain, accented_forms in (('a', 'āáǎà'), ('o', 'ōóǒò'), ('e', 'ēéěè'))
+    for accented in accented_forms
+}
+def abbreviation(pinyin):
+    """
+    Build the initial-letter abbreviation of a space-separated pinyin string.
+
+    The first character of every space-delimited piece is taken; when that
+    character is a key of the module-level ``replace`` table it is swapped
+    for the mapped plain letter. The resulting characters are concatenated
+    and returned as a single string.
+    """
+    initials = []
+    for syllable in pinyin.split(' '):
+        head = syllable[:1]  # slice rather than index: an empty piece yields ''
+        initials.append(replace.get(head, head))
+    return ''.join(initials)
+
+# Derive the 'abbreviation' column from each row's pinyin string.
+chengyu['abbreviation'] = chengyu['pinyin'].apply(abbreviation)
+
+# Write the augmented table as a JSON array of row objects, leaving
+# non-ASCII characters unescaped.
+chengyu.to_json(
+    'chengyu01.json',
+    orient='records',
+    force_ascii=False,
+)
+
diff --git a/chengyu.json b/chengyu.json
diff --git a/chengyu.py b/chengyu.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+"""
+
+author: pwxcoo
+date: 2018-02-05 
+description: 抓取下载成语并保存
+
+"""
+
+import requests, json
+from bs4 import BeautifulSoup
+
+def downloader(url):
+    """
+    Scrape every idiom linked from one index page.
+
+    The index page at ``url`` is fetched and decoded as GBK, the links inside
+    its second-to-last <table> are collected, and each linked idiom page is
+    fetched and parsed in turn.
+
+    :param url: address of an idiom index page.
+    :return: list of dicts with the keys 'word', 'pinyin', 'explanation',
+             'derivation' and 'example'. The list is empty when the index
+             page does not answer with HTTP 200; idiom pages that do not
+             answer with HTTP 200 are skipped.
+    """
+    # NOTE(review): requests.get is called without a timeout here and in the
+    # loop below, so an unresponsive server may block the crawl.
+    response = requests.get(url)
+
+    if response.status_code != 200:
+        print(f'{url} is failed!')
+        # Return an empty list rather than None so that callers can
+        # concatenate the result (`res += downloader(...)`) without a
+        # TypeError.
+        return []
+
+    print(f'{url} is parsing')
+    html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
+    table = html.find_all('table')[-2]
+
+    prefix = 'http://www.zd9999.com'
+    # href=True skips anchors that carry no href attribute; concatenating
+    # the prefix with None would raise TypeError.
+    words = [prefix + a['href'] for a in table.find_all('a', href=True)]
+
+    res = []
+    for word_url in words:
+        response = requests.get(word_url)
+        print(f'{[word_url]} is parsing')
+        if response.status_code != 200:
+            print(f'{word_url} is failed!')
+            continue
+
+        wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
+        explanation = wordhtml.find_all('table')[-3].find_all('tr')
+        # The second row holds the nested detail rows; look them up once
+        # instead of searching again for every field.
+        detail = explanation[1].find_all('tr')
+        res.append({
+            'word': explanation[0].text.strip(),
+            'pinyin': detail[0].find_all('td')[1].text.strip(),
+            'explanation': detail[1].find_all('td')[1].text.strip(),
+            'derivation': detail[2].find_all('td')[1].text.strip(),
+            'example': detail[3].find_all('td')[1].text.strip(),
+        })
+    return res
+
+if __name__ == '__main__':
+    # The first index page has its own URL; pages 2..198 follow the
+    # index_{i}.htm pattern.
+    urls = ['http://www.zd9999.com/cy/']
+    urls += [f'http://www.zd9999.com/cy/index_{i}.htm' for i in range(2, 199)]
+
+    res = []
+    for url in urls:
+        # Guard against a None result for a failed index page: `list += None`
+        # raises TypeError and would abort the crawl before anything is saved.
+        res += downloader(url) or []
+    print(len(res))
+    with open('chengyu.json', mode='w+', encoding='utf-8') as json_file:
+        json.dump(res, json_file, ensure_ascii=False)
diff --git a/chengyu01.json b/chengyu01.json
diff --git a/record.md b/record.md
@@ -0,0 +1,3 @@
+歇后语：14034条
+汉字：16142个
+成语：31648个
diff --git a/word.json b/word.json
diff --git a/word.py b/word.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+"""
+
+author: pwxcoo
+date: 2018-02-05 
+description: 抓取下载汉字并保存
+
+"""
+
+import requests,json
+from bs4 import BeautifulSoup
+
+def downloader(url):
+    """
+    Scrape every character linked from one index page.
+
+    The index page at ``url`` is fetched and decoded as GBK, every
+    <a target="_blank"> link on it is collected, and each linked character
+    page is fetched and parsed in turn.
+
+    :param url: address of a character index page.
+    :return: list of dicts with the keys 'word', 'strokes', 'pinyin',
+             'radicals' and 'explanation'. The list is empty when the index
+             page does not answer with HTTP 200; character pages that do not
+             answer with HTTP 200 are skipped.
+    """
+    # NOTE(review): requests.get is called without a timeout here and in the
+    # loop below, so an unresponsive server may block the crawl.
+    response = requests.get(url)
+
+    if response.status_code != 200:
+        print(f'{url} is failed!')
+        # Return an empty list rather than None so that callers can
+        # concatenate the result (`res += downloader(...)`) without a
+        # TypeError.
+        return []
+
+    print(f'{url} is parsing')
+    html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
+    # href=True skips anchors that carry no href attribute; concatenating
+    # the prefix with None would raise TypeError.
+    a = html.find_all('a', target="_blank", href=True)
+
+    prefix = 'http://www.zd9999.com'
+    words = [prefix + w['href'] for w in a]
+
+    res = []
+    for word_url in words:
+        response = requests.get(word_url)
+        print(f'{[word_url]} is parsing')
+        if response.status_code != 200:
+            print(f'{word_url} is failed!')
+            continue
+
+        wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
+        tr = wordhtml.find_all('table')[5].find_all('tr')
+        raw_explanation = tr[6].find_all('td')[1].text
+        # Keep only what follows the first '\r\n'. When the cell contains no
+        # '\r\n', keep the whole text: slicing from str.find's -1 would
+        # leave just the last character.
+        head, separator, tail = raw_explanation.partition('\r\n')
+        explanation = tail if separator else head
+        res.append({
+            'word': tr[2].find_all('td')[1].text.strip(),
+            'strokes': tr[3].find_all('td')[1].text.strip(),
+            'pinyin': tr[4].find_all('td')[1].text.strip(),
+            'radicals': tr[5].find_all('td')[1].text.strip(),
+            'explanation': explanation.strip(),
+        })
+    return res
+
+if __name__ == '__main__':
+    # The first index page is index.htm; pages 2..101 follow the
+    # index_{i}.htm pattern.
+    urls = ['http://www.zd9999.com/zi/index.htm']
+    urls += [f'http://www.zd9999.com/zi/index_{i}.htm' for i in range(2, 102)]
+
+    res = []
+    for url in urls:
+        # Guard against a None result for a failed index page: `list += None`
+        # raises TypeError and would abort the crawl before anything is saved.
+        res += downloader(url) or []
+    print(len(res))
+    with open('word.json', mode='w+', encoding='utf-8') as json_file:
+        json.dump(res, json_file, ensure_ascii=False)