Skip to content

Commit

Permalink
初始化数据
Browse files Browse the repository at this point in the history
  • Loading branch information
pwxcoo committed Feb 5, 2018
1 parent a3d71d6 commit b3f6e70
Show file tree
Hide file tree
Showing 10 changed files with 645,064 additions and 0 deletions.
1,052 changes: 1,052 additions & 0 deletions Untitled.ipynb

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions addAbbreviation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pandas as pd

# Load the scraped idiom dataset (records with at least a 'pinyin' field).
chengyu = pd.read_json('chengyu.json')

# Map tone-marked a/o/e vowels to their bare letters.
# NOTE(review): the tone-marked i/u/ü variants (ī í ǐ ì, ū ú ǔ ù, ǖ ǘ ǚ ǜ)
# are absent from this table — a syllable whose first character carries one
# of those marks would pass through unchanged; confirm none occur in the data.
replace = {'ā':'a', 'á':'a', 'ǎ':'a', 'à':'a', 'ō':'o', 'ó':'o', 'ǒ':'o', 'ò':'o',\
'ē':'e', 'é':'e', 'ě':'e', 'è':'e'}
def abbreviation(pinyin):
    """Return the first letter of each pinyin syllable, tone marks stripped.

    Example: 'hú lún tūn zǎo' -> 'hltz'.

    Generalizes the original hard-coded a/o/e lookup table: NFD
    decomposition separates ANY combining tone mark (including the
    i/u/ü variants the table missed) from its base letter.
    """
    import unicodedata  # local import keeps the script's top-of-file imports unchanged

    initials = []
    for syllable in pinyin.split(' '):
        first = syllable[:1]
        # NFD splits e.g. 'ā' into 'a' + a combining macron; keep only
        # the base letter. An empty syllable stays empty, as before.
        initials.append(unicodedata.normalize('NFD', first)[:1])
    return ''.join(initials)

# Derive the first-letter abbreviation column from each idiom's pinyin.
chengyu['abbreviation'] = chengyu['pinyin'].apply(abbreviation)

# Write back as a JSON array of records, keeping Chinese characters
# readable (force_ascii=False avoids \uXXXX escapes).
chengyu.to_json('chengyu01.json', force_ascii=False, orient='records')

221,538 changes: 221,538 additions & 0 deletions chengyu.json

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions chengyu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-

"""
author: pwxcoo
date: 2018-02-05
description: 抓取下载成语并保存
"""

import requests, json
from bs4 import BeautifulSoup

def downloader(url):
    """Scrape one index page of chengyu (成语) entries.

    Fetches *url*, collects the per-idiom links from it, then scrapes
    each idiom page into a dict with keys: word, pinyin, explanation,
    derivation, example.

    Returns a list of those dicts; returns [] on a failed request so
    callers can safely accumulate with `res += downloader(...)`.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f'{url} is failed!')
        # Bug fix: the original bare `return` yielded None, which made
        # the caller's `res += downloader(...)` raise TypeError.
        return []

    print(f'{url} is parsing')
    # Pages are GBK-encoded; drop undecodable bytes rather than crash.
    html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
    # NOTE(review): entry links live in the second-to-last table —
    # position-based selection, fragile against site layout changes.
    table = html.find_all('table')[-2]

    prefix = 'http://www.zd9999.com'
    words = [prefix + a.get('href') for a in table.find_all('a')]

    res = []
    for word_url in words:
        response = requests.get(word_url)
        # Fixed: the original printed the URL wrapped in a one-element list.
        print(f'{word_url} is parsing')
        if response.status_code != 200:
            print(f'{word_url} is failed!')
            continue

        wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
        explanation = wordhtml.find_all('table')[-3].find_all('tr')
        # Hoist the repeated find_all('tr') lookup out of the dict literal.
        rows = explanation[1].find_all('tr')
        res.append({
            'word': explanation[0].text.strip(),
            'pinyin': rows[0].find_all('td')[1].text.strip(),
            'explanation': rows[1].find_all('td')[1].text.strip(),
            'derivation': rows[2].find_all('td')[1].text.strip(),
            'example': rows[3].find_all('td')[1].text.strip(),
        })
    return res

if __name__ == '__main__':
    # Page 1 is the bare index; pages 2..198 follow the index_{i}.htm pattern.
    # `or []` guards against downloader() returning None on a failed
    # request — the original code would raise TypeError on `+=`.
    res = downloader('http://www.zd9999.com/cy/') or []
    for i in range(2, 199):
        res += downloader(f'http://www.zd9999.com/cy/index_{i}.htm') or []
    print(len(res))
    # force_ascii=False keeps Chinese text human-readable in the output file.
    with open('chengyu.json', mode='w+', encoding='utf-8') as json_file:
        json.dump(res, json_file, ensure_ascii=False)
253,186 changes: 253,186 additions & 0 deletions chengyu01.json

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions record.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
歇后语:14034个
汉字:16142个
成语:31648个
112,996 changes: 112,996 additions & 0 deletions word.json

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions word.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-

"""
author: pwxcoo
date: 2018-02-05
description: 抓取下载汉字并保存
"""

import requests,json
from bs4 import BeautifulSoup

def downloader(url):
    """Scrape one index page of 汉字 (Chinese character) entries.

    Fetches *url*, collects the per-character links from it, then
    scrapes each character page into a dict with keys: word, strokes,
    pinyin, radicals, explanation.

    Returns a list of those dicts; returns [] on a failed request so
    callers can safely accumulate with `res += downloader(...)`.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f'{url} is failed!')
        # Bug fix: the original bare `return` yielded None, which made
        # the caller's `res += downloader(...)` raise TypeError.
        return []

    print(f'{url} is parsing')
    # Pages are GBK-encoded; drop undecodable bytes rather than crash.
    html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
    a = html.find_all('a', target="_blank")

    prefix = 'http://www.zd9999.com'
    words = [prefix + w.get('href') for w in a]

    res = []
    for word_url in words:
        response = requests.get(word_url)
        # Fixed: the original printed the URL wrapped in a one-element list.
        print(f'{word_url} is parsing')
        if response.status_code != 200:
            print(f'{word_url} is failed!')
            continue

        wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
        # NOTE(review): table index 5 and these row positions are
        # layout-dependent — fragile against site changes.
        tr = wordhtml.find_all('table')[5].find_all('tr')
        explanation = tr[6].find_all('td')[1].text
        cut = explanation.find('\r\n')
        # Guard a latent bug: find() returning -1 would have sliced away
        # everything but the final character of the explanation text.
        body = explanation[cut:].strip() if cut != -1 else explanation.strip()
        res.append({
            'word': tr[2].find_all('td')[1].text.strip(),
            'strokes': tr[3].find_all('td')[1].text.strip(),
            'pinyin': tr[4].find_all('td')[1].text.strip(),
            'radicals': tr[5].find_all('td')[1].text.strip(),
            'explanation': body,
        })
    return res

if __name__ == '__main__':
    # Page 1 is index.htm; pages 2..101 follow the index_{i}.htm pattern.
    # `or []` guards against downloader() returning None on a failed
    # request — the original code would raise TypeError on `+=`.
    res = downloader('http://www.zd9999.com/zi/index.htm') or []
    for i in range(2, 102):
        res += downloader(f'http://www.zd9999.com/zi/index_{i}.htm') or []
    print(len(res))
    # force_ascii=False keeps Chinese text human-readable in the output file.
    with open('word.json', mode='w+', encoding='utf-8') as json_file:
        json.dump(res, json_file, ensure_ascii=False)
Loading

0 comments on commit b3f6e70

Please sign in to comment.