forked from gxcuizy/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcity_to_json.py
134 lines (124 loc) · 4.56 KB
/
city_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
爬取国家统计局最新地址库
省市区三级(Json版本)
author: gxcuizy
time: 2018-08-24
"""
import requests
from bs4 import BeautifulSoup
import json
import os
def get_province(index_href):
    """Scrape the province list page and dump it to json/province.json.

    Follows each province link to scrape its cities via ``get_city``.
    Relies on the module-level ``url`` base set in ``__main__``.

    :param index_href: page path relative to ``url`` (e.g. 'index.html')
    """
    print('开始抓取省份信息……')
    province_url = url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(province_url, headers=headers)
    # The stats.gov.cn pages are GBK-encoded; force decoding before parsing.
    response.encoding = 'gbk'
    soup = BeautifulSoup(str(response.text), "html.parser")
    province_tr_list = soup.select('.provincetr a')
    province_list = {}
    # Each anchor holds the province name; its href encodes the 2-digit code.
    for province_key, province_tr in enumerate(province_tr_list):
        province_href = province_tr.attrs['href']
        province_no = province_href.split('.')[0]
        # Pad the 2-digit province prefix to a 6-digit region code.
        province_code = province_no + '0000'
        province_name = province_tr.text
        province_info = {'code': province_code, 'name': province_name}
        province_list[province_key] = province_info
        print('已写入省级:', province_info)
        # Descend into the city level for this province.
        get_city(province_href, province_code)
    print('抓取省份信息结束!')
    # `with` guarantees the handle is closed even if json.dump raises.
    with open('json/province.json', 'w', encoding='utf-8') as file:
        json.dump(province_list, file, ensure_ascii=False)
def get_city(province_href, province_code):
    """Scrape the city list of one province and merge it into json/city.json.

    Follows each city link to scrape its districts via ``get_area``.
    Relies on the module-level ``url`` base set in ``__main__``.

    :param province_href: province page path relative to ``url``
    :param province_code: 6-digit province code used as the JSON key
    """
    print('开始抓取市级信息')
    city_url = url + province_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(city_url, headers=headers)
    # The stats.gov.cn pages are GBK-encoded; force decoding before parsing.
    response.encoding = 'gbk'
    soup = BeautifulSoup(str(response.text), "html.parser")
    city_tr_list = soup.select('.citytr')
    # Accumulate into the existing file so repeated runs/provinces merge.
    if os.path.exists('json/city.json'):
        with open('json/city.json', 'r', encoding='utf-8') as file:
            city_data = json.load(file)
    else:
        city_data = {}
    city_list = {}
    city_key = 0
    for city_tr in city_tr_list:
        city_a_info = city_tr.select('a')
        # Guard on the anchors themselves (the Tag is always truthy);
        # a row without links would otherwise raise IndexError below.
        if city_a_info:
            city_href = city_a_info[0].attrs['href']
            # First cell is the 12-digit code; keep the 6-digit prefix.
            city_code = city_a_info[0].text[:6]
            city_name = city_a_info[1].text
            city_info = {'code': city_code, 'name': city_name}
            city_list[city_key] = city_info
            city_key += 1
            print('已写入市级:', city_info)
            # Descend into the district level for this city.
            get_area(city_href, city_code)
    city_data.setdefault(province_code, city_list)
    print('抓取市级城市结束!')
    # `with` guarantees the handle is closed even if json.dump raises.
    with open('json/city.json', 'w', encoding='utf-8') as file:
        json.dump(city_data, file, ensure_ascii=False)
def get_area(city_href, city_code):
    """Scrape the district list of one city and merge it into json/area.json.

    Relies on the module-level ``url`` base set in ``__main__``.

    :param city_href: city page path relative to ``url``
    :param city_code: 6-digit city code used as the JSON key
    """
    print('开始抓取区级信息')
    area_url = url + city_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(area_url, headers=headers)
    # The stats.gov.cn pages are GBK-encoded; force decoding before parsing.
    response.encoding = 'gbk'
    soup = BeautifulSoup(str(response.text), "html.parser")
    area_tr_list = soup.select('.countytr')
    # Accumulate into the existing file so repeated runs/cities merge.
    if os.path.exists('json/area.json'):
        with open('json/area.json', 'r', encoding='utf-8') as file:
            area_data = json.load(file)
    else:
        area_data = {}
    area_list = {}
    area_key = 0
    for area_tr in area_tr_list:
        area_a_info = area_tr.select('td')
        # Skip rows with no <td> cells to avoid IndexError below.
        if area_a_info:
            # First cell is the 12-digit code; keep the 6-digit prefix.
            area_code = area_a_info[0].text[:6]
            area_name = area_a_info[1].text
            area_info = {'code': area_code, 'name': area_name}
            area_list[area_key] = area_info
            area_key += 1
            print('已写入区级:', area_info)
    area_data.setdefault(city_code, area_list)
    print('抓取区级信息结束!')
    # `with` guarantees the handle is closed even if json.dump raises.
    with open('json/area.json', 'w', encoding='utf-8') as file:
        json.dump(area_data, file, ensure_ascii=False)
# Script entry point.
if __name__ == "__main__":
    # Base URL of the 2017 national region-code pages; read as a global
    # by get_province/get_city/get_area.
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    # Ensure the output directory exists; exist_ok avoids the
    # check-then-create race of os.path.exists + os.makedirs.
    json_folder = 'json/'
    os.makedirs(json_folder, exist_ok=True)
    get_province('index.html')