|
2 | 2 |
|
3 | 3 | 中华人民共和国行政区划:省级、地级、县级、乡级和村级
|
4 | 4 |
|
| 5 | +> Gitee <https://gitee.com/netnr/zoning> |
| 6 | +
|
| 7 | +> GitHub <https://github.com/netnr/zoning> |
| 8 | +
|
5 | 9 | ----------
|
6 |
| -# 数据来源 |
| 10 | +# 来源 |
7 | 11 | <http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/>
|
8 | 12 |
|
9 | 13 | <http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017>
|
10 | 14 |
|
11 | 15 | ----------
|
12 |
| -# 使用方法 |
| 16 | +# 使用 |
13 | 17 | - 打开页面 <http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html>
|
14 | 18 | - 打开浏览器控制台(推荐谷歌,请不要用IE系列,谢谢)
|
15 | 19 | - 拷贝脚本`zoning.js`的内容粘贴到控制台运行
|
16 | 20 |
|
17 | 21 | ----------
|
18 |
| -# 注意事项 |
19 |
| -初次抓取可能会出错,请求太频繁,浏览器可能会卡顿,也会出现网络错误等问题 |
| 22 | +# 注意 |
| 23 | +首次抓取时会出现大量失败请求;再次抓取时,已成功的页面会直接从浏览器缓存读取,速度非常快 |
| 24 | + |
| 25 | +---------- |
| 26 | +# 代码 |
| 27 | +```js |
// Crawler for the 2017 administrative-division code pages on stats.gov.cn.
// It is meant to be pasted into the browser console: it walks the listing pages
// level by level, collects the entries per parent code and finally offers the
// result as a zip download.
// Declared with `var` so the script can be pasted into the same console again
// without a redeclaration error.
var zoning = {
    // Version number
    version: "1.0.0",

    /**
     * Load an external script by appending a <script> element to <head>.
     * @param {string} src - URL of the script.
     * @param {Function} [success] - Called once after the script has loaded.
     */
    getScript: function (src, success) {
        const ele = document.createElement("SCRIPT");
        ele.type = "text/javascript";
        if (typeof success === "function") {
            let done = false;
            // Handlers are attached before the element is inserted so the load
            // event cannot be missed; `done` keeps the callback from running twice
            // when both onload and onreadystatechange fire.
            ele.onload = ele.onreadystatechange = function () {
                if (done) {
                    return;
                }
                if (!this.readyState || this.readyState === "loaded" || this.readyState === "complete") {
                    done = true;
                    success();
                }
            };
        }
        // Without this a failed CDN request would silently prevent the download.
        ele.onerror = function () {
            console.error("zoning: failed to load script " + src);
        };
        ele.src = src;
        document.getElementsByTagName("HEAD")[0].appendChild(ele);
    },

    // Configuration
    config: {
        // CDN URL of JSZip
        urljszip: "https://lib.baomitu.com/jszip/3.1.4/jszip.min.js",
        // CDN URL of FileSaver
        urlfilesaver: "https://lib.baomitu.com/FileSaver.js/2014-11-29/FileSaver.min.js",
        // Base URL of the pages to crawl
        urlprefix: "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/",
        // Depth the crawl starts at
        deep: 1,
        // Maximum depth to crawl.
        // Original author's note: 5 = village, about 46800; 4 = street, about 3380
        // (presumably the number of pages requested — TODO confirm).
        deepmax: 5,
        // Seed entry the crawl starts from
        item: {
            // Parent code
            id: "00",
            // Relative request path, without the ".html" suffix
            href: "index"
        }
    },

    /**
     * Fetch one listing page, parse it with matcharray and recurse into the
     * entries it returns until config.deepmax is reached.
     *
     * taskcount is incremented when the request starts and decremented exactly
     * once when it settles, whatever the outcome; every failure (network error,
     * non-2xx status, decode or parse error) is appended to catchdata.
     *
     * @param {string} urlprefix - Base URL all page paths are appended to.
     * @param {number} deep - Depth of the page being requested.
     * @param {{id: string, href: ?string}} item - Entry whose page is requested.
     * @returns {boolean|undefined} false when the entry has no link, otherwise undefined.
     */
    grab: function (urlprefix, deep, item) {
        // Entries without a link have no page to fetch.
        if (item.href == null) {
            return false;
        }

        // At depths 4 and 5 the href is relative to a nested directory, which is
        // rebuilt here from the leading digits of the entry's code.
        let url = urlprefix;
        if (deep === 4) {
            url += item.id.slice(0, 2) + "/";
        } else if (deep === 5) {
            url += item.id.slice(0, 2) + "/" + item.id.slice(2, 4) + "/";
        }
        url += item.href + ".html";

        zoning.taskcount += 1;

        fetch(url)
            .then(function (res) {
                // fetch only rejects on network failure; an HTTP error page must
                // be turned into a failure explicitly or it would be parsed as data.
                if (!res.ok) {
                    throw new Error("HTTP " + res.status + " for " + url);
                }
                return res.arrayBuffer();
            })
            .then(function (buffer) {
                // The pages are decoded as GBK, as in the original implementation.
                const html = new TextDecoder("gbk").decode(buffer);
                const list = zoning.matcharray(html, item, deep);
                if (deep < zoning.config.deepmax) {
                    list.forEach(function (child) {
                        zoning.grab(urlprefix, deep + 1, child);
                    });
                }
            })
            .catch(function (e) {
                // `message` is stored next to the raw error because Error objects
                // serialize to "{}" in catch.json.
                zoning.catchdata.push({ item: item, url: url, error: e, message: String(e) });
            })
            .then(function () {
                // Runs after success and after failure alike, so run() can always finish.
                zoning.taskcount -= 1;
            });
    },

    // Number of requests currently in flight
    taskcount: 0,
    // Number of pages parsed and recorded
    matchcount: 0,
    // Records of failed requests
    catchdata: [],
    // Crawl results, keyed by parent code
    matchdata: {},

    /**
     * Extract the entries of one page, store them in matchdata under the parent
     * code and return them.
     *
     * @param {string} data - HTML text of the page.
     * @param {{id: string}} item - Entry the page belongs to (its id yields the key).
     * @param {number} deep - Depth of the page (1 = index page … 5 = last level).
     * @returns {Array<{id: string, href: ?string, text: string}>} Parsed entries;
     *   an empty array (and nothing recorded) when a depth 1-4 page has no entries.
     */
    matcharray: function (data, item, deep) {
        const arr = [];
        // Text between the first '>' and the next '<' of a tag fragment
        const innerText = function (fragment) {
            return fragment.split('>')[1].split('<')[0];
        };
        // href value of an anchor up to its first '.'
        const hrefStem = function (anchor) {
            return anchor.split('"')[1].split('.')[0];
        };

        if (deep !== 5) {
            // Rows without a link: a 12-digit code cell followed by a name cell
            (data.match(/<td>[0-9]{12}<\/td><td>.*?<\/td>/g) || []).forEach(function (row) {
                const cells = row.split('</td><td>');
                arr.push({
                    href: null,
                    id: cells[0].split('>')[1],
                    text: cells[1].split('<')[0]
                });
            });
        }

        // Normalise quotes and drop <br/> so the anchor helpers can split on '"'
        data = data.replace(/'/g, '"').replace(/<br\/>/g, "");
        // All anchor tags of the page
        const anchors = data.match(/<a[^>]*href=['"]([^"]*)['"][^>]*>(.*?)<\/a>/g) || [];

        const isListingDepth = deep === 1 || deep === 2 || deep === 3 || deep === 4;
        // Nothing to record. Link-less rows collected above are kept even when the
        // page has no anchors at all.
        if (isListingDepth && anchors.length === 0 && arr.length === 0) {
            return [];
        }

        switch (deep) {
            // Index page: one anchor per entry, the href doubles as the code
            case 1:
                anchors.forEach(function (anchor) {
                    const id = hrefStem(anchor);
                    arr.push({ id: id, href: id, text: innerText(anchor) });
                });
                break;
            // Two anchors per entry: the first holds the code, the second the name
            case 2:
            case 3:
            case 4:
                // Only complete pairs are read; a trailing unpaired anchor is skipped
                // instead of raising a TypeError.
                for (let i = 0; i + 1 < anchors.length; i += 2) {
                    arr.push({
                        href: hrefStem(anchors[i]),
                        id: innerText(anchors[i]),
                        text: innerText(anchors[i + 1])
                    });
                }
                break;
            // Last level, no links: code cell, 3-digit cell, name cell
            case 5:
                (data.match(/<td>[0-9]{12}<\/td><td>[0-9]{3}<\/td><td>.*?<\/td>/g) || []).forEach(function (row) {
                    const cells = row.split('</td><td>');
                    arr.push({
                        href: null,
                        id: cells[0].split('>')[1],
                        text: cells[2].split('<')[0]
                    });
                });
                break;
        }

        // The key (later the file name) is the parent code cut to the depth's length
        let filename = "00";
        if (deep === 2) {
            filename = item.id;
        } else if (deep === 3) {
            filename = item.id.slice(0, 4);
        } else if (deep === 4) {
            filename = item.id.slice(0, 6);
        } else if (deep === 5) {
            filename = item.id.slice(0, 9);
        }
        zoning.matchdata[filename] = arr;
        // Count recorded pages
        zoning.matchcount += 1;
        return arr;
    },

    /** Public entry point: build and download the zip from the collected data. */
    zip: function () {
        zoning.ziping(zoning.matchdata, zoning.catchdata);
    },

    /**
     * Load JSZip and FileSaver, then write one JSON file per key, an all.json
     * with everything and, when failures were recorded, a catch.json; the archive
     * is saved as zoning.zip. The input objects are left unmodified.
     *
     * @param {Object<string, Array>} matchdata - Entries keyed by parent code.
     * @param {Array} catchdata - Failure records.
     */
    ziping: function (matchdata, catchdata) {
        zoning.getScript(zoning.config.urljszip, function () {
            zoning.getScript(zoning.config.urlfilesaver, function () {
                const zip = new JSZip();
                const all = {};
                // Key length -> number of leading code characters kept in the export;
                // keys of any other length keep the full code.
                const keepByKeyLength = { 2: 4, 4: 6, 6: 9 };

                Object.keys(matchdata).forEach(function (key) {
                    const keep = keepByKeyLength[key.length];
                    // Exported entries carry only id and text (href is dropped).
                    const entries = matchdata[key].map(function (entry) {
                        return {
                            id: keep ? entry.id.slice(0, keep) : entry.id,
                            text: entry.text
                        };
                    });
                    all[key] = entries;
                    zip.file(key + ".json", JSON.stringify(entries));
                });

                zip.file("all.json", JSON.stringify(all));
                if (catchdata.length) {
                    zip.file('catch.json', JSON.stringify(catchdata));
                }
                zip.generateAsync({ type: "blob" }).then(function (content) {
                    saveAs(content, "zoning.zip");
                });
            });
        });
    },

    /**
     * Start crawling from config.item and poll every 4 seconds; once no request
     * is in flight the poller stops and the zip download is triggered.
     */
    run: function () {
        zoning.startTime = new Date().valueOf();
        zoning.taskid = setInterval(function () {
            console.log("count:" + zoning.matchcount, "taskcount:" + zoning.taskcount);
            if (zoning.taskcount === 0) {
                clearInterval(zoning.taskid);
                zoning.zip();
            }
        }, 1000 * 4);
        console.log('fetching ... please see the network tab');
        zoning.grab(zoning.config.urlprefix, zoning.config.deep, zoning.config.item);
    }
};
| 252 | +
|
| 253 | +// Start the crawl; this can also be invoked manually |
| 254 | +zoning.run(); |
| 255 | +
|
| 256 | +// Download the zip once crawling has finished |
| 257 | +//zoning.zip(); |
| 258 | +
|
| 259 | +/* |
| 260 | + * Notes: |
| 261 | + * |
| 262 | + * The first crawl produces a large number of failed requests; a repeated crawl is served from the browser cache and is very fast. |
| 263 | + * |
| 264 | + * Files: |
| 265 | + * 00.json root data |
| 266 | + * 12.json level-2 data |
| 267 | + * 1234.json level-3 data |
| 268 | + * 123456.json level-4 data |
| 269 | + * 123456789.json level-5 data |
| 270 | + * |
| 271 | + * Other: |
| 272 | + * all.json all data |
| 273 | + * catch.json record of failed fetches (written only when failures occurred; in the author's test 5 page requests failed) |
| 274 | + */ |
| 275 | +``` |
20 | 276 |
|
21 |
| -由于浏览器有缓存机制,第二次开始从缓存获取页面内容,很快能完成 |
| 277 | +> [联系打赏](https://ss.netnr.com/contact) |
0 commit comments