Skip to content

Commit 330436e

Browse files
committed
add JDSpider.py file
1 parent fa14971 commit 330436e

File tree

2 files changed

+44
-0
lines changed

2 files changed

+44
-0
lines changed

JDSpider.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import requests
2+
import re
3+
import pandas as pd
4+
5+
def get_data(page_stop=10000, timeout=10):
    """Scrape title and promo-element id strings from JD search result pages.

    Requests the JD search endpoint once for every page number in
    ``range(1, page_stop, 2)`` (odd numbers only) and regex-extracts two
    kinds of attribute values from each response body. The keyword is fixed
    in the base URL (the URL-encoded form of "家居用品").

    Args:
        page_stop: Exclusive upper bound for the ``page`` query parameter.
            The default of 10000 reproduces the originally hard-coded range.
            A value of 1 or less performs no requests.
        timeout: Seconds to wait on each HTTP request. Without a timeout a
            single stalled connection would block the whole crawl forever.

    Returns:
        A ``(titles, ids)`` tuple of two lists of strings: every ``title``
        attribute captured from ``<a target="_blank" title="...">`` and every
        ``id`` attribute captured from ``<i class="promo-words" id="...">``,
        accumulated across pages in request order. The two lists are
        collected independently, so equal lengths are not guaranteed.

    Raises:
        requests.RequestException: If a request cannot be completed or
            exceeds ``timeout``.
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    base_url = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page='
    url_suffix = '&s=53&click=0'

    # Compile once up front; the patterns do not change between pages.
    title_pattern = re.compile(r'<a target="_blank" title="(.*?)"')
    id_pattern = re.compile(r'<i class="promo-words" id="(.*?)"></i>')

    titles = []
    promo_ids = []
    # NOTE(review): only odd page numbers are requested; presumably this
    # matches how the endpoint paginates -- confirm against the site.
    for page in range(1, page_stop, 2):
        page_url = base_url + str(page) + url_suffix
        response = requests.get(page_url, timeout=timeout)
        html = response.content.decode('utf-8')
        titles.extend(title_pattern.findall(html))
        promo_ids.extend(id_pattern.findall(html))
    return titles, promo_ids
22+
23+
def split_str(_id):
    """Return the third underscore-separated field of each string in ``_id``.

    Args:
        _id: Iterable of strings, each containing at least two underscores.

    Returns:
        A list with, for every input string, the piece found at index 2
        after splitting on ``'_'``, in input order.

    Raises:
        IndexError: If any string splits into fewer than three pieces.
    """
    return [raw_id.split('_')[2] for raw_id in _id]
28+
29+
def save_data(zid, bt_):
    """Write the scraped titles and ids to an Excel workbook.

    Builds a two-column table (title column first, then the ``ID`` column)
    and saves it as ``./家居用品.xlsx`` relative to the current working
    directory, without the DataFrame index.

    Args:
        zid: Sequence of id strings for the ``ID`` column.
        bt_: Sequence of title strings for the title column.
    """
    # Dict insertion order fixes the column order in the output sheet.
    columns = {'标题': bt_, 'ID': zid}
    sheet = pd.DataFrame(columns)
    sheet.to_excel('./家居用品.xlsx', index=False)
35+
36+
def start_main():
    """Run the whole pipeline: scrape, extract ids, and save to Excel."""
    titles, raw_ids = get_data()
    # Reduce each raw element id to its third underscore-separated field.
    short_ids = split_str(raw_ids)
    save_data(short_ids, titles)
40+
41+
# Start the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    start_main()

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@
1111
##### 5. lagouPositionSpider.py: 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件
1212

1313
##### 6. student_img.py: 基于本学校官网的url漏洞,获取所有注册学生学籍证件照
14+
15+
##### 7. JDSpider.py: 大批量抓取京东商品id和标签

0 commit comments

Comments
 (0)