File tree Expand file tree Collapse file tree 2 files changed +44
-0
lines changed Expand file tree Collapse file tree 2 files changed +44
-0
lines changed Original file line number Diff line number Diff line change
1
+ import requests
2
+ import re
3
+ import pandas as pd
4
+
5
def get_data(pages=None, timeout=10):
    """Scrape link titles and promo-word element ids from JD search pages.

    For every requested page of the hard-coded JD search URL (the keyword is
    the URL-encoded form of "家居用品"), the response body is decoded as UTF-8
    and scanned with two regular expressions: one capturing the ``title``
    attribute of ``<a target="_blank" ...>`` tags and one capturing the ``id``
    attribute of ``<i class="promo-words" ...>`` tags.

    Args:
        pages: Iterable of page numbers to fetch. Defaults to
            ``range(1, 10000, 2)``, the range the script originally
            hard-coded.
        timeout: Seconds to wait for each HTTP response before giving up.
            Pass ``None`` to wait indefinitely (the original behaviour).

    Returns:
        A ``(titles, ids)`` tuple of two lists of strings, accumulated across
        all pages in request order. The two patterns are matched
        independently, so the lists are not guaranteed to have equal length.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            ``timeout``.
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    base_url = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page='
    url_suffix = '&s=53&click=0'

    # Compiled once up front instead of on every loop iteration.
    title_pattern = re.compile(r'<a target="_blank" title="(.*?)"')
    id_pattern = re.compile(r'<i class="promo-words" id="(.*?)"></i>')

    if pages is None:
        pages = range(1, 10000, 2)

    titles = []
    ids = []
    for page in pages:
        url = base_url + str(page) + url_suffix
        # Without a timeout a single stalled connection blocks the whole crawl.
        response = requests.get(url, timeout=timeout)
        html = response.content.decode('utf-8')
        titles.extend(title_pattern.findall(html))
        ids.extend(id_pattern.findall(html))
    return titles, ids
22
+
23
def split_str(_id):
    """Pull the third underscore-separated field out of each id string.

    Args:
        _id: Iterable of strings, each expected to contain at least two
            underscores (presumably the element ids returned by the scraper;
            verify against the caller).

    Returns:
        A list holding ``item.split('_')[2]`` for every input item, in the
        same order as the input.

    Raises:
        IndexError: If an item has fewer than three underscore-separated
            fields.
    """
    return [raw_id.split('_')[2] for raw_id in _id]
28
+
29
def save_data(zid, bt_, path='./家居用品.xlsx'):
    """Write the scraped titles and ids to an Excel workbook.

    Args:
        zid: Sequence of id strings, stored in the ``ID`` column.
        bt_: Sequence of title strings, stored in the ``标题`` column.
        path: Destination of the workbook. Defaults to the location the
            script originally hard-coded.

    Raises:
        ValueError: Raised by pandas when ``zid`` and ``bt_`` differ in
            length, since both become columns of the same DataFrame.
    """
    frame = pd.DataFrame({
        '标题': bt_,
        'ID': zid,
    })
    # index=False keeps the DataFrame's row index out of the spreadsheet.
    frame.to_excel(path, index=False)
35
+
36
def start_main():
    """Run the whole pipeline: scrape, extract the id field, save to Excel."""
    titles, raw_ids = get_data()
    product_ids = split_str(raw_ids)
    save_data(product_ids, titles)
40
+
41
if __name__ == '__main__':
    # Start scraping only when run as a script, not when imported.
    start_main()
Original file line number Diff line number Diff line change 11
11
##### 5. lagouPositionSpider.py: 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件
12
12
13
13
##### 6. student_img.py: 基于本学校官网的url漏洞,获取所有注册学生学籍证件照
14
+
15
+ ##### 7. JDSpider.py: 大批量抓取京东商品ID和标题
You can’t perform that action at this time.
0 commit comments