Skip to content

Commit bd3a3c8

Browse files
authored
Create job_crawl.py
1 parent 5d7b225 commit bd3a3c8

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

jobSkill/job_crawl.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# -*- coding: utf-8 -*-
2+
import requests
3+
from bs4 import BeautifulSoup
4+
import time
5+
import random
6+
7+
urlFileName = 'shurls.txt'        # output file: one job-detail URL per line (opened in append mode)
contentFileName = 'context.txt'   # output file: one '&&&'-joined job record per line (append mode)
9+
10+
11+
def getUrls2Txt(page_num):
    """Crawl 51job search-result pages and append job-detail URLs to urlFileName.

    Fetches pages 1..page_num of the Shanghai ("020000") search for "Python"
    on search.51job.com, pulls the detail link out of each <p class="t1">
    result row, and appends the links (one per line) to urlFileName.
    Sleeps a random 5-30 s between pages to avoid hammering the site.

    Args:
        page_num: number of result pages to fetch (pages 1..page_num).
    """
    last_page = page_num + 1
    for i in range(1, last_page):
        urls = []
        # Scrape the Shanghai ("魔都") listings.
        url = 'https://search.51job.com/list/020000,000000,0000,00,2,99,Python,2,'+str(i)+'.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

        # timeout added: without it a stalled server hangs the crawl forever
        html = requests.get(url, timeout=30)
        soup = BeautifulSoup(html.content, "html.parser")
        rows = soup.find_all('p', class_='t1')   # one <p class="t1"> per job row
        # Loop variable renamed from `p` — the original shadowed the outer `p`.
        for row in rows:
            link = row.find('a')
            urls.append(str(link['href']))
        with open(urlFileName, 'a', encoding='utf-8') as f:
            for u in urls:
                f.write(u+'\n')
        s = random.randint(5, 30)
        print(str(i)+'page done,'+str(s)+'s later')
        time.sleep(s)
31+
32+
def getContent(url, headers):
    """Fetch one 51job detail page and return its fields joined by '&&&'.

    Args:
        url: detail-page URL produced by getUrls2Txt.
        headers: HTTP headers dict (must carry a browser User-Agent).

    Returns:
        'title&&&salary&&&company&&&&&&description' on success, or '' when
        the request fails or the page layout doesn't match the selectors.
    """
    record = ''
    try:
        # timeout added: keep one dead URL from blocking the whole crawl
        html = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(html.content, "html.parser")
        positionTitle = str(soup.find('h1')['title'])
        salary = soup.find_all('strong')[1].get_text()
        companyName = soup.find('p', class_='cname').get_text().strip().replace('\n','').replace('查看所有职位','')
        positionInfo = soup.find(
            'div', class_='bmsg job_msg inbox').get_text().strip().replace('\n', '').replace('分享', '').replace('举报', '').replace(' ', '').replace('\r', '')
        # NOTE(review): the doubled '&&&' leaves an empty field between company
        # and description; kept as-is so existing parsers of context.txt work.
        record = positionTitle + '&&&' + salary + '&&&' + companyName + '&&&' + '&&&' + positionInfo
    except Exception as e:
        # Was a bare print('错误了') that discarded the cause; now report
        # which URL failed and why, so failures are diagnosable.
        print('错误了: ' + url + ' -> ' + repr(e))
    return record
46+
47+
48+
def main():
    """Entry point: collect job URLs, then scrape each detail page to contentFileName."""
    page_num = 93
    getUrls2Txt(page_num)
    user_Agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    headers = {'User-Agent': user_Agent}
    with open(urlFileName, 'r', encoding='utf-8') as f:
        urls = f.readlines()
    i = 0
    # Open the output file once, instead of re-opening it for every record
    # as the original did inside the loop.
    with open(contentFileName, 'a', encoding='utf-8') as out:
        for url in urls:
            url = url.strip()
            if url != '':
                record = getContent(url, headers)
                out.write(record + '\n')
                i += 1
                print(str(i)+'详情抓取完成')
                time.sleep(1)   # be polite between detail-page fetches
    print('完成了')
67+
68+
69+
# Run the crawl only when executed as a script, not on import.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)