forked from gxcuizy/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
cuizhongyi
committed
Jun 19, 2020
1 parent
bed79c9
commit 10caa5e
Showing
7 changed files
with
503 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.idea/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
### 文件结构 | ||
|
||
``` | ||
├── get_ip.py # 代码脚本 | ||
├── ip.json # ip列表的json数据文件 | ||
``` | ||
|
||
### 交流学习 | ||
|
||
如有写的不对或者错误的地方,希望大家指正,相互交流,谢谢。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
利用requests+bs4爬取国内高匿代理IP | ||
author: gxcuizy | ||
date: 2020-06-19 | ||
""" | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
import json | ||
|
||
|
||
class GetIpData(object):
    """Scrape 50 listing pages of proxy IPs and store the ones that respond.

    Each listing page is fetched from ``base_url + <page number>``; every
    table row found is turned into an ``{'ip', 'port', 'type'}`` dict, probed
    through ``check_ip`` and, when the probe succeeds, appended to
    ``self.json_data``.  The collected list is written to ``ip.json``.
    """
    # Browser-like User-Agent sent with every request.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
    # Listing pages are addressed as base_url + page number.
    base_url = 'https://www.xicidaili.com/nn/'
    # URL requested through each candidate proxy to decide whether it works.
    check_url = 'https://www.ip.cn/'

    def __init__(self):
        # Per-instance result list.  A class-level ``[]`` would be one shared
        # object mutated by every instance.
        self.json_data = []

    def get_url_html(self, url):
        """Request a page and return its body.

        Returns the raw response content (bytes) when the server answers
        with status 200, otherwise ``False`` -- including when the request
        itself fails (timeout, connection error, ...).
        """
        try:
            response = requests.get(url=url, headers=self.header, timeout=5)
        except requests.RequestException:
            return False
        if response.status_code == 200:
            return response.content
        return False

    def check_ip(self, ip_info):
        """Return True when ``check_url`` answers 200 through the given proxy.

        ``ip_info`` must provide the keys ``'ip'`` and ``'port'``.
        """
        ip_url = ip_info['ip'] + ':' + str(ip_info['port'])
        # NOTE(review): the requests documentation shows an ``http://`` proxy
        # URL for the 'https' key as well; confirm that the ``https://``
        # scheme used here is what these proxies expect.
        proxies = {'http': 'http://' + ip_url, 'https': 'https://' + ip_url}
        try:
            response = requests.get(url=self.check_url, headers=self.header, proxies=proxies, timeout=5)
        except Exception:
            # Deliberately broad: the address comes from scraped text, so any
            # failure at all simply means "this proxy is not usable".
            return False
        return response.status_code == 200

    def run(self):
        """Crawl pages 1-50, validate every proxy found and write ip.json."""
        for page in range(1, 51):
            # Crawl one listing page at a time.
            print('开始爬取第' + str(page) + '页IP数据')
            html = self.get_url_html(self.base_url + str(page))
            if not html:
                # A failed page must not abort the crawl (and must not be
                # handed to BeautifulSoup, which cannot parse ``False``).
                print('第' + str(page) + '页请求失败,已跳过')
                continue
            soup = BeautifulSoup(html, 'html.parser')
            # Rows of the proxy table.
            for ip_tr in soup.select('#ip_list .odd'):
                td_list = ip_tr.select('td')
                if len(td_list) < 6:
                    # Cells 1, 2 and 5 are read below; skip shorter rows.
                    continue
                info = {
                    'ip': td_list[1].get_text(),
                    'port': td_list[2].get_text(),
                    'type': td_list[5].get_text(),
                }
                # Only store proxies that pass the connectivity probe.
                if self.check_ip(info):
                    print('IP有效:', info)
                    self.json_data.append(info)
                else:
                    print('IP无效:', info)
        # Write once at the end so an interrupted crawl does not leave a
        # truncated, empty ip.json behind.
        with open("ip.json", "w") as write_file:
            json.dump(self.json_data, write_file)
|
||
|
||
# Script entry point: when executed directly, build the scraper and start it.
if __name__ == '__main__':
    GetIpData().run()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
### 文件结构 | ||
|
||
``` | ||
├── boss.py # 代码脚本 | ||
├── job.md # 职位列表的数据字典Markdown文本 | ||
``` | ||
|
||
### 交流学习 | ||
|
||
如有写的不对或者错误的地方,希望大家指正,相互交流,谢谢。 | ||
|
||
**温馨提醒**:最好别用自己的IP去爬,很容易被封进小黑屋,一般封一天,搞个代理比较稳 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
利用requests+bs4爬取Boss直聘数据 | ||
author: gxcuizy | ||
date: 2020-06-18 | ||
""" | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
|
||
class GetBossData(object):
    """Scrape 10 result pages of Boss Zhipin job listings into job.md.

    The search keyword given to the constructor is appended to ``base_url``;
    every job entry found is printed and written as one row of a Markdown
    table in ``job.md``.
    """
    # Site root, prepended to the relative job links found in the listing.
    domain = 'https://www.zhipin.com'
    # Search URL; the keyword is appended after ``query=``.
    base_url = 'https://www.zhipin.com/c101280600/?query='
    position = ''
    # Proxy address used for every request.
    proxies_ip = '58.220.95.30'
    proxies_port = '10174'

    def __init__(self, position):
        """Remember the job keyword to search for."""
        self.position = position

    def get_url_html(self, url, cookie):
        """Request a page through the configured proxy.

        ``cookie`` is sent verbatim as the ``cookie`` request header.
        Returns the raw response content (bytes) on status 200, otherwise
        ``False`` -- including when the request itself fails.
        """
        ip_url = self.proxies_ip + ':' + str(self.proxies_port)
        proxies = {'http': 'http://' + ip_url, 'https': 'https://' + ip_url}
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'cookie': cookie
        }
        try:
            response = requests.get(url=url, headers=header, proxies=proxies, timeout=3)
        except requests.RequestException:
            return False
        if response.status_code == 200:
            return response.content
        return False

    def _page_url(self, page):
        """Build the search URL for one result page."""
        from urllib.parse import quote
        # Percent-encode the keyword so characters such as '+', '&' or '#'
        # stay part of the query value instead of altering the URL.
        keyword = quote(str(self.position))
        return self.base_url + keyword + '&page=' + str(page) + '&ka=page-' + str(page)

    @staticmethod
    def _cookie_for_page(page):
        """Return the hard-coded browser cookie used for the given page.

        The values were captured by hand: open the browser dev tools (F12),
        refresh the page, copy the cookie and replace the strings below.
        """
        if page < 4:
            return 'lastCity=101280600; __zp_seo_uuid__=d59649f5-bc8a-4263-b4e1-d5fb1526ebbe; __c=1592469667; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1592469673; __l=l=%2Fwww.zhipin.com%2Fshenzhen%2F&r=https%3A%2F%2Fwww.google.com%2F&friend_source=0&friend_source=0; toUrl=https%3A%2F%2Fwww.zhipin.com%2F%2Fjob_detail%2F3f35305467e161991nJ429i4GA%7E%7E.html; __a=43955211.1592469667..1592469667.39.1.39.39; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1592530438; __zp_stoken__=7f3aaPCVBFktLe0xkP21%2BJSFCLWILSwx7NEw4bVJkRx8pdBE3JGNmWjVwdx5PXC8rHmN%2BJB0hX1UvTz5VPyMmOhIVHBglVzoxJQIdLQtKR3ZFBFIeazwOByVndHwXBAN%2FXFo7W2BffFxtXSU%3D; __zp_sseed__=Ykg0aQ3ow1dZqyi9KmeVnWrqZXcZ32a4psiagwqme3M=; __zp_sname__=93bf4835; __zp_sts__=1592530479301'
        if page < 7:
            return 'lastCity=101280600; __zp_seo_uuid__=d59649f5-bc8a-4263-b4e1-d5fb1526ebbe; __c=1592469667; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1592469673; __l=l=%2Fwww.zhipin.com%2Fshenzhen%2F&r=https%3A%2F%2Fwww.google.com%2F&friend_source=0&friend_source=0; toUrl=https%3A%2F%2Fwww.zhipin.com%2F%2Fjob_detail%2F3f35305467e161991nJ429i4GA%7E%7E.html; __a=43955211.1592469667..1592469667.39.1.39.39; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1592530438; __zp_stoken__=7f3aaPCVBFktLe0xkP21%2BJSFCLWILSwx7NEw4bVJkRx8pdBE3JGNmWjVwdx5PXC8rHmN%2BJB0hX1UvTz5VPyMmOhIVHBglVzoxJQIdLQtKR3ZFBFIeazwOByVndHwXBAN%2FXFo7W2BffFxtXSU%3D; __zp_sseed__=Ykg0aQ3ow1dZqyi9KmeVnWrqZXcZ32a4psiagwqme3M=; __zp_sname__=93bf4835; __zp_sts__=1592530514188'
        if page < 10:
            return 'lastCity=101280600; __zp_seo_uuid__=d59649f5-bc8a-4263-b4e1-d5fb1526ebbe; __c=1592469667; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1592469673; __l=l=%2Fwww.zhipin.com%2Fshenzhen%2F&r=https%3A%2F%2Fwww.google.com%2F&friend_source=0&friend_source=0; toUrl=https%3A%2F%2Fwww.zhipin.com%2F%2Fjob_detail%2F3f35305467e161991nJ429i4GA%7E%7E.html; __a=43955211.1592469667..1592469667.40.1.40.40; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1592530479; __zp_stoken__=7f3aaPCVBFktLCT4uVVV%2BJSFCLWIVPWZyNUk4bVJkR25XXHVeZWNmWjVwd286Sm83HmN%2BJB0hX1UvBiBVRyt9IWQOcRtWSk83fAsfJAtKR3ZFBE5efUl%2FByVndHwXRQN%2FXFo7W2BffFxtXSU%3D; __zp_sseed__=Ykg0aQ3ow1dZqyi9KmeVnd/9vyiSRHrJFoMai+azsb8=; __zp_sname__=93bf4835; __zp_sts__=1592530496863'
        return 'lastCity=101280600; __zp_seo_uuid__=d59649f5-bc8a-4263-b4e1-d5fb1526ebbe; __c=1592469667; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1592469673; __l=l=%2Fwww.zhipin.com%2Fshenzhen%2F&r=https%3A%2F%2Fwww.google.com%2F&friend_source=0&friend_source=0; toUrl=https%3A%2F%2Fwww.zhipin.com%2F%2Fjob_detail%2F3f35305467e161991nJ429i4GA%7E%7E.html; __a=43955211.1592469667..1592469667.41.1.41.41; __zp_stoken__=7f3aaPCVBFktLc1t4VTp%2BJSFCLWJscnlxSgw4bVJkRw9tLB4pb2NmWjVwdwwgc2l7HmN%2BJB0hX1UvGFZVTH0OdhQQfwxfOyoieW8cOgtKR3ZFBAJYRFMcByVndHwXTwN%2FXFo7W2BffFxtXSU%3D; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1592530497; __zp_sseed__=Ykg0aQ3ow1dZqyi9KmeVnSZKsrhFUU/CYntJcRoFki4=; __zp_sname__=93bf4835; __zp_sts__=1592530514188'

    def run(self):
        """Crawl result pages 1-10 and write every job found to job.md."""
        # Mode 'w' starts from an empty file (the effect the former
        # append + seek(0) + truncate() had); the ``with`` block closes the
        # file even when scraping raises.
        with open('job.md', 'w', encoding='UTF-8') as dict_file:
            dict_file.write('| 岗位 | 区域 | 薪资 | 年限信息 | 公司名称 | 公司信息 | 链接 |')
            dict_file.write('\n| --- | --- | --- | --- | --- | --- | --- |')
            # Crawl one result page at a time.
            for page in range(1, 11):
                print('开始爬取第' + str(page) + '页数据')
                html = self.get_url_html(self._page_url(page), self._cookie_for_page(page))
                if not html:
                    # Skip pages that could not be fetched instead of handing
                    # ``False`` to BeautifulSoup.
                    print('第' + str(page) + '页请求失败,已跳过')
                    continue
                soup = BeautifulSoup(html, 'html.parser')
                # One <li> per job posting.
                for job_li in soup.select('.job-list ul li'):
                    link = job_li.select_one('.job-title a')
                    nodes = (
                        link,
                        job_li.select_one('.job-title .job-area'),
                        job_li.select_one('.job-limit .red'),
                        job_li.select_one('.job-limit p'),
                        job_li.select_one('.info-company h3'),
                        job_li.select_one('.info-company p'),
                    )
                    if any(node is None for node in nodes):
                        # List items lacking any expected element are skipped
                        # rather than raising IndexError.
                        continue
                    title, area, salary, year, company, industry = (
                        node.get_text() for node in nodes
                    )
                    url = self.domain + link.attrs['href']
                    info = {
                        'title': title,
                        'area': area,
                        'salary': salary,
                        'year': year,
                        'company': company,
                        'industry': industry,
                        'url': url
                    }
                    print(info)
                    # Append the job as one Markdown table row.
                    info_demo = '\n| %s | %s | %s | %s | %s | %s | %s |'
                    dict_file.write(info_demo % (title, area, salary, year, company, industry, url))
|
||
|
||
# Script entry point.
if __name__ == '__main__':
    # Ask for the job keyword; surrounding whitespace is stripped.
    job_name = input('请输入职位关键字:').strip()
    if job_name == '':
        print('关键字为空,请重新尝试')
        # Raise SystemExit directly: the ``exit`` helper is injected by the
        # ``site`` module for interactive use and is not guaranteed to exist
        # (e.g. under ``python -S``).  Exit status stays 0 as before.
        raise SystemExit(0)
    # Build the scraper for that keyword and run it.
    gl = GetBossData(job_name)
    gl.run()
Oops, something went wrong.