Skip to content

Commit 841168d

Browse files
committed
add ECUT_pos_html.py
1 parent 330436e commit 841168d

File tree

2 files changed

+59
-0
lines changed

2 files changed

+59
-0
lines changed

ECUT_pos_html.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import requests
2+
import re
3+
from bs4 import BeautifulSoup as bs
4+
5+
6+
def crawl_all_main_url(page=10):
    """Collect detail-page URLs from the recruitment-news list on the school site.

    Args:
        page: number of list pages to scan (default 10, i.e. the first ten
            pages of postings).

    Returns:
        list[str]: absolute URLs of every job-posting page found.
    """
    # Hoisted out of the loop: the same pattern is reused for every page.
    link_reg = re.compile('<a class="t_13px" href="(.*?)"')
    all_url_list = []
    for page_num in range(1, page + 1):
        # Query string is pre-encoded GBK for the category name ("招聘信息").
        url = ('http://zjc.ecit.edu.cn/jy/app/newslist.php'
               '?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'.format(page_num))
        page_html = requests.get(url).text
        # Hrefs on the list page are relative to /jy/app/.
        rel_urls = link_reg.findall(page_html)
        all_url_list.extend(
            'http://zjc.ecit.edu.cn/jy/app/{0}'.format(rel) for rel in rel_urls)
    return all_url_list
17+
18+
def get_title(son_url):
    """Return True when the posting at *son_url* is an on-campus recruitment
    event, i.e. its title mentions both a time ('时间') and a place ('地点').

    Args:
        son_url: absolute URL of a single posting page (GBK-encoded).

    Returns:
        bool: True for campus-recruitment pages, False otherwise.
    """
    html = requests.get(son_url).content.decode('gbk')
    titles = re.findall('<h1 class="newstitle">(.*?)</h1>', html)
    if not titles:
        # No title element at all -> cannot classify; original code would
        # have raised IndexError here.
        return False
    title = titles[0]
    # BUG FIX: the original tested ('时间' and '地点') in title, which Python
    # evaluates as just '地点' in title; require BOTH substrings.
    return '时间' in title and '地点' in title
27+
28+
# Shared XHTML wrapper for saved pages (kept byte-identical to the original
# output template, including its unusual prefix text and missing <body> tag).
_PAGE_TEMPLATE = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">U职网提供数据咨询服务 <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> </head> {0} </body>'


def _absolutize_images(text_html):
    """Rewrite the first relative image URL (src="..…") to an absolute one.

    Returns the HTML text, unchanged when no matching image is present.
    """
    domain_url = 'http://zjc.ecit.edu.cn/jy'
    child_url = re.findall(r'border=0 src="\.\.(.*?)"', text_html)
    if not child_url:
        return text_html
    # Only the first image was rewritten in the original implementation;
    # keep that behaviour.
    img_url = domain_url + child_url[0]
    return text_html.replace('src="..{0}"'.format(child_url[0]),
                             'src="{0}"'.format(img_url))


def _write_page(son_url, text_html):
    """Extract the <div id="main"> content and save it wrapped as XHTML.

    The output file name is the last 11 characters of the URL (the news id).
    """
    soup = bs(text_html, 'lxml')
    text_div = soup.find_all('div', id='main')[0]
    page = _PAGE_TEMPLATE.format(text_div)
    with open('./{0}.html'.format(son_url[-11:]), 'wb') as file:
        file.write(page.encode('utf-8'))


def save_html():
    """Download every campus-recruitment posting and store it as local HTML.

    Relative image links inside each posting are rewritten to absolute URLs
    so the saved files still display the images.
    """
    for son_url in crawl_all_main_url():
        if not get_title(son_url):
            continue
        # Pages are served as GBK; decode once before the textual fix-ups.
        # BUG FIX: the original 'else' (no-image) branch re-fetched the URL
        # and then parsed a stale 'text_html' left over from a previous loop
        # iteration (NameError on the first image-less page, wrong content
        # afterwards). Both cases now operate on the freshly fetched page.
        text_html = requests.get(son_url).content.decode('gbk')
        _write_page(son_url, _absolutize_images(text_html))
55+
56+
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    save_html()

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@
1313
##### 6. student_img.py: 基于本学校官网的url漏洞,获取所有注册学生学籍证件照
1414

1515
##### 7. JDSpider.py: 大批量抓取京东商品id和标签
16+
17+
##### 8. ECUT_pos_html.py: 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。

0 commit comments

Comments
 (0)