|
| 1 | +import requests |
| 2 | +import re |
| 3 | +from bs4 import BeautifulSoup as bs |
| 4 | + |
| 5 | + |
def crawl_all_main_url(page=10, timeout=10):
    """Collect the detail-page URLs listed on the recruitment news index.

    Requests index pages 1..page of the recruitment-info category and pulls
    the ``href`` of every ``<a class="t_13px" ...>`` link out of each page,
    prefixing it with the site's application base URL.

    Args:
        page: Number of index pages to scan, starting at page 1. Defaults to
            the first ten pages.
        timeout: Seconds to wait for each HTTP response; without it a stalled
            server would block the crawl indefinitely.

    Returns:
        A list of absolute detail-page URLs in the order they were found.
    """
    base_url = 'http://zjc.ecit.edu.cn/jy/app/'
    list_url = base_url + 'newslist.php?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'
    # Compiled once: the pattern does not depend on the page being scanned.
    link_reg = re.compile(r'<a class="t_13px" href="(.*?)"')

    all_url_list = []
    for page_no in range(1, page + 1):
        page_html = requests.get(list_url.format(page_no), timeout=timeout).text
        all_url_list.extend(base_url + href for href in link_reg.findall(page_html))
    return all_url_list
| 17 | + |
def get_title(son_url, timeout=10):
    """Return whether the page at son_url is a campus recruitment notice.

    The page is downloaded, decoded as GBK, and the text of its first
    ``<h1 class="newstitle">`` heading is inspected. The page qualifies only
    when that heading mentions both '时间' (time) and '地点' (place).

    Args:
        son_url: Absolute URL of the detail page to inspect.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        True when the heading contains both keywords; False otherwise,
        including when the page has no matching heading.
    """
    html = requests.get(son_url, timeout=timeout).content.decode('gbk')
    heading = re.search(r'<h1 class="newstitle">(.*?)</h1>', html)
    if heading is None:
        # No title element at all: treat as "not a recruitment notice"
        # instead of failing with IndexError.
        return False
    title = heading.group(1)
    # Each keyword needs its own membership test: `('时间' and '地点') in title`
    # collapses to `'地点' in title` because `and` yields its last operand.
    return '时间' in title and '地点' in title
| 27 | + |
def save_html(page=10, timeout=10):
    """Save the main content of each campus-recruitment page to a local file.

    For every URL returned by crawl_all_main_url that get_title accepts, the
    page is downloaded and decoded as GBK, its ``..``-relative image sources
    are rewritten to absolute URLs, the ``<div id="main">`` element is
    extracted, and the result is written UTF-8 encoded to
    ``./<last 11 characters of the URL>.html`` in the working directory.

    Args:
        page: Number of index pages to crawl; forwarded to
            crawl_all_main_url.
        timeout: Seconds to wait for each page download made here.

    Returns:
        None. The function's effect is the files it writes.
    """
    domain_url = 'http://zjc.ecit.edu.cn/jy'
    # Raw string: in a plain literal `\.` is an invalid escape sequence.
    img_url_reg = re.compile(r'border=0 src="\.\.(.*?)"')
    # Output wrapper, reproduced unchanged so generated files keep their
    # existing layout; {0} receives the extracted main <div>.
    page_template = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
        'U职网提供数据咨询服务 <html xmlns="http://www.w3.org/1999/xhtml"> '
        '<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> '
        '</head> {0} </body>'
    )

    for son_url in crawl_all_main_url(page):
        if not get_title(son_url):
            continue

        text_html = requests.get(son_url, timeout=timeout).content.decode('gbk')

        # Make every relative image path absolute (not just the first one) so
        # the saved copy still shows its pictures. dict.fromkeys removes
        # duplicate paths while keeping their order.
        for child_url in dict.fromkeys(img_url_reg.findall(text_html)):
            text_html = text_html.replace(
                'src="..{0}"'.format(child_url),
                'src="{0}{1}"'.format(domain_url, child_url),
            )

        main_divs = bs(text_html, 'lxml').find_all('div', id='main')
        if not main_divs:
            # Nothing to save; skip this page rather than abort the whole run.
            continue

        # Parse before opening the file so a failure cannot leave an empty
        # file behind.
        with open('./{0}.html'.format(son_url[-11:]), 'wb') as file:
            file.write(page_template.format(main_divs[0]).encode('utf-8'))
| 55 | + |
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    save_html()
0 commit comments