-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider_text.py
58 lines (58 loc) · 2.57 KB
/
spider_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import re
import sys
# Catalogue page of the novel to download.
TARGET_URL = "http://www.biqukan.com/3_3398/"
# Chapter hrefs on the catalogue page are site-relative; they are appended to this root.
SITE_ROOT = "http://www.biqukan.com"
# Text file that receives every chapter, one after another.
OUTPUT_PATH = '奥古之都之路.txt'
# Heading text that opens the main-text volume. Links listed before this
# heading belong to the "latest chapters" shortcut list and are skipped.
# NOTE(review): assumes the catalogue heading contains this text - confirm
# against the live page.
MAIN_VOLUME_MARKER = '正文卷'
# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
}


def fetch_html(url, headers):
    """Download *url* and return its body decoded as GBK.

    Bytes that cannot be decoded are dropped (``errors='ignore'``).
    Network errors raised by ``urllib`` propagate to the caller.
    """
    req = request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read() fails.
    with request.urlopen(req) as response:
        return response.read().decode('gbk', 'ignore')


def list_chapters(catalogue_html):
    """Return ``[(title, url), ...]`` for the main-text chapters.

    Walks the children of the first ``<dl>`` inside ``div.listmain`` and
    keeps only the links that appear after the heading containing
    ``MAIN_VOLUME_MARKER``.

    Raises:
        RuntimeError: if the page has no ``div.listmain`` containing a ``<dl>``.
    """
    soup = BeautifulSoup(catalogue_html, 'lxml')
    listing = soup.find('div', class_='listmain')
    if listing is None or listing.dl is None:
        raise RuntimeError("chapter list (div.listmain > dl) not found in catalogue page")

    chapters = []
    in_main_volume = False
    for child in listing.dl.children:
        # Text nodes (the newlines between tags) carry no links.
        if isinstance(child, str):
            continue
        link = child.a
        if link is None:
            # Entries without a link are headings; the main-volume heading
            # switches collection on.
            if MAIN_VOLUME_MARKER in child.get_text():
                in_main_volume = True
            continue
        href = link.get('href')
        if in_main_volume and href:
            chapters.append((child.get_text(), SITE_ROOT + href))
    return chapters


def clean_chapter_text(raw):
    """Normalise the raw text of a chapter's content element.

    * non-breaking spaces and ordinary spaces are removed;
    * everything from the first ``'h'`` onwards is discarded;
    * every ``'\\r'`` is followed by an extra ``'\\n'``.

    NOTE(review): the ``'h'`` cut-off presumably strips a trailing
    ``http...`` link appended to the chapter text, but it also truncates a
    chapter at any earlier letter ``h`` - confirm against real pages.
    """
    kept = raw.replace('\xa0', '').partition('h')[0]
    return kept.replace(' ', '').replace('\r', '\r\n')


def extract_chapter_text(chapter_html):
    """Return the cleaned text of ``div#content.showtxt``, or ``''`` if absent."""
    soup = BeautifulSoup(chapter_html, 'lxml')
    content = soup.find('div', id='content', class_='showtxt')
    if content is None:
        return ''
    return clean_chapter_text(content.text)


def format_progress(done, total):
    """Return the progress line for *done* of *total* chapters, as a percentage.

    The line ends with a carriage return so successive calls overwrite each
    other on the terminal.
    """
    percent = done / total * 100 if total else 100.0
    return "已下载:%.3f%%" % percent + "\r"


def main():
    """Download every main-text chapter of the novel into ``OUTPUT_PATH``."""
    chapters = list_chapters(fetch_html(TARGET_URL, REQUEST_HEADERS))
    total = len(chapters)
    # 'with' guarantees the file is flushed and closed even if a download fails.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for done, (title, url) in enumerate(chapters, start=1):
            body = extract_chapter_text(fetch_html(url, REQUEST_HEADERS))
            if not body:
                # Keep going, but make the gap visible instead of hiding it.
                sys.stderr.write("warning: no text found for %s\n" % url)
            out.write(title + '\n\n')
            out.write(body)
            out.write('\n\n')
            # Report download progress.
            sys.stdout.write(format_progress(done, total))
            sys.stdout.flush()


if __name__ == "__main__":
    main()