forked from Ehco1996/Python-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
76 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,4 +63,4 @@ def test(filename='blank.txt'): | |
|
||
|
||
#调用函数! | ||
test('1.txt') | ||
test('kdl_proxy.txt') |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
''' | ||
从本地文件proxy.py中 | ||
读取可以用的代理列表 | ||
并从中随机选择一个代理 | ||
供给spider使用 | ||
''' | ||
|
||
|
||
from xiubai.middlewares.proxy import proxies | ||
import random | ||
|
||
class RandomProxy(object):
    """Downloader middleware that sends each request through a random proxy."""

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to ``request.meta['proxy']``.

        The candidate proxies come from the module-level ``proxies`` list
        (``host:port`` strings imported from ``xiubai.middlewares.proxy``).

        Args:
            request: the outgoing request; its ``meta`` mapping is updated
                in place.
            spider: the spider issuing the request (unused here).

        Returns:
            None in every case, so the request keeps flowing through the
            remaining middlewares.
        """
        # random.choice raises IndexError on an empty sequence; with no
        # proxies configured, let the request go out directly instead of
        # failing every download.
        if not proxies:
            return None

        proxy = random.choice(proxies)
        request.meta['proxy'] = 'http://{}'.format(proxy)
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
''' | ||
自定义scrapy框架的 | ||
user-agent头 | ||
从一个被良好维护的user-agent列表里 | ||
随机筛选合适的user-agent | ||
防止封锁 | ||
''' | ||
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware | ||
|
||
import random | ||
|
||
|
||
|
||
# User-agent strings to rotate through so requests are less likely to be
# blocked. Each entry must be a well-formed UA string (balanced parentheses,
# "rv:<version>" token) — a malformed value is itself an easy bot signal.
agents = [
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
]
|
||
class RandomUserAgent(UserAgentMiddleware):
    def process_request(self, request, spider):
        """Give the outgoing request a randomly selected User-agent header.

        This is the downloader-middleware hook: a downloader middleware has
        to define this function, and it is one stage of Scrapy's request
        data flow. See the documentation for details:
        http://scrapy-chs.readthedocs.io/zh_CN/0.24/topics/downloader-middleware.html

        The value is drawn from the module-level ``agents`` list and applied
        with ``setdefault`` — presumably an already-present User-agent header
        is left untouched (dict-style semantics; confirm against Scrapy's
        ``Headers`` class).
        """
        request.headers.setdefault('User-agent', random.choice(agents))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
''' | ||
通过爬取可用的免费代理, | ||
进行测试可用度之后 | ||
筛选获得的代理 | ||
将静态的资源文件写在功能文件之外 | ||
方便及时更新维护。 | ||
''' | ||
|
||
|
||
# Feel free to append more proxies here ("host:port" strings).
proxies = [
    "89.36.215.72:1189",
    "94.177.203.123:1189",
    "110.73.11.227:8123",
    "180.183.176.189:8080",
    "109.62.247.81:8080",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters