Skip to content

Commit

Permalink
糗事百科代理中间件
Browse files Browse the repository at this point in the history
  • Loading branch information
Ehco1996 committed May 18, 2017
1 parent edf089c commit 6dfa1fd
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Scrapy 爬虫框架/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ def test(filename='blank.txt'):


#调用函数!
test('1.txt')
test('kdl_proxy.txt')
Empty file.
17 changes: 17 additions & 0 deletions Scrapy 爬虫框架/xiubai/xiubai/middlewares/coustomProxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
'''
从本地文件proxy.py中
读取可以用的代理列表
并从中随机选择一个代理
供给spider使用
'''


from xiubai.middlewares.proxy import proxies
import random

class RandomProxy(object):
    """Downloader middleware that routes each request through a random proxy.

    Picks one entry from the module-level ``proxies`` list (read from a
    local file) and stores it under ``request.meta['proxy']``, which is the
    key Scrapy's HttpProxyMiddleware honours downstream.
    """

    def process_request(self, request, spider):
        # Choose a fresh proxy for every outgoing request so load is
        # spread across the whole pool.
        chosen_proxy = random.choice(proxies)
        request.meta['proxy'] = 'http://{}'.format(chosen_proxy)
31 changes: 31 additions & 0 deletions Scrapy 爬虫框架/xiubai/xiubai/middlewares/coustomUserAgent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
'''
自定义scrapy框架的
user-agent头
从一个被良好维护的user-agent列表里
随机筛选合适的user-agent
防止封锁
'''
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware

import random



# Pool of User-Agent strings that are unlikely to be blocked;
# RandomUserAgent below picks one at random for each request.
agents = ['Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)']

class RandomUserAgent(UserAgentMiddleware):
    """Downloader middleware that rotates the User-Agent header.

    Overrides Scrapy's stock UserAgentMiddleware so every request carries
    a User-Agent chosen at random from the module-level ``agents`` list,
    making the crawler harder to fingerprint and block.
    """

    def process_request(self, request, spider):
        """Attach a randomly chosen User-Agent to the outgoing request.

        Downloader middlewares must implement this hook; it is one step
        in Scrapy's request/response data flow.  See the docs:
        http://scrapy-chs.readthedocs.io/zh_CN/0.24/topics/downloader-middleware.html
        """
        ua = random.choice(agents)
        # Assign directly instead of setdefault(): setdefault() is a no-op
        # whenever a User-Agent header is already present (e.g. set by the
        # spider or another middleware), which would silently defeat the
        # rotation this middleware exists to provide.  Scrapy's Headers
        # object is case-insensitive, so 'User-Agent' matches 'User-agent'.
        request.headers['User-Agent'] = ua
19 changes: 19 additions & 0 deletions Scrapy 爬虫框架/xiubai/xiubai/middlewares/proxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
'''
通过爬取可用的免费代理,
进行测试可用度之后
筛选获得的代理
将静态的资源文件写在功能文件之外
方便及时更新维护。
'''


# Proxy pool used by RandomProxy: plain "host:port" strings, no scheme
# (the "http://" prefix is added at request time).
# Feel free to add more verified proxies to this list.

proxies = [
'89.36.215.72:1189',
'94.177.203.123:1189',
'110.73.11.227:8123',
'180.183.176.189:8080',
'109.62.247.81:8080',
]
9 changes: 6 additions & 3 deletions Scrapy 爬虫框架/xiubai/xiubai/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,12 @@

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'xiubai.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Downloader middleware chain; lower number = runs earlier on requests.
# NOTE(review): the 'scrapy.contrib.*' paths are pre-1.0 Scrapy; modern
# Scrapy uses 'scrapy.downloadermiddlewares.*' — confirm installed version.
DOWNLOADER_MIDDLEWARES = {
# 10: inject a random proxy into request.meta before anything else.
'xiubai.middlewares.coustomProxy.RandomProxy':10,
# 20: then pick a random User-Agent header for the request.
'xiubai.middlewares.coustomUserAgent.RandomUserAgent': 20,
# None disables the stock UserAgentMiddleware so it cannot interfere.
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':None,
# 100: keep HttpProxyMiddleware enabled so meta['proxy'] is honoured.
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware':100,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
Expand Down
4 changes: 2 additions & 2 deletions Scrapy 爬虫框架/xiubai/xiubai/spiders/hotspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ class HotspiderSpider(scrapy.Spider):
allowed_domains = ["qiushibaike.com"]
start_urls = []
# 我们爬取35页的全部热门段子
for i in range(1,36):
start_urls.append('http://qiushibaike.com/8hr/page/'+str(i)+'/')
for i in range(1,3):
start_urls.append('http://www.qiushibaike.com/8hr/page/'+str(i)+'/')


def parse(self, response):
Expand Down

0 comments on commit 6dfa1fd

Please sign in to comment.