Skip to content

Commit 28079eb

Browse files
committed
Added some webdriver configurability
1 parent 13e8604 commit 28079eb

File tree

6 files changed

+38
-11
lines changed

6 files changed

+38
-11
lines changed

README.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,27 @@
11
scrapy-webdriver
2-
================
2+
================
3+
4+
Scrape using Selenium webdriver.
5+
6+
Configuration
7+
-------------
8+
9+
Add something like this in your scrapy project settings:
10+
11+
DOWNLOAD_HANDLERS = {
12+
'http': 'scrapy_webdriver.download.WebdriverDownloadHandler',
13+
'https': 'scrapy_webdriver.download.WebdriverDownloadHandler',
14+
}
15+
16+
SPIDER_MIDDLEWARES = {
17+
'scrapy_webdriver.middlewares.WebdriverSpiderMiddleware': 543,
18+
}
19+
20+
WEBDRIVER_BROWSER = 'PhantomJS'
21+
22+
Usage
23+
-----
24+
25+
In order to have webdriver handle your downloads, use the provided class
26+
`scrapy_webdriver.http.WebdriverRequest` in place of the stock scrapy `Request`.
27+

scrapy_webdriver/download.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class WebdriverDownloadHandler(object):
1414
1515
"""
1616
def __init__(self, settings):
17-
self._enabled = settings.getbool('WEBDRIVER_ENABLED')
17+
self._enabled = settings.get('WEBDRIVER_BROWSER') is not None
1818
self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
1919

2020
def download_request(self, request, spider):

scrapy_webdriver/manager.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,18 @@ def __init__(self, crawler):
1313
self._lock = Lock()
1414
self._wait_queue = deque()
1515
self._wait_inpage_queue = deque()
16-
self._webdriver = crawler.settings.get('_WEBDRIVER_INSTANCE', None)
16+
self._browser = crawler.settings.get('WEBDRIVER_BROWSER', None)
17+
self._webdriver = None
18+
if isinstance(self._browser, basestring):
19+
self._browser = getattr(webdriver, self._browser)
20+
elif self._browser is not None:
21+
self._webdriver = self._browser
1722

1823
@property
1924
def webdriver(self):
2025
"""Return the webdriver instance, instantiate it if necessary."""
2126
if self._webdriver is None:
22-
self._webdriver = webdriver.PhantomJS()
27+
self._webdriver = self._browser()
2328
self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
2429
return self._webdriver
2530

scrapy_webdriver/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
title = 'scrapy_webdriver'
77
nice_title = 'Webdriver helpers for scrapy'
88
nice_title_no_spaces = nice_title.replace(' ', '')
9-
version = '0.1'
9+
version = '0.2'
1010
description = 'Helpers to work with webdriver in scrapy projects'
1111
authors = ['Nicolas Cadou']
1212
authors_string = ', '.join(authors)

scrapy_webdriver/middlewares.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from scrapy.exceptions import IgnoreRequest, NotConfigured
1+
from scrapy.exceptions import IgnoreRequest
22

33
from .http import WebdriverInPageRequest, WebdriverRequest
44
from .manager import WebdriverManager
@@ -11,8 +11,6 @@ def __init__(self, crawler):
1111

1212
@classmethod
1313
def from_crawler(cls, crawler):
14-
if not crawler.settings.getbool('WEBDRIVER_ENABLED'):
15-
raise NotConfigured
1614
return cls(crawler)
1715

1816
def process_start_requests(self, start_requests, spider):

scrapy_webdriver/tests/test_request_queue.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818
},
1919
SPIDER_MIDDLEWARES={
2020
'scrapy_webdriver.middlewares.WebdriverSpiderMiddleware': 543,
21-
},
22-
WEBDRIVER_ENABLED=True)
21+
})
2322

2423

2524
class TestRequestQueue:
@@ -40,7 +39,7 @@ def _wait(self, url, *args, **kwargs):
4039

4140
def test_priorization(self):
4241
webdriver = Mock()
43-
settings = self.settings(_WEBDRIVER_INSTANCE=webdriver)
42+
settings = self.settings(WEBDRIVER_BROWSER=webdriver)
4443
webdriver.get.side_effect = self._wait
4544
webdriver.page_source = u''
4645

0 commit comments

Comments
 (0)