Skip to content

Commit 45f3473

Browse files
committed
Added an XPath selector that works with scrapy's XPathItemLoader
1 parent 5e25130 commit 45f3473

File tree

3 files changed

+95
-1
lines changed

3 files changed

+95
-1
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@ scrapy-webdriver
33

44
Scrape using Selenium webdriver.
55

6+
Not well tested. It probably has lots of bugs, unhandled situations, and scrapy
7+
integration problems. Use at your own risk.
8+
9+
That being said, it works for me, in all its non-blocking glory.
10+
611
Installation
712
=============
813

scrapy_webdriver/manager.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,28 @@
88

99
class WebdriverManager(object):
1010
"""Manages the life cycle of a webdriver instance."""
11+
USER_AGENT_KEY = 'phantomjs.page.settings.userAgent'
12+
1113
def __init__(self, crawler):
    """Set up the manager state from the crawler's settings.

    ``WEBDRIVER_BROWSER`` may be a string, in which case it is looked up
    as an attribute of the ``webdriver`` module, or any other non-None
    object, in which case that object is used directly as the webdriver
    instance. ``USER_AGENT`` is remembered for later use.

    """
    settings = crawler.settings
    browser = settings.get('WEBDRIVER_BROWSER', None)
    user_agent = settings.get('USER_AGENT', None)

    self.crawler = crawler
    self._lock = Lock()
    self._wait_queue = deque()
    self._wait_inpage_queue = deque()
    self._browser = browser
    self._user_agent = user_agent
    self._webdriver = None

    if browser is None:
        return
    if isinstance(browser, basestring):
        # A name was configured: resolve it to the matching attribute.
        self._browser = getattr(webdriver, browser)
    else:
        # Anything else is taken to be a ready-to-use instance.
        self._webdriver = browser
2225

26+
@property
def _desired_capabilities(self):
    """Capabilities dict derived from the settings, or None if empty.

    The only capability produced is the user agent, stored under
    ``USER_AGENT_KEY``; without a user agent there is nothing to report.

    """
    user_agent = self._user_agent
    if user_agent is None:
        return None
    return {self.USER_AGENT_KEY: user_agent}
32+
2333
@classmethod
2434
def valid_settings(cls, settings):
2535
browser = settings.get('WEBDRIVER_BROWSER')
@@ -32,7 +42,8 @@ def valid_settings(cls, settings):
3242
def webdriver(self):
    """Return the webdriver instance, instantiate it if necessary.

    On first use the configured browser class is called to create the
    instance, and ``self._cleanup`` is connected to the ``engine_stopped``
    signal. Later calls return the same instance.

    """
    if self._webdriver is None:
        options = dict()
        capabilities = self._desired_capabilities
        # Forward capabilities only when there are some: an explicit
        # ``desired_capabilities=None`` would override the driver's own
        # default capabilities instead of leaving them in place.
        if capabilities is not None:
            options['desired_capabilities'] = capabilities
        self._webdriver = self._browser(**options)
        # NOTE(review): the connect call is assumed to belong to the
        # instantiation branch, so it is registered once -- confirm.
        self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
    return self._webdriver
3849

scrapy_webdriver/selector.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import re
2+
3+
from scrapy.selector import XPathSelector, XPathSelectorList
4+
5+
# Matches an XPath containing at least one '/' and captures its last step:
#   group 1: the whole last step (e.g. '@href', 'text()', 'div')
#   group 2: the '@' sign when the last step selects an attribute
#   group 3: the step name without '@' and without '()'
#   group 4: the '()' when the last step is a function-style node test
_UNSUPPORTED_XPATH_ENDING = re.compile(r'.*/((@)?([^/()]+)(\(\))?)$')
6+
7+
8+
class WebdriverXPathSelector(XPathSelector):
    """Scrapy selector that works using XPath selectors in a remote browser.

    Based on some code from Marconi Moreto:
        https://github.com/marconi/ghost-selector

    """
    def __init__(self, response=None, webdriver=None, element=None,
                 *args, **kwargs):
        """Create a selector bound to a webdriver and, optionally, an element.

        :param response: response handed to the parent selector; also the
            source of the webdriver when none is given explicitly.
        :param webdriver: webdriver used to run XPath queries and scripts.
        :param element: element this selector wraps; when unset, queries
            run against the webdriver itself.

        """
        kwargs['response'] = response
        super(WebdriverXPathSelector, self).__init__(*args, **kwargs)
        self.response = response
        self.webdriver = webdriver or response.webdriver
        self.element = element

    def _make_result(self, result):
        """Wrap raw webdriver results into a list of selectors.

        A single value is treated as a one-item list. ``None`` (nothing
        found / nothing returned by a script) yields an empty list rather
        than a selector wrapping ``None``.

        """
        if result is None:
            return []
        if not isinstance(result, list):
            result = [result]
        return [self.__class__(webdriver=self.webdriver, element=e)
                for e in result]

    def select(self, xpath):
        """Return elements using webdriver `find_elements_by_xpath` method.

        Webdriver can only return elements, so a trailing attribute step
        (``/@name``) or ``/text()`` step is cut off before querying, and the
        matched elements are wrapped in helpers that extract the attribute
        or the text instead.

        """
        xpathev = self.element if self.element else self.webdriver
        attribute = None
        wants_text = False
        ending = _UNSUPPORTED_XPATH_ENDING.match(xpath)
        if ending:
            _, atsign, name, parens = ending.groups()
            if atsign:
                attribute = name
                # Drop the trailing '/@' + name.
                xpath = xpath[:-len(name) - 2]
            elif parens and name == 'text':
                wants_text = True
                # Drop the trailing '/text()'.
                xpath = xpath[:-len(name) - 3]
            if xpath.endswith('/'):
                # The removed step followed '//' (e.g. '//a//@href'), which
                # leaves a dangling '/': spell out the element step that
                # '//' stood for so the expression stays valid.
                xpath += 'descendant-or-self::*'
        result = self._make_result(xpathev.find_elements_by_xpath(xpath))
        if attribute is not None:
            result = [_NodeAttribute(r.element, attribute) for r in result]
        elif wants_text:
            result = [_TextNode(self.webdriver, r.element) for r in result]
        return XPathSelectorList(result)

    def select_script(self, script, *args):
        """Return elements using JavaScript snippet execution.

        The script's return value is wrapped into selectors; a script that
        returns nothing produces an empty selector list.

        """
        result = self.webdriver.execute_script(script, *args)
        return XPathSelectorList(self._make_result(result))

    def extract(self):
        """Extract text from selenium element."""
        return self.element.text if self.element else None
56+
57+
58+
class _NodeAttribute(object):
    """Pairs an element with an attribute name.

    Webdriver XPath queries cannot select attributes directly, so the
    element is selected instead and the attribute is read on extraction.

    """

    def __init__(self, element, attribute):
        self.attribute = attribute
        self.element = element

    def extract(self):
        """Return the element's value for the wrapped attribute name."""
        read_attribute = self.element.get_attribute
        return read_attribute(self.attribute)
66+
67+
68+
class _TextNode(object):
    """Works around webdriver XPath inability to select text nodes.

    The element is selected instead, and the value of its first child node
    is read through a JavaScript snippet on extraction.

    """
    # ``firstChild`` is null for an element without child nodes; check it
    # before reading ``nodeValue`` so the script returns null instead of
    # throwing.
    JS_FIND_FIRST_TEXT_NODE = (
        'var node = arguments[0].firstChild; '
        'return node ? node.nodeValue : null;')

    def __init__(self, webdriver, element):
        self.element = element
        self.webdriver = webdriver

    def extract(self):
        """Return the first child's node value, or None if there is none."""
        args = (self.JS_FIND_FIRST_TEXT_NODE, self.element)
        return self.webdriver.execute_script(*args)

0 commit comments

Comments
 (0)