
Commit

.next()
Signed-off-by: Kenneth Reitz <[email protected]>
kennethreitz committed Mar 3, 2018
1 parent 6249bb7 commit 38f692e
Showing 1 changed file with 100 additions and 20 deletions.
120 changes: 100 additions & 20 deletions requests_html.py
@@ -10,8 +10,10 @@
from pyquery.pyquery import fromstring

from fake_useragent import UserAgent
import lxml
from lxml import etree
from lxml.html import HtmlElement
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode
@@ -57,8 +59,14 @@ class BaseParser:
"""

def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
__slots__ = [
'element', 'url', 'skip_anchors', 'default_encoding', '_encoding',
'_html', '_lxml', '_pq', 'session'
]

def __init__(self, *, element, session: 'HTMLSession' = None, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
self.element = element
self.session = session or HTMLSession()
self.url = url
self.skip_anchors = True
self.default_encoding = default_encoding
@@ -127,7 +135,10 @@ def lxml(self) -> HtmlElement:
:class:`Element <Element>` or :class:`HTML <HTML>`.
"""
if self._lxml is None:
self._lxml = fromstring(self.html, parser='soup')[0]
try:
self._lxml = soup_parse(self.html, features='html.parser')
except ValueError:
self._lxml = lxml.html.fromstring(self.html)

return self._lxml
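For reference, `soup_parse` is `lxml.html.soupparser.fromstring`, which builds the tree through BeautifulSoup and forwards extra keyword arguments (such as `features`) to the BeautifulSoup constructor; when it raises `ValueError`, the property now falls back to lxml's own parser. A small sketch of the lenient path (the markup string is made up):

```python
from lxml.html.soupparser import fromstring as soup_parse

# BeautifulSoup tolerates badly nested markup; 'features' is forwarded
# to the BeautifulSoup constructor.
tree = soup_parse('<p>unclosed <b>markup', features='html.parser')
print(tree.tag)  # 'html' -- soupparser wraps fragments in a document
```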

@@ -145,11 +156,51 @@ def full_text(self) -> _Text:
"""
return self.lxml.text_content()

def find(self, selector: str, first: bool = False, _encoding: str = None) -> _Find:
def next(self, fetch: bool = True):
"""Attempts to find the next page, if there is one."""

def get_next():
candidates = self.find('a', containing=('next', 'more', 'older'))

for candidate in candidates:
if candidate.attrs.get('href'):
# Support 'next' rel (e.g. reddit).
if 'next' in candidate.attrs.get('rel', []):
return candidate.attrs['href']

# Support 'next' in classnames.
for _class in candidate.attrs.get('class', []):
if 'next' in _class:
return candidate.attrs['href']

if 'page' in candidate.attrs['href']:
return candidate.attrs['href']

try:
# Resort to the last candidate.
return candidates[-1].attrs['href']
except IndexError:
return None


next = get_next()
if next:
url = self._make_absolute(next)
else:
return None

if fetch:
return self.session.get(url)
else:
return url
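A minimal usage sketch of the new method (the URL is hypothetical; `HTMLSession` is this module's session class). With `fetch=False` it only resolves the pagination URL; with `fetch=True` it returns the fetched response, or `None` when no candidate link is found:

```python
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com/articles')  # hypothetical URL

# Resolve the next-page URL without fetching it:
print(r.html.next(fetch=False))

# Or fetch the next page directly:
next_page = r.html.next(fetch=True)
```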


def find(self, selector: str = "*", containing: Optional[str] = None, first: bool = False, _encoding: str = None) -> _Find:
"""Given a CSS Selector, returns a list of
:class:`Element <Element>` objects or a single one.
:param selector: CSS Selector to use.
:param containing: If specified, only return elements that contain the provided text (a string, or a list of strings).
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
@@ -174,6 +225,16 @@ def find(self, selector: str, first: bool = False, _encoding: str = None) -> _Find:
for found in self.pq(selector)
]

if containing:
    # Accept either a single string or an iterable of strings:
    if isinstance(containing, str):
        containing = [containing]

    elements_copy = elements.copy()
    elements = []

    for element in elements_copy:
        if any(c.lower() in element.full_text.lower() for c in containing):
            elements.append(element)

    elements.reverse()

return _get_first_or_list(elements, first)
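A quick sketch of the new `containing` filter (markup is made up); matching is case-insensitive against each element's `full_text`:

```python
from requests_html import HTML

doc = HTML(html='<a href="/page/2">Older posts</a><a href="/about">About</a>')

# Only anchors whose text contains one of the given strings are returned:
links = doc.find('a', containing=['older', 'next'])
print([a.attrs['href'] for a in links])  # ['/page/2']
```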

def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
@@ -236,6 +297,23 @@ def gen():

return set(gen())

def _make_absolute(self, link):
"""Makes a given link absolute."""

# Parse the link with stdlib.
parsed = urlparse(link)._asdict()

# Appears to be a relative link:
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme

# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
return urlunparse(parsed)
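A behavior sketch for the helper (the document URL is hypothetical): host-less links are completed from `base_url`, while already-absolute links pass through unchanged. Note that, unlike `urllib.parse.urljoin`, it does not resolve path-relative links against the base path:

```python
from requests_html import HTML

doc = HTML(url='https://example.com/blog/index.html',  # hypothetical
           html='<a href="/page/2">next</a>')

print(doc._make_absolute('/page/2'))              # 'https://example.com/page/2'
print(doc._make_absolute('https://example.org'))  # unchanged
```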


@property
def absolute_links(self) -> _Links:
"""All found links on page, in absolute form
@@ -244,20 +322,7 @@ def absolute_links(self) -> _Links:

def gen():
for link in self.links:
# Parse the link with stdlib.
parsed = urlparse(link)._asdict()

# Appears to be a relative link:
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme

# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
href = urlunparse(parsed)

yield href
yield self._make_absolute(link)

return set(gen())
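After the refactor, `links` still returns hrefs as written in the markup, while `absolute_links` routes each one through `_make_absolute()`; sketched with made-up markup:

```python
from requests_html import HTML

doc = HTML(url='https://example.com/blog/',  # hypothetical
           html='<a href="/about">About</a><a href="https://example.org/x">Ext</a>')

print(doc.links)           # {'/about', 'https://example.org/x'}
print(doc.absolute_links)  # {'https://example.com/about', 'https://example.org/x'}
```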

@@ -269,7 +334,9 @@ def base_url(self) -> _URL:
# Support for <base> tag.
base = self.find('base', first=True)
if base:
return base.attrs['href'].strip()
result = base.attrs['href'].strip()
if result:
return result

url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
@@ -286,6 +353,8 @@ class Element(BaseParser):
:param default_encoding: Which encoding to default to.
"""

__slots__ = BaseParser.__slots__

def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None:
super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)
self.element = element
@@ -301,8 +370,8 @@ def attrs(self) -> _Attrs:
"""
attrs = {k: v for k, v in self.element.items()}

# Split class up, as there are usually many of them:
for attr in ['class']:
# Split class and rel up, as there are usually many of them:
for attr in ['class', 'rel']:
if attr in attrs:
attrs[attr] = tuple(attrs[attr].split())
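With this change both `class` and `rel` come back as tuples of whitespace-split tokens, which is what lets `get_next()` test `'next' in candidate.attrs.get('rel', [])` above. A sketch with made-up markup:

```python
from requests_html import HTML

doc = HTML(html='<a class="btn next-link" rel="next nofollow" href="/2">next</a>')
a = doc.find('a', first=True)

print(a.attrs['class'])  # ('btn', 'next-link')
print(a.attrs['rel'])    # ('next', 'nofollow')
print(a.attrs['href'])   # '/2'
```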

@@ -334,6 +403,17 @@ def __init__(self, *, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None:
def __repr__(self) -> str:
return f"<HTML url={self.url!r}>"

def __iter__(self):

next = self

while True:
yield next
try:
next = next.next(fetch=True).html
except AttributeError:
break
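This makes an `HTML` object iterable over its own pagination: each step calls `next(fetch=True)` and yields the fetched response's `.html`, stopping when `next()` returns `None` (via the resulting `AttributeError`). A sketch with a hypothetical URL:

```python
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com/articles')  # hypothetical URL

for page in r.html:
    print(page.url)  # the first page, then each page discovered via next()
```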

def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0):
"""Reloads the response in Chromium, and replaces HTML content
with an updated version, with JavaScript executed.
