
Commit

.next()
Signed-off-by: Kenneth Reitz <[email protected]>
kennethreitz committed Mar 3, 2018
1 parent 6249bb7 commit 38f692e
Showing 1 changed file with 100 additions and 20 deletions.
120 changes: 100 additions & 20 deletions requests_html.py
@@ -10,8 +10,10 @@
from pyquery.pyquery import fromstring

from fake_useragent import UserAgent
import lxml
from lxml import etree
from lxml.html import HtmlElement
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode
@@ -57,8 +59,14 @@ class BaseParser:
"""

def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
__slots__ = [
'element', 'url', 'skip_anchors', 'default_encoding', '_encoding',
'_html', '_lxml', '_pq', 'session'
]

def __init__(self, *, element, session: 'HTMLSession' = None, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
self.element = element
self.session = session or HTMLSession()
self.url = url
self.skip_anchors = True
self.default_encoding = default_encoding
@@ -127,7 +135,10 @@ def lxml(self) -> HtmlElement:
:class:`Element <Element>` or :class:`HTML <HTML>`.
"""
if self._lxml is None:
self._lxml = fromstring(self.html, parser='soup')[0]
try:
self._lxml = soup_parse(self.html, features='html.parser')
except ValueError:
self._lxml = lxml.html.fromstring(self.html)

return self._lxml
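For reference, `soup_parse` is `lxml.html.soupparser.fromstring`, which builds the tree through BeautifulSoup and forwards extra keyword arguments (such as `features`) to the BeautifulSoup constructor; when it raises `ValueError`, the property now falls back to lxml's own parser. A small sketch of the lenient path (the markup string is made up):

```python
from lxml.html.soupparser import fromstring as soup_parse

# BeautifulSoup tolerates badly nested markup; 'features' is forwarded
# to the BeautifulSoup constructor.
tree = soup_parse('<p>unclosed <b>markup', features='html.parser')
print(tree.tag)  # 'html' -- soupparser wraps fragments in a document
```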

@@ -145,11 +156,51 @@ def full_text(self) -> _Text:
"""
return self.lxml.text_content()

def find(self, selector: str, first: bool = False, _encoding: str = None) -> _Find:
def next(self, fetch: bool = True):
"""Attempts to find the next page, if there is one."""

def get_next():
candidates = self.find('a', containing=('next', 'more', 'older'))

for candidate in candidates:
if candidate.attrs.get('href'):
# Support 'next' rel (e.g. reddit).
if 'next' in candidate.attrs.get('rel', []):
return candidate.attrs['href']

# Support 'next' in classnames.
for _class in candidate.attrs.get('class', []):
if 'next' in _class:
return candidate.attrs['href']

if 'page' in candidate.attrs['href']:
return candidate.attrs['href']

try:
# Resort to the last candidate.
return candidates[-1].attrs['href']
except IndexError:
return None


next = get_next()
if next:
url = self._make_absolute(next)
else:
return None

if fetch:
return self.session.get(url)
else:
return url
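A minimal usage sketch of the new method (the URL is hypothetical; `HTMLSession` is this module's session class). With `fetch=False` it only resolves the pagination URL; with `fetch=True` it returns the fetched response, or `None` when no candidate link is found:

```python
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com/articles')  # hypothetical URL

# Resolve the next-page URL without fetching it:
print(r.html.next(fetch=False))

# Or fetch the next page directly:
next_page = r.html.next(fetch=True)
```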


def find(self, selector: str = "*", containing: Optional[str] = None, first: bool = False, _encoding: str = None) -> _Find:
"""Given a CSS Selector, returns a list of
:class:`Element <Element>` objects or a single one.
:param selector: CSS Selector to use.
:param containing: If specified, only return elements that contain the provided text (a string, or a list of strings).
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
@@ -174,6 +225,16 @@ def find(self, selector: str, first: bool = False, _encoding: str = None) -> _Find:
for found in self.pq(selector)
]

if containing:
    # Accept either a single string or an iterable of strings:
    if isinstance(containing, str):
        containing = [containing]

    elements_copy = elements.copy()
    elements = []

    for element in elements_copy:
        if any(c.lower() in element.full_text.lower() for c in containing):
            elements.append(element)

    elements.reverse()

return _get_first_or_list(elements, first)
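A quick sketch of the new `containing` filter (markup is made up); matching is case-insensitive against each element's `full_text`:

```python
from requests_html import HTML

doc = HTML(html='<a href="/page/2">Older posts</a><a href="/about">About</a>')

# Only anchors whose text contains one of the given strings are returned:
links = doc.find('a', containing=['older', 'next'])
print([a.attrs['href'] for a in links])  # ['/page/2']
```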

def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
@@ -236,6 +297,23 @@ def gen():

return set(gen())

def _make_absolute(self, link):
"""Makes a given link absolute."""

# Parse the link with stdlib.
parsed = urlparse(link)._asdict()

# Appears to be a relative link:
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme

# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
return urlunparse(parsed)
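A behavior sketch for the helper (the document URL is hypothetical): host-less links are completed from `base_url`, while already-absolute links pass through unchanged. Note that, unlike `urllib.parse.urljoin`, it does not resolve path-relative links against the base path:

```python
from requests_html import HTML

doc = HTML(url='https://example.com/blog/index.html',  # hypothetical
           html='<a href="/page/2">next</a>')

print(doc._make_absolute('/page/2'))              # 'https://example.com/page/2'
print(doc._make_absolute('https://example.org'))  # unchanged
```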


@property
def absolute_links(self) -> _Links:
"""All found links on page, in absolute form
@@ -244,20 +322,7 @@ def absolute_links(self) -> _Links:

def gen():
for link in self.links:
# Parse the link with stdlib.
parsed = urlparse(link)._asdict()

# Appears to be a relative link:
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme

# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
href = urlunparse(parsed)

yield href
yield self._make_absolute(link)

return set(gen())
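After the refactor, `links` still returns hrefs as written in the markup, while `absolute_links` routes each one through `_make_absolute()`; sketched with made-up markup:

```python
from requests_html import HTML

doc = HTML(url='https://example.com/blog/',  # hypothetical
           html='<a href="/about">About</a><a href="https://example.org/x">Ext</a>')

print(doc.links)           # {'/about', 'https://example.org/x'}
print(doc.absolute_links)  # {'https://example.com/about', 'https://example.org/x'}
```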

@@ -269,7 +334,9 @@ def base_url(self) -> _URL:
# Support for <base> tag.
base = self.find('base', first=True)
if base:
return base.attrs['href'].strip()
result = base.attrs['href'].strip()
if result:
return result

url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
@@ -286,6 +353,8 @@ class Element(BaseParser):
:param default_encoding: Which encoding to default to.
"""

__slots__ = BaseParser.__slots__

def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None:
super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)
self.element = element
@@ -301,8 +370,8 @@ def attrs(self) -> _Attrs:
"""
attrs = {k: v for k, v in self.element.items()}

# Split class up, as there are usually many of them:
for attr in ['class']:
# Split class and rel up, as there are usually many of them:
for attr in ['class', 'rel']:
if attr in attrs:
attrs[attr] = tuple(attrs[attr].split())
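With this change both `class` and `rel` come back as tuples of whitespace-split tokens, which is what lets `get_next()` test `'next' in candidate.attrs.get('rel', [])` above. A sketch with made-up markup:

```python
from requests_html import HTML

doc = HTML(html='<a class="btn next-link" rel="next nofollow" href="/2">next</a>')
a = doc.find('a', first=True)

print(a.attrs['class'])  # ('btn', 'next-link')
print(a.attrs['rel'])    # ('next', 'nofollow')
print(a.attrs['href'])   # '/2'
```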

@@ -334,6 +403,17 @@ def __init__(self, *, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING) -> None:
def __repr__(self) -> str:
return f"<HTML url={self.url!r}>"

def __iter__(self):

next = self

while True:
yield next
try:
next = next.next(fetch=True).html
except AttributeError:
break
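This makes an `HTML` object iterable over its own pagination: each step calls `next(fetch=True)` and yields the fetched response's `.html`, stopping when `next()` returns `None` (via the resulting `AttributeError`). A sketch with a hypothetical URL:

```python
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com/articles')  # hypothetical URL

for page in r.html:
    print(page.url)  # the first page, then each page discovered via next()
```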

def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0):
"""Reloads the response in Chromium, and replaces HTML content
with an updated version, with JavaScript executed.
