Skip to content

Commit

Permalink
Merge branch 'master' of github.com:kennethreitz/requests-html
Browse files Browse the repository at this point in the history
  • Loading branch information
kennethreitz committed Mar 9, 2018
2 parents 3a5a94e + ecb1024 commit d9ee89e
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 11 deletions.
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ XPath is also supported (`learn more <https://msdn.microsoft.com/en-us/library/m
>>> r.html.xpath('a')
[<Element 'a' class='btn' href='https://help.github.com/articles/supported-browsers'>]
You can also select only elements containing certian text:
You can also select only elements containing certain text:

.. code-block:: pycon
Expand Down
29 changes: 20 additions & 9 deletions requests_html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
import asyncio
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlparse, urlunparse, urljoin
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
from functools import partial
Expand Down Expand Up @@ -337,15 +337,20 @@ def _make_absolute(self, link):
# Parse the link with stdlib.
parsed = urlparse(link)._asdict()

# Appears to be a relative link:
# If link is relative, then join it with base_url.
if not parsed['netloc']:
parsed['netloc'] = urlparse(self.base_url).netloc
return urljoin(self.base_url, link)

# Link is absolute; if it lacks a scheme, add one from base_url.
if not parsed['scheme']:
parsed['scheme'] = urlparse(self.base_url).scheme

# Re-construct URL, with new data.
parsed = (v for v in parsed.values())
return urlunparse(parsed)
# Reconstruct the URL to incorporate the new scheme.
parsed = (v for v in parsed.values())
return urlunparse(parsed)

# Link is absolute and complete with scheme; nothing to be done here.
return link


@property
Expand All @@ -372,9 +377,15 @@ def base_url(self) -> _URL:
if result:
return result

url = '/'.join(self.url.split('/')[:-1])
if url.endswith('/'):
url = url[:-1]
# Parse the url to separate out the path
parsed = urlparse(self.url)._asdict()

# Remove any part of the path after the last '/'
path = '/'.join(parsed['path'].split('/')[:-1])

# Reconstruct the url with the modified path
parsed = (v for v in parsed.values())
url = urlunparse(parsed)

return url

Expand Down
2 changes: 1 addition & 1 deletion tests/test_internet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def test_pagination():
pages = (
'https://xkcd.com/1957/',
'https://reddit.com/',
'https://pornhub.com/',
'https://smile.amazon.com/',
'https://theverge.com/archives'
)

Expand Down
25 changes: 25 additions & 0 deletions tests/test_requests_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,31 @@ def test_anchor_links():
assert '#site-map' in r.html.links


@pytest.mark.ok
@pytest.mark.parametrize('url,link,expected', [
    ('http://example.com/', 'test.html', 'http://example.com/test.html'),
    ('http://example.com', 'test.html', 'http://example.com/test.html'),
    ('http://example.com/foo/', 'test.html', 'http://example.com/foo/test.html'),
    ('http://example.com/foo/bar', 'test.html', 'http://example.com/foo/test.html'),
    ('http://example.com/foo/', '/test.html', 'http://example.com/test.html'),
    ('http://example.com/', 'http://xkcd.com/about/', 'http://xkcd.com/about/'),
    ('http://example.com/', '//xkcd.com/about/', 'http://xkcd.com/about/'),
])
def test_absolute_links(url, link, expected):
    """Check that ``link`` resolves to ``expected`` when ``url`` is the base.

    The base is supplied two ways: once as the document's own URL, and once
    through a ``<base>`` tag while the document URL points somewhere else.
    """
    base_tag = """<head><base href='{}'></head>"""
    anchor_tag = """<body><a href='{}'>Next</a></body>"""

    # The anchor markup is the same in both scenarios, so render it once.
    anchor_markup = anchor_tag.format(link)

    # Scenario 1: no `<base>` tag, so the document URL acts as the base.
    document = HTML(html=anchor_markup, url=url)
    assert document.absolute_links.pop() == expected

    # Scenario 2: a `<base>` tag carries `url`; the document URL is unrelated.
    document = HTML(
        html=base_tag.format(url) + anchor_markup,
        url='http://example.com/foobar/',
    )
    assert document.absolute_links.pop() == expected


@pytest.mark.render
def test_render():
r = get()
Expand Down

0 comments on commit d9ee89e

Please sign in to comment.