Skip to content

Commit

Permalink
Remove support for passing hostname in URI
Browse files Browse the repository at this point in the history
This feature depended on fragile URL parsing, which no longer works after https://bugs.python.org/issue43882. In addition, this feature defeats active NameNode tracking.
Users should instead create a new client object for different clusters.

As a bonus, we can now test against additional pathological characters.
  • Loading branch information
Jing Wang committed Jul 18, 2021
1 parent 20c8ad3 commit 77be5f1
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 61 deletions.
6 changes: 0 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,6 @@ Example usage:
True
You can also pass the hostname as part of the URI:

.. code-block:: python
fs.list_status('//nn1.example.com:50070;nn2.example.com:50070/')
The methods and return values generally map directly to `WebHDFS endpoints <https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html>`_.
The client also provides convenience methods that mimic Python ``os`` methods and HDFS CLI commands (e.g. ``walk`` and ``copy_to_local``).

Expand Down
35 changes: 7 additions & 28 deletions pyhdfs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from typing import Union
from typing import cast
from urllib.parse import quote as url_quote
from urllib.parse import urlsplit

import requests.api
import requests.exceptions
Expand Down Expand Up @@ -369,37 +368,15 @@ def _parse_hosts(self, hosts: Union[str, Iterable[str]]) -> List[str]:
random.shuffle(host_list)
return host_list

def _parse_path(self, path: str) -> Tuple[List[str], str]:
"""Return (hosts, path) tuple"""
# Support specifying another host via hdfs://host:port/path syntax
# We ignore the scheme and piece together the query and fragment
# Note that HDFS URIs are not URL encoded, so a '?' or a '#' in the URI is part of the
# path
parts = urlsplit(path, allow_fragments=False)
if not parts.path.startswith("/"):
raise ValueError("Path must be absolute, was given {}".format(path))
if parts.scheme not in ("", "hdfs", "hftp", "webhdfs"):
warnings.warn("Unexpected scheme {}".format(parts.scheme))
assert not parts.fragment
path = parts.path
if parts.query:
path += "?" + parts.query
if parts.netloc:
hosts = self._parse_hosts(parts.netloc)
else:
hosts = self.hosts
return hosts, path

def _record_last_active(self, host: str) -> None:
"""Put host first in our host list, so we try it first next time
The implementation of get_active_namenode relies on this reordering.
"""
# this check is for when user passes a host at request time
if host in self.hosts:
# Keep this thread safe: set hosts atomically and update it before the timestamp
self.hosts = [host] + [h for h in self.hosts if h != host]
self._last_time_recorded_active = time.time()
assert host in self.hosts
# Keep this thread safe: set hosts atomically and update it before the timestamp
self.hosts = [host] + [h for h in self.hosts if h != host]
self._last_time_recorded_active = time.time()

def _request(
self,
Expand All @@ -414,7 +391,9 @@ def _request(
This function handles NameNode failover and error checking.
All kwargs are passed as query params to the WebHDFS server.
"""
hosts, path = self._parse_path(path)
hosts = self.hosts
if not posixpath.isabs(path):
raise ValueError("Path must be absolute, was given {}".format(path))
_transform_user_name_key(kwargs)
kwargs.setdefault("user.name", self.user_name)

Expand Down
29 changes: 2 additions & 27 deletions test_pyhdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from typing import cast
from unittest import mock

import pytest
import requests
from requests.api import request as original_request

Expand All @@ -37,10 +36,9 @@
FILE_CONTENTS = b"lorem ipsum dolor sit amet"
FILE_CONTENTS2 = b"some stuff"

# Exclude control characters and special path characters
# Exclude special path characters
PATHOLOGICAL_NAME = (
"".join(chr(n) for n in range(32, 128) if chr(n) not in {"/", ":", ";"})
+ "\b\t\n\f\r中文"
"".join(chr(n) for n in range(0, 150) if chr(n) not in {"/", ":"}) + "中文"
)


Expand Down Expand Up @@ -354,21 +352,6 @@ def mock_request(*args: object, **kwargs: object) -> requests.Response:
else:
self.fail("should have thrown") # pragma: no cover

def test_host_in_request(self) -> None:
"""Client should support specifying host afterwards"""
client = make_client("does_not_exist")
with self.assertRaises(HdfsNoServerException):
client.get_file_status("/")
client.get_file_status("//localhost/")
client.get_file_status("hdfs://localhost:50070/")
with pytest.warns(UserWarning) as record:
client.get_file_status("foobar://localhost:50070/")
assert len(record) == 1
message = record[0].message
assert isinstance(message, Warning)
assert message.args[0] == "Unexpected scheme foobar"
assert client.hosts == ["does_not_exist:50070"]

def test_concat(self) -> None:
MIN_BLOCK_SIZE = 1024 * 1024
client = make_client()
Expand Down Expand Up @@ -487,14 +470,6 @@ def test_walk(self) -> None:
(path("a1", "b2"), [], []),
(path("a2"), [], []),
]
prefix = "//localhost"
assert list(client.walk(prefix + TEST_DIR, topdown=False)) == [
(prefix + path("a1", "b1"), [], ["f2"]),
(prefix + path("a1", "b2"), [], []),
(prefix + path("a1"), ["b1", "b2"], []),
(prefix + path("a2"), [], []),
(prefix + path(), ["a1", "a2"], ["f1"]),
]

def test_walk_error(self) -> None:
client = make_client()
Expand Down

0 comments on commit 77be5f1

Please sign in to comment.