From 3dc63883387726dae067d413cebf4f119218862d Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Sun, 15 Sep 2024 08:23:40 +0200 Subject: [PATCH] Refactoring of LocationPath.from_uri() - Add get_uri_path(): to get the path part from an URI - Add is_unc_path(): used for checking if a path is an UNC path, using the capabilities of PureWindowsPath of the Python release; - Add is_drive_path(): to check if is a path starting with a drive. --- tests/test_locations.py | 174 +++++++++++++++++++++++++++++++++++++--- xmlschema/locations.py | 168 ++++++++++++++++++++------------------ 2 files changed, 250 insertions(+), 92 deletions(-) diff --git a/tests/test_locations.py b/tests/test_locations.py index a9303259..aa1edcdc 100644 --- a/tests/test_locations.py +++ b/tests/test_locations.py @@ -9,18 +9,20 @@ # @author Davide Brunato # import unittest +import sys import os import pathlib import platform -from urllib.parse import urlsplit, uses_relative +from urllib.parse import urlsplit from pathlib import Path, PurePath, PureWindowsPath, PurePosixPath from unittest.mock import patch, MagicMock import xmlschema.locations from xmlschema.locations import LocationPath, LocationPosixPath, LocationWindowsPath, \ - is_url, is_local_url, is_remote_url, url_path_is_file, normalize_url, \ - normalize_locations, match_location, is_encoded_url, is_safe_url, encode_url, decode_url + is_url, is_local_url, is_remote_url, url_path_is_file, is_unc_path, is_drive_path, \ + normalize_url, normalize_locations, match_location, is_encoded_url, is_safe_url, \ + encode_url, decode_url, get_uri_path, get_uri, DRIVE_LETTERS TEST_CASES_DIR = str(pathlib.Path(__file__).absolute().parent.joinpath('test_cases')) @@ -30,6 +32,45 @@ ' \n' \ '' +URL_CASES = ( + 'file:///c:/Downloads/file.xsd', + 'file:///tmp/xmlschema/schemas/VC/XMLSchema-versioning.xsd', + 'file:///tmp/xmlschema/schemas/XSD_1.1/xsd11-extra.xsd', + 'issue #000.xml', 'dev/XMLSCHEMA/test.xsd', + 'file:///tmp/xmlschema/schemas/XSI/XMLSchema-instance_minimal.xsd', + 'vehicles.xsd', 'file://filer01/MY_HOME/', + '//anaconda/envs/testenv/lib/python3.6/site-packages/xmlschema/validators/schemas/', + 'z:\\Dir-1.0\\Dir-2_0\\', 'https://host/path?name=2&id=', 'data.xml', + 'alpha', 'other.xsd?id=2', '\\\\filer01\\MY_HOME\\', '//root/dir1', + '/tmp/xmlschema/schemas/XSD_1.1/xsd11-extra.xsd', + '/tmp/xmlschema/schemas/VC/XMLSchema-versioning.xsd', + '\\\\host\\share\\file.xsd', 'https://example.com/xsd/other_schema.xsd', + '/tmp/tests/test_cases/examples/collection/collection.xml', 'XMLSchema.xsd', + 'file:///c:/Windows/unknown', 'k:\\Dir3\\schema.xsd', + '/tmp/tests/test_cases/examples/collection', 'file:other.xsd', + 'issue%20%23000.xml', '\\\\filer01\\MY_HOME\\dev\\XMLSCHEMA\\test.xsd', + 'http://site/base', 'dir2/schema.xsd', '//root/dir1/schema.xsd', + 'file:///tmp/xmlschema/schemas/XML/xml_minimal.xsd', + 'https://site/base', 'file:///home#attribute', + '/dir1/dir2/issue%20%23002', '////root/dir1/schema.xsd', + '/tmp/xmlschema/schemas/XSD_1.1/XMLSchema.xsd', + '/tmp/xmlschema/schemas/XML/xml_minimal.xsd', + '/tmp/xmlschema/schemas/XSD_1.0/XMLSchema.xsd', + 'file:///home/', '////root/dir1', '//root/dir1/', 'file:///home', 'other.xsd', + 'file:///tmp/tests/test_cases/examples/collection/collection.xml', + 'file://host/home/', 'dummy path.xsd', 'other.xsd#element', + 'z:\\Dir_1_0\\Dir2-0\\schemas/XSD_1.0/XMLSchema.xsd', + 'd:/a/xmlschema/xmlschema/tests/test_cases/examples/', + 'https://xmlschema.test/schema 2/test.xsd?name=2 id=3', + 'xsd1.0/schema.xsd', '/home', 'schema.xsd', + 'dev\\XMLSCHEMA\\test.xsd', '../dir1/./dir2', 'beta', + '/tmp/xmlschema/schemas/XSI/XMLSchema-instance_minimal.xsd', + 'file:///dir1/dir2/', 'file:///dir1/dir2/issue%20001', '/root/dir1/schema.xsd', + 'file:///tmp/xmlschema/schemas/XSD_1.1', '/path/schema 2/test.xsd?name=2 id=3', + 'file:////filer01/MY_HOME/', 'file:///home?name=2&id=', 'http://example.com/beta', + '/home/user', 'file:///\\k:\\Dir A\\schema.xsd' +) + def casepath(relative_path): return str(pathlib.Path(TEST_CASES_DIR).joinpath(relative_path)) @@ -63,9 +104,9 @@ def setUpClass(cls): def check_url(self, url, expected): url_parts = urlsplit(url) - if urlsplit(expected).scheme not in uses_relative: - expected = add_leading_slash(expected) + if urlsplit(expected).scheme in DRIVE_LETTERS: + expected = add_leading_slash(expected) expected_parts = urlsplit(expected, scheme='file') self.assertEqual(url_parts.scheme, expected_parts.scheme, @@ -153,6 +194,53 @@ def test_path_from_uri(self): LocationPath.from_uri('file://c:/home/foo/names/') self.assertEqual(str(ec.exception), "Invalid URI 'file://c:/home/foo/names/'") + def test_get_uri(self): + for url in URL_CASES: + self.assertEqual(get_uri(*urlsplit(url)), url) + self.assertEqual(get_uri(*urlsplit(f' {url}')), url) + + url = 'D:/a/xmlschema/xmlschema/tests/test_cases/examples/' + self.assertNotEqual(get_uri(*urlsplit(url)), url) + + # Test urlsplit() roundtrip with urlunsplit() + for url in URL_CASES: + if url == 'file:other.xsd': + if sys.version_info < (3, 13): + self.assertNotEqual(urlsplit(url).geturl(), url) + elif url.startswith(('////', 'file:////')) and not is_unc_path('////'): + self.assertNotEqual(urlsplit(url).geturl(), url) + else: + self.assertEqual(urlsplit(url).geturl(), url) + self.assertEqual(urlsplit(f' {url}').geturl(), url) + + def test_get_uri_path(self): + self.assertEqual(get_uri_path('https', 'host', 'path', 'id=7', 'types'), + '//host/path') + self.assertEqual(get_uri_path('k', '', 'path/file', 'id=7', 'types'), + 'path/file') + self.assertEqual(get_uri_path('file', '', 'path/file', 'id=7', 'types'), + 'path/file') + + def test_urn_uri(self): + with self.assertRaises(ValueError) as ec: + LocationPath.from_uri("urn:ietf:rfc:2648") + self.assertIn("Can't create", str(ec.exception)) + + self.assertEqual(get_uri(scheme='urn', path='ietf:rfc:2648'), 'urn:ietf:rfc:2648') + self.assertEqual(get_uri_path(scheme='urn', path='ietf:rfc:2648'), 'ietf:rfc:2648') + + with self.assertRaises(ValueError) as ec: + get_uri_path(get_uri_path(scheme='urn', path='ietf:rfc:2648:')) + self.assertIn("Invalid URN path ", str(ec.exception)) + + for arg in ('authority', 'query', 'fragment'): + with self.assertRaises(ValueError) as ec: + get_uri_path(get_uri_path(scheme='urn', path='ietf:rfc:2648:', **{arg: 'foo'})) + + self.assertEqual( + str(ec.exception), "An URN can have only scheme and path components" + ) + @unittest.skipIf(platform.system() == 'Windows', "Run only on posix systems") def test_normalize_url_posix(self): url1 = "https://example.com/xsd/other_schema.xsd" @@ -206,7 +294,6 @@ def test_normalize_url_windows(self): normalize_url('file:///\\k:\\Dir A\\schema.xsd') self.assertIn("Invalid URI", str(ec.exception)) - # u base_url = 'D:/a/xmlschema/xmlschema/tests/test_cases/examples/' self.assertEqual(normalize_url('vehicles.xsd', base_url), f'file:///{base_url}vehicles.xsd') @@ -280,7 +367,10 @@ def test_normalize_url_with_base_unc_path(self,): self.assertEqual(url, 'file:////filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') url = normalize_url(r'dev\XMLSCHEMA\test.xsd', base_url=base_url_host_in_path) - self.assertEqual(url, 'file://////filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') + if is_unc_path('////filer01/MY_HOME/'): + self.assertEqual(url, 'file://////filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') + else: + self.assertEqual(url, 'file:///filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') with patch.object(os, 'name', 'posix'): self.assertEqual(os.name, 'posix') @@ -294,7 +384,10 @@ def test_normalize_url_with_base_unc_path(self,): self.assertEqual(url, 'file:////filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') url = normalize_url(r'dev/XMLSCHEMA/test.xsd', base_url=base_url_host_in_path) - self.assertEqual(url, 'file://////filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') + if is_unc_path('////'): + self.assertEqual(url, 'file://////filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') + else: + self.assertEqual(url, 'file:///filer01/MY_HOME/dev/XMLSCHEMA/test.xsd') def test_normalize_url_slashes(self): # Issue #116 @@ -311,10 +404,17 @@ def test_normalize_url_slashes(self): self.assertRegex(normalize_url('/root/dir1/schema.xsd'), f'file://{DRIVE_REGEX}/root/dir1/schema.xsd') - self.assertRegex(normalize_url('////root/dir1/schema.xsd'), - f'file://{DRIVE_REGEX}////root/dir1/schema.xsd') - self.assertRegex(normalize_url('dir2/schema.xsd', '////root/dir1'), - f'file://{DRIVE_REGEX}////root/dir1/dir2/schema.xsd') + if is_unc_path('////root/dir1/schema.xsd'): + self.assertRegex(normalize_url('////root/dir1/schema.xsd'), + f'file://{DRIVE_REGEX}////root/dir1/schema.xsd') + self.assertRegex(normalize_url('dir2/schema.xsd', '////root/dir1'), + f'file://{DRIVE_REGEX}////root/dir1/dir2/schema.xsd') + else: + # If the Python release is not capable to detect the UNC path + self.assertRegex(normalize_url('////root/dir1/schema.xsd'), + f'file://{DRIVE_REGEX}/root/dir1/schema.xsd') + self.assertRegex(normalize_url('dir2/schema.xsd', '////root/dir1'), + f'file://{DRIVE_REGEX}/root/dir1/dir2/schema.xsd') self.assertEqual(normalize_url('//root/dir1/schema.xsd'), 'file:////root/dir1/schema.xsd') @@ -422,6 +522,56 @@ def test_url_path_is_file_function(self): with patch('platform.system', MagicMock(return_value="Windows")): self.assertFalse(url_path_is_file('file:///c:/Windows/unknown')) + def test_is_unc_path_function(self): + self.assertFalse(is_unc_path('')) + self.assertFalse(is_unc_path('foo')) + self.assertFalse(is_unc_path('foo\\bar')) + self.assertFalse(is_unc_path('foo/bar')) + self.assertFalse(is_unc_path('\\')) + self.assertFalse(is_unc_path('/')) + self.assertFalse(is_unc_path('\\foo\\bar')) + self.assertFalse(is_unc_path('/foo/bar')) + self.assertFalse(is_unc_path('c:foo/bar')) + self.assertFalse(is_unc_path('c:\\foo\\bar')) + self.assertFalse(is_unc_path('c:/foo/bar')) + + self.assertTrue(is_unc_path('/\\host/share/path')) + self.assertTrue(is_unc_path('\\/host\\share/path')) + self.assertTrue(is_unc_path('//host/share/dir/file')) + self.assertTrue(is_unc_path('//?/UNC/server/share/dir')) + + if sys.version_info >= (3, 12, 5): + # Generally these tests fail with older Python releases, due to + # bug/limitation of old versions of ntpath.splitdrive() + self.assertTrue(is_unc_path('//')) + self.assertTrue(is_unc_path('\\\\')) + self.assertTrue(is_unc_path('\\\\host\\share\\foo\\bar')) + self.assertTrue(is_unc_path('\\\\?\\UNC\\server\\share\\dir')) + self.assertTrue(is_unc_path('////')) + self.assertTrue(is_unc_path('////host/share/schema.xsd')) + + def test_is_drive_path_function(self): + self.assertFalse(is_drive_path('')) + self.assertFalse(is_drive_path('foo')) + self.assertFalse(is_drive_path('foo\\bar')) + self.assertFalse(is_drive_path('foo/bar')) + self.assertFalse(is_drive_path('\\')) + self.assertFalse(is_drive_path('/')) + self.assertFalse(is_drive_path('\\foo\\bar')) + self.assertFalse(is_drive_path('/foo/bar')) + + self.assertTrue(is_drive_path('c:foo/bar')) + self.assertTrue(is_drive_path('c:\\foo\\bar')) + self.assertTrue(is_drive_path('c:/foo/bar')) + self.assertFalse(is_drive_path('/c:foo/bar')) + self.assertFalse(is_drive_path('\\c:\\foo\\bar')) + self.assertFalse(is_drive_path('/c:/foo/bar')) + + self.assertFalse(is_drive_path('/\\host/share/path')) + self.assertFalse(is_drive_path('\\/host\\share/path')) + self.assertFalse(is_drive_path('//host/share/dir/file')) + self.assertFalse(is_drive_path('//?/UNC/server/share/dir')) + def test_is_encoded_url(self): self.assertFalse(is_encoded_url("https://xmlschema.test/schema/test.xsd")) self.assertTrue(is_encoded_url("https://xmlschema.test/schema/issue%20%231999.xsd")) diff --git a/xmlschema/locations.py b/xmlschema/locations.py index f6748330..002012f6 100644 --- a/xmlschema/locations.py +++ b/xmlschema/locations.py @@ -27,62 +27,6 @@ 'svn', 'svn+ssh', 'nfs', 'git', 'git+ssh', 'ws', 'wss')) -def get_uri_path(scheme: str = '', authority: str = '', path: str = '', - query: str = '', fragment: str = '') -> str: - """ - Get the URI path from components, according to https://datatracker.ietf.org/doc/html/rfc3986. - """ - if scheme == 'urn': - if not path or authority or query or fragment: - raise XMLSchemaValueError("an URN can have only scheme and the path components") - elif path.startswith(':') or path.endswith(':'): - raise XMLSchemaValueError(f"invalid URN path {path!r}") - return path - elif authority: - if path and path[0] != '/': - return f'{authority}/{path}' - else: - return f'{authority}{path}' - elif path[:2] == '//': - return f'//{path}' - elif scheme and scheme not in DRIVE_LETTERS and (not path or path[0] == '/'): - return f'//{path}' - else: - return path - - -def get_uri(scheme: str = '', authority: str = '', path: str = '', - query: str = '', fragment: str = '') -> str: - """ - Get the URI from components, according to https://datatracker.ietf.org/doc/html/rfc3986. - """ - if scheme == 'urn': - return f'urn:{get_uri_path(scheme, authority, path, query, fragment)}' - - path = get_uri_path(scheme, authority, path, query, fragment) - - if authority: - if path and path[0] != '/': - url = f'//{authority}/{path}' - else: - url = f'//{authority}{path}' - elif path[:2] == '//': - url = f'//{path}' - elif scheme and scheme not in DRIVE_LETTERS and (not path or path[0] == '/'): - url = f'//{path}' - else: - url = path - - if scheme: - url = scheme + ':' + url - if query: - url = url + '?' + query - if fragment: - url = url + '#' + fragment - - return url - - class LocationPath(PurePath): """ A version of pathlib.PurePath with an enhanced URI conversion and for @@ -117,38 +61,49 @@ def from_uri(cls, uri: str) -> 'LocationPath': parts = urlsplit(uri) if not parts.scheme or parts.scheme == 'file': - path = get_uri(authority=parts.netloc, path=parts.path) + path = get_uri_path(authority=parts.netloc, path=parts.path) + + # Detect invalid Windows paths (rooted or UNC path followed by a drive) + for k in range(len(path)): + if path[k] not in '/\\': + if not k or not is_drive_path(path[k:]): + break + elif k == 1 and parts.scheme == 'file': + # Valid case for a URL with a file scheme + return LocationWindowsPath(unquote(path[1:])) + else: + raise XMLSchemaValueError(f"Invalid URI {uri!r}") + + if '\\' in path or platform.system() == 'Windows': + return LocationWindowsPath(unquote(path)) + elif ntpath.splitdrive(path)[0]: + location_path = LocationWindowsPath(unquote(path)) + if location_path.drive: + # PureWindowsPath not detects a drive in Python 3.11.x also + # if it's detected by ntpath.splitdrive(). + return location_path + + return LocationPosixPath(unquote(path)) + elif parts.scheme in DRIVE_LETTERS: # uri is a Windows path with a drive, e.g. k:/Python/lib/file # urlsplit() converts the scheme to lowercase so use uri[0] - path = get_uri(scheme=uri[0], authority=parts.netloc, path=parts.path) - + path = f'{uri[0]}:{get_uri_path(authority=parts.netloc, path=parts.path)}' return LocationWindowsPath(unquote(path)) + + elif parts.scheme == 'urn': + raise XMLSchemaValueError(f"Can't create a {cls!r} from an URN!") else: return LocationPosixPath(unquote(parts.path)) - if parts.scheme == 'file': - # Detect Windows drives in a 'file' scheme URL - path_start = path[:4].replace('\\', '/') - if path_start.startswith(('////', '///')): - pass - elif path_start.startswith('/') and ntpath.splitdrive(path[1:])[0]: - return LocationWindowsPath(unquote(path[1:])) - elif path_start.startswith('//') and ntpath.splitdrive(path[2:])[0]: - raise XMLSchemaValueError(f"Invalid URI {uri!r}") - - if ntpath.splitdrive(path)[0] or '\\' in path: - return LocationWindowsPath(unquote(path)) - return cls(unquote(path)) - def as_uri(self) -> str: # Implementation that maps relative paths to not RFC 8089 compliant relative # file URIs because urlopen() doesn't accept simple paths. For UNC paths uses # the format with four slashes to let urlopen() works. drive = self.drive - if len(drive) == 2 and drive[1] == ':': + if len(drive) == 2 and drive[1] == ':' and drive[0] in DRIVE_LETTERS: # A Windows path with a drive: 'c:\dir\file' => 'file:///c:/dir/file' prefix = 'file:///' + drive path = self.as_posix()[2:] @@ -183,6 +138,50 @@ class LocationWindowsPath(LocationPath, PureWindowsPath): __slots__ = () +def get_uri_path(scheme: str = '', authority: str = '', path: str = '', + query: str = '', fragment: str = '') -> str: + """ + Get the URI path from components, according to https://datatracker.ietf.org/doc/html/rfc3986. + The returned path includes the authority. + """ + if scheme == 'urn': + if not path or authority or query or fragment: + raise XMLSchemaValueError("An URN can have only scheme and path components") + elif path.startswith(':') or path.endswith(':'): + raise XMLSchemaValueError(f"Invalid URN path {path!r}") + return path + elif authority: + if path and path[:1] != '/': + return f'//{authority}/{path}' + else: + return f'//{authority}{path}' + elif path[:2] == '//': + return f'//{path}' # UNC path + elif scheme and scheme not in DRIVE_LETTERS and (not path or path[0] == '/'): + return f'//{path}' + else: + return path + + +def get_uri(scheme: str = '', authority: str = '', path: str = '', + query: str = '', fragment: str = '') -> str: + """ + Get the URI from components, according to https://datatracker.ietf.org/doc/html/rfc3986. + """ + if scheme == 'urn': + return f'urn:{get_uri_path(scheme, authority, path, query, fragment)}' + + url = get_uri_path(scheme, authority, path, query, fragment) + if scheme: + url = scheme + ':' + url + if query: + url = url + '?' + query + if fragment: + url = url + '#' + fragment + + return url.rstrip() + + def normalize_url(url: str, base_url: Optional[str] = None, keep_relative: bool = False, method: str = 'xml') -> str: """ @@ -201,12 +200,7 @@ def normalize_url(url: str, base_url: Optional[str] = None, """ url_parts = urlsplit(url) if not is_local_scheme(url_parts.scheme): - url = get_uri(scheme=url_parts.scheme, - authority=url_parts.netloc, - path=url_parts.path, - query=url_parts.query, - fragment=url_parts.fragment) - return encode_url(url, method) + return encode_url(url_parts.geturl(), method) path = LocationPath.from_uri(url) if path.is_absolute(): @@ -305,6 +299,20 @@ def url_path_is_file(url: str) -> bool: return os.path.isfile(path) +def is_unc_path(path: str) -> bool: + """ + Returns `True` if the provided path is a UNC path, `False` otherwise. + Based on the capabilities of `PureWindowsPath` of the Python release. + """ + return PureWindowsPath(path).drive.startswith('\\\\') + + +def is_drive_path(path: str) -> bool: + """Returns `True` if the provided path starts with a drive (e.g. 'C:'), `False` otherwise.""" + drive = ntpath.splitdrive(path)[0] + return len(drive) == 2 and drive[1] == ':' and drive[0] in DRIVE_LETTERS + + def is_encoded_url(url: str) -> bool: """ Determines whether the given URL is encoded. The case with '+' and without