Skip to content

Commit

Permalink
Add strict param to enforce RECAP_URLS for gateway
Browse files Browse the repository at this point in the history
The gateway has a gaping security hole. It allows users to ls/schema on
arbitrary URLs using whatever credentials the gateway host might have. This is
dangerous in a cloud environment where the host might be given a service account
with access to systems that end users should not have access to. It also is
dangerous now that we have a FilesystemClient that allows users to read the
local disk.

I've fixed this by forcing the gateway to run `ls` and `schema` commands with
`struct=True`. This parmater forces any URLs to be defined in the RECAP_URLS
environment variable. Unknown URLs will now fail with a ValueError.

I have left the CLI with `strict=False` because the users running locally should
be able to query whatever they want using the credentials they have on their
machine.
  • Loading branch information
criccomini committed Oct 20, 2023
1 parent 55b82cc commit fcab653
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 39 deletions.
4 changes: 2 additions & 2 deletions recap/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def ls(url: Annotated[Optional[str], typer.Argument(help="URL to parent.")] = No
List a URL's children.
"""

if children := commands.ls(url):
if children := commands.ls(url, strict=False):
print_json(data=children)


Expand All @@ -30,7 +30,7 @@ def schema(
Get a URL's schema.
"""

struct_obj = commands.schema(url, output_format)
struct_obj = commands.schema(url, output_format, strict=False)
match struct_obj:
case dict():
print_json(data=struct_obj)
Expand Down
5 changes: 3 additions & 2 deletions recap/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,13 @@ def create_client(url: str) -> Generator[Client, None, None]:
raise ValueError(f"No clients available for scheme: {scheme}")


def parse_url(method: str, url: str) -> tuple[str, list[Any]]:
def parse_url(method: str, url: str, strict: bool = True) -> tuple[str, list[Any]]:
"""
Parse a URL into a connection URL and a list of method arguments.
:param method: Either "ls" or "schema".
:param url: URL to parse
:param strict: If True, raise an error if the URL is not configured in settings.
:return: Tuple of connection URL and list of method arguments.
"""

Expand All @@ -63,7 +64,7 @@ def parse_url(method: str, url: str) -> tuple[str, list[Any]]:
module = import_module(module_path)
client_class = getattr(module, class_name)
connection_url, method_args = client_class.parse(method, **url_args)
return (settings.unsafe_url(connection_url), method_args)
return (settings.unsafe_url(connection_url, strict), method_args)
else:
raise ValueError(f"No clients available for scheme: {scheme}")

Expand Down
14 changes: 10 additions & 4 deletions recap/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,37 @@ class SchemaFormat(str, Enum):
}


def ls(url: str | None = None) -> list[str] | None:
def ls(url: str | None = None, strict: bool = True) -> list[str] | None:
"""
List a URL's children.
:param url: URL where children are located. If `url` is None, list root URLs.
:param strict: If True, raise an error if the URL is not configured in settings.
:return: List of children. Values are relative to `url`.
"""

if not url:
return settings.safe_urls
connection_url, method_args = parse_url("ls", url)
connection_url, method_args = parse_url("ls", url, strict)
with create_client(connection_url) as client:
return client.ls(*method_args)


def schema(url: str, format: SchemaFormat = SchemaFormat.recap) -> dict | str:
def schema(
url: str,
format: SchemaFormat = SchemaFormat.recap,
strict: bool = True,
) -> dict | str:
"""
Get a URL's schema.
:param url: URL where schema is located.
:param format: Schema format to convert to.
:param strict: If True, raise an error if the URL is not configured in settings.
:return: Schema in the requested format (encoded as a dict or string).
"""

connection_url, method_args = parse_url("schema", url)
connection_url, method_args = parse_url("schema", url, strict)
with create_client(connection_url) as client:
recap_struct = client.schema(*method_args)
output_obj: dict | str
Expand Down
20 changes: 16 additions & 4 deletions recap/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,21 @@ def safe_urls(self) -> list[str]:

return safe_urls_list

def unsafe_url(self, url: str) -> str:
def unsafe_url(self, url: str, strict: bool = True) -> str:
"""
If scheme, host, and port match a URL in the settings, merge the unsafe
URL's user, pass, path, query, and fragment into the safe URL and return it.
Merge the a URL's user, pass, path, query, and fragment in settings
into the input URL and return it. Merging occurs if:
- The input URL's scheme, hostname, and port match a URL in settings.
- The input URL's path starts with the settings URL's path.
:param url: URL to merge
:param strict: If True, raise an error if the URL is not configured in settings.
:return: URL with user, pass, path, query, and fragment merged from settings.
"""

url_split = urlsplit(url)
url_path = Path(url_split.path or "/").as_posix()

for unsafe_url in self.urls:
unsafe_url_split = urlsplit(unsafe_url.unicode_string())
Expand All @@ -83,6 +91,7 @@ def unsafe_url(self, url: str) -> str:
unsafe_url_split.scheme == url_split.scheme
and unsafe_url_split.hostname == url_split.hostname
and unsafe_url_split.port == url_split.port
and url_path.startswith(Path(unsafe_url_split.path).as_posix())
):
netloc = unsafe_url_split.netloc

Expand All @@ -98,7 +107,7 @@ def unsafe_url(self, url: str) -> str:
(
url_split.scheme,
netloc,
url_split.path.strip("/"),
url_path.strip("/"),
query,
url_split.fragment or unsafe_url_split.fragment,
)
Expand All @@ -112,4 +121,7 @@ def unsafe_url(self, url: str) -> str:

return merged_url

if strict:
raise ValueError(f"URL must be configured in Recap settings: {url}")

return url
12 changes: 8 additions & 4 deletions tests/unit/clients/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,38 +44,42 @@ def test_url_to_dict_multiple_query_params():


@pytest.mark.parametrize(
"method, url, expected_result",
"method, url, expected_result, strict",
[
# ls
(
"ls",
"bigquery://bigquery-url/path1/path2",
("bigquery://", ["path1", "path2", None]),
False,
),
# schema
(
"schema",
"bigquery://bigquery-url/path1/path2/path3",
("bigquery://", ["path1", "path2", "path3"]),
False,
),
# Example of a scheme that isn't supported
(
"ls",
"invalidscheme://invalid-url",
pytest.raises(ValueError, match="No clients available for scheme"),
False,
),
# Test invalid method
(
"invalid_method",
"bigquery://bigquery-url",
pytest.raises(ValueError, match="Invalid method"),
False,
),
],
)
def test_parse_url(method, url, expected_result):
def test_parse_url(method, url, expected_result, strict):
if isinstance(expected_result, tuple):
result = parse_url(method, url)
result = parse_url(method, url, strict)
assert result == expected_result
else:
with expected_result:
parse_url(method, url)
parse_url(method, url, strict)
75 changes: 52 additions & 23 deletions tests/unit/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,96 +55,125 @@ def test_safe_urls(input_urls, expected_output):


@pytest.mark.parametrize(
"stored_urls, merging_url, expected_output",
"stored_urls, merging_url, expected_output, strict",
[
# Basic test with scheme, host, and port match
(
["http://user:pass@example.com"],
"http://example.com",
"http://user:pass@example.com",
True,
),
# Test with a different path in the merging URL
(
["http://user:pass@example.com"],
"http://example.com/path",
"http://user:pass@example.com/path",
True,
),
# Test with a query in the merging URL
(
["http://user:pass@example.com"],
"http://example.com?query=value",
"http://user:pass@example.com?query=value",
True,
),
# Test with a fragment in the merging URL
(
["http://user:pass@example.com"],
"http://example.com#fragment",
"http://user:pass@example.com#fragment",
True,
),
# Test with both a query and a fragment in the merging URL
(
["http://user:pass@example.com"],
"http://example.com?query=value#fragment",
"http://user:pass@example.com?query=value#fragment",
True,
),
# Merging query parameters from both stored and merging URL
(
["http://user:pass@example.com?stored=query"],
"http://example.com?query=value",
"http://user:pass@example.com?stored=query&query=value",
True,
),
# Merging fragments from both stored and merging URL (prefer merging URL's fragment)
(
["http://user:pass@example.com#stored_fragment"],
"http://example.com#fragment",
"http://user:pass@example.com#fragment",
True,
),
# No matching base URL should result in original merging URL being returned
(["http://different.com"], "http://example.com", "http://example.com"),
# Basic merging of paths
(
["http://user:pass@example.com/path1"],
"http://example.com/path2",
"http://user:pass@example.com/path2",
"http://example.com/path1/path2",
"http://user:pass@example.com/path1/path2",
True,
),
# Merging path and query
(
["http://user:pass@example.com/path1?query1=value1"],
"http://example.com/path2?query2=value2",
"http://user:pass@example.com/path2?query1=value1&query2=value2",
"http://example.com/path1/path2?query2=value2",
"http://user:pass@example.com/path1/path2?query1=value1&query2=value2",
True,
),
# Merging path and fragment
(
["http://user:pass@example.com/path1#fragment1"],
"http://example.com/path2#fragment2",
"http://user:pass@example.com/path2#fragment2",
"http://example.com/path1/path2#fragment2",
"http://user:pass@example.com/path1/path2#fragment2",
True,
),
# Merging path, query, and fragment
(
["http://user:pass@example.com/path1?query1=value1#fragment1"],
"http://example.com/path2?query2=value2#fragment2",
"http://user:pass@example.com/path2?query1=value1&query2=value2#fragment2",
"http://example.com/path1/path2?query2=value2#fragment2",
"http://user:pass@example.com/path1/path2?query1=value1&query2=value2#fragment2",
True,
),
# When both URLs have the same paths but different queries or fragments
(
["http://user:pass@example.com/path"],
"http://example.com/path?query=value",
"http://user:pass@example.com/path?query=value",
True,
),
# When stored URL has a longer path than merging URL
# Test unstrict mode
(
["http://user:pass@example.com/path1/path2"],
"http://example.com/path1",
"http://user:pass@example.com/path1",
),
# When merging URL has a longer path than stored URL
(
["http://user:pass@example.com/path1"],
"http://example.com/path1/path2",
"http://user:pass@example.com/path1/path2",
["http://some-other.com"],
"http://example.com/path",
"http://example.com/path",
False,
),
],
)
def test_unsafe_url(stored_urls, merging_url, expected_output):
def test_unsafe_url(stored_urls, merging_url, expected_output, strict):
settings = RecapSettings(urls=stored_urls)
assert settings.unsafe_url(merging_url) == expected_output
assert settings.unsafe_url(merging_url, strict) == expected_output


@pytest.mark.parametrize(
"stored_urls, merging_url",
[
# Different paths
(["http://example.com/path1"], "http://example.com/path2"),
# Shorter path
(["http://example.com/path1/path2"], "http://example.com/path1"),
# Different ports
(["http://example.com:8080"], "http://example.com:9090"),
# Different schemes
(["http://example.com"], "https://example.com"),
# Different hostnames
(["http://example1.com"], "http://example2.com"),
],
)
def test_unsafe_url_exceptions(stored_urls, merging_url):
settings = RecapSettings(urls=stored_urls)

with pytest.raises(
ValueError, match=f"URL must be configured in Recap settings: {merging_url}"
):
settings.unsafe_url(merging_url)

0 comments on commit fcab653

Please sign in to comment.