Skip to content

Commit

Permalink
Update URL parsers to include :// on empty netloc
Browse files Browse the repository at this point in the history
Python's `urlunparse` has this annoying behavior where it doesn't include double
slashes if the netloc param is empty. This is annoying with URLs like
`sqlite:///foo/bar/baz`. The parser would return `sqlite:/foo/bar/baz` in this
scenario. I've updated the safe/unsafe methods in `RecapSettings` to account for
this.
  • Loading branch information
criccomini committed Feb 5, 2024
1 parent 6ae133f commit 0ea697d
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 8 deletions.
4 changes: 3 additions & 1 deletion recap/clients/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ def schema(self, table: str) -> StructType:
for row_cells in cursor.fetchall():
row = dict(zip(names, row_cells))
row = self.add_information_schema(row)
row = self.add_information_schema(row)
rows.append(row)

return self.converter.to_recap(rows)
Expand All @@ -119,6 +118,9 @@ def add_information_schema(self, row: dict[str, Any]) -> dict[str, Any]:
}

# Extract precision, scale, and octet length.
# This regex matches the following patterns:
# - <type>(<param1>(, <param2>)?)
# param1 can be a precision or octet length, and param2 can be a scale.
numeric_pattern = re_compile(r"(\w+)\((\d+)(?:,\s*(\d+))?\)")
param_match = numeric_pattern.search(row["TYPE"])

Expand Down
20 changes: 13 additions & 7 deletions recap/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,18 @@ def safe_urls(self) -> list[str]:
(
split_url.scheme,
netloc,
split_url.path.strip("/"),
split_url.path.rstrip("/"),
split_url.query,
split_url.fragment,
)
)

# urlunsplit does not double slashes if netloc is empty. But most
# URLs with empty netloc should have a double slash (e.g.
# bigquery:// or sqlite:///some/file.db).
if not netloc:
sanitized_url = sanitized_url.replace(":", "://", 1)

safe_urls_list.append(sanitized_url)

return safe_urls_list
Expand Down Expand Up @@ -107,17 +113,17 @@ def unsafe_url(self, url: str, strict: bool = True) -> str:
(
url_split.scheme,
netloc,
url_path.strip("/"),
url_path.rstrip("/"),
query,
url_split.fragment or unsafe_url_split.fragment,
)
)

# Unsplit returns a URL with a trailing colon if the URL only
# has a scheme. This looks weird, so include trailing double
# slash (e.g. bigquery: to bigquery://).
if merged_url == f"{url_split.scheme}:":
merged_url += "//"
# urlunsplit does not double slashes if netloc is empty. But most
# URLs with empty netloc should have a double slash (e.g.
# bigquery:// or sqlite:///some/file.db).
if not netloc:
merged_url = merged_url.replace(":", "://", 1)

return merged_url

Expand Down
10 changes: 10 additions & 0 deletions tests/unit/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@
["http://example.com?query=value", "http://user:pass@example.com#fragment"],
["http://example.com?query=value", "http://example.com#fragment"],
),
# SQLite empty netloc
(
["sqlite:///some/file.db"],
["sqlite:///some/file.db"],
),
# Complex SQLite
(
["sqlite:///file:mem1?mode=memory&cache=shared&uri=true"],
["sqlite:///file:mem1?mode=memory&cache=shared&uri=true"],
),
],
)
def test_safe_urls(input_urls, expected_output):
Expand Down

0 comments on commit 0ea697d

Please sign in to comment.