From 1a4fcb5914cd8d0101b271498c40f7ca0ffdb784 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Sat, 28 Oct 2023 15:49:33 -0400 Subject: [PATCH 1/3] Add a view for HTTP headers This commit adds a migration that creates a view of the HTTP headers in the response table. Once the view is in place you can run a query like this without requiring JSON parsing: ```sql SELECT warc_record_id, name, value FROM http_header; ``` It can be helpful for identifying things like the most common content types: ```sql SELECT value, COUNT(*) AS count FROM http_header WHERE name = 'content-type' GROUP BY value ORDER BY count DESC; value count --------------------------------- ----- application/javascript 57 image/png 11 text/css 7 text/html; charset=utf-8 6 image/jpeg 4 image/gif 4 text/fragment+html; charset=utf-8 3 image/svg+xml 3 text/plain 2 text/html; charset=UTF-8 1 ``` Closes #24 --- README.md | 3 +++ tests/test_warcdb.py | 16 ++++++++++++++++ warcdb/migrations.py | 14 ++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/README.md b/README.md index 31188e7..0c4a8ab 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,9 @@ Here's the relational schema of the `.warcdb` file. 
![WarcDB Schema](schema.png) +In addition to the core tables that map to the WARC record types there are also helper views that make it a bit easier to query data: + +- *http_header*: A view of HTTP headers in responses where each row is a tuple of `(warc_record_id, name, value)` ## Motivation diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py index bc6b41c..dc2be10 100644 --- a/tests/test_warcdb.py +++ b/tests/test_warcdb.py @@ -64,3 +64,19 @@ def test_column_names(): assert re.match(r"^[a-z_]+", col.name), f"column {col.name} named correctly" os.remove(db_file) + + +def test_http_header(): + runner = CliRunner() + runner.invoke( + warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))] + ) + + db = sqlite_utils.Database(db_file) + headers = list(db["http_header"].rows) + assert len(headers) == 43 + assert { + "name": "content-type", + "value": "text/html; charset=UTF-8", + "warc_record_id": "", + } in headers diff --git a/warcdb/migrations.py b/warcdb/migrations.py index 64925ba..5e12da4 100644 --- a/warcdb/migrations.py +++ b/warcdb/migrations.py @@ -95,3 +95,17 @@ def m001_initial(db): ("warc_concurrent_to", "metadata", "warc_record_id"), ], ) + + +@migration() +def m002_headers(db): + db.create_view( + "http_header", + """ + SELECT + response.warc_record_id AS warc_record_id, + LOWER(JSON_EXTRACT(header.VALUE, '$.header')) AS name, + JSON_EXTRACT(header.VALUE, '$.value') AS value + FROM response, JSON_EACH(response.http_headers) AS header + """, + ) From 9c74b4b813af7e0275caf1b8c2c18d20cf8a9618 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Mon, 30 Oct 2023 09:28:07 -0400 Subject: [PATCH 2/3] Modified view names, add request headers and doc. Add a similar view for HTTP request headers. Prefix the view names with `v_` to distinguish them in the schema from actual tables. Also add a description of each view with a table that defines its columns. 
--- README.md | 24 ++++++++++++++++++++++-- tests/test_warcdb.py | 17 ++++++++++++++--- warcdb/migrations.py | 12 +++++++++++- 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0c4a8ab..f5f95a9 100644 --- a/README.md +++ b/README.md @@ -52,9 +52,29 @@ Here's the relational schema of the `.warcdb` file. ![WarcDB Schema](schema.png) -In addition to the core tables that map to the WARC record types there are also helper views that make it a bit easier to query data: +### Views -- *http_header*: A view of HTTP headers in responses where each row is a tuple of `(warc_record_id, name, value)` +In addition to the core tables that map to the WARC record types there are also helper *views* that make it a bit easier to query data: + +#### v_request_http_header + +A view of HTTP headers in WARC request records: + +| Column Name | Column Type | Description | +| -------------- | ----------- | ---------------------------------------------------------------------- | +| warc_record_id | text | The WARC-Record-Id for the *request* record that it was extracted from. | +| name | text | The lowercased HTTP header name (e.g. content-type) | +| value | text | The HTTP header value (e.g. text/html) | + +#### v_response_http_header + +A view of HTTP headers in WARC response records: + +| Column Name | Column Type | Description | +| -------------- | ----------- | ---------------------------------------------------------------------- | +| warc_record_id | text | The WARC-Record-Id for the *response* record that it was extracted from. | +| name | text | The lowercased HTTP header name (e.g. content-type) | +| value | text | The HTTP header value (e.g. 
text/html) | ## Motivation diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py index dc2be10..fb5dd7c 100644 --- a/tests/test_warcdb.py +++ b/tests/test_warcdb.py @@ -73,10 +73,21 @@ def test_http_header(): ) db = sqlite_utils.Database(db_file) - headers = list(db["http_header"].rows) - assert len(headers) == 43 + + resp_headers = list(db["v_response_http_header"].rows) + assert len(resp_headers) == 43 assert { "name": "content-type", "value": "text/html; charset=UTF-8", "warc_record_id": "", - } in headers + } in resp_headers + + req_headers = list(db["v_request_http_header"].rows) + assert len(req_headers) == 17 + assert { + "name": "user-agent", + "value": "Wget/1.21.3", + "warc_record_id": "" + } in req_headers + + diff --git a/warcdb/migrations.py b/warcdb/migrations.py index 5e12da4..c17ce42 100644 --- a/warcdb/migrations.py +++ b/warcdb/migrations.py @@ -100,7 +100,17 @@ def m001_initial(db): @migration() def m002_headers(db): db.create_view( - "http_header", + "v_request_http_header", + """ + SELECT + request.warc_record_id AS warc_record_id, + LOWER(JSON_EXTRACT(header.VALUE, '$.header')) AS name, + JSON_EXTRACT(header.VALUE, '$.value') AS value + FROM request, JSON_EACH(request.http_headers) AS header + """, + ) + db.create_view( + "v_response_http_header", """ SELECT response.warc_record_id AS warc_record_id, From 3b705a48c3b3f4001572b2ffb88ac3b25728ffb7 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Mon, 30 Oct 2023 11:21:08 -0400 Subject: [PATCH 3/3] reformat for black --- tests/test_warcdb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py index fb5dd7c..5ba3dc0 100644 --- a/tests/test_warcdb.py +++ b/tests/test_warcdb.py @@ -87,7 +87,5 @@ def test_http_header(): assert { "name": "user-agent", "value": "Wget/1.21.3", - "warc_record_id": "" + "warc_record_id": "", } in req_headers - -