Add a view for HTTP headers

This commit adds a migration that creates a view of the HTTP headers in the response table. Once the view is in place you can run a query like this without requiring JSON parsing:

```sql
SELECT warc_record_id, name, value FROM http_header;
```

It can be helpful for identifying things like:

```sql
SELECT value, COUNT(*) AS count FROM http_header WHERE name = 'content-type' GROUP BY value ORDER BY count DESC;

value                              count
---------------------------------  -----
application/javascript             57
image/png                          11
text/css                           7
text/html; charset=utf-8           6
image/jpeg                         4
image/gif                          4
text/fragment+html; charset=utf-8  3
image/svg+xml                      3
text/plain                         2
text/html; charset=UTF-8           1
```

Closes #24
Florents-Tselai · Oct 28, 2023 · 41848dd · 41848dd
1 parent 626f443
commit 41848dd
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -52,6 +52,9 @@ Here's the relational schema of the `.warcdb` file.
 
 ![WarcDB Schema](schema.png)
 
+In addition to the core tables that map to the WARC record types, there are also helper views that make it a bit easier to query data:
+
+- *http_header*: A view of HTTP headers in responses where each row is a tuple of `(warc_record_id, name, value)` 
 
 ## Motivation
 

diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py
@@ -64,3 +64,19 @@ def test_column_names():
  assert re.match(r"^[a-z_]+", col.name), f"column {col.name} named correctly"
 
  os.remove(db_file)
+
+
def test_http_header():
    """Import the sample WARC and check the rows exposed by the ``http_header`` view.

    The view is expected to yield one dict per HTTP header with the keys
    ``warc_record_id``, ``name`` and ``value``.
    """
    runner = CliRunner()
    try:
        runner.invoke(
            warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))]
        )

        db = sqlite_utils.Database(db_file)
        headers = list(db["http_header"].rows)
        assert len(headers) == 43
        assert {
            "name": "content-type",
            "value": "text/html; charset=UTF-8",
            "warc_record_id": "<urn:uuid:2008CBED-030B-435B-A4DF-09A842DDB764>",
        } in headers
    finally:
        # Always remove the database, even when an assertion fails, so a
        # leftover file cannot skew the row counts of later test runs.
        if os.path.exists(db_file):
            os.remove(db_file)
diff --git a/warcdb/migrations.py b/warcdb/migrations.py
@@ -95,3 +95,17 @@ def m001_initial(db):
  ("warc_concurrent_to", "metadata", "warc_record_id"),
  ],
  )
+
+
@migration()
def m002_headers(db):
    """Create the ``http_header`` view over ``response.http_headers``.

    Each row of the view is ``(warc_record_id, name, value)``: one row per
    element produced by ``JSON_EACH`` over a response's ``http_headers``
    column, with the header name lower-cased.

    NOTE(review): assumes ``http_headers`` holds a JSON array of objects
    with ``header`` and ``value`` keys -- confirm against the importer.
    """
    view_name = "http_header"
    # Adjacent literals keep the SQL text readable inside indented code.
    view_sql = (
        "\n"
        " SELECT\n"
        " response.warc_record_id AS warc_record_id,\n"
        " LOWER(JSON_EXTRACT(JSON_EACH.VALUE, '$.header')) AS name,\n"
        " JSON_EXTRACT(JSON_EACH.VALUE, '$.value') AS value\n"
        " FROM response, JSON_EACH(response.http_headers)\n"
        " "
    )
    db.create_view(view_name, view_sql)