From b3e75b9fee14f5d05345383d17b5997b98b5180c Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Mon, 30 Oct 2023 12:11:59 -0400
Subject: [PATCH] Model response status

Since the HTTP Response status code isn't in the headers dictionary it
should be modeled separately.

Fixes #24
---
Review note: the new test was originally named test_http_header, the same
name as the existing test directly above it in tests/test_warcdb.py (see
the first hunk header). The second definition rebinds the name, so pytest
would collect only the new test and the header test would silently stop
running. It is renamed to test_http_status here. One-line docstrings were
added to the new test and to the m003_status migration; hunk line counts
and the diffstat are adjusted accordingly. The blob ids on the "index"
lines are carried over from the original patch.

 tests/test_warcdb.py | 13 +++++++++++++
 warcdb/__init__.py   |  2 ++
 warcdb/migrations.py |  6 ++++++
 3 files changed, 21 insertions(+)

diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py
index 5ba3dc0..ebb5db7 100644
--- a/tests/test_warcdb.py
+++ b/tests/test_warcdb.py
@@ -89,3 +89,16 @@ def test_http_header():
         "value": "Wget/1.21.3",
         "warc_record_id": "",
     } in req_headers
+
+
+def test_http_status():
+    """The first three imported responses have http_status 301, 302, 200."""
+    runner = CliRunner()
+    runner.invoke(
+        warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))]
+    )
+    db = sqlite_utils.Database(db_file)
+    responses = db["response"].rows
+    assert next(responses)["http_status"] == 301
+    assert next(responses)["http_status"] == 302
+    assert next(responses)["http_status"] == 200
diff --git a/warcdb/__init__.py b/warcdb/__init__.py
index 751111d..cb5e561 100644
--- a/warcdb/__init__.py
+++ b/warcdb/__init__.py
@@ -182,6 +182,8 @@ def __iadd__(self, r: ArcWarcRecord):
             )
 
         elif r.rec_type == "response":
+            if r.http_headers:
+                record_dict["http_status"] = r.http_headers.get_statuscode()
             self.db.table("response").insert(
                 record_dict,
                 pk="warc_record_id",
diff --git a/warcdb/migrations.py b/warcdb/migrations.py
index c17ce42..198a6ab 100644
--- a/warcdb/migrations.py
+++ b/warcdb/migrations.py
@@ -119,3 +119,9 @@ def m002_headers(db):
         FROM response, JSON_EACH(response.http_headers) AS header
         """,
     )
+
+
+@migration()
+def m003_status(db):
+    """Add an integer http_status column to the response table."""
+    db["response"].add_column("http_status", int)