diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py
index 5ba3dc0..05aa80c 100644
--- a/tests/test_warcdb.py
+++ b/tests/test_warcdb.py
@@ -89,3 +89,16 @@ def test_http_header():
         "value": "Wget/1.21.3",
         "warc_record_id": "",
     } in req_headers
+
+
+def test_http_status():
+    """Importing tests/google.warc stores each response's HTTP status code."""
+    runner = CliRunner()
+    runner.invoke(
+        warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))]
+    )
+    db = sqlite_utils.Database(db_file)
+    responses = db['response'].rows
+    assert next(responses)['http_status'] == 301
+    assert next(responses)['http_status'] == 302
+    assert next(responses)['http_status'] == 200
diff --git a/warcdb/__init__.py b/warcdb/__init__.py
index 751111d..42597a6 100644
--- a/warcdb/__init__.py
+++ b/warcdb/__init__.py
@@ -182,6 +182,9 @@ def __iadd__(self, r: ArcWarcRecord):
             )
 
         elif r.rec_type == "response":
+            # Record the status code from the HTTP headers in the http_status column.
+            if r.http_headers:
+                record_dict['http_status'] = r.http_headers.get_statuscode()
             self.db.table("response").insert(
                 record_dict,
                 pk="warc_record_id",
diff --git a/warcdb/migrations.py b/warcdb/migrations.py
index c17ce42..cf3ed3c 100644
--- a/warcdb/migrations.py
+++ b/warcdb/migrations.py
@@ -119,3 +119,9 @@ def m002_headers(db):
         FROM response, JSON_EACH(response.http_headers) AS header
     """,
     )
+
+
+@migration()
+def m003_status(db):
+    """Add an integer ``http_status`` column to the ``response`` table."""
+    db['response'].add_column('http_status', int)