Skip to content

Commit

Permalink
Merge pull request #16 from edsu/wacz
Browse files Browse the repository at this point in the history
Support import from WACZ files
  • Loading branch information
Florents-Tselai committed Oct 20, 2023
2 parents 7da8f4d + 89d8a75 commit 7ac8fdb
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 12 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
`WarcDB` is an `SQLite`-based file format that makes web crawl data easier to share and query.

It is based on the standardized [Web ARChive format](https://en.wikipedia.org/wiki/Web_ARChive),
used by web archivers.
used by web archives, and defined in [ISO 28500:2017](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/).

## Usage

Expand Down Expand Up @@ -32,6 +32,12 @@ For example to get a part of the [Common Crawl January 2022 Crawl Archive ](http
warcdb import archive.warcdb "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320306346.64/warc/CC-MAIN-20220128212503-20220129002503-00719.warc.gz"
```
You can also import WARC files contained in [WACZ](https://specs.webrecorder.net/wacz/latest) files, which are created by tools like [ArchiveWeb.Page](https://archiveweb.page), [Browsertrix-Crawler](https://github.com/webrecorder/browsertrix-crawler), and [Scoop](https://github.com/harvard-lil/scoop).
```shell
warcdb import archive.warcdb archive.wacz
```
## How It Works
Individual `.warc` files are read and parsed and their data is inserted into an SQLite database with the relational schema seen below.
Expand Down
Binary file added tests/scoop.wacz
Binary file not shown.
9 changes: 5 additions & 4 deletions tests/test_warcdb.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from click.testing import CliRunner
from warcdb import warcdb_cli
import os
import re
import pathlib
import re

import pytest
import sqlite_utils
from unittest import TestCase
from click.testing import CliRunner
from warcdb import warcdb_cli

db_file = "test_warc.db"
tests_dir = pathlib.Path(__file__).parent
Expand All @@ -16,6 +16,7 @@
@pytest.mark.parametrize("warc_path", [str(tests_dir / "google.warc"),
str(tests_dir / "google.warc.gz"),
str(tests_dir / "no-warc-info.warc"),
str(tests_dir / "scoop.wacz"),
"https://tselai.com/data/google.warc",
"https://tselai.com/data/google.warc.gz"
])
Expand Down
17 changes: 10 additions & 7 deletions warcdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import zipfile
from collections.abc import MutableMapping
from functools import cache
from itertools import chain
Expand Down Expand Up @@ -152,7 +153,7 @@ def __iadd__(self, r: ArcWarcRecord):
record_dict['payload'] = r.payload()

# Certain rec_types have http_headers
has_http_headers = r.rec_type in ['request', 'response']
has_http_headers = r.http_headers is not None
if has_http_headers:
record_dict['http_headers'] = r.http_headers.to_json()

Expand Down Expand Up @@ -261,13 +262,15 @@ def import_(db_path, warc_path, batch_size):
def to_import():
for f in always_iterable(warc_path):
if f.startswith('http'):
for record in tqdm(ArchiveIterator(req.get(f, stream=True).raw, arc2warc=True),
desc=f):
yield record
yield from tqdm(ArchiveIterator(req.get(f, stream=True).raw, arc2warc=True), desc=f)
elif f.endswith('.wacz'):
# TODO: can we support loading WACZ files by URL?
wacz = zipfile.ZipFile(f)
warcs = filter(lambda f: f.filename.endswith('warc.gz'), wacz.infolist())
for warc in warcs:
yield from tqdm(ArchiveIterator(wacz.open(warc.filename, 'r'), arc2warc=True), desc=warc.filename)
else:
with open(f, 'rb') as stream:
for record in tqdm(ArchiveIterator(stream), desc=f):
yield record
yield from tqdm(ArchiveIterator(open(f, 'rb'), arc2warc=True), desc=f)

for r in to_import():
db += r

0 comments on commit 7ac8fdb

Please sign in to comment.