diff --git a/conf/systemd/sync_images_s3@.service b/conf/systemd/sync_images_s3@.service
deleted file mode 100644
index 72558620132d9..0000000000000
--- a/conf/systemd/sync_images_s3@.service
+++ /dev/null
@@ -1,13 +0,0 @@
-# service instance name "%i" is off only (for now)
-[Unit]
-Description=Synchronize images to AWS S3 %i
-# __ will be replaced by @ in email-failures@.service
-OnFailure=email-failures@sync_images_s3__%i.service
-
-[Service]
-Type=oneshot
-User=off
-Group=off
-# Warning: this script doesn't work currently with non-off product type
-ExecStart=/srv/%i/scripts/sync-s3-images/.venv/bin/python3 /srv/%i/scripts/sync-s3-images/sync_s3_images.py /mnt/off/images/products /mnt/off/html_data/openfoodfacts-products.jsonl.gz
-KillMode=process
diff --git a/conf/systemd/sync_images_s3@.timer b/conf/systemd/sync_images_s3@.timer
deleted file mode 100644
index a95321360637f..0000000000000
--- a/conf/systemd/sync_images_s3@.timer
+++ /dev/null
@@ -1,12 +0,0 @@
-# service instance name "%i" is off only (for now)
-[Unit]
-Description=Synchronize images to AWS S3 daily
-
-[Timer]
-# every tuesday
-OnCalendar=Tue *-*-* 02:00:00
-# service instance name "%i" is off / obf / opff / opf
-Unit=sync_images_s3@%i.service
-
-[Install]
-WantedBy=multi-user.target
diff --git a/scripts/sync-s3-images/README.md b/scripts/sync-s3-images/README.md
deleted file mode 100644
index 0a4c5e475defc..0000000000000
--- a/scripts/sync-s3-images/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# AWS Open Dataset: Open Food Facts images
-
-This directory contains the [script](./sync_s3_images.py) that synchronizes
-images and OCR results, from off1 to `openfoodfacts-images` bucket, as part of
-AWS Open Dataset program.
-
-The dataset YAML description sent to [AWS Open Data
-registry](https://github.com/awslabs/open-data-registry/tree/main) can be found
-at [openfoodfacts-images.yml](./openfoodfacts-images.yml).
\ No newline at end of file
diff --git a/scripts/sync-s3-images/openfoodfacts-images.yaml b/scripts/sync-s3-images/openfoodfacts-images.yaml
deleted file mode 100644
index 01a67136a2060..0000000000000
--- a/scripts/sync-s3-images/openfoodfacts-images.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-Name: Open Food Facts Images
-Description: A dataset of all images of Open Food Facts, the biggest open
-  dataset of food products in the world.
-Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/aws-images-dataset
-Contact: contact@openfoodfacts.org
-ManagedBy: "[Open Food Facts](https://world.openfoodfacts.org)"
-UpdateFrequency: Monthly
-License: All data contained in this dataset is licenced under the [Creative Commons Attribution ShareAlike licence](https://creativecommons.org/licenses/by-sa/3.0/deed.en)
-Tags:
-  - machine learning
-  - image processing
-Resources:
-  - Description: Open Food Facts image dataset
-    ARN: arn:aws:s3:::openfoodfacts-images
-    Region: eu-west-3
-    Type: S3 Bucket
diff --git a/scripts/sync-s3-images/requirements.txt b/scripts/sync-s3-images/requirements.txt
deleted file mode 100644
index 292b3565c2e7f..0000000000000
--- a/scripts/sync-s3-images/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-openfoodfacts==1.1.3
-orjson==3.10.7
-boto3==1.35.32
-tqdm==4.66.5
\ No newline at end of file
diff --git a/scripts/sync-s3-images/sync_s3_images.py b/scripts/sync-s3-images/sync_s3_images.py
deleted file mode 100644
index 1df4b6ac1de03..0000000000000
--- a/scripts/sync-s3-images/sync_s3_images.py
+++ /dev/null
@@ -1,207 +0,0 @@
-"""This script is used to synchronize Open Food Facts images and OCR JSONs on
-AWS S3. As part of AWS Open Dataset program, we can host free of charge data on
-AWS S3.
-
-This dataset can be used by researchers to access easily OFF data, without
-overloading OFF servers.
-
-This script should be run regularly, to synchronize new images. We currently
-upload:
-
-- all raw images (ex: 1.jpg, 2.jpg,...)
-- 400px resized version of the raw images
-- OCR results of the raw images (ex: 1.json.gz)
-"""
-
-import argparse
-import gzip
-import logging
-import re
-import tempfile
-from logging import getLogger
-from pathlib import Path
-from typing import Iterator, Tuple
-
-import boto3
-import tqdm
-from openfoodfacts import ProductDataset
-from openfoodfacts.images import split_barcode
-
-logger = getLogger()
-handler = logging.StreamHandler()
-formatter = logging.Formatter(
-    "%(asctime)s :: %(processName)s :: "
-    "%(threadName)s :: %(levelname)s :: "
-    "%(message)s"
-)
-handler.setFormatter(formatter)
-handler.setLevel(logging.INFO)
-logger.addHandler(handler)
-logger.setLevel(logging.INFO)
-
-s3 = boto3.resource("s3", region_name="eu-west-3")
-bucket = s3.Bucket("openfoodfacts-images")
-
-
-BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")
-
-
-def generate_product_path(barcode: str) -> str:
-    if not barcode.isdigit():
-        raise ValueError("unknown barcode format: {}".format(barcode))
-
-    splitted_barcode = split_barcode(barcode)
-    return "/".join(splitted_barcode)
-
-
-def get_sync_filepaths(
-    base_dir: Path, ds: ProductDataset
-) -> Iterator[Tuple[str, Path]]:
-    """Return an iterator containing files to synchronize with AWS S3 bucket.
-
-    The iterator returns (barcode, file_path) tuples, where `barcode` is the
-    product barcode, and `file_path` is the path of the file to synchronize.
-
-    We use the product dataset to know images associated with each products,
-    this way we don't push to S3 deleted images.
-
-    We currently synchronize:
-
-    - all raw images (ex: 1.jpg, 2.jpg,...)
-    - 400px resized version of the raw images
-    - OCR results of the raw images (ex: 1.json.gz)
-
-    :param base_dir: directory where images are stored
-    :param ds: product dataset
-    """
-    for item in tqdm.tqdm(ds, desc="products"):
-        barcode = item["code"]
-        if not barcode:
-            continue
-        product_path = generate_product_path(barcode)
-        product_dir = Path(product_path)
-        full_product_dir = base_dir / product_dir
-
-        for image_id in item.get("images", {}).keys():
-            if not image_id.isdigit():
-                # Ignore selected image keys
-                continue
-
-            # Only synchronize raw and 400px version of images
-            for image_name in (
-                "{}.jpg".format(image_id),
-                "{}.400.jpg".format(image_id),
-            ):
-                full_image_path = full_product_dir / image_name
-                if not full_image_path.is_file():
-                    logger.warning("image {} not found".format(full_image_path))
-                    continue
-                yield barcode, product_dir / image_name
-
-            # Synchronize OCR JSON if it exists
-            ocr_file_name = "{}.json.gz".format(image_id)
-            if (full_product_dir / ocr_file_name).is_file():
-                yield barcode, product_dir / ocr_file_name
-
-
-def run(image_dir: Path, dataset_path: Path) -> None:
-    """Launch the synchronization.
-
-    :param image_dir: directory where images are stored
-    :param dataset_path: path to the JSONL dataset
-    """
-    ds = ProductDataset(dataset_path=dataset_path)
-    logger.info("Fetching existing keys...")
-    existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/"))
-    logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys))
-    dataset_keys = set()
-
-    uploaded = 0
-    kept = 0
-    deleted = 0
-    for barcode, file_path in get_sync_filepaths(image_dir, ds):
-        full_file_path = image_dir / file_path
-        key = "data/{}".format(file_path)
-        dataset_keys.add(key)
-
-        if key in existing_keys:
-            logger.debug("File %s already exists on S3", key)
-            kept += 1
-            continue
-
-        extra_args = {"Metadata": {"barcode": barcode}}
-        if key.endswith(".jpg"):
-            extra_args["ContentType"] = "image/jpeg"
-
-        logger.debug("Uploading file %s -> %s", full_file_path, key)
-        bucket.upload_file(str(full_file_path), key, ExtraArgs=extra_args)
-        uploaded += 1
-        existing_keys.add(key)
-
-        if (kept + uploaded) % 1000 == 0:
-            logger.info("uploaded: %d, kept: %d", uploaded, kept)
-
-    logger.info("Removing deleted files...")
-    for missing_key in existing_keys - dataset_keys:
-        # Removing files associated with deleted images
-        logger.debug("Deleting S3 file %s", missing_key)
-        deleted += 1
-        bucket.delete_objects(
-            Delete={
-                "Objects": [
-                    {"Key": missing_key},
-                ],
-            },
-        )
-
-    # We upload all S3 keys in a single `data_keys.txt` text file
-    # to make it easier to know existing files on the bucket
-
-    # Create a temporary directory to avoid uploading a corrupted file
-    tmp_dir = Path(tempfile.mkdtemp())
-    data_keys_path = tmp_dir / "data_keys.txt"
-    logger.info("Saving data keys in %s", data_keys_path)
-
-    with gzip.open(str(data_keys_path), "wt") as f:
-        f.write("\n".join(sorted(existing_keys)))
-
-    logger.info("Uploading data keys...")
-    bucket.upload_file(str(data_keys_path), "data/data_keys.gz")
-    data_keys_path.unlink()
-    tmp_dir.rmdir()
-
-    logger.info(
-        "Synchronization finished, uploaded: %d, kept: %d, deleted: %d",
-        uploaded,
-        kept,
-        deleted,
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="""Synchronize Open Food Facts images and OCR files with AWS S3.
-
-    This script should be run regularly, to synchronize new images. We currently
-    upload:
-
-    - all raw images (ex: 1.jpg, 2.jpg,...)
-    - 400px resized version of the raw images
-    - OCR results of the raw images (ex: 1.json.gz)
-
-    Before upload, the latest version of the dataset is downloaded from Open Food
-    Facts servers to get the list of images to synchronize.
-    """
-    )
-    parser.add_argument(
-        "image_dir",
-        type=Path,
-        help="Directory where images are stored.",
-    )
-    parser.add_argument(
-        "dataset_path",
-        type=Path,
-        help="Directory where dataset is stored.",
-    )
-    args = parser.parse_args()
-    run(args.image_dir, args.dataset_path)
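
The removed files above fully describe the public bucket the script populated: `openfoodfacts-images` in `eu-west-3`, objects under the `data/` prefix, and a gzipped key listing uploaded as `data/data_keys.gz`. For readers of the dataset, a minimal read-only sketch with boto3 follows; it assumes the bucket allows anonymous (unsigned) reads, and the example image key is illustrative and may not exist.

    import gzip
    import io

    import boto3
    from botocore import UNSIGNED
    from botocore.client import Config

    # Anonymous access: the bucket is assumed to be publicly readable.
    s3 = boto3.resource(
        "s3", region_name="eu-west-3", config=Config(signature_version=UNSIGNED)
    )
    bucket = s3.Bucket("openfoodfacts-images")

    # Fetch the key listing that sync_s3_images.py uploaded as data/data_keys.gz.
    buf = io.BytesIO()
    bucket.download_fileobj("data/data_keys.gz", buf)
    keys = gzip.decompress(buf.getvalue()).decode().splitlines()
    print(len(keys), "keys in the bucket listing")

    # Download one raw image; this key is illustrative and may not exist.
    bucket.download_file("data/303/371/006/5066/1.jpg", "1.jpg")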