fix: remove dataset push to HF
raphael0202 committed Nov 15, 2024
1 parent dd016d9 commit 4403de7
Showing 7 changed files with 3 additions and 236 deletions.
8 changes: 0 additions & 8 deletions .github/workflows/container-deploy.yml
@@ -36,7 +36,6 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.net,static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.net,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=8GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=1GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
- name: Set various variable for production deployment
if: matrix.env == 'robotoff-org'
run: |
@@ -57,7 +56,6 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=16GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=2GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=1" >> $GITHUB_ENV
- name: Wait for container build workflow
uses: tomchv/wait-my-workflow@v1.1.0
id: wait-build
@@ -174,15 +172,9 @@ jobs:
# Google Cloud credentials
echo "GOOGLE_CREDENTIALS=${{ secrets.GOOGLE_CREDENTIALS }}" >> .env
# Token to push dataset to Hugging Face
echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> .env
# Secret key to secure batch job import
echo "BATCH_JOB_KEY=${{ secrets.BATCH_JOB_KEY }}" >> .env
# Enable or not dataset push to Hugging Face
echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env
- name: Create Docker volumes
uses: appleboy/ssh-action@master
with:
2 changes: 0 additions & 2 deletions docker-compose.yml
@@ -58,8 +58,6 @@ x-robotoff-base-env:
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key
HF_TOKEN: # Hugging Face token
ENABLE_HF_PUSH: # Enable Hugging Face dataset push (0 or 1, disabled by default)

x-robotoff-worker-base:
&robotoff-worker
50 changes: 0 additions & 50 deletions robotoff/products.py
@@ -10,9 +10,7 @@
from pathlib import Path
from typing import Iterable, Iterator, Optional, Union

import duckdb
import requests
from huggingface_hub import HfApi
from pymongo import MongoClient

from robotoff import settings
@@ -574,51 +572,3 @@ def get_product(
:return: the product as a dict or None if it was not found
"""
return get_product_store(product_id.server_type).get_product(product_id, projection)


def convert_jsonl_to_parquet(
output_file_path: str,
dataset_path: Path = settings.JSONL_DATASET_PATH,
query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY,
) -> None:
logger.info("Start JSONL to Parquet conversion process.")
if not dataset_path.exists() or not query_path.exists():
raise FileNotFoundError(
f"{str(dataset_path)} or {str(query_path)} was not found."
)
query = (
query_path.read_text()
.replace("{dataset_path}", str(dataset_path))
.replace("{output_path}", output_file_path)
)
try:
duckdb.sql(query)
except duckdb.Error as e:
logger.error(f"Error executing query: {query}\nError message: {e}")
raise
logger.info("JSONL successfully converted into Parquet file.")


def push_data_to_hf(
data_path: str,
repo_id: str = "openfoodfacts/product-database",
revision: str = "main",
commit_message: str = "Database updated",
) -> None:
logger.info(f"Start pushing data to Hugging Face at {repo_id}")
if not os.path.exists(data_path):
raise FileNotFoundError(f"Data is missing: {data_path}")
if os.path.splitext(data_path)[-1] != ".parquet":
raise ValueError(
f"A parquet file is expected. Got {os.path.splitext(data_path)[-1]} instead."
)
# We use the HF_Hub api since it gives us way more flexibility than push_to_hub()
HfApi().upload_file(
path_or_fileobj=data_path,
repo_id=repo_id,
revision=revision,
repo_type="dataset",
path_in_repo="products.parquet",
commit_message=commit_message,
)
logger.info(f"Data succesfully pushed to Hugging Face at {repo_id}")
26 changes: 2 additions & 24 deletions robotoff/scheduler/__init__.py
@@ -1,6 +1,5 @@
import datetime
import os
import tempfile
import uuid
from typing import Iterable

@@ -24,11 +23,9 @@
from robotoff.models import Prediction, ProductInsight, db
from robotoff.products import (
Product,
convert_jsonl_to_parquet,
fetch_dataset,
get_min_product_store,
has_dataset_changed,
push_data_to_hf,
)
from robotoff.types import InsightType, ServerType
from robotoff.utils import get_logger
@@ -294,34 +291,15 @@ def update_insight_attributes(product: Product, insight: ProductInsight) -> bool

# this job does no use database
def _update_data() -> None:
"""Download the latest version of the Product Opener product JSONL dump,
convert it to Parquet format and push it to Hugging Face Hub.
Conversion to Parquet is only performed if the envvar ENABLE_HF_PUSH is
set to 1.
"""
"""Download the latest version of the Product Opener product JSONL dump."""
logger.info("Downloading new version of product dataset")
ds_changed = False
try:
if ds_changed := has_dataset_changed():
if has_dataset_changed():
fetch_dataset()
except requests.exceptions.RequestException:
logger.exception("Exception during product dataset refresh")
return

if not settings.ENABLE_HF_PUSH:
logger.info("HF push is disabled, skipping Parquet conversion")
return

if ds_changed:
logger.info("Starting conversion of JSONL to Parquet (to HF)")
with tempfile.TemporaryDirectory() as tmp_dir:
file_path = os.path.join(tmp_dir, "converted_data.parquet")
convert_jsonl_to_parquet(output_file_path=file_path)
push_data_to_hf(data_path=file_path)
else:
logger.info("No changes in product dataset, skipping Parquet conversion")


def transform_insight_iter(insights_iter: Iterable[dict]):
for insight in insights_iter:
4 changes: 0 additions & 4 deletions robotoff/settings.py
@@ -360,7 +360,3 @@ def get_package_version() -> str:

# Batch jobs
GOOGLE_PROJECT_NAME = "robotoff"

# SQL queries paths
JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH", 0)))
135 changes: 0 additions & 135 deletions robotoff/utils/sql/jsonl_to_parquet.sql

This file was deleted.
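
This diff does not show what the deleted file contained, only that it is gone. As a purely hypothetical sketch (not the file's actual contents), the template consumed by convert_jsonl_to_parquet() above, with its {dataset_path} and {output_path} placeholders, would have had the general shape of a DuckDB COPY statement; the column list here is illustrative only:

-- Hypothetical sketch of the deleted template; the real column list and any
-- transformations are not shown in this diff.
-- {dataset_path} and {output_path} were substituted by convert_jsonl_to_parquet().
COPY (
    SELECT
        code,
        product_name,
        brands,
        categories_tags
    FROM read_ndjson('{dataset_path}')
) TO '{output_path}' (FORMAT PARQUET);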

14 changes: 1 addition & 13 deletions tests/unit/test_products.py
@@ -1,10 +1,9 @@
import json
from pathlib import Path
from typing import Optional

import pytest

from robotoff.products import convert_jsonl_to_parquet, is_special_image, is_valid_image
from robotoff.products import is_special_image, is_valid_image
from robotoff.settings import TEST_DATA_DIR
from robotoff.types import JSONType

@@ -52,14 +51,3 @@ def test_is_valid_image
output: bool,
):
assert is_valid_image(images, image_path) is output


class TestConvertJSONLToParquet:
def test_convert_jsonl_to_parquet_data_missing(self):
non_existing_path = Path("non/existing/dataset/path")
with pytest.raises(FileNotFoundError):
convert_jsonl_to_parquet(
output_file_path="any_path",
dataset_path=non_existing_path,
query_path=non_existing_path,
)
