diff --git a/.github/workflows/container-deploy.yml b/.github/workflows/container-deploy.yml
index 2d756bbd6a..2d6188e6ce 100644
--- a/.github/workflows/container-deploy.yml
+++ b/.github/workflows/container-deploy.yml
@@ -36,7 +36,6 @@ jobs:
           echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.net,static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.net,images.openfoodfacts.org" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=8GB" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_WORK_MEM=1GB" >> $GITHUB_ENV
-          echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
       - name: Set various variable for production deployment
         if: matrix.env == 'robotoff-org'
         run: |
@@ -57,7 +56,6 @@ jobs:
           echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.org" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=16GB" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_WORK_MEM=2GB" >> $GITHUB_ENV
-          echo "ENABLE_HF_PUSH=1" >> $GITHUB_ENV
       - name: Wait for container build workflow
         uses: tomchv/wait-my-workflow@v1.1.0
         id: wait-build
@@ -174,15 +172,9 @@ jobs:
           # Google Cloud credentials
           echo "GOOGLE_CREDENTIALS=${{ secrets.GOOGLE_CREDENTIALS }}" >> .env

-          # Token to push dataset to Hugging Face
-          echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> .env
-
           # Secret key to secure batch job import
           echo "BATCH_JOB_KEY=${{ secrets.BATCH_JOB_KEY }}" >> .env

-          # Enable or not dataset push to Hugging Face
-          echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env
-
       - name: Create Docker volumes
         uses: appleboy/ssh-action@master
         with:
diff --git a/docker-compose.yml b/docker-compose.yml
index 4146f42ff2..85c2d04e28 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -58,8 +58,6 @@ x-robotoff-base-env:
     GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
     GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
     BATCH_JOB_KEY: # Secure Batch job import with a token key
-    HF_TOKEN: # Hugging Face token
-    ENABLE_HF_PUSH: # Enable Hugging Face dataset push (0 or 1, disabled by default)

 x-robotoff-worker-base:
   &robotoff-worker
diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py
index 05b74906d2..9652788988 100644
--- a/robotoff/cli/main.py
+++ b/robotoff/cli/main.py
@@ -1201,38 +1201,5 @@ def launch_normalize_barcode_job(
     logger.info("Updated %d images", updated)


-@app.command()
-def push_jsonl_to_hf(
-    repo_id: str = "openfoodfacts/product-database",
-    revision: str = "main",
-    commit_message: str = "Database updated",
-    output_path: Optional[str] = None,
-):
-    """Clean and convert the JSONL database before pushing to HF.
-    Possibility to only convert the database locally by indicating an `output_path`.
- """ - import os - import tempfile - - from robotoff.products import convert_jsonl_to_parquet, push_data_to_hf - from robotoff.utils.logger import get_logger - - logger = get_logger() - logger.info("Start command: convert JSONL to Parquet (to HF).") - if output_path: - convert_jsonl_to_parquet(output_file_path=output_path) - else: - with tempfile.TemporaryDirectory() as tmp_dir: - file_path = os.path.join(tmp_dir, "converted_data.parquet") - convert_jsonl_to_parquet(output_file_path=file_path) - push_data_to_hf( - data_path=file_path, - repo_id=repo_id, - revision=revision, - commit_message=commit_message, - ) - logger.info("JSONL to Parquet succesfully finished.") - - def main() -> None: app() diff --git a/robotoff/products.py b/robotoff/products.py index 284e4d8445..aaeb16cae5 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -10,9 +10,7 @@ from pathlib import Path from typing import Iterable, Iterator, Optional, Union -import duckdb import requests -from huggingface_hub import HfApi from pymongo import MongoClient from robotoff import settings @@ -574,51 +572,3 @@ def get_product( :return: the product as a dict or None if it was not found """ return get_product_store(product_id.server_type).get_product(product_id, projection) - - -def convert_jsonl_to_parquet( - output_file_path: str, - dataset_path: Path = settings.JSONL_DATASET_PATH, - query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY, -) -> None: - logger.info("Start JSONL to Parquet conversion process.") - if not dataset_path.exists() or not query_path.exists(): - raise FileNotFoundError( - f"{str(dataset_path)} or {str(query_path)} was not found." - ) - query = ( - query_path.read_text() - .replace("{dataset_path}", str(dataset_path)) - .replace("{output_path}", output_file_path) - ) - try: - duckdb.sql(query) - except duckdb.Error as e: - logger.error(f"Error executing query: {query}\nError message: {e}") - raise - logger.info("JSONL successfully converted into Parquet file.") - - -def push_data_to_hf( - data_path: str, - repo_id: str = "openfoodfacts/product-database", - revision: str = "main", - commit_message: str = "Database updated", -) -> None: - logger.info(f"Start pushing data to Hugging Face at {repo_id}") - if not os.path.exists(data_path): - raise FileNotFoundError(f"Data is missing: {data_path}") - if os.path.splitext(data_path)[-1] != ".parquet": - raise ValueError( - f"A parquet file is expected. Got {os.path.splitext(data_path)[-1]} instead." 
-        )
-    # We use the HF_Hub api since it gives us way more flexibility than push_to_hub()
-    HfApi().upload_file(
-        path_or_fileobj=data_path,
-        repo_id=repo_id,
-        revision=revision,
-        repo_type="dataset",
-        path_in_repo="products.parquet",
-        commit_message=commit_message,
-    )
-    logger.info(f"Data succesfully pushed to Hugging Face at {repo_id}")
diff --git a/robotoff/scheduler/__init__.py b/robotoff/scheduler/__init__.py
index 4fb879b62d..f7e7192b04 100644
--- a/robotoff/scheduler/__init__.py
+++ b/robotoff/scheduler/__init__.py
@@ -1,6 +1,5 @@
 import datetime
 import os
-import tempfile
 import uuid
 from typing import Iterable

@@ -24,11 +23,9 @@
 from robotoff.models import Prediction, ProductInsight, db
 from robotoff.products import (
     Product,
-    convert_jsonl_to_parquet,
     fetch_dataset,
     get_min_product_store,
     has_dataset_changed,
-    push_data_to_hf,
 )
 from robotoff.types import InsightType, ServerType
 from robotoff.utils import get_logger
@@ -294,34 +291,15 @@ def update_insight_attributes(product: Product, insight: ProductInsight) -> bool

 # this job does no use database
 def _update_data() -> None:
-    """Download the latest version of the Product Opener product JSONL dump,
-    convert it to Parquet format and push it to Hugging Face Hub.
-
-    Conversion to Parquet is only performed if the envvar ENABLE_HF_PUSH is
-    set to 1.
-    """
+    """Download the latest version of the Product Opener product JSONL dump."""
     logger.info("Downloading new version of product dataset")
-    ds_changed = False
     try:
-        if ds_changed := has_dataset_changed():
+        if has_dataset_changed():
             fetch_dataset()
     except requests.exceptions.RequestException:
         logger.exception("Exception during product dataset refresh")
         return
-
-    if not settings.ENABLE_HF_PUSH:
-        logger.info("HF push is disabled, skipping Parquet conversion")
-        return
-
-    if ds_changed:
-        logger.info("Starting conversion of JSONL to Parquet (to HF)")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            file_path = os.path.join(tmp_dir, "converted_data.parquet")
-            convert_jsonl_to_parquet(output_file_path=file_path)
-            push_data_to_hf(data_path=file_path)
-    else:
-        logger.info("No changes in product dataset, skipping Parquet conversion")


 def transform_insight_iter(insights_iter: Iterable[dict]):
     for insight in insights_iter:
diff --git a/robotoff/settings.py b/robotoff/settings.py
index 9840f5321e..be5669f406 100644
--- a/robotoff/settings.py
+++ b/robotoff/settings.py
@@ -360,7 +360,3 @@ def get_package_version() -> str:

 # Batch jobs
 GOOGLE_PROJECT_NAME = "robotoff"
-
-# SQL queries paths
-JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
-ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH", 0)))
diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql
deleted file mode 100644
index dfc8f8f175..0000000000
--- a/robotoff/utils/sql/jsonl_to_parquet.sql
+++ /dev/null
@@ -1,135 +0,0 @@
-SET threads to 4;
-SET preserve_insertion_order = false;
-COPY (
-    SELECT
-        code,
-        additives_n,
-        additives_tags,
-        allergens_from_ingredients,
-        allergens_from_user,
-        allergens_tags,
-        brands_tags,
-        categories_properties_tags,
-        categories,
-        checkers_tags,
-        cities_tags,
-        compared_to_category,
-        complete,
-        completeness,
-        correctors_tags,
-        countries_tags,
-        created_t,
-        creator,
-        data_quality_errors_tags,
-        data_quality_info_tags,
-        data_quality_warnings_tags,
-        data_sources_tags,
-        ecoscore_data,
-        ecoscore_grade,
-        ecoscore_score,
-        ecoscore_tags,
-        editors,
-        emb_codes,
-        emb_codes_tags,
-        entry_dates_tags,
-        environment_impact_level,
-        food_groups_tags,
-        forest_footprint_data,
-        generic_name,
-        grades,
-        images,
-        informers_tags,
-        ingredients_analysis_tags,
-        ingredients_from_palm_oil_n,
-        ingredients_n,
-        ingredients_tags,
-        ingredients_text_with_allergens,
-        ingredients_text,
-        COLUMNS('ingredients_text_\w{2}$'),
-        ingredients_with_specified_percent_n,
-        ingredients_with_unspecified_percent_n,
-        ciqual_food_name_tags,
-        ingredients_percent_analysis,
-        ingredients_original_tags,
-        ingredients_without_ciqual_codes_n,
-        ingredients_without_ciqual_codes,
-        ingredients,
-        known_ingredients_n,
-        labels_tags,
-        lang,
-        languages_tags,
-        languages_codes,
-        last_edit_dates_tags,
-        last_editor,
-        last_image_t,
-        last_modified_by,
-        last_modified_t,
-        last_updated_t,
-        link,
-        main_countries_tags,
-        manufacturing_places,
-        manufacturing_places_tags,
-        max_imgid,
-        misc_tags,
-        minerals_tags,
-        new_additives_n,
-        no_nutrition_data,
-        nova_group,
-        nova_groups,
-        nova_groups_markers,
-        nova_groups_tags,
-        nucleotides_tags,
-        nutrient_levels_tags,
-        unknown_nutrients_tags,
-        nutriments,
-        nutriscore_data,
-        nutriscore_grade,
-        nutriscore_score,
-        nutriscore_tags,
-        nutrition_data_prepared_per,
-        nutrition_data,
-        nutrition_grades_tags,
-        nutrition_score_beverage,
-        nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients,
-        nutrition_score_warning_no_fiber,
-        nutrition_score_warning_no_fruits_vegetables_nuts,
-        obsolete_since_date,
-        obsolete,
-        origins_tags,
-        packaging_recycling_tags,
-        packaging_shapes_tags,
-        packaging_tags,
-        packagings_materials,
-        packagings_n,
-        packagings_n,
-        photographers,
-        pnns_groups_1_tags,
-        pnns_groups_2_tags,
-        popularity_key,
-        popularity_tags,
-        product_name,
-        product_quantity_unit,
-        product_quantity,
-        purchase_places_tags,
-        quantity,
-        rev,
-        scans_n,
-        scores,
-        serving_quantity,
-        serving_size,
-        sources,
-        sources_fields,
-        specific_ingredients,
-        states_tags,
-        stores,
-        stores_tags,
-        traces_tags,
-        unique_scans_n,
-        unknown_ingredients_n,
-        vitamins_tags,
-        weighers_tags,
-        with_non_nutritive_sweeteners,
-        with_sweeteners,
-    FROM read_ndjson('{dataset_path}', ignore_errors=True)
-) TO '{output_path}' (FORMAT PARQUET)
-;
\ No newline at end of file
diff --git a/tests/unit/test_products.py b/tests/unit/test_products.py
index f298290844..a4113a29ce 100644
--- a/tests/unit/test_products.py
+++ b/tests/unit/test_products.py
@@ -1,10 +1,9 @@
 import json
-from pathlib import Path
 from typing import Optional

 import pytest

-from robotoff.products import convert_jsonl_to_parquet, is_special_image, is_valid_image
+from robotoff.products import is_special_image, is_valid_image
 from robotoff.settings import TEST_DATA_DIR
 from robotoff.types import JSONType

@@ -52,14 +51,3 @@ def test_is_valid_image(
     output: bool,
 ):
     assert is_valid_image(images, image_path) is output
-
-
-class TestConvertJSONLToParquet:
-    def test_convert_jsonl_to_parquet_data_missing(self):
-        non_existing_path = Path("non/existing/dataset/path")
-        with pytest.raises(FileNotFoundError):
-            convert_jsonl_to_parquet(
-                output_file_path="any_path",
-                dataset_path=non_existing_path,
-                query_path=non_existing_path,
-            )
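For anyone who still needs the `openfoodfacts/product-database` export after this removal, the deleted code boils down to a small standalone job. The sketch below is a minimal, hypothetical reconstruction, not part of this PR: it assumes the `duckdb` and `huggingface_hub` packages, a write-scoped `HF_TOKEN` exported in the environment (picked up automatically by `huggingface_hub`), and a local JSONL dump whose path you supply. The column list is abbreviated here; the deleted `jsonl_to_parquet.sql` selected roughly 128 columns.

```python
import os
import tempfile

import duckdb
from huggingface_hub import HfApi


def convert_jsonl_to_parquet(output_file_path: str, dataset_path: str) -> None:
    # DuckDB reads the (optionally gzipped) NDJSON dump directly and writes
    # Parquet; ignore_errors skips malformed rows, as in the deleted SQL query.
    query = f"""
        SET threads TO 4;
        SET preserve_insertion_order = false;
        COPY (
            SELECT code, product_name, brands_tags, countries_tags  -- abbreviated
            FROM read_ndjson('{dataset_path}', ignore_errors=True)
        ) TO '{output_file_path}' (FORMAT PARQUET);
    """
    duckdb.sql(query)


def push_data_to_hf(data_path: str) -> None:
    # HfApi.upload_file() gives more control (revision, commit message, target
    # path in the repo) than datasets' push_to_hub(), which is why the deleted
    # code used it; the token is read from HF_TOKEN in the environment.
    HfApi().upload_file(
        path_or_fileobj=data_path,
        path_in_repo="products.parquet",
        repo_id="openfoodfacts/product-database",
        repo_type="dataset",
        revision="main",
        commit_message="Database updated",
    )


if __name__ == "__main__":
    # Hypothetical dump location; point this at your local JSONL export.
    jsonl_path = "openfoodfacts-products.jsonl.gz"
    with tempfile.TemporaryDirectory() as tmp_dir:
        parquet_path = os.path.join(tmp_dir, "converted_data.parquet")
        convert_jsonl_to_parquet(parquet_path, jsonl_path)
        push_data_to_hf(parquet_path)
```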