fix: remove dataset push to HF
raphael0202 committed Nov 15, 2024
1 parent dd016d9 commit 4403de7
Showing 7 changed files with 3 additions and 236 deletions.
8 changes: 0 additions & 8 deletions .github/workflows/container-deploy.yml
@@ -36,7 +36,6 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.net,static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.net,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=8GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=1GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
- name: Set various variable for production deployment
if: matrix.env == 'robotoff-org'
run: |
@@ -57,7 +56,6 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=16GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=2GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=1" >> $GITHUB_ENV
- name: Wait for container build workflow
uses: tomchv/wait-my-workflow@v1.1.0
id: wait-build
@@ -174,15 +172,9 @@ jobs:
# Google Cloud credentials
echo "GOOGLE_CREDENTIALS=${{ secrets.GOOGLE_CREDENTIALS }}" >> .env
# Token to push dataset to Hugging Face
echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> .env
# Secret key to secure batch job import
echo "BATCH_JOB_KEY=${{ secrets.BATCH_JOB_KEY }}" >> .env
# Enable or not dataset push to Hugging Face
echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env
- name: Create Docker volumes
uses: appleboy/ssh-action@master
with:
2 changes: 0 additions & 2 deletions docker-compose.yml
@@ -58,8 +58,6 @@ x-robotoff-base-env:
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key
HF_TOKEN: # Hugging Face token
ENABLE_HF_PUSH: # Enable Hugging Face dataset push (0 or 1, disabled by default)

x-robotoff-worker-base:
&robotoff-worker
50 changes: 0 additions & 50 deletions robotoff/products.py
@@ -10,9 +10,7 @@
from pathlib import Path
from typing import Iterable, Iterator, Optional, Union

import duckdb
import requests
from huggingface_hub import HfApi
from pymongo import MongoClient

from robotoff import settings
@@ -574,51 +572,3 @@ def get_product(
:return: the product as a dict or None if it was not found
"""
return get_product_store(product_id.server_type).get_product(product_id, projection)


def convert_jsonl_to_parquet(
output_file_path: str,
dataset_path: Path = settings.JSONL_DATASET_PATH,
query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY,
) -> None:
logger.info("Start JSONL to Parquet conversion process.")
if not dataset_path.exists() or not query_path.exists():
raise FileNotFoundError(
f"{str(dataset_path)} or {str(query_path)} was not found."
)
query = (
query_path.read_text()
.replace("{dataset_path}", str(dataset_path))
.replace("{output_path}", output_file_path)
)
try:
duckdb.sql(query)
except duckdb.Error as e:
logger.error(f"Error executing query: {query}\nError message: {e}")
raise
logger.info("JSONL successfully converted into Parquet file.")


def push_data_to_hf(
data_path: str,
repo_id: str = "openfoodfacts/product-database",
revision: str = "main",
commit_message: str = "Database updated",
) -> None:
logger.info(f"Start pushing data to Hugging Face at {repo_id}")
if not os.path.exists(data_path):
raise FileNotFoundError(f"Data is missing: {data_path}")
if os.path.splitext(data_path)[-1] != ".parquet":
raise ValueError(
f"A parquet file is expected. Got {os.path.splitext(data_path)[-1]} instead."
)
# We use the HF_Hub api since it gives us way more flexibility than push_to_hub()
HfApi().upload_file(
path_or_fileobj=data_path,
repo_id=repo_id,
revision=revision,
repo_type="dataset",
path_in_repo="products.parquet",
commit_message=commit_message,
)
logger.info(f"Data succesfully pushed to Hugging Face at {repo_id}")
26 changes: 2 additions & 24 deletions robotoff/scheduler/__init__.py
@@ -1,6 +1,5 @@
import datetime
import os
import tempfile
import uuid
from typing import Iterable

@@ -24,11 +23,9 @@
from robotoff.models import Prediction, ProductInsight, db
from robotoff.products import (
Product,
convert_jsonl_to_parquet,
fetch_dataset,
get_min_product_store,
has_dataset_changed,
push_data_to_hf,
)
from robotoff.types import InsightType, ServerType
from robotoff.utils import get_logger
@@ -294,34 +291,15 @@ def update_insight_attributes(product: Product, insight: ProductInsight) -> bool

# this job does no use database
def _update_data() -> None:
"""Download the latest version of the Product Opener product JSONL dump,
convert it to Parquet format and push it to Hugging Face Hub.
Conversion to Parquet is only performed if the envvar ENABLE_HF_PUSH is
set to 1.
"""
"""Download the latest version of the Product Opener product JSONL dump."""
logger.info("Downloading new version of product dataset")
ds_changed = False
try:
if ds_changed := has_dataset_changed():
if has_dataset_changed():
fetch_dataset()
except requests.exceptions.RequestException:
logger.exception("Exception during product dataset refresh")
return

if not settings.ENABLE_HF_PUSH:
logger.info("HF push is disabled, skipping Parquet conversion")
return

if ds_changed:
logger.info("Starting conversion of JSONL to Parquet (to HF)")
with tempfile.TemporaryDirectory() as tmp_dir:
file_path = os.path.join(tmp_dir, "converted_data.parquet")
convert_jsonl_to_parquet(output_file_path=file_path)
push_data_to_hf(data_path=file_path)
else:
logger.info("No changes in product dataset, skipping Parquet conversion")


def transform_insight_iter(insights_iter: Iterable[dict]):
for insight in insights_iter:
4 changes: 0 additions & 4 deletions robotoff/settings.py
@@ -360,7 +360,3 @@ def get_package_version() -> str:

# Batch jobs
GOOGLE_PROJECT_NAME = "robotoff"

# SQL queries paths
JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH", 0)))
135 changes: 0 additions & 135 deletions robotoff/utils/sql/jsonl_to_parquet.sql

This file was deleted.
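
This diff does not show what the deleted file contained, only that it is gone. As a purely hypothetical sketch (not the file's actual contents), the template consumed by convert_jsonl_to_parquet() above, with its {dataset_path} and {output_path} placeholders, would have had the general shape of a DuckDB COPY statement; the column list here is illustrative only:

-- Hypothetical sketch of the deleted template; the real column list and any
-- transformations are not shown in this diff.
-- {dataset_path} and {output_path} were substituted by convert_jsonl_to_parquet().
COPY (
    SELECT
        code,
        product_name,
        brands,
        categories_tags
    FROM read_ndjson('{dataset_path}')
) TO '{output_path}' (FORMAT PARQUET);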

14 changes: 1 addition & 13 deletions tests/unit/test_products.py
@@ -1,10 +1,9 @@
import json
from pathlib import Path
from typing import Optional

import pytest

from robotoff.products import convert_jsonl_to_parquet, is_special_image, is_valid_image
from robotoff.products import is_special_image, is_valid_image
from robotoff.settings import TEST_DATA_DIR
from robotoff.types import JSONType

@@ -52,14 +51,3 @@ def test_is_valid_image
output: bool,
):
assert is_valid_image(images, image_path) is output


class TestConvertJSONLToParquet:
def test_convert_jsonl_to_parquet_data_missing(self):
non_existing_path = Path("non/existing/dataset/path")
with pytest.raises(FileNotFoundError):
convert_jsonl_to_parquet(
output_file_path="any_path",
dataset_path=non_existing_path,
query_path=non_existing_path,
)
