feat: improve caching mechanism #1281

Merged
merged 4 commits into from
Oct 27, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -42,3 +42,4 @@ site/
 gh_pages/
 doc/README.md
 doc/references/cli.md
+data/diskcache
13 changes: 12 additions & 1 deletion poetry.lock

Generated lockfile; diff not rendered.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -79,6 +79,7 @@ toml = "~0.10.2"
 openfoodfacts = "0.1.10"
 imagehash = "~4.3.1"
 peewee-migrate = "~1.12.2"
+diskcache = "~5.6.3"

 [tool.poetry.dependencies.sentry-sdk]
 version = "~1.14.0"
10 changes: 8 additions & 2 deletions robotoff/app/api.py
@@ -696,7 +696,11 @@
     x_min = req.get_param_as_float("x_min", required=True)
     y_max = req.get_param_as_float("y_max", required=True)
     x_max = req.get_param_as_float("x_max", required=True)
-    image = get_image_from_url(image_url, session=http_session, error_raise=False)
+    # Get the image from the cache, as Hunger Games can request many
+    # crops from the same image
+    image = get_image_from_url(
+        image_url, session=http_session, error_raise=False, use_cache=True
+    )

     if image is None:
         raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
@@ -799,7 +803,9 @@
         "when `output_image` is True",
     )

-    image = get_image_from_url(image_url, session=http_session, error_raise=False)
+    image = get_image_from_url(
+        image_url, session=http_session, error_raise=False, use_cache=True
+    )

     if image is None:
         raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
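
The comment in the first hunk captures the motivation for the whole PR: Hunger Games requests many crops of the same source image, and each crop request used to trigger a fresh download. A caller-side sketch of the access pattern under the new flag (the URL is a placeholder and the `http_session` import path is assumed):

    from robotoff.utils import get_image_from_url, http_session  # import paths assumed

    image_url = "https://example.org/product_front.jpg"  # placeholder

    # Each crop endpoint call fetches the same source image; with
    # use_cache=True only the first call hits the network, and later
    # calls are served from the shared disk cache.
    for _ in range(3):
        image = get_image_from_url(
            image_url, session=http_session, error_raise=False, use_cache=True
        )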
11 changes: 8 additions & 3 deletions robotoff/images.py
@@ -19,6 +19,7 @@
     source_image: str,
     image_url: str,
     images: Optional[JSONType],
+    use_cache: bool = False,
 ) -> Optional[ImageModel]:
     """Save imported image details in DB.
@@ -83,7 +84,9 @@
     # MongoDB (in the `images` field), we download the image to know the
     # image size
     logger.info("DB Product check disabled, downloading image to get image size")
-    image = get_image_from_url(image_url, error_raise=False, session=http_session)
+    image = get_image_from_url(
+        image_url, error_raise=False, session=http_session, use_cache=use_cache
+    )

     if image is None:
         logger.info("Could not import image %s in DB", image_url)
@@ -131,7 +134,7 @@
     source_image = generate_image_path(product_id, missing_image_id)
     image_url = generate_image_url(product_id, missing_image_id)
     logger.debug("Creating missing image %s in DB", source_image)
-    save_image(product_id, source_image, image_url, images)
+    save_image(product_id, source_image, image_url, images, use_cache=True)


 def add_image_fingerprint(image_model: ImageModel):
@@ -140,7 +143,9 @@
     :param image_model: the image model to update
     """
     image_url = image_model.get_image_url()
-    image = get_image_from_url(image_url, error_raise=False, session=http_session)
+    image = get_image_from_url(
+        image_url, error_raise=False, session=http_session, use_cache=True
+    )

     if image is None:
         logger.info(
@@ -149,6 +149,7 @@ def generate_image_embeddings(
             generate_image_url(product_id, f"{image_id}.400"),
             error_raise=False,
             session=http_session,
+            use_cache=True,
         )
         for image_id in missing_embedding_ids
     }
10 changes: 3 additions & 7 deletions robotoff/prediction/ocr/location.py
@@ -2,6 +2,7 @@
 import gzip
 import json
 import re
+from functools import cache
 from pathlib import Path
 from typing import BinaryIO, Iterable, Optional, Union
@@ -10,7 +11,6 @@
 from robotoff import settings
 from robotoff.types import Prediction, PredictionType
 from robotoff.utils import get_logger
-from robotoff.utils.cache import CachedStore
 from robotoff.utils.text import KeywordProcessor, strip_accents_v1

 # Increase version ID when introducing breaking change: changes for which we
@@ -31,6 +31,7 @@ class City:
     coordinates: Optional[tuple[float, float]]


+@cache
 def load_cities_fr(source: Union[Path, BinaryIO, None] = None) -> set[City]:
     """Load French cities dataset.
@@ -254,11 +255,6 @@ def find_nearby_postal_code(
     return match.group(1), sub_start + match.start(1), sub_start + match.end(1)


-ADDRESS_EXTRACTOR_STORE = CachedStore(
-    lambda: AddressExtractor(load_cities_fr()), expiration_interval=None
-)
-
-
 def find_locations(content: Union[OCRResult, str]) -> list[Prediction]:
     """Find location predictions in the text content.
@@ -270,5 +266,5 @@ def find_locations(content: Union[OCRResult, str]) -> list[Prediction]:
     Returns:
         list of Prediction: See :meth:`.AddressExtractor.extract_addresses`.
     """
-    location_extractor: AddressExtractor = ADDRESS_EXTRACTOR_STORE.get()
+    location_extractor = AddressExtractor(load_cities_fr())
    return location_extractor.extract_addresses(content)
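
The pattern in this file repeats across the OCR modules below: the custom `CachedStore` (always configured here with `expiration_interval=None`) is replaced by `functools.cache`, which memoizes the first call for the lifetime of the process. A minimal sketch of the equivalence, using a stand-in loader:

    from functools import cache


    @cache
    def load_dataset() -> set[str]:
        # Stand-in for load_cities_fr(): an expensive, deterministic
        # load that should run at most once per process.
        print("loading...")
        return {"paris", "lyon", "marseille"}


    load_dataset()  # prints "loading..." and caches the result
    load_dataset()  # served from the memo table; the body does not run again

Since every replaced `CachedStore` used `expiration_interval=None`, dropping expiration support loses nothing, and laziness is preserved: the expensive cities load now runs once on first use, while the lightweight `AddressExtractor` wrapper is simply rebuilt per call.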
23 changes: 6 additions & 17 deletions robotoff/prediction/ocr/packager_code.py
@@ -1,4 +1,5 @@
 import re
+from functools import cache
 from typing import Optional, Union

 from openfoodfacts.ocr import (
@@ -12,7 +13,6 @@
 from robotoff import settings
 from robotoff.types import Prediction, PredictionType
 from robotoff.utils import text_file_iter
-from robotoff.utils.cache import CachedStore
 from robotoff.utils.text import KeywordProcessor

 from .utils import generate_keyword_processor
@@ -52,16 +52,14 @@
     unchecked_code = match.group().upper()
     unchecked_code = re.sub(r"\s*\.*", "", unchecked_code)

-    processor = USDA_CODE_KEYWORD_PROCESSOR_STORE.get()
+    processor = generate_USDA_code_keyword_processor()
     USDA_code = extract_USDA_code(processor, unchecked_code)
     return USDA_code


+@cache
 def generate_USDA_code_keyword_processor() -> KeywordProcessor:
-    """Builds the KeyWordProcessor for USDA codes
-
-    This will be called only once thanks to CachedStore
-    """
+    """Builds the KeywordProcessor for USDA codes."""

     codes = text_file_iter(settings.OCR_USDA_CODE_FLASHTEXT_DATA_PATH)
     return generate_keyword_processor(codes)
@@ -79,11 +77,6 @@
     return USDA_code


-USDA_CODE_KEYWORD_PROCESSOR_STORE = CachedStore(
-    fetch_func=generate_USDA_code_keyword_processor, expiration_interval=None
-)
-
-
 PACKAGER_CODE = {
     "fr_emb": [
         OCRRegex(
@@ -191,6 +184,7 @@
     return results


+@cache
 def generate_fishing_code_keyword_processor() -> KeywordProcessor:
     codes = text_file_iter(settings.OCR_FISHING_FLASHTEXT_DATA_PATH)
     return generate_keyword_processor(("{}||{}".format(c.upper(), c) for c in codes))
@@ -226,13 +220,8 @@
     return predictions


-FISHING_KEYWORD_PROCESSOR_STORE = CachedStore(
-    fetch_func=generate_fishing_code_keyword_processor, expiration_interval=None
-)
-
-
 def find_packager_codes(content: Union[OCRResult, str]) -> list[Prediction]:
     predictions = find_packager_codes_regex(content)
-    processor = FISHING_KEYWORD_PROCESSOR_STORE.get()
+    processor = generate_fishing_code_keyword_processor()
     predictions += extract_fishing_code(processor, content)
     return predictions
14 changes: 7 additions & 7 deletions robotoff/prediction/ocr/trace.py
@@ -1,4 +1,5 @@
 import re
+from functools import cache
 from typing import Optional, Union

 from openfoodfacts.ocr import (
@@ -12,7 +13,7 @@
 from robotoff import settings
 from robotoff.types import Prediction, PredictionType
 from robotoff.utils import text_file_iter
-from robotoff.utils.cache import CachedStore
+from robotoff.utils.text.flashtext import KeywordProcessor

 from .utils import generate_keyword_processor

@@ -21,7 +22,10 @@
 PREDICTOR_VERSION = "1"


-def generate_trace_keyword_processor(labels: Optional[list[str]] = None):
+@cache
+def generate_trace_keyword_processor(
+    labels: Optional[list[str]] = None,
+) -> KeywordProcessor:
     if labels is None:
         labels = list(text_file_iter(settings.OCR_TRACE_ALLERGEN_DATA_PATH))

@@ -36,10 +40,6 @@
     field=OCRField.full_text_contiguous,
 )

-TRACE_KEYWORD_PROCESSOR_STORE = CachedStore(
-    fetch_func=generate_trace_keyword_processor, expiration_interval=None
-)
-

 def find_traces(content: Union[OCRResult, str]) -> list[Prediction]:
     predictions = []
@@ -49,7 +49,7 @@
     if not text:
         return []

-    processor = TRACE_KEYWORD_PROCESSOR_STORE.get()
+    processor = generate_trace_keyword_processor()

     for match in TRACES_REGEX.regex.finditer(text):
         prompt = match.group()
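
One subtlety of `@cache` worth noting for this file: `functools.cache` keys its memo table on the call arguments, which must be hashable. `generate_trace_keyword_processor` keeps its `labels: Optional[list[str]]` parameter, so only the no-argument call used by `find_traces` is effectively cached; passing a list would raise `TypeError`. A small sketch of that behavior with a simplified stand-in:

    from functools import cache
    from typing import Optional


    @cache
    def build_processor(labels: Optional[tuple[str, ...]] = None) -> set[str]:
        # Simplified stand-in for generate_trace_keyword_processor()
        return set(labels) if labels is not None else {"gluten", "milk"}


    build_processor()                 # computed once, then memoized
    build_processor(("nuts", "soy"))  # tuples are hashable, so this works
    # build_processor(["nuts", "soy"]) would raise TypeError: unhashable type: 'list'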
4 changes: 4 additions & 0 deletions robotoff/settings.py
@@ -336,3 +336,7 @@ def get_package_version() -> str:
 # (https://github.com/klen/peewee_migrate)
 # Migrations are automatically applied when the API service is launched
 MIGRATE_DIR = PROJECT_DIR / "migrations"
+
+
+# Path of the local disk cache, see robotoff.utils.cache for more information
+DISKCACHE_DIR = DATA_DIR / "diskcache"
81 changes: 46 additions & 35 deletions robotoff/utils/cache.py
@@ -1,35 +1,46 @@
-import datetime
-from typing import Callable, Optional
-
-from robotoff.utils import get_logger
-
-logger = get_logger(__name__)
-
-
-class CachedStore:
-    def __init__(self, fetch_func: Callable, expiration_interval: Optional[int] = 30):
-        self.store = None
-        self.expires_after: Optional[datetime.datetime] = None
-        self.fetch_func: Callable = fetch_func
-        self.expiration_timedelta: Optional[datetime.timedelta]
-
-        if expiration_interval is not None:
-            self.expiration_timedelta = datetime.timedelta(minutes=expiration_interval)
-        else:
-            self.expiration_timedelta = None
-
-    def get(self, **kwargs):
-        if self.store is None or (
-            self.expiration_timedelta is not None
-            and datetime.datetime.utcnow() >= self.expires_after
-        ):
-            if self.store is not None:
-                logger.info("CachedStore expired, reloading...")
-
-            if self.expiration_timedelta is not None:
-                self.expires_after = (
-                    datetime.datetime.utcnow() + self.expiration_timedelta
-                )
-            self.store = self.fetch_func(**kwargs)
-
-        return self.store
+from typing import Callable
+
+from diskcache import Cache
+
+from robotoff import settings
+
+# Disk cache used to store any kind of content (currently mostly images).
+# It avoids downloading the same image from the server multiple times,
+# with a reasonable disk usage (defaults to 1 GB).
+# diskcache's Cache is thread-safe and process-safe, and every transaction
+# is atomic. We can therefore define a single cache here and use it across
+# the project.
+disk_cache = Cache(settings.DISKCACHE_DIR)
+
+
+def cache_http_request(
+    key: str,
+    func: Callable,
+    cache_expire: int | None = None,
+    tag: str | None = None,
+    *args,
+    **kwargs,
+) -> bytes | None:
+    """Cache the raw response (bytes) of an HTTP request.
+
+    :param key: the cache key
+    :param func: the function to call, must return a Response object
+    :param cache_expire: expiration time of the item in the cache, defaults
+        to None (no expiration)
+    :param tag: a tag for the item in the cache (optional), defaults to None
+    :return: the response bytes, or None if an error occurred while calling
+        `func`
+    """
+    # Check if the item is already cached, and use it instead of sending
+    # the HTTP request if it is
+    content_bytes = disk_cache.get(key)
+    if content_bytes is None:
+        r = func(*args, **kwargs)
+        if r is None:
+            # Don't save in cache if an error (or HTTP 404) occurred
+            return None
+        content_bytes = r.content
+        # We store the raw byte content of the response in the cache
+        disk_cache.set(key, r.content, expire=cache_expire, tag=tag)
+
+    return content_bytes
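
The helper above is what the `use_cache` flag threaded through the earlier call sites is expected to reach. A minimal sketch of how an image fetcher could route its download through `cache_http_request` (the `fetch_image_*` wrappers, the cache key scheme, and the one-week expiry are illustrative choices, not part of the PR):

    import requests

    from robotoff.utils.cache import cache_http_request


    def fetch_image_response(image_url: str) -> requests.Response | None:
        # Return None on failure so cache_http_request skips caching errors.
        r = requests.get(image_url, timeout=10)
        return r if r.ok else None


    def fetch_image_bytes(image_url: str, use_cache: bool = False) -> bytes | None:
        if use_cache:
            return cache_http_request(
                key=f"image:{image_url}",    # illustrative key scheme
                func=fetch_image_response,
                cache_expire=7 * 24 * 3600,  # illustrative one-week expiry
                tag="image",
                image_url=image_url,
            )
        r = fetch_image_response(image_url)
        return r.content if r is not None else None

Because diskcache enforces the cache's size limit (1 GB by default) by evicting older entries, even items stored with `cache_expire=None` keep disk usage bounded.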