feat: improve caching mechanism #1281

Merged
merged 4 commits into from
Oct 27, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -42,3 +42,4 @@ site/
 gh_pages/
 doc/README.md
 doc/references/cli.md
+data/diskcache
13 changes: 12 additions & 1 deletion poetry.lock

Generated lockfile; diff not rendered.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -79,6 +79,7 @@ toml = "~0.10.2"
 openfoodfacts = "0.1.10"
 imagehash = "~4.3.1"
 peewee-migrate = "~1.12.2"
+diskcache = "~5.6.3"

 [tool.poetry.dependencies.sentry-sdk]
 version = "~1.14.0"
10 changes: 8 additions & 2 deletions robotoff/app/api.py
@@ -696,7 +696,11 @@
     x_min = req.get_param_as_float("x_min", required=True)
     y_max = req.get_param_as_float("y_max", required=True)
     x_max = req.get_param_as_float("x_max", required=True)
-    image = get_image_from_url(image_url, session=http_session, error_raise=False)
+    # Get the image from the cache, as Hunger Games can request many
+    # crops from the same image
+    image = get_image_from_url(
+        image_url, session=http_session, error_raise=False, use_cache=True
+    )

     if image is None:
         raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
@@ -799,7 +803,9 @@
         "when `output_image` is True",
     )

-    image = get_image_from_url(image_url, session=http_session, error_raise=False)
+    image = get_image_from_url(
+        image_url, session=http_session, error_raise=False, use_cache=True
+    )

     if image is None:
         raise falcon.HTTPBadRequest(f"Could not fetch image: {image_url}")
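
The comment in the first hunk captures the motivation for the whole PR: Hunger Games requests many crops of the same source image, and each crop request used to trigger a fresh download. A caller-side sketch of the access pattern under the new flag (the URL is a placeholder and the `http_session` import path is assumed):

    from robotoff.utils import get_image_from_url, http_session  # import paths assumed

    image_url = "https://example.org/product_front.jpg"  # placeholder

    # Each crop endpoint call fetches the same source image; with
    # use_cache=True only the first call hits the network, and later
    # calls are served from the shared disk cache.
    for _ in range(3):
        image = get_image_from_url(
            image_url, session=http_session, error_raise=False, use_cache=True
        )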
11 changes: 8 additions & 3 deletions robotoff/images.py
@@ -19,6 +19,7 @@
     source_image: str,
     image_url: str,
     images: Optional[JSONType],
+    use_cache: bool = False,
 ) -> Optional[ImageModel]:
     """Save imported image details in DB.
@@ -83,7 +84,9 @@
     # MongoDB (in the `images` field), we download the image to know the
     # image size
     logger.info("DB Product check disabled, downloading image to get image size")
-    image = get_image_from_url(image_url, error_raise=False, session=http_session)
+    image = get_image_from_url(
+        image_url, error_raise=False, session=http_session, use_cache=use_cache
+    )

     if image is None:
         logger.info("Could not import image %s in DB", image_url)
@@ -131,7 +134,7 @@
     source_image = generate_image_path(product_id, missing_image_id)
     image_url = generate_image_url(product_id, missing_image_id)
     logger.debug("Creating missing image %s in DB", source_image)
-    save_image(product_id, source_image, image_url, images)
+    save_image(product_id, source_image, image_url, images, use_cache=True)


 def add_image_fingerprint(image_model: ImageModel):
@@ -140,7 +143,9 @@
     :param image_model: the image model to update
     """
     image_url = image_model.get_image_url()
-    image = get_image_from_url(image_url, error_raise=False, session=http_session)
+    image = get_image_from_url(
+        image_url, error_raise=False, session=http_session, use_cache=True
+    )

     if image is None:
         logger.info(
@@ -149,6 +149,7 @@ def generate_image_embeddings(
             generate_image_url(product_id, f"{image_id}.400"),
             error_raise=False,
             session=http_session,
+            use_cache=True,
         )
         for image_id in missing_embedding_ids
     }
10 changes: 3 additions & 7 deletions robotoff/prediction/ocr/location.py
@@ -2,6 +2,7 @@
 import gzip
 import json
 import re
+from functools import cache
 from pathlib import Path
 from typing import BinaryIO, Iterable, Optional, Union
@@ -10,7 +11,6 @@
 from robotoff import settings
 from robotoff.types import Prediction, PredictionType
 from robotoff.utils import get_logger
-from robotoff.utils.cache import CachedStore
 from robotoff.utils.text import KeywordProcessor, strip_accents_v1

 # Increase version ID when introducing breaking change: changes for which we
@@ -31,6 +31,7 @@ class City:
     coordinates: Optional[tuple[float, float]]


+@cache
 def load_cities_fr(source: Union[Path, BinaryIO, None] = None) -> set[City]:
     """Load French cities dataset.
@@ -254,11 +255,6 @@ def find_nearby_postal_code(
     return match.group(1), sub_start + match.start(1), sub_start + match.end(1)


-ADDRESS_EXTRACTOR_STORE = CachedStore(
-    lambda: AddressExtractor(load_cities_fr()), expiration_interval=None
-)
-
-
 def find_locations(content: Union[OCRResult, str]) -> list[Prediction]:
     """Find location predictions in the text content.
@@ -270,5 +266,5 @@ def find_locations(content: Union[OCRResult, str]) -> list[Prediction]:
     Returns:
         list of Prediction: See :meth:`.AddressExtractor.extract_addresses`.
     """
-    location_extractor: AddressExtractor = ADDRESS_EXTRACTOR_STORE.get()
+    location_extractor = AddressExtractor(load_cities_fr())
    return location_extractor.extract_addresses(content)
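
The pattern in this file repeats across the OCR modules below: the custom `CachedStore` (always configured here with `expiration_interval=None`) is replaced by `functools.cache`, which memoizes the first call for the lifetime of the process. A minimal sketch of the equivalence, using a stand-in loader:

    from functools import cache


    @cache
    def load_dataset() -> set[str]:
        # Stand-in for load_cities_fr(): an expensive, deterministic
        # load that should run at most once per process.
        print("loading...")
        return {"paris", "lyon", "marseille"}


    load_dataset()  # prints "loading..." and caches the result
    load_dataset()  # served from the memo table; the body does not run again

Since every replaced `CachedStore` used `expiration_interval=None`, dropping expiration support loses nothing, and laziness is preserved: the expensive cities load now runs once on first use, while the lightweight `AddressExtractor` wrapper is simply rebuilt per call.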
23 changes: 6 additions & 17 deletions robotoff/prediction/ocr/packager_code.py
@@ -1,4 +1,5 @@
 import re
+from functools import cache
 from typing import Optional, Union

 from openfoodfacts.ocr import (
@@ -12,7 +13,6 @@
 from robotoff import settings
 from robotoff.types import Prediction, PredictionType
 from robotoff.utils import text_file_iter
-from robotoff.utils.cache import CachedStore
 from robotoff.utils.text import KeywordProcessor

 from .utils import generate_keyword_processor
@@ -52,16 +52,14 @@
     unchecked_code = match.group().upper()
     unchecked_code = re.sub(r"\s*\.*", "", unchecked_code)

-    processor = USDA_CODE_KEYWORD_PROCESSOR_STORE.get()
+    processor = generate_USDA_code_keyword_processor()
     USDA_code = extract_USDA_code(processor, unchecked_code)
     return USDA_code


+@cache
 def generate_USDA_code_keyword_processor() -> KeywordProcessor:
-    """Builds the KeyWordProcessor for USDA codes
-
-    This will be called only once thanks to CachedStore
-    """
+    """Builds the KeywordProcessor for USDA codes."""

     codes = text_file_iter(settings.OCR_USDA_CODE_FLASHTEXT_DATA_PATH)
     return generate_keyword_processor(codes)
@@ -79,11 +77,6 @@
     return USDA_code


-USDA_CODE_KEYWORD_PROCESSOR_STORE = CachedStore(
-    fetch_func=generate_USDA_code_keyword_processor, expiration_interval=None
-)
-
-
 PACKAGER_CODE = {
     "fr_emb": [
         OCRRegex(
@@ -191,6 +184,7 @@
     return results


+@cache
 def generate_fishing_code_keyword_processor() -> KeywordProcessor:
     codes = text_file_iter(settings.OCR_FISHING_FLASHTEXT_DATA_PATH)
     return generate_keyword_processor(("{}||{}".format(c.upper(), c) for c in codes))
@@ -226,13 +220,8 @@
     return predictions


-FISHING_KEYWORD_PROCESSOR_STORE = CachedStore(
-    fetch_func=generate_fishing_code_keyword_processor, expiration_interval=None
-)
-
-
 def find_packager_codes(content: Union[OCRResult, str]) -> list[Prediction]:
     predictions = find_packager_codes_regex(content)
-    processor = FISHING_KEYWORD_PROCESSOR_STORE.get()
+    processor = generate_fishing_code_keyword_processor()
     predictions += extract_fishing_code(processor, content)
     return predictions
14 changes: 7 additions & 7 deletions robotoff/prediction/ocr/trace.py
@@ -1,4 +1,5 @@
 import re
+from functools import cache
 from typing import Optional, Union

 from openfoodfacts.ocr import (
@@ -12,7 +13,7 @@
 from robotoff import settings
 from robotoff.types import Prediction, PredictionType
 from robotoff.utils import text_file_iter
-from robotoff.utils.cache import CachedStore
+from robotoff.utils.text.flashtext import KeywordProcessor

 from .utils import generate_keyword_processor

@@ -21,7 +22,10 @@
 PREDICTOR_VERSION = "1"


-def generate_trace_keyword_processor(labels: Optional[list[str]] = None):
+@cache
+def generate_trace_keyword_processor(
+    labels: Optional[list[str]] = None,
+) -> KeywordProcessor:
     if labels is None:
         labels = list(text_file_iter(settings.OCR_TRACE_ALLERGEN_DATA_PATH))

@@ -36,10 +40,6 @@
     field=OCRField.full_text_contiguous,
 )

-TRACE_KEYWORD_PROCESSOR_STORE = CachedStore(
-    fetch_func=generate_trace_keyword_processor, expiration_interval=None
-)
-

 def find_traces(content: Union[OCRResult, str]) -> list[Prediction]:
     predictions = []
@@ -49,7 +49,7 @@
     if not text:
         return []

-    processor = TRACE_KEYWORD_PROCESSOR_STORE.get()
+    processor = generate_trace_keyword_processor()

     for match in TRACES_REGEX.regex.finditer(text):
         prompt = match.group()
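
One subtlety of `@cache` worth noting for this file: `functools.cache` keys its memo table on the call arguments, which must be hashable. `generate_trace_keyword_processor` keeps its `labels: Optional[list[str]]` parameter, so only the no-argument call used by `find_traces` is effectively cached; passing a list would raise `TypeError`. A small sketch of that behavior with a simplified stand-in:

    from functools import cache
    from typing import Optional


    @cache
    def build_processor(labels: Optional[tuple[str, ...]] = None) -> set[str]:
        # Simplified stand-in for generate_trace_keyword_processor()
        return set(labels) if labels is not None else {"gluten", "milk"}


    build_processor()                 # computed once, then memoized
    build_processor(("nuts", "soy"))  # tuples are hashable, so this works
    # build_processor(["nuts", "soy"]) would raise TypeError: unhashable type: 'list'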
4 changes: 4 additions & 0 deletions robotoff/settings.py
@@ -336,3 +336,7 @@ def get_package_version() -> str:
 # (https://github.com/klen/peewee_migrate)
 # Migrations are automatically applied when the API service is launched
 MIGRATE_DIR = PROJECT_DIR / "migrations"
+
+
+# Path of the local disk cache, see robotoff.utils.cache for more information
+DISKCACHE_DIR = DATA_DIR / "diskcache"
81 changes: 46 additions & 35 deletions robotoff/utils/cache.py
@@ -1,35 +1,46 @@
-import datetime
-from typing import Callable, Optional
-
-from robotoff.utils import get_logger
-
-logger = get_logger(__name__)
-
-
-class CachedStore:
-    def __init__(self, fetch_func: Callable, expiration_interval: Optional[int] = 30):
-        self.store = None
-        self.expires_after: Optional[datetime.datetime] = None
-        self.fetch_func: Callable = fetch_func
-        self.expiration_timedelta: Optional[datetime.timedelta]
-
-        if expiration_interval is not None:
-            self.expiration_timedelta = datetime.timedelta(minutes=expiration_interval)
-        else:
-            self.expiration_timedelta = None
-
-    def get(self, **kwargs):
-        if self.store is None or (
-            self.expiration_timedelta is not None
-            and datetime.datetime.utcnow() >= self.expires_after
-        ):
-            if self.store is not None:
-                logger.info("CachedStore expired, reloading...")
-
-            if self.expiration_timedelta is not None:
-                self.expires_after = (
-                    datetime.datetime.utcnow() + self.expiration_timedelta
-                )
-            self.store = self.fetch_func(**kwargs)
-
-        return self.store
+from typing import Callable
+
+from diskcache import Cache
+
+from robotoff import settings
+
+# Disk cache used to store any kind of content (currently mostly images).
+# It avoids downloading the same image from the server multiple times,
+# with a reasonable disk usage (defaults to 1 GB).
+# diskcache's Cache is thread-safe and process-safe, and every transaction
+# is atomic. We can therefore define a single cache here and use it across
+# the project.
+disk_cache = Cache(settings.DISKCACHE_DIR)
+
+
+def cache_http_request(
+    key: str,
+    func: Callable,
+    cache_expire: int | None = None,
+    tag: str | None = None,
+    *args,
+    **kwargs,
+) -> bytes | None:
+    """Cache the raw response (bytes) of an HTTP request.
+
+    :param key: the cache key
+    :param func: the function to call, must return a Response object
+    :param cache_expire: expiration time of the item in the cache, defaults
+        to None (no expiration)
+    :param tag: a tag for the item in the cache (optional), defaults to None
+    :return: the response bytes, or None if an error occurred while calling
+        `func`
+    """
+    # Check if the item is already cached, and use it instead of sending
+    # the HTTP request if it is
+    content_bytes = disk_cache.get(key)
+    if content_bytes is None:
+        r = func(*args, **kwargs)
+        if r is None:
+            # Don't save in cache if an error (or HTTP 404) occurred
+            return None
+        content_bytes = r.content
+        # We store the raw byte content of the response in the cache
+        disk_cache.set(key, r.content, expire=cache_expire, tag=tag)
+
+    return content_bytes
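
The helper above is what the `use_cache` flag threaded through the earlier call sites is expected to reach. A minimal sketch of how an image fetcher could route its download through `cache_http_request` (the `fetch_image_*` wrappers, the cache key scheme, and the one-week expiry are illustrative choices, not part of the PR):

    import requests

    from robotoff.utils.cache import cache_http_request


    def fetch_image_response(image_url: str) -> requests.Response | None:
        # Return None on failure so cache_http_request skips caching errors.
        r = requests.get(image_url, timeout=10)
        return r if r.ok else None


    def fetch_image_bytes(image_url: str, use_cache: bool = False) -> bytes | None:
        if use_cache:
            return cache_http_request(
                key=f"image:{image_url}",    # illustrative key scheme
                func=fetch_image_response,
                cache_expire=7 * 24 * 3600,  # illustrative one-week expiry
                tag="image",
                image_url=image_url,
            )
        r = fetch_image_response(image_url)
        return r.content if r is not None else None

Because diskcache enforces the cache's size limit (1 GB by default) by evicting older entries, even items stored with `cache_expire=None` keep disk usage bounded.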