Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: store fingerprint of all images #1272

Merged
merged 1 commit into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 93 additions & 53 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ h5py = "~3.8.0"
opencv-contrib-python = "~4.7.0.72"
toml = "~0.10.2"
openfoodfacts = "0.1.10"
imagehash = "~4.3.1"

[tool.poetry.dependencies.sentry-sdk]
version = "~1.14.0"
Expand Down
39 changes: 39 additions & 0 deletions robotoff/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
from pathlib import Path
from typing import Optional

import imagehash
import numpy as np
from PIL import Image

from robotoff.models import ImageModel
from robotoff.off import generate_image_path, generate_image_url
from robotoff.types import JSONType, ProductIdentifier
Expand Down Expand Up @@ -128,3 +132,38 @@
image_url = generate_image_url(product_id, missing_image_id)
logger.debug("Creating missing image %s in DB", source_image)
save_image(product_id, source_image, image_url, images)


def add_image_fingerprint(image_model: ImageModel):
    """Compute the fingerprint of an image and persist it in DB.

    The image is downloaded from the URL returned by
    `image_model.get_image_url()`. When the download yields no image,
    a message is logged and the DB row is left untouched.

    :param image_model: the image model to update
    """
    url = image_model.get_image_url()
    # `error_raise=False`: a `None` result is handled below instead of
    # letting the fetch raise
    fetched = get_image_from_url(url, error_raise=False, session=http_session)

    if fetched is not None:
        image_model.fingerprint = generate_image_fingerprint(fetched)
        # Only the `fingerprint` column is written back
        ImageModel.bulk_update([image_model], fields=["fingerprint"])
        return

    logger.info(
        "could not fetch image from %s, aborting image fingerprinting", url
    )

Check warning on line 152 in robotoff/images.py

View check run for this annotation

Codecov / codecov/patch

robotoff/images.py#L151-L152

Added lines #L151 - L152 were not covered by tests


def generate_image_fingerprint(image: Image.Image) -> int:
    """Generate a fingerprint from an image, used for near-duplicate
    detection.

    We use a perceptual hashing algorithm (`imagehash.phash`). The hash
    bits are flattened, read most significant bit first, and interpreted
    as a two's complement signed 64-bit integer, so that the value always
    fits in a signed 64-bit DB column.

    The result is a plain Python `int` (not a NumPy scalar), and does not
    depend on the platform's default NumPy integer width nor on silent
    integer overflow.

    :param image: the input image
    :return: the fingerprint, as a signed 64-bit integer
    """
    # Flattened array of the 64 hash bits
    bits = imagehash.phash(image).hash.flatten()
    # Pack the bits into bytes, first bit = most significant bit of the
    # first byte (`packbits` default bit order is big-endian)
    packed = np.packbits(bits)
    # Decode as big-endian two's complement: a hash whose first bit is set
    # maps to a negative number instead of overflowing 64 bits
    return int.from_bytes(packed.tobytes(), byteorder="big", signed=True)
13 changes: 13 additions & 0 deletions robotoff/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
import datetime
import functools
import uuid
from pathlib import Path
from typing import Iterable

import peewee
from playhouse.postgres_ext import BinaryJSONField, PostgresqlExtDatabase
from playhouse.shortcuts import model_to_dict

from robotoff import settings
from robotoff.off import generate_image_url
from robotoff.types import ProductIdentifier, ServerType

db = PostgresqlExtDatabase(
Expand Down Expand Up @@ -245,13 +247,24 @@
height = peewee.IntegerField(null=False, index=True)
deleted = peewee.BooleanField(null=False, index=True, default=False)
server_type = peewee.CharField(null=True, max_length=10, index=True)
# Perceptual hash of the image, used to find near-duplicates
# It's a 64-bit bitmap, so it can be stored as a bigint (8 bytes)
fingerprint = peewee.BigIntegerField(null=True, index=True)

class Meta:
table_name = "image"

def get_product_id(self) -> ProductIdentifier:
    """Build the product identifier from the `barcode` and `server_type`
    fields.

    :return: the product identifier
    """
    barcode = self.barcode
    # `server_type` holds the member name, looked up in the `ServerType` enum
    server_type = ServerType[self.server_type]
    return ProductIdentifier(barcode, server_type)

def get_image_url(self) -> str:
    """Return the full URL of the image.

    The URL is generated from the product identifier (see
    `get_product_id`) and the stem of the `source_image` path, i.e. its
    final component without the file extension.

    :return: the image URL
    """
    product_id = self.get_product_id()
    image_id = Path(self.source_image).stem
    return generate_image_url(product_id, image_id)

Check warning on line 266 in robotoff/models.py

View check run for this annotation

Codecov / codecov/patch

robotoff/models.py#L266

Added line #L266 was not covered by tests


class ImagePrediction(BaseModel):
"""Table to store computer vision predictions (object detection,
Expand Down
31 changes: 29 additions & 2 deletions robotoff/workers/tasks/import_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from robotoff import settings
from robotoff.elasticsearch import get_es_client
from robotoff.images import save_image
from robotoff.images import add_image_fingerprint, save_image
from robotoff.insights.extraction import (
DEFAULT_OCR_PREDICTION_TYPES,
extract_ocr_predictions,
Expand Down Expand Up @@ -45,7 +45,7 @@
)
from robotoff.utils import get_image_from_url, get_logger, http_session
from robotoff.utils.image import convert_image_to_array
from robotoff.workers.queues import enqueue_job, get_high_queue
from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue

logger = get_logger(__name__)

Expand Down Expand Up @@ -92,6 +92,14 @@
ImageModel.bulk_update([image_model], fields=["deleted"])
return

# Compute image fingerprint, this job is low priority
enqueue_job(

Check warning on line 96 in robotoff/workers/tasks/import_image.py

View check run for this annotation

Codecov / codecov/patch

robotoff/workers/tasks/import_image.py#L96

Added line #L96 was not covered by tests
add_image_fingerprint_job,
low_queue,
job_kwargs={"result_ttl": 0},
image_model_id=image_model.id,
)

if product_id.server_type.is_food():
# Currently we don't support insight generation for projects other
# than OFF (OBF, OPF,...)
Expand Down Expand Up @@ -495,3 +503,22 @@
logos = [embedding.logo for embedding in logo_embeddings]
thresholds = get_logo_confidence_thresholds()
import_logo_insights(logos, thresholds=thresholds, server_type=server_type)


@with_db
def add_image_fingerprint_job(image_model_id: int):
    """Job that computes and stores the fingerprint of an image in DB.

    If no image with the given ID exists, a warning is logged and nothing
    else is done.

    :param image_model_id: the DB ID of the image
    """
    logger.info("Computing fingerprint for image ID %s", image_model_id)

    image_model = ImageModel.get_or_none(id=image_model_id)
    if image_model is not None:
        add_image_fingerprint(image_model)
        return

    logger.warning(
        "image ID %s not found in DB, skipping fingerprint generation",
        image_model_id,
    )

Check warning on line 524 in robotoff/workers/tasks/import_image.py

View check run for this annotation

Codecov / codecov/patch

robotoff/workers/tasks/import_image.py#L524

Added line #L524 was not covered by tests
28 changes: 28 additions & 0 deletions tests/unit/test_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from pathlib import Path

from PIL import Image

from robotoff.images import generate_image_fingerprint

IMAGE_DATA_DIR = Path(__file__).parent / "data/upc_image"


def load_test_image(file_name: str) -> Image.Image:
    """Open the test image named `file_name` located in `IMAGE_DATA_DIR`.

    :param file_name: name of the image file
    :return: the opened image
    """
    return Image.open(IMAGE_DATA_DIR / file_name)


def test_generate_image_fingerprint():
    """Check that fingerprints differ between distinct images and are
    stable when an image is downscaled."""
    original = load_test_image("no_upc1.jpg")
    other = load_test_image("no_upc2.jpg")
    # Downscaled copy of the first image (`thumbnail` works in place)
    downscaled = original.copy()
    downscaled.thumbnail((400, 400))

    original_fp, other_fp, downscaled_fp = (
        generate_image_fingerprint(img) for img in (original, other, downscaled)
    )

    # two different images should have different fingerprints
    assert original_fp != other_fp
    # fingerprints should be invariant to rescaling
    assert original_fp == downscaled_fp
Loading