fix: improve OCR dump script (#1351)
raphael0202 authored Jun 24, 2024
1 parent edeb3ee commit b49836b
Showing 1 changed file with 84 additions and 25 deletions.
109 changes: 84 additions & 25 deletions scripts/ocr/run_ocr.py
@@ -12,6 +12,7 @@
import base64
import glob
import gzip
import logging
import os
import pathlib
import sys
@@ -22,8 +23,19 @@
import orjson
import requests

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
file_handler = logging.FileHandler("run_ocr.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

API_KEY = os.environ.get("CLOUD_VISION_API_KEY")
MAXIMUM_MODIFICATION_DATETIME = datetime(year=2019, month=5, day=1)

if not API_KEY:
sys.exit("missing Google Cloud CLOUD_VISION_API_KEY as envvar")
@@ -33,7 +45,6 @@
API_KEY
)

BASE_IMAGE_DIR = pathlib.Path("/srv2/off/html/images/products")
session = requests.Session()


@@ -67,7 +78,7 @@ def get_base64_image_from_path(
if error_raise:
raise e
else:
print(e)
logger.exception(e)
return None


@@ -99,7 +110,7 @@ def run_ocr_on_image_paths(image_paths: List[pathlib.Path], override: bool = Fal
json_path = image_path.with_suffix(".json.gz")
if json_path.is_file():
if override:
# print("Deleting file {}".format(json_path))
logger.debug("Overriding file %s", json_path)
json_path.unlink()
else:
continue
@@ -115,9 +126,9 @@ def run_ocr_on_image_paths(image_paths: List[pathlib.Path], override: bool = Fal
r = run_ocr_on_image_batch([x[1] for x in images_content])

if not r.ok:
print("HTTP {} received".format(r.status_code))
print("Response: {}".format(r.text))
print(image_paths)
# logger.debug("HTTP %d received", r.status_code)
# logger.debug("Response: %s", r.text)
# logger.debug(image_paths)
return [], True

r_json = orjson.loads(r.content)
@@ -137,7 +148,7 @@ def dump_ocr(
json_path = image_path.with_suffix(".json.gz")

with gzip.open(str(json_path), "wb") as f:
# print("Dumping OCR JSON to {}".format(json_path))
logger.debug("Dumping OCR JSON to %s", json_path)
f.write(orjson.dumps({"responses": [response]}))

if performed_request and sleep:
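
For reference, an OCR result dumped this way is a gzip-compressed JSON file with a top-level "responses" array. A minimal sketch of reading one back, using a placeholder path (this snippet is illustrative and not part of the commit):

import gzip

import orjson

# Placeholder path; real dumps sit next to their source image in the images tree.
json_path = "some_product/1.json.gz"

with gzip.open(json_path, "rb") as f:
    data = orjson.loads(f.read())

# The script wraps each Cloud Vision response as {"responses": [response]},
# so the response for this image is the first element of the array.
response = data["responses"][0]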
@@ -149,32 +160,51 @@ def add_to_seen_set(seen_path: pathlib.Path, item: str):
f.write("{}\n".format(item))


def add_missing_ocr(sleep: float, seen_path: pathlib.Path):
def add_missing_ocr(
base_image_dir: pathlib.Path,
sleep: float,
seen_path: pathlib.Path,
maximum_modification_datetime: Optional[datetime] = None,
dry_run: bool = False,
):
logger.info(
"Launching job with base_image_dir={},"
"sleep={}, "
"seen_path={}, "
"maximum_modification_datetime={}, "
"dry_run={}".format(
base_image_dir, sleep, seen_path, maximum_modification_datetime, dry_run
)
)
total = 0
missing = 0
json_error = 0
ocr_error = 0
valid = 0
empty_images = 0
expired = 0
# OCR is still in plain JSON
plain_json_count = 0

with seen_path.open("r", encoding="utf-8") as f:
seen_set = set(map(str.strip, f))

for i, image_path_str in enumerate(
glob.iglob("{}/**/*.jpg".format(BASE_IMAGE_DIR), recursive=True)
glob.iglob("{}/**/*.jpg".format(base_image_dir), recursive=True)
):
if i % 10000 == 0:
print(
"scanned: {}, total: {}, missing: {}, json_error: {}, ocr_error: {}, empty images: {}, valid: {}, "
"expired: {}".format(
logger.info(
"scanned: {}, total: {}, missing: {}, json_error: {}, "
"ocr_error: {}, empty images: {}, valid: {}, "
"plain_json: {}, expired: {}".format(
i,
total,
missing,
json_error,
ocr_error,
empty_images,
valid,
plain_json_count,
expired,
)
)
@@ -190,11 +220,13 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):

if not image_size:
empty_images += 1
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
add_to_seen_set(seen_path, image_path_str)
continue

if image_size >= 10485760:
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
add_to_seen_set(seen_path, image_path_str)
continue

json_path = image_path.with_suffix(".json.gz")
@@ -203,18 +235,24 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):
if not json_path.is_file():
plain_json_path = image_path.with_suffix(".json")
if plain_json_path.is_file():
plain_json_count += 1
continue

missing += 1
dump_ocr([image_path], sleep=sleep, override=False)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=False)
add_to_seen_set(seen_path, image_path_str)
continue

modification_datetime = datetime.fromtimestamp(json_path.stat().st_mtime)
if modification_datetime < MAXIMUM_MODIFICATION_DATETIME:
if (
maximum_modification_datetime is not None
and modification_datetime < maximum_modification_datetime
):
expired += 1
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
continue

has_json_error = False
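
The expiration branch above re-runs OCR when an existing dump predates the optional cutoff. A minimal standalone sketch of that check, with placeholder path and cutoff values (in the script, the cutoff comes from --maximum-modification-datetime and the path from the scanned image tree):

from datetime import datetime
from pathlib import Path

# Placeholder cutoff; on the command line it is passed as an ISO-8601 string.
cutoff = datetime.fromisoformat("2019-05-01")

json_path = Path("some_product/1.json.gz")  # placeholder path
modification_datetime = datetime.fromtimestamp(json_path.stat().st_mtime)

# A dump older than the cutoff counts as expired and is re-generated
# (unless --dry-run is set, in which case nothing is written or requested).
is_expired = cutoff is not None and modification_datetime < cutoff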
@@ -226,8 +264,9 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):

if has_json_error:
json_error += 1
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
continue

has_error = False
@@ -237,8 +276,9 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):

if has_error:
ocr_error += 1
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
else:
valid += 1

@@ -247,5 +287,24 @@
parser = argparse.ArgumentParser()
parser.add_argument("--sleep", type=float, default=1.0)
parser.add_argument("--seen-path", type=pathlib.Path, required=True)
parser.add_argument(
"--maximum-modification-datetime",
required=False,
type=lambda s: datetime.fromisoformat(s),
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Dry run mode (don't write to disk or send Google Cloud Vision requests)",
)
parser.add_argument(
"--base-image-dir", type=pathlib.Path, default="/mnt/off/images/products/"
)
args = parser.parse_args()
add_missing_ocr(sleep=args.sleep, seen_path=args.seen_path)
add_missing_ocr(
base_image_dir=args.base_image_dir,
sleep=args.sleep,
seen_path=args.seen_path,
maximum_modification_datetime=args.maximum_modification_datetime,
dry_run=args.dry_run,
)
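
With these arguments in place, an invocation might look like the commented command below. Note that --maximum-modification-datetime is converted with datetime.fromisoformat, so both a bare date and a full timestamp are accepted (the values shown are placeholders, not taken from the commit):

# Example invocation (placeholder values):
#   python scripts/ocr/run_ocr.py \
#       --seen-path seen.txt \
#       --base-image-dir /mnt/off/images/products/ \
#       --maximum-modification-datetime 2019-05-01 \
#       --dry-run
from datetime import datetime

print(datetime.fromisoformat("2019-05-01"))           # 2019-05-01 00:00:00
print(datetime.fromisoformat("2019-05-01T12:30:00"))  # 2019-05-01 12:30:00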
