fix: improve OCR dump script (#1351)
raphael0202 authored Jun 24, 2024
1 parent edeb3ee commit b49836b
Showing 1 changed file with 84 additions and 25 deletions.
109 changes: 84 additions & 25 deletions scripts/ocr/run_ocr.py
@@ -12,6 +12,7 @@
import base64
import glob
import gzip
import logging
import os
import pathlib
import sys
@@ -22,8 +23,19 @@
import orjson
import requests

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
file_handler = logging.FileHandler("run_ocr.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

API_KEY = os.environ.get("CLOUD_VISION_API_KEY")
MAXIMUM_MODIFICATION_DATETIME = datetime(year=2019, month=5, day=1)

if not API_KEY:
sys.exit("missing Google Cloud CLOUD_VISION_API_KEY as envvar")
@@ -33,7 +45,6 @@
API_KEY
)

BASE_IMAGE_DIR = pathlib.Path("/srv2/off/html/images/products")
session = requests.Session()


@@ -67,7 +78,7 @@ def get_base64_image_from_path(
if error_raise:
raise e
else:
print(e)
logger.exception(e)
return None


@@ -99,7 +110,7 @@ def run_ocr_on_image_paths(image_paths: List[pathlib.Path], override: bool = Fal
json_path = image_path.with_suffix(".json.gz")
if json_path.is_file():
if override:
# print("Deleting file {}".format(json_path))
logger.debug("Overriding file %s", json_path)
json_path.unlink()
else:
continue
@@ -115,9 +126,9 @@ def run_ocr_on_image_paths(image_paths: List[pathlib.Path], override: bool = Fal
r = run_ocr_on_image_batch([x[1] for x in images_content])

if not r.ok:
print("HTTP {} received".format(r.status_code))
print("Response: {}".format(r.text))
print(image_paths)
# logger.debug("HTTP %d received", r.status_code)
# logger.debug("Response: %s", r.text)
# logger.debug(image_paths)
return [], True

r_json = orjson.loads(r.content)
@@ -137,7 +148,7 @@ def dump_ocr(
json_path = image_path.with_suffix(".json.gz")

with gzip.open(str(json_path), "wb") as f:
# print("Dumping OCR JSON to {}".format(json_path))
logger.debug("Dumping OCR JSON to %s", json_path)
f.write(orjson.dumps({"responses": [response]}))

if performed_request and sleep:
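
For reference, an OCR result dumped this way is a gzip-compressed JSON file with a top-level "responses" array. A minimal sketch of reading one back, using a placeholder path (this snippet is illustrative and not part of the commit):

import gzip

import orjson

# Placeholder path; real dumps sit next to their source image in the images tree.
json_path = "some_product/1.json.gz"

with gzip.open(json_path, "rb") as f:
    data = orjson.loads(f.read())

# The script wraps each Cloud Vision response as {"responses": [response]},
# so the response for this image is the first element of the array.
response = data["responses"][0]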
@@ -149,32 +160,51 @@ def add_to_seen_set(seen_path: pathlib.Path, item: str):
f.write("{}\n".format(item))


def add_missing_ocr(sleep: float, seen_path: pathlib.Path):
def add_missing_ocr(
base_image_dir: pathlib.Path,
sleep: float,
seen_path: pathlib.Path,
maximum_modification_datetime: Optional[datetime] = None,
dry_run: bool = False,
):
logger.info(
"Launching job with base_image_dir={},"
"sleep={}, "
"seen_path={}, "
"maximum_modification_datetime={}, "
"dry_run={}".format(
base_image_dir, sleep, seen_path, maximum_modification_datetime, dry_run
)
)
total = 0
missing = 0
json_error = 0
ocr_error = 0
valid = 0
empty_images = 0
expired = 0
# OCR is still in plain JSON
plain_json_count = 0

with seen_path.open("r", encoding="utf-8") as f:
seen_set = set(map(str.strip, f))

for i, image_path_str in enumerate(
glob.iglob("{}/**/*.jpg".format(BASE_IMAGE_DIR), recursive=True)
glob.iglob("{}/**/*.jpg".format(base_image_dir), recursive=True)
):
if i % 10000 == 0:
print(
"scanned: {}, total: {}, missing: {}, json_error: {}, ocr_error: {}, empty images: {}, valid: {}, "
"expired: {}".format(
logger.info(
"scanned: {}, total: {}, missing: {}, json_error: {}, "
"ocr_error: {}, empty images: {}, valid: {}, "
"plain_json: {}, expired: {}".format(
i,
total,
missing,
json_error,
ocr_error,
empty_images,
valid,
plain_json_count,
expired,
)
)
@@ -190,11 +220,13 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):

if not image_size:
empty_images += 1
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
add_to_seen_set(seen_path, image_path_str)
continue

if image_size >= 10485760:
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
add_to_seen_set(seen_path, image_path_str)
continue

json_path = image_path.with_suffix(".json.gz")
@@ -203,18 +235,24 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):
if not json_path.is_file():
plain_json_path = image_path.with_suffix(".json")
if plain_json_path.is_file():
plain_json_count += 1
continue

missing += 1
dump_ocr([image_path], sleep=sleep, override=False)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=False)
add_to_seen_set(seen_path, image_path_str)
continue

modification_datetime = datetime.fromtimestamp(json_path.stat().st_mtime)
if modification_datetime < MAXIMUM_MODIFICATION_DATETIME:
if (
maximum_modification_datetime is not None
and modification_datetime < maximum_modification_datetime
):
expired += 1
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
continue

has_json_error = False
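
The expiration branch above re-runs OCR when an existing dump predates the optional cutoff. A minimal standalone sketch of that check, with placeholder path and cutoff values (in the script, the cutoff comes from --maximum-modification-datetime and the path from the scanned image tree):

from datetime import datetime
from pathlib import Path

# Placeholder cutoff; on the command line it is passed as an ISO-8601 string.
cutoff = datetime.fromisoformat("2019-05-01")

json_path = Path("some_product/1.json.gz")  # placeholder path
modification_datetime = datetime.fromtimestamp(json_path.stat().st_mtime)

# A dump older than the cutoff counts as expired and is re-generated
# (unless --dry-run is set, in which case nothing is written or requested).
is_expired = cutoff is not None and modification_datetime < cutoff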
@@ -226,8 +264,9 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):

if has_json_error:
json_error += 1
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
continue

has_error = False
@@ -237,8 +276,9 @@ def add_missing_ocr(sleep: float, seen_path: pathlib.Path):

if has_error:
ocr_error += 1
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
if not dry_run:
dump_ocr([image_path], sleep=sleep, override=True)
add_to_seen_set(seen_path, image_path_str)
else:
valid += 1

@@ -247,5 +287,24 @@
parser = argparse.ArgumentParser()
parser.add_argument("--sleep", type=float, default=1.0)
parser.add_argument("--seen-path", type=pathlib.Path, required=True)
parser.add_argument(
"--maximum-modification-datetime",
required=False,
type=lambda s: datetime.fromisoformat(s),
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Dry run mode (don't write to disk or send Google Cloud Vision requests)",
)
parser.add_argument(
"--base-image-dir", type=pathlib.Path, default="/mnt/off/images/products/"
)
args = parser.parse_args()
add_missing_ocr(sleep=args.sleep, seen_path=args.seen_path)
add_missing_ocr(
base_image_dir=args.base_image_dir,
sleep=args.sleep,
seen_path=args.seen_path,
maximum_modification_datetime=args.maximum_modification_datetime,
dry_run=args.dry_run,
)
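
With these arguments in place, an invocation might look like the commented command below. Note that --maximum-modification-datetime is converted with datetime.fromisoformat, so both a bare date and a full timestamp are accepted (the values shown are placeholders, not taken from the commit):

# Example invocation (placeholder values):
#   python scripts/ocr/run_ocr.py \
#       --seen-path seen.txt \
#       --base-image-dir /mnt/off/images/products/ \
#       --maximum-modification-datetime 2019-05-01 \
#       --dry-run
from datetime import datetime

print(datetime.fromisoformat("2019-05-01"))           # 2019-05-01 00:00:00
print(datetime.fromisoformat("2019-05-01T12:30:00"))  # 2019-05-01 12:30:00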
