Skip to content

Commit

Permalink
Merge branch 'rl-0.4.0'
Browse files (browse the repository at this point in the history)
  • Loading branch information
amenezes committed Mar 10, 2022
2 parents 5a9daeb + ea3b947 commit a89b4bc
Show file tree
Hide file tree
Showing 17 changed files with 220 additions and 114 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
tests:
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', 'pypy-3.8']
python-version: ['3.8', '3.9', '3.10', 'pypy-3.8', 'pypy-3.9']
os: [ubuntu]
fail-fast: true
runs-on: ${{ matrix.os }}-latest
Expand Down
2 changes: 1 addition & 1 deletion aiopytesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
tesseract_version,
)

__version__ = "0.3.0"
__version__ = "0.4.0"
__all__ = [
"__version__",
"confidence",
Expand Down
48 changes: 29 additions & 19 deletions aiopytesseract/base_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,25 @@
from ._logger import logger
from .constants import (
AIOPYTESSERACT_DEFAULT_ENCODING,
AIOPYTESSERACT_DEFAULT_TIMEOUT,
OUTPUT_FILE_EXTENSIONS,
TESSERACT_CMD,
)
from .exceptions import TesseractNotFoundError, TesseractRuntimeError
from .validators import file_exists, language_is_valid, oem_is_valid, psm_is_valid


async def execute_cmd(cmd_args: str):
async def execute_cmd(cmd_args: str, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT):
logger.debug(f"Command: '{TESSERACT_CMD} {shlex.join(shlex.split(cmd_args))}'")
proc = await asyncio.create_subprocess_exec(
TESSERACT_CMD,
*shlex.split(cmd_args),
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*shlex.split(cmd_args),
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
return proc

Expand Down Expand Up @@ -91,12 +95,15 @@ async def _(
lang=lang,
)
try:
proc = await asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
except OSError:
raise TesseractNotFoundError(f"{TESSERACT_CMD} not found.")
Expand Down Expand Up @@ -129,12 +136,15 @@ async def execute_multi_output_cmd(
output=output_file,
)
try:
proc = await asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
except OSError:
raise TesseractNotFoundError(f"{TESSERACT_CMD} not found.")
Expand Down
130 changes: 82 additions & 48 deletions aiopytesseract/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Any, AsyncGenerator, List, Optional, Tuple

import cattr
from aiofiles import tempfile # type: ignore

from .base_command import execute, execute_cmd, execute_multi_output_cmd
Expand All @@ -19,7 +20,7 @@
)
from .exceptions import TesseractRuntimeError
from .file_format import FileFormat
from .parameter import Parameter
from .models import OSD, Box, Data, Parameter


async def languages(config: str = "") -> List:
Expand Down Expand Up @@ -59,7 +60,7 @@ async def confidence(
lang: str = AIOPYTESSERACT_DEFAULT_LANGUAGE,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> Optional[str]:
) -> float:
"""Get script confidence.
:param image: image input to tesseract. (valid values: str)
Expand All @@ -71,14 +72,16 @@ async def confidence(
stdout, stderr = await asyncio.wait_for(
proc.communicate(Path(image).read_bytes()), timeout=timeout
)
m = re.search(
r"(Script.confidence:.(\d{1,10}.\d{1,10})$)",
stdout.decode(AIOPYTESSERACT_DEFAULT_ENCODING),
)
resp = None
if m:
resp = m.group(2)
return resp
try:
confidence_value = float(
re.search( # type: ignore
r"(Script.confidence:.(\d{1,10}.\d{1,10})$)",
stdout.decode(AIOPYTESSERACT_DEFAULT_ENCODING),
).group(2)
)
except AttributeError:
confidence_value = 0.0
return confidence_value


async def deskew(
Expand All @@ -87,7 +90,7 @@ async def deskew(
lang: str = AIOPYTESSERACT_DEFAULT_LANGUAGE,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> Optional[str]:
) -> float:
"""Get Deskew angle.
:param image: image input to tesseract. (valid values: str)
Expand All @@ -100,29 +103,35 @@ async def deskew(
f"{image} stdout -l {lang} --dpi {dpi} --psm 2 --oem {oem}"
)
data = await asyncio.wait_for(proc.stderr.read(), timeout=timeout)
m = re.search(
r"(Deskew.angle:.)(\d{1,10}.\d{1,10}$)",
data.decode(AIOPYTESSERACT_DEFAULT_ENCODING),
)
resp = None
if m:
resp = m.group(2)
return resp
try:
deskew_value = float(
re.search( # type: ignore
r"(Deskew.angle:.)(\d{1,10}.\d{1,10}$)",
data.decode(AIOPYTESSERACT_DEFAULT_ENCODING),
).group(2)
)
except AttributeError:
deskew_value = 0.0
return deskew_value


async def tesseract_parameters():
async def tesseract_parameters() -> List[Parameter]:
"""List of all Tesseract parameters with default value and short description.
reference: https://tesseract-ocr.github.io/tessdoc/tess3/ControlParams.html
"""
proc = await execute_cmd("--print-parameters")
data: bytes = await proc.stdout.read()
data = data.decode(AIOPYTESSERACT_DEFAULT_ENCODING)
datalen = len(data.split("\n")) - 1
raw_data: bytes = await proc.stdout.read()
data = raw_data.decode(AIOPYTESSERACT_DEFAULT_ENCODING)
params = []
for line in data.split("\n")[1:datalen]:
param = line.split()
params.append(Parameter(param[0], param[1], " ".join(param[2:])))
for line in data.split("\n"):
param = re.search(r"(\w+)\s+(-?\d+.?\d{0,})\s+(.*)[^\n]$", line)
if param:
params.append(
cattr.structure_attrs_fromtuple(
[param.group(1), param.group(2), param.group(3)], Parameter # type: ignore
)
)
return params


Expand Down Expand Up @@ -211,7 +220,7 @@ async def image_to_hocr(
psm: int = AIOPYTESSERACT_DEFAULT_PSM,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
):
) -> str:
"""HOCR
:param image: image input to tesseract. (valid values: str, bytes)
Expand All @@ -236,7 +245,7 @@ async def _(
psm: int = AIOPYTESSERACT_DEFAULT_PSM,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> bytes:
) -> str:
output: bytes = await execute(
image,
FileFormat.HOCR,
Expand All @@ -248,7 +257,7 @@ async def _(
user_words,
user_patterns,
)
return output
return output.decode(AIOPYTESSERACT_DEFAULT_ENCODING)


@image_to_hocr.register(bytes)
Expand All @@ -261,7 +270,7 @@ async def _(
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
user_words: Optional[str] = None,
user_patterns: Optional[str] = None,
) -> bytes:
) -> str:
output: bytes = await execute(
image,
FileFormat.HOCR,
Expand All @@ -273,7 +282,7 @@ async def _(
user_words,
user_patterns,
)
return output
return output.decode(AIOPYTESSERACT_DEFAULT_ENCODING)


@singledispatch
Expand Down Expand Up @@ -354,7 +363,7 @@ async def _(
@singledispatch
async def image_to_boxes(
image: Any, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT
) -> str:
) -> List[Box]:
"""Bounding box estimates.
:param image: image input to tesseract. (valid values: str, bytes)
Expand All @@ -364,26 +373,31 @@ async def image_to_boxes(


@image_to_boxes.register(str)
async def _(image: str, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT) -> str:
async def _(image: str, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT) -> List[Box]:
boxes = await image_to_boxes(Path(image).read_bytes(), timeout)
return boxes


@image_to_boxes.register(bytes)
async def _(image: bytes, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT) -> str:
async def _(image: bytes, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT) -> List[Box]:
proc = await execute_cmd("stdin stdout batch.nochop makebox")
stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
if proc.returncode != 0:
raise TesseractRuntimeError(stderr.decode(AIOPYTESSERACT_DEFAULT_ENCODING))
return stdout.decode(AIOPYTESSERACT_DEFAULT_ENCODING) # type: ignore
data = stdout.decode(AIOPYTESSERACT_DEFAULT_ENCODING)
datalen = len(data.split("\n")) - 1
boxes = []
for line in data.split("\n")[:datalen]:
boxes.append(cattr.structure_attrs_fromtuple(line.split(), Box))
return boxes


@singledispatch
async def image_to_data(
image: Any,
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> str:
) -> List[Data]:
"""Information about boxes, confidences, line and page numbers.
:param image: image input to tesseract. (valid values: str, bytes)
Expand All @@ -397,22 +411,28 @@ async def _(
image: str,
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> str:
resp = await image_to_data(Path(image).read_bytes(), dpi, timeout)
return resp
) -> List[Data]:
data_values = await image_to_data(Path(image).read_bytes(), dpi, timeout)
return data_values


@image_to_data.register(bytes)
async def _(
image: bytes,
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> str:
) -> List[Data]:
proc = await execute_cmd(f"stdin stdout -c tessedit_create_tsv=1 --dpi {dpi}")
stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
if proc.returncode != 0:
raise TesseractRuntimeError(stderr.decode(AIOPYTESSERACT_DEFAULT_ENCODING))
return stdout.decode(AIOPYTESSERACT_DEFAULT_ENCODING) # type: ignore
data: str = stdout.decode(AIOPYTESSERACT_DEFAULT_ENCODING)
datalen = len(data.split("\n")) - 1
params = []
for line in data.split("\n")[1:datalen]:
param = line.split()
params.append(cattr.structure_attrs_fromtuple(param, Data)) # type: ignore
return params


@singledispatch
Expand All @@ -421,7 +441,7 @@ async def image_to_osd(
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> str:
) -> OSD:
"""Information about orientation and script detection.
:param image: image input to tesseract. (valid values: str, bytes)
Expand All @@ -438,9 +458,16 @@ async def _(
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> str:
resp = await execute(image, FileFormat.OSD, dpi, None, 0, oem, timeout)
return resp.decode(AIOPYTESSERACT_DEFAULT_ENCODING)
) -> OSD:
data = await execute(image, FileFormat.OSD, dpi, None, 0, oem, timeout)
osd = cattr.structure_attrs_fromtuple(
re.findall( # type: ignore
r"\w+\s?\:\s{0,}(\d+.?\d{0,}|\w+)",
data.decode(AIOPYTESSERACT_DEFAULT_ENCODING),
),
OSD,
)
return osd


@image_to_osd.register(bytes)
Expand All @@ -449,9 +476,16 @@ async def _(
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
oem: int = AIOPYTESSERACT_DEFAULT_OEM,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> str:
resp = await execute(image, FileFormat.OSD, dpi, None, 0, oem, timeout)
return resp.decode(AIOPYTESSERACT_DEFAULT_ENCODING)
) -> OSD:
data = await execute(image, FileFormat.OSD, dpi, None, 0, oem, timeout)
osd = cattr.structure_attrs_fromtuple(
re.findall( # type: ignore
r"\w+\s?\:\s{0,}(\d+.?\d{0,}|\w+)",
data.decode(AIOPYTESSERACT_DEFAULT_ENCODING),
),
OSD,
)
return osd


@asynccontextmanager
Expand Down
4 changes: 4 additions & 0 deletions aiopytesseract/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .box import Box
from .data import Data
from .osd import OSD
from .parameter import Parameter
10 changes: 10 additions & 0 deletions aiopytesseract/models/box.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from dataclasses import dataclass


@dataclass(frozen=True)
class Box:
    """Immutable record for one character box reported by Tesseract.

    ``image_to_boxes`` creates one instance per line of ``makebox`` output by
    splitting the line on whitespace and structuring the resulting tokens
    into this class, in field order.

    The dataclass itself performs no conversion or validation; the
    annotations describe the intended types. Instances are frozen, so they
    compare by value and are hashable.
    """

    # First token of the output line: the recognised character.
    character: str
    # The next four tokens of the output line, in order.
    # NOTE(review): Tesseract box files are usually documented as
    # "symbol left bottom right top page"; if that holds here, ``w``/``h``
    # carry the right/top coordinates rather than a width/height, and the
    # trailing page token is not represented -- confirm against real output.
    x: int
    y: int
    w: int
    h: int
Loading

0 comments on commit a89b4bc

Please sign in to comment.