Skip to content

Commit

Permalink
Merge branch 'rl-0.7.0'
Browse files: browse the repository at this point in the history
  • Loading branch information
amenezes committed Apr 22, 2022
2 parents c80ca2c + f9ed17c commit 10e2801
Showing 9 changed files with 127 additions and 72 deletions.
2 changes: 1 addition & 1 deletion aiopytesseract/__init__.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
image_to_string, languages, run, tesseract_parameters,
tesseract_version)

__version__ = "0.6.0"
__version__ = "0.7.0"
__all__ = [
"__version__",
"confidence",
72 changes: 43 additions & 29 deletions aiopytesseract/base_command.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import asyncio
import shlex
from collections import deque
from functools import singledispatch
from functools import lru_cache, singledispatch
from pathlib import Path
from typing import Any, List, Optional, Tuple

from ._logger import logger
from .constants import (
AIOPYTESSERACT_DEFAULT_BUILD_CMD_CACHE,
AIOPYTESSERACT_DEFAULT_ENCODING,
AIOPYTESSERACT_DEFAULT_TIMEOUT,
OUTPUT_FILE_EXTENSIONS,
@@ -63,7 +64,7 @@ async def _(
user_patterns: Optional[str] = None,
tessdata_dir: Optional[str] = None,
) -> bytes:
await file_exists(image)
file_exists(image)
response: bytes = await execute(
Path(image).read_bytes(),
output_format,
@@ -93,7 +94,7 @@ async def _(
tessdata_dir: Optional[str] = None,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> bytes:
cmd_args = await _build_cmd_args(
cmd_args = _build_cmd_args(
output_extension=output_format,
dpi=dpi,
psm=psm,
@@ -103,17 +104,23 @@ async def _(
tessdata_dir=tessdata_dir,
lang=lang,
)
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
try:
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
stdout, stderr = await asyncio.wait_for(
proc.communicate(image), timeout=timeout
)
except asyncio.TimeoutError:
proc.kill()
raise RuntimeError("Tesseract process timeout")
if proc.returncode != ReturnCode.SUCCESS:
raise TesseractRuntimeError(stderr.decode(encoding))
return stdout
@@ -133,7 +140,7 @@ async def execute_multi_output_cmd(
tessdata_dir: Optional[str] = None,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> Tuple[str, ...]:
cmd_args = await _build_cmd_args(
cmd_args = _build_cmd_args(
output_extension=output_format,
dpi=dpi,
psm=psm,
@@ -144,25 +151,30 @@ async def execute_multi_output_cmd(
lang=lang,
output=output_file,
)
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
_, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
try:
proc = await asyncio.wait_for(
asyncio.create_subprocess_exec(
TESSERACT_CMD,
*cmd_args,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
),
timeout=timeout,
)
_, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
except asyncio.TimeoutError:
proc.kill()
raise RuntimeError("Tesseract process timeout")
if proc.returncode != ReturnCode.SUCCESS:
raise TesseractRuntimeError(stderr.decode(encoding))
return tuple(
[f"{output_file}{OUTPUT_FILE_EXTENSIONS[ext]}" for ext in output_format.split()] # type: ignore
)


async def _build_cmd_args(
@lru_cache(maxsize=AIOPYTESSERACT_DEFAULT_BUILD_CMD_CACHE)
def _build_cmd_args(
output_extension: str,
dpi: int,
psm: int,
@@ -173,7 +185,9 @@ async def _build_cmd_args(
lang: Optional[str] = None,
output: str = "stdout",
) -> List[str]:
await asyncio.gather(psm_is_valid(psm), oem_is_valid(oem))
psm_is_valid(psm)
oem_is_valid(oem)

cmd_args = deque(
["stdin", f"{output}", "--dpi", f"{dpi}", "--psm", f"{psm}", "--oem", f"{oem}"]
)
@@ -190,7 +204,7 @@ async def _build_cmd_args(
cmd_args.append(tessdata_dir)

if lang:
await language_is_valid(lang)
language_is_valid(lang)
cmd_args.append("-l")
cmd_args.append(lang)

50 changes: 35 additions & 15 deletions aiopytesseract/commands.py
Original file line number Diff line number Diff line change
@@ -76,17 +76,22 @@ async def confidence(
:param oem: ocr engine modes (default: 3)
:param timeout: command timeout (default: 30)
"""
proc = await execute_cmd(f"stdin stdout -l {lang} --dpi {dpi} --psm 0 --oem {oem}")
stdout, _ = await asyncio.wait_for(
proc.communicate(Path(image).read_bytes()), timeout=timeout
)
try:
proc = await execute_cmd(
f"stdin stdout -l {lang} --dpi {dpi} --psm 0 --oem {oem}"
)
stdout, _ = await asyncio.wait_for(
proc.communicate(Path(image).read_bytes()), timeout=timeout
)
confidence_value = float(
re.search( # type: ignore
r"(Script.confidence:.(\d{1,10}.\d{1,10})$)",
stdout.decode(encoding),
).group(2)
)
except asyncio.TimeoutError:
proc.kill()
raise RuntimeError("Tesseract process timeout")
except AttributeError:
confidence_value = 0.0
return confidence_value
@@ -108,17 +113,20 @@ async def deskew(
:param lang: tesseract language. (Format: eng, eng+por, eng+por+fra)
:param timeout: command timeout (default: 30)
"""
proc = await execute_cmd(
f"{image} stdout -l {lang} --dpi {dpi} --psm 2 --oem {oem}"
)
data = await asyncio.wait_for(proc.stderr.read(), timeout=timeout)
try:
proc = await execute_cmd(
f"{image} stdout -l {lang} --dpi {dpi} --psm 2 --oem {oem}"
)
data = await asyncio.wait_for(proc.stderr.read(), timeout=timeout)
deskew_value = float(
re.search( # type: ignore
r"(Deskew.angle:.)(\d{1,10}.\d{1,10}$)",
data.decode(encoding),
).group(2)
)
except asyncio.TimeoutError:
proc.kill()
raise RuntimeError("Tesseract process timeout")
except AttributeError:
deskew_value = 0.0
return deskew_value
@@ -404,7 +412,7 @@ async def image_to_boxes(

@image_to_boxes.register(str)
async def _(image: str, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT) -> List[Box]:
await file_exists(image)
file_exists(image)
boxes = await image_to_boxes(Path(image).read_bytes(), timeout)
return boxes

@@ -415,8 +423,14 @@ async def _(
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> List[Box]:
proc = await execute_cmd("stdin stdout batch.nochop makebox")
stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
try:
proc = await execute_cmd("stdin stdout batch.nochop makebox")
stdout, stderr = await asyncio.wait_for(
proc.communicate(image), timeout=timeout
)
except asyncio.TimeoutError:
proc.kill()
raise RuntimeError("Tesseract process timeout")
if proc.returncode != ReturnCode.SUCCESS:
raise TesseractRuntimeError(stderr.decode(encoding))
data = stdout.decode(encoding)
@@ -447,7 +461,7 @@ async def _(
dpi: int = AIOPYTESSERACT_DEFAULT_DPI,
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
) -> List[Data]:
await file_exists(image)
file_exists(image)
data_values = await image_to_data(Path(image).read_bytes(), dpi, timeout)
return data_values

@@ -459,8 +473,14 @@ async def _(
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> List[Data]:
proc = await execute_cmd(f"stdin stdout -c tessedit_create_tsv=1 --dpi {dpi}")
stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout)
try:
proc = await execute_cmd(f"stdin stdout -c tessedit_create_tsv=1 --dpi {dpi}")
stdout, stderr = await asyncio.wait_for(
proc.communicate(image), timeout=timeout
)
except asyncio.TimeoutError:
proc.kill()
raise RuntimeError("Tesseract process timeout")
if proc.returncode != ReturnCode.SUCCESS:
raise TesseractRuntimeError(stderr.decode(encoding))
data: str = stdout.decode(encoding)
@@ -498,7 +518,7 @@ async def _(
timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> OSD:
await file_exists(image)
file_exists(image)
osd = await image_to_osd(Path(image).read_bytes(), dpi, oem, timeout, encoding)
return osd

1 change: 1 addition & 0 deletions aiopytesseract/constants.py
Original file line number Diff line number Diff line change
@@ -161,6 +161,7 @@
AIOPYTESSERACT_DEFAULT_DPI: int = 200
AIOPYTESSERACT_DEFAULT_PSM: int = 3
AIOPYTESSERACT_DEFAULT_OEM: int = 3
AIOPYTESSERACT_DEFAULT_BUILD_CMD_CACHE: int = 1

OUTPUT_FILE_EXTENSIONS = {
FileFormat.ALTO: ".xml",
8 changes: 4 additions & 4 deletions aiopytesseract/validators.py
Original file line number Diff line number Diff line change
@@ -6,22 +6,22 @@
OEMInvalidException, PSMInvalidException)


async def psm_is_valid(psm: int) -> None:
def psm_is_valid(psm: int) -> None:
    """Check that ``psm`` is a known page segmentation mode.

    :param psm: page segmentation mode to check.
    :raises PSMInvalidException: when ``psm`` is not a key of
        ``PAGE_SEGMENTATION_MODES``.
    """
    known_modes = PAGE_SEGMENTATION_MODES.keys()
    if psm in known_modes:
        return None
    raise PSMInvalidException


async def oem_is_valid(oem: int) -> None:
def oem_is_valid(oem: int) -> None:
    """Check that ``oem`` is a known OCR engine mode.

    :param oem: OCR engine mode to check.
    :raises OEMInvalidException: when ``oem`` is not a key of
        ``OCR_ENGINE_MODES``.
    """
    known_modes = OCR_ENGINE_MODES.keys()
    if oem in known_modes:
        return None
    raise OEMInvalidException


async def file_exists(file_path: str) -> None:
def file_exists(file_path: str) -> None:
    """Ensure that ``file_path`` refers to an existing filesystem entry.

    :param file_path: path to check.
    :raises NoSuchFileException: when ``Path(file_path).exists()`` is false.
    """
    candidate = Path(file_path)
    if candidate.exists():
        return None
    raise NoSuchFileException(f"No such file: '{file_path}'")


async def language_is_valid(language: str) -> None:
def language_is_valid(language: str) -> None:
for lang in language.split("+"):
if lang not in TESSERACT_LANGUAGES:
raise LanguageInvalidException(
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -56,7 +56,6 @@ ignore = E501 # line too long
D103 # missing docstring in public function
D105 # missing docstring in magic method
D107 # missing docstring in __init__
W503 # line break before binary operator
verbose = 2
doctests = True
show_source = True
2 changes: 1 addition & 1 deletion tests/test_base_command.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ async def test_execute_unsupported(input_data):

@pytest.mark.asyncio
async def test_build_cmd_args_with_user_patterns():
command = await aiopytesseract.base_command._build_cmd_args(
command = aiopytesseract.base_command._build_cmd_args(
"stdout",
200,
3,
31 changes: 30 additions & 1 deletion tests/test_commands.py
Original file line number Diff line number Diff line change
@@ -26,7 +26,6 @@ async def test_tesseract_version(func):
assert len(version) > 0


# run
@pytest.mark.asyncio
async def test_run_with_type_not_supported():
with pytest.raises(NotImplementedError):
@@ -84,3 +83,33 @@ async def test_tesseract_parameters():
parameters = await aiopytesseract.tesseract_parameters()
assert isinstance(parameters, list)
assert isinstance(parameters[0], Parameter)


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "func, timeout",
    [
        pytest.param(aiopytesseract.image_to_string, 0.1),
        pytest.param(aiopytesseract.image_to_hocr, 0.1),
        pytest.param(aiopytesseract.image_to_osd, 0.1),
        pytest.param(aiopytesseract.image_to_pdf, 0.1),
        pytest.param(aiopytesseract.image_to_data, 0.1),
        pytest.param(aiopytesseract.image_to_boxes, 0.1),
        pytest.param(aiopytesseract.deskew, 0.01),
        pytest.param(aiopytesseract.confidence, 0.1),
    ],
)
async def test_method_timeout(func, timeout):
    """Expect each listed command to raise ``RuntimeError`` when called on the
    sample image with a very short ``timeout``.
    """
    sample_image = "tests/samples/file-sample_150kB.png"
    with pytest.raises(RuntimeError):
        await func(sample_image, timeout=timeout)


# The asyncio marker was missing: without it (outside pytest-asyncio auto mode)
# the coroutine is never awaited, so the timeout path would go untested.
@pytest.mark.asyncio
async def test_run_timeout():
    """Expect ``aiopytesseract.run`` to raise ``RuntimeError`` when invoked on
    the sample image with a 0.1 second timeout.
    """
    # Read the fixture outside the ``raises`` block so that only the call
    # under test is inside the assertion scope.
    image = Path("tests/samples/file-sample_150kB.png").read_bytes()
    with pytest.raises(RuntimeError):
        async with aiopytesseract.run(
            image,
            "xxx",
            "alto tsv txt",
            timeout=0.1,
        ) as out:
            print(out)
Loading

0 comments on commit 10e2801

Please sign in to comment.