Skip to content

Commit

Permalink
Fix the bug where Python scripts fail to execute PDF text recognition… (
Browse files Browse the repository at this point in the history
PaddlePaddle#11994)

* Fix the bug where Python scripts fail to execute PDF text recognition tasks, optimize the logic of judging PDF files, and add cases to the quickstart document for layout analysis.

* Add two examples of PDF layout analysis to the quickstart file of ppstructure.

* Add a return comment for the check_img function
  • Loading branch information
guangyunms committed Apr 25, 2024
1 parent 00f0d42 commit f7117ef
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 7 deletions.
24 changes: 17 additions & 7 deletions paddleocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,8 +559,9 @@ def check_img(img, alpha_color=(255, 255, 255)):
file format: jpg, png and other image formats that opencv can decode, as well as gif and pdf formats
storage type: binary image, net image file, local image file
alpha_color: Background color in images in RGBA format
return: numpy.array (h, w, 3)
return: numpy.array (h, w, 3) or list (p, h, w, 3) (p: page of pdf), boolean, boolean
"""
flag_gif, flag_pdf = False, False
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
Expand Down Expand Up @@ -589,17 +590,17 @@ def check_img(img, alpha_color=(255, 255, 255)):
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except:
logger.error("error in loading image:{}".format(image_file))
return None
return None, flag_gif, flag_pdf
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
return None, flag_gif, flag_pdf
# single channel image array.shape:h,w
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
# four channel image array.shape:h,w,c
if isinstance(img, np.ndarray) and len(img.shape) == 3 and img.shape[2] == 4:
img = alpha_to_color(img, alpha_color)
return img
return img, flag_gif, flag_pdf


class PaddleOCR(predict_system.TextSystem):
Expand Down Expand Up @@ -700,9 +701,9 @@ def ocr(
"Since the angle classifier is not initialized, it will not be used during the forward process"
)

img = check_img(img, alpha_color)
img, flag_gif, flag_pdf = check_img(img, alpha_color)
# for infer pdf file
if isinstance(img, list):
if isinstance(img, list) and flag_pdf:
if self.page_num > len(img) or self.page_num == 0:
imgs = img
else:
Expand Down Expand Up @@ -837,7 +838,16 @@ def __call__(
img_idx=0,
alpha_color=(255, 255, 255),
):
img = check_img(img, alpha_color)
img, flag_gif, flag_pdf = check_img(img, alpha_color)
if isinstance(img, list) and flag_pdf:
res_list = []
for index, pdf_img in enumerate(img):
logger.info("processing {}/{} page:".format(index + 1, len(img)))
res, _ = super().__call__(
pdf_img, return_ocr_result_in_table, img_idx=index
)
res_list.append(res)
return res_list
res, _ = super().__call__(img, return_ocr_result_in_table, img_idx=img_idx)
return res

Expand Down
56 changes: 56 additions & 0 deletions ppstructure/docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,62 @@ for line in result:
print(line)
```
```python
# Layout analysis of a PDF file: the PDF path is passed straight to PPStructure,
# and the returned list is iterated as one result per page.
import os
import cv2
from paddleocr import PPStructure, save_structure_res

ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# File name without its extension; splitext only strips the final suffix,
# so names that contain extra dots are kept intact.
img_name = os.path.splitext(os.path.basename(img_path))[0]

result = ocr_engine(img_path)
# Save every page's result, using the page index as the image index.
for index, res in enumerate(result):
    save_structure_res(res, save_folder, img_name, index)

for res in result:
    for line in res:
        # Drop the 'img' entry so only the remaining fields are printed.
        line.pop('img')
        print(line)
```
```python
# Layout analysis of a PDF file, converting the pages to images manually:
# each page is rendered with fitz (PyMuPDF), converted to a BGR numpy array,
# and then passed to PPStructure one image at a time.
import os
import cv2
import numpy as np
from paddleocr import PPStructure, save_structure_res
from paddle.utils import try_import
from PIL import Image

ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# File name without its extension; splitext only strips the final suffix,
# so names that contain extra dots are kept intact.
img_name = os.path.splitext(os.path.basename(img_path))[0]

fitz = try_import("fitz")
imgs = []
with fitz.open(img_path) as pdf:
    # The 2x matrix does not change between pages, so build it once.
    mat = fitz.Matrix(2, 2)
    for pg in range(0, pdf.page_count):
        page = pdf[pg]
        pm = page.get_pixmap(matrix=mat, alpha=False)

        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

        # Pixmap samples are read as RGB; convert to BGR before running the engine.
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

for index, img in enumerate(imgs):
    result = ocr_engine(img)
    # Save this page's result, using the page index as the image index.
    save_structure_res(result, save_folder, img_name, index)
    for line in result:
        # Drop the 'img' entry so only the remaining fields are printed.
        line.pop('img')
        print(line)
```
<a name="224"></a>
#### 2.2.4 表格识别
Expand Down
56 changes: 56 additions & 0 deletions ppstructure/docs/quickstart_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,62 @@ for line in result:
print(line)
```
```python
# Layout analysis of a PDF file: the PDF path is passed straight to PPStructure,
# and the returned list is iterated as one result per page.
import os
import cv2
from paddleocr import PPStructure, save_structure_res

ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# File name without its extension; splitext only strips the final suffix,
# so names that contain extra dots are kept intact.
img_name = os.path.splitext(os.path.basename(img_path))[0]

result = ocr_engine(img_path)
# Save every page's result, using the page index as the image index.
for index, res in enumerate(result):
    save_structure_res(res, save_folder, img_name, index)

for res in result:
    for line in res:
        # Drop the 'img' entry so only the remaining fields are printed.
        line.pop('img')
        print(line)
```
```python
# Layout analysis of a PDF file, converting the pages to images manually:
# each page is rendered with fitz (PyMuPDF), converted to a BGR numpy array,
# and then passed to PPStructure one image at a time.
import os
import cv2
import numpy as np
from paddleocr import PPStructure, save_structure_res
from paddle.utils import try_import
from PIL import Image

ocr_engine = PPStructure(table=False, ocr=True, show_log=True)

save_folder = './output'
img_path = 'ppstructure/recovery/UnrealText.pdf'
# File name without its extension; splitext only strips the final suffix,
# so names that contain extra dots are kept intact.
img_name = os.path.splitext(os.path.basename(img_path))[0]

fitz = try_import("fitz")
imgs = []
with fitz.open(img_path) as pdf:
    # The 2x matrix does not change between pages, so build it once.
    mat = fitz.Matrix(2, 2)
    for pg in range(0, pdf.page_count):
        page = pdf[pg]
        pm = page.get_pixmap(matrix=mat, alpha=False)

        # if width or height > 2000 pixels, don't enlarge the image
        if pm.width > 2000 or pm.height > 2000:
            pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

        # Pixmap samples are read as RGB; convert to BGR before running the engine.
        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        imgs.append(img)

for index, img in enumerate(imgs):
    result = ocr_engine(img)
    # Save this page's result, using the page index as the image index.
    save_structure_res(result, save_folder, img_name, index)
    for line in result:
        # Drop the 'img' entry so only the remaining fields are printed.
        line.pop('img')
        print(line)
```
<a name="224"></a>
#### 2.2.4 table recognition
Expand Down

0 comments on commit f7117ef

Please sign in to comment.