Fix the bug where Python scripts fail to execute PDF text recognition… (PaddlePaddle#11994)

* Fix the bug where Python scripts fail to execute PDF text recognition tasks, optimize the logic for detecting PDF files, and add layout-analysis cases to the quickstart document.
* Add two examples of PDF layout analysis to the quickstart file of ppstructure.
* Add a return comment for the check_img function.
Liyulingyue · Apr 25, 2024 · f7117ef · f7117ef
1 parent 00f0d42
commit f7117ef
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 7 deletions.
diff --git a/paddleocr.py b/paddleocr.py
@@ -559,8 +559,9 @@ def check_img(img, alpha_color=(255, 255, 255)):
  file format: jpg, png and other image formats that opencv can decode, as well as gif and pdf formats
  storage type: binary image, net image file, local image file
  alpha_color: Background color in images in RGBA format
- return: numpy.array (h, w, 3)
+ return: numpy.array (h, w, 3) or list (p, h, w, 3) (p: page of pdf), boolean, boolean
  """
+ flag_gif, flag_pdf = False, False
  if isinstance(img, bytes):
  img = img_decode(img)
  if isinstance(img, str):
@@ -589,17 +590,17 @@ def check_img(img, alpha_color=(255, 255, 255)):
  img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
  except:
  logger.error("error in loading image:{}".format(image_file))
- return None
+ return None, flag_gif, flag_pdf
  if img is None:
  logger.error("error in loading image:{}".format(image_file))
- return None
+ return None, flag_gif, flag_pdf
  # single channel image array.shape:h,w
  if isinstance(img, np.ndarray) and len(img.shape) == 2:
  img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
  # four channel image array.shape:h,w,c
  if isinstance(img, np.ndarray) and len(img.shape) == 3 and img.shape[2] == 4:
  img = alpha_to_color(img, alpha_color)
- return img
+ return img, flag_gif, flag_pdf
 
 
 class PaddleOCR(predict_system.TextSystem):
@@ -700,9 +701,9 @@ def ocr(
  "Since the angle classifier is not initialized, it will not be used during the forward process"
  )
 
- img = check_img(img, alpha_color)
+ img, flag_gif, flag_pdf = check_img(img, alpha_color)
  # for infer pdf file
- if isinstance(img, list):
+ if isinstance(img, list) and flag_pdf:
  if self.page_num > len(img) or self.page_num == 0:
  imgs = img
  else:
@@ -837,7 +838,16 @@ def __call__(
  img_idx=0,
  alpha_color=(255, 255, 255),
  ):
- img = check_img(img, alpha_color)
+ img, flag_gif, flag_pdf = check_img(img, alpha_color)
+ if isinstance(img, list) and flag_pdf:
+ res_list = []
+ for index, pdf_img in enumerate(img):
+ logger.info("processing {}/{} page:".format(index + 1, len(img)))
+ res, _ = super().__call__(
+ pdf_img, return_ocr_result_in_table, img_idx=index
+ )
+ res_list.append(res)
+ return res_list
  res, _ = super().__call__(img, return_ocr_result_in_table, img_idx=img_idx)
  return res
 

diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md
@@ -209,6 +209,62 @@ for line in result:
  print(line)
 ```
 
+```python
+import os
+import cv2
+from paddleocr import PPStructure,save_structure_res
+
+ocr_engine = PPStructure(table=False, ocr=True, show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/recovery/UnrealText.pdf'
+result = ocr_engine(img_path)
+for index, res in enumerate(result):
+ save_structure_res(res, save_folder, os.path.basename(img_path).split('.')[0], index)
+
+for res in result:
+ for line in res:
+ line.pop('img')
+ print(line)
+```
+
+```python
+import os
+import cv2
+import numpy as np
+from paddleocr import PPStructure,save_structure_res
+from paddle.utils import try_import
+from PIL import Image
+
+ocr_engine = PPStructure(table=False, ocr=True, show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/recovery/UnrealText.pdf'
+
+fitz = try_import("fitz")
+imgs = []
+with fitz.open(img_path) as pdf:
+ for pg in range(0, pdf.page_count):
+ page = pdf[pg]
+ mat = fitz.Matrix(2, 2)
+ pm = page.get_pixmap(matrix=mat, alpha=False)
+
+ # if width or height > 2000 pixels, don't enlarge the image
+ if pm.width > 2000 or pm.height > 2000:
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+ img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+ img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+ imgs.append(img)
+
+for index, img in enumerate(imgs):
+ result = ocr_engine(img)
+ save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0], index)
+ for line in result:
+ line.pop('img')
+ print(line)
+```
+
 <a name="224"></a>
 
 #### 2.2.4 表格识别

diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md
@@ -192,6 +192,62 @@ for line in result:
  print(line)
 ```
 
+```python
+import os
+import cv2
+from paddleocr import PPStructure,save_structure_res
+
+ocr_engine = PPStructure(table=False, ocr=True, show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/recovery/UnrealText.pdf'
+result = ocr_engine(img_path)
+for index, res in enumerate(result):
+ save_structure_res(res, save_folder, os.path.basename(img_path).split('.')[0], index)
+
+for res in result:
+ for line in res:
+ line.pop('img')
+ print(line)
+```
+
+```python
+import os
+import cv2
+import numpy as np
+from paddleocr import PPStructure,save_structure_res
+from paddle.utils import try_import
+from PIL import Image
+
+ocr_engine = PPStructure(table=False, ocr=True, show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/recovery/UnrealText.pdf'
+
+fitz = try_import("fitz")
+imgs = []
+with fitz.open(img_path) as pdf:
+ for pg in range(0, pdf.page_count):
+ page = pdf[pg]
+ mat = fitz.Matrix(2, 2)
+ pm = page.get_pixmap(matrix=mat, alpha=False)
+
+ # if width or height > 2000 pixels, don't enlarge the image
+ if pm.width > 2000 or pm.height > 2000:
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+ img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+ img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+ imgs.append(img)
+
+for index, img in enumerate(imgs):
+ result = ocr_engine(img)
+ save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0], index)
+ for line in result:
+ line.pop('img')
+ print(line)
+```
+
 <a name="224"></a>
 #### 2.2.4 table recognition