From b5eedf727e96a6efec493ab0574e372c8cc21bf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:54:49 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90OCR=20Issue=20No.9=E3=80=91=E7=A7=BB?= =?UTF-8?q?=E9=99=A4=E6=98=8E=E7=A1=AE=E4=B8=8D=E9=80=82=E5=90=88=E6=94=BE?= =?UTF-8?q?=E5=9C=A8ppocr=E4=BE=9D=E8=B5=96=E4=B8=AD=E7=9A=84=E4=BE=9D?= =?UTF-8?q?=E8=B5=96=E9=A1=B9=20(#11946)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify requestions * Update requirements.txt * Update requirements.txt * try import pdfconvert * try import lxml * try import lxml * try import premailer * try import openpyxl * Apply suggestions from code review --- paddleocr.py | 2 ++ ppstructure/pdf2word/pdf2word.py | 4 ++- ppstructure/predict_system.py | 2 ++ ppstructure/recovery/requirements.txt | 1 - .../table/table_metric/table_metric.py | 5 ++- ppstructure/table/tablepyxl/style.py | 33 +++++++++++-------- ppstructure/table/tablepyxl/tablepyxl.py | 16 ++++++--- requirements.txt | 6 +--- 8 files changed, 44 insertions(+), 25 deletions(-) diff --git a/paddleocr.py b/paddleocr.py index d03a6932e2..dcac47802e 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -19,6 +19,7 @@ __dir__ = os.path.dirname(__file__) import paddle +from paddle.utils import try_import sys.path.append(os.path.join(__dir__, "")) @@ -910,6 +911,7 @@ def main(): img = cv2.imread(img_path) if args.recovery and args.use_pdf2docx_api and flag_pdf: + try_import("pdf2docx") from pdf2docx.converter import Converter docx_file = os.path.join(args.output, "{}.docx".format(img_name)) diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py index 06ae555598..c9e61eee8f 100644 --- a/ppstructure/pdf2word/pdf2word.py +++ b/ppstructure/pdf2word/pdf2word.py @@ -25,7 +25,6 @@ fitz = try_import("fitz") from PIL import Image -from pdf2docx.converter import Converter from qtpy.QtWidgets import ( QApplication, QWidget, @@ -209,6 +208,9 @@ def run(self): break # using use_pdf2docx_api for PDF parsing if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf": + try_import("pdf2docx") + from pdf2docx.converter import Converter + self.totalPageCnt += 1 self.progressBarRange.emit(self.totalPageCnt) print("===============using use_pdf2docx_api===============") diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 9073e87ee1..6148cc8901 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -28,6 +28,7 @@ import logging from copy import deepcopy +from paddle.utils import try_import from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger from ppocr.utils.visual import draw_ser_results, draw_re_results @@ -300,6 +301,7 @@ def main(args): img_name = os.path.basename(image_file).split(".")[0] if args.recovery and args.use_pdf2docx_api and flag_pdf: + try_import("pdf2docx") from pdf2docx.converter import Converter os.makedirs(args.output, exist_ok=True) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 761b9d7c3e..bad600d728 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -2,4 +2,3 @@ python-docx beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 -pdf2docx \ No newline at end of file diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py index d5ba6a0afb..327b87adbf 100755 --- a/ppstructure/table/table_metric/table_metric.py +++ b/ppstructure/table/table_metric/table_metric.py @@ -12,10 +12,10 @@ from rapidfuzz.distance import Levenshtein from apted import APTED, Config from apted.helpers import Tree -from lxml import etree, html from collections import deque from .parallel import parallel_process from tqdm import tqdm +from paddle.utils import try_import class TableTree(Tree): @@ -161,6 +161,9 @@ def evaluate(self, pred, true): """Computes TEDS score between the prediction and the ground truth of a given sample """ + try_import("lxml") + from lxml import etree, html + if (not pred) or (not true): return 0.0 parser = html.HTMLParser(remove_comments=True, encoding="utf-8") diff --git a/ppstructure/table/tablepyxl/style.py b/ppstructure/table/tablepyxl/style.py index 4787e7d377..dfd0f2478a 100644 --- a/ppstructure/table/tablepyxl/style.py +++ b/ppstructure/table/tablepyxl/style.py @@ -1,19 +1,26 @@ # This is where we handle translating css styles into openpyxl styles # and cascading those from parent to child in the dom. -from openpyxl.cell import cell -from openpyxl.styles import ( - Font, - Alignment, - PatternFill, - NamedStyle, - Border, - Side, - Color, -) -from openpyxl.styles.fills import FILL_SOLID -from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE -from openpyxl.styles.colors import BLACK +try: + from openpyxl.cell import cell + from openpyxl.styles import ( + Font, + Alignment, + PatternFill, + NamedStyle, + Border, + Side, + Color, + ) + from openpyxl.styles.fills import FILL_SOLID + from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE + from openpyxl.styles.colors import BLACK +except: + import warnings + + warnings.warn( + "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure." + ) FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy" diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py index 95b75d1ac7..92f6536052 100644 --- a/ppstructure/table/tablepyxl/tablepyxl.py +++ b/ppstructure/table/tablepyxl/tablepyxl.py @@ -1,11 +1,9 @@ # Do imports like python3 so our package works for 2 and 3 from __future__ import absolute_import -from lxml import html -from openpyxl import Workbook -from openpyxl.utils import get_column_letter -from premailer import Premailer + from tablepyxl.style import Table +from paddle.utils import try_import def string_to_int(s): @@ -15,6 +13,9 @@ def string_to_int(s): def get_Tables(doc): + try_import("lxml") + from lxml import etree, html + tree = html.fromstring(doc) comments = tree.xpath("//comment()") for comment in comments: @@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1): Writes every tr child element of elem to a row in the worksheet returns the next row after all rows are written """ + try_import("openpyxl") from openpyxl.cell.cell import MergedCell + from openpyxl.utils import get_column_letter initial_column = column for table_row in elem.rows: @@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None): every table in the document. The workbook is returned """ + try_import("premailer") + try_import("openpyxl") + from premailer import Premailer + from openpyxl import Workbook + if not wb: wb = Workbook() wb.remove(wb.active) diff --git a/requirements.txt b/requirements.txt index aee6d2e2df..7df7143a12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,9 +9,5 @@ rapidfuzz opencv-python<=4.6.0.66 opencv-contrib-python<=4.6.0.66 cython -lxml -premailer -openpyxl -attrdict Pillow>=10.0.0 -pyyaml \ No newline at end of file +pyyaml