From b5eedf727e96a6efec493ab0574e372c8cc21bf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?=
 <83450930+Liyulingyue@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:54:49 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90OCR=20Issue=20No.9=E3=80=91=E7=A7=BB?=
 =?UTF-8?q?=E9=99=A4=E6=98=8E=E7=A1=AE=E4=B8=8D=E9=80=82=E5=90=88=E6=94=BE?=
 =?UTF-8?q?=E5=9C=A8ppocr=E4=BE=9D=E8=B5=96=E4=B8=AD=E7=9A=84=E4=BE=9D?=
 =?UTF-8?q?=E8=B5=96=E9=A1=B9=20(#11946)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modify requirements

* Update requirements.txt

* Update requirements.txt

* try import pdfconvert

* try import lxml

* try import lxml

* try import premailer

* try import openpyxl

* Apply suggestions from code review
---
 paddleocr.py                                  |  2 ++
 ppstructure/pdf2word/pdf2word.py              |  4 ++-
 ppstructure/predict_system.py                 |  2 ++
 ppstructure/recovery/requirements.txt         |  1 -
 .../table/table_metric/table_metric.py        |  5 ++-
 ppstructure/table/tablepyxl/style.py          | 33 +++++++++++--------
 ppstructure/table/tablepyxl/tablepyxl.py      | 16 ++++++---
 requirements.txt                              |  6 +---
 8 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/paddleocr.py b/paddleocr.py
index d03a6932e2..dcac47802e 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -19,6 +19,7 @@
 __dir__ = os.path.dirname(__file__)
 
 import paddle
+from paddle.utils import try_import
 
 sys.path.append(os.path.join(__dir__, ""))
 
@@ -910,6 +911,7 @@ def main():
                 img = cv2.imread(img_path)
 
             if args.recovery and args.use_pdf2docx_api and flag_pdf:
+                try_import("pdf2docx")
                 from pdf2docx.converter import Converter
 
                 docx_file = os.path.join(args.output, "{}.docx".format(img_name))
diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py
index 06ae555598..c9e61eee8f 100644
--- a/ppstructure/pdf2word/pdf2word.py
+++ b/ppstructure/pdf2word/pdf2word.py
@@ -25,7 +25,6 @@
 
 fitz = try_import("fitz")
 from PIL import Image
-from pdf2docx.converter import Converter
 from qtpy.QtWidgets import (
     QApplication,
     QWidget,
@@ -209,6 +208,9 @@ def run(self):
                     break
                 # using use_pdf2docx_api for PDF parsing
                 if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf":
+                    try_import("pdf2docx")
+                    from pdf2docx.converter import Converter
+
                     self.totalPageCnt += 1
                     self.progressBarRange.emit(self.totalPageCnt)
                     print("===============using use_pdf2docx_api===============")
diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py
index 9073e87ee1..6148cc8901 100644
--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@@ -28,6 +28,7 @@
 import logging
 from copy import deepcopy
 
+from paddle.utils import try_import
 from ppocr.utils.utility import get_image_file_list, check_and_read
 from ppocr.utils.logging import get_logger
 from ppocr.utils.visual import draw_ser_results, draw_re_results
@@ -300,6 +301,7 @@ def main(args):
         img_name = os.path.basename(image_file).split(".")[0]
 
         if args.recovery and args.use_pdf2docx_api and flag_pdf:
+            try_import("pdf2docx")
             from pdf2docx.converter import Converter
 
             os.makedirs(args.output, exist_ok=True)
diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt
index 761b9d7c3e..bad600d728 100644
--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
@@ -2,4 +2,3 @@ python-docx
 beautifulsoup4
 fonttools>=4.24.0
 fire>=0.3.0
-pdf2docx
\ No newline at end of file
diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py
index d5ba6a0afb..327b87adbf 100755
--- a/ppstructure/table/table_metric/table_metric.py
+++ b/ppstructure/table/table_metric/table_metric.py
@@ -12,10 +12,10 @@
 from rapidfuzz.distance import Levenshtein
 from apted import APTED, Config
 from apted.helpers import Tree
-from lxml import etree, html
 from collections import deque
 from .parallel import parallel_process
 from tqdm import tqdm
+from paddle.utils import try_import
 
 
 class TableTree(Tree):
@@ -161,6 +161,9 @@ def evaluate(self, pred, true):
         """Computes TEDS score between the prediction and the ground truth of a
         given sample
         """
+        try_import("lxml")
+        from lxml import etree, html
+
         if (not pred) or (not true):
             return 0.0
         parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
diff --git a/ppstructure/table/tablepyxl/style.py b/ppstructure/table/tablepyxl/style.py
index 4787e7d377..dfd0f2478a 100644
--- a/ppstructure/table/tablepyxl/style.py
+++ b/ppstructure/table/tablepyxl/style.py
@@ -1,19 +1,26 @@
 # This is where we handle translating css styles into openpyxl styles
 # and cascading those from parent to child in the dom.
 
-from openpyxl.cell import cell
-from openpyxl.styles import (
-    Font,
-    Alignment,
-    PatternFill,
-    NamedStyle,
-    Border,
-    Side,
-    Color,
-)
-from openpyxl.styles.fills import FILL_SOLID
-from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
-from openpyxl.styles.colors import BLACK
+try:
+    from openpyxl.cell import cell
+    from openpyxl.styles import (
+        Font,
+        Alignment,
+        PatternFill,
+        NamedStyle,
+        Border,
+        Side,
+        Color,
+    )
+    from openpyxl.styles.fills import FILL_SOLID
+    from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
+    from openpyxl.styles.colors import BLACK
+except:
+    import warnings
+
+    warnings.warn(
+        "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure."
+    )
 
 FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"
 
diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py
index 95b75d1ac7..92f6536052 100644
--- a/ppstructure/table/tablepyxl/tablepyxl.py
+++ b/ppstructure/table/tablepyxl/tablepyxl.py
@@ -1,11 +1,9 @@
 # Do imports like python3 so our package works for 2 and 3
 from __future__ import absolute_import
 
-from lxml import html
-from openpyxl import Workbook
-from openpyxl.utils import get_column_letter
-from premailer import Premailer
+
 from tablepyxl.style import Table
+from paddle.utils import try_import
 
 
 def string_to_int(s):
@@ -15,6 +13,9 @@ def string_to_int(s):
 
 
 def get_Tables(doc):
+    try_import("lxml")
+    from lxml import etree, html
+
     tree = html.fromstring(doc)
     comments = tree.xpath("//comment()")
     for comment in comments:
@@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1):
     Writes every tr child element of elem to a row in the worksheet
     returns the next row after all rows are written
     """
+    try_import("openpyxl")
     from openpyxl.cell.cell import MergedCell
+    from openpyxl.utils import get_column_letter
 
     initial_column = column
     for table_row in elem.rows:
@@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None):
     every table in the document.
     The workbook is returned
     """
+    try_import("premailer")
+    try_import("openpyxl")
+    from premailer import Premailer
+    from openpyxl import Workbook
+
     if not wb:
         wb = Workbook()
         wb.remove(wb.active)
diff --git a/requirements.txt b/requirements.txt
index aee6d2e2df..7df7143a12 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,9 +9,5 @@ rapidfuzz
 opencv-python<=4.6.0.66
 opencv-contrib-python<=4.6.0.66
 cython
-lxml
-premailer
-openpyxl
-attrdict
 Pillow>=10.0.0
-pyyaml
\ No newline at end of file
+pyyaml