Skip to content

Commit

Permalink
【OCR Issue No.9】移除明确不适合放在ppocr依赖中的依赖项 (PaddlePaddle#11946)
Browse files Browse the repository at this point in the history
* modify requirements

* Update requirements.txt

* Update requirements.txt

* try import pdfconvert

* try import lxml

* try import lxml

* try import premailer

* try import openpyxl

* Apply suggestions from code review
  • Loading branch information
Liyulingyue committed Apr 26, 2024
1 parent b32677c commit b5eedf7
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 25 deletions.
2 changes: 2 additions & 0 deletions paddleocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
__dir__ = os.path.dirname(__file__)

import paddle
from paddle.utils import try_import

sys.path.append(os.path.join(__dir__, ""))

Expand Down Expand Up @@ -910,6 +911,7 @@ def main():
img = cv2.imread(img_path)

if args.recovery and args.use_pdf2docx_api and flag_pdf:
try_import("pdf2docx")
from pdf2docx.converter import Converter

docx_file = os.path.join(args.output, "{}.docx".format(img_name))
Expand Down
4 changes: 3 additions & 1 deletion ppstructure/pdf2word/pdf2word.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

fitz = try_import("fitz")
from PIL import Image
from pdf2docx.converter import Converter
from qtpy.QtWidgets import (
QApplication,
QWidget,
Expand Down Expand Up @@ -209,6 +208,9 @@ def run(self):
break
# using use_pdf2docx_api for PDF parsing
if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf":
try_import("pdf2docx")
from pdf2docx.converter import Converter

self.totalPageCnt += 1
self.progressBarRange.emit(self.totalPageCnt)
print("===============using use_pdf2docx_api===============")
Expand Down
2 changes: 2 additions & 0 deletions ppstructure/predict_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import logging
from copy import deepcopy

from paddle.utils import try_import
from ppocr.utils.utility import get_image_file_list, check_and_read
from ppocr.utils.logging import get_logger
from ppocr.utils.visual import draw_ser_results, draw_re_results
Expand Down Expand Up @@ -300,6 +301,7 @@ def main(args):
img_name = os.path.basename(image_file).split(".")[0]

if args.recovery and args.use_pdf2docx_api and flag_pdf:
try_import("pdf2docx")
from pdf2docx.converter import Converter

os.makedirs(args.output, exist_ok=True)
Expand Down
1 change: 0 additions & 1 deletion ppstructure/recovery/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@ python-docx
beautifulsoup4
fonttools>=4.24.0
fire>=0.3.0
pdf2docx
5 changes: 4 additions & 1 deletion ppstructure/table/table_metric/table_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
from rapidfuzz.distance import Levenshtein
from apted import APTED, Config
from apted.helpers import Tree
from lxml import etree, html
from collections import deque
from .parallel import parallel_process
from tqdm import tqdm
from paddle.utils import try_import


class TableTree(Tree):
Expand Down Expand Up @@ -161,6 +161,9 @@ def evaluate(self, pred, true):
"""Computes TEDS score between the prediction and the ground truth of a
given sample
"""
try_import("lxml")
from lxml import etree, html

if (not pred) or (not true):
return 0.0
parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
Expand Down
33 changes: 20 additions & 13 deletions ppstructure/table/tablepyxl/style.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
# This is where we handle translating css styles into openpyxl styles
# and cascading those from parent to child in the dom.

from openpyxl.cell import cell
from openpyxl.styles import (
Font,
Alignment,
PatternFill,
NamedStyle,
Border,
Side,
Color,
)
from openpyxl.styles.fills import FILL_SOLID
from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
from openpyxl.styles.colors import BLACK
# openpyxl is treated as an optional dependency here: if it cannot be
# imported, emit a warning instead of failing when this module is imported.
try:
    from openpyxl.cell import cell
    from openpyxl.styles import (
        Font,
        Alignment,
        PatternFill,
        NamedStyle,
        Border,
        Side,
        Color,
    )
    from openpyxl.styles.fills import FILL_SOLID
    from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
    from openpyxl.styles.colors import BLACK
except ImportError:
    # Catch only ImportError (ModuleNotFoundError is a subclass of it). A bare
    # `except:` would also swallow KeyboardInterrupt/SystemExit and mask
    # unrelated errors as a "missing openpyxl" warning.
    import warnings

    warnings.warn(
        "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure."
    )

FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"

Expand Down
16 changes: 12 additions & 4 deletions ppstructure/table/tablepyxl/tablepyxl.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
# Do imports like python3 so our package works for 2 and 3
from __future__ import absolute_import

from lxml import html
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from premailer import Premailer

from tablepyxl.style import Table
from paddle.utils import try_import


def string_to_int(s):
Expand All @@ -15,6 +13,9 @@ def string_to_int(s):


def get_Tables(doc):
try_import("lxml")
from lxml import etree, html

tree = html.fromstring(doc)
comments = tree.xpath("//comment()")
for comment in comments:
Expand All @@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1):
Writes every tr child element of elem to a row in the worksheet
returns the next row after all rows are written
"""
try_import("openpyxl")
from openpyxl.cell.cell import MergedCell
from openpyxl.utils import get_column_letter

initial_column = column
for table_row in elem.rows:
Expand Down Expand Up @@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None):
every table in the document.
The workbook is returned
"""
try_import("premailer")
try_import("openpyxl")
from premailer import Premailer
from openpyxl import Workbook

if not wb:
wb = Workbook()
wb.remove(wb.active)
Expand Down
6 changes: 1 addition & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,5 @@ rapidfuzz
opencv-python<=4.6.0.66
opencv-contrib-python<=4.6.0.66
cython
lxml
premailer
openpyxl
attrdict
Pillow>=10.0.0
pyyaml
pyyaml

0 comments on commit b5eedf7

Please sign in to comment.