diff --git a/camelot/handlers.py b/camelot/handlers.py index 66ee1697..0a6db89f 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -1,24 +1,23 @@ +# -*- coding: utf-8 -*- + import os import sys -from pathlib import Path -from typing import Union -from pypdf import PdfReader -from pypdf import PdfWriter -from pypdf._utils import StrByteType +from PyPDF2 import PdfReader,PdfWriter from .core import TableList -from .parsers import Lattice -from .parsers import Stream -from .utils import TemporaryDirectory -from .utils import download_url -from .utils import get_page_layout -from .utils import get_rotation -from .utils import get_text_objects -from .utils import is_url - - -class PDFHandler: +from .parsers import Stream, Lattice +from .utils import ( + TemporaryDirectory, + get_page_layout, + get_text_objects, + get_rotation, + is_url, + download_url, +) + + +class PDFHandler(object): """Handles all operations like temp directory creation, splitting file into single page PDFs, parsing each PDF and then removing the temp directory. @@ -35,23 +34,22 @@ class PDFHandler: """ - def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None): + def __init__(self, filepath, pages="1", password=None): if is_url(filepath): filepath = download_url(filepath) - self.filepath: Union[StrByteType, Path] = filepath - - if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"): + self.filepath = filepath + if not filepath.lower().endswith(".pdf"): raise NotImplementedError("File format not supported") if password is None: - self.password = "" # noqa: S105 + self.password = "" else: self.password = password if sys.version_info[0] < 3: self.password = self.password.encode("ascii") - self.pages = self._get_pages(pages) + self.pages = self._get_pages(self.filepath, pages) - def _get_pages(self, pages): + def _get_pages(self, filepath, pages): """Converts pages string to list of ints. Parameters @@ -69,15 +67,13 @@ def _get_pages(self, pages): """ page_numbers = [] - if pages == "1": page_numbers.append({"start": 1, "end": 1}) else: - infile = PdfReader(self.filepath, strict=False) - + instream = open(filepath, "rb") + infile = PdfReader(instream, strict=False) if infile.is_encrypted: infile.decrypt(self.password) - if pages == "all": page_numbers.append({"start": 1, "end": len(infile.pages)}) else: @@ -89,13 +85,13 @@ def _get_pages(self, pages): page_numbers.append({"start": int(a), "end": int(b)}) else: page_numbers.append({"start": int(r), "end": int(r)}) - - result = [] + instream.close() + P = [] for p in page_numbers: - result.extend(range(p["start"], p["end"] + 1)) - return sorted(set(result)) + P.extend(range(p["start"], p["end"] + 1)) + return sorted(set(P)) - def _save_page(self, filepath: Union[StrByteType, Path], page, temp): + def _save_page(self, filepath, page, temp): """Saves specified page from PDF into a temporary directory. Parameters @@ -108,42 +104,43 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp): Tmp directory. """ - infile = PdfReader(filepath, strict=False) - if infile.is_encrypted: - infile.decrypt(self.password) - fpath = os.path.join(temp, f"page-{page}.pdf") - froot, fext = os.path.splitext(fpath) - p = infile.pages[page - 1] - outfile = PdfWriter() - outfile.add_page(p) - with open(fpath, "wb") as f: - outfile.write(f) - layout, dim = get_page_layout(fpath) - # fix rotated PDF - chars = get_text_objects(layout, ltype="char") - horizontal_text = get_text_objects(layout, ltype="horizontal_text") - vertical_text = get_text_objects(layout, ltype="vertical_text") - rotation = get_rotation(chars, horizontal_text, vertical_text) - if rotation != "": - fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) - os.rename(fpath, fpath_new) - instream = open(fpath_new, "rb") - infile = PdfReader(instream, strict=False) + with open(filepath, "rb") as fileobj: + infile = PdfReader(fileobj, strict=False) if infile.is_encrypted: infile.decrypt(self.password) + fpath = os.path.join(temp, f"page-{page}.pdf") + froot, fext = os.path.splitext(fpath) + p = infile.pages[page - 1] outfile = PdfWriter() - p = infile.pages[0] - if rotation == "anticlockwise": - p.rotate(90) - elif rotation == "clockwise": - p.rotate(-90) outfile.add_page(p) with open(fpath, "wb") as f: outfile.write(f) - instream.close() + layout, dim = get_page_layout(fpath) + # fix rotated PDF + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) + if rotation != "": + fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + os.rename(fpath, fpath_new) + instream = open(fpath_new, "rb") + infile = PdfFileReader(instream, strict=False) + if infile.isEncrypted: + infile.decrypt(self.password) + outfile = PdfFileWriter() + p = infile.getPage(0) + if rotation == "anticlockwise": + p.rotateClockwise(90) + elif rotation == "clockwise": + p.rotateCounterClockwise(90) + outfile.addPage(p) + with open(fpath, "wb") as f: + outfile.write(f) + instream.close() def parse( - self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs + self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -156,8 +153,7 @@ def parse( suppress_stdout : str (default: False) Suppress logs and warnings. layout_kwargs : dict, optional (default: {}) - A dict of `pdfminer.layout.LAParams - `_ kwargs. + A dict of `pdfminer.layout.LAParams `_ kwargs. kwargs : dict See camelot.read_pdf kwargs. @@ -167,9 +163,6 @@ def parse( List of tables found in PDF. """ - if layout_kwargs is None: - layout_kwargs = {} - tables = [] with TemporaryDirectory() as tempdir: for p in self.pages: