Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Master #483

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 58 additions & 65 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
# -*- coding: utf-8 -*-

import os
import sys
from pathlib import Path
from typing import Union

from pypdf import PdfReader
from pypdf import PdfWriter
from pypdf._utils import StrByteType
from PyPDF2 import PdfReader,PdfWriter

from .core import TableList
from .parsers import Lattice
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import get_page_layout
from .utils import get_rotation
from .utils import get_text_objects
from .utils import is_url


class PDFHandler:
from .parsers import Stream, Lattice
from .utils import (
TemporaryDirectory,
get_page_layout,
get_text_objects,
get_rotation,
is_url,
download_url,
)


class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
temp directory.
Expand All @@ -35,23 +34,22 @@ class PDFHandler:

"""

def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath: Union[StrByteType, Path] = filepath

if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")

if password is None:
self.password = "" # noqa: S105
self.password = ""
else:
self.password = password
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)
self.pages = self._get_pages(self.filepath, pages)

def _get_pages(self, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.

Parameters
Expand All @@ -69,15 +67,13 @@ def _get_pages(self, pages):

"""
page_numbers = []

if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfReader(self.filepath, strict=False)

instream = open(filepath, "rb")
infile = PdfReader(instream, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
Expand All @@ -89,13 +85,13 @@ def _get_pages(self, pages):
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})

result = []
instream.close()
P = []
for p in page_numbers:
result.extend(range(p["start"], p["end"] + 1))
return sorted(set(result))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))

def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.

Parameters
Expand All @@ -108,42 +104,43 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
Tmp directory.

"""
infile = PdfReader(filepath, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)
with open(filepath, "rb") as fileobj:
infile = PdfReader(fileobj, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfFileReader(instream, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == "anticlockwise":
p.rotateClockwise(90)
elif rotation == "clockwise":
p.rotateCounterClockwise(90)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()

def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
Expand All @@ -156,8 +153,7 @@ def parse(
suppress_stdout : str (default: False)
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
See camelot.read_pdf kwargs.

Expand All @@ -167,9 +163,6 @@ def parse(
List of tables found in PDF.

"""
if layout_kwargs is None:
layout_kwargs = {}

tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
Expand Down