-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf.py
128 lines (104 loc) · 4.41 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python
from PyPDF2 import PdfWriter, PdfReader
import re
import datetime
import os
import shutil
from pathlib import Path
class Pdf:
    """A PDF file on disk plus the settings used to filter, crop, rotate and archive it.

    Attributes set by ``__init__``:
        filename: Path of the source PDF, exactly as given by the caller.
        filter_text: Regular expression; pages whose extracted text matches are
            dropped by ``processed_file``. A falsy value disables filtering.
        bounding_box: Indexable of four numbers used as
            ``(lower-left x, upper-right x, lower-left y, upper-right y)`` for
            the trim box and crop box.
        rotate: When truthy, pages wider than tall are rotated by 90 degrees.
        pages: Page count of the source, updated by ``processed_file``.
        new_pages: Page count of the last file written by ``processed_file``.
    """

    def __init__(self, filename, filter_text, bounding_box, rotate):
        """
        Opens the pdf and stores the processing options.
        :param filename: The filename of the pdf
        :param filter_text: Regular expression of pages to skip (falsy for none)
        :param bounding_box: The four crop coordinates (see class docstring)
        :param rotate: Should wide pages be rotated by 90 degrees?
        :raises ValueError: If the filename is blank or the file cannot be opened.
        """
        # Assigning ``file`` goes through the property setter, which parses the pdf.
        self.file = filename
        self.filename = filename
        self.filter_text = filter_text
        self.bounding_box = bounding_box
        self.rotate = rotate
        self.pages = 0
        self.new_pages = 0

    def __str__(self):
        return self.filename

    @property
    def file(self):
        """
        Gets the pdf file object
        :return: The pdf file object (a PdfReader)
        """
        # Despite its name, ``_filename`` holds the PdfReader, not the path.
        return self._filename

    @file.setter
    def file(self, new_filename):
        """
        Sets the filename of the pdf and creates a pdf object
        :param new_filename: The filename of the pdf
        :raises ValueError: If the filename is blank or the file cannot be opened.
        """
        if not new_filename:
            raise ValueError("PDF filename cannot be blank.")
        try:
            # Hand the path to PdfReader instead of an open() handle so no file
            # descriptor is left open on the source (which would also get in
            # the way of moving the file later on some platforms).
            self._filename = PdfReader(new_filename)
        except OSError as err:
            # Report the name we were given; ``self.file`` may not exist yet.
            raise ValueError(f"Cannot find file: {new_filename}") from err

    def processed_file(self, file_suffix):
        """
        The pdf file is cropped, rotated, and text filtered. A new file is created.
        :param file_suffix: The suffix added to the end of the new file.
        :return: A string of the new filename.
        """
        writer = PdfWriter()
        source_pages = self.file.pages
        self.pages = len(source_pages)
        # Compile once instead of on every page; None means "no filtering".
        skip_pattern = re.compile(self.filter_text) if self.filter_text else None

        for page in source_pages:
            # Skip page if filtered text appears in that page.
            if skip_pattern is not None and skip_pattern.search(page.extract_text()):
                continue

            # trim the PDF
            lower_left = (self.bounding_box[0], self.bounding_box[2])
            upper_right = (self.bounding_box[1], self.bounding_box[3])
            page.trimbox.lower_left = lower_left
            page.trimbox.upper_right = upper_right
            page.cropbox.lower_left = lower_left
            page.cropbox.upper_right = upper_right

            # NOTE(review): this used to be commented "If Portrait, rotate to
            # Landscape", but the condition is true when the media box is wider
            # than it is tall. Behaviour is unchanged; confirm the intent.
            if self.rotate:
                media = page.mediabox
                if media.right - media.left > media.top - media.bottom:
                    page.rotate(90)

            writer.add_page(page)

        # Create new file. Insert the suffix before the real extension only, so
        # a ".pdf" elsewhere in the path is left alone and a name without a
        # lowercase ".pdf" extension never resolves back to the input file.
        root, extension = os.path.splitext(self.filename)
        new_filename = f"{root}-{file_suffix}{extension}"
        with open(new_filename, "wb") as output_stream:
            writer.write(output_stream)
        self.new_pages = len(writer.pages)
        return new_filename

    def archive(self, archived_directory, input_directory, archive_by_month):
        """
        put the PDF in a subdirectory with the option of a further subdirectory broken down by month (ie: 2022-08).
        :param archived_directory: The name of the archive directory
        :param input_directory: The input directory where the processed pdf files are located.
        :param archive_by_month: Is it archived by month?
        :return: a string of the path the pdf was moved to
        """
        if not archived_directory.endswith('/'):
            archived_directory += '/'
        if archive_by_month:
            now = datetime.datetime.now()
            archived_directory += f'{now.year}-{now.month:02d}/'
        # Make sure the archive folder ends with a / but does not begin with a /
        if archived_directory.startswith('/'):
            archived_directory = archived_directory[1:]
        # An empty input directory means "relative to the current directory";
        # appending '/' to it would point at the filesystem root instead.
        if input_directory and not input_directory.endswith('/'):
            input_directory += '/'
        pdf_archived_path = input_directory + archived_directory

        # Keep whatever follows the input directory prefix; if the filename is
        # not spelled with that prefix, fall back to its base name so the file
        # still lands in the archive instead of being "moved" onto itself.
        if self.filename.startswith(input_directory):
            relative_name = self.filename[len(input_directory):]
        else:
            relative_name = os.path.basename(self.filename)
        pdf_archived_path_file = pdf_archived_path + relative_name

        # Create the directory if it doesn't exist
        Path(pdf_archived_path).mkdir(parents=True, exist_ok=True)
        Path(pdf_archived_path_file).parent.mkdir(parents=True, exist_ok=True)

        # Move the pdf to the new directory
        shutil.move(self.filename, pdf_archived_path_file)
        return pdf_archived_path_file

    @staticmethod
    def merge(merged_pdf, input_filename):
        """
        Appends a pdf file to a merger and deletes the file afterwards.
        :param merged_pdf: Object with an ``append`` method taking a binary file object.
        :param input_filename: The filename of the pdf to append.
        """
        # NOTE(review): closing the stream here relies on append() consuming it
        # before returning, as PyPDF2 does for file objects - confirm if a
        # different merger type is ever passed in.
        with open(input_filename, "rb") as input_stream:
            merged_pdf.append(input_stream)
        # delete the temporary individual file.
        os.remove(input_filename)