-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfaast_api_v4.py
227 lines (201 loc) · 7.56 KB
/
faast_api_v4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import html
import os
import re
import shutil
from urllib.parse import quote

import fitz  # PyMuPDF
import ironpdf
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
# FastAPI application object; no routes are attached to it in the visible part of this file.
app = FastAPI()
# Directory to store converted images and HTML files
OUTPUT_DIR = "docs"
# Page images are written to an "images" subfolder of OUTPUT_DIR.
IMAGE_DIR = os.path.join(OUTPUT_DIR, "images")
# Runs at import time; creating IMAGE_DIR also creates OUTPUT_DIR when it is missing.
os.makedirs(IMAGE_DIR, exist_ok=True)
# Directory containing PDF files
PDF_DIR = "pdfs"
def get_pdf_files(pdf_dir=None):
    """Map the base name of every PDF in a directory to its path.

    Args:
        pdf_dir: Directory to scan. Defaults to the module-level ``PDF_DIR``.

    Returns:
        A dict ``{name_without_extension: path}`` covering every regular file
        whose name ends in ``.pdf`` (case-insensitive). Entries are inserted
        in sorted file-name order, so the result is the same on every
        platform; if two files share a base name, the one that sorts last
        wins.

    Raises:
        OSError: If the directory cannot be listed (for example, it does not
            exist).
    """
    if pdf_dir is None:
        pdf_dir = PDF_DIR

    pdf_files = {}
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(pdf_dir, filename)
        # A directory that merely has a ".pdf" suffix is not a document.
        if not os.path.isfile(path):
            continue
        pdf_files[os.path.splitext(filename)[0]] = path
    return pdf_files
def convert_pdf_to_images(pdf_file, dpi=300):
    """Rasterize a PDF's pages to PNG files in ``IMAGE_DIR`` and list them.

    Args:
        pdf_file: Path to the source PDF.
        dpi: Resolution handed to the rasterizer.

    Returns:
        File names (not full paths) of the PNGs in ``IMAGE_DIR`` that belong
        to this PDF, ordered by the numbers embedded in their names so that
        page 10 comes after page 9.
    """
    pdf = ironpdf.PdfDocument.FromFile(pdf_file)
    base_filename = os.path.splitext(os.path.basename(pdf_file))[0]
    # Extract all pages to the images folder as PNG files with higher DPI
    # NOTE(review): call kept exactly as written; confirm the "{0}" placeholder
    # and the `dpi` keyword against the IronPDF API.
    pdf.RasterizeToImageFiles(os.path.join(IMAGE_DIR, f"{base_filename}_page_{{0}}.png"), dpi=dpi)

    # Match on the full "<base>_page_" prefix: a plain substring test would
    # also pick up images of other PDFs whose names merely contain this one
    # (e.g. "report" inside "report2_page_1.png").
    prefix = f"{base_filename}_page_"
    image_paths = [
        filename
        for filename in os.listdir(IMAGE_DIR)
        if filename.startswith(prefix) and filename.lower().endswith(".png")
    ]

    def page_order(filename):
        # Compare embedded numbers as integers; the name itself breaks ties.
        digits = re.findall(r"\d+", filename[len(prefix):])
        return ([int(d) for d in digits], filename)

    return sorted(image_paths, key=page_order)
def extract_content_from_pdf(pdf_file):
    """Extract the text of each page of a PDF as plain ASCII paragraphs.

    Args:
        pdf_file: Path to the PDF to read.

    Returns:
        One ``{"text": str}`` dict per page, in page order. Within a page,
        each text block becomes one paragraph (internal whitespace collapsed
        to single spaces) and paragraphs are separated by a blank line.
        Non-ASCII characters are dropped. Returns ``[]`` if the document
        cannot be opened or read.
    """
    content = []
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_file) as doc:
            for page in doc:
                paragraphs = []
                # Each block is (x0, y0, x1, y1, text, block_no, block_type).
                for block in page.get_text("blocks"):
                    if block[6] != 0:
                        # Non-text (image) blocks only carry a metadata placeholder.
                        continue
                    # Replace multiple spaces with a single space
                    block_text = re.sub(r'\s+', ' ', block[4])
                    # Drop characters outside ASCII (existing behaviour of this view)
                    block_text = block_text.encode('ascii', 'ignore').decode('ascii')
                    # Dropping characters can leave doubled or dangling spaces.
                    block_text = re.sub(r'\s+', ' ', block_text).strip()
                    if block_text:
                        paragraphs.append(block_text)
                # A blank line between blocks preserves some of the layout.
                content.append({"text": "\n\n".join(paragraphs)})
        return content
    except Exception as e:
        # Best effort: report the problem and let the caller render an empty view.
        print(f"Error extracting content from PDF: {str(e)}")
        return []
def generate_index_html():
    """Write ``index.html`` in ``OUTPUT_DIR`` linking to each PDF's views.

    For every entry returned by ``get_pdf_files()`` the page shows the PDF
    name as a heading with links to ``<name>_text.html`` and
    ``<name>_image.html``.
    """
    sections = []
    for pdf_name in get_pdf_files():
        # File names are data: escape them for display, and percent-encode
        # them for URLs so spaces, "#", "?", "&" or quotes cannot break the link.
        label = html.escape(pdf_name)
        target = quote(pdf_name)
        sections.append(f"""
    <h2>{label}</h2>
    <ul>
        <li><a href="{target}_text.html">Text View</a></li>
        <li><a href="{target}_image.html">Image View</a></li>
    </ul>
    """)
    pdf_links = "".join(sections)

    # The file is written as UTF-8 below, so declare that encoding in the page.
    html_content = f"""
<html>
<head>
    <meta charset="utf-8">
    <title>PDF Viewer</title>
    <style>
        body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }}
        h1 {{ color: #333; }}
        a {{ color: #007bff; text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
    </style>
</head>
<body>
    <h1>PDF Viewer</h1>
    <p>Click to view the PDF content:</p>
    {pdf_links}
</body>
</html>
"""
    with open(os.path.join(OUTPUT_DIR, "index.html"), "w", encoding="utf-8") as f:
        f.write(html_content)
def generate_text_view(file_name, pdf_file):
    """Write ``<file_name>_text.html`` in ``OUTPUT_DIR`` with the PDF's text.

    Args:
        file_name: Base name used in the page title and the output file name.
        pdf_file: Path to the PDF whose text is extracted, one section per page.
    """
    content = extract_content_from_pdf(pdf_file)
    # The name and the extracted text are data, not markup: escape them so
    # characters such as "<" or "&" are shown instead of being parsed as HTML.
    title = html.escape(file_name)

    # The file is written as UTF-8 below, so declare that encoding in the page.
    parts = [f"""
<html>
<head>
    <meta charset="utf-8">
    <title>PDF Text Viewer - {title}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
            padding: 20px;
            margin: 0 auto;
            max-width: 100%;
            box-sizing: border-box;
        }}
        h1 {{
            color: #333;
            border-bottom: 2px solid #333;
            padding-bottom: 10px;
        }}
        #pdf-content {{
            width: 100%;
            max-width: 1200px;
            margin: 0 auto;
        }}
        .page {{
            margin-bottom: 20px;
            border: 1px solid #ddd;
            padding: 20px;
            background-color: #f9f9f9;
        }}
        .page-number {{
            font-weight: bold;
            margin-bottom: 10px;
            font-size: 1.2em;
            color: #555;
        }}
        .page-content {{
            white-space: pre-wrap;
            word-wrap: break-word;
            font-size: 16px;
        }}
        @media (max-width: 768px) {{
            body {{
                padding: 10px;
            }}
            .page {{
                padding: 10px;
            }}
        }}
    </style>
</head>
<body>
    <h1>PDF Text Content - {title}</h1>
    <div id="pdf-content">
"""]
    for i, page in enumerate(content, 1):
        page_text = html.escape(page["text"])
        parts.append(f'<div class="page"><div class="page-number">Page {i}</div>')
        parts.append(f'<div class="page-content">{page_text}</div></div>')
    parts.append("""
    </div>
</body>
</html>
""")
    with open(os.path.join(OUTPUT_DIR, f"{file_name}_text.html"), "w", encoding="utf-8") as f:
        f.write("".join(parts))
def generate_image_view(file_name, pdf_file):
    """Write ``<file_name>_image.html`` in ``OUTPUT_DIR`` showing page images.

    Args:
        file_name: Base name used in the page title and the output file name.
        pdf_file: Path to the PDF; it is rasterized via
            ``convert_pdf_to_images`` at 300 DPI and each resulting image is
            embedded from the ``images/`` folder next to the page.
    """
    image_paths = convert_pdf_to_images(pdf_file, dpi=300)
    # The name is data, not markup: escape it before placing it in the page.
    title = html.escape(file_name)

    # The file is written as UTF-8 below, so declare that encoding in the page.
    parts = [f"""
<html>
<head>
    <meta charset="utf-8">
    <title>PDF Image Viewer - {title}</title>
    <style>
        body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; margin: 0 auto; }}
        h1 {{ color: #333; border-bottom: 2px solid #333; padding-bottom: 10px; }}
        #pdf-container {{ max-width: 800px; margin: 0 auto; }}
        .pdf-page {{ width: 100%; margin-bottom: 20px; }}
        #zoom-controls {{ position: fixed; top: 10px; right: 10px; background: white; padding: 10px; border: 1px solid #ddd; }}
    </style>
    <script>
        function changeZoom() {{
            var zoom = document.getElementById('zoom').value;
            var container = document.getElementById('pdf-container');
            container.style.transform = `scale(${{zoom}})`;
            container.style.transformOrigin = 'top center';
        }}
    </script>
</head>
<body>
    <h1>PDF Image Content - {title}</h1>
    <div id="zoom-controls">
        <label for="zoom">Zoom:</label>
        <input type="number" id="zoom" name="zoom" min="0.1" max="3.0" step="0.1" value="1.0" onchange="changeZoom()">
    </div>
    <div id="pdf-container">
"""]
    for image_path in image_paths:
        # Percent-encode the URL (the "/" separator is kept) and escape the
        # alt text so unusual file names cannot break the tag.
        src = quote(f"images/{image_path}")
        alt = html.escape(image_path)
        parts.append(f'<img src="{src}" alt="{alt}" class="pdf-page"><br>')
    parts.append("""
    </div>
</body>
</html>
""")
    with open(os.path.join(OUTPUT_DIR, f"{file_name}_image.html"), "w", encoding="utf-8") as f:
        f.write("".join(parts))
def generate_static_files():
    """Build the index page plus a text view and an image view for every PDF."""
    sources = get_pdf_files()
    generate_index_html()
    for stem in sources:
        source_path = sources[stem]
        # Text view first, then the rasterized image view, for each document.
        generate_text_view(stem, source_path)
        generate_image_view(stem, source_path)
if __name__ == "__main__":
    # Script entry point: build every static page, then say where they went.
    generate_static_files()
    # Report the configured folder so the message stays correct if OUTPUT_DIR changes.
    print(f"Static files generated in the '{OUTPUT_DIR}' folder.")