-
Notifications
You must be signed in to change notification settings - Fork 0
/
image_processor.py
137 lines (119 loc) · 4.61 KB
/
image_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re
import requests
import torch
import conversion_utils as cu
import time
import asyncio
from got_ocr import predict
# Main function that processes the PDF and creates the Markdown output
def process_images(images, base_path):
    """Process every page image and write all output formats next to ``base_path``.

    Five files are produced: ``<base_path>.tex`` (LaTeX as returned per page),
    ``<base_path>_js.tex`` (LaTeX extracted from the rendered JavaScript
    block), ``<base_path>.md``, ``<base_path>.html`` and
    ``<base_path>_js.html``.

    Args:
        images: Sized sequence of page images, in page order. Each one is
            handed to ``process_pages``.
        base_path: Output path without extension; the suffixes above are
            appended to it.

    Returns:
        None. Failures are reported on stdout instead of being raised, and the
        total processing time is always printed.
    """
    time_start = time.time()
    try:
        # Open every output file under one ``with`` so all handles are closed
        # (and flushed) even when page processing or a conversion fails.
        # UTF-8 is explicit because the generated text is not guaranteed to be
        # encodable in the platform's default encoding.
        with open(base_path + ".tex", "w", encoding="utf-8") as latex_file, \
                open(base_path + "_js.tex", "w", encoding="utf-8") as extracted_file, \
                open(base_path + ".md", "w", encoding="utf-8") as markdown_file, \
                open(base_path + ".html", "w", encoding="utf-8") as html_file, \
                open(base_path + "_js.html", "w", encoding="utf-8") as js_file:
            html_file.write("<!DOCTYPE html><html><body>")

            # Process each page asynchronously
            print(f"Processing {len(images)} pages for {base_path} ...")
            latex_text, js_text, latex_extracted = asyncio.run(process_pages(images))

            # Finalize LaTeX doc
            latex_text += "\n\\end{document}"
            latex_extracted += "\n\\end{document}"

            print("Converting output formats...")
            latex_file.write(f"{latex_text}\n")
            extracted_file.write(f"{latex_extracted}\n")

            # JavaScript-rendered
            js_file.write(f"{cu.html_template(js_text)}")

            # Markdown: a failed conversion must not prevent the other outputs.
            try:
                markdown_text = cu.latex_to_markdown(latex_extracted)
                markdown_file.write(f"{markdown_text}\n")
            except Exception as e:
                print(f'An error occurred while converting LaTeX to MD:\n{e}')

            # HTML: always emit the closing tags so the file stays well-formed
            # even when the conversion itself fails.
            try:
                html_response = cu.latex_to_html(latex_text)
                html_file.write(f"{html_response}\n")
            except Exception as e:
                print(f'An error occurred while converting LaTeX to HTML:\n{e}')
            finally:
                html_file.write("</body></html>")

        print(f"Output saved to {base_path}.*")
    except Exception as e:
        print(f"An error occurred: {e}")

    total_duration = time.time() - time_start
    # divmod keeps the minute count correct for runs of an hour or more;
    # formatting through strftime('%M') would wrap around at 60 minutes.
    total_minutes, total_seconds = divmod(int(total_duration), 60)
    print(f"{base_path} processing time: {total_minutes:02d}m {total_seconds:02d}s")
async def process_pages(images):
    """Process all pages concurrently and assemble the combined documents.

    Args:
        images: Iterable of page images; pages are numbered from 1 in
            iteration order.

    Returns:
        A ``(latex_text, js_text, latex_extracted)`` tuple:
        ``latex_text`` concatenates each page's ``'latex'`` result,
        ``latex_extracted`` uses the text pulled out of the page's rendered
        JavaScript block when it can be found (falling back to the page's
        ``'latex'`` result), and ``js_text`` is the same extracted text
        joined as a JavaScript string-concatenation expression.
    """
    document_header = "\\begin{document}\n\\pagenumbering{arabic}\n"
    page_break = "\n\n\\newpage\n\n"
    # Matches the value assigned to 'text' inside the rendered JavaScript block
    text_pattern = r'const text\s*=\s*"(.*?)\\n"\ '

    page_results = await asyncio.gather(
        *(process_page(number, page_image)
          for number, page_image in enumerate(images, start=1))
    )
    # Sort the results by page_number to ensure correct order
    page_results.sort(key=lambda page: page['page_number'])

    raw_sections = []
    extracted_sections = []
    js_chunks = []
    for page in page_results:
        raw_latex = page['latex']
        found = re.search(text_pattern, page["html"], re.DOTALL)
        if found:
            # Undo the JavaScript string-literal line splitting and escaping
            extracted = found.group(1)
            extracted = extracted.replace('\\n"+\n"', '\n').replace('\\\\', '\\')
        else:
            extracted = raw_latex

        raw_sections.append(raw_latex + page_break)
        extracted_sections.append(extracted + page_break)
        js_chunks.append(cu.to_js_string(extracted) + " + \"\\n\\n\"")

    combined_latex = document_header + "".join(raw_sections)
    combined_extracted = document_header + "".join(extracted_sections)
    combined_js = " +\n".join(js_chunks)
    return combined_latex, combined_js, combined_extracted
# Define a semaphore to limit the number of concurrent tasks
# to the number of GPUs
n_gpus = torch.cuda.device_count()
sem = asyncio.Semaphore(n_gpus if n_gpus > 0 else 1)
# Event loop that the current `sem` was created for (None until first use).
_sem_loop = None


def _get_page_semaphore():
    """Return the page semaphore belonging to the running event loop.

    An asyncio semaphore becomes tied to one event loop, while every
    ``asyncio.run`` call (see ``process_images``) starts a fresh loop. Reusing
    a single import-time semaphore across runs can therefore fail with a
    "bound to a different event loop" RuntimeError as soon as a task has to
    wait on it. A new semaphore is created whenever the running loop changes.
    """
    global sem, _sem_loop
    running_loop = asyncio.get_running_loop()
    if _sem_loop is not running_loop:
        sem = asyncio.Semaphore(n_gpus if n_gpus > 0 else 1)
        _sem_loop = running_loop
    return sem


async def process_page(page_number, image):
    """Run prediction for a single page while respecting the GPU limit.

    Args:
        page_number: Page number, echoed back in the result.
        image: Page image; converted with ``cu.image_to_base64`` before
            being passed to ``predict``.

    Returns:
        A dict with keys ``'page_number'``, ``'latex'`` (the prediction's
        ``"output"`` entry) and ``'html'`` (its ``"html"`` entry).
    """
    async with _get_page_semaphore():
        start_page = time.time()  # Start timer for the current page
        base64_img = cu.image_to_base64(image)
        # predict is a blocking call; run it in a worker thread so other
        # pages can make progress on the event loop.
        response = await asyncio.to_thread(predict, base64_img)
        end_page = time.time()  # End timer for the current page
        print(f"Page {page_number} processed in {end_page - start_page:.1f}s")
        return {
            'page_number': page_number,
            'latex': response["output"],
            'html': response["html"]
        }
# Send a base64 image to the server and get the HTML response
def send_image_to_server(base64_img, page_number, *,
                         server_url="http://localhost:8000", timeout=None):
    """POST one page image to the prediction server and return its output.

    Args:
        base64_img: Image payload, sent as the ``"input"`` field.
        page_number: Page number, sent as the ``"page"`` field.
        server_url: Base URL of the server; ``/predict`` is appended.
        timeout: Passed through to ``requests.post``. The default ``None``
            keeps the previous behaviour of waiting indefinitely.

    Returns:
        The ``"output"`` value of the JSON response. If that value is itself
        a dict containing ``"output"``, the inner value is returned. An empty
        string is returned (after printing the reason) when the status code
        is not 200, the body is not valid JSON, or no ``"output"`` key exists.

    Raises:
        requests.RequestException: Connection-level failures are not caught
            here and propagate to the caller.
    """
    payload = {"input": base64_img, "page": page_number}
    response = requests.post(f"{server_url}/predict", json=payload, timeout=timeout)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return ""

    try:
        body = response.json()
    except ValueError as e:
        # requests raises a ValueError subclass when the body is not JSON.
        print(f"Error: response is not valid JSON: {e}")
        return ""

    # A JSON body that is not an object cannot carry an 'output' key either.
    if not isinstance(body, dict) or "output" not in body:
        print("Error: 'output' key not found in response")
        return ""

    outer_output = body["output"]
    if isinstance(outer_output, dict) and "output" in outer_output:
        return outer_output["output"]
    return outer_output