Skip to content

Commit

Permalink
Fix bugs in moving merge to run.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ankur-gos committed May 6, 2019
1 parent 65916af commit 0ea71c6
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
1 change: 0 additions & 1 deletion cosmos/converters/list2html.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from lxml import html, etree
from dominate.util import raw
from latex_ocr.img2latex import img2latex_api, get_im2latex_model
from postprocess.postprocess import group_cls, group_cls_columnwise
from config import IM2LATEX_WEIGHT
from .pdf_extractor import parse_pdf

Expand Down
9 changes: 6 additions & 3 deletions cosmos/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from proposal_matcher.process import process_doc
from config import ingestion_settings
import psycopg2
from postprocess.postprocess import group_cls

# PDF directory path

Expand Down Expand Up @@ -85,6 +86,7 @@ def resize_pngs(img_path):
path, im = pp.resize_png(os.path.join(f'{tmp}', 'images', img_path))
if path is not None:
im.save(os.path.join(f'{tmp}', 'images', img_path))
print(os.path.join(f'{tmp}', 'images', img_path))

def flatten_png(img_f):
    """Flatten the image ``img_f`` in place with the ``convert`` command.

    The file is looked up under ``<tmp>/images`` and the same path is given
    to ``convert -flatten`` as both input and output, so the result
    overwrites the original file. The command's exit status is not checked.
    """
    target = os.path.join(f'{tmp}', 'images', img_f)
    command = ['convert', '-flatten', target, target]
    subprocess.run(command)
Expand All @@ -93,16 +95,17 @@ def preprocess_pngs(img_f):
pth, padded_img = pp.pad_image(os.path.join(f'{tmp}', 'images', img_f))
if pth is not None:
padded_img.save(os.path.join(img_d, img_f))
print(os.path.join(img_d, img_f))

# Matches page-image names of the form "<name>.pdf_<page>.png".
# Group 1 is the PDF file name (including ".pdf"), group 2 the digits after
# the underscore. A raw string is used so that "\." reaches the regex engine
# as written instead of being an invalid string escape sequence.
FILE_NAME = re.compile(r"(.*\.pdf)_([0-9]+)\.png")

def convert_to_html(xml_f):
    """Convert one XML file from the ``xml`` directory into HTML output.

    The file is parsed with ``xml2list``, the resulting list is passed
    through two ``group_cls`` passes (first 'Table' with table merging
    enabled, then 'Figure'), and the grouped list is handed to ``list2html``
    together with the matching page-image name, the ``<tmp>/images``
    directory, the ``html`` target and the ``unicodes`` entry for the
    source PDF.

    Args:
        xml_f: File name of the XML file inside the ``xml`` directory. Its
            last four characters are dropped to build the image name
            (presumably the ".xml" extension -- confirm against callers).

    Raises:
        AttributeError: If the derived image name does not match
            ``FILE_NAME`` (``search`` returns ``None``).
    """
    xpath = os.path.join(xml, xml_f)
    items = xml2list(xpath)

    # Each grouping pass must consume the output of the previous step.
    # The stale calls that passed the undefined name `input_list` are
    # removed; they raised NameError before any grouping happened.
    items = group_cls(
        items,
        'Table',
        do_table_merge=True,
        merge_over_classes=['Figure', 'Section Header', 'Page Footer', 'Page Header'],
    )
    items = group_cls(items, 'Figure')

    img_name = f'{xml_f[:-4]}.png'
    pdf_name = FILE_NAME.search(img_name).group(1)

    # Not every PDF necessarily has an entry in `unicodes`; pass None rather
    # than failing with KeyError. list2html is called exactly once -- the
    # duplicate call with the unguarded lookup is removed.
    pdf_unicodes = unicodes[pdf_name] if pdf_name in unicodes else None
    list2html(items, img_name, os.path.join(f'{tmp}', 'images'), html, pdf_unicodes)

def update_xmls(html_f):
hpath = os.path.join(html, html_f)
Expand Down

0 comments on commit 0ea71c6

Please sign in to comment.