Fix bugs introduced by moving the group_cls merge step into run.py

UW-COSMOS · May 6, 2019 · 0ea71c6 · 0ea71c6
1 parent 65916af
commit 0ea71c6
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 4 deletions.
diff --git a/cosmos/converters/list2html.py b/cosmos/converters/list2html.py
@@ -13,7 +13,6 @@
 from lxml import html, etree
 from dominate.util import raw
 from latex_ocr.img2latex import img2latex_api, get_im2latex_model
-from postprocess.postprocess import group_cls, group_cls_columnwise
 from config import IM2LATEX_WEIGHT
 from .pdf_extractor import parse_pdf
 

diff --git a/cosmos/run.py b/cosmos/run.py
@@ -32,6 +32,7 @@
 from proposal_matcher.process import process_doc
 from config import ingestion_settings
 import psycopg2
+from postprocess.postprocess import group_cls
 
 # PDF directory path
 
@@ -85,6 +86,7 @@ def resize_pngs(img_path):
  path, im = pp.resize_png(os.path.join(f'{tmp}', 'images', img_path))
  if path is not None:
  im.save(os.path.join(f'{tmp}', 'images', img_path))
+ print(os.path.join(f'{tmp}', 'images', img_path))
 
  def flatten_png(img_f):
  subprocess.run(['convert', '-flatten', os.path.join(f'{tmp}', 'images', img_f), os.path.join(f'{tmp}', 'images', img_f)])
@@ -93,16 +95,17 @@ def preprocess_pngs(img_f):
  pth, padded_img = pp.pad_image(os.path.join(f'{tmp}', 'images', img_f))
  if pth is not None:
  padded_img.save(os.path.join(img_d, img_f))
+ print(os.path.join(img_d, img_f))
 
  FILE_NAME = re.compile("(.*\.pdf)_([0-9]+)\.png")
 
  def convert_to_html(xml_f):
  xpath = os.path.join(xml, xml_f)
  l = xml2list(xpath)
- l = group_cls(input_list, 'Table', do_table_merge=True, merge_over_classes=['Figure', 'Section Header', 'Page Footer', 'Page Header'])
- l = group_cls(input_list, 'Figure')
+ l = group_cls(l, 'Table', do_table_merge=True, merge_over_classes=['Figure', 'Section Header', 'Page Footer', 'Page Header'])
+ l = group_cls(l, 'Figure')
  pdf_name = FILE_NAME.search(f'{xml_f[:-4]}.png').group(1)
- list2html(l, f'{xml_f[:-4]}.png', os.path.join(f'{tmp}', 'images'), html, unicodes[pdf_name])
+ list2html(l, f'{xml_f[:-4]}.png', os.path.join(f'{tmp}', 'images'), html, unicodes[pdf_name] if pdf_name in unicodes else None)
 
  def update_xmls(html_f):
  hpath = os.path.join(html, html_f)