Merge pull request #43 from UW-COSMOS/v0.0.4

V0.0.4
UW-COSMOS · May 6, 2019 · b40ef3c · b40ef3c
2 parents fb62e2b + 0ea71c6
commit b40ef3c
Show file tree

Hide file tree

Showing 46 changed files with 37,551 additions and 58 deletions.
diff --git a/cosmos/connected_components/connected_components.py b/cosmos/connected_components/connected_components.py
@@ -241,8 +241,29 @@ def write_proposals(img_p, output_dir='tmp/cc_proposals', white_thresh=245, blan
  obj_count = 0
  obj_heights = 0
  for row, top_coord, bottom_coord in rows:
- num_cols = get_columns_for_row(row)
- blocks, coords, col_idx = divide_row_into_columns(row, num_cols)
+ blocks = coords = col_idx = num_cols = None
+ # Old way
+ if row.shape[0] < 10 * blank_row_height:
+ num_cols = get_columns_for_row(row)
+ blocks, coords, col_idx = divide_row_into_columns(row, num_cols)
+ else:
+ # New way
+ rowT = row.T
+ white_cols = get_blank_rows(rowT, blank_row_height)
+ cols = []
+ blocks = []
+ coords = []
+ col_idx = []
+ num_cols = len(col_idx)
+ for i in range(len(white_cols)-1):
+ curr = white_cols[i]
+ nxt = white_cols[i+1]
+ spl = rowT[curr:nxt, :]
+ spl = spl.T
+ blocks.append(spl)
+ coords.append((curr, nxt))
+ col_idx.append(i)
+
  for ind, b in enumerate(blocks):
  c = coords[ind]
  column_index = col_idx[ind]
@@ -254,17 +275,16 @@ def write_proposals(img_p, output_dir='tmp/cc_proposals', white_thresh=245, blan
  nxt = white_rows[i+1]
  rows2.append((b[curr:nxt, :], curr, nxt))
  for r, c2, n in rows2:
- components = get_components(r, numpy=True)
- if len(components) == 0:
+ # Replacing components with finding the proper pixel vals
+ one_inds = np.argwhere(r)
+ if len(one_inds) == 0:
  continue
- x1 = min(components, key=lambda x: x[1])
- x1 = x1[1]
- y1 = min(components, key=lambda x: x[0])
- y1 = y1[0]
- x2 = max(components, key=lambda x: x[3])
- x2 = x2[3]
- y2 = max(components, key=lambda x: x[2])
- y2 = y2[2]
+ h_one_inds = np.hsplit(one_inds, 2)
+
+ x1 = int(np.min(h_one_inds[1]))
+ y1 = int(np.min(h_one_inds[0]))
+ x2 = int(np.max(h_one_inds[1]))
+ y2 = int(np.max(h_one_inds[0]))
 
  key = (num_cols, column_index)
  val = (top_coord + c2 + y1, c[0] + x1, top_coord + c2 + y2, c[0]+x2)

diff --git a/cosmos/construct_caption_tables/construct.py b/cosmos/construct_caption_tables/construct.py
@@ -121,10 +121,9 @@ def construct_single_df(html_f, target_cls, target_cls_association):
  continue
  img = target_div.find_next('img')
  target_img_path = str(img['src'])
- target_unic = str(target_div)#.find_next('div', 'text_unicode'))
- target_unic = collect_words(target_unic, 'text_unicode')
- target_tess = target_div.find_next('div', 'rawtext')
- target_tess = target_tess.text.strip()
+ tdiv = str(target_div)
+ target_unic = collect_words(tdiv, 'text_unicode')
+ target_tess = collect_words(tdiv, 'hocr')
  break
  # Sometimes there is no association to an object (Dangling caption).
  # TODO: Decide what to do in this case
@@ -144,10 +143,9 @@ def construct_single_df(html_f, target_cls, target_cls_association):
  continue
  img = assoc_div.find_next('img')
  assoc_img_path = str(img['src'])
- assoc_unic = str(assoc_div)#.find_next('div', 'text_unicode'))
- assoc_unic = collect_words(assoc_unic, 'text_unicode')
- assoc_tess = assoc_div.find_next('div', 'rawtext')
- assoc_tess = assoc_tess.text.strip()
+ adiv = str(assoc_div)
+ assoc_unic = collect_words(adiv, 'text_unicode')
+ assoc_tess = collect_words(adiv, 'hocr')
  break
  df_dict['target_img_path'].append(target_img_path)
  df_dict['assoc_img_path'].append(assoc_img_path)
@@ -167,10 +165,9 @@ def construct_single_df(html_f, target_cls, target_cls_association):
  continue
  img = assoc_div.find_next('img')
  assoc_img_path = str(img['src'])
- assoc_unic = str(assoc_div)#.find_next('div', 'text_unicode'))
- assoc_unic = collect_words(assoc_unic, 'text_unicode')
- assoc_tess = assoc_div.find_next('div', 'rawtext')
- assoc_tess = assoc_tess.text.strip()
+ adiv = str(assoc_div)
+ assoc_unic = collect_words(adiv, 'text_unicode')
+ assoc_tess = collect_words(adiv, 'hocr')
  df_dict['target_img_path'].append(None)
  df_dict['assoc_img_path'].append(assoc_img_path)
  df_dict['target_unicode'].append(None)
@@ -192,9 +189,13 @@ def construct(html_dir, target_cls, assoc_cls, output_file, processes=160):
  :param output_file: Output path
  :param processes: Number of processes
  """
- pool = mp.Pool(processes=processes)
- ret = [pool.apply_async(construct_single_df, args=(f, target_cls, assoc_cls,)) for f in glob.glob(os.path.join(html_dir, '*.html'))]
- results = [r.get() for r in ret]
+ results = []
+ if processes == 1:
+ results = [construct_single_df(f, target_cls, assoc_cls) for f in glob.glob(os.path.join(html_dir, '*.html'))]
+ else:
+ pool = mp.Pool(processes=processes)
+ ret = [pool.apply_async(construct_single_df, args=(f, target_cls, assoc_cls,)) for f in glob.glob(os.path.join(html_dir, '*.html'))]
+ results = [r.get() for r in ret]
  results = [r for r in results if r is not None]
  final_df = None
  if len(results) > 0:

diff --git a/cosmos/converters/list2html.py b/cosmos/converters/list2html.py
@@ -13,7 +13,6 @@
 from lxml import html, etree
 from dominate.util import raw
 from latex_ocr.img2latex import img2latex_api, get_im2latex_model
-from postprocess.postprocess import group_cls, group_cls_columnwise
 from config import IM2LATEX_WEIGHT
 from .pdf_extractor import parse_pdf
 
@@ -193,12 +192,8 @@ def list2html(input_list, image_name, image_dir, output_dir, unicode_df=None,tes
  :param feather_x: x feathering parameter to increase accuracy of ocr
  :param feather_y: x feathering parameter to increase accuracy of ocr
  """
- input_list = group_cls(input_list, 'Table', do_table_merge=True, merge_over_classes=['Figure', 'Section Header', 'Page Footer', 'Page Header'])
- input_list = group_cls(input_list, 'Figure')
- #input_list = group_cls_columnwise(input_list, 'Body Text')
  doc = dominate.document(title=image_name[:-4])
 
-
  inter_path = os.path.join(output_dir, 'img', image_name[:-4])
  im2latex_model = get_im2latex_model(IM2LATEX_WEIGHT)
  with doc:
@@ -231,9 +226,8 @@ def list2html(input_list, image_name, image_dir, output_dir, unicode_df=None,tes
  # We do a quick loading and deloading to properly convert encodings
  div(raw(b_text), cls='hocr', data_coordinates=f'{coords[0]} {coords[1]} {coords[2]} {coords[3]}', data_score=f'{score}')
  loaded = html.fromstring(b_text) 
- tree = etree.fromstring(etree.tostring(loaded))
- latex_tree = variable_ocr(im2latex_model, tree, cropped, [])
- div(raw(etree.tostring(latex_tree).decode("utf-8")), cls='hocr_img2latex', data_coordinates=f'{coords[0]} {coords[1]} {coords[2]} {coords[3]}')
+ # Running variable ocr is too expensive right now. We need a better solution for this
+ # If we were to run variable ocr over every token, it would go here.
  tree = etree.fromstring(etree.tostring(loaded))
  if unicode_df is not None:
  match = FILE_NAME_PATTERN.search(image_name)
@@ -244,13 +238,6 @@ def list2html(input_list, image_name, image_dir, output_dir, unicode_df=None,tes
  div(raw(etree.tostring(unicode_tree).decode("utf-8")), cls='text_unicode', data_coordinates=f'{coords[0]} {coords[1]} {coords[2]} {coords[3]}', id=str(first_id))
  div(text, cls='equation_unicode') 
 
- if tesseract_text:
- if t == 'Equation':
- txt = img2latex_api(im2latex_model, img=cropped, downsample_image_ratio=2, cropping=True, padding=True, gray_scale=True)
- else:
- txt = pytesseract.image_to_string(cropped, lang='eng')
- div(txt, cls='rawtext')
-
 
  with open(os.path.join(output_dir, f'{image_name[:-4]}.html'), 'w', encoding='utf-8') as wf:
  wf.write(doc.render())

diff --git a/cosmos/converters/list2json.py b/cosmos/converters/list2json.py
@@ -0,0 +1,32 @@
+"""
+From a zipped list of class and coordinates, create json file
+"""
+
+import os
+from PIL import Image, ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+import re
+
+def list2obj(input_list, img_path):
+    """
+    Build a dictionary describing one page image and the objects detected on it.
+
+    :param input_list: Iterable of entries indexed as (cls, coordinates, score)
+    :param img_path: Path to the page image. The file name must have the form
+        <doc_name>_<page_number>.<extension>
+    :return: dict with the keys img_width, img_height, img_bytes, doc_name,
+        page_number and page_objects
+    :raises Exception: If the image file name does not match the expected form
+    :raises ValueError: If the page number part of the file name is not an integer
+    """
+    result = {}
+    # Use a context manager so the file handle held by PIL is released promptly.
+    with Image.open(img_path) as img:
+        width, height = img.size
+    result['img_width'] = width
+    result['img_height'] = height
+    with open(img_path, 'rb') as rf:
+        result['img_bytes'] = rf.read()
+    # The name is derived from img_path: only the file name (not the directory
+    # part) carries the <doc_name>_<page_number> convention.
+    image_name = os.path.basename(img_path)
+    m = re.match(r"(.*)_(.*)\..*", image_name)
+    if not m or not m[1] or not m[2]:
+        raise Exception('Invalid image name provided to list2json')
+    result['doc_name'] = m[1]
+    result['page_number'] = int(m[2])
+    obj_list = [{'cls': i[0], 'coordinates': i[1], 'score': i[2]} for i in input_list]
+    result['page_objects'] = obj_list
+    return result
+
+
+
+
+
+
+
diff --git a/cosmos/glossary/Index_of_agriculture_articles.csv b/cosmos/glossary/Index_of_agriculture_articles.csv
@@ -0,0 +1,66 @@
+Agricultural Machinery
+Agricultural Science
+Agronomy
+Agroecology
+Agricultural soil science
+Agricultural engineering
+Agriculture in Canada
+Agricultural biotechnology
+Biofertilizer
+Biotechnology
+Buckwheat
+Biodynamic agriculture
+Broadcast seeding
+Cattle creep
+Conventional tillage
+Common Agricultural Policy
+compost
+Corn
+Erosion
+Entomology
+Farm
+Fertilizer
+Food systems
+Farming (disambiguation)
+Food Security
+Goat
+Green Revolution
+Green Revolution in India
+Green Revolution (disambiguation)
+Green Revolution (Iran)
+Harrow (tool)
+Hay
+History of agriculture
+Horticulture
+List of agricultural universities and colleges
+List of agriculture ministries
+Maize
+Minimum tillage
+|Ministry of Agriculture, Fisheries and Food (United Kingdom)|
+No-till farming
+National Agricultural Law Center
+Orchard
+Organic farming
+Optimum water content for tillage
+Pig
+Plant Breeding
+Plant nutrition
+Plough
+Selective breeding
+Soil Science
+Seed
+Seed contamination
+Seed treatment
+Sheep
+Silage
+Theoretical production ecology
+Tillage
+Tillage Live
+Tractor
+Urban agriculture
+United States National Agricultural Library
+Weed
+Weed control
+Zero tillage
+Portal:Agropedia
+Agriculture