Skip to content

Commit

Permalink
Merge pull request #43 from UW-COSMOS/v0.0.4
Browse files Browse the repository at this point in the history
V0.0.4
  • Loading branch information
ankur-gos authored May 6, 2019
2 parents fb62e2b + 0ea71c6 commit b40ef3c
Show file tree
Hide file tree
Showing 46 changed files with 37,551 additions and 58 deletions.
44 changes: 32 additions & 12 deletions cosmos/connected_components/connected_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,29 @@ def write_proposals(img_p, output_dir='tmp/cc_proposals', white_thresh=245, blan
obj_count = 0
obj_heights = 0
for row, top_coord, bottom_coord in rows:
num_cols = get_columns_for_row(row)
blocks, coords, col_idx = divide_row_into_columns(row, num_cols)
blocks = coords = col_idx = num_cols = None
# Old way
if row.shape[0] < 10 * blank_row_height:
num_cols = get_columns_for_row(row)
blocks, coords, col_idx = divide_row_into_columns(row, num_cols)
else:
# New way
rowT = row.T
white_cols = get_blank_rows(rowT, blank_row_height)
cols = []
blocks = []
coords = []
col_idx = []
num_cols = len(col_idx)
for i in range(len(white_cols)-1):
curr = white_cols[i]
nxt = white_cols[i+1]
spl = rowT[curr:nxt, :]
spl = spl.T
blocks.append(spl)
coords.append((curr, nxt))
col_idx.append(i)

for ind, b in enumerate(blocks):
c = coords[ind]
column_index = col_idx[ind]
Expand All @@ -254,17 +275,16 @@ def write_proposals(img_p, output_dir='tmp/cc_proposals', white_thresh=245, blan
nxt = white_rows[i+1]
rows2.append((b[curr:nxt, :], curr, nxt))
for r, c2, n in rows2:
components = get_components(r, numpy=True)
if len(components) == 0:
# Replacing components with finding the proper pixel vals
one_inds = np.argwhere(r)
if len(one_inds) == 0:
continue
x1 = min(components, key=lambda x: x[1])
x1 = x1[1]
y1 = min(components, key=lambda x: x[0])
y1 = y1[0]
x2 = max(components, key=lambda x: x[3])
x2 = x2[3]
y2 = max(components, key=lambda x: x[2])
y2 = y2[2]
h_one_inds = np.hsplit(one_inds, 2)

x1 = int(np.min(h_one_inds[1]))
y1 = int(np.min(h_one_inds[0]))
x2 = int(np.max(h_one_inds[1]))
y2 = int(np.max(h_one_inds[0]))

key = (num_cols, column_index)
val = (top_coord + c2 + y1, c[0] + x1, top_coord + c2 + y2, c[0]+x2)
Expand Down
31 changes: 16 additions & 15 deletions cosmos/construct_caption_tables/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,9 @@ def construct_single_df(html_f, target_cls, target_cls_association):
continue
img = target_div.find_next('img')
target_img_path = str(img['src'])
target_unic = str(target_div)#.find_next('div', 'text_unicode'))
target_unic = collect_words(target_unic, 'text_unicode')
target_tess = target_div.find_next('div', 'rawtext')
target_tess = target_tess.text.strip()
tdiv = str(target_div)
target_unic = collect_words(tdiv, 'text_unicode')
target_tess = collect_words(tdiv, 'hocr')
break
# Sometimes there is no association to an object (Dangling caption).
# TODO: Decide what to do in this case
Expand All @@ -144,10 +143,9 @@ def construct_single_df(html_f, target_cls, target_cls_association):
continue
img = assoc_div.find_next('img')
assoc_img_path = str(img['src'])
assoc_unic = str(assoc_div)#.find_next('div', 'text_unicode'))
assoc_unic = collect_words(assoc_unic, 'text_unicode')
assoc_tess = assoc_div.find_next('div', 'rawtext')
assoc_tess = assoc_tess.text.strip()
adiv = str(assoc_div)
assoc_unic = collect_words(adiv, 'text_unicode')
assoc_tess = collect_words(adiv, 'hocr')
break
df_dict['target_img_path'].append(target_img_path)
df_dict['assoc_img_path'].append(assoc_img_path)
Expand All @@ -167,10 +165,9 @@ def construct_single_df(html_f, target_cls, target_cls_association):
continue
img = assoc_div.find_next('img')
assoc_img_path = str(img['src'])
assoc_unic = str(assoc_div)#.find_next('div', 'text_unicode'))
assoc_unic = collect_words(assoc_unic, 'text_unicode')
assoc_tess = assoc_div.find_next('div', 'rawtext')
assoc_tess = assoc_tess.text.strip()
adiv = str(assoc_div)
assoc_unic = collect_words(adiv, 'text_unicode')
assoc_tess = collect_words(adiv, 'hocr')
df_dict['target_img_path'].append(None)
df_dict['assoc_img_path'].append(assoc_img_path)
df_dict['target_unicode'].append(None)
Expand All @@ -192,9 +189,13 @@ def construct(html_dir, target_cls, assoc_cls, output_file, processes=160):
:param output_file: Output path
:param processes: Number of processes
"""
pool = mp.Pool(processes=processes)
ret = [pool.apply_async(construct_single_df, args=(f, target_cls, assoc_cls,)) for f in glob.glob(os.path.join(html_dir, '*.html'))]
results = [r.get() for r in ret]
results = []
if processes == 1:
results = [construct_single_df(f, target_cls, assoc_cls) for f in glob.glob(os.path.join(html_dir, '*.html'))]
else:
pool = mp.Pool(processes=processes)
ret = [pool.apply_async(construct_single_df, args=(f, target_cls, assoc_cls,)) for f in glob.glob(os.path.join(html_dir, '*.html'))]
results = [r.get() for r in ret]
results = [r for r in results if r is not None]
final_df = None
if len(results) > 0:
Expand Down
17 changes: 2 additions & 15 deletions cosmos/converters/list2html.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from lxml import html, etree
from dominate.util import raw
from latex_ocr.img2latex import img2latex_api, get_im2latex_model
from postprocess.postprocess import group_cls, group_cls_columnwise
from config import IM2LATEX_WEIGHT
from .pdf_extractor import parse_pdf

Expand Down Expand Up @@ -193,12 +192,8 @@ def list2html(input_list, image_name, image_dir, output_dir, unicode_df=None,tes
:param feather_x: x feathering parameter to increase accuracy of ocr
:param feather_y: x feathering parameter to increase accuracy of ocr
"""
input_list = group_cls(input_list, 'Table', do_table_merge=True, merge_over_classes=['Figure', 'Section Header', 'Page Footer', 'Page Header'])
input_list = group_cls(input_list, 'Figure')
#input_list = group_cls_columnwise(input_list, 'Body Text')
doc = dominate.document(title=image_name[:-4])


inter_path = os.path.join(output_dir, 'img', image_name[:-4])
im2latex_model = get_im2latex_model(IM2LATEX_WEIGHT)
with doc:
Expand Down Expand Up @@ -231,9 +226,8 @@ def list2html(input_list, image_name, image_dir, output_dir, unicode_df=None,tes
# We do a quick loading and deloading to properly convert encodings
div(raw(b_text), cls='hocr', data_coordinates=f'{coords[0]} {coords[1]} {coords[2]} {coords[3]}', data_score=f'{score}')
loaded = html.fromstring(b_text)
tree = etree.fromstring(etree.tostring(loaded))
latex_tree = variable_ocr(im2latex_model, tree, cropped, [])
div(raw(etree.tostring(latex_tree).decode("utf-8")), cls='hocr_img2latex', data_coordinates=f'{coords[0]} {coords[1]} {coords[2]} {coords[3]}')
# Running variable ocr is too expensive right now. We need a better solution for this
# If we were to run variable ocr over every token, it would go here.
tree = etree.fromstring(etree.tostring(loaded))
if unicode_df is not None:
match = FILE_NAME_PATTERN.search(image_name)
Expand All @@ -244,13 +238,6 @@ def list2html(input_list, image_name, image_dir, output_dir, unicode_df=None,tes
div(raw(etree.tostring(unicode_tree).decode("utf-8")), cls='text_unicode', data_coordinates=f'{coords[0]} {coords[1]} {coords[2]} {coords[3]}', id=str(first_id))
div(text, cls='equation_unicode')

if tesseract_text:
if t == 'Equation':
txt = img2latex_api(im2latex_model, img=cropped, downsample_image_ratio=2, cropping=True, padding=True, gray_scale=True)
else:
txt = pytesseract.image_to_string(cropped, lang='eng')
div(txt, cls='rawtext')


with open(os.path.join(output_dir, f'{image_name[:-4]}.html'), 'w', encoding='utf-8') as wf:
wf.write(doc.render())
Expand Down
32 changes: 32 additions & 0 deletions cosmos/converters/list2json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
From a zipped list of class and coordinates, create json file
"""

import os
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import re

def list2obj(input_list, img_path):
    """
    Build a dictionary describing one page image and the objects found on it.

    :param input_list: Iterable of items indexable as (class, coordinates, score)
    :param img_path: Path to the page image. The file name must have the form
        ``<doc_name>_<page_number>.<extension>``
    :return: dict with the keys ``img_width``, ``img_height``, ``img_bytes``,
        ``doc_name``, ``page_number`` and ``page_objects``
    :raises Exception: If the file name does not have the expected form
    :raises ValueError: If the page number part of the file name is not an integer
    """
    # Only the file name carries the document name and page number; matching
    # against the full path would pull directory components into doc_name.
    image_name = os.path.basename(img_path)
    m = re.match(r"(.*)_(.*)\..*", image_name)
    if not m or not m[1] or not m[2]:
        raise Exception('Invalid image name provided to list2json')
    doc_name = m[1]
    page_number = int(m[2])

    result = {}
    # Use the image as a context manager so the underlying file handle is
    # released as soon as the dimensions have been read.
    with Image.open(img_path) as img:
        width, height = img.size
    result['img_width'] = width
    result['img_height'] = height
    with open(img_path, 'rb') as rf:
        result['img_bytes'] = rf.read()
    result['doc_name'] = doc_name
    result['page_number'] = page_number
    result['page_objects'] = [
        {'cls': i[0], 'coordinates': i[1], 'score': i[2]} for i in input_list
    ]
    return result







66 changes: 66 additions & 0 deletions cosmos/glossary/Index_of_agriculture_articles.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
Agricultural Machinery
Agricultural Science
Agronomy
Agroecology
Agricultural soil science
Agricultural engineering
Agriculture in Canada
Agricultural biotechnology
Biofertilizer
Biotechnology
Buckwheat
Biodynamic agriculture
Broadcast seeding
Cattle creep
Conventional tillage
Common Agricultural Policy
compost
Corn
Erosion
Entomology
Farm
Fertilizer
Food systems
Farming (disambiguation)
Food Security
Goat
Green Revolution
Green Revolution in India
Green Revolution (disambiguation)
Green Revolution (Iran)
Harrow (tool)
Hay
History of agriculture
Horticulture
List of agricultural universities and colleges
List of agriculture ministries
Maize
Minimum tillage
|Ministry of Agriculture, Fisheries and Food (United Kingdom)|
No-till farming
National Agricultural Law Center
Orchard
Organic farming
Optimum water content for tillage
Pig
Plant Breeding
Plant nutrition
Plough
Selective breeding
Soil Science
Seed
Seed contamination
Seed treatment
Sheep
Silage
Theoretical production ecology
Tillage
Tillage Live
Tractor
Urban agriculture
United States National Agricultural Library
Weed
Weed control
Zero tillage
Portal:Agropedia
Agriculture
Loading

0 comments on commit b40ef3c

Please sign in to comment.