Commit
Merge branch 'fix-build'
lfoppiano committed Nov 10, 2023
2 parents e625e8a + 92b3a77 commit 9a08ff5
Showing 14 changed files with 514 additions and 294 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci-build.yml
@@ -29,6 +29,6 @@ jobs:
 flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
 # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
 flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-# - name: Test with pytest
-#   run: |
-#     pytest
+- name: Test with pytest
+  run: |
+    pytest
5 changes: 5 additions & 0 deletions pytest.ini
@@ -0,0 +1,5 @@
[pytest]
cache_dir = build/pytest
testpaths = tests
markers =
    slow: marks tests as slow (deselect with '-m "not slow"')
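The slow marker registered here can be attached to long-running tests and deselected on demand. A minimal sketch, assuming a test module under tests/ (the test name and body are illustrative, not part of this commit):

import time

import pytest


@pytest.mark.slow
def test_end_to_end_conversion():
    # stand-in for a long-running check; skip it with: pytest -m "not slow"
    time.sleep(2)
    assert True

Running pytest -m "not slow" would then skip it, while the plain pytest call added to the workflow above runs the full suite under tests/.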
3 changes: 2 additions & 1 deletion requirements.txt
@@ -6,4 +6,5 @@ gensim
 pandas
 regex
 bump-my-version
-supermat
+supermat
+pytest
257 changes: 257 additions & 0 deletions scripts/xml2LossyJSON.py
@@ -0,0 +1,257 @@
import argparse
import json
import os
import re
from collections import OrderedDict
from pathlib import Path

from bs4 import BeautifulSoup, NavigableString, Tag
from supermat.grobid_tokenizer import tokenizeSimple
from supermat.supermat_tei_parser import get_children_list, get_section, get_hash


def tokenise(string):
    return tokenizeSimple(string)


def write_on_file(fw, paragraphText, dic_token, i, item_length):
    # tsvText += f'#Text={paragraphText}\n'
    print(f'#Text={paragraphText}', file=fw)
    for k, v in dic_token.items():
        # print(v)
        if k[0] == i + 1 and v[2]:
            print('{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t'.format(*k, *v), file=fw)

    # Avoid adding one line break too many at the end of the file
    if i != item_length - 1:
        print('', file=fw)


def process_file(finput, use_paragraphs=False):
    with open(finput, encoding='utf-8') as fp:
        doc = fp.read()

    mod_tags = re.finditer(r'(</\w+>) ', doc)
    for mod in mod_tags:
        doc = doc.replace(mod.group(), ' ' + mod.group(1))
    soup = BeautifulSoup(doc, 'xml')

    children = get_children_list(soup, use_paragraphs=use_paragraphs)

    off_token = 0
    ient = 1

    # list containing text and the dictionary with all the annotations
    paragraphs = []
    dic_dest_relationships = {}
    dic_source_relationships = {}

    output_document = OrderedDict()
    output_document['lang'] = 'en'
    output_document['level'] = 'sentence' if not use_paragraphs else 'paragraph'
    output_document['paragraphs'] = paragraphs

    linked_entity_registry = {}

    i = 0
    for child in children:
        for pTag in child:
            paragraph = OrderedDict()
            j = 0
            offset = 0
            section = get_section(pTag)
            if not section:
                section = get_section(pTag.parent)

            paragraph['section'] = section
            paragraph_text = ''
            paragraph['text'] = paragraph_text
            spans = []
            paragraph['spans'] = spans
            tokens = []
            paragraph['tokens'] = tokens
            for item in pTag.contents:
                if type(item) == NavigableString:
                    local_text = str(item)
                    paragraph_text += local_text
                    offset += len(local_text)

                elif type(item) is Tag and item.name == 'rs':
                    local_text = item.text
                    paragraph_text += local_text

                    span = OrderedDict()
                    front_offset = 0
                    if local_text.startswith(" "):
                        front_offset = len(local_text) - len(local_text.lstrip(" "))

                    span['text'] = local_text.strip(" ")
                    span['offset_start'] = offset + front_offset
                    span['offset_end'] = offset + len(span['text']) + front_offset
                    spans.append(span)

                    offset += len(local_text)

                    assert paragraph_text[span['offset_start']:span['offset_end']] == span['text']

                    if 'type' not in item.attrs:
                        raise Exception("RS without type is invalid. Stopping")

                    entity_class = item.attrs['type']
                    span['type'] = '<' + entity_class + '>'

                    if len(item.attrs) > 0:
                        ## multiple entities can point to the same one, so "corresp" value can be duplicated
                        allow_duplicates = False
                        if 'xml:id' in item.attrs:
                            span['id'] = item['xml:id']
                            if item.attrs['xml:id'] not in dic_dest_relationships:
                                dic_dest_relationships[item.attrs['xml:id']] = [i + 1, j + 1, ient, entity_class]

                        if 'corresp' in item.attrs:
                            if 'id' not in span or span['id'] == "":
                                id_str = str(i + 1) + "," + str(j + 1)
                                span['id'] = get_hash(id_str)
                                if (span['id']) not in dic_source_relationships:
                                    dic_source_relationships[span['id']] = [item.attrs['corresp'].replace('#', ''),
                                                                            ient,
                                                                            entity_class]
                            else:
                                if (span['id']) not in dic_source_relationships:
                                    dic_source_relationships[span['id']] = [item.attrs['corresp'].replace('#', ''),
                                                                            ient,
                                                                            entity_class]

                            allow_duplicates = True

                        if 'id' in span:
                            if span['id'] not in linked_entity_registry.keys():
                                linked_entity_registry[span['id']] = span
                            else:
                                if not allow_duplicates:
                                    print("The same key exists... something's wrong: ", span['id'])

                    j += 1

                ient += 1  # entity No.

            paragraph['text'] = paragraph_text
            off_token += 1  # return

            paragraphs.append(paragraph)
            i += 1

    for id__ in dic_source_relationships:
        destination_xml_id = dic_source_relationships[id__][0]
        # source_entity_id = dic_source_relationships[par_num, token_num][1]
        # label_source = dic_source_relationships[id__][2]

        # destination_xml_id: Use this to pick up information from dic_dest_relationship

        for des in destination_xml_id.split(","):
            destination_item = dic_dest_relationships[str(des)]
            # destination_paragraph_tsv = destination_item[0]
            # destination_token_tsv = destination_item[1]
            # destination_entity_id = destination_item[2]
            # label_destination = destination_item[3]

            # relationship_name = get_relationship_name(label, destination_label)

            dict_coordinates = get_hash(id__)

            span_destination = linked_entity_registry[des]
            span_source = linked_entity_registry[dict_coordinates]
            link_source = {
                "targetId": span_destination['id'],
                "targetText": span_destination['text'],
                "targetType": span_destination['type']
            }

            link_destination = {
                "targetId": span_source['id'],
                "targetText": span_source['text'],
                "targetType": span_source['type']
            }

            if 'links' in span_source:
                span_source['links'].append(link_source)
            else:
                span_source['links'] = [link_source]

            if 'links' in span_destination:
                span_destination['links'].append(link_destination)
            else:
                span_destination['links'] = [link_destination]

    return output_document


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Converter from XML (Grobid training data based on TEI) to lossy JSON (CORD-19) format")

    parser.add_argument("--input",
                        help="Input file or directory",
                        required=True)
    parser.add_argument("--output",
                        required=True,
                        help="Output directory")
    parser.add_argument("--recursive",
                        action="store_true",
                        default=False,
                        help="Process input directory recursively. If input is a file, this parameter is ignored.")
    parser.add_argument("--use-paragraphs",
                        action="store_true",
                        default=False,
                        help="Use paragraphs instead of sentences.")

    args = parser.parse_args()

    input = args.input
    output = args.output
    recursive = args.recursive
    use_paragraphs = args.use_paragraphs

    if os.path.isdir(input):
        path_list = []
        output_path_list = []

        if recursive:
            for root, dirs, files in os.walk(input):
                for dir in dirs:
                    abs_path_dir = os.path.join(root, dir)
                    output_path = abs_path_dir.replace(str(input.rstrip("/")), str(output))
                    os.makedirs(output_path, exist_ok=True)

                for file_ in files:
                    if not file_.lower().endswith(".tei.xml"):
                        continue

                    file_input_path = os.path.join(root, file_)
                    output_path = file_input_path.replace(str(input.rstrip("/")), str(output))
                    file_output_path = output_path.replace(".xml", ".json").replace(".tei", "")
                    path_list.append([file_input_path, file_output_path])

        else:
            input_path_list = list(Path(input).glob('*.tei.xml'))
            output_path_list = [str(input_path)
                                .replace(str(input), str(output))
                                .replace(".xml", ".json")
                                .replace(".tei", "") for input_path
                                in input_path_list]

            path_list = list(zip(input_path_list, output_path_list))

        for file_input_path, file_output_path in path_list:
            print("Processing: ", file_input_path)

            output_document = process_file(str(file_input_path), use_paragraphs)
            with open(file_output_path, 'w') as fp:
                json.dump(output_document, fp)

    elif os.path.isfile(input):
        input_path = Path(input)
        output_filename = os.path.join(output, input_path.stem + ".json")
        output_document = process_file(input_path, use_paragraphs)
        with open(output_filename, 'w') as fp:
            json.dump(output_document, fp)
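For reference, the converter added above would typically be invoked from the repository root along these lines (the file and directory names are illustrative, not part of the commit):

python scripts/xml2LossyJSON.py --input corpus/paper1.tei.xml --output out/
python scripts/xml2LossyJSON.py --input corpus/ --output out/ --recursive --use-paragraphs

The first call converts a single TEI file and writes its JSON into out/; the second walks corpus/ recursively, mirrors its directory layout under out/, and, because of --use-paragraphs, emits paragraph-level rather than sentence-level entries. Each output document carries 'lang', 'level', and a 'paragraphs' list whose items hold the section, the raw text, the annotated spans (offsets, type, id, and cross-links), and a tokens placeholder.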
