Skip to content

Commit

Permalink
update xml2csv for entities
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Dec 5, 2023
1 parent 01ca6e5 commit ec6e997
Showing 1 changed file with 31 additions and 13 deletions.
44 changes: 31 additions & 13 deletions scripts/xml2csv_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,40 @@ def write_output(output_path, data, header, format="csv"):
fw.writerows(data)


def get_entity_data(data_sorted, ent_type):
    """Extract entities of type *ent_type* from every passage of a document.

    For each passage, every NER span whose label equals ``ent_type`` yields a
    row ``[passage_index, doc_key, passage_index, surface_text]``.  Rows whose
    (doc_key, passage_index, surface_text) triple was already emitted are
    dropped, so a value repeated inside the same passage appears only once.
    """
    rows = []
    for passage_idx in range(len(data_sorted['ner'])):
        passage = data_sorted['passages'][passage_idx]
        for entity in data_sorted['ner'][passage_idx]:
            if entity[2] != ent_type:
                continue
            # Slice the passage by the entity's character offsets; join keeps
            # the result a plain string whether the passage is str or list.
            surface = "".join(passage[entity[0]:entity[1]])
            rows.append([passage_idx, data_sorted['doc_key'], passage_idx, surface])

    # De-duplicate on (doc_key, passage index, surface text), keeping the
    # first occurrence of each triple.
    seen = set()
    unique_rows = []
    for row in rows:
        key = str(row[1]) + str(row[2]) + str(row[3])
        if key not in seen:
            seen.add(key)
            unique_rows.append(row)

    return unique_rows
def get_entity_data(data_sorted, ent_type, remove_dups=False):
    """Collect entity rows of type *ent_type* from every passage of a document.

    Each matching span yields a row ``[record_id, doc_key, passage_id,
    entity_text]``, where ``record_id`` is a running counter over the whole
    document.

    :param data_sorted: document dict with ``doc_key`` and ``passages``; each
        passage carries ``id`` and ``spans`` (each span has ``text``/``type``).
    :param ent_type: span ``type`` value to keep.
    :param remove_dups: when True, drop repeated span texts within a passage.
    :return: list of ``[record_id, doc_key, passage_id, entity_text]`` rows.
    """
    entities = []
    record_id = 0
    for passage in data_sorted['passages']:
        spans = [span['text'] for span in passage['spans'] if span['type'] == ent_type]
        # dict.fromkeys de-duplicates while preserving first-seen order;
        # list(set(...)) would make the output order nondeterministic across
        # runs because of string hash randomization.
        ents = list(dict.fromkeys(spans)) if remove_dups else spans
        for ent in ents:
            entities.append(
                [
                    record_id,
                    data_sorted['doc_key'],
                    passage['id'],
                    ent
                ]
            )
            record_id += 1

    return entities


def get_texts(data_sorted):
    """Return one ``[index, doc_key, passage_id, passage_text]`` row per passage."""
    text_data = []
    for idx, passage in enumerate(data_sorted['passages']):
        text_data.append([idx, data_sorted['doc_key'], passage['id'], passage['text']])

    return text_data
Expand Down

0 comments on commit ec6e997

Please sign in to comment.