Skip to content

Commit

Permalink
Add timers to post-mapping filters
Browse files (browse the repository at this point in the history)
  • Loading branch information
rsgoncalves committed Jun 3, 2024
1 parent d2f7efc commit 3e38e9e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 7 deletions.
2 changes: 1 addition & 1 deletion text2term/onto_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 's',
'ds', 'rd', 'rdgwas', 'ICD', 'excluded', 'excluding', 'unspecified', 'certain', 'also', 'undefined',
'ordinary', 'least', 'squares', 'FINNGEN', 'elsewhere', 'more', 'excluded', 'classified', 'classifeid',
'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'not', 'by',
'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'by',
'strict', 'wide', 'definition', 'definitions', 'confirmed', 'chapter', 'chapters', 'controls',
'characterized', 'main', 'diagnosis', 'hospital', 'admissions', 'other', 'resulting', 'from'}

Expand Down
24 changes: 18 additions & 6 deletions text2term/t2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "]
UNMAPPED_TAG = "unmapped"
OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label",
"Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]
"Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]

LOGGER = onto_utils.get_logger(__name__, level=logging.INFO)

Expand Down Expand Up @@ -217,15 +217,26 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi
if mapper == Mapper.BIOPORTAL:
LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score "
"filter has no effect on BioPortal mappings. The mapping score is hardcoded to 1 by text2term.")
df = mappings_df
else:
df = _filter_mappings(mappings_df, min_score)
LOGGER.debug("Filtering mappings by their score...")
start_filter = time.time()
mappings_df = _filter_mappings(mappings_df, min_score)
LOGGER.debug("...done (filtering time: %.2fs seconds)", time.time() - start_filter)

# Include in output data frame any input terms that did not meet min_score threshold
if incl_unmapped:
df = _add_unmapped_terms(df, tags, source_terms, source_term_ids)
LOGGER.debug("Adding unmapped terms...")
start_unmapped = time.time()
mappings_df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids)
LOGGER.debug("...done (adding unmapped time: %.2fs seconds)", time.time() - start_unmapped)

# Add tags
df = _add_tags_to_df(df, tags)
return df
if not mappings_df.empty:
LOGGER.debug("Adding tags...")
start_tagging = time.time()
mappings_df = _add_tags_to_df(mappings_df, tags)
LOGGER.debug("...done (adding tags time: %.2fs seconds)", time.time() - start_tagging)
return mappings_df


# Takes in the tags and source terms and processes them accordingly
Expand Down Expand Up @@ -270,6 +281,7 @@ def _filter_mappings(mappings_df, min_score):
new_df = mappings_df.loc[mappings_df["Mapping Score"] >= min_score]
return new_df


def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids):
if mappings_df.size == 0:
mapped = []
Expand Down

0 comments on commit 3e38e9e

Please sign in to comment.