Skip to content

Commit

Permalink
adjust counts to appropriately skip invalid clinical significance values
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed Mar 6, 2024
1 parent 6d6e450 commit ddd9eaa
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 11 deletions.
6 changes: 6 additions & 0 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class ClinVarRecord:

# Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
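# Some records have been flagged by ClinVar; the clinical significance values listed here mark such records and
# should not be used. NOTE(review): the constant name is misspelled ("SIGNFICANCES"); rename together with all uses.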
INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'}

def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
"""Initialise a ClinVar record object from an RCV XML record."""
Expand Down Expand Up @@ -143,6 +145,10 @@ def clinical_significance_list(self):
See /data-exploration/clinvar-variant-types/README.md for further explanation."""
return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))

@property
def valid_clinical_significances(self):
    """Return the clinical significance terms of this record that are not marked as invalid.

    Terms are taken from ``clinical_significance_list`` in their existing order. A term is dropped when its
    lowercased form is a member of ``INVALID_CLINICAL_SIGNFICANCES``.
    """
    kept_terms = []
    for term in self.clinical_significance_list:
        if term.lower() in self.INVALID_CLINICAL_SIGNFICANCES:
            # Flagged / unusable value: leave it out of the result.
            continue
        kept_terms.append(term)
    return kept_terms

@property
def allele_origins(self):
    """Return the set of text values of the ``./ObservedIn/Sample/Origin`` elements found in this record's RCV."""
    origin_elements = find_elements(self.rcv, './ObservedIn/Sample/Origin')
    return {origin.text for origin in origin_elements}
Expand Down
26 changes: 16 additions & 10 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,20 +82,25 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# submissions being flagged.
if not clinvar_record.valid_clinical_significances:
report.clinvar_fatal_no_clinical_significance += 1
continue

# Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue

# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid
# allele origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 3 (skip). No functional consequences are available.
# Failure mode 4 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
Expand All @@ -106,9 +111,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable
# EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a
# failure so we can continue to measure mapping coverage.
# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
# tracked as a failure so we can continue to measure mapping coverage.
if not any(group[-1] for group in grouped_diseases):
report.clinvar_skip_missing_efo_mapping += 1
unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
Expand All @@ -122,8 +127,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
for allele_origins, disease_attributes, consequence_attributes in itertools.product(
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes)
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name,
disease_source_id, disease_mapped_efo_id,
consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
is_valid = validate_evidence_string(evidence_string, ot_schema_contents)
Expand Down Expand Up @@ -185,7 +191,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
'allelicRequirements': clinvar_record.mode_of_inheritance,

# Levels of clinical significance reported for the variant.
'clinicalSignificances': clinvar_record.clinical_significance_list,
'clinicalSignificances': clinvar_record.valid_clinical_significances,

# Confidence (review status).
'confidence': clinvar_record.review_status,
Expand Down
4 changes: 3 additions & 1 deletion cmat/output_generation/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None):
# ClinVar record counters.
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_fatal_no_clinical_significance = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
Expand Down Expand Up @@ -86,7 +87,7 @@ def load_from_file(self, filename):

def compute_record_tallies(self):
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
self.clinvar_done = (self.clinvar_done_one_complete_evidence_string +
Expand All @@ -110,6 +111,7 @@ def print_report(self):
Total number of ClinVar records\t{self.clinvar_total}
Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
No clinical significance\t{self.clinvar_fatal_no_clinical_significance}
Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Expand Down

0 comments on commit ddd9eaa

Please sign in to comment.