Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Continue parsing records when exceptions occur in evidence generation #418

Merged
merged 2 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class ClinVarRecord:
score_map = {
"no assertion provided": 0,
'no assertion criteria provided': 0,
'no classifications from unflagged records': 0,
'criteria provided, single submitter': 1,
'criteria provided, conflicting interpretations': 1,
'criteria provided, multiple submitters, no conflicts': 2,
Expand Down
134 changes: 71 additions & 63 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,12 @@ def write_unmapped_terms(self, dir_out):
def validate_evidence_string(ev_string, ot_schema_contents):
    """Validate one evidence string against the Open Targets JSON schema.

    :param ev_string: evidence string to check; it is passed to ``jsonschema.validate`` and, on failure,
        serialised with ``json.dumps`` for the log.
    :param ot_schema_contents: schema to validate against, passed to ``jsonschema.validate``.
    :return: True if the evidence string validates, False if it does not (the problem is logged).
    :raises SystemExit: with status 1 if the schema itself is invalid.
    """
    try:
        jsonschema.validate(ev_string, ot_schema_contents, format_checker=jsonschema.FormatChecker())
        return True
    except jsonschema.exceptions.ValidationError as err:
        logger.error('Error: evidence string does not validate against schema.')
        logger.error(f'Error message: {err}')
        logger.error(f'Complete evidence string: {json.dumps(ev_string)}')
        # Report the failure to the caller rather than exiting, so that one bad evidence string does not
        # stop the remaining records from being processed.
        return False
    except jsonschema.exceptions.SchemaError:
        logger.error('Error: OpenTargets schema file is invalid')
        sys.exit(1)
Expand Down Expand Up @@ -135,72 +136,79 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if report.clinvar_total % 1000 == 0:
logger.info(f'{report.clinvar_total} records processed')

# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Catch any exceptions for this record so we can continue processing.
try:
# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue

# Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue
# Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue

# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 3 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
# Failure mode 3 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue

# Gather consequence mapping counts for variants of interest
if clinvar_record.measure.is_repeat_expansion_variant:
report.repeat_expansion_variants += len(consequence_types)
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable
# EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a
# failure so we can continue to measure mapping coverage.
if not any(group[-1] for group in grouped_diseases):
report.clinvar_skip_missing_efo_mapping += 1
unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
report.unmapped_trait_names[unmapped_trait_name] += 1

assert grouped_allele_origins and grouped_diseases and consequence_types, \
'Some of the attribute lists are still empty even after passing all checks.'

complete_evidence_strings_generated = 0
evidence_strings_generated = 0
for allele_origins, disease_attributes, consequence_attributes in itertools.product(
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
validate_evidence_string(evidence_string, ot_schema_contents)
output_evidence_strings_file.write(json.dumps(evidence_string) + '\n')

# Record some evidence string and trait metrics.
evidence_strings_generated += 1
if disease_mapped_efo_id is not None:
complete_evidence_strings_generated += 1
report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))

assert evidence_strings_generated != 0, 'No evidence strings generated despite all attributes passing checks.'
if complete_evidence_strings_generated == 1:
report.clinvar_done_one_complete_evidence_string += 1
elif complete_evidence_strings_generated > 1:
report.clinvar_done_multiple_complete_evidence_strings += 1

report.complete_evidence_string_count += complete_evidence_strings_generated
report.evidence_string_count += evidence_strings_generated
# Gather consequence mapping counts for variants of interest
if clinvar_record.measure.is_repeat_expansion_variant:
report.repeat_expansion_variants += len(consequence_types)
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable
# EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a
# failure so we can continue to measure mapping coverage.
if not any(group[-1] for group in grouped_diseases):
report.clinvar_skip_missing_efo_mapping += 1
unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
report.unmapped_trait_names[unmapped_trait_name] += 1

assert grouped_allele_origins and grouped_diseases and consequence_types, \
'Some of the attribute lists are still empty even after passing all checks.'

complete_evidence_strings_generated = 0
evidence_strings_generated = 0
for allele_origins, disease_attributes, consequence_attributes in itertools.product(
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
is_valid = validate_evidence_string(evidence_string, ot_schema_contents)
if is_valid:
output_evidence_strings_file.write(json.dumps(evidence_string) + '\n')

# Record some evidence string and trait metrics.
evidence_strings_generated += 1
if disease_mapped_efo_id is not None:
complete_evidence_strings_generated += 1
report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))

if complete_evidence_strings_generated == 1:
report.clinvar_done_one_complete_evidence_string += 1
elif complete_evidence_strings_generated > 1:
report.clinvar_done_multiple_complete_evidence_strings += 1

report.complete_evidence_string_count += complete_evidence_strings_generated
report.evidence_string_count += evidence_strings_generated

except Exception as e:
logger.error(f'Problem generating evidence for {clinvar_record.accession}')
logger.error(f'Error: {e}')
continue

output_evidence_strings_file.close()
return report
Expand Down
Loading