diff --git a/data-exploration/other/somatic-oncogenic-records.ipynb b/data-exploration/other/somatic-oncogenic-records.ipynb new file mode 100644 index 00000000..166ed757 --- /dev/null +++ b/data-exploration/other/somatic-oncogenic-records.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 55, + "id": "94fb1395-8914-4aed-8004-6299f103efcd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "c8c06c5c-2fd8-4356-b29a-42ad736d5aaa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from filter_clinvar_xml import filter_xml, pprint\n", + "from cmat.clinvar_xml_io import *\n", + "from cmat.clinvar_xml_io.xml_parsing import find_elements\n", + "from collections import Counter, defaultdict" + ] + }, + { + "cell_type": "markdown", + "id": "1e4a50d4-c8a8-4d6b-9c94-78faccc4ec0c", + "metadata": {}, + "source": [ + "First get all the somatic/oncogenic records that were dropped from the most recent submission." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "85ed4498-b101-4492-bc9b-5b28f7df7ead", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Full paths redacted\n", + "invalid_evidence_rcvs = [r.strip() for r in open('batch-2024-12/logs/invalid_evidence_rcvs.txt').readlines()]\n", + "multiple_class_rcvs = [r.strip() for r in open('batch-2024-12/logs/multiple_classification_rcvs.txt').readlines()]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9a3134f3-4f34-482b-b041-94655a2113fd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "all_rcvs = set(invalid_evidence_rcvs + multiple_class_rcvs)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1b47d8c8-6229-4d6f-801e-af29dcc442e6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "505" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_rcvs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bca12df4-cd07-42f1-86d5-3c8eafd3d575", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "input_xml = 'clinvar.xml.gz'\n", + "somatic_xml = 'somatic.xml.gz'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "097a43fb-afca-4791-8301-d34667e2dbe4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filter_clinvar_xml:Records written: 505\n" + ] + } + ], + "source": [ + "filter_xml(\n", + " input_xml=input_xml,\n", + " output_xml=somatic_xml,\n", + " filter_fct=lambda r: r.accession in all_rcvs,\n", + " max_num=len(all_rcvs)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5876db9d-69ce-46ce-846c-0732a9936b82", + "metadata": {}, + "source": [ + "First issue is **multiplicity**.\n", + "\n", + "Currently:\n", + "* `clinicalSignificances` is a list of strings but these are parsed from a single description - e.g. [RCV000002127](https://www.ncbi.nlm.nih.gov/clinvar/RCV000002127/), \"Pathogenic/Likely pathogenic\" becomes `pathogenic` and `likely pathogenic`\n", + "* `confidence` is a single string, in ClinVar this is \"review status\" and is associated with a single description\n", + "\n", + "New version:\n", + "* ClinVar can contain multiple clinical classifications, each containing one (or more) description(s) and a review status.\n", + " \n", + "Example: [RCV000443639](https://www.ncbi.nlm.nih.gov/clinvar/RCV000443639/)\n", + "```\n", + "\n", + " \n", + " no assertion criteria provided\n", + " Likely pathogenic\n", + " \n", + " \n", + " criteria provided, single submitter\n", + " Oncogenic\n", + " \n", + "\n", + "```\n", + "\n", + "Sole example of multiple somatic: [RCV000426735](https://www.ncbi.nlm.nih.gov/clinvar/RCV000426735/) - note annoyingly description & review status are not 1:1\n", + "```\n", + "\n", + " \n", + " criteria provided, single submitter\n", + " Uncertain significance\n", + " \n", + " \n", + " no assertion criteria provided\n", + " Tier I - Strong\n", + " Tier I - Strong\n", + " \n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f6ff0435-1ca0-45ab-8575-7ea01aa1b078", + "metadata": {}, + "source": [ + "Second issue is **new values and fields**.\n", + "* New categorisation of clinical classification: Germline, Somatic, Oncogenicity\n", + "* New terms appear in the \"description\" field which we currently report in `clinicalSignificances`\n", + "* New fields for somatic clinical impact only - assertion type and (what they call) clinical significance\n", + "\n", + "See [here](https://github.com/EBIvariation/CMAT/issues/396#issuecomment-1898804129) for lists of values." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9064f581-d5fc-4725-bb4f-59bf0c869f58", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset = ClinVarDataset(somatic_xml)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "b1447145-bfbb-455d-8496-3351e38e37b6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Terms in the description field - e.g. \"Tier I - Strong\" or \"Likely oncogenic\"\n", + "somatic_terms = Counter()\n", + "oncogenic_terms = Counter()\n", + "\n", + "# Additional somatic terms\n", + "somatic_impact_assertion_types = Counter()\n", + "somatic_impact_clin_sigs = Counter()\n", + "\n", + "# e.g. (somatic, somatic) or (germline, oncogenic) - nb. everything *not* in this list of 505 is just (germline,)\n", + "rcv_classifications = defaultdict(list)\n", + "\n", + "for r in dataset:\n", + " rcv_all_class = []\n", + " for c in r.clinical_classifications:\n", + " class_type = c.class_xml.tag\n", + " descriptions = find_elements(c.class_xml, './Description')\n", + " \n", + " if class_type == 'GermlineClassification':\n", + " # Assume we support germline terms, curious if there are any multiples here though...\n", + " for d in descriptions:\n", + " rcv_all_class.append(class_type)\n", + " \n", + " elif class_type == 'SomaticClinicalImpact':\n", + " for d in descriptions:\n", + " rcv_all_class.append(class_type)\n", + " clin_class_term = d.text.lower()\n", + " somatic_terms[clin_class_term] += 1\n", + " \n", + " assert_type = d.attrib.get('ClinicalImpactAssertionType', '').lower()\n", + " somatic_impact_assertion_types[assert_type] += 1\n", + " \n", + " clin_sig = d.attrib.get('ClinicalImpactClinicalSignificance', '').lower()\n", + " somatic_impact_clin_sigs[clin_sig] += 1\n", + " \n", + " elif class_type == 'OncogenicityClassification':\n", + " for d in descriptions:\n", + " rcv_all_class.append(class_type)\n", + " clin_class_term = d.text.lower()\n", + " oncogenic_terms[clin_class_term] += 1\n", + " else:\n", + " print(\"unknown classification type:\", class_type)\n", + " \n", + " rcv_all_class = tuple(sorted(rcv_all_class))\n", + " rcv_classifications[rcv_all_class].append(r.accession)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "bfe50aa4-f046-43f1-8dd8-b6baddb689a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rcv_class_counts = {\n", + " k: len(v) for k,v in rcv_classifications.items() \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "17b68c76-3cc5-4501-aae0-16d248899962", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{('SomaticClinicalImpact',): 18,\n", + " ('OncogenicityClassification',): 442,\n", + " ('OncogenicityClassification', 'SomaticClinicalImpact'): 1,\n", + " ('GermlineClassification', 'OncogenicityClassification'): 40,\n", + " ('GermlineClassification', 'SomaticClinicalImpact'): 3,\n", + " ('GermlineClassification',\n", + " 'SomaticClinicalImpact',\n", + " 'SomaticClinicalImpact'): 1}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rcv_class_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "a8440e1c-f281-4d6c-81d9-1006c502aed8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'tier i - strong': 6,\n", + " 'tier iii - unknown': 9,\n", + " 'tier iv - benign/likely benign': 6,\n", + " 'tier ii - potential': 3})" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "somatic_terms" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "c110c5b7-4834-4b7b-bdc7-83e6453dfab9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'likely oncogenic': 305,\n", + " 'uncertain significance': 12,\n", + " 'oncogenic': 166})" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oncogenic_terms" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "1961aab7-3ad9-46fd-b819-ecfafcd0eb74", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'prognostic': 4, '': 15, 'therapeutic': 2, 'diagnostic': 3})" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "somatic_impact_assertion_types" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "27dc71de-de0d-48e3-b221-bfbe6d2a47d9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'poor outcome': 4,\n", + " '': 15,\n", + " 'sensitivity/response': 2,\n", + " 'supports diagnosis': 3})" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "somatic_impact_clin_sigs" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "e0ad136c-6d03-422b-8f4a-3d71572ea0d0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['RCV000426735']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rcv_classifications[('GermlineClassification',\n", + " 'SomaticClinicalImpact',\n", + " 'SomaticClinicalImpact')]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "90137975-f7dc-4b1c-be2f-23c80946326d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8752475247524752" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "442/505" + ] + }, + { + "cell_type": "markdown", + "id": "faf8a077-d606-493a-850c-96db2175a484", + "metadata": {}, + "source": [ + "Summary:\n", + "* All values and all fields are being used to varying degrees\n", + "* Most data involves oncogenic classification, so no assertion types etc.\n", + "* A fully future-proof implementation would support everything here, but a simple inclusion of the oncogenic classification terms in the `clinicalSignificances` enum would cover 87% of the missing data (on the other hand, if we're not future-proofing what's the point)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7789c0d9-0d84-42c6-80fe-91eea184ea15", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "my-pyenv", + "language": "python", + "name": "my-pyenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}