From f8f755281015a0345b1a7790ea2990ce447d0ae1 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 23 Oct 2024 14:34:13 +0100 Subject: [PATCH 1/3] notebook for somatic/oncogenic records --- .../other/somatic-oncogenic-records.ipynb | 489 ++++++++++++++++++ 1 file changed, 489 insertions(+) create mode 100644 data-exploration/other/somatic-oncogenic-records.ipynb diff --git a/data-exploration/other/somatic-oncogenic-records.ipynb b/data-exploration/other/somatic-oncogenic-records.ipynb new file mode 100644 index 00000000..c3a3fae4 --- /dev/null +++ b/data-exploration/other/somatic-oncogenic-records.ipynb @@ -0,0 +1,489 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "94fb1395-8914-4aed-8004-6299f103efcd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "25f84a99-c91c-4885-9f9a-95623f434827", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from filter_clinvar_xml import filter_xml, pprint\n", + "from cmat.clinvar_xml_io import *\n", + "from cmat.clinvar_xml_io.xml_parsing import find_elements\n", + "from collections import Counter, defaultdict" + ] + }, + { + "cell_type": "markdown", + "id": "1e4a50d4-c8a8-4d6b-9c94-78faccc4ec0c", + "metadata": {}, + "source": [ + "First get all the somatic/oncogenic records that were dropped from the most recent submission." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "85ed4498-b101-4492-bc9b-5b28f7df7ead", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Full paths redacted\n", + "invalid_evidence_rcvs = [r.strip() for r in open('batch-2024-12/logs/invalid_evidence_rcvs.txt').readlines()]\n", + "multiple_class_rcvs = [r.strip() for r in open('batch-2024-12/logs/multiple_classification_rcvs.txt').readlines()]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9a3134f3-4f34-482b-b041-94655a2113fd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "all_rcvs = set(invalid_evidence_rcvs + multiple_class_rcvs)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1b47d8c8-6229-4d6f-801e-af29dcc442e6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "505" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_rcvs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bca12df4-cd07-42f1-86d5-3c8eafd3d575", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "input_xml = 'batch-2024-12/work/dc/697fec64cae8a0faa43906f9b76bdc/clinvar.xml.gz'\n", + "somatic_xml = 'somatic.xml.gz'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "097a43fb-afca-4791-8301-d34667e2dbe4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filter_clinvar_xml:Records written: 505\n" + ] + } + ], + "source": [ + "filter_xml(\n", + " input_xml=input_xml,\n", + " output_xml=somatic_xml,\n", + " filter_fct=lambda r: r.accession in all_rcvs,\n", + " max_num=len(all_rcvs)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5876db9d-69ce-46ce-846c-0732a9936b82", + "metadata": {}, + "source": [ + "First issue is **multiplicity**.\n", + "\n", + "Currently:\n", + "* `clinicalSignificances` is a list of strings but these are parsed from a single description - e.g. [RCV000002127](https://www.ncbi.nlm.nih.gov/clinvar/RCV000002127/), \"Pathogenic/Likely pathogenic\" becomes `pathogenic` and `likely pathogenic`\n", + "* `confidence` is a single string, in ClinVar this is \"review status\" and is associated with a single description\n", + "\n", + "New version:\n", + "* ClinVar can contain multiple clinical classifications, each containing one (or more) description(s) and a review status.\n", + " \n", + "Example: [RCV000443639](https://www.ncbi.nlm.nih.gov/clinvar/RCV000443639/)\n", + "```\n", + "\n", + " \n", + " no assertion criteria provided\n", + " Likely pathogenic\n", + " \n", + " \n", + " criteria provided, single submitter\n", + " Oncogenic\n", + " \n", + "\n", + "```\n", + "\n", + "Sole example of multiple somatic: [RCV000426735](https://www.ncbi.nlm.nih.gov/clinvar/RCV000426735/) - note annoyingly description & review status are not 1:1\n", + "```\n", + "\n", + " \n", + " criteria provided, single submitter\n", + " Uncertain significance\n", + " \n", + " \n", + " no assertion criteria provided\n", + " Tier I - Strong\n", + " Tier I - Strong\n", + " \n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f6ff0435-1ca0-45ab-8575-7ea01aa1b078", + "metadata": {}, + "source": [ + "Second issue is **new values and fields**.\n", + "* New categorisation of clinical classification: Germline, Somatic, Oncogenicity\n", + "* New terms appear in the \"description\" field which we currently report in `clinicalSignificances`\n", + "* New fields for somatic clinical impact only - assertion type and (what they call) clinical significance\n", + "\n", + "See [here](https://github.com/EBIvariation/CMAT/issues/396#issuecomment-1898804129) for lists of values." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9064f581-d5fc-4725-bb4f-59bf0c869f58", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset = ClinVarDataset(somatic_xml)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "b1447145-bfbb-455d-8496-3351e38e37b6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Terms in the description field - e.g. \"Tier I - Strong\" or \"Likely oncogenic\"\n", + "somatic_terms = Counter()\n", + "oncogenic_terms = Counter()\n", + "\n", + "# Additional somatic terms\n", + "somatic_impact_assertion_types = Counter()\n", + "somatic_impact_clin_sigs = Counter()\n", + "\n", + "# e.g. (somatic, somatic) or (germline, oncogenic) - nb. everything *not* in this list of 505 is just (germline,)\n", + "rcv_classifications = defaultdict(list)\n", + "\n", + "# i = 0\n", + "for r in dataset:\n", + " rcv_all_class = []\n", + " for c in r.clinical_classifications:\n", + " class_type = c.class_xml.tag\n", + " descriptions = find_elements(c.class_xml, './Description')\n", + " \n", + " if class_type == 'GermlineClassification':\n", + " # Assume we support germline terms, curious if there are any multiples here though...\n", + " for d in descriptions:\n", + " rcv_all_class.append(class_type)\n", + " \n", + " elif class_type == 'SomaticClinicalImpact':\n", + " for d in descriptions:\n", + " rcv_all_class.append(class_type)\n", + " clin_class_term = d.text.lower()\n", + " somatic_terms[clin_class_term] += 1\n", + " \n", + " assert_type = d.attrib.get('ClinicalImpactAssertionType', '').lower()\n", + " somatic_impact_assertion_types[assert_type] += 1\n", + " \n", + " clin_sig = d.attrib.get('ClinicalImpactClinicalSignificance', '').lower()\n", + " somatic_impact_clin_sigs[clin_sig] += 1\n", + " \n", + " elif class_type == 'OncogenicityClassification':\n", + " for d in descriptions:\n", + " rcv_all_class.append(class_type)\n", + " clin_class_term = d.text.lower()\n", + " oncogenic_terms[clin_class_term] += 1\n", + " else:\n", + " print(\"unknown classification type:\", class_type)\n", + " \n", + " rcv_all_class = tuple(sorted(rcv_all_class))\n", + " rcv_classifications[rcv_all_class].append(r.accession)\n", + " \n", + " # i += 1\n", + " # if i > 10:\n", + " # break" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "bfe50aa4-f046-43f1-8dd8-b6baddb689a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rcv_class_counts = {\n", + " k: len(v) for k,v in rcv_classifications.items() \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "17b68c76-3cc5-4501-aae0-16d248899962", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{('SomaticClinicalImpact',): 18,\n", + " ('OncogenicityClassification',): 442,\n", + " ('OncogenicityClassification', 'SomaticClinicalImpact'): 1,\n", + " ('GermlineClassification', 'OncogenicityClassification'): 40,\n", + " ('GermlineClassification', 'SomaticClinicalImpact'): 3,\n", + " ('GermlineClassification',\n", + " 'SomaticClinicalImpact',\n", + " 'SomaticClinicalImpact'): 1}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rcv_class_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "a8440e1c-f281-4d6c-81d9-1006c502aed8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'tier i - strong': 6,\n", + " 'tier iii - unknown': 9,\n", + " 'tier iv - benign/likely benign': 6,\n", + " 'tier ii - potential': 3})" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "somatic_terms" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "c110c5b7-4834-4b7b-bdc7-83e6453dfab9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'likely oncogenic': 305,\n", + " 'uncertain significance': 12,\n", + " 'oncogenic': 166})" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oncogenic_terms" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "1961aab7-3ad9-46fd-b819-ecfafcd0eb74", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'prognostic': 4, '': 15, 'therapeutic': 2, 'diagnostic': 3})" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "somatic_impact_assertion_types" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "27dc71de-de0d-48e3-b221-bfbe6d2a47d9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'poor outcome': 4,\n", + " '': 15,\n", + " 'sensitivity/response': 2,\n", + " 'supports diagnosis': 3})" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "somatic_impact_clin_sigs" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "e0ad136c-6d03-422b-8f4a-3d71572ea0d0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['RCV000426735']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rcv_classifications[('GermlineClassification',\n", + " 'SomaticClinicalImpact',\n", + " 'SomaticClinicalImpact')]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "90137975-f7dc-4b1c-be2f-23c80946326d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8752475247524752" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "442/505" + ] + }, + { + "cell_type": "markdown", + "id": "faf8a077-d606-493a-850c-96db2175a484", + "metadata": {}, + "source": [ + "Summary:\n", + "* All values and all fields are being used to varying degrees\n", + "* Most data involves oncogenic classification, so no assertion types etc.\n", + "* A fully future-proof implementation would support everything here, but a simple inclusion of the oncogenic classification terms in the `clinicalSignificances` enum would cover 87% of the missing data (on the other hand, if we're not future-proofing what's the point)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30797c79-6d1f-4ffd-a54f-684122ec4354", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "my-pyenv", + "language": "python", + "name": "my-pyenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 6f8bc618ada38a154b2277682f2e2974b3b422da Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 23 Oct 2024 15:33:22 +0100 Subject: [PATCH 2/3] update notebook --- .../other/somatic-oncogenic-records.ipynb | 121 +++++++++++++++++- 1 file changed, 118 insertions(+), 3 deletions(-) diff --git a/data-exploration/other/somatic-oncogenic-records.ipynb b/data-exploration/other/somatic-oncogenic-records.ipynb index c3a3fae4..675ca687 100644 --- a/data-exploration/other/somatic-oncogenic-records.ipynb +++ b/data-exploration/other/somatic-oncogenic-records.ipynb @@ -95,7 +95,7 @@ }, "outputs": [], "source": [ - "input_xml = 'batch-2024-12/work/dc/697fec64cae8a0faa43906f9b76bdc/clinvar.xml.gz'\n", + "input_xml = 'clinvar.xml.gz'\n", "somatic_xml = 'somatic.xml.gz'" ] }, @@ -453,13 +453,128 @@ "Summary:\n", "* All values and all fields are being used to varying degrees\n", "* Most data involves oncogenic classification, so no assertion types etc.\n", - "* A fully future-proof implementation would support everything here, but a simple inclusion of the oncogenic classification terms in the `clinicalSignificances` enum would cover 87% of the missing data (on the other hand, if we're not future-proofing what's the point)" + "* A fully future-proof implementation would support everything here, but a simple inclusion of the oncogenic classification terms in the `clinicalSignificances` enum would cover 87% of the missing data (on the other hand, if we're not future-proofing what's the point)\n", + "\n", + "I did **not** check how presence of somatic/oncogenic/germline clinical classification corresponds to allele origins - does this matter to Open Targets? Note that most somatic evidence still has germline clinical classifications, and likely always will as ClinVar has said it's not doing a wholesale migration of old records. But we could perhaps try to associate \"like for like\" if both are present in both fields." ] }, { "cell_type": "code", "execution_count": null, - "id": "30797c79-6d1f-4ffd-a54f-684122ec4354", + "id": "7789c0d9-0d84-42c6-80fe-91eea184ea15", + "metadata": {}, + "outputs": [], + "source": [ + "for r in dataset:\n", + " if r.accession == 'RCV000426735':\n", + " pprint(r.record_xml)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "id": "eb3bfbee-317e-4f8c-8b72-41b36325c849", + "metadata": {}, + "source": [ + "Schema proposals\n", + "\n", + "* Multiplicity:\n", + " * Keep reporting 1 confidence, choose most/least confident rating if multiple\n", + " * Report a flat list - parallel to clinical significances\n", + " * Report clinical significances as a list of objects, containing descriptive text & review status\n", + "* New terms:\n", + " * Add new description terms and ignore the other parts\n", + " * Might still need to think of the legibility of \"tier i - strong\"\n", + " * Add as \"compound\" terms - bit like how ClinVar does on website\n", + " * e.g. \"tier i - diagnostic - supports diagnosis\"\n", + " * Report as structured object" + ] + }, + { + "cell_type": "markdown", + "id": "8fce48ee-6722-41ca-b1a9-eeb7a458e67b", + "metadata": {}, + "source": [ + "Current:\n", + "```\n", + "{\n", + " \"clinicalSignificances\": [\"likely pathogenic\", \"pathogenic\"],\n", + " \"confidence\": \"criteria provided, multiple submitters, no conflicts\",\n", + "}\n", + "```\n", + "\n", + "Structured:\n", + "```\n", + "{\n", + " \"clinicalSignificances\": [\n", + " {\n", + " \"type\": \"germline\",\n", + " \"terms\": [\"likely pathogenic\", \"pathogenic\"],\n", + " \"confidence\": \"criteria provided, multiple submitters, no conflicts\"\n", + " },\n", + " {\n", + " \"type\": \"somatic\",\n", + " \"terms\": [\"tier i - strong\"],\n", + " \"confidence\": \"criteria provided, single submitter\",\n", + " \"somaticAssertionType\": \"diagnostic\",\n", + " \"somaticClinicalSignificance\": \"supports diagnosis\"\n", + " },\n", + " {\n", + " \"type\": \"somatic\",\n", + " \"terms\": [\"tier ii - potential\"],\n", + " \"confidence\": \"criteria provided, single submitter\",\n", + " \"somaticAssertionType\": \"prognostic\",\n", + " \"somaticClinicalSignificance\": \"poor outcome\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "* Terms need to be lists in all cases as RCV clinical classifications are aggregates - see [here](https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/)\n", + "\n", + "\n", + "Semi-structured:\n", + "```\n", + "{\n", + " \"clinicalSignificances\": [\n", + " {\n", + " \"terms\": [\"likely pathogenic\", \"pathogenic\"],\n", + " \"confidence\": \"criteria provided, multiple submitters, no conflicts\"\n", + " },\n", + " {\n", + " \"terms\": [\"tier i - diagnostic - supports diagnosis\"],\n", + " \"confidence\": \"criteria provided, single submitter\",\n", + " }\n", + " {\n", + " \"terms\": [\"tier i - prognostic - poor outcome\"],\n", + " \"confidence\": \"criteria provided, single submitter\",\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "* Could include type as separate field or part of the string\n", + "\n", + "\n", + "Flat:\n", + "```\n", + "{\n", + " \"clinicalSignificances\": [\"likely pathogenic\", \"pathogenic\", \"tier i - strong\", \"tier ii - potential\"],\n", + " \"confidences\": [\"criteria provided, multiple submitters, no conflicts\", \"criteria provided, single submitter\"]\n", + "}\n", + "```\n", + "* `clinicalSignificances` could be compound terms as above\n", + "* Lists could be always have the same length and be fully parallel - so if we explode a term, we repeat the confidence value" + ] + }, + { + "cell_type": "markdown", + "id": "3410c33a-dfd7-44d4-a74b-f0334d285cdb", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15146bb1-f038-4df5-b119-7d5a9c096a1e", "metadata": {}, "outputs": [], "source": [] From aaedf844b35814cee5e88be5d464a81fa5466f95 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 24 Oct 2024 11:22:11 +0100 Subject: [PATCH 3/3] clean up notebook a bit --- .../other/somatic-oncogenic-records.ipynb | 128 +----------------- 1 file changed, 4 insertions(+), 124 deletions(-) diff --git a/data-exploration/other/somatic-oncogenic-records.ipynb b/data-exploration/other/somatic-oncogenic-records.ipynb index 675ca687..166ed757 100644 --- a/data-exploration/other/somatic-oncogenic-records.ipynb +++ b/data-exploration/other/somatic-oncogenic-records.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 55, "id": "94fb1395-8914-4aed-8004-6299f103efcd", "metadata": { "tags": [] @@ -17,7 +17,7 @@ { "cell_type": "code", "execution_count": 45, - "id": "25f84a99-c91c-4885-9f9a-95623f434827", + "id": "c8c06c5c-2fd8-4356-b29a-42ad736d5aaa", "metadata": { "tags": [] }, @@ -213,7 +213,6 @@ "# e.g. (somatic, somatic) or (germline, oncogenic) - nb. everything *not* in this list of 505 is just (germline,)\n", "rcv_classifications = defaultdict(list)\n", "\n", - "# i = 0\n", "for r in dataset:\n", " rcv_all_class = []\n", " for c in r.clinical_classifications:\n", @@ -246,11 +245,7 @@ " print(\"unknown classification type:\", class_type)\n", " \n", " rcv_all_class = tuple(sorted(rcv_all_class))\n", - " rcv_classifications[rcv_all_class].append(r.accession)\n", - " \n", - " # i += 1\n", - " # if i > 10:\n", - " # break" + " rcv_classifications[rcv_all_class].append(r.accession)" ] }, { @@ -453,9 +448,7 @@ "Summary:\n", "* All values and all fields are being used to varying degrees\n", "* Most data involves oncogenic classification, so no assertion types etc.\n", - "* A fully future-proof implementation would support everything here, but a simple inclusion of the oncogenic classification terms in the `clinicalSignificances` enum would cover 87% of the missing data (on the other hand, if we're not future-proofing what's the point)\n", - "\n", - "I did **not** check how presence of somatic/oncogenic/germline clinical classification corresponds to allele origins - does this matter to Open Targets? Note that most somatic evidence still has germline clinical classifications, and likely always will as ClinVar has said it's not doing a wholesale migration of old records. But we could perhaps try to associate \"like for like\" if both are present in both fields." + "* A fully future-proof implementation would support everything here, but a simple inclusion of the oncogenic classification terms in the `clinicalSignificances` enum would cover 87% of the missing data (on the other hand, if we're not future-proofing what's the point)" ] }, { @@ -464,119 +457,6 @@ "id": "7789c0d9-0d84-42c6-80fe-91eea184ea15", "metadata": {}, "outputs": [], - "source": [ - "for r in dataset:\n", - " if r.accession == 'RCV000426735':\n", - " pprint(r.record_xml)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "id": "eb3bfbee-317e-4f8c-8b72-41b36325c849", - "metadata": {}, - "source": [ - "Schema proposals\n", - "\n", - "* Multiplicity:\n", - " * Keep reporting 1 confidence, choose most/least confident rating if multiple\n", - " * Report a flat list - parallel to clinical significances\n", - " * Report clinical significances as a list of objects, containing descriptive text & review status\n", - "* New terms:\n", - " * Add new description terms and ignore the other parts\n", - " * Might still need to think of the legibility of \"tier i - strong\"\n", - " * Add as \"compound\" terms - bit like how ClinVar does on website\n", - " * e.g. \"tier i - diagnostic - supports diagnosis\"\n", - " * Report as structured object" - ] - }, - { - "cell_type": "markdown", - "id": "8fce48ee-6722-41ca-b1a9-eeb7a458e67b", - "metadata": {}, - "source": [ - "Current:\n", - "```\n", - "{\n", - " \"clinicalSignificances\": [\"likely pathogenic\", \"pathogenic\"],\n", - " \"confidence\": \"criteria provided, multiple submitters, no conflicts\",\n", - "}\n", - "```\n", - "\n", - "Structured:\n", - "```\n", - "{\n", - " \"clinicalSignificances\": [\n", - " {\n", - " \"type\": \"germline\",\n", - " \"terms\": [\"likely pathogenic\", \"pathogenic\"],\n", - " \"confidence\": \"criteria provided, multiple submitters, no conflicts\"\n", - " },\n", - " {\n", - " \"type\": \"somatic\",\n", - " \"terms\": [\"tier i - strong\"],\n", - " \"confidence\": \"criteria provided, single submitter\",\n", - " \"somaticAssertionType\": \"diagnostic\",\n", - " \"somaticClinicalSignificance\": \"supports diagnosis\"\n", - " },\n", - " {\n", - " \"type\": \"somatic\",\n", - " \"terms\": [\"tier ii - potential\"],\n", - " \"confidence\": \"criteria provided, single submitter\",\n", - " \"somaticAssertionType\": \"prognostic\",\n", - " \"somaticClinicalSignificance\": \"poor outcome\"\n", - " }\n", - " ]\n", - "}\n", - "```\n", - "* Terms need to be lists in all cases as RCV clinical classifications are aggregates - see [here](https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/)\n", - "\n", - "\n", - "Semi-structured:\n", - "```\n", - "{\n", - " \"clinicalSignificances\": [\n", - " {\n", - " \"terms\": [\"likely pathogenic\", \"pathogenic\"],\n", - " \"confidence\": \"criteria provided, multiple submitters, no conflicts\"\n", - " },\n", - " {\n", - " \"terms\": [\"tier i - diagnostic - supports diagnosis\"],\n", - " \"confidence\": \"criteria provided, single submitter\",\n", - " }\n", - " {\n", - " \"terms\": [\"tier i - prognostic - poor outcome\"],\n", - " \"confidence\": \"criteria provided, single submitter\",\n", - " }\n", - " ]\n", - "}\n", - "```\n", - "* Could include type as separate field or part of the string\n", - "\n", - "\n", - "Flat:\n", - "```\n", - "{\n", - " \"clinicalSignificances\": [\"likely pathogenic\", \"pathogenic\", \"tier i - strong\", \"tier ii - potential\"],\n", - " \"confidences\": [\"criteria provided, multiple submitters, no conflicts\", \"criteria provided, single submitter\"]\n", - "}\n", - "```\n", - "* `clinicalSignificances` could be compound terms as above\n", - "* Lists could be always have the same length and be fully parallel - so if we explode a term, we repeat the confidence value" - ] - }, - { - "cell_type": "markdown", - "id": "3410c33a-dfd7-44d4-a74b-f0334d285cdb", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15146bb1-f038-4df5-b119-7d5a9c096a1e", - "metadata": {}, - "outputs": [], "source": [] } ],