diff --git a/data-exploration/other/somatic-oncogenic-records.ipynb b/data-exploration/other/somatic-oncogenic-records.ipynb
new file mode 100644
index 00000000..166ed757
--- /dev/null
+++ b/data-exploration/other/somatic-oncogenic-records.ipynb
@@ -0,0 +1,484 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "94fb1395-8914-4aed-8004-6299f103efcd",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "sys.path.append('..')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "c8c06c5c-2fd8-4356-b29a-42ad736d5aaa",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from filter_clinvar_xml import filter_xml, pprint\n",
+ "from cmat.clinvar_xml_io import *\n",
+ "from cmat.clinvar_xml_io.xml_parsing import find_elements\n",
+ "from collections import Counter, defaultdict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e4a50d4-c8a8-4d6b-9c94-78faccc4ec0c",
+ "metadata": {},
+ "source": [
+ "First get all the somatic/oncogenic records that were dropped from the most recent submission."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "85ed4498-b101-4492-bc9b-5b28f7df7ead",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Full paths redacted. The with-block closes both log files once the accessions have been read.\n",
+ "with open('batch-2024-12/logs/invalid_evidence_rcvs.txt') as invalid_log, open('batch-2024-12/logs/multiple_classification_rcvs.txt') as multiple_log:\n",
+ "    invalid_evidence_rcvs, multiple_class_rcvs = [r.strip() for r in invalid_log], [r.strip() for r in multiple_log]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "9a3134f3-4f34-482b-b041-94655a2113fd",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "all_rcvs = set(invalid_evidence_rcvs).union(multiple_class_rcvs)  # de-duplicated accessions from both logs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "1b47d8c8-6229-4d6f-801e-af29dcc442e6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "505"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(all_rcvs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "bca12df4-cd07-42f1-86d5-3c8eafd3d575",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "input_xml = 'clinvar.xml.gz'\n",
+ "somatic_xml = 'somatic.xml.gz'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "097a43fb-afca-4791-8301-d34667e2dbe4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:filter_clinvar_xml:Records written: 505\n"
+ ]
+ }
+ ],
+ "source": [
+ "filter_xml(\n",
+ " input_xml=input_xml,\n",
+ " output_xml=somatic_xml,\n",
+ " filter_fct=lambda r: r.accession in all_rcvs,\n",
+ " max_num=len(all_rcvs)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5876db9d-69ce-46ce-846c-0732a9936b82",
+ "metadata": {},
+ "source": [
+ "First issue is **multiplicity**.\n",
+ "\n",
+ "Currently:\n",
+ "* `clinicalSignificances` is a list of strings, but all of them are parsed from a single description - e.g. for [RCV000002127](https://www.ncbi.nlm.nih.gov/clinvar/RCV000002127/), \"Pathogenic/Likely pathogenic\" becomes `pathogenic` and `likely pathogenic`\n",
+ "* `confidence` is a single string; in ClinVar this is the \"review status\", which is associated with a single description\n",
+ "\n",
+ "New version:\n",
+ "* ClinVar can contain multiple clinical classifications, each containing one (or more) description(s) and a review status.\n",
+ " \n",
+ "Example: [RCV000443639](https://www.ncbi.nlm.nih.gov/clinvar/RCV000443639/)\n",
+ "```\n",
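+ "<Classifications>\n",
+ "  <GermlineClassification>\n",
+ "    <ReviewStatus>no assertion criteria provided</ReviewStatus>\n",
+ "    <Description>Likely pathogenic</Description>\n",
+ "  </GermlineClassification>\n",
+ "  <OncogenicityClassification>\n",
+ "    <ReviewStatus>criteria provided, single submitter</ReviewStatus>\n",
+ "    <Description>Oncogenic</Description>\n",
+ "  </OncogenicityClassification>\n",
+ "</Classifications>\n",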
+ "```\n",
+ "\n",
+ "Sole example of multiple somatic descriptions: [RCV000426735](https://www.ncbi.nlm.nih.gov/clinvar/RCV000426735/) - note that, annoyingly, descriptions and review statuses are not 1:1 (here two descriptions share a single review status)\n",
+ "```\n",
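+ "<Classifications>\n",
+ "  <GermlineClassification>\n",
+ "    <ReviewStatus>criteria provided, single submitter</ReviewStatus>\n",
+ "    <Description>Uncertain significance</Description>\n",
+ "  </GermlineClassification>\n",
+ "  <SomaticClinicalImpact>\n",
+ "    <ReviewStatus>no assertion criteria provided</ReviewStatus>\n",
+ "    <Description>Tier I - Strong</Description>\n",
+ "    <Description>Tier I - Strong</Description>\n",
+ "  </SomaticClinicalImpact>\n",
+ "</Classifications>\n",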
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6ff0435-1ca0-45ab-8575-7ea01aa1b078",
+ "metadata": {},
+ "source": [
+ "Second issue is **new values and fields**.\n",
+ "* New categorisation of clinical classification: Germline, Somatic, Oncogenicity\n",
+ "* New terms appear in the \"description\" field which we currently report in `clinicalSignificances`\n",
+ "* New fields for somatic clinical impact only - assertion type and (what they call) clinical significance\n",
+ "\n",
+ "See [here](https://github.com/EBIvariation/CMAT/issues/396#issuecomment-1898804129) for lists of values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "9064f581-d5fc-4725-bb4f-59bf0c869f58",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "dataset = ClinVarDataset(somatic_xml)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "b1447145-bfbb-455d-8496-3351e38e37b6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Tallies of the description-field terms - e.g. \"Tier I - Strong\" or \"Likely oncogenic\"\n",
+ "somatic_terms = Counter()\n",
+ "oncogenic_terms = Counter()\n",
+ "\n",
+ "# Attribute values read from somatic clinical impact descriptions only\n",
+ "somatic_impact_assertion_types = Counter()\n",
+ "somatic_impact_clin_sigs = Counter()\n",
+ "\n",
+ "# Sorted tuple of classification types (one entry per description) -> accessions with that combination,\n",
+ "# e.g. (somatic, somatic) or (germline, oncogenic) - nb. everything *not* in this list of 505 is just (germline,)\n",
+ "rcv_classifications = defaultdict(list)\n",
+ "\n",
+ "for record in dataset:\n",
+ "    record_class_types = []\n",
+ "    for classification in record.clinical_classifications:\n",
+ "        class_type = classification.class_xml.tag\n",
+ "        descriptions = find_elements(classification.class_xml, './Description')\n",
+ "\n",
+ "        if class_type not in (\n",
+ "            'GermlineClassification',\n",
+ "            'SomaticClinicalImpact',\n",
+ "            'OncogenicityClassification',\n",
+ "        ):\n",
+ "            print(\"unknown classification type:\", class_type)\n",
+ "            continue\n",
+ "\n",
+ "        # One entry per description, so a classification with two descriptions is counted twice\n",
+ "        for description in descriptions:\n",
+ "            record_class_types.append(class_type)\n",
+ "\n",
+ "            if class_type == 'SomaticClinicalImpact':\n",
+ "                somatic_terms[description.text.lower()] += 1\n",
+ "                # Absent attributes are tallied under the empty string\n",
+ "                assertion_type = description.attrib.get('ClinicalImpactAssertionType', '')\n",
+ "                somatic_impact_assertion_types[assertion_type.lower()] += 1\n",
+ "                clinical_significance = description.attrib.get('ClinicalImpactClinicalSignificance', '')\n",
+ "                somatic_impact_clin_sigs[clinical_significance.lower()] += 1\n",
+ "            elif class_type == 'OncogenicityClassification':\n",
+ "                oncogenic_terms[description.text.lower()] += 1\n",
+ "            # Germline terms are assumed to be supported already, so they are not tallied\n",
+ "\n",
+ "    # Sorting makes the key independent of the order in which classifications appear\n",
+ "    class_combination = tuple(sorted(record_class_types))\n",
+ "    rcv_classifications[class_combination].append(record.accession)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "bfe50aa4-f046-43f1-8dd8-b6baddb689a1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Number of RCVs seen for each combination of classification types\n",
+ "rcv_class_counts = {combo: len(accessions)\n",
+ "                    for combo, accessions in rcv_classifications.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "17b68c76-3cc5-4501-aae0-16d248899962",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{('SomaticClinicalImpact',): 18,\n",
+ " ('OncogenicityClassification',): 442,\n",
+ " ('OncogenicityClassification', 'SomaticClinicalImpact'): 1,\n",
+ " ('GermlineClassification', 'OncogenicityClassification'): 40,\n",
+ " ('GermlineClassification', 'SomaticClinicalImpact'): 3,\n",
+ " ('GermlineClassification',\n",
+ " 'SomaticClinicalImpact',\n",
+ " 'SomaticClinicalImpact'): 1}"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rcv_class_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "a8440e1c-f281-4d6c-81d9-1006c502aed8",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'tier i - strong': 6,\n",
+ " 'tier iii - unknown': 9,\n",
+ " 'tier iv - benign/likely benign': 6,\n",
+ " 'tier ii - potential': 3})"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "somatic_terms"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "c110c5b7-4834-4b7b-bdc7-83e6453dfab9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'likely oncogenic': 305,\n",
+ " 'uncertain significance': 12,\n",
+ " 'oncogenic': 166})"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "oncogenic_terms"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "1961aab7-3ad9-46fd-b819-ecfafcd0eb74",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'prognostic': 4, '': 15, 'therapeutic': 2, 'diagnostic': 3})"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "somatic_impact_assertion_types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "27dc71de-de0d-48e3-b221-bfbe6d2a47d9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Counter({'poor outcome': 4,\n",
+ " '': 15,\n",
+ " 'sensitivity/response': 2,\n",
+ " 'supports diagnosis': 3})"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "somatic_impact_clin_sigs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "e0ad136c-6d03-422b-8f4a-3d71572ea0d0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['RCV000426735']"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rcv_classifications[('GermlineClassification',\n",
+ " 'SomaticClinicalImpact',\n",
+ " 'SomaticClinicalImpact')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "90137975-f7dc-4b1c-be2f-23c80946326d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8752475247524752"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rcv_class_counts[('OncogenicityClassification',)] / sum(rcv_class_counts.values())  # share of RCVs with only an oncogenicity classification"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "faf8a077-d606-493a-850c-96db2175a484",
+ "metadata": {},
+ "source": [
+ "Summary:\n",
+ "* All of the new values and fields are in use, to varying degrees\n",
+ "* Most records carry only an oncogenicity classification, so they have no assertion type or clinical significance attributes\n",
+ "* A fully future-proof implementation would support everything here, but simply adding the oncogenicity classification terms to the `clinicalSignificances` enum would cover the 442 of 505 dropped RCVs (~87.5%) that have only an oncogenicity classification (on the other hand, if we're not future-proofing, what's the point?)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7789c0d9-0d84-42c6-80fe-91eea184ea15",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "my-pyenv",
+ "language": "python",
+ "name": "my-pyenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}