From 3898435c904860587c479564c75da8a73d0e1d0a Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 14 Jan 2024 14:46:05 -0500
Subject: [PATCH 01/11] Implement new request function and one endpoint

---
 indra/databases/cbio_client.py | 92 +++++++++++++++-------------------
 1 file changed, 40 insertions(+), 52 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index cd309d496a..29e6631f23 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -1,66 +1,53 @@
-from __future__ import absolute_import, print_function, unicode_literals
-from builtins import dict, str
-import os
+"""This is a client for the cBioPortal web service, with
+documentation at https://docs.cbioportal.org/web-api-and-clients/
+and Swagger definition at https://www.cbioportal.org/api/v2/api-docs.
+Note that the client implements direct requests to the API instead of
+adding an additional dependency to do so.
+"""
+__all__ = ["get_mutations", "get_case_lists", "get_profile_data",
+           "get_num_sequenced", "get_genetic_profiles",
+           "get_cancer_studies", "get_cancer_types", "get_ccle_mutations",
+           "get_ccle_lines_for_mutation", "get_ccle_cna",
+           "get_ccle_mrna"]
+
 import pandas
 import logging
 import requests
-from collections import defaultdict
-# Python3
-try:
-    from functools import lru_cache
-    from io import StringIO
-# Python2
-except ImportError:
-    from functools32 import lru_cache
-    from StringIO import StringIO
 
 
 logger = logging.getLogger(__name__)
 
-cbio_url = 'http://www.cbioportal.org/webservice.do'
+cbio_url = 'https://www.cbioportal.org/api'
 ccle_study = 'cellline_ccle_broad'
 
+# TODO: implement caching with json_data made immutable
+def send_request(method, endpoint, json_data):
+    """Return the results of a web service request to cBio portal.
 
-@lru_cache(maxsize=10000)
-def send_request(**kwargs):
-    """Return a data frame from a web service request to cBio portal.
-
-    Sends a web service requrest to the cBio portal with arguments given in
-    the dictionary data and returns a Pandas data frame on success.
+    Sends a web service request to the cBio portal with a specific endpoint,
+    method, and JSON data structure, and returns the resulting JSON
+    data structure on success.
 
-    More information about the service here:
-    http://www.cbioportal.org/web_api.jsp
+    More information about the service is available here:
+    https://www.cbioportal.org/api/v2/api-docs
 
     Parameters
     ----------
-    kwargs : dict
-        A dict of parameters for the query. Entries map directly to web service
-        calls with the exception of the optional 'skiprows' entry, whose value
-        is used as the number of rows to skip when reading the result data
-        frame.
+    TODO
 
     Returns
     -------
-    df : pandas.DataFrame
-        Response from cBioPortal as a Pandas DataFrame.
+    JSON
+        The JSON object returned by the web service call.
     """
-    skiprows = kwargs.pop('skiprows', None)
-    res = requests.get(cbio_url, params=kwargs)
-    if res.status_code == 200:
-        # Adaptively skip rows based on number of comment lines
-        if skiprows == -1:
-            lines = res.text.split('\n')
-            skiprows = 0
-            for line in lines:
-                if line.startswith('#'):
-                    skiprows += 1
-                else:
-                    break
-        csv_StringIO = StringIO(res.text)
-        df = pandas.read_csv(csv_StringIO, sep='\t', skiprows=skiprows)
-        return df
-    else:
+    if endpoint.startswith('/'):
+        endpoint = endpoint[1:]
+    request_fun = getattr(requests, method)
+    res = request_fun(cbio_url + '/' + endpoint, json=json_data)
+    if res.status_code != 200:
         logger.error('Request returned with code %d' % res.status_code)
+        return
+    return res.json()
 
 
 def get_mutations(study_id, gene_list, mutation_type=None,
@@ -242,20 +229,21 @@ def get_genetic_profiles(study_id, profile_filter=None):
         - MRNA_EXPRESSION
         - METHYLATION
         The genetic profiles can include "mutation", "CNA", "rppa",
-        "methylation", etc.
+        "methylation", etc. The filter is case insensitive.
 
     Returns
     -------
     genetic_profiles : list[str]
         A list of genetic profiles available  for the given study.
     """
-    data = {'cmd': 'getGeneticProfiles',
-            'cancer_study_id': study_id}
-    df = send_request(**data)
-    res = _filter_data_frame(df, ['genetic_profile_id'],
-                             'genetic_alteration_type', profile_filter)
-    genetic_profiles = list(res['genetic_profile_id'].values())
-    return genetic_profiles
+    res = send_request('post', 'molecular-profiles/fetch',
+                       {'studyIds': [study_id]})
+    if profile_filter:
+        res = [prof for prof in res
+               if (profile_filter.casefold()
+                   in prof['molecularAlterationType'].casefold())]
+    profile_ids = [prof['molecularProfileId'] for prof in res]
+    return profile_ids
 
 
 def get_cancer_studies(study_filter=None):

From ecbc2bf1bbfb855fca65b83f0ca5a7ce2b91e034 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 14 Jan 2024 15:11:17 -0500
Subject: [PATCH 02/11] Simplify getting molecular profiles

---
 indra/databases/cbio_client.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 29e6631f23..46c84386b3 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -21,7 +21,7 @@
 ccle_study = 'cellline_ccle_broad'
 
 # TODO: implement caching with json_data made immutable
-def send_request(method, endpoint, json_data):
+def send_request(method, endpoint, json_data=None):
     """Return the results of a web service request to cBio portal.
 
     Sends a web service request to the cBio portal with a specific endpoint,
@@ -113,6 +113,8 @@ def get_case_lists(study_id):
         A dict keyed to cases containing a dict keyed to genes
         containing int
     """
+    res = send_request('post', 'sample-lists/fetch',
+                       {'studyIds': [study_id]})
     data = {'cmd': 'getCaseLists',
             'cancer_study_id': study_id}
     df = send_request(**data)
@@ -236,8 +238,7 @@ def get_genetic_profiles(study_id, profile_filter=None):
     genetic_profiles : list[str]
         A list of genetic profiles available  for the given study.
     """
-    res = send_request('post', 'molecular-profiles/fetch',
-                       {'studyIds': [study_id]})
+    res = send_request('get', f'studies/{study_id}/molecular-profiles')
     if profile_filter:
         res = [prof for prof in res
                if (profile_filter.casefold()

From b3491ef11f525daa59198a364f09f79d37962286 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 14 Jan 2024 15:57:39 -0500
Subject: [PATCH 03/11] Implement getting cancer studies

---
 indra/databases/cbio_client.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 46c84386b3..e3a79b4dc6 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -266,11 +266,11 @@ def get_cancer_studies(study_filter=None):
         of study IDs with paad in their name like "paad_icgc", "paad_tcga",
         etc.
     """
-    data = {'cmd': 'getCancerStudies'}
-    df = send_request(**data)
-    res = _filter_data_frame(df, ['cancer_study_id'],
-                             'cancer_study_id', study_filter)
-    study_ids = list(res['cancer_study_id'].values())
+    studies = send_request('get', 'studies')
+    if study_filter:
+        studies = [s for s in studies
+                   if study_filter.casefold() in s['studyId'].casefold()]
+    study_ids = [s['studyId'] for s in studies]
     return study_ids
 
 

From 3ab2c9b335e7d9a4f863dd88aa5cf5e71a4dea0a Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 18 Jan 2024 19:32:27 -0500
Subject: [PATCH 04/11] Start implementing getting mutations

---
 indra/databases/cbio_client.py | 40 +++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index e3a79b4dc6..d5dfe58c8d 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -14,12 +14,15 @@
 import logging
 import requests
 
+from indra.databases import hgnc_client
+
 
 logger = logging.getLogger(__name__)
 
 cbio_url = 'https://www.cbioportal.org/api'
 ccle_study = 'cellline_ccle_broad'
 
+
 # TODO: implement caching with json_data made immutable
 def send_request(method, endpoint, json_data=None):
     """Return the results of a web service request to cBio portal.
@@ -43,14 +46,15 @@ def send_request(method, endpoint, json_data=None):
     if endpoint.startswith('/'):
         endpoint = endpoint[1:]
     request_fun = getattr(requests, method)
-    res = request_fun(cbio_url + '/' + endpoint, json=json_data)
+    res = request_fun(cbio_url + '/' + endpoint, json=json_data or {})
     if res.status_code != 200:
-        logger.error('Request returned with code %d' % res.status_code)
+        logger.error(f'Request returned with code {res.status_code}: '
+                     f'{res.text}')
         return
     return res.json()
 
 
-def get_mutations(study_id, gene_list, mutation_type=None,
+def get_mutations(study_id, gene_list=None, mutation_type=None,
                   case_id=None):
     """Return mutations as a list of genes and list of amino acid changes.
 
@@ -76,7 +80,26 @@ def get_mutations(study_id, gene_list, mutation_type=None,
         the second one a list of amino acid changes in those genes.
     """
     genetic_profile = get_genetic_profiles(study_id, 'mutation')[0]
-    gene_list_str = ','.join(gene_list)
+    breakpoint()
+
+    if gene_list:
+        hgnc_mappings = {g: hgnc_client.get_hgnc_id(g) for g in gene_list}
+        entrez_mappings = {hgnc_mappings[g]:
+                               hgnc_client.get_entrez_id(hgnc_mappings[g])
+                           for g in gene_list if hgnc_mappings[g] is not None}
+        entrez_ids = [e for e in entrez_mappings.values() if e is not None]
+    else:
+        entrez_ids = None
+
+    json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None
+
+    # FIXME: ERROR: [2024-01-14 17:18:15] indra.databases.cbio_client -
+    #  Request returned with code 400: {"message":"eitherSampleListIdOrSampleIdsPresent must be true"}
+    muts = send_request('post', f'molecular-profiles/{genetic_profile}/'
+                                f'mutations/fetch', json_data)
+    return muts
+#    if mutation_type:
+
 
     data = {'cmd': 'getMutationData',
             'case_set_id': study_id,
@@ -291,10 +314,11 @@ def get_cancer_types(cancer_filter=None):
         Example: for cancer_filter="pancreatic", the result includes
         "panet" (neuro-endocrine) and "paad" (adenocarcinoma)
     """
-    data = {'cmd': 'getTypesOfCancer'}
-    df = send_request(**data)
-    res = _filter_data_frame(df, ['type_of_cancer_id'], 'name', cancer_filter)
-    type_ids = list(res['type_of_cancer_id'].values())
+    cancer_types = send_request('get', 'cancer-types')
+    if cancer_filter:
+        cancer_types = [c for c in cancer_types
+                        if cancer_filter.casefold() in c['name'].casefold()]
+    type_ids = [c['cancerTypeId'] for c in cancer_types]
     return type_ids
 
 

From adb47cd482efaf7199c58bae4361abee8f2ccc3c Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 20 Jan 2024 18:30:11 -0500
Subject: [PATCH 05/11] Implement handling Entrez mappings and profile data

---
 indra/databases/cbio_client.py | 87 +++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index d5dfe58c8d..4d8b1afb79 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -46,7 +46,10 @@ def send_request(method, endpoint, json_data=None):
     if endpoint.startswith('/'):
         endpoint = endpoint[1:]
     request_fun = getattr(requests, method)
-    res = request_fun(cbio_url + '/' + endpoint, json=json_data or {})
+    full_url = cbio_url + '/' + endpoint
+    print('URL: %s' % full_url)
+    print('JSON: %s' % json_data)
+    res = request_fun(full_url, json=json_data or {})
     if res.status_code != 200:
         logger.error(f'Request returned with code {res.status_code}: '
                      f'{res.text}')
@@ -82,14 +85,7 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
     genetic_profile = get_genetic_profiles(study_id, 'mutation')[0]
     breakpoint()
 
-    if gene_list:
-        hgnc_mappings = {g: hgnc_client.get_hgnc_id(g) for g in gene_list}
-        entrez_mappings = {hgnc_mappings[g]:
-                               hgnc_client.get_entrez_id(hgnc_mappings[g])
-                           for g in gene_list if hgnc_mappings[g] is not None}
-        entrez_ids = [e for e in entrez_mappings.values() if e is not None]
-    else:
-        entrez_ids = None
+    entrez_ids = get_entrez_mappings(gene_list)
 
     json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None
 
@@ -116,9 +112,28 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
     return mutations
 
 
+def get_entrez_mappings(gene_list):
+    if gene_list:
+        # First we need to get HGNC IDs from HGNC symbols
+        hgnc_mappings = {g: hgnc_client.get_hgnc_id(g) for g in gene_list}
+        # Next, we map from HGNC symbols to Entrez IDs via the hgnc_mappings
+        entrez_mappings = {g: hgnc_client.get_entrez_id(hgnc_mappings[g])
+                           for g in gene_list if hgnc_mappings[g] is not None}
+        # Finally, we reverse the mapping, this will ensure that
+        # we can get the gene symbols back when generating results
+        entrez_to_gene_symbol = {v: k for k, v in entrez_mappings.items()
+                                 if v is not None and k is not None}
+    else:
+        entrez_to_gene_symbol = {}
+    return entrez_to_gene_symbol
+
+
 def get_case_lists(study_id):
     """Return a list of the case set ids for a particular study.
 
+    In v2 of the API these are called sample lists.
+
+    Old comment:
     TAKE NOTE the "case_list_id" are the same thing as "case_set_id"
     Within the data, this string is referred to as a "case_list_id".
     Within API calls it is referred to as a 'case_set_id'.
@@ -132,17 +147,12 @@ def get_case_lists(study_id):
 
     Returns
     -------
-    case_set_ids : dict[dict[int]]
-        A dict keyed to cases containing a dict keyed to genes
-        containing int
+    case_set_ids : list[str]
+        A list of case set IDs, e.g., ['cellline_ccle_broad_all',
+        'cellline_ccle_broad_cna', ...]
     """
-    res = send_request('post', 'sample-lists/fetch',
-                       {'studyIds': [study_id]})
-    data = {'cmd': 'getCaseLists',
-            'cancer_study_id': study_id}
-    df = send_request(**data)
-    case_set_ids = df['case_list_id'].tolist()
-    return case_set_ids
+    res = send_request('get', f'studies/{study_id}/sample-lists')
+    return [sl['sampleListId'] for sl in res]
 
 
 def get_profile_data(study_id, gene_list,
@@ -165,7 +175,7 @@ def get_profile_data(study_id, gene_list,
         - MRNA_EXPRESSION
         - METHYLATION
     case_set_filter : Optional[str]
-        A string that specifices which case_set_id to use, based on a complete
+        A string that specifies which case_set_id to use, based on a complete
         or partial match. If not provided, will look for study_id + '_all'
 
     Returns
@@ -179,29 +189,30 @@ def get_profile_data(study_id, gene_list,
         genetic_profile = genetic_profiles[0]
     else:
         return {}
-    gene_list_str = ','.join(gene_list)
     case_set_ids = get_case_lists(study_id)
     if case_set_filter:
         case_set_id = [x for x in case_set_ids if case_set_filter in x][0]
     else:
-        case_set_id = study_id + '_all'
         # based on looking at the cBioPortal, this is a common case_set_id
-    data = {'cmd': 'getProfileData',
-            'case_set_id': case_set_id,
-            'genetic_profile_id': genetic_profile,
-            'gene_list': gene_list_str,
-            'skiprows': -1}
-    df = send_request(**data)
-    case_list_df = [x for x in df.columns.tolist()
-                    if x not in ['GENE_ID', 'COMMON']]
-    profile_data = {case: {g: None for g in gene_list}
-                    for case in case_list_df}
-    for case in case_list_df:
-        profile_values = df[case].tolist()
-        df_gene_list = df['COMMON'].tolist()
-        for g, cv in zip(df_gene_list, profile_values):
-            if not pandas.isnull(cv):
-                profile_data[case][g] = cv
+        case_set_id = study_id + '_all'
+    entrez_to_gene_symbol = get_entrez_mappings(gene_list)
+    entrez_ids = list(entrez_to_gene_symbol)
+    res = send_request('post', f'molecular-profiles/{genetic_profile}/'
+                               f'molecular-data/fetch',
+                       {'sampleListId': case_set_id,
+                        'entrezGeneIds': entrez_ids})
+
+    profile_data = {}
+    # Each entry in the results contains something like
+    # {'entrezGeneId': 673, 'molecularProfileId': 'cellline_ccle_broad_cna',
+    #  'sampleId': '1321N1_CENTRAL_NERVOUS_SYSTEM',
+    #  'studyId': 'cellline_ccle_broad', 'value': 1, ...}
+    for sample in res:
+        sample_id = sample['sampleId']
+        if sample_id not in profile_data:
+            profile_data[sample_id] = {}
+        gene_symbol = entrez_to_gene_symbol[str(sample['entrezGeneId'])]
+        profile_data[sample_id][gene_symbol] = sample['value']
     return profile_data
 
 

From 6eafa341ff87c5a66223bed0e1fbee2864e946f2 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 20 Jan 2024 18:50:47 -0500
Subject: [PATCH 06/11] Implement further endpoints using new API

---
 indra/databases/cbio_client.py | 67 +++++++++++++++-------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 4d8b1afb79..2641e7b751 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -83,9 +83,9 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
         the second one a list of amino acid changes in those genes.
     """
     genetic_profile = get_genetic_profiles(study_id, 'mutation')[0]
-    breakpoint()
 
-    entrez_ids = get_entrez_mappings(gene_list)
+    entrez_to_gene_symbol = get_entrez_mappings(gene_list)
+    entrez_ids = list(entrez_to_gene_symbol)
 
     json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None
 
@@ -233,13 +233,16 @@ def get_num_sequenced(study_id):
     num_case : int
         The number of sequenced tumors in the given study
     """
-    data = {'cmd': 'getCaseLists',
-            'cancer_study_id': study_id}
-    df = send_request(**data)
-    if df.empty:
-        return 0
-    row_filter = df['case_list_id'].str.contains('sequenced', case=False)
-    num_case = len(df[row_filter]['case_ids'].tolist()[0].split(' '))
+    # First we get all the case lists for the study
+    case_lists = get_case_lists(study_id)
+    # Then we find ones that have 'sequenced' in the name
+    sequencing_case_list = [cl for cl in case_lists if 'sequenced' in cl]
+    # Then we look at the sample IDs and count them
+    cases = set()
+    for cl in sequencing_case_list:
+        res = send_request('get', f'/sample-lists/{cl}/sample-ids')
+        cases |= set(res)
+    num_case = len(cases)
     return num_case
 
 
@@ -401,7 +404,7 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change):
     return cell_lines
 
 
-def get_ccle_cna(gene_list, cell_lines):
+def get_ccle_cna(gene_list, cell_lines=None):
     """Return a dict of CNAs in given genes and cell lines from CCLE.
 
     CNA values correspond to the following alterations
@@ -420,7 +423,7 @@ def get_ccle_cna(gene_list, cell_lines):
     ----------
     gene_list : list[str]
         A list of HGNC gene symbols to get mutations in
-    cell_lines : list[str]
+    cell_lines : Optional[list[str]]
         A list of CCLE cell line names to get mutations for.
 
     Returns
@@ -431,19 +434,18 @@ def get_ccle_cna(gene_list, cell_lines):
     """
     profile_data = get_profile_data(ccle_study, gene_list,
                                     'COPY_NUMBER_ALTERATION', 'all')
-    profile_data = dict((key, value) for key, value in profile_data.items()
-                        if key in cell_lines)
-    return profile_data
+    return {cell_line: value for cell_line, value in profile_data.items()
+            if cell_lines is None or cell_line in cell_lines}
 
 
-def get_ccle_mrna(gene_list, cell_lines):
+def get_ccle_mrna(gene_list, cell_lines=None):
     """Return a dict of mRNA amounts in given genes and cell lines from CCLE.
 
     Parameters
     ----------
     gene_list : list[str]
         A list of HGNC gene symbols to get mRNA amounts for.
-    cell_lines : list[str]
+    cell_lines : Optional[list[str]]
         A list of CCLE cell line names to get mRNA amounts for.
 
     Returns
@@ -452,27 +454,18 @@ def get_ccle_mrna(gene_list, cell_lines):
         A dict keyed to cell lines containing a dict keyed to genes
         containing float
     """
-    gene_list_str = ','.join(gene_list)
-    data = {'cmd': 'getProfileData',
-            'case_set_id': ccle_study + '_mrna',
-            'genetic_profile_id': ccle_study + '_mrna',
-            'gene_list': gene_list_str,
-            'skiprows': -1}
-    df = send_request(**data)
-    mrna_amounts = {cl: {g: [] for g in gene_list} for cl in cell_lines}
-    for cell_line in cell_lines:
-        if cell_line in df.columns:
-            for gene in gene_list:
-                value_cell = df[cell_line][df['COMMON'] == gene]
-                if value_cell.empty:
-                    mrna_amounts[cell_line][gene] = None
-                elif pandas.isnull(value_cell.values[0]):
-                    mrna_amounts[cell_line][gene] = None
-                else:
-                    value = value_cell.values[0]
-                    mrna_amounts[cell_line][gene] = value
-        else:
-            mrna_amounts[cell_line] = None
+    profile_data = get_profile_data(ccle_study, gene_list,
+                                    'MRNA_EXPRESSION', 'all')
+    mrna_amounts = {cell_line: value
+                    for cell_line, value in profile_data.items()
+                    if cell_lines is None or cell_line in cell_lines}
+    # If cell_lines was specified, make sure every requested cell line
+    # appears in the result, mapped to None when no data was returned
+    # for it. This matches the old behavior of the function.
+    if cell_lines:
+        for cell_line in cell_lines:
+            if cell_line not in mrna_amounts:
+                mrna_amounts[cell_line] = None
     return mrna_amounts
 
 

From 3b3b2fbf290a838898ca9c6a5a5002b1e1b7818b Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 21 Jan 2024 16:42:00 -0500
Subject: [PATCH 07/11] Implement getting mutations

---
 indra/databases/cbio_client.py | 62 ++++++++++++++++------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 2641e7b751..0e5528e8cb 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -78,38 +78,38 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
 
     Returns
     -------
-    mutations : tuple[list]
-        A tuple of two lists, the first one containing a list of genes, and
-        the second one a list of amino acid changes in those genes.
+    mutations : dict
+        A dict of parallel lists keyed by 'gene_symbol',
+        'amino_acid_change' and 'sample_id', one entry per mutation.
     """
     genetic_profile = get_genetic_profiles(study_id, 'mutation')[0]
 
     entrez_to_gene_symbol = get_entrez_mappings(gene_list)
     entrez_ids = list(entrez_to_gene_symbol)
 
-    json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None
-
-    # FIXME: ERROR: [2024-01-14 17:18:15] indra.databases.cbio_client -
-    #  Request returned with code 400: {"message":"eitherSampleListIdOrSampleIdsPresent must be true"}
-    muts = send_request('post', f'molecular-profiles/{genetic_profile}/'
-                                f'mutations/fetch', json_data)
-    return muts
-#    if mutation_type:
+    # Does this need to be parameterized?
+    case_set_id = study_id + '_all'
 
+    mutations = send_request('post',
+                             f'molecular-profiles/{genetic_profile}/'
+                             f'mutations/fetch',
+                             {'sampleListId': case_set_id,
+                              'entrezGeneIds': entrez_ids})
 
-    data = {'cmd': 'getMutationData',
-            'case_set_id': study_id,
-            'genetic_profile_id': genetic_profile,
-            'gene_list': gene_list_str,
-            'skiprows': -1}
-    df = send_request(**data)
     if case_id:
-        df = df[df['case_id'] == case_id]
-    res = _filter_data_frame(df, ['gene_symbol', 'amino_acid_change'],
-                             'mutation_type', mutation_type)
-    mutations = {'gene_symbol': list(res['gene_symbol'].values()),
-                 'amino_acid_change': list(res['amino_acid_change'].values())}
-    return mutations
+        mutations = [m for m in mutations if m['case_id'] == case_id]
+
+    if mutation_type:
+        mutations = [m for m in mutations if (mutation_type.casefold()
+                                              in m['mutationType'].casefold())]
+
+    mutations_dict = {
+        'gene_symbol': [entrez_to_gene_symbol[str(m['entrezGeneId'])]
+                        for m in mutations],
+        'amino_acid_change': [m['proteinChange'] for m in mutations],
+        'sample_id': [m['sampleId'] for m in mutations],
+    }
+    return mutations_dict
 
 
 def get_entrez_mappings(gene_list):
@@ -254,6 +254,8 @@ def get_genetic_profiles(study_id, profile_filter=None):
     'cellline_ccle_broad_mutations' for mutations, 'cellline_ccle_broad_CNA'
     for copy number alterations, etc.
 
+    NOTE: In the v2 API, the genetic profiles are called molecular profiles.
+
     Parameters
     ----------
     study_id : str
@@ -393,15 +395,11 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change):
     cell_lines : list
         A list of CCLE cell lines in which the given mutation occurs.
     """
-    data = {'cmd': 'getMutationData',
-            'case_set_id': ccle_study,
-            'genetic_profile_id': ccle_study + '_mutations',
-            'gene_list': gene,
-            'skiprows': 1}
-    df = send_request(**data)
-    df = df[df['amino_acid_change'] == amino_acid_change]
-    cell_lines = df['case_id'].unique().tolist()
-    return cell_lines
+    mutations = get_mutations(ccle_study, [gene], 'missense')
+    cell_lines = {cl for aac, cl
+                  in zip(mutations['amino_acid_change'], mutations['sample_id'])
+                  if aac == amino_acid_change}
+    return sorted(cell_lines)
 
 
 def get_ccle_cna(gene_list, cell_lines=None):

From beb3a0818b8baf71794d064503612c69e441ea96 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 21 Jan 2024 16:45:36 -0500
Subject: [PATCH 08/11] Improve docs and clean up

---
 indra/databases/cbio_client.py | 77 ++++++++++------------------------
 1 file changed, 21 insertions(+), 56 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 0e5528e8cb..0ef44772c1 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -10,10 +10,9 @@
            "get_ccle_lines_for_mutation", "get_ccle_cna",
            "get_ccle_mrna"]
 
-import pandas
 import logging
 import requests
-
+from functools import lru_cache
 from indra.databases import hgnc_client
 
 
@@ -36,7 +35,14 @@ def send_request(method, endpoint, json_data=None):
 
     Parameters
     ----------
-    TODO
+    method : str
+        The HTTP method to use for the request.
+        Example: 'get' or 'post'
+    endpoint : str
+        The endpoint to use for the request.
+        Example: 'studies'
+    json_data : Optional[Dict]
+        The dict-like JSON data structure to send with the request.
 
     Returns
     -------
@@ -57,6 +63,7 @@ def send_request(method, endpoint, json_data=None):
     return res.json()
 
 
+@lru_cache(maxsize=1000)
 def get_mutations(study_id, gene_list=None, mutation_type=None,
                   case_id=None):
     """Return mutations as a list of genes and list of amino acid changes.
@@ -112,6 +119,7 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
     return mutations_dict
 
 
+@lru_cache(maxsize=1000)
 def get_entrez_mappings(gene_list):
     if gene_list:
         # First we need to get HGNC IDs from HGNC symbols
@@ -128,6 +136,7 @@ def get_entrez_mappings(gene_list):
     return entrez_to_gene_symbol
 
 
+@lru_cache(maxsize=1000)
 def get_case_lists(study_id):
     """Return a list of the case set ids for a particular study.
 
@@ -155,6 +164,7 @@ def get_case_lists(study_id):
     return [sl['sampleListId'] for sl in res]
 
 
+@lru_cache(maxsize=1000)
 def get_profile_data(study_id, gene_list,
                      profile_filter, case_set_filter=None):
     """Return dict of cases and genes and their respective values.
@@ -216,6 +226,7 @@ def get_profile_data(study_id, gene_list,
     return profile_data
 
 
+@lru_cache(maxsize=1000)
 def get_num_sequenced(study_id):
     """Return number of sequenced tumors for given study.
 
@@ -246,6 +257,7 @@ def get_num_sequenced(study_id):
     return num_case
 
 
+@lru_cache(maxsize=1000)
 def get_genetic_profiles(study_id, profile_filter=None):
     """Return all the genetic profiles (data sets) for a given study.
 
@@ -286,6 +298,7 @@ def get_genetic_profiles(study_id, profile_filter=None):
     return profile_ids
 
 
+@lru_cache(maxsize=1000)
 def get_cancer_studies(study_filter=None):
     """Return a list of cancer study identifiers, optionally filtered.
 
@@ -313,6 +326,7 @@ def get_cancer_studies(study_filter=None):
     return study_ids
 
 
+@lru_cache(maxsize=1000)
 def get_cancer_types(cancer_filter=None):
     """Return a list of cancer types, optionally filtered.
 
@@ -338,6 +352,7 @@ def get_cancer_types(cancer_filter=None):
     return type_ids
 
 
+@lru_cache(maxsize=1000)
 def get_ccle_mutations(gene_list, cell_lines, mutation_type=None):
     """Return a dict of mutations in given genes and cell lines from CCLE.
 
@@ -376,6 +391,7 @@ def get_ccle_mutations(gene_list, cell_lines, mutation_type=None):
     return mutations
 
 
+@lru_cache(maxsize=1000)
 def get_ccle_lines_for_mutation(gene, amino_acid_change):
     """Return cell lines with a given point mutation in a given gene.
 
@@ -402,6 +418,7 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change):
     return sorted(cell_lines)
 
 
+@lru_cache(maxsize=1000)
 def get_ccle_cna(gene_list, cell_lines=None):
     """Return a dict of CNAs in given genes and cell lines from CCLE.
 
@@ -436,6 +453,7 @@ def get_ccle_cna(gene_list, cell_lines=None):
             if cell_lines is None or cell_line in cell_lines}
 
 
+@lru_cache(maxsize=1000)
 def get_ccle_mrna(gene_list, cell_lines=None):
     """Return a dict of mRNA amounts in given genes and cell lines from CCLE.
 
@@ -465,56 +483,3 @@ def get_ccle_mrna(gene_list, cell_lines=None):
             if cell_line not in mrna_amounts:
                 mrna_amounts[cell_line] = None
     return mrna_amounts
-
-
-def _filter_data_frame(df, data_col, filter_col, filter_str=None):
-    """Return a filtered data frame as a dictionary."""
-    if filter_str is not None:
-        relevant_cols = data_col + [filter_col]
-        df.dropna(inplace=True, subset=relevant_cols)
-        row_filter = df[filter_col].str.contains(filter_str, case=False)
-        data_list = df[row_filter][data_col].to_dict()
-    else:
-        data_list = df[data_col].to_dict()
-    return data_list
-
-
-# Deactivate this section for the time being, can be reinstated
-# once these are fully integrated
-'''
-
-def _read_ccle_cna():
-    fname = os.path.dirname(os.path.abspath(__file__)) + \
-        '/../../data/ccle_CNA.txt'
-    try:
-        df = pandas.read_csv(fname, sep='\t')
-    except Exception:
-        df = None
-    return df
-
-ccle_cna_df = _read_ccle_cna()
-
-
-def _read_ccle_mrna():
-    fname = os.path.dirname(os.path.abspath(__file__)) + \
-        '/../../data/ccle_expression_median.txt'
-    try:
-        df = pandas.read_csv(fname, sep='\t')
-    except Exception:
-        df = None
-    return df
-
-ccle_mrna_df = _read_ccle_mrna()
-
-
-def _read_ccle_mutations():
-    fname = os.path.dirname(os.path.abspath(__file__)) + \
-        '/../../data/ccle_mutations_extended.txt'
-    try:
-        df = pandas.read_csv(fname, sep='\t', skiprows=2)
-    except Exception:
-        df = None
-    return df
-
-ccle_mutations_df = _read_ccle_mutations()
-'''

From 7ae40d1793837f333c5d69bcd7f696d6f6794538 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 21 Jan 2024 17:00:09 -0500
Subject: [PATCH 09/11] Remove cache and fix key

---
 indra/databases/cbio_client.py  | 19 ++++++-------------
 indra/tests/test_cbio_client.py | 12 ------------
 2 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 0ef44772c1..cd6c338979 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -63,7 +63,6 @@ def send_request(method, endpoint, json_data=None):
     return res.json()
 
 
-@lru_cache(maxsize=1000)
 def get_mutations(study_id, gene_list=None, mutation_type=None,
                   case_id=None):
     """Return mutations as a list of genes and list of amino acid changes.
@@ -104,7 +103,7 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
                               'entrezGeneIds': entrez_ids})
 
     if case_id:
-        mutations = [m for m in mutations if m['case_id'] == case_id]
+        mutations = [m for m in mutations if m['sampleId'] == case_id]
 
     if mutation_type:
         mutations = [m for m in mutations if (mutation_type.casefold()
@@ -119,7 +118,6 @@ def get_mutations(study_id, gene_list=None, mutation_type=None,
     return mutations_dict
 
 
-@lru_cache(maxsize=1000)
 def get_entrez_mappings(gene_list):
     if gene_list:
         # First we need to get HGNC IDs from HGNC symbols
@@ -136,7 +134,6 @@ def get_entrez_mappings(gene_list):
     return entrez_to_gene_symbol
 
 
-@lru_cache(maxsize=1000)
 def get_case_lists(study_id):
     """Return a list of the case set ids for a particular study.
 
@@ -164,7 +161,6 @@ def get_case_lists(study_id):
     return [sl['sampleListId'] for sl in res]
 
 
-@lru_cache(maxsize=1000)
 def get_profile_data(study_id, gene_list,
                      profile_filter, case_set_filter=None):
     """Return dict of cases and genes and their respective values.
@@ -226,7 +222,6 @@ def get_profile_data(study_id, gene_list,
     return profile_data
 
 
-@lru_cache(maxsize=1000)
 def get_num_sequenced(study_id):
     """Return number of sequenced tumors for given study.
 
@@ -257,7 +252,6 @@ def get_num_sequenced(study_id):
     return num_case
 
 
-@lru_cache(maxsize=1000)
 def get_genetic_profiles(study_id, profile_filter=None):
     """Return all the genetic profiles (data sets) for a given study.
 
@@ -298,7 +292,6 @@ def get_genetic_profiles(study_id, profile_filter=None):
     return profile_ids
 
 
-@lru_cache(maxsize=1000)
 def get_cancer_studies(study_filter=None):
     """Return a list of cancer study identifiers, optionally filtered.
 
@@ -326,7 +319,6 @@ def get_cancer_studies(study_filter=None):
     return study_ids
 
 
-@lru_cache(maxsize=1000)
 def get_cancer_types(cancer_filter=None):
     """Return a list of cancer types, optionally filtered.
 
@@ -352,7 +344,6 @@ def get_cancer_types(cancer_filter=None):
     return type_ids
 
 
-@lru_cache(maxsize=1000)
 def get_ccle_mutations(gene_list, cell_lines, mutation_type=None):
     """Return a dict of mutations in given genes and cell lines from CCLE.
 
@@ -391,7 +382,6 @@ def get_ccle_mutations(gene_list, cell_lines, mutation_type=None):
     return mutations
 
 
-@lru_cache(maxsize=1000)
 def get_ccle_lines_for_mutation(gene, amino_acid_change):
     """Return cell lines with a given point mutation in a given gene.
 
@@ -418,7 +408,6 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change):
     return sorted(cell_lines)
 
 
-@lru_cache(maxsize=1000)
 def get_ccle_cna(gene_list, cell_lines=None):
     """Return a dict of CNAs in given genes and cell lines from CCLE.
 
@@ -453,7 +442,6 @@ def get_ccle_cna(gene_list, cell_lines=None):
             if cell_lines is None or cell_line in cell_lines}
 
 
-@lru_cache(maxsize=1000)
 def get_ccle_mrna(gene_list, cell_lines=None):
     """Return a dict of mRNA amounts in given genes and cell lines from CCLE.
 
@@ -472,6 +460,11 @@ def get_ccle_mrna(gene_list, cell_lines=None):
     """
     profile_data = get_profile_data(ccle_study, gene_list,
                                     'MRNA_EXPRESSION', 'all')
+    # FIXME: we need a data structure like this
+    #         assert mrna['A375_SKIN'] is not None
+    #         assert mrna['A375_SKIN']['MAP2K1'] > 10
+    # >       assert mrna['A375_SKIN']['XYZ'] is None
+    # E       KeyError: 'XYZ'
     mrna_amounts = {cell_line: value
                     for cell_line, value in profile_data.items()
                     if cell_lines is None or cell_line in cell_lines}
diff --git a/indra/tests/test_cbio_client.py b/indra/tests/test_cbio_client.py
index 38a5f4f645..489f189a13 100644
--- a/indra/tests/test_cbio_client.py
+++ b/indra/tests/test_cbio_client.py
@@ -30,18 +30,6 @@ def test_get_num_sequenced():
     assert num_case > 0
 
 
-@pytest.mark.webservice
-def test_send_request_ccle():
-    """Sends a request and gets back a dataframe of all cases in ccle study.
-
-    Check that the dataframe is longer than one.
-    """
-    data = {'cmd': 'getCaseLists',
-            'cancer_study_id': 'cellline_ccle_broad'}
-    df = cbio_client.send_request(**data)
-    assert len(df) > 0
-
-
 @pytest.mark.webservice
 def test_get_ccle_lines_for_mutation():
     """Check how many lines have BRAF V600E mutations.

From 7120143a703ac5ef3930f14ed8b8472ccca45b36 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 21 Jan 2024 19:18:15 -0500
Subject: [PATCH 10/11] Fix CNA calculation for backward compatibility

---
 indra/databases/cbio_client.py  |  9 ++++-----
 indra/tests/test_cbio_client.py | 10 ++++------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index cd6c338979..8dc3e5326b 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -460,11 +460,6 @@ def get_ccle_mrna(gene_list, cell_lines=None):
     """
     profile_data = get_profile_data(ccle_study, gene_list,
                                     'MRNA_EXPRESSION', 'all')
-    # FIXME: we need a data structure like this
-    #         assert mrna['A375_SKIN'] is not None
-    #         assert mrna['A375_SKIN']['MAP2K1'] > 10
-    # >       assert mrna['A375_SKIN']['XYZ'] is None
-    # E       KeyError: 'XYZ'
     mrna_amounts = {cell_line: value
                     for cell_line, value in profile_data.items()
                     if cell_lines is None or cell_line in cell_lines}
@@ -475,4 +470,8 @@ def get_ccle_mrna(gene_list, cell_lines=None):
         for cell_line in cell_lines:
             if cell_line not in mrna_amounts:
                 mrna_amounts[cell_line] = None
+            else:
+                for gene in gene_list:
+                    if gene not in mrna_amounts[cell_line]:
+                        mrna_amounts[cell_line][gene] = None
     return mrna_amounts
diff --git a/indra/tests/test_cbio_client.py b/indra/tests/test_cbio_client.py
index 489f189a13..dd22efda5f 100644
--- a/indra/tests/test_cbio_client.py
+++ b/indra/tests/test_cbio_client.py
@@ -94,7 +94,7 @@ def test_get_ccle_mrna():
 def test_get_ccle_cna_big():
     """
     Get the CNA data on 124 genes in 4 cell lines. Expect to have CNA values
-    that are [None, -2.0, -1.0, 0.0, 1.0, 2.0] . This tests the function at
+    that are {-2.0, -1.0, 0.0, 1.0, 2.0}. This tests the function at
     a greater scale. Also, test the cell lines' BRAF CNAs
     """
     genes = ["FOSL1", "GRB2", "RPS6KA3", "EIF4EBP1", "DUSP1", "PLXNB1", "SHC2",
@@ -117,13 +117,11 @@ def test_get_ccle_cna_big():
              "PAK1", "RHEB"]
     cell_lines = ['COLO679_SKIN', 'A2058_SKIN', 'IGR39_SKIN', 'HS294T_SKIN']
     cna = cbio_client.get_ccle_cna(genes, cell_lines)
-    values = []
+    values = set()
     for cl in cna:
         for g in cna[cl]:
-            val = cna[cl][g]
-            values.append(val)
-    values = list(set(values))
-    assert len(values) == 6
+            values.add(cna[cl][g])
+    assert values == {-2.0, -1.0, 0.0, 1.0, 2.0}
     assert cna['COLO679_SKIN']['BRAF'] == 2
     assert cna['A2058_SKIN']['BRAF'] == 1
     assert cna['IGR39_SKIN']['BRAF'] == 1

From 09ea5183e8e61f3d93851d187d16185647e0f52e Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 21 Jan 2024 19:31:30 -0500
Subject: [PATCH 11/11] Implement caching and improve docs

---
 indra/databases/cbio_client.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py
index 8dc3e5326b..9c163d9a29 100644
--- a/indra/databases/cbio_client.py
+++ b/indra/databases/cbio_client.py
@@ -10,6 +10,7 @@
            "get_ccle_lines_for_mutation", "get_ccle_cna",
            "get_ccle_mrna"]
 
+import json
 import logging
 import requests
 from functools import lru_cache
@@ -22,7 +23,6 @@
 ccle_study = 'cellline_ccle_broad'
 
 
-# TODO: implement caching with json_data made immutable
 def send_request(method, endpoint, json_data=None):
     """Return the results of a web service request to cBio portal.
 
@@ -49,13 +49,20 @@ def send_request(method, endpoint, json_data=None):
     JSON
         The JSON object returned by the web service call.
     """
+    json_data_str = json.dumps(json_data) if json_data else None
+    res = _send_request_cached(method, endpoint, json_data_str)
+    return res
+
+
+@lru_cache(maxsize=1000)
+def _send_request_cached(method, endpoint, json_data_str=None):
+    """Run the actual request to the web service, with results cached."""
     if endpoint.startswith('/'):
         endpoint = endpoint[1:]
+    json_data = json.loads(json_data_str) if json_data_str else {}
     request_fun = getattr(requests, method)
     full_url = cbio_url + '/' + endpoint
-    print('URL: %s' % full_url)
-    print('JSON: %s' % json_data)
-    res = request_fun(full_url, json=json_data or {})
+    res = request_fun(full_url, json=json_data)
     if res.status_code != 200:
         logger.error(f'Request returned with code {res.status_code}: '
                      f'{res.text}')
@@ -139,12 +146,6 @@ def get_case_lists(study_id):
 
     In v2 of the API these are called sample lists.
 
-    Old comment:
-    TAKE NOTE the "case_list_id" are the same thing as "case_set_id"
-    Within the data, this string is referred to as a "case_list_id".
-    Within API calls it is referred to as a 'case_set_id'.
-    The documentation does not make this explicitly clear.
-
     Parameters
     ----------
     study_id : str
@@ -161,8 +162,8 @@ def get_case_lists(study_id):
     return [sl['sampleListId'] for sl in res]
 
 
-def get_profile_data(study_id, gene_list,
-                     profile_filter, case_set_filter=None):
+def get_profile_data(study_id, gene_list, profile_filter,
+                     case_set_filter=None):
     """Return dict of cases and genes and their respective values.
 
     Parameters
@@ -187,8 +188,9 @@ def get_profile_data(study_id, gene_list,
     Returns
     -------
     profile_data : dict[dict[int]]
-        A dict keyed to cases containing a dict keyed to genes
-        containing int
+        A dict keyed by cases (cell lines if using CCLE), each in turn
+        containing a dict keyed by genes, with values corresponding to
+        the given profile (e.g., CNA, mutations).
     """
     genetic_profiles = get_genetic_profiles(study_id, profile_filter)
     if genetic_profiles: