From 3898435c904860587c479564c75da8a73d0e1d0a Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 14 Jan 2024 14:46:05 -0500 Subject: [PATCH 01/11] Implement new request function and one endpoint --- indra/databases/cbio_client.py | 92 +++++++++++++++------------------- 1 file changed, 40 insertions(+), 52 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index cd309d496a..29e6631f23 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -1,66 +1,53 @@ -from __future__ import absolute_import, print_function, unicode_literals -from builtins import dict, str -import os +"""This is a client for the cBioPortal web service, with +documentation at https://docs.cbioportal.org/web-api-and-clients/ +and Swagger definition at https://www.cbioportal.org/api/v2/api-docs. +Note that the client implements direct requests to the API instead of +adding an additional dependency to do so. +""" +__all__ = ["get_mutations", "get_case_lists", "get_profile_data", + "get_num_sequenced", "get_genetic_profiles", + "get_cancer_studies", "get_cancer_types", "get_ccle_mutations", + "get_ccle_lines_for_mutation", "get_ccle_cna", + "get_ccle_mrna"] + import pandas import logging import requests -from collections import defaultdict -# Python3 -try: - from functools import lru_cache - from io import StringIO -# Python2 -except ImportError: - from functools32 import lru_cache - from StringIO import StringIO logger = logging.getLogger(__name__) -cbio_url = 'http://www.cbioportal.org/webservice.do' +cbio_url = 'https://www.cbioportal.org/api' ccle_study = 'cellline_ccle_broad' +# TODO: implement caching with json_data made immutable +def send_request(method, endpoint, json_data): + """Return the results of a web service request to cBio portal. -@lru_cache(maxsize=10000) -def send_request(**kwargs): - """Return a data frame from a web service request to cBio portal. - - Sends a web service requrest to the cBio portal with arguments given in - the dictionary data and returns a Pandas data frame on success. + Sends a web service request to the cBio portal with a specific endpoint, + method, and JSON data structure, and returns the resulting JSON + data structure on success. - More information about the service here: - http://www.cbioportal.org/web_api.jsp + More information about the service is available here: + https://www.cbioportal.org/api/v2/api-docs Parameters ---------- - kwargs : dict - A dict of parameters for the query. Entries map directly to web service - calls with the exception of the optional 'skiprows' entry, whose value - is used as the number of rows to skip when reading the result data - frame. + TODO Returns ------- - df : pandas.DataFrame - Response from cBioPortal as a Pandas DataFrame. + JSON + The JSON object returned by the web service call. """ - skiprows = kwargs.pop('skiprows', None) - res = requests.get(cbio_url, params=kwargs) - if res.status_code == 200: - # Adaptively skip rows based on number of comment lines - if skiprows == -1: - lines = res.text.split('\n') - skiprows = 0 - for line in lines: - if line.startswith('#'): - skiprows += 1 - else: - break - csv_StringIO = StringIO(res.text) - df = pandas.read_csv(csv_StringIO, sep='\t', skiprows=skiprows) - return df - else: + if endpoint.startswith('/'): + endpoint = endpoint[1:] + request_fun = getattr(requests, method) + res = request_fun(cbio_url + '/' + endpoint, json=json_data) + if res.status_code != 200: logger.error('Request returned with code %d' % res.status_code) + return + return res.json() def get_mutations(study_id, gene_list, mutation_type=None, @@ -242,20 +229,21 @@ def get_genetic_profiles(study_id, profile_filter=None): - MRNA_EXPRESSION - METHYLATION The genetic profiles can include "mutation", "CNA", "rppa", - "methylation", etc. + "methylation", etc. The filter is case insensitive. Returns ------- genetic_profiles : list[str] A list of genetic profiles available for the given study. """ - data = {'cmd': 'getGeneticProfiles', - 'cancer_study_id': study_id} - df = send_request(**data) - res = _filter_data_frame(df, ['genetic_profile_id'], - 'genetic_alteration_type', profile_filter) - genetic_profiles = list(res['genetic_profile_id'].values()) - return genetic_profiles + res = send_request('post', 'molecular-profiles/fetch', + {'studyIds': [study_id]}) + if profile_filter: + res = [prof for prof in res + if (profile_filter.casefold() + in prof['molecularAlterationType'].casefold())] + profile_ids = [prof['molecularProfileId'] for prof in res] + return profile_ids def get_cancer_studies(study_filter=None): From ecbc2bf1bbfb855fca65b83f0ca5a7ce2b91e034 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 14 Jan 2024 15:11:17 -0500 Subject: [PATCH 02/11] Simplify getting molecular profiles --- indra/databases/cbio_client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 29e6631f23..46c84386b3 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -21,7 +21,7 @@ ccle_study = 'cellline_ccle_broad' # TODO: implement caching with json_data made immutable -def send_request(method, endpoint, json_data): +def send_request(method, endpoint, json_data=None): """Return the results of a web service request to cBio portal. Sends a web service request to the cBio portal with a specific endpoint, @@ -113,6 +113,8 @@ def get_case_lists(study_id): A dict keyed to cases containing a dict keyed to genes containing int """ + res = send_request('post', 'sample-lists/fetch', + {'studyIds': [study_id]}) data = {'cmd': 'getCaseLists', 'cancer_study_id': study_id} df = send_request(**data) @@ -236,8 +238,7 @@ def get_genetic_profiles(study_id, profile_filter=None): genetic_profiles : list[str] A list of genetic profiles available for the given study. """ - res = send_request('post', 'molecular-profiles/fetch', - {'studyIds': [study_id]}) + res = send_request('get', f'studies/{study_id}/molecular-profiles') if profile_filter: res = [prof for prof in res if (profile_filter.casefold() From b3491ef11f525daa59198a364f09f79d37962286 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 14 Jan 2024 15:57:39 -0500 Subject: [PATCH 03/11] Implement getting cancer studies --- indra/databases/cbio_client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 46c84386b3..e3a79b4dc6 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -266,11 +266,11 @@ def get_cancer_studies(study_filter=None): of study IDs with paad in their name like "paad_icgc", "paad_tcga", etc. """ - data = {'cmd': 'getCancerStudies'} - df = send_request(**data) - res = _filter_data_frame(df, ['cancer_study_id'], - 'cancer_study_id', study_filter) - study_ids = list(res['cancer_study_id'].values()) + studies = send_request('get', 'studies') + if study_filter: + studies = [s for s in studies + if study_filter.casefold() in s['studyId'].casefold()] + study_ids = [s['studyId'] for s in studies] return study_ids From 3ab2c9b335e7d9a4f863dd88aa5cf5e71a4dea0a Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 18 Jan 2024 19:32:27 -0500 Subject: [PATCH 04/11] Start implementing getting mutations --- indra/databases/cbio_client.py | 40 +++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index e3a79b4dc6..d5dfe58c8d 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -14,12 +14,15 @@ import logging import requests +from indra.databases import hgnc_client + logger = logging.getLogger(__name__) cbio_url = 'https://www.cbioportal.org/api' ccle_study = 'cellline_ccle_broad' + # TODO: implement caching with json_data made immutable def send_request(method, endpoint, json_data=None): """Return the results of a web service request to cBio portal. @@ -43,14 +46,15 @@ def send_request(method, endpoint, json_data=None): if endpoint.startswith('/'): endpoint = endpoint[1:] request_fun = getattr(requests, method) - res = request_fun(cbio_url + '/' + endpoint, json=json_data) + res = request_fun(cbio_url + '/' + endpoint, json=json_data or {}) if res.status_code != 200: - logger.error('Request returned with code %d' % res.status_code) + logger.error(f'Request returned with code {res.status_code}: ' + f'{res.text}') return return res.json() -def get_mutations(study_id, gene_list, mutation_type=None, +def get_mutations(study_id, gene_list=None, mutation_type=None, case_id=None): """Return mutations as a list of genes and list of amino acid changes. @@ -76,7 +80,26 @@ def get_mutations(study_id, gene_list, mutation_type=None, the second one a list of amino acid changes in those genes. """ genetic_profile = get_genetic_profiles(study_id, 'mutation')[0] - gene_list_str = ','.join(gene_list) + breakpoint() + + if gene_list: + hgnc_mappings = {g: hgnc_client.get_hgnc_id(g) for g in gene_list} + entrez_mappings = {hgnc_mappings[g]: + hgnc_client.get_entrez_id(hgnc_mappings[g]) + for g in gene_list if hgnc_mappings[g] is not None} + entrez_ids = [e for e in entrez_mappings.values() if e is not None] + else: + entrez_ids = None + + json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None + + # FIXME: ERROR: [2024-01-14 17:18:15] indra.databases.cbio_client - + # Request returned with code 400: {"message":"eitherSampleListIdOrSampleIdsPresent must be true"} + muts = send_request('post', f'molecular-profiles/{genetic_profile}/' + f'mutations/fetch', json_data) + return muts +# if mutation_type: + data = {'cmd': 'getMutationData', 'case_set_id': study_id, @@ -291,10 +314,11 @@ def get_cancer_types(cancer_filter=None): Example: for cancer_filter="pancreatic", the result includes "panet" (neuro-endocrine) and "paad" (adenocarcinoma) """ - data = {'cmd': 'getTypesOfCancer'} - df = send_request(**data) - res = _filter_data_frame(df, ['type_of_cancer_id'], 'name', cancer_filter) - type_ids = list(res['type_of_cancer_id'].values()) + cancer_types = send_request('get', 'cancer-types') + if cancer_filter: + cancer_types = [c for c in cancer_types + if cancer_filter.casefold() in c['name'].casefold()] + type_ids = [c['cancerTypeId'] for c in cancer_types] return type_ids From adb47cd482efaf7199c58bae4361abee8f2ccc3c Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sat, 20 Jan 2024 18:30:11 -0500 Subject: [PATCH 05/11] Implement handling Entrez mappings and profile data --- indra/databases/cbio_client.py | 87 +++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index d5dfe58c8d..4d8b1afb79 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -46,7 +46,10 @@ def send_request(method, endpoint, json_data=None): if endpoint.startswith('/'): endpoint = endpoint[1:] request_fun = getattr(requests, method) - res = request_fun(cbio_url + '/' + endpoint, json=json_data or {}) + full_url = cbio_url + '/' + endpoint + print('URL: %s' % full_url) + print('JSON: %s' % json_data) + res = request_fun(full_url, json=json_data or {}) if res.status_code != 200: logger.error(f'Request returned with code {res.status_code}: ' f'{res.text}') @@ -82,14 +85,7 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, genetic_profile = get_genetic_profiles(study_id, 'mutation')[0] breakpoint() - if gene_list: - hgnc_mappings = {g: hgnc_client.get_hgnc_id(g) for g in gene_list} - entrez_mappings = {hgnc_mappings[g]: - hgnc_client.get_entrez_id(hgnc_mappings[g]) - for g in gene_list if hgnc_mappings[g] is not None} - entrez_ids = [e for e in entrez_mappings.values() if e is not None] - else: - entrez_ids = None + entrez_ids = get_entrez_mappings(gene_list) json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None @@ -116,9 +112,28 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, return mutations +def get_entrez_mappings(gene_list): + if gene_list: + # First we need to get HGNC IDs from HGNC symbols + hgnc_mappings = {g: hgnc_client.get_hgnc_id(g) for g in gene_list} + # Next, we map from HGNC symbols to Entrez IDs via the hgnc_mappings + entrez_mappings = {g: hgnc_client.get_entrez_id(hgnc_mappings[g]) + for g in gene_list if hgnc_mappings[g] is not None} + # Finally, we reverse the mapping, this will ensure that + # we can get the gene symbols back when generating results + entrez_to_gene_symbol = {v: k for k, v in entrez_mappings.items() + if v is not None and k is not None} + else: + entrez_to_gene_symbol = {} + return entrez_to_gene_symbol + + def get_case_lists(study_id): """Return a list of the case set ids for a particular study. + In v2 of the API these are called sample lists. + + Old comment: TAKE NOTE the "case_list_id" are the same thing as "case_set_id" Within the data, this string is referred to as a "case_list_id". Within API calls it is referred to as a 'case_set_id'. @@ -132,17 +147,12 @@ def get_case_lists(study_id): Returns ------- - case_set_ids : dict[dict[int]] - A dict keyed to cases containing a dict keyed to genes - containing int + case_set_ids : list[str] + A list of case set IDs, e.g., ['cellline_ccle_broad_all', + 'cellline_ccle_broad_cna', ...] """ - res = send_request('post', 'sample-lists/fetch', - {'studyIds': [study_id]}) - data = {'cmd': 'getCaseLists', - 'cancer_study_id': study_id} - df = send_request(**data) - case_set_ids = df['case_list_id'].tolist() - return case_set_ids + res = send_request('get', f'studies/{study_id}/sample-lists') + return [sl['sampleListId'] for sl in res] def get_profile_data(study_id, gene_list, @@ -165,7 +175,7 @@ def get_profile_data(study_id, gene_list, - MRNA_EXPRESSION - METHYLATION case_set_filter : Optional[str] - A string that specifices which case_set_id to use, based on a complete + A string that specifies which case_set_id to use, based on a complete or partial match. If not provided, will look for study_id + '_all' Returns @@ -179,29 +189,30 @@ def get_profile_data(study_id, gene_list, genetic_profile = genetic_profiles[0] else: return {} - gene_list_str = ','.join(gene_list) case_set_ids = get_case_lists(study_id) if case_set_filter: case_set_id = [x for x in case_set_ids if case_set_filter in x][0] else: - case_set_id = study_id + '_all' # based on looking at the cBioPortal, this is a common case_set_id - data = {'cmd': 'getProfileData', - 'case_set_id': case_set_id, - 'genetic_profile_id': genetic_profile, - 'gene_list': gene_list_str, - 'skiprows': -1} - df = send_request(**data) - case_list_df = [x for x in df.columns.tolist() - if x not in ['GENE_ID', 'COMMON']] - profile_data = {case: {g: None for g in gene_list} - for case in case_list_df} - for case in case_list_df: - profile_values = df[case].tolist() - df_gene_list = df['COMMON'].tolist() - for g, cv in zip(df_gene_list, profile_values): - if not pandas.isnull(cv): - profile_data[case][g] = cv + case_set_id = study_id + '_all' + entrez_to_gene_symbol = get_entrez_mappings(gene_list) + entrez_ids = list(entrez_to_gene_symbol) + res = send_request('post', f'molecular-profiles/{genetic_profile}/' + f'molecular-data/fetch', + {'sampleListId': case_set_id, + 'entrezGeneIds': entrez_ids}) + + profile_data = {} + # Each entry in the results contains something like + # {'entrezGeneId': 673, 'molecularProfileId': 'cellline_ccle_broad_cna', + # 'sampleId': '1321N1_CENTRAL_NERVOUS_SYSTEM', + # 'studyId': 'cellline_ccle_broad', 'value': 1, ...} + for sample in res: + sample_id = sample['sampleId'] + if sample_id not in profile_data: + profile_data[sample_id] = {} + gene_symbol = entrez_to_gene_symbol[str(sample['entrezGeneId'])] + profile_data[sample_id][gene_symbol] = sample['value'] return profile_data From 6eafa341ff87c5a66223bed0e1fbee2864e946f2 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sat, 20 Jan 2024 18:50:47 -0500 Subject: [PATCH 06/11] Implement further endpoints using new API --- indra/databases/cbio_client.py | 67 +++++++++++++++------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 4d8b1afb79..2641e7b751 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -83,9 +83,9 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, the second one a list of amino acid changes in those genes. """ genetic_profile = get_genetic_profiles(study_id, 'mutation')[0] - breakpoint() - entrez_ids = get_entrez_mappings(gene_list) + entrez_to_gene_symbol = get_entrez_mappings(gene_list) + entrez_ids = list(entrez_to_gene_symbol) json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None @@ -233,13 +233,16 @@ def get_num_sequenced(study_id): num_case : int The number of sequenced tumors in the given study """ - data = {'cmd': 'getCaseLists', - 'cancer_study_id': study_id} - df = send_request(**data) - if df.empty: - return 0 - row_filter = df['case_list_id'].str.contains('sequenced', case=False) - num_case = len(df[row_filter]['case_ids'].tolist()[0].split(' ')) + # First we get all the case lists for the study + case_lists = get_case_lists(study_id) + # Then we find ones that have 'sequenced' in the name + sequencing_case_list = [cl for cl in case_lists if 'sequenced' in cl] + # Then we look at the sample IDs and count them + cases = set() + for cl in sequencing_case_list: + res = send_request('get', f'/sample-lists/{cl}/sample-ids') + cases |= set(res) + num_case = len(cases) return num_case @@ -401,7 +404,7 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change): return cell_lines -def get_ccle_cna(gene_list, cell_lines): +def get_ccle_cna(gene_list, cell_lines=None): """Return a dict of CNAs in given genes and cell lines from CCLE. CNA values correspond to the following alterations @@ -420,7 +423,7 @@ def get_ccle_cna(gene_list, cell_lines): ---------- gene_list : list[str] A list of HGNC gene symbols to get mutations in - cell_lines : list[str] + cell_lines : Optional[list[str]] A list of CCLE cell line names to get mutations for. Returns @@ -431,19 +434,18 @@ def get_ccle_cna(gene_list, cell_lines): """ profile_data = get_profile_data(ccle_study, gene_list, 'COPY_NUMBER_ALTERATION', 'all') - profile_data = dict((key, value) for key, value in profile_data.items() - if key in cell_lines) - return profile_data + return {cell_line: value for cell_line, value in profile_data.items() + if cell_lines is None or cell_line in cell_lines} -def get_ccle_mrna(gene_list, cell_lines): +def get_ccle_mrna(gene_list, cell_lines=None): """Return a dict of mRNA amounts in given genes and cell lines from CCLE. Parameters ---------- gene_list : list[str] A list of HGNC gene symbols to get mRNA amounts for. - cell_lines : list[str] + cell_lines : Optional[list[str]] A list of CCLE cell line names to get mRNA amounts for. Returns @@ -452,27 +454,18 @@ def get_ccle_mrna(gene_list, cell_lines): A dict keyed to cell lines containing a dict keyed to genes containing float """ - gene_list_str = ','.join(gene_list) - data = {'cmd': 'getProfileData', - 'case_set_id': ccle_study + '_mrna', - 'genetic_profile_id': ccle_study + '_mrna', - 'gene_list': gene_list_str, - 'skiprows': -1} - df = send_request(**data) - mrna_amounts = {cl: {g: [] for g in gene_list} for cl in cell_lines} - for cell_line in cell_lines: - if cell_line in df.columns: - for gene in gene_list: - value_cell = df[cell_line][df['COMMON'] == gene] - if value_cell.empty: - mrna_amounts[cell_line][gene] = None - elif pandas.isnull(value_cell.values[0]): - mrna_amounts[cell_line][gene] = None - else: - value = value_cell.values[0] - mrna_amounts[cell_line][gene] = value - else: - mrna_amounts[cell_line] = None + profile_data = get_profile_data(ccle_study, gene_list, + 'MRNA_EXPRESSION', 'all') + mrna_amounts = {cell_line: value + for cell_line, value in profile_data.items() + if cell_lines is None or cell_line in cell_lines} + # This is to make sure that if cell_lines were specified then + # we return None if there is no data for a given cell line + # This matches the old behavior of the function + if cell_lines: + for cell_line in cell_lines: + if cell_line not in mrna_amounts: + mrna_amounts[cell_line] = None return mrna_amounts From 3b3b2fbf290a838898ca9c6a5a5002b1e1b7818b Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 21 Jan 2024 16:42:00 -0500 Subject: [PATCH 07/11] Implement getting mutations --- indra/databases/cbio_client.py | 62 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 2641e7b751..0e5528e8cb 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -78,38 +78,38 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, Returns ------- - mutations : tuple[list] - A tuple of two lists, the first one containing a list of genes, and - the second one a list of amino acid changes in those genes. + mutations : dict + A dict with entries for each gene symbol and another list + with entries for each corresponding amino acid change. """ genetic_profile = get_genetic_profiles(study_id, 'mutation')[0] entrez_to_gene_symbol = get_entrez_mappings(gene_list) entrez_ids = list(entrez_to_gene_symbol) - json_data = {'entrezGeneIds': entrez_ids} if entrez_ids else None - - # FIXME: ERROR: [2024-01-14 17:18:15] indra.databases.cbio_client - - # Request returned with code 400: {"message":"eitherSampleListIdOrSampleIdsPresent must be true"} - muts = send_request('post', f'molecular-profiles/{genetic_profile}/' - f'mutations/fetch', json_data) - return muts -# if mutation_type: + # Does this need to be parameterized? + case_set_id = study_id + '_all' + mutations = send_request('post', + f'molecular-profiles/{genetic_profile}/' + f'mutations/fetch', + {'sampleListId': case_set_id, + 'entrezGeneIds': entrez_ids}) - data = {'cmd': 'getMutationData', - 'case_set_id': study_id, - 'genetic_profile_id': genetic_profile, - 'gene_list': gene_list_str, - 'skiprows': -1} - df = send_request(**data) if case_id: - df = df[df['case_id'] == case_id] - res = _filter_data_frame(df, ['gene_symbol', 'amino_acid_change'], - 'mutation_type', mutation_type) - mutations = {'gene_symbol': list(res['gene_symbol'].values()), - 'amino_acid_change': list(res['amino_acid_change'].values())} - return mutations + mutations = [m for m in mutations if m['case_id'] == case_id] + + if mutation_type: + mutations = [m for m in mutations if (mutation_type.casefold() + in m['mutationType'].casefold())] + + mutations_dict = { + 'gene_symbol': [entrez_to_gene_symbol[str(m['entrezGeneId'])] + for m in mutations], + 'amino_acid_change': [m['proteinChange'] for m in mutations], + 'sample_id': [m['sampleId'] for m in mutations], + } + return mutations_dict def get_entrez_mappings(gene_list): @@ -254,6 +254,8 @@ def get_genetic_profiles(study_id, profile_filter=None): 'cellline_ccle_broad_mutations' for mutations, 'cellline_ccle_broad_CNA' for copy number alterations, etc. + NOTE: In the v2 API, the genetic profiles are called molecular profiles. + Parameters ---------- study_id : str @@ -393,15 +395,11 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change): cell_lines : list A list of CCLE cell lines in which the given mutation occurs. """ - data = {'cmd': 'getMutationData', - 'case_set_id': ccle_study, - 'genetic_profile_id': ccle_study + '_mutations', - 'gene_list': gene, - 'skiprows': 1} - df = send_request(**data) - df = df[df['amino_acid_change'] == amino_acid_change] - cell_lines = df['case_id'].unique().tolist() - return cell_lines + mutations = get_mutations(ccle_study, [gene], 'missense') + cell_lines = {cl for aac, cl + in zip(mutations['amino_acid_change'], mutations['sample_id']) + if aac == amino_acid_change} + return sorted(cell_lines) def get_ccle_cna(gene_list, cell_lines=None): From beb3a0818b8baf71794d064503612c69e441ea96 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 21 Jan 2024 16:45:36 -0500 Subject: [PATCH 08/11] Improve docs and clean up --- indra/databases/cbio_client.py | 77 ++++++++++------------------------ 1 file changed, 21 insertions(+), 56 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 0e5528e8cb..0ef44772c1 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -10,10 +10,9 @@ "get_ccle_lines_for_mutation", "get_ccle_cna", "get_ccle_mrna"] -import pandas import logging import requests - +from functools import lru_cache from indra.databases import hgnc_client @@ -36,7 +35,14 @@ def send_request(method, endpoint, json_data=None): Parameters ---------- - TODO + method : str + The HTTP method to use for the request. + Example: 'get' or 'post' + endpoint : str + The endpoint to use for the request. + Example: 'studies' + json_data : Optional[Dict] + The dict-like JSON data structure to send with the request. Returns ------- @@ -57,6 +63,7 @@ def send_request(method, endpoint, json_data=None): return res.json() +@lru_cache(maxsize=1000) def get_mutations(study_id, gene_list=None, mutation_type=None, case_id=None): """Return mutations as a list of genes and list of amino acid changes. @@ -112,6 +119,7 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, return mutations_dict +@lru_cache(maxsize=1000) def get_entrez_mappings(gene_list): if gene_list: # First we need to get HGNC IDs from HGNC symbols @@ -128,6 +136,7 @@ def get_entrez_mappings(gene_list): return entrez_to_gene_symbol +@lru_cache(maxsize=1000) def get_case_lists(study_id): """Return a list of the case set ids for a particular study. @@ -155,6 +164,7 @@ def get_case_lists(study_id): return [sl['sampleListId'] for sl in res] +@lru_cache(maxsize=1000) def get_profile_data(study_id, gene_list, profile_filter, case_set_filter=None): """Return dict of cases and genes and their respective values. @@ -216,6 +226,7 @@ def get_profile_data(study_id, gene_list, return profile_data +@lru_cache(maxsize=1000) def get_num_sequenced(study_id): """Return number of sequenced tumors for given study. @@ -246,6 +257,7 @@ def get_num_sequenced(study_id): return num_case +@lru_cache(maxsize=1000) def get_genetic_profiles(study_id, profile_filter=None): """Return all the genetic profiles (data sets) for a given study. @@ -286,6 +298,7 @@ def get_genetic_profiles(study_id, profile_filter=None): return profile_ids +@lru_cache(maxsize=1000) def get_cancer_studies(study_filter=None): """Return a list of cancer study identifiers, optionally filtered. @@ -313,6 +326,7 @@ def get_cancer_studies(study_filter=None): return study_ids +@lru_cache(maxsize=1000) def get_cancer_types(cancer_filter=None): """Return a list of cancer types, optionally filtered. @@ -338,6 +352,7 @@ def get_cancer_types(cancer_filter=None): return type_ids +@lru_cache(maxsize=1000) def get_ccle_mutations(gene_list, cell_lines, mutation_type=None): """Return a dict of mutations in given genes and cell lines from CCLE. @@ -376,6 +391,7 @@ def get_ccle_mutations(gene_list, cell_lines, mutation_type=None): return mutations +@lru_cache(maxsize=1000) def get_ccle_lines_for_mutation(gene, amino_acid_change): """Return cell lines with a given point mutation in a given gene. @@ -402,6 +418,7 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change): return sorted(cell_lines) +@lru_cache(maxsize=1000) def get_ccle_cna(gene_list, cell_lines=None): """Return a dict of CNAs in given genes and cell lines from CCLE. @@ -436,6 +453,7 @@ def get_ccle_cna(gene_list, cell_lines=None): if cell_lines is None or cell_line in cell_lines} +@lru_cache(maxsize=1000) def get_ccle_mrna(gene_list, cell_lines=None): """Return a dict of mRNA amounts in given genes and cell lines from CCLE. @@ -465,56 +483,3 @@ def get_ccle_mrna(gene_list, cell_lines=None): if cell_line not in mrna_amounts: mrna_amounts[cell_line] = None return mrna_amounts - - -def _filter_data_frame(df, data_col, filter_col, filter_str=None): - """Return a filtered data frame as a dictionary.""" - if filter_str is not None: - relevant_cols = data_col + [filter_col] - df.dropna(inplace=True, subset=relevant_cols) - row_filter = df[filter_col].str.contains(filter_str, case=False) - data_list = df[row_filter][data_col].to_dict() - else: - data_list = df[data_col].to_dict() - return data_list - - -# Deactivate this section for the time being, can be reinstated -# once these are fully integrated -''' - -def _read_ccle_cna(): - fname = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../data/ccle_CNA.txt' - try: - df = pandas.read_csv(fname, sep='\t') - except Exception: - df = None - return df - -ccle_cna_df = _read_ccle_cna() - - -def _read_ccle_mrna(): - fname = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../data/ccle_expression_median.txt' - try: - df = pandas.read_csv(fname, sep='\t') - except Exception: - df = None - return df - -ccle_mrna_df = _read_ccle_mrna() - - -def _read_ccle_mutations(): - fname = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../data/ccle_mutations_extended.txt' - try: - df = pandas.read_csv(fname, sep='\t', skiprows=2) - except Exception: - df = None - return df - -ccle_mutations_df = _read_ccle_mutations() -''' From 7ae40d1793837f333c5d69bcd7f696d6f6794538 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 21 Jan 2024 17:00:09 -0500 Subject: [PATCH 09/11] Remove cache and fix key --- indra/databases/cbio_client.py | 19 ++++++------------- indra/tests/test_cbio_client.py | 12 ------------ 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 0ef44772c1..cd6c338979 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -63,7 +63,6 @@ def send_request(method, endpoint, json_data=None): return res.json() -@lru_cache(maxsize=1000) def get_mutations(study_id, gene_list=None, mutation_type=None, case_id=None): """Return mutations as a list of genes and list of amino acid changes. @@ -104,7 +103,7 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, 'entrezGeneIds': entrez_ids}) if case_id: - mutations = [m for m in mutations if m['case_id'] == case_id] + mutations = [m for m in mutations if m['sampleId'] == case_id] if mutation_type: mutations = [m for m in mutations if (mutation_type.casefold() @@ -119,7 +118,6 @@ def get_mutations(study_id, gene_list=None, mutation_type=None, return mutations_dict -@lru_cache(maxsize=1000) def get_entrez_mappings(gene_list): if gene_list: # First we need to get HGNC IDs from HGNC symbols @@ -136,7 +134,6 @@ def get_entrez_mappings(gene_list): return entrez_to_gene_symbol -@lru_cache(maxsize=1000) def get_case_lists(study_id): """Return a list of the case set ids for a particular study. @@ -164,7 +161,6 @@ def get_case_lists(study_id): return [sl['sampleListId'] for sl in res] -@lru_cache(maxsize=1000) def get_profile_data(study_id, gene_list, profile_filter, case_set_filter=None): """Return dict of cases and genes and their respective values. @@ -226,7 +222,6 @@ def get_profile_data(study_id, gene_list, return profile_data -@lru_cache(maxsize=1000) def get_num_sequenced(study_id): """Return number of sequenced tumors for given study. @@ -257,7 +252,6 @@ def get_num_sequenced(study_id): return num_case -@lru_cache(maxsize=1000) def get_genetic_profiles(study_id, profile_filter=None): """Return all the genetic profiles (data sets) for a given study. @@ -298,7 +292,6 @@ def get_genetic_profiles(study_id, profile_filter=None): return profile_ids -@lru_cache(maxsize=1000) def get_cancer_studies(study_filter=None): """Return a list of cancer study identifiers, optionally filtered. @@ -326,7 +319,6 @@ def get_cancer_studies(study_filter=None): return study_ids -@lru_cache(maxsize=1000) def get_cancer_types(cancer_filter=None): """Return a list of cancer types, optionally filtered. @@ -352,7 +344,6 @@ def get_cancer_types(cancer_filter=None): return type_ids -@lru_cache(maxsize=1000) def get_ccle_mutations(gene_list, cell_lines, mutation_type=None): """Return a dict of mutations in given genes and cell lines from CCLE. @@ -391,7 +382,6 @@ def get_ccle_mutations(gene_list, cell_lines, mutation_type=None): return mutations -@lru_cache(maxsize=1000) def get_ccle_lines_for_mutation(gene, amino_acid_change): """Return cell lines with a given point mutation in a given gene. @@ -418,7 +408,6 @@ def get_ccle_lines_for_mutation(gene, amino_acid_change): return sorted(cell_lines) -@lru_cache(maxsize=1000) def get_ccle_cna(gene_list, cell_lines=None): """Return a dict of CNAs in given genes and cell lines from CCLE. @@ -453,7 +442,6 @@ def get_ccle_cna(gene_list, cell_lines=None): if cell_lines is None or cell_line in cell_lines} -@lru_cache(maxsize=1000) def get_ccle_mrna(gene_list, cell_lines=None): """Return a dict of mRNA amounts in given genes and cell lines from CCLE. @@ -472,6 +460,11 @@ def get_ccle_mrna(gene_list, cell_lines=None): """ profile_data = get_profile_data(ccle_study, gene_list, 'MRNA_EXPRESSION', 'all') + # FIXME: we need a data structure like this + # assert mrna['A375_SKIN'] is not None + # assert mrna['A375_SKIN']['MAP2K1'] > 10 + # > assert mrna['A375_SKIN']['XYZ'] is None + # E KeyError: 'XYZ' mrna_amounts = {cell_line: value for cell_line, value in profile_data.items() if cell_lines is None or cell_line in cell_lines} diff --git a/indra/tests/test_cbio_client.py b/indra/tests/test_cbio_client.py index 38a5f4f645..489f189a13 100644 --- a/indra/tests/test_cbio_client.py +++ b/indra/tests/test_cbio_client.py @@ -30,18 +30,6 @@ def test_get_num_sequenced(): assert num_case > 0 -@pytest.mark.webservice -def test_send_request_ccle(): - """Sends a request and gets back a dataframe of all cases in ccle study. - - Check that the dataframe is longer than one. - """ - data = {'cmd': 'getCaseLists', - 'cancer_study_id': 'cellline_ccle_broad'} - df = cbio_client.send_request(**data) - assert len(df) > 0 - - @pytest.mark.webservice def test_get_ccle_lines_for_mutation(): """Check how many lines have BRAF V600E mutations. From 7120143a703ac5ef3930f14ed8b8472ccca45b36 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 21 Jan 2024 19:18:15 -0500 Subject: [PATCH 10/11] Fix CNA calculation for backward compatibility --- indra/databases/cbio_client.py | 9 ++++----- indra/tests/test_cbio_client.py | 10 ++++------ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index cd6c338979..8dc3e5326b 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -460,11 +460,6 @@ def get_ccle_mrna(gene_list, cell_lines=None): """ profile_data = get_profile_data(ccle_study, gene_list, 'MRNA_EXPRESSION', 'all') - # FIXME: we need a data structure like this - # assert mrna['A375_SKIN'] is not None - # assert mrna['A375_SKIN']['MAP2K1'] > 10 - # > assert mrna['A375_SKIN']['XYZ'] is None - # E KeyError: 'XYZ' mrna_amounts = {cell_line: value for cell_line, value in profile_data.items() if cell_lines is None or cell_line in cell_lines} @@ -475,4 +470,8 @@ def get_ccle_mrna(gene_list, cell_lines=None): for cell_line in cell_lines: if cell_line not in mrna_amounts: mrna_amounts[cell_line] = None + else: + for gene in gene_list: + if gene not in mrna_amounts[cell_line]: + mrna_amounts[cell_line][gene] = None return mrna_amounts diff --git a/indra/tests/test_cbio_client.py b/indra/tests/test_cbio_client.py index 489f189a13..dd22efda5f 100644 --- a/indra/tests/test_cbio_client.py +++ b/indra/tests/test_cbio_client.py @@ -94,7 +94,7 @@ def test_get_ccle_mrna(): def test_get_ccle_cna_big(): """ Get the CNA data on 124 genes in 4 cell lines. Expect to have CNA values - that are [None, -2.0, -1.0, 0.0, 1.0, 2.0] . This tests the function at + that are {-2.0, -1.0, 0.0, 1.0, 2.0}. This tests the function at a greater scale. Also, test the cell lines' BRAF CNAs """ genes = ["FOSL1", "GRB2", "RPS6KA3", "EIF4EBP1", "DUSP1", "PLXNB1", "SHC2", @@ -117,13 +117,11 @@ def test_get_ccle_cna_big(): "PAK1", "RHEB"] cell_lines = ['COLO679_SKIN', 'A2058_SKIN', 'IGR39_SKIN', 'HS294T_SKIN'] cna = cbio_client.get_ccle_cna(genes, cell_lines) - values = [] + values = set() for cl in cna: for g in cna[cl]: - val = cna[cl][g] - values.append(val) - values = list(set(values)) - assert len(values) == 6 + values.add(cna[cl][g]) + assert values == {-2.0, -1.0, 0.0, 1.0, 2.0} assert cna['COLO679_SKIN']['BRAF'] == 2 assert cna['A2058_SKIN']['BRAF'] == 1 assert cna['IGR39_SKIN']['BRAF'] == 1 From 09ea5183e8e61f3d93851d187d16185647e0f52e Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 21 Jan 2024 19:31:30 -0500 Subject: [PATCH 11/11] Implement caching and improve docs --- indra/databases/cbio_client.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/indra/databases/cbio_client.py b/indra/databases/cbio_client.py index 8dc3e5326b..9c163d9a29 100644 --- a/indra/databases/cbio_client.py +++ b/indra/databases/cbio_client.py @@ -10,6 +10,7 @@ "get_ccle_lines_for_mutation", "get_ccle_cna", "get_ccle_mrna"] +import json import logging import requests from functools import lru_cache @@ -22,7 +23,6 @@ ccle_study = 'cellline_ccle_broad' -# TODO: implement caching with json_data made immutable def send_request(method, endpoint, json_data=None): """Return the results of a web service request to cBio portal. @@ -49,13 +49,20 @@ def send_request(method, endpoint, json_data=None): JSON The JSON object returned by the web service call. """ + json_data_str = json.dumps(json_data) if json_data else None + res = _send_request_cached(method, endpoint, json_data_str) + return res + + +@lru_cache(maxsize=1000) +def _send_request_cached(method, endpoint, json_data_str=None): + """The actual function running the request, using caching""" if endpoint.startswith('/'): endpoint = endpoint[1:] + json_data = json.loads(json_data_str) if json_data_str else {} request_fun = getattr(requests, method) full_url = cbio_url + '/' + endpoint - print('URL: %s' % full_url) - print('JSON: %s' % json_data) - res = request_fun(full_url, json=json_data or {}) + res = request_fun(full_url, json=json_data) if res.status_code != 200: logger.error(f'Request returned with code {res.status_code}: ' f'{res.text}') @@ -139,12 +146,6 @@ def get_case_lists(study_id): In v2 of the API these are called sample lists. - Old comment: - TAKE NOTE the "case_list_id" are the same thing as "case_set_id" - Within the data, this string is referred to as a "case_list_id". - Within API calls it is referred to as a 'case_set_id'. - The documentation does not make this explicitly clear. - Parameters ---------- study_id : str @@ -161,8 +162,8 @@ def get_case_lists(study_id): return [sl['sampleListId'] for sl in res] -def get_profile_data(study_id, gene_list, - profile_filter, case_set_filter=None): +def get_profile_data(study_id, gene_list, profile_filter, + case_set_filter=None): """Return dict of cases and genes and their respective values. Parameters @@ -187,8 +188,9 @@ def get_profile_data(study_id, gene_list, Returns ------- profile_data : dict[dict[int]] - A dict keyed to cases containing a dict keyed to genes - containing int + A dict keyed to cases (cell lines if using CCLE) in turn + containing a dict keyed by genes, with values corresponding to + the given profile (e.g., CNA, mutations). """ genetic_profiles = get_genetic_profiles(study_id, profile_filter) if genetic_profiles: