From 921f971f6874503bf4d6483cf0ff87b03173d3c1 Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Tue, 26 Sep 2023 18:04:33 +0200 Subject: [PATCH 1/6] 1. download metadata from figshare 2. change cache dir 3. remove remotezip --- .gitignore | 1 + pertpy/tools/_metadata/_cell_line.py | 99 +++++++++++++--------------- pyproject.toml | 1 - 3 files changed, 47 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index ae8948d4..5d54ae1f 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ htmlcov/ .coverage .coverage.* .cache +.pertpy_cache nosetests.xml coverage.xml *.cover diff --git a/pertpy/tools/_metadata/_cell_line.py b/pertpy/tools/_metadata/_cell_line.py index 186854f0..0a75a812 100644 --- a/pertpy/tools/_metadata/_cell_line.py +++ b/pertpy/tools/_metadata/_cell_line.py @@ -4,9 +4,7 @@ from typing import TYPE_CHECKING, Literal import pandas as pd -from remotezip import RemoteZip from rich import print -from scanpy import settings from pertpy.data._dataloader import _download @@ -20,15 +18,19 @@ class CellLineMetaData: """Utilities to fetch cell line metadata.""" def __init__(self): + # Create cachedir if not exists + self.cachedir = ".pertpy_cache" + if not Path.exists(Path(self.cachedir)): + Path(self.cachedir).mkdir(parents=True) # Download cell line metadata from DepMap # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) - cell_line_file_path = settings.cachedir.__str__() + "/sample_info.csv" + cell_line_file_path = self.cachedir + "/sample_info.csv" if not Path(cell_line_file_path).exists(): print("[bold yellow]No DepMap metadata file found. Starting download now.") _download( url="https://ndownloader.figshare.com/files/35020903", output_file_name="sample_info.csv", - output_path=settings.cachedir, + output_path=self.cachedir, block_size=4096, is_zip=False, ) @@ -36,10 +38,8 @@ def __init__(self): # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project # Source: https://www.cancerrxgene.org/celllines - cell_line_cancer_project_file_path = settings.cachedir.__str__() + "/cell_line_cancer_project.csv" - cell_line_cancer_project_transformed_path = ( - settings.cachedir.__str__() + "/cell_line_cancer_project_transformed.csv" - ) + cell_line_cancer_project_file_path = self.cachedir + "/cell_line_cancer_project.csv" + cell_line_cancer_project_transformed_path = self.cachedir + "/cell_line_cancer_project_transformed.csv" if not Path(cell_line_cancer_project_transformed_path).exists(): if not Path(cell_line_cancer_project_file_path).exists(): print( @@ -56,7 +56,7 @@ def __init__(self): "bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&" "bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv", output_file_name="cell_line_cancer_project.csv", - output_path=settings.cachedir, + output_path=self.cachedir, block_size=4096, is_zip=False, ) @@ -84,80 +84,73 @@ def __init__(self): else: self.cl_cancer_project_meta = pd.read_csv(cell_line_cancer_project_transformed_path, index_col=0) - # Download metadata for driver genes from DepMap_Sanger + # Download metadata for driver genes from DepMap.Sanger # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation) - gene_annotation_file_path = settings.cachedir.__str__() + "/gene_identifiers_20191101.csv" + gene_annotation_file_path = self.cachedir + "/gene_identifiers_20191101.csv" if not Path(gene_annotation_file_path).exists(): print("[bold yellow]No metadata file was found for gene annotation." " Starting download now.") _download( url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv", output_file_name="gene_identifiers_20191101.csv", - output_path=settings.cachedir, + output_path=self.cachedir, block_size=4096, is_zip=False, ) self.gene_annotation = pd.read_table(gene_annotation_file_path, delimiter=",") - # Download bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute fro DepMap.Sanger + # Download bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute from DepMap.Sanger # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Expression data) - bulk_rna_sanger_file_path = settings.cachedir.__str__() + "/rnaseq_read_count_20220624.csv" + # issue: read count values contain random whitespace, not sure what it supposes to mean + # solution: remove the white space and convert to int before depmap updates the metadata + bulk_rna_sanger_file_path = self.cachedir + "/rnaseq_read_count_20220624_processed.csv" if not Path(bulk_rna_sanger_file_path).exists(): print( "[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line." " Starting download now..." ) - with RemoteZip("https://cog.sanger.ac.uk/cmp/download/rnaseq_all_20220624.zip") as zip_file: - zip_file.extract("rnaseq_read_count_20220624.csv", path=settings.cachedir) - - self.bulk_rna_sanger = pd.read_csv(bulk_rna_sanger_file_path, skiprows=[2, 3], header=[0, 1], index_col=[0, 1]) - - # issue: read count values contain random whitespace, not sure what it supposes to mean - # solution: remove the white space and convert to int before depmap updates the metadata - self.bulk_rna_sanger = self.bulk_rna_sanger.applymap( - lambda x: int(x.replace(" ", "")) if isinstance(x, str) else x - ) - self.bulk_rna_sanger = self.bulk_rna_sanger.T - self.bulk_rna_sanger.index = self.bulk_rna_sanger.index.droplevel("model_id") - self.bulk_rna_sanger.columns = self.bulk_rna_sanger.columns.droplevel("gene_id") + _download( + url="https://figshare.com/ndownloader/files/42467103", + output_file_name="rnaseq_read_count_20220624_processed.csv", + output_path=self.cachedir, + block_size=4096, + is_zip=False, + ) + self.bulk_rna_sanger = pd.read_csv(bulk_rna_sanger_file_path, index_col=0) # Download CCLE expression data from DepMap - # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) - # bulk_rna_broad_file_path = settings.cachedir.__str__() + "/CCLE_expression.csv" - bulk_rna_broad_file_path = settings.cachedir.__str__() + "/CCLE_expression_full.csv" + # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) + # bulk_rna_broad_file_path = self.cachedir + "/CCLE_expression.csv" + bulk_rna_broad_file_path = self.cachedir + "/CCLE_expression_full.csv" if not Path(bulk_rna_broad_file_path).exists(): print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.") _download( url="https://figshare.com/ndownloader/files/34989922", output_file_name="CCLE_expression_full.csv", - output_path=settings.cachedir, + output_path=self.cachedir, block_size=4096, is_zip=False, ) self.bulk_rna_broad = pd.read_csv(bulk_rna_broad_file_path, index_col=0) - # Download proteomics data from DepMap.Sanger + # Download proteomics data processed by DepMap.Sanger # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics) - proteomics_file_path = settings.cachedir.__str__() + "/proteomics_all_20221214.csv" - proteomics_trimm_path = settings.cachedir.__str__() + "/proteomics_all_20221214_trimm.csv" - if not Path(proteomics_trimm_path).exists(): - if not Path(proteomics_file_path).exists(): - print( - "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger)." - " Starting download now." - ) - with RemoteZip("https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip") as zip_file: - zip_file.extract("proteomics_all_20221214.csv", path=settings.cachedir) - self.proteomics_data = pd.read_csv(proteomics_file_path) - self.proteomics_data[["uniprot_id", "model_id", "model_name", "symbol"]] = self.proteomics_data[ - ["uniprot_id", "model_id", "model_name", "symbol"] - ].astype("category") - self.proteomics_data.to_csv(proteomics_trimm_path) - else: - self.proteomics_data = pd.read_csv(proteomics_trimm_path, index_col=0) + proteomics_file_path = self.cachedir + "/proteomics_all_20221214_processed.csv" + if not Path(proteomics_file_path).exists(): + print( + "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger)." " Starting download now." + ) + _download( + url="https://figshare.com/ndownloader/files/42468393", + output_file_name="proteomics_all_20221214_processed.csv", + output_path=self.cachedir, + block_size=4096, + is_zip=False, + ) + self.proteomics_data = pd.read_csv(proteomics_file_path, index_col=0) # Download GDSC drug response data # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s) - drug_response_gdsc1_file_path = settings.cachedir.__str__() + "/ic50_gdsc1.xlsx" + drug_response_gdsc1_file_path = self.cachedir + "/ic50_gdsc1.xlsx" if not Path(drug_response_gdsc1_file_path).exists(): print( "[bold yellow]No metadata file was found for drug response data of GDSC1 dataset." @@ -166,7 +159,7 @@ def __init__(self): _download( url="https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx", output_file_name="ic50_gdsc1.xlsx", - output_path=settings.cachedir, + output_path=self.cachedir, block_size=4096, is_zip=False, ) @@ -178,7 +171,7 @@ def __init__(self): ] self.drug_response_gdsc1 = self.drug_response_gdsc1.reset_index(drop=True) - drug_response_gdsc2_file_path = settings.cachedir.__str__() + "/ic50_gdsc2.xlsx" + drug_response_gdsc2_file_path = self.cachedir + "/ic50_gdsc2.xlsx" if not Path(drug_response_gdsc2_file_path).exists(): print( "[bold yellow]No metadata file was found for drug response data of GDSC2 dataset." @@ -187,7 +180,7 @@ def __init__(self): _download( url="https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC2_fitted_dose_response_24Jul22.xlsx", output_file_name="ic50_gdsc2.xlsx", - output_path=settings.cachedir, + output_path=self.cachedir, block_size=4096, is_zip=False, ) diff --git a/pyproject.toml b/pyproject.toml index 90a8c08e..c957b4ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,6 @@ dependencies = [ "ott-jax", "sparsecca", "numba", - "remotezip", "openpyxl", ] From 862a5c356217cea5b764b5e83d7eb8f04f01a012 Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Mon, 16 Oct 2023 16:16:06 +0200 Subject: [PATCH 2/6] set the scanpy cachedir to the pertpy_cache --- pertpy/tools/_metadata/_cell_line.py | 46 +++++++++++++++------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pertpy/tools/_metadata/_cell_line.py b/pertpy/tools/_metadata/_cell_line.py index 0a75a812..990b4fa1 100644 --- a/pertpy/tools/_metadata/_cell_line.py +++ b/pertpy/tools/_metadata/_cell_line.py @@ -5,6 +5,7 @@ import pandas as pd from rich import print +from scanpy import settings from pertpy.data._dataloader import _download @@ -18,19 +19,19 @@ class CellLineMetaData: """Utilities to fetch cell line metadata.""" def __init__(self): - # Create cachedir if not exists - self.cachedir = ".pertpy_cache" - if not Path.exists(Path(self.cachedir)): - Path(self.cachedir).mkdir(parents=True) + # Set scanpy cachedir to pertpy dir + settings.cachedir = ".pertpy_cache" + # if not Path.exists(Path(self.cachedir)): + # Path(self.cachedir).mkdir(parents=True) # Download cell line metadata from DepMap # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) - cell_line_file_path = self.cachedir + "/sample_info.csv" + cell_line_file_path = settings.cachedir.__str__() + "/sample_info.csv" if not Path(cell_line_file_path).exists(): print("[bold yellow]No DepMap metadata file found. Starting download now.") _download( url="https://ndownloader.figshare.com/files/35020903", output_file_name="sample_info.csv", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -38,8 +39,10 @@ def __init__(self): # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project # Source: https://www.cancerrxgene.org/celllines - cell_line_cancer_project_file_path = self.cachedir + "/cell_line_cancer_project.csv" - cell_line_cancer_project_transformed_path = self.cachedir + "/cell_line_cancer_project_transformed.csv" + cell_line_cancer_project_file_path = settings.cachedir.__str__() + "/cell_line_cancer_project.csv" + cell_line_cancer_project_transformed_path = ( + settings.cachedir.__str__() + "/cell_line_cancer_project_transformed.csv" + ) if not Path(cell_line_cancer_project_transformed_path).exists(): if not Path(cell_line_cancer_project_file_path).exists(): print( @@ -56,7 +59,7 @@ def __init__(self): "bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&" "bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv", output_file_name="cell_line_cancer_project.csv", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -86,13 +89,13 @@ def __init__(self): # Download metadata for driver genes from DepMap.Sanger # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation) - gene_annotation_file_path = self.cachedir + "/gene_identifiers_20191101.csv" + gene_annotation_file_path = settings.cachedir.__str__() + "/gene_identifiers_20191101.csv" if not Path(gene_annotation_file_path).exists(): print("[bold yellow]No metadata file was found for gene annotation." " Starting download now.") _download( url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv", output_file_name="gene_identifiers_20191101.csv", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -102,7 +105,7 @@ def __init__(self): # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Expression data) # issue: read count values contain random whitespace, not sure what it supposes to mean # solution: remove the white space and convert to int before depmap updates the metadata - bulk_rna_sanger_file_path = self.cachedir + "/rnaseq_read_count_20220624_processed.csv" + bulk_rna_sanger_file_path = settings.cachedir.__str__() + "/rnaseq_read_count_20220624_processed.csv" if not Path(bulk_rna_sanger_file_path).exists(): print( "[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line." @@ -111,7 +114,7 @@ def __init__(self): _download( url="https://figshare.com/ndownloader/files/42467103", output_file_name="rnaseq_read_count_20220624_processed.csv", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -119,14 +122,13 @@ def __init__(self): # Download CCLE expression data from DepMap # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) - # bulk_rna_broad_file_path = self.cachedir + "/CCLE_expression.csv" - bulk_rna_broad_file_path = self.cachedir + "/CCLE_expression_full.csv" + bulk_rna_broad_file_path = settings.cachedir.__str__() + "/CCLE_expression_full.csv" if not Path(bulk_rna_broad_file_path).exists(): print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.") _download( url="https://figshare.com/ndownloader/files/34989922", output_file_name="CCLE_expression_full.csv", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -134,7 +136,7 @@ def __init__(self): # Download proteomics data processed by DepMap.Sanger # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics) - proteomics_file_path = self.cachedir + "/proteomics_all_20221214_processed.csv" + proteomics_file_path = settings.cachedir.__str__() + "/proteomics_all_20221214_processed.csv" if not Path(proteomics_file_path).exists(): print( "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger)." " Starting download now." @@ -142,7 +144,7 @@ def __init__(self): _download( url="https://figshare.com/ndownloader/files/42468393", output_file_name="proteomics_all_20221214_processed.csv", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -150,7 +152,7 @@ def __init__(self): # Download GDSC drug response data # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s) - drug_response_gdsc1_file_path = self.cachedir + "/ic50_gdsc1.xlsx" + drug_response_gdsc1_file_path = settings.cachedir.__str__() + "/ic50_gdsc1.xlsx" if not Path(drug_response_gdsc1_file_path).exists(): print( "[bold yellow]No metadata file was found for drug response data of GDSC1 dataset." @@ -159,7 +161,7 @@ def __init__(self): _download( url="https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx", output_file_name="ic50_gdsc1.xlsx", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) @@ -171,7 +173,7 @@ def __init__(self): ] self.drug_response_gdsc1 = self.drug_response_gdsc1.reset_index(drop=True) - drug_response_gdsc2_file_path = self.cachedir + "/ic50_gdsc2.xlsx" + drug_response_gdsc2_file_path = settings.cachedir.__str__() + "/ic50_gdsc2.xlsx" if not Path(drug_response_gdsc2_file_path).exists(): print( "[bold yellow]No metadata file was found for drug response data of GDSC2 dataset." @@ -180,7 +182,7 @@ def __init__(self): _download( url="https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC2_fitted_dose_response_24Jul22.xlsx", output_file_name="ic50_gdsc2.xlsx", - output_path=self.cachedir, + output_path=settings.cachedir, block_size=4096, is_zip=False, ) From a396424cd4aaeee607ab93afad9fca13cf116304 Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Mon, 16 Oct 2023 16:23:15 +0200 Subject: [PATCH 3/6] remove comments --- pertpy/tools/_metadata/_cell_line.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pertpy/tools/_metadata/_cell_line.py b/pertpy/tools/_metadata/_cell_line.py index 001fadb8..b06f292c 100644 --- a/pertpy/tools/_metadata/_cell_line.py +++ b/pertpy/tools/_metadata/_cell_line.py @@ -21,8 +21,6 @@ class CellLineMetaData: def __init__(self): # Set scanpy cachedir to pertpy dir settings.cachedir = ".pertpy_cache" - # if not Path.exists(Path(self.cachedir)): - # Path(self.cachedir).mkdir(parents=True) # Download cell line metadata from DepMap # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) cell_line_file_path = settings.cachedir.__str__() + "/sample_info.csv" From dbabe0f2d99ba0217e13d242af368a72c94bc205 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Mon, 16 Oct 2023 21:50:23 +0200 Subject: [PATCH 4/6] Update pertpy/tools/_metadata/_cell_line.py Fix quote Co-authored-by: Lukas Heumos --- pertpy/tools/_metadata/_cell_line.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pertpy/tools/_metadata/_cell_line.py b/pertpy/tools/_metadata/_cell_line.py index b06f292c..29d6005a 100644 --- a/pertpy/tools/_metadata/_cell_line.py +++ b/pertpy/tools/_metadata/_cell_line.py @@ -137,7 +137,8 @@ def __init__(self): proteomics_file_path = settings.cachedir.__str__() + "/proteomics_all_20221214_processed.csv" if not Path(proteomics_file_path).exists(): print( - "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger)." " Starting download now." + "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger).\n + Starting download now." ) _download( url="https://figshare.com/ndownloader/files/42468393", From dc0f0ba5ba11c52924d29379147981cf152bdb43 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Mon, 16 Oct 2023 21:50:39 +0200 Subject: [PATCH 5/6] Remove useless comment Co-authored-by: Lukas Heumos --- pertpy/tools/_metadata/_cell_line.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pertpy/tools/_metadata/_cell_line.py b/pertpy/tools/_metadata/_cell_line.py index 29d6005a..47260084 100644 --- a/pertpy/tools/_metadata/_cell_line.py +++ b/pertpy/tools/_metadata/_cell_line.py @@ -19,7 +19,6 @@ class CellLineMetaData: """Utilities to fetch cell line metadata.""" def __init__(self): - # Set scanpy cachedir to pertpy dir settings.cachedir = ".pertpy_cache" # Download cell line metadata from DepMap # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2) From 8bbe14e7b5e9d49987b860514a75e172e0a12daf Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Mon, 16 Oct 2023 21:55:32 +0200 Subject: [PATCH 6/6] fix quote --- pertpy/tools/_metadata/_cell_line.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pertpy/tools/_metadata/_cell_line.py b/pertpy/tools/_metadata/_cell_line.py index 47260084..ff7e94b8 100644 --- a/pertpy/tools/_metadata/_cell_line.py +++ b/pertpy/tools/_metadata/_cell_line.py @@ -135,10 +135,7 @@ def __init__(self): # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics) proteomics_file_path = settings.cachedir.__str__() + "/proteomics_all_20221214_processed.csv" if not Path(proteomics_file_path).exists(): - print( - "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger).\n - Starting download now." - ) + print("[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now.") _download( url="https://figshare.com/ndownloader/files/42468393", output_file_name="proteomics_all_20221214_processed.csv",