Skip to content

Commit

Permalink
Merge pull request #401 from theislab/dev_metadata
Browse files Browse the repository at this point in the history
Remove remotezip
  • Loading branch information
wxicu authored Oct 16, 2023
2 parents 67ee210 + 8bbe14e commit b01f318
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ htmlcov/
.coverage
.coverage.*
.cache
.pertpy_cache
nosetests.xml
coverage.xml
*.cover
Expand Down
64 changes: 27 additions & 37 deletions pertpy/tools/_metadata/_cell_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import TYPE_CHECKING, Literal

import pandas as pd
from remotezip import RemoteZip
from rich import print
from scanpy import settings

Expand All @@ -20,6 +19,7 @@ class CellLineMetaData:
"""Utilities to fetch cell line metadata."""

def __init__(self):
settings.cachedir = ".pertpy_cache"
# Download cell line metadata from DepMap
# Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
cell_line_file_path = settings.cachedir.__str__() + "/sample_info.csv"
Expand Down Expand Up @@ -84,7 +84,7 @@ def __init__(self):
else:
self.cl_cancer_project_meta = pd.read_csv(cell_line_cancer_project_transformed_path, index_col=0)

# Download metadata for driver genes from DepMap_Sanger
# Download metadata for driver genes from DepMap.Sanger
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
gene_annotation_file_path = settings.cachedir.__str__() + "/gene_identifiers_20191101.csv"
if not Path(gene_annotation_file_path).exists():
Expand All @@ -98,31 +98,27 @@ def __init__(self):
)
self.gene_annotation = pd.read_table(gene_annotation_file_path, delimiter=",")

# Download bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute fro DepMap.Sanger
# Download bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute from DepMap.Sanger
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Expression data)
bulk_rna_sanger_file_path = settings.cachedir.__str__() + "/rnaseq_read_count_20220624.csv"
# Issue: the read count values contain stray whitespace; it is unclear what that is supposed to mean.
# Solution: remove the whitespace and convert the values to int until DepMap updates the metadata.
bulk_rna_sanger_file_path = settings.cachedir.__str__() + "/rnaseq_read_count_20220624_processed.csv"
if not Path(bulk_rna_sanger_file_path).exists():
print(
"[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line."
" Starting download now..."
)
with RemoteZip("https://cog.sanger.ac.uk/cmp/download/rnaseq_all_20220624.zip") as zip_file:
zip_file.extract("rnaseq_read_count_20220624.csv", path=settings.cachedir)

self.bulk_rna_sanger = pd.read_csv(bulk_rna_sanger_file_path, skiprows=[2, 3], header=[0, 1], index_col=[0, 1])

# issue: read count values contain random whitespace, not sure what it supposes to mean
# solution: remove the white space and convert to int before depmap updates the metadata
self.bulk_rna_sanger = self.bulk_rna_sanger.applymap(
lambda x: int(x.replace(" ", "")) if isinstance(x, str) else x
)
self.bulk_rna_sanger = self.bulk_rna_sanger.T
self.bulk_rna_sanger.index = self.bulk_rna_sanger.index.droplevel("model_id")
self.bulk_rna_sanger.columns = self.bulk_rna_sanger.columns.droplevel("gene_id")
_download(
url="https://figshare.com/ndownloader/files/42467103",
output_file_name="rnaseq_read_count_20220624_processed.csv",
output_path=settings.cachedir,
block_size=4096,
is_zip=False,
)
self.bulk_rna_sanger = pd.read_csv(bulk_rna_sanger_file_path, index_col=0)

# Download CCLE expression data from DepMap
# Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
# bulk_rna_broad_file_path = settings.cachedir.__str__() + "/CCLE_expression.csv"
# Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
bulk_rna_broad_file_path = settings.cachedir.__str__() + "/CCLE_expression_full.csv"
if not Path(bulk_rna_broad_file_path).exists():
print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.")
Expand All @@ -135,25 +131,19 @@ def __init__(self):
)
self.bulk_rna_broad = pd.read_csv(bulk_rna_broad_file_path, index_col=0)

# Download proteomics data from DepMap.Sanger
# Download proteomics data processed by DepMap.Sanger
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
proteomics_file_path = settings.cachedir.__str__() + "/proteomics_all_20221214.csv"
proteomics_trimm_path = settings.cachedir.__str__() + "/proteomics_all_20221214_trimm.csv"
if not Path(proteomics_trimm_path).exists():
if not Path(proteomics_file_path).exists():
print(
"[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger)."
" Starting download now."
)
with RemoteZip("https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip") as zip_file:
zip_file.extract("proteomics_all_20221214.csv", path=settings.cachedir)
self.proteomics_data = pd.read_csv(proteomics_file_path)
self.proteomics_data[["uniprot_id", "model_id", "model_name", "symbol"]] = self.proteomics_data[
["uniprot_id", "model_id", "model_name", "symbol"]
].astype("category")
self.proteomics_data.to_csv(proteomics_trimm_path)
else:
self.proteomics_data = pd.read_csv(proteomics_trimm_path, index_col=0)
proteomics_file_path = settings.cachedir.__str__() + "/proteomics_all_20221214_processed.csv"
if not Path(proteomics_file_path).exists():
print("[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now.")
_download(
url="https://figshare.com/ndownloader/files/42468393",
output_file_name="proteomics_all_20221214_processed.csv",
output_path=settings.cachedir,
block_size=4096,
is_zip=False,
)
self.proteomics_data = pd.read_csv(proteomics_file_path, index_col=0)

# Download GDSC drug response data
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ dependencies = [
"ott-jax",
"sparsecca",
"numba",
"remotezip",
"openpyxl",
]

Expand Down

0 comments on commit b01f318

Please sign in to comment.