debug, new modules, and code improvements #46

Merged
merged 51 commits into master from abe-dev on Jun 24, 2024
(The file changes shown below reflect 50 of the 51 commits.)

Commits (51)
1ba375a
update .gitignore file
abearab Jun 6, 2024
4fd9dd0
add load function for coessentiality
abearab Jun 6, 2024
4fa9aa8
add `diffexp` module
abearab Jun 12, 2024
1324a41
add `config_path` argument
abearab Jun 23, 2024
dc54404
make `Manager` a parent class
abearab Jun 23, 2024
f47ff47
add additional packages
abearab Jun 23, 2024
79ed21a
Merge branch 'master' into abe-dev
abearab Jun 23, 2024
538cf57
fix import
abearab Jun 23, 2024
2aa82b6
mend
abearab Jun 23, 2024
00e2543
minor fixes
abearab Jun 23, 2024
026bb75
minor fixes
abearab Jun 23, 2024
f0904af
mend
abearab Jun 23, 2024
fe07d58
mend
abearab Jun 23, 2024
119f541
mend
abearab Jun 23, 2024
654f98f
bump version 0.1.2
abearab Jun 23, 2024
1a9a0d1
relative import
abearab Jun 24, 2024
65effc7
switch to python >3.11
abearab Jun 24, 2024
dcb70b0
mend
abearab Jun 24, 2024
6675812
mend
abearab Jun 24, 2024
1a6703a
switch to python >3.11
abearab Jun 24, 2024
2c8ca70
draft `Manager` class test
abearab Jun 24, 2024
87b92bc
update `.gitignore`
abearab Jun 24, 2024
b0a3d18
relative import
abearab Jun 24, 2024
686cd03
add `data_paths`
abearab Jun 24, 2024
2b511fa
switch to python >3.11
abearab Jun 24, 2024
8254f37
update README
abearab Jun 24, 2024
6dc9482
mend
abearab Jun 24, 2024
cee3f98
mend
abearab Jun 24, 2024
de52809
mend
abearab Jun 24, 2024
52f39b5
mend
abearab Jun 24, 2024
1b00131
mend
abearab Jun 24, 2024
422b2b3
mend
abearab Jun 24, 2024
3f045bc
mend
abearab Jun 24, 2024
a6cdb68
mend
abearab Jun 24, 2024
410d298
mend
abearab Jun 24, 2024
33cf352
add citation
abearab Jun 24, 2024
11ee76f
mend
abearab Jun 24, 2024
7403413
add badge
abearab Jun 24, 2024
c97223c
add downloads
abearab Jun 24, 2024
198c163
mend
abearab Jun 24, 2024
33083a5
update .gitignore file
abearab Jun 24, 2024
940eae3
set `include_package_data` to true
abearab Jun 24, 2024
0c81203
add uninstall scripts
abearab Jun 24, 2024
1d0e9b1
mend
abearab Jun 24, 2024
3d1ac9e
mend
abearab Jun 24, 2024
d74c0fa
minor debug
abearab Jun 24, 2024
04db890
debug
abearab Jun 24, 2024
bd64917
debug
abearab Jun 24, 2024
e6003d2
major changes in CanDI setup scripts and harmonize coessentiality ins…
abearab Jun 24, 2024
f2f2aba
draft coessentiality module
abearab Jun 24, 2024
fbdfee2
bump version 0.2.0
abearab Jun 24, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
+        python-version: ["3.11"]
 
     steps:
     - uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         os-version: ["ubuntu-latest"]
-        python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
+        python-version: ["3.11"]
 
     steps:
     - uses: actions/checkout@v3
Binary file modified .gitignore
Binary file not shown.
2 changes: 1 addition & 1 deletion CanDI/__version__.py
@@ -1 +1 @@
-version = "0.1.1"
+version = "0.1.2"
4 changes: 3 additions & 1 deletion CanDI/candi/__init__.py
@@ -1,4 +1,6 @@
 from . import load
 from . import data
+
 data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects
-from . import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
+
+from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
4 changes: 2 additions & 2 deletions CanDI/candi/candi.py
@@ -1,11 +1,11 @@
 # Classes for handling data aggregations
 import operator
-from collections import OrderedDict, MutableSequence
+from collections.abc import MutableSequence
 import itertools as it
 import pandas as pd
 import numpy as np
 from . import data, grabber
-from . import entity
+from ..structures import entity
 
 class SubsetHandler(object):
 
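Reviewer note: this import change is what keeps the package importable on the Python 3.11 CI configured above. The container ABCs have lived in collections.abc since Python 3.3, and the deprecated aliases in the top-level collections namespace were removed in Python 3.10, so the old line raises ImportError on 3.10+. A minimal illustration:

# from collections import MutableSequence    # ImportError on Python 3.10+
from collections.abc import MutableSequence  # correct since Python 3.3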
17 changes: 12 additions & 5 deletions CanDI/candi/data.py
@@ -14,21 +14,27 @@ class Data(object):
     can be tuned to load specific datasets upon import by editing config.ini
     can call Data.load() to load any specific dataset
     """
-    def __init__(self):
+    def __init__(self, config_path='auto', verbose=False):
 
-        self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
-        config_path = self._file_path / 'data/config.ini'
+        if config_path == 'auto':
+            self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
+            config_path = self._file_path / 'data/config.ini'
+        elif os.path.exists(config_path) == False:
+            raise FileNotFoundError("Config file not found at {}".format(config_path))
+        elif os.path.exists(config_path) == True:
+            if verbose: print("Using config file at {}".format(config_path))
 
         parser = configparser.ConfigParser() #parses config for data sources
         parser.read(config_path)
 
         self._parser = parser
-        #self._verify_install()
+        self._verify_install()
         self._init_sources()
         self._init_depmap_paths()
-        # self._init_index_tables()
+        self._init_index_tables()
 
     def _verify_install(self): #ensures data being loaded is present
+        #TODO: add more checks for different data sources
         try:
             assert "depmap_urls" in self._parser.sections()
@@ -91,6 +97,7 @@ def _handle_autoload(method, path):
             df = pd.read_csv(path,
                              memory_map=True,
                              low_memory=False,
+                             sep='\t',
                              index_col="DepMap_ID")
 
         elif method == "locations":
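Reviewer note: a short sketch of how the new constructor arguments would be used; the custom path below is a placeholder, not part of the diff:

from CanDI.candi.data import Data

data = Data()  # 'auto' resolves config.ini from the package's setup directory
data = Data(config_path="/path/to/config.ini", verbose=True)  # custom config; raises FileNotFoundError if the file is missing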
Empty file added CanDI/pipelines/__init__.py
Empty file.
52 changes: 52 additions & 0 deletions CanDI/pipelines/diffexp.py
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import anndata as ad

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from adpbulk import ADPBulk


def pseudobulk_by_group(adt, groups, method="mean"):
# initialize the object
adpb = ADPBulk(adt, groupby=groups, method=method)

# perform the pseudobulking
pseudobulk_matrix = adpb.fit_transform()

# retrieve the sample metadata (useful for easy incorporation with edgeR)
sample_meta = adpb.get_meta()

out = ad.AnnData(
X=pseudobulk_matrix,
obs=sample_meta.set_index('SampleName')
)

return out


def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):

inference = DefaultInference(n_cpus=n_cpus)

dds = DeseqDataSet(
counts=adata.to_df().astype(int),
metadata=adata.obs,
design_factors=design, # compare samples based on the "condition"
refit_cooks=True,
inference=inference,
)

dds.deseq2()

stat_res = DeseqStats(
dds,
contrast=[design, tested_level, ref_level],
inference=inference
)
stat_res.summary()

df = stat_res.results_df

return df
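Reviewer note: a usage sketch for the new pipeline module, assuming an AnnData object whose .obs has "sample" and "condition" columns; all names below are illustrative:

import anndata as ad
from CanDI.pipelines.diffexp import pseudobulk_by_group, run_deseq

adata = ad.read_h5ad("perturb_seq.h5ad")  # hypothetical input

# Collapse cells into per-sample, per-condition pseudobulk profiles;
# "sum" keeps the aggregated values count-like before run_deseq casts them to int.
pb = pseudobulk_by_group(adata, groups=["sample", "condition"], method="sum")

# DESeq2 contrast of the "treated" vs "control" levels of the condition factor.
results = run_deseq(pb, design="condition", tested_level="treated", ref_level="control")
print(results.sort_values("padj").head())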
189 changes: 117 additions & 72 deletions CanDI/setup/dataverse.py
@@ -9,6 +9,18 @@
 
 CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'
 
+
+### Datasets Metadata ###
+
+coessentiality_dataset_names = [
+    'genes',
+    # 10273535
+    'GLS_p',
+    # 10273534
+    'GLS_sign',
+    # 10273533
+]
+
 depmap_dataset_names = [
     'CCLE_expression',
     'CCLE_fusions',
@@ -22,6 +34,11 @@
 ]
 
 name2type = {
+    # Coessentiality datasets
+    'genes': 'txt',
+    'GLS_p': 'npy',
+    'GLS_sign': 'npy',
+    # DepMap datasets
     'CCLE_expression': 'csv',
     'CCLE_fusions': 'csv',
     'CCLE_gene_cn': 'csv',
@@ -34,6 +51,11 @@
 }
 
 name2id = {
+    # Coessentiality datasets
+    'genes': 10273535,
+    'GLS_p': 10273534,
+    'GLS_sign': 10273533,
+    # DepMap datasets
     'CCLE_expression': 8076862,
     'CCLE_fusions': 10085763,
     'CCLE_gene_cn': 8076861,
@@ -46,6 +68,7 @@
 }
 
 
+### Utility functions ###
 def print_sys(s):
     """system print
 
@@ -55,80 +78,102 @@ def print_sys(s):
     print(s, flush = True, file = sys.stderr)
 
 
-def dataverse_download(url, path, name, types):
-    """dataverse download helper with progress bar
-
-    Args:
-        url (str): the url of the dataset
-        path (str): the path to save the dataset
-        name (str): the dataset name
-        types (dict): a dictionary mapping from the dataset name to the file format
-    """
-    save_path = os.path.join(path, f"{name}.{types[name]}")
-    response = requests.get(url, stream=True)
-    total_size_in_bytes = int(response.headers.get("content-length", 0))
-    block_size = 1024
-    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
-    with open(save_path, "wb") as file:
-        for data in response.iter_content(block_size):
-            progress_bar.update(len(data))
-            file.write(data)
-    progress_bar.close()
-
-
-def download_wrapper(name, path, return_type=None):
-    """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
-
-    Args:
-        name (str): the rough dataset query name
-        path (str): the path to save the dataset
-        return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
-
-    Returns:
-        str: the exact dataset query name
-    """
-    server_path = "https://dataverse.harvard.edu/api/access/datafile/"
-
-    url = server_path + str(name2id[name])
-
-    if not os.path.exists(path):
-        os.mkdir(path)
-
-    file_name = f"{name}.{name2type[name]}"
-
-    if os.path.exists(os.path.join(path, file_name)):
-        print_sys("Found local copy...")
-        os.path.join(path, file_name)
-    else:
-        print_sys("Downloading...")
-        dataverse_download(url, path, name, name2type)
-
-    if return_type == "url":
-        return url
-    elif return_type == "name":
-        return file_name
-    elif return_type == ["url", "name"]:
-        return url, file_name
-
-
-def depmap_dataverse_download(path, return_type=None):
-    """download all datasets to the path
-
-    Args:
-        path (str): the path to save the datasets
-        return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
-    """
-    url_list = []
-    file_names = []
-
-    for name in depmap_dataset_names:
-        url, file_name = download_wrapper(name, path, return_type=["url", "name"])
-        url_list.append(url)
-        file_names.append(file_name)
-
-    if return_type == "url":
-        return url_list
-    elif return_type == "name":
-        return file_names
-    elif return_type == ["url", "name"]:
-        return url_list, file_names
+### Downloading scripts ###
+
+class Downloader:
+    def __init__(self):
+        pass
+
+    def _dataverse_download(self, url, path, name, types):
+        """dataverse download helper with progress bar
+
+        Args:
+            url (str): the url of the dataset
+            path (str): the path to save the dataset
+            name (str): the dataset name
+            types (dict): a dictionary mapping from the dataset name to the file format
+        """
+        save_path = os.path.join(path, f"{name}.{types[name]}")
+        response = requests.get(url, stream=True)
+        total_size_in_bytes = int(response.headers.get("content-length", 0))
+        block_size = 1024
+        progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+        with open(save_path, "wb") as file:
+            for data in response.iter_content(block_size):
+                progress_bar.update(len(data))
+                file.write(data)
+        progress_bar.close()
+
+    def _download_wrapper(self, name, path, return_type=None):
+        """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
+
+        Args:
+            name (str): the rough dataset query name
+            path (str): the path to save the dataset
+            return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+
+        Returns:
+            str: the exact dataset query name
+        """
+        server_path = "https://dataverse.harvard.edu/api/access/datafile/"
+
+        url = server_path + str(name2id[name])
+
+        if not os.path.exists(path):
+            os.mkdir(path)
+
+        file_name = f"{name}.{name2type[name]}"
+
+        if os.path.exists(os.path.join(path, file_name)):
+            print_sys("Found local copy...")
+            os.path.join(path, file_name)
+        else:
+            print_sys("Downloading...")
+            self._dataverse_download(url, path, name, name2type)
+
+        if return_type == "url":
+            return url
+        elif return_type == "name":
+            return file_name
+        elif return_type == ["url", "name"]:
+            return url, file_name
+
+    def run(self, path, datasets, return_type=None):
+        """download all datasets to the path
+
+        Args:
+            path (str): the path to save the datasets
+            return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+        """
+        url_list = []
+        file_names = []
+
+        for name in datasets:
+            url, file_name = self._download_wrapper(name, path, return_type=["url", "name"])
+            url_list.append(url)
+            file_names.append(file_name)
+
+        if return_type == "url":
+            return url_list
+        elif return_type == "name":
+            return file_names
+        elif return_type == ["url", "name"]:
+            return url_list, file_names
+
+
+class DepMapDownloader(Downloader):
+    def __init__(self):
+        super().__init__()
+
+    def download(self, path, return_type=None):
+        return self.run(path, depmap_dataset_names, return_type)
+
+
+class CoessentialityDownloader(Downloader):
+    def __init__(self):
+        super().__init__()
+
+    def download(self, path, return_type=None):
+        return self.run(path, coessentiality_dataset_names, return_type)
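Reviewer note: with this refactor the module is driven through the downloader subclasses instead of the old depmap_dataverse_download function. A quick sketch; the target directories are placeholders:

from CanDI.setup.dataverse import DepMapDownloader, CoessentialityDownloader

depmap_files = DepMapDownloader().download("depmap_data", return_type="name")
coessentiality_files = CoessentialityDownloader().download("coessentiality_data", return_type="name")
print(depmap_files)
print(coessentiality_files)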