debug, new modules, and code improvements #46

Merged
merged 51 commits into master from abe-dev on Jun 24, 2024
(The file changes shown below reflect 50 of the 51 commits.)

Commits (51)
1ba375a
update .gitignore file
abearab Jun 6, 2024
4fd9dd0
add load function for coessentiality
abearab Jun 6, 2024
4fa9aa8
add `diffexp` module
abearab Jun 12, 2024
1324a41
add `config_path` argument
abearab Jun 23, 2024
dc54404
make `Manager` a parent class
abearab Jun 23, 2024
f47ff47
add additional packages
abearab Jun 23, 2024
79ed21a
Merge branch 'master' into abe-dev
abearab Jun 23, 2024
538cf57
fix import
abearab Jun 23, 2024
2aa82b6
mend
abearab Jun 23, 2024
00e2543
minor fixes
abearab Jun 23, 2024
026bb75
minor fixes
abearab Jun 23, 2024
f0904af
mend
abearab Jun 23, 2024
fe07d58
mend
abearab Jun 23, 2024
119f541
mend
abearab Jun 23, 2024
654f98f
bump version 0.1.2
abearab Jun 23, 2024
1a9a0d1
relative import
abearab Jun 24, 2024
65effc7
switch to python >3.11
abearab Jun 24, 2024
dcb70b0
mend
abearab Jun 24, 2024
6675812
mend
abearab Jun 24, 2024
1a6703a
switch to python >3.11
abearab Jun 24, 2024
2c8ca70
draft `Manager` class test
abearab Jun 24, 2024
87b92bc
update `.gitignore`
abearab Jun 24, 2024
b0a3d18
relative import
abearab Jun 24, 2024
686cd03
add `data_paths`
abearab Jun 24, 2024
2b511fa
switch to python >3.11
abearab Jun 24, 2024
8254f37
update README
abearab Jun 24, 2024
6dc9482
mend
abearab Jun 24, 2024
cee3f98
mend
abearab Jun 24, 2024
de52809
mend
abearab Jun 24, 2024
52f39b5
mend
abearab Jun 24, 2024
1b00131
mend
abearab Jun 24, 2024
422b2b3
mend
abearab Jun 24, 2024
3f045bc
mend
abearab Jun 24, 2024
a6cdb68
mend
abearab Jun 24, 2024
410d298
mend
abearab Jun 24, 2024
33cf352
add citation
abearab Jun 24, 2024
11ee76f
mend
abearab Jun 24, 2024
7403413
add badge
abearab Jun 24, 2024
c97223c
add downloads
abearab Jun 24, 2024
198c163
mend
abearab Jun 24, 2024
33083a5
update .gitignore file
abearab Jun 24, 2024
940eae3
set `include_package_data` to true
abearab Jun 24, 2024
0c81203
add uninstall scripts
abearab Jun 24, 2024
1d0e9b1
mend
abearab Jun 24, 2024
3d1ac9e
mend
abearab Jun 24, 2024
d74c0fa
minor debug
abearab Jun 24, 2024
04db890
debug
abearab Jun 24, 2024
bd64917
debug
abearab Jun 24, 2024
e6003d2
major changes in CanDI setup scripts and harmonize coessentiality ins…
abearab Jun 24, 2024
f2f2aba
draft coessentiality module
abearab Jun 24, 2024
fbdfee2
bump version 0.2.0
abearab Jun 24, 2024
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
+        python-version: ["3.11"]
 
     steps:
     - uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         os-version: ["ubuntu-latest"]
-        python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
+        python-version: ["3.11"]
 
     steps:
     - uses: actions/checkout@v3
Binary file modified .gitignore
Binary file not shown.
2 changes: 1 addition & 1 deletion CanDI/__version__.py
@@ -1 +1 @@
-version = "0.1.1"
+version = "0.1.2"
4 changes: 3 additions & 1 deletion CanDI/candi/__init__.py
@@ -1,4 +1,6 @@
 from . import load
 from . import data
+
 data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects
-from . import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
+
+from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
4 changes: 2 additions & 2 deletions CanDI/candi/candi.py
@@ -1,11 +1,11 @@
 # Classes for handling data aggregations
 import operator
-from collections import OrderedDict, MutableSequence
+from collections.abc import MutableSequence
 import itertools as it
 import pandas as pd
 import numpy as np
 from . import data, grabber
-from . import entity
+from ..structures import entity
 
 class SubsetHandler(object):
 
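Reviewer note: this import change is what keeps the package importable on the Python 3.11 CI configured above. The container ABCs have lived in collections.abc since Python 3.3, and the deprecated aliases in the top-level collections namespace were removed in Python 3.10, so the old line raises ImportError on 3.10+. A minimal illustration:

# from collections import MutableSequence    # ImportError on Python 3.10+
from collections.abc import MutableSequence  # correct since Python 3.3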
17 changes: 12 additions & 5 deletions CanDI/candi/data.py
@@ -14,21 +14,27 @@ class Data(object):
     can be tuned to load specific datasets upon import by editing config.ini
     can call Data.load() to load any specific dataset
     """
-    def __init__(self):
+    def __init__(self, config_path='auto', verbose=False):
 
-        self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
-        config_path = self._file_path / 'data/config.ini'
+        if config_path == 'auto':
+            self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
+            config_path = self._file_path / 'data/config.ini'
+        elif os.path.exists(config_path) == False:
+            raise FileNotFoundError("Config file not found at {}".format(config_path))
+        elif os.path.exists(config_path) == True:
+            if verbose: print("Using config file at {}".format(config_path))
 
         parser = configparser.ConfigParser() #parses config for data sources
         parser.read(config_path)
 
         self._parser = parser
-        #self._verify_install()
+        self._verify_install()
         self._init_sources()
         self._init_depmap_paths()
-        # self._init_index_tables()
+        self._init_index_tables()
 
     def _verify_install(self): #ensures data being loaded is present
+        #TODO: add more checks for different data sources
         try:
             assert "depmap_urls" in self._parser.sections()
@@ -91,6 +97,7 @@ def _handle_autoload(method, path):
             df = pd.read_csv(path,
                              memory_map=True,
                              low_memory=False,
+                             sep='\t',
                              index_col="DepMap_ID")
 
         elif method == "locations":
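Reviewer note: a short sketch of how the new constructor arguments would be used; the custom path below is a placeholder, not part of the diff:

from CanDI.candi.data import Data

data = Data()  # 'auto' resolves config.ini from the package's setup directory
data = Data(config_path="/path/to/config.ini", verbose=True)  # custom config; raises FileNotFoundError if the file is missing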
Empty file added CanDI/pipelines/__init__.py
Empty file.
52 changes: 52 additions & 0 deletions CanDI/pipelines/diffexp.py
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import anndata as ad

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from adpbulk import ADPBulk


def pseudobulk_by_group(adt, groups, method="mean"):
# initialize the object
adpb = ADPBulk(adt, groupby=groups, method=method)

# perform the pseudobulking
pseudobulk_matrix = adpb.fit_transform()

# retrieve the sample metadata (useful for easy incorporation with edgeR)
sample_meta = adpb.get_meta()

out = ad.AnnData(
X=pseudobulk_matrix,
obs=sample_meta.set_index('SampleName')
)

return out


def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):

inference = DefaultInference(n_cpus=n_cpus)

dds = DeseqDataSet(
counts=adata.to_df().astype(int),
metadata=adata.obs,
design_factors=design, # compare samples based on the "condition"
refit_cooks=True,
inference=inference,
)

dds.deseq2()

stat_res = DeseqStats(
dds,
contrast=[design, tested_level, ref_level],
inference=inference
)
stat_res.summary()

df = stat_res.results_df

return df
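Reviewer note: a usage sketch for the new pipeline module, assuming an AnnData object whose .obs has "sample" and "condition" columns; all names below are illustrative:

import anndata as ad
from CanDI.pipelines.diffexp import pseudobulk_by_group, run_deseq

adata = ad.read_h5ad("perturb_seq.h5ad")  # hypothetical input

# Collapse cells into per-sample, per-condition pseudobulk profiles;
# "sum" keeps the aggregated values count-like before run_deseq casts them to int.
pb = pseudobulk_by_group(adata, groups=["sample", "condition"], method="sum")

# DESeq2 contrast of the "treated" vs "control" levels of the condition factor.
results = run_deseq(pb, design="condition", tested_level="treated", ref_level="control")
print(results.sort_values("padj").head())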
189 changes: 117 additions & 72 deletions CanDI/setup/dataverse.py
@@ -9,6 +9,18 @@
 
 CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'
 
+
+### Datasets Metadata ###
+
+coessentiality_dataset_names = [
+    'genes',
+    # 10273535
+    'GLS_p',
+    # 10273534
+    'GLS_sign',
+    # 10273533
+]
+
 depmap_dataset_names = [
     'CCLE_expression',
     'CCLE_fusions',
@@ -22,6 +34,11 @@
 ]
 
 name2type = {
+    # Coessentiality datasets
+    'genes': 'txt',
+    'GLS_p': 'npy',
+    'GLS_sign': 'npy',
+    # DepMap datasets
     'CCLE_expression': 'csv',
     'CCLE_fusions': 'csv',
     'CCLE_gene_cn': 'csv',
@@ -34,6 +51,11 @@
 }
 
 name2id = {
+    # Coessentiality datasets
+    'genes': 10273535,
+    'GLS_p': 10273534,
+    'GLS_sign': 10273533,
+    # DepMap datasets
     'CCLE_expression': 8076862,
     'CCLE_fusions': 10085763,
     'CCLE_gene_cn': 8076861,
@@ -46,6 +68,7 @@
 }
 
 
+### Utility functions ###
 def print_sys(s):
     """system print
 
@@ -55,80 +78,102 @@ def print_sys(s):
     print(s, flush = True, file = sys.stderr)
 
 
-def dataverse_download(url, path, name, types):
-    """dataverse download helper with progress bar
-
-    Args:
-        url (str): the url of the dataset
-        path (str): the path to save the dataset
-        name (str): the dataset name
-        types (dict): a dictionary mapping from the dataset name to the file format
-    """
-    save_path = os.path.join(path, f"{name}.{types[name]}")
-    response = requests.get(url, stream=True)
-    total_size_in_bytes = int(response.headers.get("content-length", 0))
-    block_size = 1024
-    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
-    with open(save_path, "wb") as file:
-        for data in response.iter_content(block_size):
-            progress_bar.update(len(data))
-            file.write(data)
-    progress_bar.close()
-
-
-def download_wrapper(name, path, return_type=None):
-    """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
-
-    Args:
-        name (str): the rough dataset query name
-        path (str): the path to save the dataset
-        return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
-
-    Returns:
-        str: the exact dataset query name
-    """
-    server_path = "https://dataverse.harvard.edu/api/access/datafile/"
-
-    url = server_path + str(name2id[name])
-
-    if not os.path.exists(path):
-        os.mkdir(path)
-
-    file_name = f"{name}.{name2type[name]}"
-
-    if os.path.exists(os.path.join(path, file_name)):
-        print_sys("Found local copy...")
-        os.path.join(path, file_name)
-    else:
-        print_sys("Downloading...")
-        dataverse_download(url, path, name, name2type)
-
-    if return_type == "url":
-        return url
-    elif return_type == "name":
-        return file_name
-    elif return_type == ["url", "name"]:
-        return url, file_name
-
-
-def depmap_dataverse_download(path, return_type=None):
-    """download all datasets to the path
-
-    Args:
-        path (str): the path to save the datasets
-        return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
-    """
-    url_list = []
-    file_names = []
-
-    for name in depmap_dataset_names:
-        url, file_name = download_wrapper(name, path, return_type=["url", "name"])
-        url_list.append(url)
-        file_names.append(file_name)
-
-    if return_type == "url":
-        return url_list
-    elif return_type == "name":
-        return file_names
-    elif return_type == ["url", "name"]:
-        return url_list, file_names
+### Downloading scripts ###
+
+class Downloader:
+    def __init__(self):
+        pass
+
+    def _dataverse_download(self, url, path, name, types):
+        """dataverse download helper with progress bar
+
+        Args:
+            url (str): the url of the dataset
+            path (str): the path to save the dataset
+            name (str): the dataset name
+            types (dict): a dictionary mapping from the dataset name to the file format
+        """
+        save_path = os.path.join(path, f"{name}.{types[name]}")
+        response = requests.get(url, stream=True)
+        total_size_in_bytes = int(response.headers.get("content-length", 0))
+        block_size = 1024
+        progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+        with open(save_path, "wb") as file:
+            for data in response.iter_content(block_size):
+                progress_bar.update(len(data))
+                file.write(data)
+        progress_bar.close()
+
+    def _download_wrapper(self, name, path, return_type=None):
+        """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
+
+        Args:
+            name (str): the rough dataset query name
+            path (str): the path to save the dataset
+            return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+
+        Returns:
+            str: the exact dataset query name
+        """
+        server_path = "https://dataverse.harvard.edu/api/access/datafile/"
+
+        url = server_path + str(name2id[name])
+
+        if not os.path.exists(path):
+            os.mkdir(path)
+
+        file_name = f"{name}.{name2type[name]}"
+
+        if os.path.exists(os.path.join(path, file_name)):
+            print_sys("Found local copy...")
+            os.path.join(path, file_name)
+        else:
+            print_sys("Downloading...")
+            self._dataverse_download(url, path, name, name2type)
+
+        if return_type == "url":
+            return url
+        elif return_type == "name":
+            return file_name
+        elif return_type == ["url", "name"]:
+            return url, file_name
+
+    def run(self, path, datasets, return_type=None):
+        """download all datasets to the path
+
+        Args:
+            path (str): the path to save the datasets
+            return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
+        """
+        url_list = []
+        file_names = []
+
+        for name in datasets:
+            url, file_name = self._download_wrapper(name, path, return_type=["url", "name"])
+            url_list.append(url)
+            file_names.append(file_name)
+
+        if return_type == "url":
+            return url_list
+        elif return_type == "name":
+            return file_names
+        elif return_type == ["url", "name"]:
+            return url_list, file_names
+
+
+class DepMapDownloader(Downloader):
+    def __init__(self):
+        super().__init__()
+
+    def download(self, path, return_type=None):
+        return self.run(path, depmap_dataset_names, return_type)
+
+
+class CoessentialityDownloader(Downloader):
+    def __init__(self):
+        super().__init__()
+
+    def download(self, path, return_type=None):
+        return self.run(path, coessentiality_dataset_names, return_type)
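Reviewer note: with this refactor the module is driven through the downloader subclasses instead of the old depmap_dataverse_download function. A quick sketch; the target directories are placeholders:

from CanDI.setup.dataverse import DepMapDownloader, CoessentialityDownloader

depmap_files = DepMapDownloader().download("depmap_data", return_type="name")
coessentiality_files = CoessentialityDownloader().download("coessentiality_data", return_type="name")
print(depmap_files)
print(coessentiality_files)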