Skip to content

Commit

Permalink
made dataset identification more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMenden committed Jan 12, 2021
1 parent 2dc2a83 commit ce7b587
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 68 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Scaden Changelog

### Version 1.0.1

* Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66)

### Version 1.0.0

* Rebuilt the Scaden model and training to use the TF2 Keras API instead of the old compatibility functions
Expand Down
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Scaden Changelog

### Version 1.0.1

* Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66)

### Version 1.0.0

* Rebuilt the Scaden model and training to use the TF2 Keras API instead of the old compatibility functions
Expand Down
65 changes: 26 additions & 39 deletions scaden/preprocessing/bulk_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,7 @@ def create_fractions(no_celltypes):
return fracs


def create_subsample(x,
y,
sample_size,
celltypes,
available_celltypes,
sparse=False):
def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False):
"""
Generate artificial bulk subsample with random fractions of celltypes
If sparse is set to true, add random celltypes to the missing celltypes
Expand All @@ -46,9 +41,9 @@ def create_subsample(x,

if sparse:
no_keep = np.random.randint(1, len(available_celltypes))
keep = np.random.choice(list(range(len(available_celltypes))),
size=no_keep,
replace=False)
keep = np.random.choice(
list(range(len(available_celltypes))), size=no_keep, replace=False
)
available_celltypes = [available_celltypes[i] for i in keep]

no_avail_cts = len(available_celltypes)
Expand All @@ -68,8 +63,7 @@ def create_subsample(x,
for i in range(no_avail_cts):
ct = available_celltypes[i]
cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
cells_fraction = np.random.randint(0, cells_sub.shape[0],
samp_fracs[i])
cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i])
cells_sub = cells_sub.iloc[cells_fraction, :]
artificial_samples.append(cells_sub)

Expand Down Expand Up @@ -99,8 +93,9 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
pbar = tqdm(range(no_samples))
pbar.set_description(desc="Normal samples")
for _ in pbar:
sample, label = create_subsample(x, y, sample_size, celltypes,
available_celltypes)
sample, label = create_subsample(
x, y, sample_size, celltypes, available_celltypes
)
X.append(sample)
Y.append(label)

Expand All @@ -109,12 +104,9 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
pbar = tqdm(range(n_sparse))
pbar.set_description(desc="Sparse samples")
for _ in pbar:
sample, label = create_subsample(x,
y,
sample_size,
celltypes,
available_celltypes,
sparse=True)
sample, label = create_subsample(
x, y, sample_size, celltypes, available_celltypes, sparse=True
)
X.append(sample)
Y.append(label)
X = pd.concat(X, axis=1).T
Expand Down Expand Up @@ -176,7 +168,7 @@ def load_celltypes(path, name):
try:
y = pd.read_table(path)
# Check if has Celltype column
if not 'Celltype' in y.columns:
if not "Celltype" in y.columns:
logger.error(
f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
)
Expand Down Expand Up @@ -206,7 +198,7 @@ def load_dataset(name, dir, pattern):
y = pd.read_table(os.path.join(dir, name + "_celltypes.txt"))
# Check if has Celltype column
print(y.columns)
if not 'Celltype' in y.columns:
if not "Celltype" in y.columns:
logger.error(
f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
)
Expand Down Expand Up @@ -244,9 +236,7 @@ def merge_unkown_celltypes(y, unknown_celltypes):
:return:
"""
celltypes = list(y["Celltype"])
new_celltypes = [
"Unknown" if x in unknown_celltypes else x for x in celltypes
]
new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes]
y["Celltype"] = new_celltypes
return y

Expand Down Expand Up @@ -316,8 +306,9 @@ def generate_signature(x, y):
return signature_matrix


def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
unknown_celltypes):
def simulate_bulk(
sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes
):
"""
Simulate artificial bulk samples from single cell datasets
:param sample_size: number of cells per sample
Expand All @@ -329,19 +320,18 @@ def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
"""

num_samples = int(
num_samples /
2) # divide by two so half is sparse and half is normal samples
num_samples / 2
) # divide by two so half is sparse and half is normal samples

# List available datasets
if not data_path.endswith("/"):
data_path += "/"
files = glob.glob(os.path.join(data_path, pattern))
files = [os.path.basename(x) for x in files]
datasets = [x.split("_")[0] for x in files]
datasets = [x.replace(pattern.replace("*", ""), "") for x in files]

if len(datasets) == 0:
logging.error(
"No datasets found! Have you specified the pattern correctly?")
logging.error("No datasets found! Have you specified the pattern correctly?")
sys.exit(1)

print("Datasets: " + str(datasets))
Expand Down Expand Up @@ -371,14 +361,11 @@ def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
# Create datasets
for i in range(len(xs)):
print("Subsampling " + datasets[i] + "...")
tmpx, tmpy = create_subsample_dataset(xs[i], ys[i], sample_size,
celltypes, num_samples)
tmpx.to_csv(out_dir + datasets[i] + "_samples.txt",
sep="\t",
index=False)
tmpy.to_csv(out_dir + datasets[i] + "_labels.txt",
sep="\t",
index=False)
tmpx, tmpy = create_subsample_dataset(
xs[i], ys[i], sample_size, celltypes, num_samples
)
tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False)
tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False)
gc.collect()

print("Finished!")
12 changes: 6 additions & 6 deletions scaden/preprocessing/create_h5ad_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):
# List available datasets
files = glob.glob(data_dir + pattern)
files = [os.path.basename(x) for x in files]
datasets = [x.split("_")[0] for x in files]
datasets = [x.replace(pattern.replace("*", ""), "") for x in files]

# get celltypes
celltypes = load_celltypes(data_dir)
Expand All @@ -95,15 +95,15 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):

x = x.sort_index(axis=1)
ratios = pd.DataFrame(y, columns=celltypes)
ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]),
index=ratios.index)
ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index)

print("Processing " + str(train_file))
x = pd.DataFrame(x)
adata.append(
anndata.AnnData(X=x.to_numpy(),
obs=ratios,
var=pd.DataFrame(columns=[], index=list(x))))
anndata.AnnData(
X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x))
)
)

for i in range(1, len(adata)):
print("Concatenating " + str(i))
Expand Down
57 changes: 34 additions & 23 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,43 @@

from setuptools import setup, find_packages

version = '1.0.0'
version = "1.0.1"

with open("README.md", "r", encoding="UTF-8") as fh:
long_description = fh.read()

with open('LICENSE', encoding="UTF-8") as f:
with open("LICENSE", encoding="UTF-8") as f:
license = f.read()

setup(name='scaden',
version=version,
description="Cell type deconvolution using single cell data",
long_description=long_description,
long_description_content_type="text/markdown",
keywords=[
'bioinformatics', 'deep learning', 'machine learning',
'single cell sequencing', 'deconvolution'
],
author='Kevin Menden',
author_email='kevin.menden@t-online.de',
url='https://github.com/KevinMenden/scaden',
license="MIT License",
entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
packages=find_packages(),
include_package_data=True,
python_requires='>3.6.0',
install_requires=[
'pandas', 'numpy', 'scikit-learn', 'tensorflow>=2.0', 'anndata',
'tqdm', 'click', 'h5py~=2.10.0'
])
setup(
name="scaden",
version=version,
description="Cell type deconvolution using single cell data",
long_description=long_description,
long_description_content_type="text/markdown",
keywords=[
"bioinformatics",
"deep learning",
"machine learning",
"single cell sequencing",
"deconvolution",
],
author="Kevin Menden",
author_email="kevin.menden@t-online.de",
url="https://github.com/KevinMenden/scaden",
license="MIT License",
entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
packages=find_packages(),
include_package_data=True,
python_requires=">3.6.0",
install_requires=[
"pandas",
"numpy",
"scikit-learn",
"tensorflow>=2.0",
"anndata",
"tqdm",
"click",
"h5py~=2.10.0",
],
)

0 comments on commit ce7b587

Please sign in to comment.