Commit

Merge pull request #67 from KevinMenden/dataset-parsing-fix
made dataset identification more robust
KevinMenden committed Jan 12, 2021
2 parents 2dc2a83 + ce7b587 commit 8e204ec
Showing 5 changed files with 74 additions and 68 deletions.
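
The substantive change in this commit is how dataset names are parsed from file names. Previously, both preprocessing scripts kept only the text before the first underscore (`x.split("_")[0]`), which silently truncated any dataset name that itself contains an underscore. The new code instead strips the suffix derived from the glob pattern. A minimal sketch of the difference, using hypothetical file names; `human_pbmc` stands in for any dataset prefix containing an underscore:

```python
pattern = "*_counts.txt"  # assumed example glob pattern
files = ["data6k_counts.txt", "human_pbmc_counts.txt"]

# Old parsing: everything after the first underscore is dropped,
# so "human_pbmc" collapses to "human".
old = [x.split("_")[0] for x in files]
print(old)  # ['data6k', 'human']

# New parsing: remove only the suffix derived from the pattern,
# so underscores inside the dataset name survive.
suffix = pattern.replace("*", "")  # "_counts.txt"
new = [x.replace(suffix, "") for x in files]
print(new)  # ['data6k', 'human_pbmc']
```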
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
 # Scaden Changelog
 
+### Version 1.0.1
+
+* Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66)
+
 ### Version 1.0.0
 
 * Rebuild Scaden model and training to use TF2 Keras API instead of the old compatibility functions
4 changes: 4 additions & 0 deletions docs/changelog.md
@@ -1,5 +1,9 @@
 # Scaden Changelog
 
+### Version 1.0.1
+
+* Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66)
+
 ### Version 1.0.0
 
 * Rebuild Scaden model and training to use TF2 Keras API instead of the old compatibility functions
65 changes: 26 additions & 39 deletions scaden/preprocessing/bulk_simulation.py
@@ -26,12 +26,7 @@ def create_fractions(no_celltypes):
     return fracs
 
 
-def create_subsample(x,
-                     y,
-                     sample_size,
-                     celltypes,
-                     available_celltypes,
-                     sparse=False):
+def create_subsample(x, y, sample_size, celltypes, available_celltypes, sparse=False):
     """
     Generate artifical bulk subsample with random fractions of celltypes
     If sparse is set to true, add random celltypes to the missing celltypes
@@ -46,9 +41,9 @@ def create_subsample(x,
 
     if sparse:
         no_keep = np.random.randint(1, len(available_celltypes))
-        keep = np.random.choice(list(range(len(available_celltypes))),
-                                size=no_keep,
-                                replace=False)
+        keep = np.random.choice(
+            list(range(len(available_celltypes))), size=no_keep, replace=False
+        )
         available_celltypes = [available_celltypes[i] for i in keep]
 
     no_avail_cts = len(available_celltypes)
@@ -68,8 +63,7 @@ def create_subsample(x,
     for i in range(no_avail_cts):
         ct = available_celltypes[i]
         cells_sub = x.loc[np.array(y["Celltype"] == ct), :]
-        cells_fraction = np.random.randint(0, cells_sub.shape[0],
-                                           samp_fracs[i])
+        cells_fraction = np.random.randint(0, cells_sub.shape[0], samp_fracs[i])
         cells_sub = cells_sub.iloc[cells_fraction, :]
         artificial_samples.append(cells_sub)
 
@@ -99,8 +93,9 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
     pbar = tqdm(range(no_samples))
     pbar.set_description(desc="Normal samples")
     for _ in pbar:
-        sample, label = create_subsample(x, y, sample_size, celltypes,
-                                         available_celltypes)
+        sample, label = create_subsample(
+            x, y, sample_size, celltypes, available_celltypes
+        )
         X.append(sample)
         Y.append(label)
 
@@ -109,12 +104,9 @@ def create_subsample_dataset(x, y, sample_size, celltypes, no_samples):
     pbar = tqdm(range(n_sparse))
     pbar.set_description(desc="Sparse samples")
     for _ in pbar:
-        sample, label = create_subsample(x,
-                                         y,
-                                         sample_size,
-                                         celltypes,
-                                         available_celltypes,
-                                         sparse=True)
+        sample, label = create_subsample(
+            x, y, sample_size, celltypes, available_celltypes, sparse=True
+        )
         X.append(sample)
         Y.append(label)
     X = pd.concat(X, axis=1).T
@@ -176,7 +168,7 @@ def load_celltypes(path, name):
     try:
         y = pd.read_table(path)
         # Check if has Celltype column
-        if not 'Celltype' in y.columns:
+        if not "Celltype" in y.columns:
             logger.error(
                 f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
             )
@@ -206,7 +198,7 @@ def load_dataset(name, dir, pattern):
     y = pd.read_table(os.path.join(dir, name + "_celltypes.txt"))
     # Check if has Celltype column
     print(y.columns)
-    if not 'Celltype' in y.columns:
+    if not "Celltype" in y.columns:
         logger.error(
             f"No 'Celltype' column found in {name}_celltypes.txt! Please make sure to include this column."
         )
@@ -244,9 +236,7 @@ def merge_unkown_celltypes(y, unknown_celltypes):
     :return:
     """
     celltypes = list(y["Celltype"])
-    new_celltypes = [
-        "Unknown" if x in unknown_celltypes else x for x in celltypes
-    ]
+    new_celltypes = ["Unknown" if x in unknown_celltypes else x for x in celltypes]
     y["Celltype"] = new_celltypes
     return y
 
@@ -316,8 +306,9 @@ def generate_signature(x, y):
     return signature_matrix
 
 
-def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
-                  unknown_celltypes):
+def simulate_bulk(
+    sample_size, num_samples, data_path, out_dir, pattern, unknown_celltypes
+):
     """
     Simulate artificial bulk samples from single cell datasets
     :param sample_size: number of cells per sample
@@ -329,19 +320,18 @@ def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
     """
 
     num_samples = int(
-        num_samples /
-        2)  # divide by two so half is sparse and half is normal samples
+        num_samples / 2
+    )  # divide by two so half is sparse and half is normal samples
 
     # List available datasets
     if not data_path.endswith("/"):
         data_path += "/"
     files = glob.glob(os.path.join(data_path, pattern))
     files = [os.path.basename(x) for x in files]
-    datasets = [x.split("_")[0] for x in files]
+    datasets = [x.replace(pattern.replace("*", ""), "") for x in files]
 
     if len(datasets) == 0:
-        logging.error(
-            "No datasets found! Have you specified the pattern correctly?")
+        logging.error("No datasets found! Have you specified the pattern correctly?")
         sys.exit(1)
 
     print("Datasets: " + str(datasets))
@@ -371,14 +361,11 @@ def simulate_bulk(sample_size, num_samples, data_path, out_dir, pattern,
     # Create datasets
     for i in range(len(xs)):
         print("Subsampling " + datasets[i] + "...")
-        tmpx, tmpy = create_subsample_dataset(xs[i], ys[i], sample_size,
-                                              celltypes, num_samples)
-        tmpx.to_csv(out_dir + datasets[i] + "_samples.txt",
-                    sep="\t",
-                    index=False)
-        tmpy.to_csv(out_dir + datasets[i] + "_labels.txt",
-                    sep="\t",
-                    index=False)
+        tmpx, tmpy = create_subsample_dataset(
+            xs[i], ys[i], sample_size, celltypes, num_samples
+        )
+        tmpx.to_csv(out_dir + datasets[i] + "_samples.txt", sep="\t", index=False)
+        tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False)
         gc.collect()
 
     print("Finished!")
12 changes: 6 additions & 6 deletions scaden/preprocessing/create_h5ad_file.py
@@ -71,7 +71,7 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):
     # List available datasets
     files = glob.glob(data_dir + pattern)
     files = [os.path.basename(x) for x in files]
-    datasets = [x.split("_")[0] for x in files]
+    datasets = [x.replace(pattern.replace("*", ""), "") for x in files]
 
     # get celltypes
     celltypes = load_celltypes(data_dir)
@@ -95,15 +95,15 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"):
 
     x = x.sort_index(axis=1)
     ratios = pd.DataFrame(y, columns=celltypes)
-    ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]),
-                             index=ratios.index)
+    ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index)
 
     print("Processing " + str(train_file))
     x = pd.DataFrame(x)
     adata.append(
-        anndata.AnnData(X=x.to_numpy(),
-                        obs=ratios,
-                        var=pd.DataFrame(columns=[], index=list(x))))
+        anndata.AnnData(
+            X=x.to_numpy(), obs=ratios, var=pd.DataFrame(columns=[], index=list(x))
+        )
+    )
 
     for i in range(1, len(adata)):
         print("Concatenating " + str(i))
57 changes: 34 additions & 23 deletions setup.py
@@ -2,32 +2,43 @@
 
 from setuptools import setup, find_packages
 
-version = '1.0.0'
+version = "1.0.1"
 
 with open("README.md", "r", encoding="UTF-8") as fh:
     long_description = fh.read()
 
-with open('LICENSE', encoding="UTF-8") as f:
+with open("LICENSE", encoding="UTF-8") as f:
     license = f.read()
 
-setup(name='scaden',
-      version=version,
-      description="Cell type deconvolution using single cell data",
-      long_description=long_description,
-      long_description_content_type="text/markdown",
-      keywords=[
-          'bioinformatics', 'deep learning', 'machine learning',
-          'single cell sequencing', 'deconvolution'
-      ],
-      author='Kevin Menden',
-      author_email='kevin.menden@t-online.de',
-      url='https://github.com/KevinMenden/scaden',
-      license="MIT License",
-      entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
-      packages=find_packages(),
-      include_package_data=True,
-      python_requires='>3.6.0',
-      install_requires=[
-          'pandas', 'numpy', 'scikit-learn', 'tensorflow>=2.0', 'anndata',
-          'tqdm', 'click', 'h5py~=2.10.0'
-      ])
+setup(
+    name="scaden",
+    version=version,
+    description="Cell type deconvolution using single cell data",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    keywords=[
+        "bioinformatics",
+        "deep learning",
+        "machine learning",
+        "single cell sequencing",
+        "deconvolution",
+    ],
+    author="Kevin Menden",
+    author_email="kevin.menden@t-online.de",
+    url="https://github.com/KevinMenden/scaden",
+    license="MIT License",
+    entry_points={"console_scripts": ["scaden=scaden.__main__:main"]},
+    packages=find_packages(),
+    include_package_data=True,
+    python_requires=">3.6.0",
+    install_requires=[
+        "pandas",
+        "numpy",
+        "scikit-learn",
+        "tensorflow>=2.0",
+        "anndata",
+        "tqdm",
+        "click",
+        "h5py~=2.10.0",
+    ],
+)
