Commit 414ecda

Merge branch 'ui-v2' of github.com:IGS/gEAR into ui-v2

jorvis committed Sep 19, 2024
2 parents 08090b9 + bbfcddc
Showing 30 changed files with 367 additions and 240 deletions.
7 changes: 5 additions & 2 deletions bin/profile_single_projectr_tsne_run.py
@@ -835,8 +835,8 @@ def run_tsne(dataset_id):
# Rename to end the confusion
adata.var = adata.var.rename(columns={adata.var.columns[0]: "ensembl_id"})
# Modify the AnnData object to not include any duplicated gene symbols (keep only first entry)
scanpy_copy = ana.dataset_path().replace('.h5ad', '.scanpy_dups_removed.h5ad')
if len(df.columns) > 1:
scanpy_copy = ana.dataset_path().replace('.h5ad', '.scanpy_dups_removed.h5ad')
if os.path.exists(scanpy_copy):
os.remove(scanpy_copy)
adata = adata[:, adata.var.index.duplicated() == False].copy(filename=scanpy_copy)
@@ -845,7 +845,7 @@ def run_tsne(dataset_id):
try:
basis = PLOT_TYPE_TO_BASIS[plot_type]
except:
raise("{} was not a valid plot type".format(plot_type))
raise Exception("{} was not a valid plot type".format(plot_type))

# NOTE: This may change in the future if users want plots by group w/o the colorize_by plot added
if plot_by_group:
@@ -993,6 +993,9 @@ def run_tsne(dataset_id):
plt.clf()
plt.close() # Prevent zombie plots, which can cause issues

if os.path.exists(scanpy_copy):
os.remove(scanpy_copy)

return {
"success": success,
"message": message,
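
The changes above do two things: the scanpy_copy path assignment moves out of the if block so the new cleanup at the end of run_tsne can always reference it, and the invalid raise("...") (raising a plain string fails with "TypeError: exceptions must derive from BaseException") becomes raise Exception(...). A minimal sketch of the define-early / clean-up-always pattern, using a hypothetical helper name and file path rather than the actual function:

import os
import anndata as ad

def load_without_duplicate_genes(h5ad_path):
    # Always compute the temp-copy path, even if no copy ends up being made,
    # so the caller can clean it up unconditionally.
    scanpy_copy = h5ad_path.replace(".h5ad", ".scanpy_dups_removed.h5ad")
    adata = ad.read_h5ad(h5ad_path)
    if adata.var.index.duplicated(keep="first").any():
        if os.path.exists(scanpy_copy):
            os.remove(scanpy_copy)
        # Keep only the first entry per duplicated gene symbol and write the
        # filtered data to a backed copy on disk.
        adata = adata[:, ~adata.var.index.duplicated(keep="first")].copy(filename=scanpy_copy)
    return adata, scanpy_copy

# adata, scanpy_copy = load_without_duplicate_genes("dataset.h5ad")
# ... plotting ...
# if os.path.exists(scanpy_copy):
#     os.remove(scanpy_copy)
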
3 changes: 1 addition & 2 deletions docker/Dockerfile
@@ -57,8 +57,7 @@ RUN apt-get -qq update \
# Required for R
gfortran \
# Required for rpy2
r-base-dev \
r-base \
r-cran-rjava \
# Required for R-package devtools (which is required for SJD)
libharfbuzz-dev \
libfribidi-dev \
1 change: 1 addition & 0 deletions docker/install_bioc.R
@@ -1,5 +1,6 @@
#!/usr/bin/env Rscript --vanilla

install.packages(c("BiocManager", "devtools"), dependencies=TRUE, repos="http://lib.stat.cmu.edu/R/CRAN/")
BiocManager::install(version = "3.19") # required for R 4.4.0
BiocManager::install(c("genesofeve/projectR", "biomaRt"), ask=FALSE)
library(devtools); install_github("CHuanSite/SJD")
2 changes: 1 addition & 1 deletion docker/install_bioc.sh
@@ -5,7 +5,7 @@ Rver="${Rmaj}.4.0"

current_dir=$(pwd)

curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt
curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt || exit 1
cd /opt/${Rver}
/opt/${Rver}/configure --with-readline=no --enable-R-shlib --enable-BLAS-shlib --with-x=no || exit 1
make || exit 1
5 changes: 3 additions & 2 deletions docker/requirements.txt
@@ -20,7 +20,7 @@ more_itertools==9.0.0
mysql-connector-python==8.0.20
numba==0.58.1
numexpr==2.8.4
numpy==1.26.0
numpy==1.26.4
opencv-python==4.5.5.64
openpyxl==3.1.5
pandas==2.2.1
@@ -29,11 +29,12 @@ pika==1.3.1
plotly==5.6.0
python-dotenv==0.20.0
requests==2.31.0
rpy2==3.5.1 # 3.5.2 and up gives errors with rpy2py and py2rpy
rpy2==3.5.16
sanic
scanpy==1.10.1
scikit-learn==1.0.2
scipy==1.11.04
seaborn==0.13.2
shadows==0.1a0
tables==3.9.2 # Read hdf5 files into pandas
xlrd==1.2.0
3 changes: 2 additions & 1 deletion docs/setup.python.md
@@ -60,7 +60,7 @@ Check the requirement.txt file in <git_repo_root>/docker for the latest packages
mysql-connector-python==8.0.20 \
numba==0.58.1 \
numexpr==2.8.4 \
numpy==1.26.0 \
numpy==1.26.4 \
opencv-python==4.5.5.64 \
openpyxl==3.1.5 \
pandas==2.2.1 \
@@ -75,6 +75,7 @@ Check the requirement.txt file in <git_repo_root>/docker for the latest packages
scikit-learn==1.0.2 \
scipy==1.11.04 \
seaborn==0.13.2 \
shadows==0.1a0 \
tables==3.9.2 \
xlrd==1.2.0
$ sudo mkdir /opt/bin
4 changes: 2 additions & 2 deletions lib/gear/plotting.py
@@ -335,7 +335,7 @@ def _update_by_plot_type(fig, plot_type, force_overlay=False, use_jitter=False):

def generate_plot(df, x=None, y=None, z=None, facet_row=None, facet_col=None,
color_name=None, colormap=None, palette=None,
reverse_palette=False, category_orders=None,
reverse_palette=False, category_orders={},
plot_type='scatter', hide_x_labels=False, hide_y_labels=False,
hide_legend=None, text_name=None, jitter=False,
x_range=None, y_range=None, vlines=[], x_title=None, y_title=None,
@@ -374,7 +374,7 @@ def generate_plot(df, x=None, y=None, z=None, facet_row=None, facet_col=None,
, "facet_row":facet_row
, "facet_col":facet_col
, "color":color_name
, "category_orders": category_orders if category_orders else {}
, "category_orders": category_orders
, "labels":labels_dict
, "hover_name": text_name if text_name else "y_rounded"
}
14 changes: 10 additions & 4 deletions lib/geardb.py
@@ -613,7 +613,10 @@ def __init__(self, id=None, dataset_id=None, user_id=None, session_id=None, labe

def __repr__(self):
pipeline_file = self.settings_path()
return open(pipeline_file).read()
json_data = json.loads(open(pipeline_file).read())
# change "user_session_id" to "session_id" for consistency
json_data['session_id'] = json_data.pop('user_session_id')
return json.dumps(json_data, indent=4)

def _serialize_json(self):
# Called when json modules attempts to serialize
@@ -1979,10 +1982,13 @@ def get_shape(self, session_id=None, tuple_only=False):
## File is under datasets/${id}.h5ad
h5ad_file_path = self.get_file_path(session_id=session_id)

import scanpy as sc
sc.settings.verbosity = 0
adata = sc.read_h5ad(h5ad_file_path)
from shadows import AnnDataShadow
adata = AnnDataShadow(h5ad_file_path)

(n_obs, n_vars) = adata.shape

adata.close()

if tuple_only:
return (n_obs, n_vars)

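In get_shape, the full scanpy read_h5ad load is replaced with an AnnDataShadow from the shadows package (pinned above as shadows==0.1a0), which exposes the dataset's dimensions without pulling the whole matrix into memory. A minimal usage sketch, assuming the package behaves as it is used in this hunk:

from shadows import AnnDataShadow

def get_h5ad_shape(h5ad_file_path):
    # Open a lightweight shadow of the AnnData file; .shape is available
    # without loading X (assumption based on how it is used above).
    adata = AnnDataShadow(h5ad_file_path)
    try:
        n_obs, n_vars = adata.shape
    finally:
        adata.close()  # release the underlying HDF5 file handle
    return n_obs, n_vars
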
3 changes: 1 addition & 2 deletions services/projectr/Dockerfile
@@ -11,8 +11,7 @@ RUN apt-get -qq update \
# Required for R
gfortran \
# Required for rpy2
r-base-dev \
r-base \
r-cran-rjava \
# Required for R-package devtools (which is required for SJD)
libharfbuzz-dev \
libfribidi-dev \
1 change: 1 addition & 0 deletions services/projectr/install_bioc.R
@@ -1,5 +1,6 @@
#!/usr/bin/env Rscript --vanilla

install.packages(c("BiocManager", "devtools"), dependencies=TRUE, repos="http://lib.stat.cmu.edu/R/CRAN/")
BiocManager::install(version = "3.19") # required for R 4.4.0
BiocManager::install(c("genesofeve/projectR", "biomaRt"), ask=FALSE)
library(devtools); install_github("CHuanSite/SJD")
2 changes: 1 addition & 1 deletion services/projectr/install_bioc.sh
@@ -5,7 +5,7 @@ Rver="${Rmaj}.4.0"

current_dir=$(pwd)

curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt
curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt || exit 1
cd /opt/${Rver}
/opt/${Rver}/configure --with-readline=no --enable-R-shlib --enable-BLAS-shlib --with-x=no || exit 1
make || exit 1
6 changes: 2 additions & 4 deletions services/projectr/requirements.txt
@@ -1,8 +1,6 @@
Flask==3.0.0
gunicorn==20.1.0
rpy2==3.5.1 # 3.5.2 and up gives errors with rpy2py and py2rpy
#rpy2==3.5.16
#pandas==2.2.1
rpy2==3.5.16
pandas==2.2.1
numpy==1.26.4 # https://stackoverflow.com/a/78641304
pandas==1.4.1
google-cloud-logging
99 changes: 53 additions & 46 deletions services/projectr/rfuncs.py
@@ -16,6 +16,10 @@
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.vectors import StrVector

# If running locally, need to ensure that multiple concurrent R calls do not conflict
from rpy2.rinterface_lib import openrlib


class RError(Exception):
"""Error based on issues that would manifest in any particular R-language call."""
def __init__(self, message="") -> None:
@@ -45,51 +49,54 @@ def run_projectR_cmd(target_df, loading_df, algorithm):
Return Pandas dataframe of the projectR output
"""

# Convert from pandas dataframe to R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
target_r_df = ro.conversion.py2rpy(target_df)
loading_r_df = ro.conversion.py2rpy(loading_df)

# data.frame to matrix (projectR has no data.frame signature)
target_r_matrix = convert_r_df_to_r_matrix(target_r_df)
loading_r_matrix = convert_r_df_to_r_matrix(loading_r_df)

# Assign Rownames to each matrix
# I don't know why but using ro.StrVector makes rpy2py fail where the output df is an incompatible class
# Guessing that there are some non-strings mixed into the indexes
target_r_matrix.rownames = StrVector(target_df.index)
loading_r_matrix.rownames = StrVector(loading_df.index)

# The NMF projectR method signature is based on the LinearEmbeddedMatrix class,
# Which has a featureLoadings property. That matrix is loaded and the default
# projectR signature is returned and used. So we can just pass the matrix as-is.
# https://rdrr.io/bioc/SingleCellExperiment/man/LinearEmbeddingMatrix.html

# Run project R command. Get projectionPatterns matrix
try:
if algorithm == "nmf":
projectR = importr('projectR')
projection_patterns_r_matrix = projectR.projectR(data=target_r_matrix, loadings=loading_r_matrix, full=False)
elif algorithm == "fixednmf":
sjd = importr('SJD')
loading_list = ro.ListVector({"genesig": loading_r_matrix})

projection = sjd.projectNMF(proj_dataset=target_r_matrix, proj_group=True, list_component=loading_list)
projection_patterns_r_matrix = projection.rx2("proj_score_list").rx2("genesig")
else:
raise ValueError("Algorithm {} is not supported".format(algorithm))
except Exception as e:
# print stacktrace with line numbers
traceback.print_exc(file=sys.stderr)
raise RError("Error: Could not run projectR command.\tReason: {}".format(str(e)))

# matrix back to data.frame
projection_patterns_r_df = convert_r_matrix_to_r_df(projection_patterns_r_matrix)

# Convert from R data.frame to pandas dataframe
with localconverter(ro.default_converter + pandas2ri.converter):
projection_patterns_df = ro.conversion.rpy2py(projection_patterns_r_df)

return projection_patterns_df
# Ensure multithreading if running locally -> https://rpy2.github.io/doc/v3.5.x/html/rinterface.html#multithreading
with openrlib.rlock:

# Convert from pandas dataframe to R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
target_r_df = ro.conversion.py2rpy(target_df)
loading_r_df = ro.conversion.py2rpy(loading_df)

# data.frame to matrix (projectR has no data.frame signature)
target_r_matrix = convert_r_df_to_r_matrix(target_r_df)
loading_r_matrix = convert_r_df_to_r_matrix(loading_r_df)

# Assign Rownames to each matrix
# I don't know why but using ro.StrVector makes rpy2py fail where the output df is an incompatible class
# Guessing that there are some non-strings mixed into the indexes
target_r_matrix.rownames = StrVector(target_df.index)
loading_r_matrix.rownames = StrVector(loading_df.index)

# The NMF projectR method signature is based on the LinearEmbeddedMatrix class,
# Which has a featureLoadings property. That matrix is loaded and the default
# projectR signature is returned and used. So we can just pass the matrix as-is.
# https://rdrr.io/bioc/SingleCellExperiment/man/LinearEmbeddingMatrix.html

# Run project R command. Get projectionPatterns matrix
try:
if algorithm == "nmf":
projectR = importr('projectR')
projection_patterns_r_matrix = projectR.projectR(data=target_r_matrix, loadings=loading_r_matrix, full=False)
elif algorithm == "fixednmf":
sjd = importr('SJD')
loading_list = ro.ListVector({"genesig": loading_r_matrix})

projection = sjd.projectNMF(proj_dataset=target_r_matrix, proj_group=True, list_component=loading_list)
projection_patterns_r_matrix = projection.rx2("proj_score_list").rx2("genesig")
else:
raise ValueError("Algorithm {} is not supported".format(algorithm))
except Exception as e:
# print stacktrace with line numbers
traceback.print_exc(file=sys.stderr)
raise RError("Error: Could not run projectR command.\tReason: {}".format(str(e)))

# matrix back to data.frame
projection_patterns_r_df = convert_r_matrix_to_r_df(projection_patterns_r_matrix)

# Convert from R data.frame to pandas dataframe
with localconverter(ro.default_converter + pandas2ri.converter):
projection_patterns_df = ro.conversion.rpy2py(projection_patterns_r_df)

return projection_patterns_df
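
The entire body of run_projectR_cmd is now wrapped in openrlib.rlock. The embedded R interpreter is not thread-safe, so when this module runs locally (as the added comment notes) concurrent calls from multiple worker threads have to be serialized, per the rpy2 multithreading docs linked above. A stripped-down sketch of the locking pattern with a hypothetical R call:

import rpy2.robjects as ro
from rpy2.rinterface_lib import openrlib

def call_r_sum(values):
    # Serialize all access to the shared embedded R interpreter.
    with openrlib.rlock:
        r_sum = ro.r["sum"]                      # look up an R function
        return r_sum(ro.FloatVector(values))[0]  # convert, call, unwrap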


1 change: 1 addition & 0 deletions www/api/resources/aggregations.py
@@ -50,6 +50,7 @@ def post(self, dataset_id):

ana = geardb.Analysis(id=analysis_id, dataset_id=dataset_id, session_id=session_id, user_id=user.id)
ana.discover_type()
adata = ana.get_adata()

else:
# Dataset is primary type
12 changes: 12 additions & 0 deletions www/api/resources/plotly_data.py
@@ -232,6 +232,10 @@ def post(self, dataset_id):
df = selected.to_df()
df = pd.concat([df,selected.obs], axis=1)

# fill any missing adata.obs values with "NA"
# The below line gives the error - TypeError: Cannot setitem on a Categorical with a new category (NA), set the categories first
#df = df.fillna("NA")

# Valid analysis column names from api/resources/h5ad.py
analysis_tsne_columns = ['X_tsne_1', 'X_tsne_2']
analysis_umap_columns = ['X_umap_1', 'X_umap_2']
@@ -268,6 +272,14 @@ def post(self, dataset_id):
message = "WARNING: Color map has values not in the dataframe column '{}': {}\n".format(color_name, diff)
message += "Will set color map key values to the unique values in the dataframe column."
print(message, file=sys.stderr)
# If any element in diff is nan and color_map contains a valid missing value key like "NA", change the value in the dataframe to match the color_map key
for key in list(diff):
if pd.isna(key) and "NA" in color_map.keys():
df[color_name] = df[color_name].replace({key: "NA"})
col_values.remove(key) # Remove the nan value from the set
col_values = col_values.union({"NA"})
break

# Sort both the colormap and dataframe column alphabetically
sorted_column_values = sorted(col_values)
updated_color_map = {}
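
The commented-out df.fillna("NA") fails because the obs columns coming out of AnnData are pandas Categorical and "NA" is not among their categories; the added loop instead remaps NaN values to "NA" only when the user-supplied color map actually contains an "NA" key. A minimal sketch of the underlying pandas behavior, plus the add_categories workaround the error message alludes to (shown for illustration, not as what this endpoint does):

import pandas as pd

s = pd.Series(["liver", "heart", None], dtype="category")

# s.fillna("NA")
# TypeError: Cannot setitem on a Categorical with a new category (NA), set the categories first

s = s.cat.add_categories(["NA"]).fillna("NA")
print(s.tolist())  # ['liver', 'heart', 'NA']
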
5 changes: 4 additions & 1 deletion www/api/resources/projectr.py
@@ -321,8 +321,8 @@ def projectr_callback(dataset_id, genecart_id, projection_id, session_id, scope,
# If dataset genes have duplicated index names, we need to rename them to avoid errors
# in collecting rownames in projectR (which gives invalid output)
# This means these duplicated genes will not be in the intersection of the dataset and pattern genes
dedup_copy = Path(ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad'))
if (adata.var.index.duplicated(keep="first") == True).any():
dedup_copy = Path(ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad'))
if dedup_copy.exists():
dedup_copy.unlink()
adata = adata[:, adata.var.index.duplicated(keep="first") == False].copy(filename=dedup_copy)
@@ -497,6 +497,9 @@ def projectr_callback(dataset_id, genecart_id, projection_id, session_id, scope,
, "num_dataset_genes": num_target_genes
}

adata.close()
if dedup_copy.exists():
dedup_copy.unlink()

# Have had cases where the column names are x1, x2, x3, etc. so load in the original pattern names
projection_patterns_df = projection_patterns_df.set_axis(loading_df.columns, axis="columns")
12 changes: 4 additions & 8 deletions www/api/resources/tsne_data.py
@@ -306,13 +306,6 @@ def post(self, dataset_id):
if flip_y:
adata.obsm[key][:,1] = -1 * adata.obsm[key][:,1]

# We also need to change the adata's Raw var dataframe
# We can't explicitly reset its index so we reinitialize it with
# the newer adata object.
# https://github.com/theislab/anndata/blob/master/anndata/base.py#L1020-L1022
if adata.raw is not None:
adata.raw = adata

# Reorder the categorical values in the observation dataframe
# Currently in UI only "plot_by_group" has reordering capabilities
if order:
@@ -363,11 +356,11 @@ def post(self, dataset_id):
# Rename to end the confusion
selected.var = selected.var.rename(columns={selected.var.columns[0]: "ensembl_id"})
# Modify the AnnData object to not include any duplicated gene symbols (keep only first entry)
dedup_copy = ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad')
if (selected.var.index.duplicated(keep="first") == True).any():
success = 2
message = "WARNING: Multiple Ensemble IDs found for gene symbol '{}'. Using the first stored Ensembl ID.".format(selected_gene)

dedup_copy = ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad')
if os.path.exists(dedup_copy):
os.remove(dedup_copy)
selected = selected[:, selected.var.index.duplicated() == False].copy(filename=dedup_copy)
Expand Down Expand Up @@ -564,6 +557,9 @@ def post(self, dataset_id):
if selected.isbacked:
selected.file.close()

if os.path.exists(dedup_copy):
os.remove(dedup_copy)

with io.BytesIO() as io_pic:
# Set the saved figure dpi based on the number of observations in the dataset after filtering
if high_dpi:
(Diffs for the remaining changed files did not load and are omitted.)
