Commit 414ecda

Merge branch 'ui-v2' of github.com:IGS/gEAR into ui-v2

jorvis committed Sep 19, 2024
2 parents 08090b9 + bbfcddc
Showing 30 changed files with 367 additions and 240 deletions.
7 changes: 5 additions & 2 deletions bin/profile_single_projectr_tsne_run.py
@@ -835,8 +835,8 @@ def run_tsne(dataset_id):
# Rename to end the confusion
adata.var = adata.var.rename(columns={adata.var.columns[0]: "ensembl_id"})
# Modify the AnnData object to not include any duplicated gene symbols (keep only first entry)
scanpy_copy = ana.dataset_path().replace('.h5ad', '.scanpy_dups_removed.h5ad')
if len(df.columns) > 1:
scanpy_copy = ana.dataset_path().replace('.h5ad', '.scanpy_dups_removed.h5ad')
if os.path.exists(scanpy_copy):
os.remove(scanpy_copy)
adata = adata[:, adata.var.index.duplicated() == False].copy(filename=scanpy_copy)
@@ -845,7 +845,7 @@ def run_tsne(dataset_id):
try:
basis = PLOT_TYPE_TO_BASIS[plot_type]
except:
raise("{} was not a valid plot type".format(plot_type))
raise Exception("{} was not a valid plot type".format(plot_type))

# NOTE: This may change in the future if users want plots by group w/o the colorize_by plot added
if plot_by_group:
@@ -993,6 +993,9 @@ def run_tsne(dataset_id):
plt.clf()
plt.close() # Prevent zombie plots, which can cause issues

if os.path.exists(scanpy_copy):
os.remove(scanpy_copy)

return {
"success": success,
"message": message,
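
The changes above do two things: the scanpy_copy path assignment moves out of the if block so the new cleanup at the end of run_tsne can always reference it, and the invalid raise("...") (raising a plain string fails with "TypeError: exceptions must derive from BaseException") becomes raise Exception(...). A minimal sketch of the define-early / clean-up-always pattern, using a hypothetical helper name and file path rather than the actual function:

import os
import anndata as ad

def load_without_duplicate_genes(h5ad_path):
    # Always compute the temp-copy path, even if no copy ends up being made,
    # so the caller can clean it up unconditionally.
    scanpy_copy = h5ad_path.replace(".h5ad", ".scanpy_dups_removed.h5ad")
    adata = ad.read_h5ad(h5ad_path)
    if adata.var.index.duplicated(keep="first").any():
        if os.path.exists(scanpy_copy):
            os.remove(scanpy_copy)
        # Keep only the first entry per duplicated gene symbol and write the
        # filtered data to a backed copy on disk.
        adata = adata[:, ~adata.var.index.duplicated(keep="first")].copy(filename=scanpy_copy)
    return adata, scanpy_copy

# adata, scanpy_copy = load_without_duplicate_genes("dataset.h5ad")
# ... plotting ...
# if os.path.exists(scanpy_copy):
#     os.remove(scanpy_copy)
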
3 changes: 1 addition & 2 deletions docker/Dockerfile
@@ -57,8 +57,7 @@ RUN apt-get -qq update \
# Required for R
gfortran \
# Required for rpy2
r-base-dev \
r-base \
r-cran-rjava \
# Required for R-package devtools (which is required for SJD)
libharfbuzz-dev \
libfribidi-dev \
1 change: 1 addition & 0 deletions docker/install_bioc.R
@@ -1,5 +1,6 @@
#!/usr/bin/env Rscript --vanilla

install.packages(c("BiocManager", "devtools"), dependencies=TRUE, repos="http://lib.stat.cmu.edu/R/CRAN/")
BiocManager::install(version = "3.19") # required for R 4.4.0
BiocManager::install(c("genesofeve/projectR", "biomaRt"), ask=FALSE)
library(devtools); install_github("CHuanSite/SJD")
2 changes: 1 addition & 1 deletion docker/install_bioc.sh
@@ -5,7 +5,7 @@ Rver="${Rmaj}.4.0"

current_dir=$(pwd)

curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt
curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt || exit 1
cd /opt/${Rver}
/opt/${Rver}/configure --with-readline=no --enable-R-shlib --enable-BLAS-shlib --with-x=no || exit 1
make || exit 1
5 changes: 3 additions & 2 deletions docker/requirements.txt
@@ -20,7 +20,7 @@ more_itertools==9.0.0
mysql-connector-python==8.0.20
numba==0.58.1
numexpr==2.8.4
numpy==1.26.0
numpy==1.26.4
opencv-python==4.5.5.64
openpyxl==3.1.5
pandas==2.2.1
@@ -29,11 +29,12 @@ pika==1.3.1
plotly==5.6.0
python-dotenv==0.20.0
requests==2.31.0
rpy2==3.5.1 # 3.5.2 and up gives errors with rpy2py and py2rpy
rpy2==3.5.16
sanic
scanpy==1.10.1
scikit-learn==1.0.2
scipy==1.11.04
seaborn==0.13.2
shadows==0.1a0
tables==3.9.2 # Read hdf5 files into pandas
xlrd==1.2.0
3 changes: 2 additions & 1 deletion docs/setup.python.md
@@ -60,7 +60,7 @@ Check the requirement.txt file in <git_repo_root>/docker for the latest packages
mysql-connector-python==8.0.20 \
numba==0.58.1 \
numexpr==2.8.4 \
numpy==1.26.0 \
numpy==1.26.4 \
opencv-python==4.5.5.64 \
openpyxl==3.1.5 \
pandas==2.2.1 \
@@ -75,6 +75,7 @@ Check the requirement.txt file in <git_repo_root>/docker for the latest packages
scikit-learn==1.0.2 \
scipy==1.11.04 \
seaborn==0.13.2 \
shadows==0.1a0 \
tables==3.9.2 \
xlrd==1.2.0
$ sudo mkdir /opt/bin
4 changes: 2 additions & 2 deletions lib/gear/plotting.py
@@ -335,7 +335,7 @@ def _update_by_plot_type(fig, plot_type, force_overlay=False, use_jitter=False):

def generate_plot(df, x=None, y=None, z=None, facet_row=None, facet_col=None,
color_name=None, colormap=None, palette=None,
reverse_palette=False, category_orders=None,
reverse_palette=False, category_orders={},
plot_type='scatter', hide_x_labels=False, hide_y_labels=False,
hide_legend=None, text_name=None, jitter=False,
x_range=None, y_range=None, vlines=[], x_title=None, y_title=None,
@@ -374,7 +374,7 @@ def generate_plot(df, x=None, y=None, z=None, facet_row=None, facet_col=None,
, "facet_row":facet_row
, "facet_col":facet_col
, "color":color_name
, "category_orders": category_orders if category_orders else {}
, "category_orders": category_orders
, "labels":labels_dict
, "hover_name": text_name if text_name else "y_rounded"
}
14 changes: 10 additions & 4 deletions lib/geardb.py
@@ -613,7 +613,10 @@ def __init__(self, id=None, dataset_id=None, user_id=None, session_id=None, labe

def __repr__(self):
pipeline_file = self.settings_path()
return open(pipeline_file).read()
json_data = json.loads(open(pipeline_file).read())
# change "user_session_id" to "session_id" for consistency
json_data['session_id'] = json_data.pop('user_session_id')
return json.dumps(json_data, indent=4)

def _serialize_json(self):
# Called when json modules attempts to serialize
@@ -1979,10 +1982,13 @@ def get_shape(self, session_id=None, tuple_only=False):
## File is under datasets/${id}.h5ad
h5ad_file_path = self.get_file_path(session_id=session_id)

import scanpy as sc
sc.settings.verbosity = 0
adata = sc.read_h5ad(h5ad_file_path)
from shadows import AnnDataShadow
adata = AnnDataShadow(h5ad_file_path)

(n_obs, n_vars) = adata.shape

adata.close()

if tuple_only:
return (n_obs, n_vars)

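In get_shape, the full scanpy read_h5ad load is replaced with an AnnDataShadow from the shadows package (pinned above as shadows==0.1a0), which exposes the dataset's dimensions without pulling the whole matrix into memory. A minimal usage sketch, assuming the package behaves as it is used in this hunk:

from shadows import AnnDataShadow

def get_h5ad_shape(h5ad_file_path):
    # Open a lightweight shadow of the AnnData file; .shape is available
    # without loading X (assumption based on how it is used above).
    adata = AnnDataShadow(h5ad_file_path)
    try:
        n_obs, n_vars = adata.shape
    finally:
        adata.close()  # release the underlying HDF5 file handle
    return n_obs, n_vars
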
3 changes: 1 addition & 2 deletions services/projectr/Dockerfile
@@ -11,8 +11,7 @@ RUN apt-get -qq update \
# Required for R
gfortran \
# Required for rpy2
r-base-dev \
r-base \
r-cran-rjava \
# Required for R-package devtools (which is required for SJD)
libharfbuzz-dev \
libfribidi-dev \
1 change: 1 addition & 0 deletions services/projectr/install_bioc.R
@@ -1,5 +1,6 @@
#!/usr/bin/env Rscript --vanilla

install.packages(c("BiocManager", "devtools"), dependencies=TRUE, repos="http://lib.stat.cmu.edu/R/CRAN/")
BiocManager::install(version = "3.19") # required for R 4.4.0
BiocManager::install(c("genesofeve/projectR", "biomaRt"), ask=FALSE)
library(devtools); install_github("CHuanSite/SJD")
2 changes: 1 addition & 1 deletion services/projectr/install_bioc.sh
@@ -5,7 +5,7 @@ Rver="${Rmaj}.4.0"

current_dir=$(pwd)

curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt
curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt || exit 1
cd /opt/${Rver}
/opt/${Rver}/configure --with-readline=no --enable-R-shlib --enable-BLAS-shlib --with-x=no || exit 1
make || exit 1
6 changes: 2 additions & 4 deletions services/projectr/requirements.txt
@@ -1,8 +1,6 @@
Flask==3.0.0
gunicorn==20.1.0
rpy2==3.5.1 # 3.5.2 and up gives errors with rpy2py and py2rpy
#rpy2==3.5.16
#pandas==2.2.1
rpy2==3.5.16
pandas==2.2.1
numpy==1.26.4 # https://stackoverflow.com/a/78641304
pandas==1.4.1
google-cloud-logging
99 changes: 53 additions & 46 deletions services/projectr/rfuncs.py
@@ -16,6 +16,10 @@
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.vectors import StrVector

# If running locally, need to ensure that multiple concurrent R calls do not conflict
from rpy2.rinterface_lib import openrlib


class RError(Exception):
"""Error based on issues that would manifest in any particular R-language call."""
def __init__(self, message="") -> None:
@@ -45,51 +49,54 @@ def run_projectR_cmd(target_df, loading_df, algorithm):
Return Pandas dataframe of the projectR output
"""

# Convert from pandas dataframe to R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
target_r_df = ro.conversion.py2rpy(target_df)
loading_r_df = ro.conversion.py2rpy(loading_df)

# data.frame to matrix (projectR has no data.frame signature)
target_r_matrix = convert_r_df_to_r_matrix(target_r_df)
loading_r_matrix = convert_r_df_to_r_matrix(loading_r_df)

# Assign Rownames to each matrix
# I don't know why but using ro.StrVector makes rpy2py fail where the output df is an incompatible class
# Guessing that there are some non-strings mixed into the indexes
target_r_matrix.rownames = StrVector(target_df.index)
loading_r_matrix.rownames = StrVector(loading_df.index)

# The NMF projectR method signature is based on the LinearEmbeddedMatrix class,
# Which has a featureLoadings property. That matrix is loaded and the default
# projectR signature is returned and used. So we can just pass the matrix as-is.
# https://rdrr.io/bioc/SingleCellExperiment/man/LinearEmbeddingMatrix.html

# Run project R command. Get projectionPatterns matrix
try:
if algorithm == "nmf":
projectR = importr('projectR')
projection_patterns_r_matrix = projectR.projectR(data=target_r_matrix, loadings=loading_r_matrix, full=False)
elif algorithm == "fixednmf":
sjd = importr('SJD')
loading_list = ro.ListVector({"genesig": loading_r_matrix})

projection = sjd.projectNMF(proj_dataset=target_r_matrix, proj_group=True, list_component=loading_list)
projection_patterns_r_matrix = projection.rx2("proj_score_list").rx2("genesig")
else:
raise ValueError("Algorithm {} is not supported".format(algorithm))
except Exception as e:
# print stacktrace with line numbers
traceback.print_exc(file=sys.stderr)
raise RError("Error: Could not run projectR command.\tReason: {}".format(str(e)))

# matrix back to data.frame
projection_patterns_r_df = convert_r_matrix_to_r_df(projection_patterns_r_matrix)

# Convert from R data.frame to pandas dataframe
with localconverter(ro.default_converter + pandas2ri.converter):
projection_patterns_df = ro.conversion.rpy2py(projection_patterns_r_df)

return projection_patterns_df
# Ensure multithreading if running locally -> https://rpy2.github.io/doc/v3.5.x/html/rinterface.html#multithreading
with openrlib.rlock:

# Convert from pandas dataframe to R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
target_r_df = ro.conversion.py2rpy(target_df)
loading_r_df = ro.conversion.py2rpy(loading_df)

# data.frame to matrix (projectR has no data.frame signature)
target_r_matrix = convert_r_df_to_r_matrix(target_r_df)
loading_r_matrix = convert_r_df_to_r_matrix(loading_r_df)

# Assign Rownames to each matrix
# I don't know why but using ro.StrVector makes rpy2py fail where the output df is an incompatible class
# Guessing that there are some non-strings mixed into the indexes
target_r_matrix.rownames = StrVector(target_df.index)
loading_r_matrix.rownames = StrVector(loading_df.index)

# The NMF projectR method signature is based on the LinearEmbeddedMatrix class,
# Which has a featureLoadings property. That matrix is loaded and the default
# projectR signature is returned and used. So we can just pass the matrix as-is.
# https://rdrr.io/bioc/SingleCellExperiment/man/LinearEmbeddingMatrix.html

# Run project R command. Get projectionPatterns matrix
try:
if algorithm == "nmf":
projectR = importr('projectR')
projection_patterns_r_matrix = projectR.projectR(data=target_r_matrix, loadings=loading_r_matrix, full=False)
elif algorithm == "fixednmf":
sjd = importr('SJD')
loading_list = ro.ListVector({"genesig": loading_r_matrix})

projection = sjd.projectNMF(proj_dataset=target_r_matrix, proj_group=True, list_component=loading_list)
projection_patterns_r_matrix = projection.rx2("proj_score_list").rx2("genesig")
else:
raise ValueError("Algorithm {} is not supported".format(algorithm))
except Exception as e:
# print stacktrace with line numbers
traceback.print_exc(file=sys.stderr)
raise RError("Error: Could not run projectR command.\tReason: {}".format(str(e)))

# matrix back to data.frame
projection_patterns_r_df = convert_r_matrix_to_r_df(projection_patterns_r_matrix)

# Convert from R data.frame to pandas dataframe
with localconverter(ro.default_converter + pandas2ri.converter):
projection_patterns_df = ro.conversion.rpy2py(projection_patterns_r_df)

return projection_patterns_df
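
The entire body of run_projectR_cmd is now wrapped in openrlib.rlock. The embedded R interpreter is not thread-safe, so when this module runs locally (as the added comment notes) concurrent calls from multiple worker threads have to be serialized, per the rpy2 multithreading docs linked above. A stripped-down sketch of the locking pattern with a hypothetical R call:

import rpy2.robjects as ro
from rpy2.rinterface_lib import openrlib

def call_r_sum(values):
    # Serialize all access to the shared embedded R interpreter.
    with openrlib.rlock:
        r_sum = ro.r["sum"]                      # look up an R function
        return r_sum(ro.FloatVector(values))[0]  # convert, call, unwrap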


1 change: 1 addition & 0 deletions www/api/resources/aggregations.py
@@ -50,6 +50,7 @@ def post(self, dataset_id):

ana = geardb.Analysis(id=analysis_id, dataset_id=dataset_id, session_id=session_id, user_id=user.id)
ana.discover_type()
adata = ana.get_adata()

else:
# Dataset is primary type
12 changes: 12 additions & 0 deletions www/api/resources/plotly_data.py
@@ -232,6 +232,10 @@ def post(self, dataset_id):
df = selected.to_df()
df = pd.concat([df,selected.obs], axis=1)

# fill any missing adata.obs values with "NA"
# The below line gives the error - TypeError: Cannot setitem on a Categorical with a new category (NA), set the categories first
#df = df.fillna("NA")

# Valid analysis column names from api/resources/h5ad.py
analysis_tsne_columns = ['X_tsne_1', 'X_tsne_2']
analysis_umap_columns = ['X_umap_1', 'X_umap_2']
@@ -268,6 +272,14 @@ def post(self, dataset_id):
message = "WARNING: Color map has values not in the dataframe column '{}': {}\n".format(color_name, diff)
message += "Will set color map key values to the unique values in the dataframe column."
print(message, file=sys.stderr)
# If any element in diff is nan and color_map contains a valid missing value key like "NA", change the value in the dataframe to match the color_map key
for key in list(diff):
if pd.isna(key) and "NA" in color_map.keys():
df[color_name] = df[color_name].replace({key: "NA"})
col_values.remove(key) # Remove the nan value from the set
col_values = col_values.union({"NA"})
break

# Sort both the colormap and dataframe column alphabetically
sorted_column_values = sorted(col_values)
updated_color_map = {}
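
The commented-out df.fillna("NA") fails because the obs columns coming out of AnnData are pandas Categorical and "NA" is not among their categories; the added loop instead remaps NaN values to "NA" only when the user-supplied color map actually contains an "NA" key. A minimal sketch of the underlying pandas behavior, plus the add_categories workaround the error message alludes to (shown for illustration, not as what this endpoint does):

import pandas as pd

s = pd.Series(["liver", "heart", None], dtype="category")

# s.fillna("NA")
# TypeError: Cannot setitem on a Categorical with a new category (NA), set the categories first

s = s.cat.add_categories(["NA"]).fillna("NA")
print(s.tolist())  # ['liver', 'heart', 'NA']
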
5 changes: 4 additions & 1 deletion www/api/resources/projectr.py
@@ -321,8 +321,8 @@ def projectr_callback(dataset_id, genecart_id, projection_id, session_id, scope,
# If dataset genes have duplicated index names, we need to rename them to avoid errors
# in collecting rownames in projectR (which gives invalid output)
# This means these duplicated genes will not be in the intersection of the dataset and pattern genes
dedup_copy = Path(ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad'))
if (adata.var.index.duplicated(keep="first") == True).any():
dedup_copy = Path(ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad'))
if dedup_copy.exists():
dedup_copy.unlink()
adata = adata[:, adata.var.index.duplicated(keep="first") == False].copy(filename=dedup_copy)
@@ -497,6 +497,9 @@ def projectr_callback(dataset_id, genecart_id, projection_id, session_id, scope,
, "num_dataset_genes": num_target_genes
}

adata.close()
if dedup_copy.exists():
dedup_copy.unlink()

# Have had cases where the column names are x1, x2, x3, etc. so load in the original pattern names
projection_patterns_df = projection_patterns_df.set_axis(loading_df.columns, axis="columns")
12 changes: 4 additions & 8 deletions www/api/resources/tsne_data.py
@@ -306,13 +306,6 @@ def post(self, dataset_id):
if flip_y:
adata.obsm[key][:,1] = -1 * adata.obsm[key][:,1]

# We also need to change the adata's Raw var dataframe
# We can't explicitly reset its index so we reinitialize it with
# the newer adata object.
# https://github.com/theislab/anndata/blob/master/anndata/base.py#L1020-L1022
if adata.raw is not None:
adata.raw = adata

# Reorder the categorical values in the observation dataframe
# Currently in UI only "plot_by_group" has reordering capabilities
if order:
@@ -363,11 +356,11 @@ def post(self, dataset_id):
# Rename to end the confusion
selected.var = selected.var.rename(columns={selected.var.columns[0]: "ensembl_id"})
# Modify the AnnData object to not include any duplicated gene symbols (keep only first entry)
dedup_copy = ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad')
if (selected.var.index.duplicated(keep="first") == True).any():
success = 2
message = "WARNING: Multiple Ensemble IDs found for gene symbol '{}'. Using the first stored Ensembl ID.".format(selected_gene)

dedup_copy = ana.dataset_path().replace('.h5ad', '.dups_removed.h5ad')
if os.path.exists(dedup_copy):
os.remove(dedup_copy)
selected = selected[:, selected.var.index.duplicated() == False].copy(filename=dedup_copy)
Expand Down Expand Up @@ -564,6 +557,9 @@ def post(self, dataset_id):
if selected.isbacked:
selected.file.close()

if os.path.exists(dedup_copy):
os.remove(dedup_copy)

with io.BytesIO() as io_pic:
# Set the saved figure dpi based on the number of observations in the dataset after filtering
if high_dpi:
(Diffs for the remaining changed files did not load and are omitted.)
