Merge pull request #12 from adaptyvbio/pyproject

Pyproject
adaptyvbio · Feb 17, 2023 · d022b1a · d022b1a
2 parents 49eaa4f + 6d1968d
commit d022b1a
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 19 deletions.
diff --git a/.github/workflows/process-pull-request.yml b/.github/workflows/process-pull-request.yml
@@ -29,6 +29,7 @@ jobs:
           conda install -n proteinflow -c conda-forge -c bioconda mmseqs2
           conda run -n proteinflow python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
           conda run -n proteinflow python -m pip install awscli pytest flake8
+          conda run -n proteinflow python -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"
           conda run -n proteinflow python -m pip install -e .
 
       - name: Configure AWS Credentials

diff --git a/README.md b/README.md
@@ -8,16 +8,18 @@ This is a python library for handling the proteinflow data processing pipeline.
 [Read the documentation.](https://adaptyvbio.github.io/ProteinFlow/)
 
 ## Installation
-Recommended: create a new `conda` environment and install `proteinflow` and `mmseqs`. Note that the python version has to be between 3.8 and 3.10.
+Recommended: create a new `conda` environment and install `proteinflow` and `mmseqs`. Note that the Python version has to be at least 3.8 and lower than 3.10 (i.e. 3.8 or 3.9).
 ```
-git clone https://gitlab.com/adaptyvbio/ml-4/-/tree/library
-cd ml-4
 conda create --name proteinflow -y python=3.9
 conda activate proteinflow
 conda install -y -c conda-forge -c bioconda mmseqs2
-python -m pip install -e .
+python -m pip install proteinflow
 aws configure
 ```
+In addition, `proteinflow` depends on the `rcsbsearch` package, whose latest release is currently failing. Until a fixed release is published, install the patched version directly from GitHub:
+```
+python -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"
+```
 
 ## Usage
 ### Downloading pre-computed datasets

diff --git a/docs/index.html b/docs/index.html
@@ -377,6 +377,13 @@ <h2 id="citation">Citation</h2>
         pickle.dump(test_classes_dict, f)
 
 
+def _raise_rcsbsearch(e):
+    if &#34;404 Client Error&#34; in str(e):
+        raise RuntimeError(
+            &#39;Quering rcsbsearch is failing. Please install a version of rcsbsearch where this error is solved:\npython -m pip install &#34;rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e&#34;&#39;
+        )
+
+
 def _run_processing(
     tmp_folder=&#34;./data/tmp_pdb&#34;,
     output_folder=&#34;./data/pdb&#34;,
@@ -511,11 +518,14 @@ <h2 id="citation">Citation</h2>
     pdb_ids = pdb_ids.exec(&#34;assembly&#34;)
     if n is not None:
         pdbs = []
-        for i, x in enumerate(pdb_ids):
-            pdbs.append(x)
-            if i == n:
-                break
-        pdb_ids = pdbs
+        try:
+            for i, x in enumerate(pdb_ids):
+                pdbs.append(x)
+                if i == n:
+                    break
+            pdb_ids = pdbs
+        except Exception as e:
+            _raise_rcsbsearch(e)
 
     ordered_folders = [
         x.key
@@ -577,7 +587,10 @@ <h2 id="citation">Citation</h2>
                 _log_exception(e, LOG_FILE, pdb_id, TMP_FOLDER)
 
     # process_f(&#34;1a14-1&#34;, show_error=True, force=force)
-    _ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
+    try:
+        _ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
+    except Exception as e:
+        _raise_rcsbsearch(e)
 
     stats = get_error_summary(LOG_FILE, verbose=False)
     not_found_error = &#34;&lt;&lt;&lt; PDB / mmCIF file downloaded but not found&#34;

diff --git a/proteinflow/__init__.py b/proteinflow/__init__.py
@@ -309,6 +309,13 @@ def _get_split_dictionaries(
         pickle.dump(test_classes_dict, f)
 
 
+def _raise_rcsbsearch(e):
+    """Re-raise an exception caught around an `rcsbsearch` query.
+
+    Intended to be called from an `except` block. If the message of `e`
+    contains "404 Client Error", a `RuntimeError` with installation
+    instructions for a working `rcsbsearch` version is raised (chained to
+    `e`). Any other exception is re-raised unchanged, so that unrelated
+    failures are never silently swallowed by the calling `except` block.
+
+    Parameters
+    ----------
+    e : Exception
+        The exception that was caught by the caller
+
+    Raises
+    ------
+    RuntimeError
+        If `str(e)` contains "404 Client Error"
+    Exception
+        The original exception `e` in every other case
+    """
+    if "404 Client Error" in str(e):
+        raise RuntimeError(
+            'Quering rcsbsearch is failing. Please install a version of rcsbsearch where this error is solved:\npython -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"'
+        ) from e
+    # Not the known rcsbsearch failure: propagate the original error instead of returning silently
+    raise e
+
+
 def _run_processing(
     tmp_folder="./data/tmp_pdb",
     output_folder="./data/pdb",
@@ -443,11 +450,14 @@ def _run_processing(
     pdb_ids = pdb_ids.exec("assembly")
     if n is not None:
         pdbs = []
-        for i, x in enumerate(pdb_ids):
-            pdbs.append(x)
-            if i == n:
-                break
-        pdb_ids = pdbs
+        try:
+            for i, x in enumerate(pdb_ids):
+                pdbs.append(x)
+                if i == n:
+                    break
+            pdb_ids = pdbs
+        except Exception as e:
+            _raise_rcsbsearch(e)
 
     ordered_folders = [
         x.key
@@ -509,7 +519,10 @@ def process_f(pdb_id, show_error=False, force=True, load_live=False):
                 _log_exception(e, LOG_FILE, pdb_id, TMP_FOLDER)
 
     # process_f("1a14-1", show_error=True, force=force)
-    _ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
+    try:
+        _ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
+    except Exception as e:
+        _raise_rcsbsearch(e)
 
     stats = get_error_summary(LOG_FILE, verbose=False)
     not_found_error = "<<< PDB / mmCIF file downloaded but not found"

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,8 +4,14 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "proteinflow"
-version = "0.0.1"
-requires-python = ">=3.9"
+version = "1.0.0"
+authors = [
+    {name = "Elizaveta Kozlova", email = "liza@adaptyvbio.com"},
+    {name = "Arthur Valentin", email = "arthur@adaptyvbio.com"}
+]
+description = "Versatile pipeline for processing protein structure data for deep learning applications."
+readme = "README.md"
+requires-python = ">=3.8,<3.10"
 license = {text = "BSD-3-Clause"}
 dependencies = [
     "numpy",
@@ -21,8 +27,9 @@ dependencies = [
     "torch>=1.10.0",
     "biotite==0.35.0",
     "awscli",
-    "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e",
+    "rcsbsearch",
 ]
+keywords = ["bioinformatics", "dataset", "protein", "PDB", "deep learning"]
 
 [project.scripts]
 proteinflow = "proteinflow.scripts.proteinflow_cli:cli"