Skip to content

Commit

Permalink
Merge pull request #12 from adaptyvbio/pyproject
Browse files Browse the repository at this point in the history
Pyproject
  • Loading branch information
elkoz authored Feb 17, 2023
2 parents 49eaa4f + 6d1968d commit d022b1a
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 19 deletions.
1 change: 1 addition & 0 deletions .github/workflows/process-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ jobs:
conda install -n proteinflow -c conda-forge -c bioconda mmseqs2
conda run -n proteinflow python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
conda run -n proteinflow python -m pip install awscli pytest flake8
conda run -n proteinflow python -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"
conda run -n proteinflow python -m pip install -e .
- name: Configure AWS Credentials
Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,18 @@ This is a python library for handling the proteinflow data processing pipeline.
[Read the documentation.](https://adaptyvbio.github.io/ProteinFlow/)

## Installation
Recommended: create a new `conda` environment and install `proteinflow` and `mmseqs`. Note that the python version has to be between 3.8 and 3.10.
Recommended: create a new `conda` environment and install `proteinflow` and `mmseqs`. Note that the Python version must be at least 3.8 and below 3.10 (i.e. 3.8 or 3.9), as declared by `requires-python` in `pyproject.toml`.
```
git clone https://gitlab.com/adaptyvbio/ml-4/-/tree/library
cd ml-4
conda create --name proteinflow -y python=3.9
conda activate proteinflow
conda install -y -c conda-forge -c bioconda mmseqs2
python -m pip install -e .
python -m pip install proteinflow
aws configure
```
In addition, `proteinflow` depends on the `rcsbsearch` package, whose latest release is currently failing. Until a fixed release is published, install the patched version with the recommended command:
```
python -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"
```

## Usage
### Downloading pre-computed datasets
Expand Down
25 changes: 19 additions & 6 deletions docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,13 @@ <h2 id="citation">Citation</h2>
pickle.dump(test_classes_dict, f)


def _raise_rcsbsearch(e):
if &#34;404 Client Error&#34; in str(e):
raise RuntimeError(
&#39;Quering rcsbsearch is failing. Please install a version of rcsbsearch where this error is solved:\npython -m pip install &#34;rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e&#34;&#39;
)


def _run_processing(
tmp_folder=&#34;./data/tmp_pdb&#34;,
output_folder=&#34;./data/pdb&#34;,
Expand Down Expand Up @@ -511,11 +518,14 @@ <h2 id="citation">Citation</h2>
pdb_ids = pdb_ids.exec(&#34;assembly&#34;)
if n is not None:
pdbs = []
for i, x in enumerate(pdb_ids):
pdbs.append(x)
if i == n:
break
pdb_ids = pdbs
try:
for i, x in enumerate(pdb_ids):
pdbs.append(x)
if i == n:
break
pdb_ids = pdbs
except Exception as e:
_raise_rcsbsearch(e)

ordered_folders = [
x.key
Expand Down Expand Up @@ -577,7 +587,10 @@ <h2 id="citation">Citation</h2>
_log_exception(e, LOG_FILE, pdb_id, TMP_FOLDER)

# process_f(&#34;1a14-1&#34;, show_error=True, force=force)
_ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
try:
_ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
except Exception as e:
_raise_rcsbsearch(e)

stats = get_error_summary(LOG_FILE, verbose=False)
not_found_error = &#34;&lt;&lt;&lt; PDB / mmCIF file downloaded but not found&#34;
Expand Down
25 changes: 19 additions & 6 deletions proteinflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,13 @@ def _get_split_dictionaries(
pickle.dump(test_classes_dict, f)


def _raise_rcsbsearch(e):
if "404 Client Error" in str(e):
raise RuntimeError(
'Quering rcsbsearch is failing. Please install a version of rcsbsearch where this error is solved:\npython -m pip install "rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e"'
)


def _run_processing(
tmp_folder="./data/tmp_pdb",
output_folder="./data/pdb",
Expand Down Expand Up @@ -443,11 +450,14 @@ def _run_processing(
pdb_ids = pdb_ids.exec("assembly")
if n is not None:
pdbs = []
for i, x in enumerate(pdb_ids):
pdbs.append(x)
if i == n:
break
pdb_ids = pdbs
try:
for i, x in enumerate(pdb_ids):
pdbs.append(x)
if i == n:
break
pdb_ids = pdbs
except Exception as e:
_raise_rcsbsearch(e)

ordered_folders = [
x.key
Expand Down Expand Up @@ -509,7 +519,10 @@ def process_f(pdb_id, show_error=False, force=True, load_live=False):
_log_exception(e, LOG_FILE, pdb_id, TMP_FOLDER)

# process_f("1a14-1", show_error=True, force=force)
_ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
try:
_ = p_map(lambda x: process_f(x, force=force, load_live=load_live), pdb_ids)
except Exception as e:
_raise_rcsbsearch(e)

stats = get_error_summary(LOG_FILE, verbose=False)
not_found_error = "<<< PDB / mmCIF file downloaded but not found"
Expand Down
13 changes: 10 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@ build-backend = "setuptools.build_meta"

[project]
name = "proteinflow"
version = "0.0.1"
requires-python = ">=3.9"
version = "1.0.0"
authors = [
{name = "Elizaveta Kozlova", email = "liza@adaptyvbio.com"},
{name = "Arthur Valentin", email = "arthur@adaptyvbio.com"}
]
description = "Versatile pipeline for processing protein structure data for deep learning applications."
readme = "README.md"
requires-python = ">=3.8,<3.10"
license = {text = "BSD-3-Clause"}
dependencies = [
"numpy",
Expand All @@ -21,8 +27,9 @@ dependencies = [
"torch>=1.10.0",
"biotite==0.35.0",
"awscli",
"rcsbsearch @ git+https://github.com/sbliven/rcsbsearch@dbdfe3880cc88b0ce57163987db613d579400c8e",
"rcsbsearch",
]
keywords = ["bioinformatics", "dataset", "protein", "PDB", "deep learning"]

[project.scripts]
proteinflow = "proteinflow.scripts.proteinflow_cli:cli"
Expand Down

0 comments on commit d022b1a

Please sign in to comment.