Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add foldseek to search similar proteins #180

Merged
merged 13 commits into from
Feb 29, 2024
Merged
23 changes: 17 additions & 6 deletions backend/src/api/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def decode_base64(b64_header_and_data: str):
return b64decode(b64_data_only).decode("utf-8")


def pdb_file_name(protein_name: str):
def stored_pdb_file_name(protein_name: str):
return os.path.join("src/data/pdbAlphaFold", protein_name) + ".pdb"


Expand All @@ -62,11 +62,16 @@ def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"):
elif encoding == "b64":
return PDB(decode_base64(file_contents), name)
elif encoding == "file":
return PDB(open(pdb_file_name(name), "r").read(), name)
return PDB(open(stored_pdb_file_name(name), "r").read(), name)
else:
raise ValueError(f"Invalid encoding: {encoding}")


def format_protein_name(name: str):
name = name.replace(" ", "_")
return name


def protein_name_found(name: str):
"""Checks if a protein name already exists in the database
Returns: True if exists | False if not exists
Expand Down Expand Up @@ -103,7 +108,9 @@ class UploadPNGBody(CamelModel):
@router.get("/protein/pdb/{protein_name:str}")
def get_pdb_file(protein_name: str):
if protein_name_found(protein_name):
return FileResponse(pdb_file_name(protein_name), filename=protein_name + ".pdb")
return FileResponse(
stored_pdb_file_name(protein_name), filename=protein_name + ".pdb"
)


@router.get("/protein/fasta/{protein_name:str}")
Expand Down Expand Up @@ -193,7 +200,7 @@ def delete_protein_entry(protein_name: str, req: Request):
[protein_name],
)
# delete the file from the data/ folder
os.remove(pdb_file_name(protein_name))
os.remove(stored_pdb_file_name(protein_name))
except Exception as e:
log.error(e)

Expand All @@ -212,6 +219,8 @@ def upload_protein_png(body: UploadPNGBody):
@router.post("/protein/upload", response_model=UploadError | None)
def upload_protein_entry(body: UploadBody, req: Request):
requiresAuthentication(req)

body.name = format_protein_name(body.name)
# check that the name is not already taken in the DB
if protein_name_found(body.name):
return UploadError.NAME_NOT_UNIQUE
Expand All @@ -225,7 +234,7 @@ def upload_protein_entry(body: UploadBody, req: Request):

try:
# write to file to data/ folder
with open(pdb_file_name(pdb.name), "w") as f:
with open(stored_pdb_file_name(pdb.name), "w") as f:
f.write(pdb.file_contents)
except Exception:
log.warn("Failed to write to file")
Expand Down Expand Up @@ -272,7 +281,9 @@ def edit_protein_entry(body: EditBody, req: Request):
requiresAuthentication(req)
try:
if body.new_name != body.old_name:
os.rename(pdb_file_name(body.old_name), pdb_file_name(body.new_name))
os.rename(
stored_pdb_file_name(body.old_name), stored_pdb_file_name(body.new_name)
)

with Database() as db:
name_changed = False
Expand Down
58 changes: 56 additions & 2 deletions backend/src/api/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,19 @@
import logging as log
from ..db import Database, bytea_to_str
from ..api_types import CamelModel, ProteinEntry
from ..foldseek import easy_search
from .protein import stored_pdb_file_name

router = APIRouter()


class SimilarProtein(CamelModel):
name: str
prob: float
evalue: float
description: str = ""


class RangeFilter(CamelModel):
min: int | float
max: int | float
Expand Down Expand Up @@ -51,6 +60,19 @@ def combine_where_clauses(clauses: list[str]) -> str:
return result


def get_descriptions(protein_names: list[str]):
if len(protein_names) > 0:
with Database() as db:
list_names = str(protein_names)[
1:-1
] # parse out the [] brackets and keep everything inside
query = f"""SELECT description FROM proteins WHERE name in ({list_names})"""
entry_sql = db.execute_return(query)
if entry_sql is not None:
return [d[0] for d in entry_sql]
return None


def gen_sql_filters(
species_filter: str | None,
length_filter: RangeFilter | None = None,
Expand Down Expand Up @@ -111,7 +133,7 @@ def search_proteins(body: SearchProteinsBody):
raise HTTPException(status_code=500, detail=str(e))


@router.get("/search/range/length")
@router.get("/search/range/length", response_model=RangeFilter)
def search_range_length():
try:
with Database() as db:
Expand All @@ -123,7 +145,7 @@ def search_range_length():
raise HTTPException(status_code=500, detail=str(e))


@router.get("/search/range/mass")
@router.get("/search/range/mass", response_model=RangeFilter)
def search_range_mass():
try:
with Database() as db:
Expand All @@ -145,3 +167,35 @@ def search_species():
return [d[0] for d in entry_sql]
except Exception:
return


@router.get(
"/search/venome/similar/{protein_name:str}",
response_model=list[SimilarProtein],
)
def search_venome_similar(protein_name: str):
venome_folder = "/app/src/data/pdbAlphaFold/"
# ignore the first since it's itself as the most similar
try:
similar = easy_search(
stored_pdb_file_name(protein_name),
venome_folder,
out_format="target,prob,evalue",
)[1:]
formatted = [
SimilarProtein(name=name.rstrip(".pdb"), prob=prob, evalue=evalue)
for [name, prob, evalue] in similar
]
except Exception:
raise HTTPException(404, "Foldseek not found on the system")

try:
# populate protein descriptions for the similar proteins
descriptions = get_descriptions([s.name for s in formatted])
if descriptions is not None:
for f, d in zip(formatted, descriptions):
f.description = d
except Exception:
raise HTTPException(500, "Error getting protein descriptions")

return formatted
99 changes: 99 additions & 0 deletions backend/src/foldseek.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import subprocess
import logging as log
import os


def bash_cmd(cmd: str | list[str]) -> str:
return subprocess.check_output(cmd, shell=True).decode()


FOLDSEEK_LOCATION = "/app/foldseek"
FOLDSEEK_EXECUTABLE = f"{FOLDSEEK_LOCATION}/bin/foldseek"


def assert_foldseek_installed():
if os.path.exists(FOLDSEEK_EXECUTABLE):
return
else:
raise ImportError(
"foldseek executable not installed. Try ./run.sh add_foldseek"
)


active_caches = 0


class CreateUniqueDirName:
"""
Generates a new directory name
use this like
```python
with GenerateDirName() as name:
print(name)
```
on opening scope will create directory of the given name
on closing scope will delete directory of the given name
uses the global `active_caches` above to create a unique dir name
"""

def __enter__(self):
global active_caches
active_caches += 1
self.temp_dir = f"{FOLDSEEK_LOCATION}/temp_dir_{active_caches}"
return self.temp_dir

def __exit__(self, *args):
global active_caches
active_caches -= 1
bash_cmd("rm -rf " + self.temp_dir)


def parse_easy_search_output(filepath: str) -> list[list]:
with open(filepath, "r") as f:
lines = f.readlines()

parsed_lines = []
for line in lines:
parsed_line = []
for column in line.strip("\n").split("\t"):
try:
column = float(column)
except ValueError:
pass
parsed_line.append(column)
parsed_lines.append(parsed_line)

return parsed_lines


def easy_search(
query: str,
target: str,
out_format: str = "query, target, prob",
print_loading_info=False,
) -> list[list]:
"""easy_search just calls foldseek easy-search under the hood
TODO: use pybind to call the C++ function instead

Returns:
list[list]: a list of the matches from the search where the inner list is the same size as out_format
"""

assert_foldseek_installed()

with CreateUniqueDirName() as temp_dir:
out_file = temp_dir + "/output"

# Then call the easy-search
flags = f"--format-output {out_format}" if out_format else ""
cmd = f"{FOLDSEEK_EXECUTABLE} easy-search {query} {target} {out_file} {temp_dir} {flags}"
try:
stdout = bash_cmd(cmd)
except Exception as e:
log.warn(e)
return []

if print_loading_info:
log.warn(stdout)

return parse_easy_search_output(out_file)
14 changes: 14 additions & 0 deletions docs/backend.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,17 @@ https://github.com/xnought/venome/assets/65095341/c44f1d8c-0d58-407c-9aa2-29c4a9


this is where you can see print statements and other debug info / errors.

## Foldseek

For similarity search we use [Foldseek](https://github.com/steineggerlab/foldseek).

Without foldseek installed nothing will be computed and no errors. No harm at all.

However if you want to add foldseek run

```bash
./run.sh add_foldseek
```

to the docker container and then it will compute.
2 changes: 2 additions & 0 deletions docs/run.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ or
| `psql` | Opens up a direct terminal into the database to execute SQL commands live |
| `upload_all` | Uploads all the pdb files to the system via POST requests |
| `delete_all` | Deletes all protein entries and restarts the server from scratch |
| `add_foldseek` | installs foldseek onto the docker container via wget |
| `remove_foldseek` | deletes foldseek from the docker container |

There are actually many more functions, so please check out [`run.sh`](../run.sh).

8 changes: 6 additions & 2 deletions frontend/src/lib/ListProteins.svelte
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
<script lang="ts">
import { navigate } from "svelte-routing";
import type { ProteinEntry } from "./backend";
import { numberWithCommas } from "./format";
import {
formatProteinName,
numberWithCommas,
undoFormatProteinName,
} from "./format";

export let allEntries: ProteinEntry[] | null = null;
</script>
Expand All @@ -25,7 +29,7 @@
</div>
<div class="prot-info">
<div class="prot-name">
{entry.name}
{undoFormatProteinName(entry.name)}
</div>
<div class="prot-desc">
{#if entry.description}
Expand Down
Loading
Loading