Skip to content

Commit

Permalink
feat: similarity search
Browse files Browse the repository at this point in the history
  • Loading branch information
xnought committed Feb 1, 2024
1 parent 2744e37 commit 9969419
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 2 deletions.
5 changes: 5 additions & 0 deletions backend/src/api_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,8 @@ class EditBody(CamelModel):
new_species_name: str
new_content: str | None = None
new_refs: str | None = None


# Response item for the similarity-search endpoint: one matched protein.
class SimilarProtein(CamelModel):
    # Name of the matched protein (the server fills this from the foldseek
    # "target" column after converting the file name back to a protein name).
    name: str
    # Value of the foldseek "prob" output column for this match.
    prob: float
121 changes: 121 additions & 0 deletions backend/src/foldseek.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import subprocess
import logging as log

# Database names that create_db treats as external: for these it runs
# `foldseek databases <name> ...` instead of `foldseek createdb <dir> ...`,
# and it does not require a local directory of that name to exist.
EXTERNAL_DATABASES = [
    "Alphafold/UniProt",
    "Alphafold/UniProt50",
    "Alphafold/Proteome",
    "Alphafold/Swiss",
    "ESMAtlas30",
    "PDB",
]


def bash_cmd(cmd: str | list[str]) -> str:
return subprocess.check_output(cmd, shell=True).decode()


def to_columnar_array(arr: list[list]) -> list[list]:
    """Transpose a row-major table into a list of columns.

    Args:
        arr: Rows of a table. The number of columns is taken from the first
            row; extra trailing cells in later rows are ignored.

    Returns:
        One list per column, each holding that column's value from every row.
        An empty input yields an empty list.

    Raises:
        IndexError: If a later row is shorter than the first row.
    """
    # An empty table has no first row to measure; searches with no matches
    # produce exactly this input.
    if not arr:
        return []
    width = len(arr[0])
    return [[row[col] for row in arr] for col in range(width)]


def parse_output(filepath: str) -> list[list]:
    """Read a tab-separated result file into a list of rows.

    Every cell that ``float()`` accepts is converted to a float; all other
    cells are kept as strings.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        One list per line of the file, in file order.
    """

    def _maybe_number(cell: str):
        # Keep the raw text whenever the cell is not a valid float literal.
        try:
            return float(cell)
        except ValueError:
            return cell

    with open(filepath, "r") as handle:
        raw_rows = handle.readlines()

    return [
        [_maybe_number(cell) for cell in raw.strip("\n").split("\t")]
        for raw in raw_rows
    ]


def easy_search(
    query: str,
    target: str,
    out_format: list[str] = ["query", "target", "prob"],
    out_file=".foldseek_cache/output",
    temp_dir=".foldseek_cache",
    print_stdout=False,
    foldseek_executable="./foldseek/bin/foldseek",
    columnar=False,
) -> list[list]:
    """Run ``foldseek easy-search`` and return the parsed result table.

    TODO: use pybind to call the C++ function instead

    Args:
        query: Query path handed to foldseek.
        target: Target path (or database) handed to foldseek.
        out_format: Output column names, joined with commas and passed via
            ``--format-output``. An empty list omits the flag. The default
            list is only read, never mutated.
        out_file: File foldseek writes its results to; parsed afterwards.
        temp_dir: Scratch directory handed to foldseek.
        print_stdout: If true, log foldseek's stdout at warning level.
        foldseek_executable: Path or name of the foldseek binary.
        columnar: If true, return one list per column instead of per match.

    Returns:
        list[list]: a list of the matches from the search (or a list of
        columns when ``columnar`` is true). An empty list is returned when
        the foldseek command fails or when there are no matches.
    """
    import shlex

    # Every argument is shell-quoted: callers (e.g. the HTTP endpoint) may
    # pass values derived from user input, and the command goes through a
    # shell.
    args = [foldseek_executable, "easy-search", query, target, out_file, temp_dir]
    if out_format:
        args += ["--format-output", ",".join(out_format)]
    cmd = shlex.join(args)

    try:
        stdout = bash_cmd(cmd)
    except Exception as e:
        # Best effort: a failed search is reported as "no matches".
        log.warning(e)
        return []

    if print_stdout:
        log.warning(stdout)

    rows = parse_output(out_file)
    if not columnar:
        return rows
    # No matches means no columns; avoid transposing an empty table.
    return to_columnar_array(rows) if rows else []


def create_db(
    dir: str,
    db_name: str,
    foldseek_executable="foldseek",
    print_stdout=False,
    temp_dir=".foldseek_cache",
) -> str:
    """Create a foldseek database unless one already exists at ``db_name``.

    Args:
        dir: Either a local directory to build the database from, or one of
            the names in ``EXTERNAL_DATABASES`` to fetch with
            ``foldseek databases``.
        db_name: Path of the database to create. If something already exists
            at this path, nothing is created.
        foldseek_executable: Path or name of the foldseek binary.
        print_stdout: If true, print foldseek's stdout.
        temp_dir: Scratch directory, used only for external databases.

    Returns:
        ``db_name``, unchanged.

    Raises:
        Exception: If foldseek cannot be found, or if ``dir`` neither exists
            locally nor names an external database.
        subprocess.CalledProcessError: If the foldseek command fails.
    """
    import os
    import shlex
    import shutil

    # don't continue unless they actually have foldseek installed.
    # (`which` exits non-zero when nothing is found, so shelling out to it
    # would raise CalledProcessError instead of reaching this message.)
    if shutil.which(foldseek_executable) is None:
        raise Exception("foldseek not found in PATH")

    is_external = dir in EXTERNAL_DATABASES

    # check that our dir exists (external database names need no local dir)
    if not is_external and not os.path.exists(dir):
        raise Exception(f"Directory {dir} not found")

    # if database already exists, don't create another
    if not os.path.exists(db_name):
        if is_external:
            args = [foldseek_executable, "databases", dir, db_name, temp_dir]
        else:
            args = [foldseek_executable, "createdb", dir, db_name]
        # Quote each argument so paths with spaces or shell metacharacters
        # are passed through literally.
        stdout = bash_cmd(shlex.join(args))
        if print_stdout:
            print(stdout)

    return db_name


if __name__ == "__main__":
    # Manual smoke test: build a database from test_examples/ and search it
    # against itself, so each protein is compared with every other one.
    db_path = create_db(dir="test_examples", db_name="test")
    columns = ["query", "target", "prob"]
    matches = easy_search(query=db_path, target=db_path, out_format=columns)
    print(matches)
4 changes: 4 additions & 0 deletions backend/src/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ def pdb_file_name(protein_name: str):
)


def revert_pdb_filename(file_name: str) -> str:
    """Turn a PDB file name back into a display name.

    Every occurrence of ".pdb" is removed and every underscore becomes a
    space.
    """
    without_extension = file_name.replace(".pdb", "")
    return " ".join(without_extension.split("_"))


def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"):
if encoding == "str":
return PDB(file_contents, name)
Expand Down
19 changes: 17 additions & 2 deletions backend/src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,31 @@
import os
from io import BytesIO
from fastapi.responses import FileResponse, StreamingResponse
from .api_types import ProteinEntry, UploadBody, UploadError, EditBody
from .api_types import ProteinEntry, UploadBody, UploadError, EditBody, SimilarProtein
from .db import Database, bytea_to_str, str_to_bytea
from .protein import parse_protein_pdb, pdb_file_name, protein_name_found, pdb_to_fasta
from .protein import (
parse_protein_pdb,
pdb_file_name,
protein_name_found,
pdb_to_fasta,
revert_pdb_filename,
)
from .setup import disable_cors, init_fastapi_app
from .foldseek import easy_search


# Build the application object the route decorators below attach to.
app = init_fastapi_app()
# PUBLIC_FRONTEND_URL must be set: os.environ[...] raises KeyError at import
# time otherwise. NOTE(review): presumably disable_cors allows cross-origin
# requests from this single origin — confirm against .setup.
disable_cors(app, origins=[os.environ["PUBLIC_FRONTEND_URL"]])


# Similarity search: run foldseek with the requested protein's PDB file as the
# query against the folder of stored PDB files, and return each match's name
# together with its "prob" column.
@app.get("/similar-venome/{protein_name:str}", response_model=list[SimilarProtein])
def get_venome_proteins(protein_name: str):
    matches = easy_search(
        pdb_file_name(protein_name),
        "src/data/pdbAlphaFold/",
        out_format=["target", "prob"],
    )

    results = []
    for row in matches:
        # Column order follows out_format: row[0] is the target file name,
        # row[1] is the prob value.
        results.append(SimilarProtein(name=revert_pdb_filename(row[0]), prob=row[1]))
    return results


@app.get("/pdb/{protein_name:str}")
def get_pdb_file(protein_name: str):
if protein_name_found(protein_name):
Expand Down

0 comments on commit 9969419

Please sign in to comment.