diff --git a/backend/src/api_types.py b/backend/src/api_types.py
index 69152eac..9797ee93 100644
--- a/backend/src/api_types.py
+++ b/backend/src/api_types.py
@@ -62,3 +62,8 @@ class EditBody(CamelModel):
     new_species_name: str
     new_content: str | None = None
     new_refs: str | None = None
+
+
+class SimilarProtein(CamelModel):
+    name: str
+    prob: float
diff --git a/backend/src/foldseek.py b/backend/src/foldseek.py
new file mode 100644
index 00000000..cd0a4aad
--- /dev/null
+++ b/backend/src/foldseek.py
@@ -0,0 +1,165 @@
+import os
+import shutil
+import subprocess
+import logging as log
+
+EXTERNAL_DATABASES = [
+    "Alphafold/UniProt",
+    "Alphafold/UniProt50",
+    "Alphafold/Proteome",
+    "Alphafold/Swiss",
+    "ESMAtlas30",
+    "PDB",
+]
+
+
+def bash_cmd(cmd: str | list[str]) -> str:
+    """Run a command and return its decoded standard output.
+
+    A string is passed to the shell unchanged. A list is executed directly as
+    an argument vector, so none of its elements are interpreted by a shell.
+
+    Raises:
+        subprocess.CalledProcessError: if the command exits non-zero.
+        OSError: if a list command's executable cannot be started.
+    """
+    return subprocess.check_output(cmd, shell=isinstance(cmd, str)).decode()
+
+
+def to_columnar_array(arr: list[list]) -> list[list]:
+    """Transpose a list of rows into a list of columns.
+
+    An empty input yields an empty list.
+    """
+    return [list(column) for column in zip(*arr)]
+
+
+def parse_output(filepath: str) -> list[list]:
+    """Parse a tab-separated result file into a list of rows.
+
+    Columns that float() accepts are converted to float; the rest stay
+    strings. Blank lines are skipped.
+    """
+    parsed_lines = []
+    with open(filepath, "r") as f:
+        for line in f:
+            line = line.strip("\n")
+            if not line:
+                continue
+            parsed_line = []
+            for column in line.split("\t"):
+                try:
+                    column = float(column)
+                except ValueError:
+                    pass
+                parsed_line.append(column)
+            parsed_lines.append(parsed_line)
+
+    return parsed_lines
+
+
+def easy_search(
+    query: str,
+    target: str,
+    out_format: list[str] | None = None,
+    out_file=".foldseek_cache/output",
+    temp_dir=".foldseek_cache",
+    print_stdout=False,
+    foldseek_executable="./foldseek/bin/foldseek",
+    columnar=False,
+) -> list[list]:
+    """easy_search just calls foldseek easy-search under the hood
+    TODO: use pybind to call the C++ function instead
+
+    Args:
+        query: path of the structure (or database) to search with
+        target: path of the folder or database to search in
+        out_format: output columns; None means query, target, prob and an
+            empty list leaves foldseek's default format in place
+        out_file: file foldseek writes the results to
+        temp_dir: scratch directory handed to foldseek
+        print_stdout: log foldseek's stdout when True
+        foldseek_executable: path of the foldseek binary
+        columnar: return one list per column instead of one per match
+
+    Returns:
+        list[list]: a list of the matches from the search, or an empty list
+        if foldseek could not be run
+    """
+    if out_format is None:
+        out_format = ["query", "target", "prob"]
+
+    # Pass an argument list rather than a shell string so that query and
+    # target (which may come from a request) cannot inject shell syntax.
+    cmd = [foldseek_executable, "easy-search", query, target, out_file, temp_dir]
+    if out_format:
+        cmd += ["--format-output", ",".join(out_format)]
+
+    try:
+        stdout = bash_cmd(cmd)
+    except (subprocess.SubprocessError, OSError, ValueError) as e:
+        # best effort: a failed search is reported as "no matches"
+        log.warning(e)
+        return []
+
+    if print_stdout:
+        log.warning(stdout)
+
+    results = parse_output(out_file)
+    return to_columnar_array(results) if columnar else results
+
+
+def create_db(
+    dir: str,
+    db_name: str,
+    foldseek_executable="foldseek",
+    print_stdout=False,
+    temp_dir=".foldseek_cache",
+):
+    """Create (or download) a foldseek database unless it already exists.
+
+    Args:
+        dir: folder of structures to index, or a name from EXTERNAL_DATABASES
+        db_name: path of the database to create
+        foldseek_executable: name or path of the foldseek binary
+        print_stdout: print foldseek's stdout when True
+        temp_dir: scratch directory used when downloading a database
+
+    Returns:
+        db_name, unchanged
+
+    Raises:
+        Exception: if foldseek is not installed or dir does not exist
+        subprocess.CalledProcessError: if foldseek itself fails
+    """
+    # don't continue unless they actually have foldseek installed
+    if shutil.which(foldseek_executable) is None:
+        raise Exception("foldseek not found in PATH")
+
+    # check that our dir exists
+    is_external = dir in EXTERNAL_DATABASES
+    if not is_external and not os.path.exists(dir):
+        raise Exception(f"Directory {dir} not found")
+
+    # if database already exists, don't create another
+    if not os.path.exists(db_name):
+        if is_external:
+            cmd = [foldseek_executable, "databases", dir, db_name, temp_dir]
+        else:
+            cmd = [foldseek_executable, "createdb", dir, db_name]
+        stdout = bash_cmd(cmd)
+        if print_stdout:
+            print(stdout)
+
+    return db_name
+
+
+if __name__ == "__main__":
+    # search each protein in test_examples/ with every other one
+    test_targets = create_db(dir="test_examples", db_name="test")
+    output = easy_search(
+        query=test_targets,
+        target=test_targets,
+        out_format=["query", "target", "prob"],
+    )
+    print(output)
diff --git a/backend/src/protein.py b/backend/src/protein.py
index 14f6c9f7..74783244 100644
--- a/backend/src/protein.py
+++ b/backend/src/protein.py
@@ -49,6 +49,14 @@ def pdb_file_name(protein_name: str):
     )
 
 
+def revert_pdb_filename(file_name: str) -> str:
+    """Convert a PDB file name into a display name.
+
+    Every ".pdb" occurrence is dropped and underscores become spaces.
+    """
+    return file_name.replace(".pdb", "").replace("_", " ")
+
+
 def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"):
     if encoding == "str":
         return PDB(file_contents, name)
diff --git a/backend/src/server.py b/backend/src/server.py
index 953c4676..465a577d 100644
--- a/backend/src/server.py
+++ b/backend/src/server.py
@@ -2,16 +2,40 @@ import os
 from io import BytesIO
 from fastapi.responses import FileResponse, StreamingResponse
 
-from .api_types import ProteinEntry, UploadBody, UploadError, EditBody
+from .api_types import ProteinEntry, UploadBody, UploadError, EditBody, SimilarProtein
 from .db import Database, bytea_to_str, str_to_bytea
-from .protein import parse_protein_pdb, pdb_file_name, protein_name_found, pdb_to_fasta
+from .protein import (
+    parse_protein_pdb,
+    pdb_file_name,
+    protein_name_found,
+    pdb_to_fasta,
+    revert_pdb_filename,
+)
 from .setup import disable_cors, init_fastapi_app
+from .foldseek import easy_search
 
 
 app = init_fastapi_app()
 disable_cors(app, origins=[os.environ["PUBLIC_FRONTEND_URL"]])
 
 
+@app.get("/similar-venome/{protein_name:str}", response_model=list[SimilarProtein])
+def get_venome_proteins(protein_name: str):
+    """Return stored proteins structurally similar to `protein_name`.
+
+    Unknown names yield an empty list without invoking foldseek.
+    """
+    # Same guard as get_pdb_file: only search for names we actually store, so
+    # arbitrary request text never reaches the foldseek command line.
+    if not protein_name_found(protein_name):
+        return []
+
+    query_name = pdb_file_name(protein_name)
+    target_folder = "src/data/pdbAlphaFold/"
+    similar = easy_search(query_name, target_folder, out_format=["target", "prob"])
+    return [SimilarProtein(name=revert_pdb_filename(s[0]), prob=s[1]) for s in similar]
+
+
 @app.get("/pdb/{protein_name:str}")
 def get_pdb_file(protein_name: str):
     if protein_name_found(protein_name):