From 8058b2e6fb4b3bbfa16ea9b65654e3035219ccba Mon Sep 17 00:00:00 2001 From: xnought Date: Tue, 20 Feb 2024 17:53:07 -0800 Subject: [PATCH 01/12] feat: command to add and remove foldseek from docker --- run.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/run.sh b/run.sh index e3f809c2..02d13af6 100755 --- a/run.sh +++ b/run.sh @@ -115,6 +115,17 @@ function delete_all() { cd galaxy && python3 delete_all.py && soft_restart } +function add_foldseek() { + docker exec -it venome-backend wget https://mmseqs.com/foldseek/foldseek-linux-sse2.tar.gz + docker exec -it venome-backend tar -xvf foldseek-linux-sse2.tar.gz + docker exec -it venome-backend rm -f foldseek-linux-sse2.tar.gz +} + +function remove_foldseek() { + docker exec -it venome-backend rm -f foldseek-linux-sse2.tar.gz* + docker exec -it venome-backend rm -fr foldseek/ +} + function scrape_func_names() { functions=($(grep -oE 'function[[:space:]]+[a-zA-Z_][a-zA-Z_0-9]*' ./run.sh | sed 's/function[[:space:]]*//')) } From 60aa2fe5ccb2bb49996695ccd7c4b6f3e2c3eca1 Mon Sep 17 00:00:00 2001 From: xnought Date: Tue, 20 Feb 2024 18:20:56 -0800 Subject: [PATCH 02/12] feat: add foldseek and add skeleton req --- backend/src/api/search.py | 9 ++++ backend/src/foldseek.py | 99 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 backend/src/foldseek.py diff --git a/backend/src/api/search.py b/backend/src/api/search.py index 4e80fab5..22a29c8d 100644 --- a/backend/src/api/search.py +++ b/backend/src/api/search.py @@ -3,6 +3,7 @@ import logging as log from ..db import Database, bytea_to_str from ..api_types import CamelModel, ProteinEntry +from ..foldseek import easy_search router = APIRouter() @@ -145,3 +146,11 @@ def search_species(): return [d[0] for d in entry_sql] except Exception: return + + +@router.get("/search/venome/similar/{protein_name:str}", response_model=list) +def search_venome_similar(protein_name: str): + venome_folder = "/app/src/data/pdbAlphaFold/" + print(protein_name) + # similar = easy_search("", venome_folder, out_format="target,prob") + return [] diff --git a/backend/src/foldseek.py b/backend/src/foldseek.py new file mode 100644 index 00000000..d9a0a706 --- /dev/null +++ b/backend/src/foldseek.py @@ -0,0 +1,99 @@ +import subprocess +import logging as log +import os + + +def bash_cmd(cmd: str | list[str]) -> str: + return subprocess.check_output(cmd, shell=True).decode() + + +FOLDSEEK_LOCATION = "/app/foldseek" +FOLDSEEK_EXECUTABLE = f"{FOLDSEEK_LOCATION}/bin/foldseek" + + +def assert_foldseek_installed(): + if os.path.exists(FOLDSEEK_EXECUTABLE): + return + else: + raise ImportError( + "foldseek executable not installed. Try ./run.sh add_foldseek" + ) + + +active_caches = 0 + + +class CreateUniqueDirName: + """ + Generates a new directory name + use this like + ```python + with GenerateDirName() as name: + print(name) + ``` + on opening scope will create directory of the given name + on closing scope will delete directory of the given name + uses the global `active_caches` above to create a unique dir name + """ + + def __enter__(self): + global active_caches + active_caches += 1 + self.temp_dir = f"{FOLDSEEK_LOCATION}/temp_dir_{active_caches}" + return self.temp_dir + + def __exit__(self, *args): + global active_caches + active_caches -= 1 + bash_cmd("rm -rf " + self.temp_dir) + + +def parse_easy_search_output(filepath: str) -> list[list]: + with open(filepath, "r") as f: + lines = f.readlines() + + parsed_lines = [] + for line in lines: + parsed_line = [] + for column in line.strip("\n").split("\t"): + try: + column = float(column) + except ValueError: + pass + parsed_line.append(column) + parsed_lines.append(parsed_line) + + return parsed_lines + + +def easy_search( + query: str, + target: str, + out_format: str = "query, target, prob", + print_loading_info=False, +) -> list[list]: + """easy_search just calls foldseek easy-search under the hood + TODO: use pybind to call the C++ function instead + + Returns: + list[list]: a list of the matches from the search where the inner list is the same size as out_format + """ + + assert_foldseek_installed() + + with CreateUniqueDirName() as temp_dir: + out_file = temp_dir + "/output" + + # Then call the easy-search + flags = f"--format-output {out_format}" if out_format else "" + cmd = f"{FOLDSEEK_EXECUTABLE} easy-search {query} {target} {out_file} {temp_dir} {flags}" + try: + stdout = bash_cmd(cmd) + except Exception as e: + log.warn(e) + return [] + + if print_loading_info: + log.warn(stdout) + + return parse_easy_search_output(out_file) From c6d62bb5a7e45abc7580890d33d621c610c0931d Mon Sep 17 00:00:00 2001 From: xnought Date: Tue, 20 Feb 2024 18:32:23 -0800 Subject: [PATCH 03/12] feat: finish http endpoint for similarity --- backend/src/api/protein.py | 17 +++++++++++------ backend/src/api/search.py | 23 +++++++++++++++++++---- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/backend/src/api/protein.py b/backend/src/api/protein.py index 84cf640e..a49af9e4 100644 --- a/backend/src/api/protein.py +++ b/backend/src/api/protein.py @@ -50,7 +50,8 @@ def decode_base64(b64_header_and_data: str): return b64decode(b64_data_only).decode("utf-8") -def pdb_file_name(protein_name: str): +def stored_pdb_file_name(protein_name: str): + protein_name = protein_name.replace(" ", "_") return os.path.join("src/data/pdbAlphaFold", protein_name) + ".pdb" @@ -60,7 +61,7 @@ def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"): elif encoding == "b64": return PDB(decode_base64(file_contents), name) elif encoding == "file": - return PDB(open(pdb_file_name(name), "r").read(), name) + return PDB(open(stored_pdb_file_name(name), "r").read(), name) else: raise ValueError(f"Invalid encoding: {encoding}") @@ -101,7 +102,9 @@ class UploadPNGBody(CamelModel): @router.get("/protein/pdb/{protein_name:str}") def get_pdb_file(protein_name: str): if protein_name_found(protein_name): - return FileResponse(pdb_file_name(protein_name), filename=protein_name + ".pdb") + return FileResponse( + stored_pdb_file_name(protein_name), filename=protein_name + ".pdb" + ) @router.get("/protein/fasta/{protein_name:str}") @@ -190,7 +193,7 @@ def delete_protein_entry(protein_name: str): [protein_name], ) # delete the file from the data/ folder - os.remove(pdb_file_name(protein_name)) + os.remove(stored_pdb_file_name(protein_name)) except Exception as e: log.error(e) @@ -221,7 +224,7 @@ def upload_protein_entry(body: UploadBody): try: # write to file to data/ folder - with open(pdb_file_name(pdb.name), "w") as f: + with open(stored_pdb_file_name(pdb.name), "w") as f: f.write(pdb.file_contents) except Exception: log.warn("Failed to write to file") @@ -268,7 +271,9 @@ def edit_protein_entry(body: EditBody): try: if body.new_name != body.old_name: - os.rename(pdb_file_name(body.old_name), pdb_file_name(body.new_name)) + os.rename( + stored_pdb_file_name(body.old_name), stored_pdb_file_name(body.new_name) + ) with Database() as db: name_changed = False diff --git a/backend/src/api/search.py b/backend/src/api/search.py index 22a29c8d..96b3027d 100644 --- a/backend/src/api/search.py +++ b/backend/src/api/search.py @@ -4,10 +4,16 @@ from ..db import Database, bytea_to_str from ..api_types import CamelModel, ProteinEntry from ..foldseek import easy_search +from .protein import stored_pdb_file_name router = APIRouter() +class SimilarProtein(CamelModel): + name: str + prob: float + + class RangeFilter(CamelModel): min: int | float max: int | float @@ -148,9 +154,18 @@ def search_species(): return -@router.get("/search/venome/similar/{protein_name:str}", response_model=list) +@router.get( + "/search/venome/similar/{protein_name:str}", response_model=list[SimilarProtein] +) def search_venome_similar(protein_name: str): venome_folder = "/app/src/data/pdbAlphaFold/" - print(protein_name) - # similar = easy_search("", venome_folder, out_format="target,prob") - return [] + # ignore the first since it's itself as the most similar + similar = easy_search( + stored_pdb_file_name(protein_name), venome_folder, out_format="target,prob" + )[1:] + # TODO: replace by returning ids and not names + formatted = [ + SimilarProtein(name=name.replace("_", " ").rstrip(".pdb"), prob=prob) + for [name, prob] in similar + ] + return formatted From 54e394c4d9a6d1fbea81431a8e20165011be1989 Mon Sep 17 00:00:00 2001 From: xnought Date: Tue, 20 Feb 2024 18:48:39 -0800 Subject: [PATCH 04/12] feat: shows similar proteins in frontend --- frontend/src/lib/SimilarProteins.svelte | 42 ++++--------------- frontend/src/lib/openapi/index.ts | 1 + .../src/lib/openapi/models/ProteinEntry.ts | 17 ++++---- .../src/lib/openapi/models/SimilarProtein.ts | 9 ++++ .../lib/openapi/services/DefaultService.ts | 21 ++++++++++ frontend/src/routes/Protein.svelte | 6 +-- 6 files changed, 51 insertions(+), 45 deletions(-) create mode 100644 frontend/src/lib/openapi/models/SimilarProtein.ts diff --git a/frontend/src/lib/SimilarProteins.svelte b/frontend/src/lib/SimilarProteins.svelte index 0d787b0d..5827eeef 100644 --- a/frontend/src/lib/SimilarProteins.svelte +++ b/frontend/src/lib/SimilarProteins.svelte @@ -1,61 +1,35 @@ - - + {#each similarProteins as protein} - {/each} - ... click to see more
Source Name Similar Desc. Prob.
- {protein.source.toUpperCase()} + {protein.name} {protein.name} DEscDEscDEscDEsc DEscDEsc DEsc DEsc {protein.prob}
diff --git a/frontend/src/lib/openapi/models/SimilarProtein.ts b/frontend/src/lib/openapi/models/SimilarProtein.ts index 3dd33216..a222499a 100644 --- a/frontend/src/lib/openapi/models/SimilarProtein.ts +++ b/frontend/src/lib/openapi/models/SimilarProtein.ts @@ -5,5 +5,7 @@ export type SimilarProtein = { name: string; prob: number; + evalue: number; + description?: string; }; From 90ed1837df7beb03d6b5de2e9b48e09b9e79dc60 Mon Sep 17 00:00:00 2001 From: xnought Date: Tue, 27 Feb 2024 16:37:50 -0800 Subject: [PATCH 10/12] fix: format left side correctly in protein entry --- frontend/src/routes/Protein.svelte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/routes/Protein.svelte b/frontend/src/routes/Protein.svelte index 5bd38e00..45b08f31 100644 --- a/frontend/src/routes/Protein.svelte +++ b/frontend/src/routes/Protein.svelte @@ -145,7 +145,7 @@ From 327da76345169988c42358e7819c690986759016 Mon Sep 17 00:00:00 2001 From: xnought Date: Tue, 27 Feb 2024 16:56:26 -0800 Subject: [PATCH 12/12] feat: docs on foldseek --- docs/backend.md | 14 ++++++++++++++ docs/run.md | 2 ++ 2 files changed, 16 insertions(+) diff --git a/docs/backend.md b/docs/backend.md index d06a06bd..36d48c9f 100644 --- a/docs/backend.md +++ b/docs/backend.md @@ -130,3 +130,17 @@ https://github.com/xnought/venome/assets/65095341/c44f1d8c-0d58-407c-9aa2-29c4a9 this is where you can see print statements and other debug info / errors. + +## Foldseek + +For similarity search we use [Foldseek](https://github.com/steineggerlab/foldseek). + +Without foldseek installed nothing will be computed and no errors. No harm at all. + +However if you want to add foldseek run + +```bash +./run.sh add_foldseek +``` + +to the docker container and then it will compute. \ No newline at end of file diff --git a/docs/run.md b/docs/run.md index bce674f4..da77509d 100644 --- a/docs/run.md +++ b/docs/run.md @@ -57,6 +57,8 @@ or | `psql` | Opens up a direct terminal into the database to execute SQL commands live | | `upload_all` | Uploads all the pdb files to the system via POST requests | | `delete_all` | Deletes all protein entries and restarts the server from scratch | +| `add_foldseek` | installs foldseek onto the docker container via wget | +| `remove_foldseek` | deletes foldseek from the docker container | There are actually many more functions, so please check out [`run.sh`](../run.sh).