Venom-Biochem-Lab · xnought · Feb 29, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/backend/src/api/protein.py b/backend/src/api/protein.py
@@ -52,7 +52,7 @@ def decode_base64(b64_header_and_data: str):
  return b64decode(b64_data_only).decode("utf-8")
 
 
-def pdb_file_name(protein_name: str):
+def stored_pdb_file_name(protein_name: str):
  return os.path.join("src/data/pdbAlphaFold", protein_name) + ".pdb"
 
 
@@ -62,11 +62,16 @@ def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"):
  elif encoding == "b64":
  return PDB(decode_base64(file_contents), name)
  elif encoding == "file":
- return PDB(open(pdb_file_name(name), "r").read(), name)
+ return PDB(open(stored_pdb_file_name(name), "r").read(), name)
  else:
  raise ValueError(f"Invalid encoding: {encoding}")
 
 
+def format_protein_name(name: str):
+ name = name.replace(" ", "_")
+ return name
+
+
 def protein_name_found(name: str):
  """Checks if a protein name already exists in the database
  Returns: True if exists | False if not exists
@@ -103,7 +108,9 @@ class UploadPNGBody(CamelModel):
 @router.get("/protein/pdb/{protein_name:str}")
 def get_pdb_file(protein_name: str):
  if protein_name_found(protein_name):
- return FileResponse(pdb_file_name(protein_name), filename=protein_name + ".pdb")
+ return FileResponse(
+ stored_pdb_file_name(protein_name), filename=protein_name + ".pdb"
+ )
 
 
 @router.get("/protein/fasta/{protein_name:str}")
@@ -193,7 +200,7 @@ def delete_protein_entry(protein_name: str, req: Request):
  [protein_name],
  )
  # delete the file from the data/ folder
- os.remove(pdb_file_name(protein_name))
+ os.remove(stored_pdb_file_name(protein_name))
  except Exception as e:
  log.error(e)
 
@@ -212,6 +219,8 @@ def upload_protein_png(body: UploadPNGBody):
 @router.post("/protein/upload", response_model=UploadError | None)
 def upload_protein_entry(body: UploadBody, req: Request):
  requiresAuthentication(req)
+
+ body.name = format_protein_name(body.name)
  # check that the name is not already taken in the DB
  if protein_name_found(body.name):
  return UploadError.NAME_NOT_UNIQUE
@@ -225,7 +234,7 @@ def upload_protein_entry(body: UploadBody, req: Request):
 
  try:
  # write to file to data/ folder
- with open(pdb_file_name(pdb.name), "w") as f:
+ with open(stored_pdb_file_name(pdb.name), "w") as f:
  f.write(pdb.file_contents)
  except Exception:
  log.warn("Failed to write to file")
@@ -272,7 +281,9 @@ def edit_protein_entry(body: EditBody, req: Request):
  requiresAuthentication(req)
  try:
  if body.new_name != body.old_name:
- os.rename(pdb_file_name(body.old_name), pdb_file_name(body.new_name))
+ os.rename(
+ stored_pdb_file_name(body.old_name), stored_pdb_file_name(body.new_name)
+ )
 
  with Database() as db:
  name_changed = False

diff --git a/backend/src/api/search.py b/backend/src/api/search.py
@@ -3,10 +3,19 @@
 import logging as log
 from ..db import Database, bytea_to_str
 from ..api_types import CamelModel, ProteinEntry
+from ..foldseek import easy_search
+from .protein import stored_pdb_file_name
 
 router = APIRouter()
 
 
+class SimilarProtein(CamelModel):
+ name: str
+ prob: float
+ evalue: float
+ description: str = ""
+
+
 class RangeFilter(CamelModel):
  min: int | float
  max: int | float
@@ -51,6 +60,19 @@ def combine_where_clauses(clauses: list[str]) -> str:
  return result
 
 
+def get_descriptions(protein_names: list[str]):
+ if len(protein_names) > 0:
+ with Database() as db:
+ list_names = str(protein_names)[
+ 1:-1
+ ] # parse out the [] brackets and keep everything inside
+ query = f"""SELECT description FROM proteins WHERE name in ({list_names})"""
+ entry_sql = db.execute_return(query)
+ if entry_sql is not None:
+ return [d[0] for d in entry_sql]
+ return None
+
+
 def gen_sql_filters(
  species_filter: str | None,
  length_filter: RangeFilter | None = None,
@@ -111,7 +133,7 @@ def search_proteins(body: SearchProteinsBody):
  raise HTTPException(status_code=500, detail=str(e))
 
 
-@router.get("/search/range/length")
+@router.get("/search/range/length", response_model=RangeFilter)
 def search_range_length():
  try:
  with Database() as db:
@@ -123,7 +145,7 @@ def search_range_length():
  raise HTTPException(status_code=500, detail=str(e))
 
 
-@router.get("/search/range/mass")
+@router.get("/search/range/mass", response_model=RangeFilter)
 def search_range_mass():
  try:
  with Database() as db:
@@ -145,3 +167,35 @@ def search_species():
  return [d[0] for d in entry_sql]
  except Exception:
  return
+
+
+@router.get(
+ "/search/venome/similar/{protein_name:str}",
+ response_model=list[SimilarProtein],
+)
+def search_venome_similar(protein_name: str):
+ venome_folder = "/app/src/data/pdbAlphaFold/"
+ # ignore the first since it's itself as the most similar
+ try:
+ similar = easy_search(
+ stored_pdb_file_name(protein_name),
+ venome_folder,
+ out_format="target,prob,evalue",
+ )[1:]
+ formatted = [
+ SimilarProtein(name=name.rstrip(".pdb"), prob=prob, evalue=evalue)
+ for [name, prob, evalue] in similar
+ ]
+ except Exception:
+ raise HTTPException(404, "Foldseek not found on the system")
+
+ try:
+ # populate protein descriptions for the similar proteins
+ descriptions = get_descriptions([s.name for s in formatted])
+ if descriptions is not None:
+ for f, d in zip(formatted, descriptions):
+ f.description = d
+ except Exception:
+ raise HTTPException(500, "Error getting protein descriptions")
+
+ return formatted
diff --git a/backend/src/foldseek.py b/backend/src/foldseek.py
@@ -0,0 +1,99 @@
+import subprocess
+import logging as log
+import os
+
+
+def bash_cmd(cmd: str | list[str]) -> str:
+ return subprocess.check_output(cmd, shell=True).decode()
+
+
+FOLDSEEK_LOCATION = "/app/foldseek"
+FOLDSEEK_EXECUTABLE = f"{FOLDSEEK_LOCATION}/bin/foldseek"
+
+
+def assert_foldseek_installed():
+ if os.path.exists(FOLDSEEK_EXECUTABLE):
+ return
+ else:
+ raise ImportError(
+ "foldseek executable not installed. Try ./run.sh add_foldseek"
+ )
+
+
+active_caches = 0
+
+
+class CreateUniqueDirName:
+ """
+ Generates a new directory name
+ use this like
+ ```python
+ with GenerateDirName() as name:
+ print(name)
+ ```
+ on opening scope will create directory of the given name
+ on closing scope will delete directory of the given name
+ uses the global `active_caches` above to create a unique dir name
+ """
+
+ def __enter__(self):
+ global active_caches
+ active_caches += 1
+ self.temp_dir = f"{FOLDSEEK_LOCATION}/temp_dir_{active_caches}"
+ return self.temp_dir
+
+ def __exit__(self, *args):
+ global active_caches
+ active_caches -= 1
+ bash_cmd("rm -rf " + self.temp_dir)
+
+
+def parse_easy_search_output(filepath: str) -> list[list]:
+ with open(filepath, "r") as f:
+ lines = f.readlines()
+
+ parsed_lines = []
+ for line in lines:
+ parsed_line = []
+ for column in line.strip("\n").split("\t"):
+ try:
+ column = float(column)
+ except ValueError:
+ pass
+ parsed_line.append(column)
+ parsed_lines.append(parsed_line)
+
+ return parsed_lines
+
+
+def easy_search(
+ query: str,
+ target: str,
+ out_format: str = "query, target, prob",
+ print_loading_info=False,
+) -> list[list]:
+ """easy_search just calls foldseek easy-search under the hood
+ TODO: use pybind to call the C++ function instead
+
+ Returns:
+ list[list]: a list of the matches from the search where the inner list is the same size as out_format
+ """
+
+ assert_foldseek_installed()
+
+ with CreateUniqueDirName() as temp_dir:
+ out_file = temp_dir + "/output"
+
+ # Then call the easy-search
+ flags = f"--format-output {out_format}" if out_format else ""
+ cmd = f"{FOLDSEEK_EXECUTABLE} easy-search {query} {target} {out_file} {temp_dir} {flags}"
+ try:
+ stdout = bash_cmd(cmd)
+ except Exception as e:
+ log.warn(e)
+ return []
+
+ if print_loading_info:
+ log.warn(stdout)
+
+ return parse_easy_search_output(out_file)
diff --git a/docs/backend.md b/docs/backend.md
@@ -130,3 +130,17 @@ https://github.com/xnought/venome/assets/65095341/c44f1d8c-0d58-407c-9aa2-29c4a9
 
 
 this is where you can see print statements and other debug info / errors.
+
+## Foldseek
+
+For similarity search we use [Foldseek](https://github.com/steineggerlab/foldseek). 
+
+Without foldseek installed nothing will be computed and no errors. No harm at all.
+
+However if you want to add foldseek run
+
+```bash
+./run.sh add_foldseek
+```
+
+to the docker container and then it will compute.
diff --git a/docs/run.md b/docs/run.md
@@ -57,6 +57,8 @@ or
 | `psql` | Opens up a direct terminal into the database to execute SQL commands live |
 | `upload_all` | Uploads all the pdb files to the system via POST requests |
 | `delete_all` | Deletes all protein entries and restarts the server from scratch |
+| `add_foldseek` | installs foldseek onto the docker container via wget |
+| `remove_foldseek` | deletes foldseek from the docker container |
 
 There are actually many more functions, so please check out [`run.sh`](../run.sh).
 
diff --git a/frontend/src/lib/ListProteins.svelte b/frontend/src/lib/ListProteins.svelte
@@ -1,7 +1,11 @@
 <script lang="ts">
  import { navigate } from "svelte-routing";
  import type { ProteinEntry } from "./backend";
- import { numberWithCommas } from "./format";
+ import {
+ formatProteinName,
+ numberWithCommas,
+ undoFormatProteinName,
+ } from "./format";
 
  export let allEntries: ProteinEntry[] | null = null;
 </script>
@@ -25,7 +29,7 @@
  </div>
  <div class="prot-info">
  <div class="prot-name">
- {entry.name}
+ {undoFormatProteinName(entry.name)}
  </div>
  <div class="prot-desc">
  {#if entry.description}