diff --git a/backend/src/api/protein.py b/backend/src/api/protein.py
index 51053ec0..f35f9a3e 100644
--- a/backend/src/api/protein.py
+++ b/backend/src/api/protein.py
@@ -52,7 +52,7 @@ def decode_base64(b64_header_and_data: str):
return b64decode(b64_data_only).decode("utf-8")
-def pdb_file_name(protein_name: str):
+def stored_pdb_file_name(protein_name: str):
return os.path.join("src/data/pdbAlphaFold", protein_name) + ".pdb"
@@ -62,11 +62,16 @@ def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"):
elif encoding == "b64":
return PDB(decode_base64(file_contents), name)
elif encoding == "file":
- return PDB(open(pdb_file_name(name), "r").read(), name)
+ return PDB(open(stored_pdb_file_name(name), "r").read(), name)
else:
raise ValueError(f"Invalid encoding: {encoding}")
+def format_protein_name(name: str):
+    """Return `name` with every space replaced by an underscore."""
+    return name.replace(" ", "_")
+
+
def protein_name_found(name: str):
"""Checks if a protein name already exists in the database
Returns: True if exists | False if not exists
@@ -103,7 +108,9 @@ class UploadPNGBody(CamelModel):
@router.get("/protein/pdb/{protein_name:str}")
def get_pdb_file(protein_name: str):
if protein_name_found(protein_name):
- return FileResponse(pdb_file_name(protein_name), filename=protein_name + ".pdb")
+ return FileResponse(
+ stored_pdb_file_name(protein_name), filename=protein_name + ".pdb"
+ )
@router.get("/protein/fasta/{protein_name:str}")
@@ -193,7 +200,7 @@ def delete_protein_entry(protein_name: str, req: Request):
[protein_name],
)
# delete the file from the data/ folder
- os.remove(pdb_file_name(protein_name))
+ os.remove(stored_pdb_file_name(protein_name))
except Exception as e:
log.error(e)
@@ -212,6 +219,8 @@ def upload_protein_png(body: UploadPNGBody):
@router.post("/protein/upload", response_model=UploadError | None)
def upload_protein_entry(body: UploadBody, req: Request):
requiresAuthentication(req)
+
+ body.name = format_protein_name(body.name)
# check that the name is not already taken in the DB
if protein_name_found(body.name):
return UploadError.NAME_NOT_UNIQUE
@@ -225,7 +234,7 @@ def upload_protein_entry(body: UploadBody, req: Request):
try:
# write to file to data/ folder
- with open(pdb_file_name(pdb.name), "w") as f:
+ with open(stored_pdb_file_name(pdb.name), "w") as f:
f.write(pdb.file_contents)
except Exception:
log.warn("Failed to write to file")
@@ -272,7 +281,9 @@ def edit_protein_entry(body: EditBody, req: Request):
requiresAuthentication(req)
try:
if body.new_name != body.old_name:
- os.rename(pdb_file_name(body.old_name), pdb_file_name(body.new_name))
+ os.rename(
+ stored_pdb_file_name(body.old_name), stored_pdb_file_name(body.new_name)
+ )
with Database() as db:
name_changed = False
diff --git a/backend/src/api/search.py b/backend/src/api/search.py
index 4e80fab5..b35488d7 100644
--- a/backend/src/api/search.py
+++ b/backend/src/api/search.py
@@ -3,10 +3,19 @@
import logging as log
from ..db import Database, bytea_to_str
from ..api_types import CamelModel, ProteinEntry
+from ..foldseek import easy_search
+from .protein import stored_pdb_file_name
router = APIRouter()
+class SimilarProtein(CamelModel):
+    # One entry of the list returned by the similar-protein search endpoint.
+    # Documented with comments rather than a docstring on purpose.
+
+    # name of the matching protein (foldseek "target" column)
+    name: str
+    # foldseek "prob" output column for this match
+    prob: float
+    # foldseek "evalue" output column for this match
+    evalue: float
+    # filled in afterwards from the proteins table; stays "" when not populated
+    description: str = ""
+
+
class RangeFilter(CamelModel):
min: int | float
max: int | float
@@ -51,6 +60,19 @@ def combine_where_clauses(clauses: list[str]) -> str:
return result
+def get_descriptions(protein_names: list[str]):
+    """Look up the stored description of each protein in `protein_names`.
+
+    Returns:
+        A list with exactly one description per requested name, in the same
+        order as `protein_names` ("" when the protein has no description or
+        is not in the table), or None when `protein_names` is empty or the
+        query returned None.
+    """
+    if not protein_names:
+        return None
+
+    # One placeholder per name so the driver does the quoting; interpolating
+    # str(list) into the SQL broke on names containing quotes and allowed
+    # SQL injection.
+    placeholders = ", ".join(["%s"] * len(protein_names))
+    query = f"SELECT name, description FROM proteins WHERE name IN ({placeholders})"
+    with Database() as db:
+        # NOTE(review): assumes Database.execute_return accepts a params list
+        # with %s placeholders, like the other parameterised db calls - confirm
+        entry_sql = db.execute_return(query, list(protein_names))
+    if entry_sql is None:
+        return None
+
+    # SQL gives no ordering guarantee for IN (...), so re-align the rows with
+    # the requested names; callers pair the result with the names by position.
+    by_name = {name: description for name, description in entry_sql}
+    return [by_name.get(name) or "" for name in protein_names]
+
+
def gen_sql_filters(
species_filter: str | None,
length_filter: RangeFilter | None = None,
@@ -111,7 +133,7 @@ def search_proteins(body: SearchProteinsBody):
raise HTTPException(status_code=500, detail=str(e))
-@router.get("/search/range/length")
+@router.get("/search/range/length", response_model=RangeFilter)
def search_range_length():
try:
with Database() as db:
@@ -123,7 +145,7 @@ def search_range_length():
raise HTTPException(status_code=500, detail=str(e))
-@router.get("/search/range/mass")
+@router.get("/search/range/mass", response_model=RangeFilter)
def search_range_mass():
try:
with Database() as db:
@@ -145,3 +167,35 @@ def search_species():
return [d[0] for d in entry_sql]
except Exception:
return
+
+
+@router.get(
+    "/search/venome/similar/{protein_name:str}",
+    response_model=list[SimilarProtein],
+)
+def search_venome_similar(protein_name: str):
+    # Runs a foldseek search of the stored PDB file of `protein_name` against
+    # every PDB file in the venome folder and returns the matches together
+    # with their descriptions.
+    # Raises HTTPException 404 when the search step fails and 500 when the
+    # description lookup fails.
+    venome_folder = "/app/src/data/pdbAlphaFold/"
+    try:
+        # ignore the first since it's itself as the most similar
+        similar = easy_search(
+            stored_pdb_file_name(protein_name),
+            venome_folder,
+            out_format="target,prob,evalue",
+        )[1:]
+        formatted = [
+            # removesuffix drops exactly one trailing ".pdb"; rstrip(".pdb")
+            # strips any run of the characters '.', 'p', 'd', 'b' and would
+            # turn "abcd.pdb" into "abc"
+            SimilarProtein(name=name.removesuffix(".pdb"), prob=prob, evalue=evalue)
+            for [name, prob, evalue] in similar
+        ]
+    except Exception as e:
+        raise HTTPException(404, "Foldseek not found on the system") from e
+
+    try:
+        # populate protein descriptions for the similar proteins
+        # NOTE(review): zip pairs matches and descriptions by position, so this
+        # relies on get_descriptions returning one description per name in the
+        # same order as the names passed in - confirm
+        descriptions = get_descriptions([s.name for s in formatted])
+        if descriptions is not None:
+            for f, d in zip(formatted, descriptions):
+                f.description = d
+    except Exception as e:
+        raise HTTPException(500, "Error getting protein descriptions") from e
+
+    return formatted
diff --git a/backend/src/foldseek.py b/backend/src/foldseek.py
new file mode 100644
index 00000000..d9a0a706
--- /dev/null
+++ b/backend/src/foldseek.py
@@ -0,0 +1,99 @@
+import logging as log
+import os
+import shutil
+import subprocess
+import tempfile
+
+
+def bash_cmd(cmd: str | list[str]) -> str:
+    """Run a command and return its standard output decoded as text.
+
+    Args:
+        cmd: either a shell command line (str), which is run through the
+            shell, or an argument vector (list[str]), which is executed
+            directly without a shell.
+
+    Raises:
+        subprocess.CalledProcessError: if the command exits with a non-zero
+            status.
+    """
+    # shell=True together with a list would run only cmd[0] and hand the
+    # remaining items to the shell itself, so only use the shell for strings.
+    return subprocess.check_output(cmd, shell=isinstance(cmd, str)).decode()
+
+
+# Directory that holds the foldseek install; the per-search temp directories
+# are placed under it as well.
+FOLDSEEK_LOCATION = "/app/foldseek"
+# Path of the foldseek binary; its existence is what assert_foldseek_installed checks.
+FOLDSEEK_EXECUTABLE = f"{FOLDSEEK_LOCATION}/bin/foldseek"
+
+
+def assert_foldseek_installed():
+    """Raise ImportError unless a file exists at FOLDSEEK_EXECUTABLE."""
+    if not os.path.exists(FOLDSEEK_EXECUTABLE):
+        raise ImportError(
+            "foldseek executable not installed. Try ./run.sh add_foldseek"
+        )
+
+
+# Number of CreateUniqueDirName scopes currently open: incremented in
+# __enter__ and decremented in __exit__.
+active_caches = 0
+
+
+class CreateUniqueDirName:
+    """
+    Context manager that provides a scratch directory under FOLDSEEK_LOCATION
+    use this like
+    ```python
+    with CreateUniqueDirName() as name:
+        print(name)
+    ```
+    on opening scope will create a new, uniquely named directory and return its path
+    on closing scope will delete that directory and everything inside it
+    the global `active_caches` above counts how many scopes are currently open
+
+    Raises:
+        OSError: on opening scope, if the directory cannot be created
+            (for example because FOLDSEEK_LOCATION does not exist)
+    """
+
+    def __enter__(self):
+        global active_caches
+        # mkdtemp picks a name that does not exist yet and creates it, so
+        # overlapping scopes can never be handed the same directory (naming
+        # the directory after an up/down counter reused names as soon as an
+        # older scope closed while a newer one was still open).
+        self.temp_dir = tempfile.mkdtemp(prefix="temp_dir_", dir=FOLDSEEK_LOCATION)
+        # only count the scope once the directory really exists
+        active_caches += 1
+        return self.temp_dir
+
+    def __exit__(self, *args):
+        global active_caches
+        active_caches -= 1
+        # same effect as `rm -rf`, without building a shell command line
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+
+def parse_easy_search_output(filepath: str) -> list[list]:
+    """Read a tab-separated result file into a list of rows.
+
+    Every cell that `float()` accepts is converted to a float; every other
+    cell is kept as the original string.
+    """
+
+    def _coerce(cell: str):
+        # numeric columns become floats, everything else stays text
+        try:
+            return float(cell)
+        except ValueError:
+            return cell
+
+    with open(filepath, "r") as handle:
+        rows = handle.readlines()
+
+    return [[_coerce(cell) for cell in row.strip("\n").split("\t")] for row in rows]
+
+
+def easy_search(
+    query: str,
+    target: str,
+    out_format: str = "query, target, prob",
+    print_loading_info=False,
+) -> list[list]:
+    """easy_search just calls foldseek easy-search under the hood
+    TODO: use pybind to call the C++ function instead
+
+    Args:
+        query: path handed to foldseek as the query
+        target: path handed to foldseek as the target to search in
+        out_format: comma separated column names for --format-output;
+            whitespace around the names is ignored, an empty value omits the flag
+        print_loading_info: when true, log what foldseek printed to stdout
+
+    Returns:
+        list[list]: a list of the matches from the search where the inner list is the same size as out_format.
+        Empty list when the foldseek command could not be run or failed.
+
+    Raises:
+        ImportError: when the foldseek executable is not installed
+    """
+
+    assert_foldseek_installed()
+
+    with CreateUniqueDirName() as temp_dir:
+        out_file = temp_dir + "/output"
+
+        # Build an argument vector and run it without a shell: the query path
+        # contains a user-supplied protein name, which must never be
+        # interpreted as shell syntax.
+        argv = [FOLDSEEK_EXECUTABLE, "easy-search", query, target, out_file, temp_dir]
+        if out_format:
+            # pass the column list as one argument without embedded spaces
+            # (the default value is written with spaces after the commas)
+            columns = ",".join(column.strip() for column in out_format.split(","))
+            argv += ["--format-output", columns]
+
+        try:
+            stdout = subprocess.check_output(argv).decode()
+        except (subprocess.SubprocessError, OSError, ValueError) as e:
+            # best effort: a failed search is reported as "no matches"
+            log.warning(e)
+            return []
+
+        if print_loading_info:
+            log.warning(stdout)
+
+        return parse_easy_search_output(out_file)
diff --git a/docs/backend.md b/docs/backend.md
index d06a06bd..36d48c9f 100644
--- a/docs/backend.md
+++ b/docs/backend.md
@@ -130,3 +130,17 @@ https://github.com/xnought/venome/assets/65095341/c44f1d8c-0d58-407c-9aa2-29c4a9
this is where you can see print statements and other debug info / errors.
+
+## Foldseek
+
+For similarity search we use [Foldseek](https://github.com/steineggerlab/foldseek).
+
+Without Foldseek installed, similarity search computes nothing and the rest of the app keeps working without errors.
+
+However, if you want to add Foldseek, run
+
+```bash
+./run.sh add_foldseek
+```
+
+to install it in the Docker container; similarity results will then be computed.
\ No newline at end of file
diff --git a/docs/run.md b/docs/run.md
index bce674f4..da77509d 100644
--- a/docs/run.md
+++ b/docs/run.md
@@ -57,6 +57,8 @@ or
| `psql` | Opens up a direct terminal into the database to execute SQL commands live |
| `upload_all` | Uploads all the pdb files to the system via POST requests |
| `delete_all` | Deletes all protein entries and restarts the server from scratch |
+| `add_foldseek` | Installs foldseek onto the docker container via wget |
+| `remove_foldseek` | Deletes foldseek from the docker container |
There are actually many more functions, so please check out [`run.sh`](../run.sh).
diff --git a/frontend/src/lib/ListProteins.svelte b/frontend/src/lib/ListProteins.svelte
index 94af5c68..c6ecfeb5 100644
--- a/frontend/src/lib/ListProteins.svelte
+++ b/frontend/src/lib/ListProteins.svelte
@@ -1,7 +1,11 @@
@@ -25,7 +29,7 @@
- {entry.name}
+ {undoFormatProteinName(entry.name)}
{#if entry.description}
diff --git a/frontend/src/lib/SimilarProteins.svelte b/frontend/src/lib/SimilarProteins.svelte
index 0d787b0d..a7d797f7 100644
--- a/frontend/src/lib/SimilarProteins.svelte
+++ b/frontend/src/lib/SimilarProteins.svelte
@@ -1,62 +1,50 @@
-
-
- Source |
- Name |
- Desc. |
- Prob. |
-
- {#each similarProteins as protein}
-
-
- {protein.source.toUpperCase()}
- |
- {protein.name} |
- DEscDEscDEscDEsc DEscDEsc DEsc DEsc |
- {protein.prob} |
+
+
+
+ Name |
+ Probability Match |
+ E-Value |
+ Description |
- {/each}
- ... click to see more
-
+ {#each similarProteins as protein}
+
+
+
+
+ {undoFormatProteinName(protein.name)}
+ |
+ {protein.prob} |
+ {protein.evalue} |
+ {protein.description} |
+
+ {/each}
+
+
diff --git a/frontend/src/lib/format.ts b/frontend/src/lib/format.ts
index 66b7142d..0a1c8f75 100644
--- a/frontend/src/lib/format.ts
+++ b/frontend/src/lib/format.ts
@@ -13,3 +13,10 @@ export function fileToString(f: File): Promise
{
reader.onerror = reject;
});
}
+
+/**
+ * Converts a protein name to its underscore form by replacing every space
+ * with an underscore.
+ */
+export function formatProteinName(name: string) {
+	const words = name.split(" ");
+	return words.join("_");
+}
+/**
+ * Converts a protein name to its display form by replacing every underscore
+ * with a space. Note that underscores that were part of the name before
+ * formatting are turned into spaces as well.
+ */
+export function undoFormatProteinName(name: string) {
+	const words = name.split("_");
+	return words.join(" ");
+}
diff --git a/frontend/src/lib/openapi/index.ts b/frontend/src/lib/openapi/index.ts
index 373c870b..3a19e21e 100644
--- a/frontend/src/lib/openapi/index.ts
+++ b/frontend/src/lib/openapi/index.ts
@@ -15,6 +15,7 @@ export type { ProteinEntry } from './models/ProteinEntry';
export type { RangeFilter } from './models/RangeFilter';
export type { SearchProteinsBody } from './models/SearchProteinsBody';
export type { SearchProteinsResults } from './models/SearchProteinsResults';
+export type { SimilarProtein } from './models/SimilarProtein';
export type { Tutorial } from './models/Tutorial';
export type { UploadBody } from './models/UploadBody';
export { UploadError } from './models/UploadError';
diff --git a/frontend/src/lib/openapi/models/ProteinEntry.ts b/frontend/src/lib/openapi/models/ProteinEntry.ts
index d375cabb..a25dd470 100644
--- a/frontend/src/lib/openapi/models/ProteinEntry.ts
+++ b/frontend/src/lib/openapi/models/ProteinEntry.ts
@@ -3,12 +3,13 @@
/* tslint:disable */
/* eslint-disable */
export type ProteinEntry = {
- name: string;
- length: number;
- mass: number;
- speciesName: string;
- content?: string | null;
- refs?: string | null;
- thumbnail?: string | null;
- description?: string | null;
+ name: string;
+ length: number;
+ mass: number;
+ speciesName: string;
+ content?: (string | null);
+ refs?: (string | null);
+ thumbnail?: (string | null);
+ description?: (string | null);
};
+
diff --git a/frontend/src/lib/openapi/models/SimilarProtein.ts b/frontend/src/lib/openapi/models/SimilarProtein.ts
new file mode 100644
index 00000000..a222499a
--- /dev/null
+++ b/frontend/src/lib/openapi/models/SimilarProtein.ts
@@ -0,0 +1,11 @@
+/* generated using openapi-typescript-codegen -- do no edit */
+/* istanbul ignore file */
+/* tslint:disable */
+/* eslint-disable */
+export type SimilarProtein = {
+ name: string;
+ prob: number;
+ evalue: number;
+ description?: string;
+};
+
diff --git a/frontend/src/lib/openapi/services/DefaultService.ts b/frontend/src/lib/openapi/services/DefaultService.ts
index 8f45111f..2697272c 100644
--- a/frontend/src/lib/openapi/services/DefaultService.ts
+++ b/frontend/src/lib/openapi/services/DefaultService.ts
@@ -6,8 +6,10 @@ import type { EditBody } from '../models/EditBody';
import type { LoginBody } from '../models/LoginBody';
import type { LoginResponse } from '../models/LoginResponse';
import type { ProteinEntry } from '../models/ProteinEntry';
+import type { RangeFilter } from '../models/RangeFilter';
import type { SearchProteinsBody } from '../models/SearchProteinsBody';
import type { SearchProteinsResults } from '../models/SearchProteinsResults';
+import type { SimilarProtein } from '../models/SimilarProtein';
import type { Tutorial } from '../models/Tutorial';
import type { UploadBody } from '../models/UploadBody';
import type { UploadError } from '../models/UploadError';
@@ -56,10 +58,10 @@ export class DefaultService {
}
/**
* Search Range Length
- * @returns any Successful Response
+ * @returns RangeFilter Successful Response
* @throws ApiError
*/
- public static searchRangeLength(): CancelablePromise {
+ public static searchRangeLength(): CancelablePromise {
return __request(OpenAPI, {
method: 'GET',
url: '/search/range/length',
@@ -67,10 +69,10 @@ export class DefaultService {
}
/**
* Search Range Mass
- * @returns any Successful Response
+ * @returns RangeFilter Successful Response
* @throws ApiError
*/
- public static searchRangeMass(): CancelablePromise {
+ public static searchRangeMass(): CancelablePromise {
return __request(OpenAPI, {
method: 'GET',
url: '/search/range/mass',
@@ -87,6 +89,26 @@ export class DefaultService {
url: '/search/species',
});
}
+ /**
+ * Search Venome Similar
+ * @param proteinName
+ * @returns SimilarProtein Successful Response
+ * @throws ApiError
+ */
+ public static searchVenomeSimilar(
+ proteinName: string,
+ ): CancelablePromise> {
+ return __request(OpenAPI, {
+ method: 'GET',
+ url: '/search/venome/similar/{protein_name}',
+ path: {
+ 'protein_name': proteinName,
+ },
+ errors: {
+ 422: `Validation Error`,
+ },
+ });
+ }
/**
* Get Pdb File
* @param proteinName
diff --git a/frontend/src/routes/Protein.svelte b/frontend/src/routes/Protein.svelte
index d912070c..45b08f31 100644
--- a/frontend/src/routes/Protein.svelte
+++ b/frontend/src/routes/Protein.svelte
@@ -4,7 +4,7 @@
import ProteinVis from "../lib/ProteinVis.svelte";
import { Button, Dropdown, DropdownItem } from "flowbite-svelte";
import Markdown from "../lib/Markdown.svelte";
- import { numberWithCommas } from "../lib/format";
+ import { numberWithCommas, undoFormatProteinName } from "../lib/format";
import { navigate } from "svelte-routing";
import References from "../lib/References.svelte";
import { ChevronDownSolid, PenOutline } from "flowbite-svelte-icons";
@@ -27,8 +27,6 @@
entry = await Backend.getProteinEntry(urlId);
// if we could not find the entry, the id is garbo
if (entry == null) error = true;
-
- console.log("Received", entry);
});
@@ -41,7 +39,7 @@
- {entry.name}
+ {undoFormatProteinName(entry.name)}
@@ -60,7 +58,9 @@
Structurally Similar Proteins
-
+ {#if entry.name}
+
+ {/if}
@@ -145,7 +145,7 @@