diff --git a/backend/src/api/protein.py b/backend/src/api/protein.py index 51053ec0..f35f9a3e 100644 --- a/backend/src/api/protein.py +++ b/backend/src/api/protein.py @@ -52,7 +52,7 @@ def decode_base64(b64_header_and_data: str): return b64decode(b64_data_only).decode("utf-8") -def pdb_file_name(protein_name: str): +def stored_pdb_file_name(protein_name: str): return os.path.join("src/data/pdbAlphaFold", protein_name) + ".pdb" @@ -62,11 +62,16 @@ def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"): elif encoding == "b64": return PDB(decode_base64(file_contents), name) elif encoding == "file": - return PDB(open(pdb_file_name(name), "r").read(), name) + return PDB(open(stored_pdb_file_name(name), "r").read(), name) else: raise ValueError(f"Invalid encoding: {encoding}") +def format_protein_name(name: str): + name = name.replace(" ", "_") + return name + + def protein_name_found(name: str): """Checks if a protein name already exists in the database Returns: True if exists | False if not exists @@ -103,7 +108,9 @@ class UploadPNGBody(CamelModel): @router.get("/protein/pdb/{protein_name:str}") def get_pdb_file(protein_name: str): if protein_name_found(protein_name): - return FileResponse(pdb_file_name(protein_name), filename=protein_name + ".pdb") + return FileResponse( + stored_pdb_file_name(protein_name), filename=protein_name + ".pdb" + ) @router.get("/protein/fasta/{protein_name:str}") @@ -193,7 +200,7 @@ def delete_protein_entry(protein_name: str, req: Request): [protein_name], ) # delete the file from the data/ folder - os.remove(pdb_file_name(protein_name)) + os.remove(stored_pdb_file_name(protein_name)) except Exception as e: log.error(e) @@ -212,6 +219,8 @@ def upload_protein_png(body: UploadPNGBody): @router.post("/protein/upload", response_model=UploadError | None) def upload_protein_entry(body: UploadBody, req: Request): requiresAuthentication(req) + + body.name = format_protein_name(body.name) # check that the name is not already taken 
in the DB if protein_name_found(body.name): return UploadError.NAME_NOT_UNIQUE @@ -225,7 +234,7 @@ def upload_protein_entry(body: UploadBody, req: Request): try: # write to file to data/ folder - with open(pdb_file_name(pdb.name), "w") as f: + with open(stored_pdb_file_name(pdb.name), "w") as f: f.write(pdb.file_contents) except Exception: log.warn("Failed to write to file") @@ -272,7 +281,9 @@ def edit_protein_entry(body: EditBody, req: Request): requiresAuthentication(req) try: if body.new_name != body.old_name: - os.rename(pdb_file_name(body.old_name), pdb_file_name(body.new_name)) + os.rename( + stored_pdb_file_name(body.old_name), stored_pdb_file_name(body.new_name) + ) with Database() as db: name_changed = False diff --git a/backend/src/api/search.py b/backend/src/api/search.py index 4e80fab5..b35488d7 100644 --- a/backend/src/api/search.py +++ b/backend/src/api/search.py @@ -3,10 +3,19 @@ import logging as log from ..db import Database, bytea_to_str from ..api_types import CamelModel, ProteinEntry +from ..foldseek import easy_search +from .protein import stored_pdb_file_name router = APIRouter() +class SimilarProtein(CamelModel): + name: str + prob: float + evalue: float + description: str = "" + + class RangeFilter(CamelModel): min: int | float max: int | float @@ -51,6 +60,19 @@ def combine_where_clauses(clauses: list[str]) -> str: return result +def get_descriptions(protein_names: list[str]): + """Return the descriptions for protein_names in the same order ("" where a protein has no row or no description); None when protein_names is empty or the query yields no result.""" + if len(protein_names) > 0: + with Database() as db: + # bind the names as one array parameter instead of pasting str(list) into the SQL text; NOTE(review): assumes execute_return accepts a params list with psycopg-style %s placeholders -- confirm + query = """SELECT name, description FROM proteins WHERE name = ANY(%s)""" + entry_sql = db.execute_return(query, [protein_names]) + if entry_sql is not None: + found = {row[0]: row[1] or "" for row in entry_sql} # rows arrive in no guaranteed order and unknown names are simply absent + return [found.get(name, "") for name in protein_names] + return None + + def gen_sql_filters( species_filter: str | None, length_filter: RangeFilter | None = None, @@ -111,7 +133,7 @@ def search_proteins(body: SearchProteinsBody): raise HTTPException(status_code=500, detail=str(e))
-@router.get("/search/range/length") +@router.get("/search/range/length", response_model=RangeFilter) def search_range_length(): try: with Database() as db: @@ -123,7 +145,7 @@ def search_range_length(): raise HTTPException(status_code=500, detail=str(e)) -@router.get("/search/range/mass") +@router.get("/search/range/mass", response_model=RangeFilter) def search_range_mass(): try: with Database() as db: @@ -145,3 +167,35 @@ def search_species(): return [d[0] for d in entry_sql] except Exception: return + + +@router.get( + "/search/venome/similar/{protein_name:str}", + response_model=list[SimilarProtein], +) +def search_venome_similar(protein_name: str): + venome_folder = "/app/src/data/pdbAlphaFold/" + # ignore the first since it's itself as the most similar + try: + similar = easy_search( + stored_pdb_file_name(protein_name), + venome_folder, + out_format="target,prob,evalue", + )[1:] + formatted = [ + SimilarProtein(name=name.removesuffix(".pdb"), prob=prob, evalue=evalue) # removesuffix drops only the extension; rstrip(".pdb") also ate trailing p/d/b/. characters of the name + for [name, prob, evalue] in similar + ] + except Exception: + raise HTTPException(404, "Foldseek not found on the system") + + try: + # populate protein descriptions for the similar proteins + descriptions = get_descriptions([s.name for s in formatted]) + if descriptions is not None: + for f, d in zip(formatted, descriptions): + f.description = d + except Exception: + raise HTTPException(500, "Error getting protein descriptions") + + return formatted diff --git a/backend/src/foldseek.py b/backend/src/foldseek.py new file mode 100644 index 00000000..d9a0a706 --- /dev/null +++ b/backend/src/foldseek.py @@ -0,0 +1,99 @@ +import subprocess +import logging as log +import os + + +def bash_cmd(cmd: str | list[str]) -> str: + return subprocess.check_output(cmd, shell=isinstance(cmd, str)).decode() # only plain strings go through the shell; argv lists are executed directly so arguments are never re-parsed + + +FOLDSEEK_LOCATION = "/app/foldseek" +FOLDSEEK_EXECUTABLE = f"{FOLDSEEK_LOCATION}/bin/foldseek" + + +def assert_foldseek_installed(): + if os.path.exists(FOLDSEEK_EXECUTABLE): + return + else: + raise ImportError( +
"foldseek executable not installed. Try ./run.sh add_foldseek" + ) + + +active_caches = 0 + + +class CreateUniqueDirName: + """ + Generates a new directory name + use this like + ```python + with CreateUniqueDirName() as name: + print(name) + ``` + on opening scope only picks the directory name (the directory itself is not created here) + on closing scope will delete directory of the given name + uses the global `active_caches` above plus a random suffix to create a unique dir name + """ + + def __enter__(self): + global active_caches + active_caches += 1 + self.temp_dir = f"{FOLDSEEK_LOCATION}/temp_dir_{active_caches}_{os.urandom(4).hex()}" # the counter alone repeats when scopes overlap (it is decremented on exit), so add a random suffix + return self.temp_dir + + def __exit__(self, *args): + global active_caches + active_caches -= 1 + bash_cmd(["rm", "-rf", self.temp_dir]) # argv form: the path is handed to rm verbatim, not through a shell + + +def parse_easy_search_output(filepath: str) -> list[list]: + with open(filepath, "r") as f: + lines = f.readlines() + + parsed_lines = [] + for line in lines: + parsed_line = [] + for column in line.strip("\n").split("\t"): + try: + column = float(column) + except ValueError: + pass + parsed_line.append(column) + parsed_lines.append(parsed_line) + + return parsed_lines + + +def easy_search( + query: str, + target: str, + out_format: str = "query, target, prob", + print_loading_info=False, +) -> list[list]: + """easy_search just calls foldseek easy-search under the hood + TODO: use pybind to call the C++ function instead + + Returns: + list[list]: a list of the matches from the search where the inner list is the same size as out_format + """ + + assert_foldseek_installed() + + with CreateUniqueDirName() as temp_dir: + out_file = temp_dir + "/output" + + # Then call the easy-search + flags = ["--format-output", out_format.replace(" ", "")] if out_format else [] # drop spaces so the default "query, target, prob" is passed as one comma-separated value + cmd = [FOLDSEEK_EXECUTABLE, "easy-search", query, target, out_file, temp_dir, *flags] # argv list, so file names are never interpreted by a shell + try: + stdout = bash_cmd(cmd) + except Exception as e: + log.warning(e) + return [] + + if print_loading_info: + log.warning(stdout) + + return parse_easy_search_output(out_file) diff --git a/docs/backend.md b/docs/backend.md index
d06a06bd..36d48c9f 100644 --- a/docs/backend.md +++ b/docs/backend.md @@ -130,3 +130,17 @@ https://github.com/xnought/venome/assets/65095341/c44f1d8c-0d58-407c-9aa2-29c4a9 this is where you can see print statements and other debug info / errors. + +## Foldseek + +For similarity search we use [Foldseek](https://github.com/steineggerlab/foldseek). + +Without Foldseek installed, no similarity results are computed; the similar-proteins endpoint just answers that Foldseek was not found. No harm at all. + +However, if you want to add Foldseek, run + +```bash +./run.sh add_foldseek +``` + +to install it in the Docker container; similarity search will then be computed. \ No newline at end of file diff --git a/docs/run.md b/docs/run.md index bce674f4..da77509d 100644 --- a/docs/run.md +++ b/docs/run.md @@ -57,6 +57,8 @@ or | `psql` | Opens up a direct terminal into the database to execute SQL commands live | | `upload_all` | Uploads all the pdb files to the system via POST requests | | `delete_all` | Deletes all protein entries and restarts the server from scratch | +| `add_foldseek` | installs foldseek onto the docker container via wget | +| `remove_foldseek` | deletes foldseek from the docker container | There are actually many more functions, so please check out [`run.sh`](../run.sh). diff --git a/frontend/src/lib/ListProteins.svelte b/frontend/src/lib/ListProteins.svelte index 94af5c68..c6ecfeb5 100644 --- a/frontend/src/lib/ListProteins.svelte +++ b/frontend/src/lib/ListProteins.svelte @@ -1,7 +1,11 @@ @@ -25,7 +29,7 @@
- {entry.name} + {undoFormatProteinName(entry.name)}
{#if entry.description} diff --git a/frontend/src/lib/SimilarProteins.svelte b/frontend/src/lib/SimilarProteins.svelte index 0d787b0d..a7d797f7 100644 --- a/frontend/src/lib/SimilarProteins.svelte +++ b/frontend/src/lib/SimilarProteins.svelte @@ -1,62 +1,50 @@ - - - - - - - - {#each similarProteins as protein} - - - - - +
+
Source Name Desc. Prob.
- {protein.source.toUpperCase()} - {protein.name}DEscDEscDEscDEsc DEscDEsc DEsc DEsc {protein.prob}
+ + + + + - {/each} - ... click to see more -
Name Probability Match E-Value Description
+ {#each similarProteins as protein} + + + + + {undoFormatProteinName(protein.name)} + + {protein.prob} + {protein.evalue} + {protein.description} + + {/each} + +
diff --git a/frontend/src/lib/format.ts b/frontend/src/lib/format.ts index 66b7142d..0a1c8f75 100644 --- a/frontend/src/lib/format.ts +++ b/frontend/src/lib/format.ts @@ -13,3 +13,10 @@ export function fileToString(f: File): Promise { reader.onerror = reject; }); } + +export function formatProteinName(name: string) { + return name.replaceAll(" ", "_"); +} +export function undoFormatProteinName(name: string) { + return name.replaceAll("_", " "); +} diff --git a/frontend/src/lib/openapi/index.ts b/frontend/src/lib/openapi/index.ts index 373c870b..3a19e21e 100644 --- a/frontend/src/lib/openapi/index.ts +++ b/frontend/src/lib/openapi/index.ts @@ -15,6 +15,7 @@ export type { ProteinEntry } from './models/ProteinEntry'; export type { RangeFilter } from './models/RangeFilter'; export type { SearchProteinsBody } from './models/SearchProteinsBody'; export type { SearchProteinsResults } from './models/SearchProteinsResults'; +export type { SimilarProtein } from './models/SimilarProtein'; export type { Tutorial } from './models/Tutorial'; export type { UploadBody } from './models/UploadBody'; export { UploadError } from './models/UploadError'; diff --git a/frontend/src/lib/openapi/models/ProteinEntry.ts b/frontend/src/lib/openapi/models/ProteinEntry.ts index d375cabb..a25dd470 100644 --- a/frontend/src/lib/openapi/models/ProteinEntry.ts +++ b/frontend/src/lib/openapi/models/ProteinEntry.ts @@ -3,12 +3,13 @@ /* tslint:disable */ /* eslint-disable */ export type ProteinEntry = { - name: string; - length: number; - mass: number; - speciesName: string; - content?: string | null; - refs?: string | null; - thumbnail?: string | null; - description?: string | null; + name: string; + length: number; + mass: number; + speciesName: string; + content?: (string | null); + refs?: (string | null); + thumbnail?: (string | null); + description?: (string | null); }; + diff --git a/frontend/src/lib/openapi/models/SimilarProtein.ts b/frontend/src/lib/openapi/models/SimilarProtein.ts new file 
mode 100644 index 00000000..a222499a --- /dev/null +++ b/frontend/src/lib/openapi/models/SimilarProtein.ts @@ -0,0 +1,11 @@ +/* generated using openapi-typescript-codegen -- do no edit */ +/* istanbul ignore file */ +/* tslint:disable */ +/* eslint-disable */ +export type SimilarProtein = { + name: string; + prob: number; + evalue: number; + description?: string; +}; + diff --git a/frontend/src/lib/openapi/services/DefaultService.ts b/frontend/src/lib/openapi/services/DefaultService.ts index 8f45111f..2697272c 100644 --- a/frontend/src/lib/openapi/services/DefaultService.ts +++ b/frontend/src/lib/openapi/services/DefaultService.ts @@ -6,8 +6,10 @@ import type { EditBody } from '../models/EditBody'; import type { LoginBody } from '../models/LoginBody'; import type { LoginResponse } from '../models/LoginResponse'; import type { ProteinEntry } from '../models/ProteinEntry'; +import type { RangeFilter } from '../models/RangeFilter'; import type { SearchProteinsBody } from '../models/SearchProteinsBody'; import type { SearchProteinsResults } from '../models/SearchProteinsResults'; +import type { SimilarProtein } from '../models/SimilarProtein'; import type { Tutorial } from '../models/Tutorial'; import type { UploadBody } from '../models/UploadBody'; import type { UploadError } from '../models/UploadError'; @@ -56,10 +58,10 @@ export class DefaultService { } /** * Search Range Length - * @returns any Successful Response + * @returns RangeFilter Successful Response * @throws ApiError */ - public static searchRangeLength(): CancelablePromise { + public static searchRangeLength(): CancelablePromise { return __request(OpenAPI, { method: 'GET', url: '/search/range/length', @@ -67,10 +69,10 @@ export class DefaultService { } /** * Search Range Mass - * @returns any Successful Response + * @returns RangeFilter Successful Response * @throws ApiError */ - public static searchRangeMass(): CancelablePromise { + public static searchRangeMass(): CancelablePromise { return 
__request(OpenAPI, { method: 'GET', url: '/search/range/mass', @@ -87,6 +89,26 @@ export class DefaultService { url: '/search/species', }); } + /** + * Search Venome Similar + * @param proteinName + * @returns SimilarProtein Successful Response + * @throws ApiError + */ + public static searchVenomeSimilar( + proteinName: string, + ): CancelablePromise> { + return __request(OpenAPI, { + method: 'GET', + url: '/search/venome/similar/{protein_name}', + path: { + 'protein_name': proteinName, + }, + errors: { + 422: `Validation Error`, + }, + }); + } /** * Get Pdb File * @param proteinName diff --git a/frontend/src/routes/Protein.svelte b/frontend/src/routes/Protein.svelte index d912070c..45b08f31 100644 --- a/frontend/src/routes/Protein.svelte +++ b/frontend/src/routes/Protein.svelte @@ -4,7 +4,7 @@ import ProteinVis from "../lib/ProteinVis.svelte"; import { Button, Dropdown, DropdownItem } from "flowbite-svelte"; import Markdown from "../lib/Markdown.svelte"; - import { numberWithCommas } from "../lib/format"; + import { numberWithCommas, undoFormatProteinName } from "../lib/format"; import { navigate } from "svelte-routing"; import References from "../lib/References.svelte"; import { ChevronDownSolid, PenOutline } from "flowbite-svelte-icons"; @@ -27,8 +27,6 @@ entry = await Backend.getProteinEntry(urlId); // if we could not find the entry, the id is garbo if (entry == null) error = true; - - console.log("Received", entry); }); @@ -41,7 +39,7 @@

- {entry.name} + {undoFormatProteinName(entry.name)}

@@ -60,7 +58,9 @@
Structurally Similar Proteins - + {#if entry.name} + + {/if}
@@ -145,7 +145,7 @@