Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: clustering and similarity search #155

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ee1e07b
feat: change favicon and add logo to assets
xnought Jan 26, 2024
4b59459
feat: add new color scheme and font
xnought Jan 26, 2024
a734f83
feat: add icons to the home and upload
xnought Jan 26, 2024
9641629
feat: add search to navbar
xnought Jan 26, 2024
99c697a
feat: make the top search work for all tabs
xnought Jan 26, 2024
415c2e6
feat: restyle markdown headers
xnought Jan 27, 2024
8b015d2
feat: fix title
xnought Jan 27, 2024
0e54746
feat: add fake description
xnought Jan 27, 2024
a4ac2e6
feat: replace cards with new component
xnought Jan 27, 2024
e8a398f
fix: reactivity and search
xnought Jan 27, 2024
9f950bc
feat: don't reroute home
xnought Jan 30, 2024
0cd6959
feat: add new nav for search
xnought Jan 30, 2024
b9315ca
feat: single entry
xnought Jan 30, 2024
91b1bd4
feat: proteins view
xnought Jan 30, 2024
f2fe3a7
fix: layout
xnought Jan 30, 2024
0761289
fix: layout
xnought Jan 30, 2024
ae27dd5
feat: skeleton for similar proteins
xnought Jan 31, 2024
4f8d406
feat: add similar proteins data
xnought Jan 31, 2024
56fddd8
fix: transition on outer element
xnought Jan 31, 2024
71df2cd
feat: give protein names underscores when we save
xnought Feb 1, 2024
2744e37
feat: download foldseek in the docker
xnought Feb 1, 2024
9969419
feat: similarity search
xnought Feb 1, 2024
3a4a0c6
Merge branch 'main' into clustering
xnought Feb 1, 2024
b034f76
refactor: move to api/
xnought Feb 1, 2024
c634622
feat: similar search across pdb
xnought Feb 1, 2024
58de4ae
feat: connect to frontend
xnought Feb 1, 2024
22f0923
fix: caches must be named differently
xnought Feb 1, 2024
b99c94d
feat: separate pdb and venome search into two tables
xnought Feb 1, 2024
67d03c8
feat: do everything with the foldseek folder
xnought Feb 1, 2024
fed9636
fix: ignore the first similar as the same
xnought Feb 1, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion backend/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
__pycache__/
.venv
.venv
.foldseek_cache/
foldseek/bin/foldseek
foldseek/README.md
foldseek-linux-sse2.tar.gz
pdb*
3 changes: 3 additions & 0 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,7 @@ RUN poetry config virtualenvs.in-project true

RUN poetry install

# Download, unpack and delete the foldseek archive in a single layer: a file
# removed by a later RUN instruction still takes up space in the earlier layer.
RUN wget https://mmseqs.com/foldseek/foldseek-linux-sse2.tar.gz \
    && tar xvzf foldseek-linux-sse2.tar.gz \
    && rm -f foldseek-linux-sse2.tar.gz

EXPOSE 8000
29 changes: 29 additions & 0 deletions backend/src/api/similar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from fastapi import APIRouter
from ..api_types import SimilarProtein
from ..protein import (
pdb_file_name,
revert_pdb_filename,
)
from ..foldseek import easy_search, external_db

# Router that collects the similarity-search endpoints defined in this module.
router = APIRouter()


# Evaluated once at import time: external_db returns the path
# "foldseek/dbs/pdb", running foldseek first when that path does not exist yet.
PDB = external_db("PDB", "foldseek/dbs/pdb")


@router.get("/similar-pdb/{protein_name:str}", response_model=list[SimilarProtein])
def get_similar_pdb(protein_name: str):
    """Search the PDB database for structures similar to a stored protein.

    The query is the stored PDB file of ``protein_name``. Each hit's name is
    the part of the foldseek target before its first ".".
    """
    hits = easy_search(pdb_file_name(protein_name), PDB, out_format="target,prob")

    matches = []
    for hit in hits:
        entry_name = hit[0].split(".")[0]
        matches.append(SimilarProtein(name=entry_name, prob=hit[1]))
    return matches


@router.get("/similar-venome/{protein_name:str}", response_model=list[SimilarProtein])
def get_similar_venome(protein_name: str):
    """Search the locally stored proteins for ones similar to ``protein_name``.

    The query file lives in the searched folder, so the first hit is taken to
    be the query itself and is dropped from the response.
    """
    hits = easy_search(
        pdb_file_name(protein_name),
        "src/data/pdbAlphaFold/",
        out_format="target,prob",
    )
    # the first will always be the query itself so we skip it
    others = hits[1:]

    matches = []
    for hit in others:
        matches.append(SimilarProtein(name=revert_pdb_filename(hit[0]), prob=hit[1]))
    return matches
5 changes: 5 additions & 0 deletions backend/src/api_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ class EditBody(CamelModel):
new_refs: str | None = None


class SimilarProtein(CamelModel):
    # One hit of a similarity search, as returned by the /similar-* endpoints.
    name: str  # name of the matched protein / structure
    prob: float  # value of foldseek's "prob" output column for this hit


class LoginBody(CamelModel):
email: str
password: str
Expand Down
156 changes: 156 additions & 0 deletions backend/src/foldseek.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import itertools
import logging as log
import os
import shlex
import shutil
import subprocess
from functools import lru_cache

# Database names that this module hands to `foldseek databases` (as opposed to
# local directories, which go through `foldseek createdb`).
EXTERNAL_DATABASES = [
    "Alphafold/UniProt",
    "Alphafold/UniProt50",
    "Alphafold/Proteome",
    "Alphafold/Swiss",
    "ESMAtlas30",
    "PDB",
]


def bash_cmd(cmd: str | list[str]) -> str:
return subprocess.check_output(cmd, shell=True).decode()


# Number of GenerateDirName contexts that are currently entered.
active_caches = 0

# Source of directory suffixes. It only ever counts up, so a name is never
# handed out again while an earlier context that got it may still be running.
_temp_dir_ids = itertools.count(1)


class GenerateDirName:
    """Context manager that provides a scratch directory name under foldseek/.

    Entering only chooses the name ("foldseek/temp_dir_<n>") and returns it;
    the directory itself is not created here. Leaving the context removes the
    directory tree at that path if one exists.
    """

    def __enter__(self):
        global active_caches
        active_caches += 1
        # Deriving the name from active_caches would reissue a name as soon as
        # any other context exits, even while this one is still in use.
        self.temp_dir = "foldseek/temp_dir_" + str(next(_temp_dir_ids))
        return self.temp_dir

    def __exit__(self, *args):
        global active_caches
        active_caches -= 1
        # Best-effort cleanup; a directory that was never created is not an error.
        shutil.rmtree(self.temp_dir, ignore_errors=True)


def to_columnar_array(arr: list[list]) -> list[list]:
    """Transpose a list of rows into a list of columns.

    Args:
        arr: Rows of equal length. The number of columns is taken from the
            first row.

    Returns:
        One list per column, each holding that column's value from every row.
        An empty input yields an empty list.
    """
    # An empty table has no first row to take the width from.
    if not arr:
        return []

    width = len(arr[0])
    return [[row[i] for row in arr] for i in range(width)]


def parse_output(filepath: str) -> list[list]:
    """Read a tab-separated result file into a list of rows.

    Every field that ``float()`` accepts is converted to a float; all other
    fields are kept as strings.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        One list per line of the file, holding that line's fields in order.
    """

    def _number_or_text(field):
        try:
            return float(field)
        except ValueError:
            return field

    with open(filepath, "r") as handle:
        raw_rows = handle.readlines()

    return [
        [_number_or_text(field) for field in raw_row.strip("\n").split("\t")]
        for raw_row in raw_rows
    ]


@lru_cache(maxsize=32)
def easy_search(
    query: str,
    target: str,
    out_format: str = "query, target, prob",
    print_stdout=False,
    foldseek_executable="./foldseek/bin/foldseek",
    columnar=False,
) -> list[list]:
    """Run ``foldseek easy-search`` and return its parsed result table.

    TODO: use pybind to call the C++ function instead

    Results are memoised per argument combination by ``lru_cache``. This
    includes the empty list returned for a failed run, and the same list
    object is handed to every caller, so treat it as read-only.

    Args:
        query: Query path handed to foldseek.
        target: Target path (folder or database) handed to foldseek.
        out_format: Comma-separated foldseek output columns. Whitespace around
            the column names is removed; a falsy value omits the
            ``--format-output`` flag.
        print_stdout: When true, log foldseek's stdout at warning level.
        foldseek_executable: Command used to invoke foldseek; inserted into
            the command line verbatim.
        columnar: When true, return one list per column instead of per row.

    Returns:
        list[list]: a list of the matches from the search, or ``[]`` when the
        foldseek command fails.
    """
    with GenerateDirName() as temp_dir:
        out_file = temp_dir + "/output"

        arguments = ["easy-search", query, target, out_file, temp_dir]
        if out_format:
            # foldseek takes the column list as ONE argument; stray spaces
            # (as in the default value) must not end up inside it.
            columns = ",".join(name.strip() for name in out_format.split(","))
            arguments += ["--format-output", columns]

        # `query` can originate from a request URL, so every argument is
        # quoted before the command line reaches the shell.
        quoted_arguments = " ".join(shlex.quote(arg) for arg in arguments)
        cmd = f"{foldseek_executable} {quoted_arguments}"
        try:
            stdout = bash_cmd(cmd)
        except Exception as e:
            # Deliberately best-effort: a failed search yields no matches.
            log.warning(e)
            return []

        if print_stdout:
            log.warning(stdout)

        # Must be read before the with-block ends and removes temp_dir.
        parsed_output = parse_output(out_file)
        return to_columnar_array(parsed_output) if columnar else parsed_output


def external_db(
    external_db_name: str,
    db_name: str,
    foldseek_executable="foldseek/bin/foldseek",
    print_stdout=False,
):
    """Fetch one of foldseek's named databases unless it already exists.

    Args:
        external_db_name: One of the names listed in ``EXTERNAL_DATABASES``.
        db_name: Path the database is written to; also the return value.
        foldseek_executable: Command used to invoke foldseek.
        print_stdout: When true, print foldseek's stdout.

    Returns:
        ``db_name``, unchanged.

    Raises:
        ValueError: If ``external_db_name`` is not a known database name.
        subprocess.CalledProcessError: If the foldseek command fails.
    """
    if external_db_name not in EXTERNAL_DATABASES:
        raise ValueError(
            f"Unknown external database {external_db_name!r}; "
            f"expected one of {EXTERNAL_DATABASES}"
        )

    # if database already exists, don't fetch it again
    if os.path.exists(db_name):
        return db_name

    with GenerateDirName() as temp_dir:
        # The name was validated above, so this is always a `databases` call;
        # `createdb` is only meant for local directories.
        cmd = f"{foldseek_executable} databases {external_db_name} {db_name} {temp_dir}"
        stdout = bash_cmd(cmd)
        if print_stdout:
            print(stdout)

    return db_name


def create_db(
    dir: str,
    db_name: str,
    foldseek_executable="foldseek/bin/foldseek",
    print_stdout=False,
    temp_dir=".foldseek_cache",
):
    """Create a foldseek database at ``db_name`` unless it already exists.

    Args:
        dir: Local directory to build the database from, or one of the names
            in ``EXTERNAL_DATABASES``. (The name shadows the builtin but is
            part of the keyword interface, so it is kept.)
        db_name: Path of the database to create; also the return value.
        foldseek_executable: Command used to invoke foldseek.
        print_stdout: When true, print foldseek's stdout.
        temp_dir: Scratch directory passed to ``foldseek databases``.

    Returns:
        ``db_name``, unchanged.

    Raises:
        FileNotFoundError: If ``dir`` is neither an existing path nor a known
            external database name.
        subprocess.CalledProcessError: If the foldseek command fails.
    """
    # check that our dir exists (without spawning a shell just to run `ls`)
    if not os.path.exists(dir) and dir not in EXTERNAL_DATABASES:
        raise FileNotFoundError(f"Directory {dir} not found")

    # if database already exists, don't create another
    if os.path.exists(db_name):
        return db_name

    if dir not in EXTERNAL_DATABASES:
        cmd = f"{foldseek_executable} createdb {dir} {db_name}"
    else:
        cmd = f"{foldseek_executable} databases {dir} {db_name} {temp_dir}"
    stdout = bash_cmd(cmd)
    if print_stdout:
        print(stdout)

    return db_name


if __name__ == "__main__":
    # search each protein in test_examples/ with every other one
    test_targets = create_db(dir="test_examples", db_name="test")
    output = easy_search(
        query=test_targets,
        target=test_targets,
        # The default column list contains spaces, which the shell would split
        # into separate arguments, so spell it out without them.
        out_format="query,target,prob",
    )
    print(output)
8 changes: 7 additions & 1 deletion backend/src/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@ def decode_base64(b64_header_and_data: str):


def pdb_file_name(protein_name: str):
    """Return the path of the stored PDB file for ``protein_name``.

    Spaces in the name are replaced with underscores; the file lives in
    ``src/data/pdbAlphaFold`` and carries a ``.pdb`` extension.
    """
    # The earlier, unconditional return of the un-normalised path is gone: it
    # made this underscore-normalising return unreachable.
    return (
        os.path.join("src/data/pdbAlphaFold", protein_name.replace(" ", "_")) + ".pdb"
    )


def revert_pdb_filename(file_name: str):
    """Turn a stored PDB file name back into a display name.

    Every ".pdb" occurrence is removed and every underscore becomes a space.
    """
    without_extension = file_name.replace(".pdb", "")
    display_name = without_extension.replace("_", " ")
    return display_name


def parse_protein_pdb(name: str, file_contents: str = "", encoding="str"):
Expand Down
10 changes: 8 additions & 2 deletions backend/src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,21 @@
from fastapi.responses import FileResponse, StreamingResponse
from .api_types import ProteinEntry, UploadBody, UploadError, EditBody
from .db import Database, bytea_to_str, str_to_bytea
from .protein import parse_protein_pdb, pdb_file_name, protein_name_found, pdb_to_fasta
from .protein import (
parse_protein_pdb,
pdb_file_name,
protein_name_found,
pdb_to_fasta,
)
from .setup import disable_cors, init_fastapi_app
from .api import users
from .api import users, similar


# Application object that the routes and routers below are registered on.
app = init_fastapi_app()
# Raises KeyError at import time when PUBLIC_FRONTEND_URL is not set.
# NOTE(review): despite its name this receives the frontend origin, so it
# presumably *allows* cross-origin requests from it — confirm in setup.py.
disable_cors(app, origins=[os.environ["PUBLIC_FRONTEND_URL"]])

# Register the routers defined in the api package.
app.include_router(users.router)
app.include_router(similar.router)


@app.get("/pdb/{protein_name:str}")
Expand Down
1 change: 1 addition & 0 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"openapi": "npx openapi-typescript-codegen --input http://localhost:8000/openapi.json --output ./src/openapi --client fetch"
},
"devDependencies": {
"@fontsource-variable/inter": "^5.0.16",
"@fontsource/fira-mono": "^4.5.10",
"@neoconfetti/svelte": "^1.0.0",
"@sveltejs/adapter-auto": "^2.0.0",
Expand Down
14 changes: 14 additions & 0 deletions frontend/src/app.postcss
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,17 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

/* Colour palette. Each colour is defined twice: as a bare "h, s%, l%"
   triplet (--*-hsl) and as a fully opaque hsla() colour built from it. */
:root {
  --darkblue-hsl: 205, 57%, 23%;
  --darkblue: hsla(var(--darkblue-hsl), 1);

  --lightblue-hsl: 198, 41%, 54%;
  --lightblue: hsla(var(--lightblue-hsl), 1);

  --darkorange-hsl: 27, 77%, 55%;
  --darkorange: hsla(var(--darkorange-hsl), 1);

  --lightorange-hsl: 38, 83%, 60%;
  --lightorange: hsla(var(--lightorange-hsl), 1);
}
12 changes: 12 additions & 0 deletions frontend/src/lib/EntryCard.svelte
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<script lang="ts">
	import { Card } from "flowbite-svelte";
	// Heading text rendered at the top of the card.
	export let title = "";
</script>

<!-- Card with a title heading; child content is rendered through the default slot. -->
<Card
	class="max-w-full mt-5"
	style="padding-top: 15px; color: var(--color-text);"
>
	<h2 class="text-darkblue mb-2">{title}</h2>
	<slot />
</Card>
Loading
Loading