Skip to content

Commit

Permalink
Merge pull request #193 from xnought/search
Browse files Browse the repository at this point in the history
feat: title, description, and content fuzzy search
  • Loading branch information
ansengarvin authored Mar 6, 2024
2 parents 049f855 + 5ba3e70 commit ef4b431
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 27 deletions.
6 changes: 4 additions & 2 deletions backend/init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
-- Generated columns:
-- https://www.postgresql.org/docs/current/ddl-generated-columns.html

CREATE EXTENSION pg_trgm; -- for trigram matching fuzzy search similarity() func

/*
* Species Table
*/
Expand All @@ -28,8 +30,8 @@ CREATE TABLE proteins (
description text,
length integer, -- length of amino acid sequence
mass numeric, -- mass in amu/daltons
content bytea, -- stored markdown for the protein article (TODO: consider having a limit to how big this can be)
refs bytea, -- bibtex references mentioned in the content/article
content text, -- stored markdown for the protein article (TODO: consider having a limit to how big this can be)
refs text, -- bibtex references mentioned in the content/article
species_id integer NOT NULL,
thumbnail bytea, -- thumbnail image of the protein in base64 format
FOREIGN KEY (species_id) REFERENCES species(id) ON UPDATE CASCADE ON DELETE CASCADE
Expand Down
12 changes: 4 additions & 8 deletions backend/src/api/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,6 @@ def get_protein_entry(protein_name: str):
) = only_returned_entry

# if byte arrays are present, decode them into a string
if content is not None:
content = bytea_to_str(content)
if refs is not None:
refs = bytea_to_str(refs)
if thumbnail is not None:
thumbnail = bytea_to_str(thumbnail)

Expand Down Expand Up @@ -263,8 +259,8 @@ def upload_protein_entry(body: UploadBody, req: Request):
body.description,
pdb.num_amino_acids,
pdb.mass_daltons,
str_to_bytea(body.content),
str_to_bytea(body.refs),
body.content,
body.refs,
body.species_name,
],
)
Expand Down Expand Up @@ -310,7 +306,7 @@ def edit_protein_entry(body: EditBody, req: Request):
db.execute(
"""UPDATE proteins SET content = %s WHERE name = %s""",
[
str_to_bytea(body.new_content),
body.new_content,
body.old_name if not name_changed else body.new_name,
],
)
Expand All @@ -319,7 +315,7 @@ def edit_protein_entry(body: EditBody, req: Request):
db.execute(
"""UPDATE proteins SET refs = %s WHERE name = %s""",
[
str_to_bytea(body.new_refs),
body.new_refs,
body.old_name if not name_changed else body.new_name,
],
)
Expand Down
54 changes: 37 additions & 17 deletions backend/src/api/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,41 +74,61 @@ def get_descriptions(protein_names: list[str]):


def gen_sql_filters(
species_table: str,
proteins_table: str,
species_filter: str | None,
length_filter: RangeFilter | None = None,
mass_filter: RangeFilter | None = None,
) -> str:
filters = [
category_where_clause("species.name", species_filter),
range_where_clause("proteins.length", length_filter),
range_where_clause("proteins.mass", mass_filter),
category_where_clause(f"{species_table}.name", species_filter),
range_where_clause(f"{proteins_table}.length", length_filter),
range_where_clause(f"{proteins_table}.mass", mass_filter),
]
return " AND " + combine_where_clauses(filters) if any(filters) else ""


@router.post("/search/proteins", response_model=SearchProteinsResults)
def search_proteins(body: SearchProteinsBody):
title_query = sanitize_query(body.query)
text_query = sanitize_query(body.query)
with Database() as db:
try:
filter_clauses = gen_sql_filters(
body.species_filter, body.length_filter, body.mass_filter
"species",
"proteins_scores",
body.species_filter,
body.length_filter,
body.mass_filter,
)
entries_query = """SELECT proteins.name,
proteins.description,
proteins.length,
proteins.mass,
threshold = 0
score_filter = (
f"(proteins_scores.name_score >= {threshold} OR proteins_scores.desc_score >= {threshold} OR proteins_scores.content_score >= {threshold})" # keep rows where at least one similarity score is >= threshold
if len(text_query) > 0
else "TRUE" # show all scores
)
# TODO: this query is hard to follow; clean it up at some point
# NOTE: the scores are computed in a subquery because Postgres does not allow a WHERE clause to reference column aliases defined in the same SELECT list
entries_query = """SELECT proteins_scores.name,
proteins_scores.description,
proteins_scores.length,
proteins_scores.mass,
species.name,
proteins.thumbnail
FROM proteins
JOIN species ON species.id = proteins.species_id
WHERE proteins.name ILIKE %s"""
proteins_scores.thumbnail
FROM (SELECT *,
similarity(name, %s) as name_score,
similarity(description, %s) as desc_score,
similarity(content, %s) as content_score
FROM proteins) as proteins_scores
JOIN species ON species.id = proteins_scores.species_id
WHERE {} {}
ORDER BY (proteins_scores.name_score*4 + proteins_scores.desc_score*2 + proteins_scores.content_score) DESC;
""".format(
score_filter, filter_clauses
) # the multipliers in ORDER BY are ranking weights: name score x4, description score x2, content score x1
log.warn(filter_clauses)
entries_result = db.execute_return(
sanitize_query(entries_query + filter_clauses),
[
f"%{title_query}%",
],
sanitize_query(entries_query),
[text_query, text_query, text_query],
)
if entries_result is not None:
return SearchProteinsResults(
Expand Down
1 change: 1 addition & 0 deletions galaxy/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
master_venom_galaxy/

0 comments on commit ef4b431

Please sign in to comment.