sim search refactoring (#466)
* annotation scaling WIP

* add Document index

* working suggestions

* working typesense search

* working qdrant

* sim search interface and services

* adjust env variables

* adapt docker compose profiles

* dwts -> dats

* fix tests


---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
3 people authored Nov 20, 2024
1 parent 7767297 commit f306212
Showing 23 changed files with 1,398 additions and 387 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/backend_checks.yml
@@ -38,8 +38,8 @@ jobs:
       - name: Build & Start Docker Containers
         working-directory: docker
         run: |
-          COMPOSE_PROFILES="ray,background,backend" docker compose build
-          COMPOSE_PROFILES="ray,background" docker compose up --wait --quiet-pull
+          COMPOSE_PROFILES="weaviate,ray,background,backend" docker compose build
+          COMPOSE_PROFILES="weaviate,ray,background" docker compose up --wait --quiet-pull
       - name: Check 1 - pytest runs without errors
         working-directory: docker
         run: |
@@ -55,7 +55,7 @@
       - name: Start Remaining Docker Containers
         working-directory: docker
         run: |
-          COMPOSE_PROFILES="ray,background,backend" docker compose up --wait --quiet-pull
+          COMPOSE_PROFILES="weaviate,ray,background,backend" docker compose up --wait --quiet-pull
       - name: Check 4 - Test End-2-End importer script
         working-directory: tools/importer
         env:
2 changes: 1 addition & 1 deletion .github/workflows/frontend_checks.yml
@@ -16,7 +16,7 @@ jobs:
     env:
       API_WORKERS: 1
       VITE_APP_SERVER: http://localhost:13120
-      COMPOSE_PROFILES: "ray,background,backend,frontend"
+      COMPOSE_PROFILES: "weaviate,ray,background,backend,frontend"
       RAY_CONFIG: "config_gpu.yaml"
       JWT_SECRET: ${{ secrets.JWT_SECRET }}
     steps:
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -14,6 +14,9 @@
     "editor.tabSize": 2,
     "editor.insertSpaces": true
   },
+  "[dockercompose]": {
+    "editor.defaultFormatter": "ms-azuretools.vscode-docker"
+  },
   "python.testing.pytestArgs": ["test"],
   "python.testing.unittestEnabled": false,
   "python.testing.pytestEnabled": true,
1 change: 1 addition & 0 deletions backend/environment.yml
@@ -38,6 +38,7 @@ dependencies:
   - python-magic=0.4
   - python-multipart=0.0.5
   - python=3.11
+  - qdrant-client=1.9.1
   - readability-lxml=0.8.1
   - redis-py=4.3
   - rope=1.9.0
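The new qdrant-client pin backs the Qdrant variant of the sim search service. A minimal connection sketch, assuming a locally running Qdrant on its default port (host, port, collection name, and vector size here are illustrative placeholders, not values from this commit):

import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Hypothetical endpoint; the real host/port come from the env/docker config.
client = QdrantClient(host="localhost", port=6333)

# Create (or recreate) a collection for sentence embeddings.
client.recreate_collection(
    collection_name="sentences",
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)
print(client.get_collections())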
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -6,6 +6,7 @@ pytest-order==1.2.1
 Scrapy==2.10.0
 scrapy-playwright==0.0.31
 scrapy-selenium==0.0.7
+typesense==0.21.0
 weaviate-client==3.24.1
 webdriver-manager==4.0.1
 yake==0.4.8
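Likewise, a sketch of constructing the matching typesense client for the Typesense-backed search path (API key, host, and port are assumed placeholders; the real values live in the docker/env configuration):

import typesense

client = typesense.Client(
    {
        "api_key": "dev-key",  # placeholder; use the key from the env config
        "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
        "connection_timeout_seconds": 2,
    }
)
print(client.collections.retrieve())  # list existing collections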
32 changes: 32 additions & 0 deletions backend/src/api/endpoints/annoscaling.py
@@ -0,0 +1,32 @@
from typing import List

from fastapi import APIRouter, Depends

from api.dependencies import get_current_user
from app.core.annoscaling.annoscaling_service import AnnoScalingService
from app.core.authorization.authz_user import AuthzUser

router = APIRouter(
    prefix="/annoscaling",
    tags=["annoscaling"],
    dependencies=[Depends(get_current_user)],
)

ass: AnnoScalingService = AnnoScalingService()


@router.post(
    "/suggest",
    summary="Suggest annotations",
    description="Suggest annotations",
)
async def suggest(
    *,
    project_id: int,
    code_id: int,
    top_k: int,
    authz_user: AuthzUser = Depends(),
) -> List[str]:
    authz_user.assert_in_project(project_id)

    return ass.suggest(project_id, [authz_user.user.id], code_id, top_k)
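For illustration, calling the new endpoint over HTTP might look like the sketch below. The base URL and bearer token are placeholders that depend on the deployment; the route takes its scalar arguments as query parameters, and get_current_user requires authentication:

import requests

BASE_URL = "http://localhost:5500"  # placeholder; depends on the deployment
TOKEN = "<jwt-from-login>"  # placeholder auth token

resp = requests.post(
    f"{BASE_URL}/annoscaling/suggest",
    params={"project_id": 1, "code_id": 42, "top_k": 10},
    headers={"Authorization": f"Bearer {TOKEN}"},
)
resp.raise_for_status()
print(resp.json())  # list of suggested sentence texts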
Empty file.
120 changes: 120 additions & 0 deletions backend/src/app/core/annoscaling/annoscaling_service.py
@@ -0,0 +1,120 @@
from time import perf_counter_ns
from typing import Dict, Iterable, List, Tuple

import numpy as np

from app.core.data.orm.annotation_document import AnnotationDocumentORM
from app.core.data.orm.code import CodeORM
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.source_document_data import SourceDocumentDataORM
from app.core.data.orm.span_annotation import SpanAnnotationORM
from app.core.db.simsearch_service import SimSearchService
from app.core.db.sql_service import SQLService

# from app.core.search.typesense_service import TypesenseService
from app.util.singleton_meta import SingletonMeta


class AnnoScalingService(metaclass=SingletonMeta):
    def __new__(cls, *args, **kwargs):
        cls.sqls = SQLService()
        cls.sim = SimSearchService()
        # cls.ts = TypesenseService()

        return super(AnnoScalingService, cls).__new__(cls)

    def suggest(
        self,
        project_id: int,
        user_ids: List[int],
        code_id: int,
        top_k: int,
    ) -> List[str]:
        start_time = perf_counter_ns()
        # takes 4ms (small project)
        occurrences = self.__get_annotations(project_id, user_ids, code_id)
        end_time = perf_counter_ns()
        print("it took", end_time - start_time, "ns to get annotations from the DB")

        start_time = perf_counter_ns()
        # takes 2ms (small project)
        sdoc_sentences = self.__get_sentences({id for _, _, id in occurrences})
        end_time = perf_counter_ns()
        print("it took", end_time - start_time, "ns to get sentences from the DB")

        sdoc_sent_ids = []

        start_time = perf_counter_ns()
        # takes around 0.1ms per annotation
        for start, end, sdoc_id in occurrences:
            # TODO loops are bad, need a much faster way to link annotations to sentences
            # best: do everything in DB and only return sentence ID per annotation
            # alternative: load all from DB (in chunks?) and compute via numpy
            starts, ends, _ = sdoc_sentences[sdoc_id]
            sent_match = self.__best_match(starts, ends, start, end)
            sdoc_sent_ids.append((sdoc_id, sent_match))
        end_time = perf_counter_ns()
        print("it took", end_time - start_time, "ns to match annotations to sentences")
        start_time = perf_counter_ns()
        # takes around 20ms per object. so, 50 annotations take already 1 full second
        hits = self.sim.suggest_similar_sentences(project_id, sdoc_sent_ids, top_k)
        end_time = perf_counter_ns()
        print(
            "it took", end_time - start_time, "ns to get similar sentences from index"
        )
        sim_doc_sentences = self.__get_sentences({hit.sdoc_id for hit in hits})

        texts = []
        for hit in hits:
            starts, ends, content = sim_doc_sentences[hit.sdoc_id]
            texts.append(content[starts[hit.sentence_id] : ends[hit.sentence_id]])
        return texts

    def __get_annotations(self, project_id: int, user_ids: List[int], code_id: int):
        with self.sqls.db_session() as db:
            query = (
                db.query(
                    SpanAnnotationORM,
                    SourceDocumentORM.id,
                )
                .join(
                    AnnotationDocumentORM,
                    AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
                )
                .join(
                    SpanAnnotationORM,
                    SpanAnnotationORM.annotation_document_id
                    == AnnotationDocumentORM.id,
                )
                .join(
                    CodeORM,
                    CodeORM.id == SpanAnnotationORM.code_id,
                )
                .filter(
                    SourceDocumentORM.project_id == project_id,
                    AnnotationDocumentORM.user_id.in_(user_ids),
                    CodeORM.id == code_id,
                )
            )
            res = query.all()
            return [(r[0].begin, r[0].end, r[1]) for r in res]

    def __get_sentences(
        self, sdoc_ids: Iterable[int]
    ) -> Dict[int, Tuple[List[int], List[int], str]]:
        with self.sqls.db_session() as db:
            query = db.query(
                SourceDocumentDataORM.id,
                SourceDocumentDataORM.sentence_starts,
                SourceDocumentDataORM.sentence_ends,
                SourceDocumentDataORM.content,
            ).filter(SourceDocumentDataORM.id.in_(sdoc_ids))
            res = query.all()
            return {r[0]: (r[1], r[2], r[3]) for r in res}

    def __best_match(self, starts: List[int], ends: List[int], begin: int, end: int):
        overlap = [self.__overlap(s, e, begin, end) for s, e in zip(starts, ends)]
        return np.asarray(overlap).argmax().item()

    def __overlap(self, s1: int, e1: int, s2: int, e2: int):
        return max(min(e1, e2) - max(s1, s2), 0)
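The character-overlap matching in __best_match/__overlap is easiest to see with concrete numbers. A self-contained sketch of the same logic (the sentence offsets are invented for illustration):

import numpy as np

def overlap(s1: int, e1: int, s2: int, e2: int) -> int:
    # length of the intersection of [s1, e1) and [s2, e2); 0 if disjoint
    return max(min(e1, e2) - max(s1, s2), 0)

def best_match(starts, ends, begin, end) -> int:
    # index of the sentence sharing the most characters with the span
    overlaps = [overlap(s, e, begin, end) for s, e in zip(starts, ends)]
    return int(np.asarray(overlaps).argmax())

# Three sentences covering character offsets [0,40), [40,90), [90,130).
starts, ends = [0, 40, 90], [40, 90, 130]
# An annotation at offsets 35..60 overlaps sentence 0 by 5 chars and
# sentence 1 by 20 chars, so sentence 1 is the best match.
print(best_match(starts, ends, 35, 60))  # -> 1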
8 changes: 5 additions & 3 deletions backend/src/app/core/db/index_type.py
@@ -1,6 +1,8 @@
-from enum import Enum
+from enum import StrEnum


-class IndexType(str, Enum):
-    TEXT = "text"
+class IndexType(StrEnum):
+    SENTENCE = "sentence"
     IMAGE = "image"
+    NAMED_ENTITY = "named-entity"
+    DOCUMENT = "document"
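A quick note on the StrEnum switch: members of a StrEnum (Python 3.11+, matching the python=3.11 pin above) are real strings, so they compare and serialize as plain str without reaching for .value:

from enum import StrEnum

class IndexType(StrEnum):
    SENTENCE = "sentence"
    IMAGE = "image"
    NAMED_ENTITY = "named-entity"
    DOCUMENT = "document"

assert IndexType.SENTENCE == "sentence"
assert isinstance(IndexType.DOCUMENT, str)
assert f"{IndexType.DOCUMENT}" == "document"  # str() returns the value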