sim search refactoring (#466)
* annotation scaling WIP

* add Document index

* working suggestions

* working typesense search

* working qdrant

* sim search interface and services

* adjust env variables

* adapt docker compose profiles

* dwts -> dats

* fix tests


---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
3 people authored Nov 20, 2024
1 parent 7767297 commit f306212
Showing 23 changed files with 1,398 additions and 387 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/backend_checks.yml
@@ -38,8 +38,8 @@ jobs:
       - name: Build & Start Docker Containers
         working-directory: docker
         run: |
-          COMPOSE_PROFILES="ray,background,backend" docker compose build
-          COMPOSE_PROFILES="ray,background" docker compose up --wait --quiet-pull
+          COMPOSE_PROFILES="weaviate,ray,background,backend" docker compose build
+          COMPOSE_PROFILES="weaviate,ray,background" docker compose up --wait --quiet-pull
       - name: Check 1 - pytest runs without errors
         working-directory: docker
         run: |
@@ -55,7 +55,7 @@
       - name: Start Remaining Docker Containers
         working-directory: docker
         run: |
-          COMPOSE_PROFILES="ray,background,backend" docker compose up --wait --quiet-pull
+          COMPOSE_PROFILES="weaviate,ray,background,backend" docker compose up --wait --quiet-pull
       - name: Check 4 - Test End-2-End importer script
         working-directory: tools/importer
         env:
2 changes: 1 addition & 1 deletion .github/workflows/frontend_checks.yml
@@ -16,7 +16,7 @@ jobs:
     env:
       API_WORKERS: 1
       VITE_APP_SERVER: http://localhost:13120
-      COMPOSE_PROFILES: "ray,background,backend,frontend"
+      COMPOSE_PROFILES: "weaviate,ray,background,backend,frontend"
       RAY_CONFIG: "config_gpu.yaml"
       JWT_SECRET: ${{ secrets.JWT_SECRET }}
     steps:
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -14,6 +14,9 @@
     "editor.tabSize": 2,
     "editor.insertSpaces": true
   },
+  "[dockercompose]": {
+    "editor.defaultFormatter": "ms-azuretools.vscode-docker"
+  },
   "python.testing.pytestArgs": ["test"],
   "python.testing.unittestEnabled": false,
   "python.testing.pytestEnabled": true,
1 change: 1 addition & 0 deletions backend/environment.yml
@@ -38,6 +38,7 @@ dependencies:
   - python-magic=0.4
   - python-multipart=0.0.5
   - python=3.11
+  - qdrant-client=1.9.1
   - readability-lxml=0.8.1
   - redis-py=4.3
   - rope=1.9.0
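The new qdrant-client pin backs the Qdrant variant of the sim search service. A minimal connection sketch, assuming a locally running Qdrant on its default port (host, port, collection name, and vector size here are illustrative placeholders, not values from this commit):

import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Hypothetical endpoint; the real host/port come from the env/docker config.
client = QdrantClient(host="localhost", port=6333)

# Create (or recreate) a collection for sentence embeddings.
client.recreate_collection(
    collection_name="sentences",
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)
print(client.get_collections())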
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -6,6 +6,7 @@ pytest-order==1.2.1
 Scrapy==2.10.0
 scrapy-playwright==0.0.31
 scrapy-selenium==0.0.7
+typesense==0.21.0
 weaviate-client==3.24.1
 webdriver-manager==4.0.1
 yake==0.4.8
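Likewise, a sketch of constructing the matching typesense client for the Typesense-backed search path (API key, host, and port are assumed placeholders; the real values live in the docker/env configuration):

import typesense

client = typesense.Client(
    {
        "api_key": "dev-key",  # placeholder; use the key from the env config
        "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
        "connection_timeout_seconds": 2,
    }
)
print(client.collections.retrieve())  # list existing collections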
32 changes: 32 additions & 0 deletions backend/src/api/endpoints/annoscaling.py
@@ -0,0 +1,32 @@
from typing import List

from fastapi import APIRouter, Depends

from api.dependencies import get_current_user
from app.core.annoscaling.annoscaling_service import AnnoScalingService
from app.core.authorization.authz_user import AuthzUser

router = APIRouter(
    prefix="/annoscaling",
    tags=["annoscaling"],
    dependencies=[Depends(get_current_user)],
)

ass: AnnoScalingService = AnnoScalingService()


@router.post(
    "/suggest",
    summary="Suggest annotations",
    description="Suggest annotations",
)
async def suggest(
    *,
    project_id: int,
    code_id: int,
    top_k: int,
    authz_user: AuthzUser = Depends(),
) -> List[str]:
    authz_user.assert_in_project(project_id)

    return ass.suggest(project_id, [authz_user.user.id], code_id, top_k)
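For illustration, calling the new endpoint over HTTP might look like the sketch below. The base URL and bearer token are placeholders that depend on the deployment; the route takes its scalar arguments as query parameters, and get_current_user requires authentication:

import requests

BASE_URL = "http://localhost:5500"  # placeholder; depends on the deployment
TOKEN = "<jwt-from-login>"  # placeholder auth token

resp = requests.post(
    f"{BASE_URL}/annoscaling/suggest",
    params={"project_id": 1, "code_id": 42, "top_k": 10},
    headers={"Authorization": f"Bearer {TOKEN}"},
)
resp.raise_for_status()
print(resp.json())  # list of suggested sentence texts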
Empty file.
120 changes: 120 additions & 0 deletions backend/src/app/core/annoscaling/annoscaling_service.py
@@ -0,0 +1,120 @@
from time import perf_counter_ns
from typing import Dict, Iterable, List, Tuple

import numpy as np

from app.core.data.orm.annotation_document import AnnotationDocumentORM
from app.core.data.orm.code import CodeORM
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.source_document_data import SourceDocumentDataORM
from app.core.data.orm.span_annotation import SpanAnnotationORM
from app.core.db.simsearch_service import SimSearchService
from app.core.db.sql_service import SQLService

# from app.core.search.typesense_service import TypesenseService
from app.util.singleton_meta import SingletonMeta


class AnnoScalingService(metaclass=SingletonMeta):
    def __new__(cls, *args, **kwargs):
        cls.sqls = SQLService()
        cls.sim = SimSearchService()
        # cls.ts = TypesenseService()

        return super(AnnoScalingService, cls).__new__(cls)

    def suggest(
        self,
        project_id: int,
        user_ids: List[int],
        code_id: int,
        top_k: int,
    ) -> List[str]:
        start_time = perf_counter_ns()
        # takes 4ms (small project)
        occurrences = self.__get_annotations(project_id, user_ids, code_id)
        end_time = perf_counter_ns()
        print("it took", end_time - start_time, "ns to get annotations from the DB")

        start_time = perf_counter_ns()
        # takes 2ms (small project)
        sdoc_sentences = self.__get_sentences({id for _, _, id in occurrences})
        end_time = perf_counter_ns()
        print("it took", end_time - start_time, "ns to get sentences from the DB")

        sdoc_sent_ids = []

        start_time = perf_counter_ns()
        # takes around 0.1ms per annotation
        for start, end, sdoc_id in occurrences:
            # TODO loops are bad, need a much faster way to link annotations to sentences
            # best: do everything in DB and only return sentence ID per annotation
            # alternative: load all from DB (in chunks?) and compute via numpy
            starts, ends, _ = sdoc_sentences[sdoc_id]
            sent_match = self.__best_match(starts, ends, start, end)
            sdoc_sent_ids.append((sdoc_id, sent_match))
        end_time = perf_counter_ns()
        print("it took", end_time - start_time, "ns to match annotations to sentences")
        start_time = perf_counter_ns()
        # takes around 20ms per object. so, 50 annotations take already 1 full second
        hits = self.sim.suggest_similar_sentences(project_id, sdoc_sent_ids, top_k)
        end_time = perf_counter_ns()
        print(
            "it took", end_time - start_time, "ns to get similar sentences from index"
        )
        sim_doc_sentences = self.__get_sentences({hit.sdoc_id for hit in hits})

        texts = []
        for hit in hits:
            starts, ends, content = sim_doc_sentences[hit.sdoc_id]
            texts.append(content[starts[hit.sentence_id] : ends[hit.sentence_id]])
        return texts

    def __get_annotations(self, project_id: int, user_ids: List[int], code_id: int):
        with self.sqls.db_session() as db:
            query = (
                db.query(
                    SpanAnnotationORM,
                    SourceDocumentORM.id,
                )
                .join(
                    AnnotationDocumentORM,
                    AnnotationDocumentORM.source_document_id == SourceDocumentORM.id,
                )
                .join(
                    SpanAnnotationORM,
                    SpanAnnotationORM.annotation_document_id
                    == AnnotationDocumentORM.id,
                )
                .join(
                    CodeORM,
                    CodeORM.id == SpanAnnotationORM.code_id,
                )
                .filter(
                    SourceDocumentORM.project_id == project_id,
                    AnnotationDocumentORM.user_id.in_(user_ids),
                    CodeORM.id == code_id,
                )
            )
            res = query.all()
            return [(r[0].begin, r[0].end, r[1]) for r in res]

    def __get_sentences(
        self, sdoc_ids: Iterable[int]
    ) -> Dict[int, Tuple[List[int], List[int], str]]:
        with self.sqls.db_session() as db:
            query = db.query(
                SourceDocumentDataORM.id,
                SourceDocumentDataORM.sentence_starts,
                SourceDocumentDataORM.sentence_ends,
                SourceDocumentDataORM.content,
            ).filter(SourceDocumentDataORM.id.in_(sdoc_ids))
            res = query.all()
            return {r[0]: (r[1], r[2], r[3]) for r in res}

    def __best_match(self, starts: List[int], ends: List[int], begin: int, end: int):
        overlap = [self.__overlap(s, e, begin, end) for s, e in zip(starts, ends)]
        return np.asarray(overlap).argmax().item()

    def __overlap(self, s1: int, e1: int, s2: int, e2: int):
        return max(min(e1, e2) - max(s1, s2), 0)
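The character-overlap matching in __best_match/__overlap is easiest to see with concrete numbers. A self-contained sketch of the same logic (the sentence offsets are invented for illustration):

import numpy as np

def overlap(s1: int, e1: int, s2: int, e2: int) -> int:
    # length of the intersection of [s1, e1) and [s2, e2); 0 if disjoint
    return max(min(e1, e2) - max(s1, s2), 0)

def best_match(starts, ends, begin, end) -> int:
    # index of the sentence sharing the most characters with the span
    overlaps = [overlap(s, e, begin, end) for s, e in zip(starts, ends)]
    return int(np.asarray(overlaps).argmax())

# Three sentences covering character offsets [0,40), [40,90), [90,130).
starts, ends = [0, 40, 90], [40, 90, 130]
# An annotation at offsets 35..60 overlaps sentence 0 by 5 chars and
# sentence 1 by 20 chars, so sentence 1 is the best match.
print(best_match(starts, ends, 35, 60))  # -> 1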
8 changes: 5 additions & 3 deletions backend/src/app/core/db/index_type.py
@@ -1,6 +1,8 @@
-from enum import Enum
+from enum import StrEnum


-class IndexType(str, Enum):
-    TEXT = "text"
+class IndexType(StrEnum):
+    SENTENCE = "sentence"
     IMAGE = "image"
+    NAMED_ENTITY = "named-entity"
+    DOCUMENT = "document"
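A quick note on the StrEnum switch: members of a StrEnum (Python 3.11+, matching the python=3.11 pin above) are real strings, so they compare and serialize as plain str without reaching for .value:

from enum import StrEnum

class IndexType(StrEnum):
    SENTENCE = "sentence"
    IMAGE = "image"
    NAMED_ENTITY = "named-entity"
    DOCUMENT = "document"

assert IndexType.SENTENCE == "sentence"
assert isinstance(IndexType.DOCUMENT, str)
assert f"{IndexType.DOCUMENT}" == "document"  # str() returns the value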