
Commit 51f5649

Merge pull request #33 from mindvalley/chore/merge-upstream-20240512501

chore/merge upstream 20240512501

onimsha authored May 26, 2024
2 parents 396d0cb + 8ca20c5
Showing 109 changed files with 5,049 additions and 1,057 deletions.
@@ -14,16 +14,16 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}
 
       - name: Backend Image Docker Build and Push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v5
         with:
           context: ./backend
           file: ./backend/Dockerfile
@@ -14,16 +14,16 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}
 
       - name: Model Server Image Docker Build and Push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v5
         with:
           context: ./backend
           file: ./backend/Dockerfile.model_server
8 changes: 5 additions & 3 deletions .github/workflows/docker-build-push-web-container-on-tag.yml
@@ -14,16 +14,16 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}
 
       - name: Web Image Docker Build and Push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v5
         with:
           context: ./web
           file: ./web/Dockerfile
@@ -34,6 +34,8 @@ jobs:
             danswer/danswer-web-server:latest
           build-args: |
             DANSWER_VERSION=${{ github.ref_name }}
+          # needed due to weird interactions with the builds for different platforms
+          no-cache: true
 
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
4 changes: 2 additions & 2 deletions backend/alembic/versions/3879338f8ba1_add_tool_table.py
@@ -11,8 +11,8 @@
 # revision identifiers, used by Alembic.
 revision = "3879338f8ba1"
 down_revision = "f1c6478c3fd8"
-branch_labels = None
-depends_on = None
+branch_labels: None = None
+depends_on: None = None
 
 
 def upgrade() -> None:
@@ -0,0 +1,68 @@
"""More Descriptive Filestore
Revision ID: 70f00c45c0f2
Revises: 3879338f8ba1
Create Date: 2024-05-17 17:51:41.926893
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "70f00c45c0f2"
down_revision = "3879338f8ba1"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column("file_store", sa.Column("display_name", sa.String(), nullable=True))
    op.add_column(
        "file_store",
        sa.Column(
            "file_origin",
            sa.String(),
            nullable=False,
            server_default="connector",  # Default to connector
        ),
    )
    op.add_column(
        "file_store",
        sa.Column(
            "file_type", sa.String(), nullable=False, server_default="text/plain"
        ),
    )
    op.add_column(
        "file_store",
        sa.Column(
            "file_metadata",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
        ),
    )

    op.execute(
        """
        UPDATE file_store
        SET file_origin = CASE
                WHEN file_name LIKE 'chat__%' THEN 'chat_upload'
                ELSE 'connector'
            END,
            file_name = CASE
                WHEN file_name LIKE 'chat__%' THEN SUBSTR(file_name, 7)
                ELSE file_name
            END,
            file_type = CASE
                WHEN file_name LIKE 'chat__%' THEN 'image/png'
                ELSE 'text/plain'
            END
        """
    )


def downgrade() -> None:
    op.drop_column("file_store", "file_metadata")
    op.drop_column("file_store", "file_type")
    op.drop_column("file_store", "file_origin")
    op.drop_column("file_store", "display_name")
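
The backfill above keys everything off the legacy "chat__" file_name prefix. For reference, a rough Python equivalent of the per-row logic, as a hypothetical helper only (not part of the commit):

# Hypothetical sketch of the per-row logic in the UPDATE above. Chat
# uploads were previously stored under a "chat__"-prefixed file_name;
# the migration strips the prefix and records origin and MIME type.
def backfill_row(file_name: str) -> dict[str, str]:
    is_chat_upload = file_name.startswith("chat__")
    return {
        "file_origin": "chat_upload" if is_chat_upload else "connector",
        # SQL's SUBSTR(file_name, 7) is 1-indexed: drop the 6-char prefix
        "file_name": file_name[6:] if is_chat_upload else file_name,
        "file_type": "image/png" if is_chat_upload else "text/plain",
    }

assert backfill_row("chat__abc123")["file_name"] == "abc123"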
@@ -0,0 +1,31 @@
"""Remove Last Attempt Status from CC Pair
Revision ID: ec85f2b3c544
Revises: 3879338f8ba1
Create Date: 2024-05-23 21:39:46.126010
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "ec85f2b3c544"
down_revision = "70f00c45c0f2"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.drop_column("connector_credential_pair", "last_attempt_status")


def downgrade() -> None:
    op.add_column(
        "connector_credential_pair",
        sa.Column(
            "last_attempt_status",
            sa.VARCHAR(),
            autoincrement=False,
            nullable=True,
        ),
    )
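
The two new revisions chain 3879338f8ba1 -> 70f00c45c0f2 -> ec85f2b3c544; note that Alembic follows down_revision ("70f00c45c0f2"), not the stale "Revises:" line in the second docstring. A minimal sketch of applying them with Alembic's Python API, assuming the project's standard alembic.ini (the usual CLI equivalent is `alembic upgrade head`):

# Minimal sketch: apply the new revisions via Alembic's Python API.
# Assumes alembic.ini is present in the working directory.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "head")      # runs 70f00c45c0f2, then ec85f2b3c544
# command.downgrade(cfg, "-1")    # roll back the most recent revision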
4 changes: 3 additions & 1 deletion backend/danswer/background/celery/celery.py
@@ -68,7 +68,9 @@ def cleanup_connector_credential_pair_task(
f"{connector_id} and Credential ID: {credential_id} does not exist."
)

deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(cc_pair)
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(
connector_credential_pair=cc_pair, db_session=db_session
)
if deletion_attempt_disallowed_reason:
raise ValueError(deletion_attempt_disallowed_reason)

Expand Down
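The call site now names its arguments and hands the helper a database session. The helper's body lives in danswer.db and is not shown in this diff; a plausible signature sketch, inferred purely from the call site above:

# Hypothetical signature inferred from the call site; the real
# implementation is not shown in this diff. A truthy return value is
# treated as the human-readable reason deletion is disallowed.
from sqlalchemy.orm import Session

def check_deletion_attempt_is_allowed(
    connector_credential_pair: "ConnectorCredentialPair",  # assumed type name
    db_session: Session,
) -> str | None:
    """Return why deletion is disallowed, or None when it may proceed."""
    ...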
35 changes: 12 additions & 23 deletions backend/danswer/background/indexing/run_indexing.py
@@ -160,19 +160,19 @@ def _run_indexing(
                 source_type=db_connector.source,
             )
         ):
-            window_start = max(
-                window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
-                datetime(1970, 1, 1, tzinfo=timezone.utc),
-            )
-
-            doc_batch_generator, is_listing_complete = _get_document_generator(
-                db_session=db_session,
-                attempt=index_attempt,
-                start_time=window_start,
-                end_time=window_end,
-            )
-
-            try:
+            try:
+                window_start = max(
+                    window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
+                    datetime(1970, 1, 1, tzinfo=timezone.utc),
+                )
+
+                doc_batch_generator, is_listing_complete = _get_document_generator(
+                    db_session=db_session,
+                    attempt=index_attempt,
+                    start_time=window_start,
+                    end_time=window_end,
+                )
+
                 all_connector_doc_ids: set[str] = set()
                 for doc_batch in doc_batch_generator:
                     # Check if connector is disabled mid run and stop if so unless it's the secondary
@@ -263,7 +263,6 @@ def _run_indexing(
                     db_session=db_session,
                     connector_id=db_connector.id,
                     credential_id=db_credential.id,
-                    attempt_status=IndexingStatus.IN_PROGRESS,
                     net_docs=net_doc_change,
                     run_dt=run_end_dt,
                 )
@@ -294,7 +293,6 @@ def _run_indexing(
                 db_session=db_session,
                 connector_id=index_attempt.connector.id,
                 credential_id=index_attempt.credential.id,
-                attempt_status=IndexingStatus.FAILED,
                 net_docs=net_doc_change,
             )
             raise e
@@ -309,7 +307,6 @@ def _run_indexing(
                 db_session=db_session,
                 connector_id=db_connector.id,
                 credential_id=db_credential.id,
-                attempt_status=IndexingStatus.SUCCESS,
                 run_dt=run_end_dt,
             )
 
@@ -343,15 +340,7 @@ def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexAttempt:

     # only commit once, to make sure this all happens in a single transaction
     mark_attempt_in_progress__no_commit(attempt)
-    is_primary = attempt.embedding_model.status == IndexModelStatus.PRESENT
-    if is_primary:
-        update_connector_credential_pair(
-            db_session=db_session,
-            connector_id=attempt.connector.id,
-            credential_id=attempt.credential.id,
-            attempt_status=IndexingStatus.IN_PROGRESS,
-        )
-    else:
+    if attempt.embedding_model.status != IndexModelStatus.PRESENT:
         db_session.commit()
 
     return attempt
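
Taken together, these hunks move the window arithmetic and generator construction inside the try block that consumes the batches, so listing-time failures hit the same handler as mid-run failures, and they drop the per-attempt cc-pair status writes entirely. A generic sketch of the control-flow pattern, with all names hypothetical:

# Generic sketch of the pattern (names hypothetical): build the
# generator inside the same try that consumes it, so setup errors
# (e.g. bad credentials while listing documents) and iteration errors
# share one failure path.
def run_attempt(make_batches, handle_batch, mark_failed):
    try:
        batches = make_batches()   # may raise before yielding anything
        for batch in batches:      # may also raise mid-run
            handle_batch(batch)
    except Exception as e:
        mark_failed(e)             # single place to record the failure
        raise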
28 changes: 0 additions & 28 deletions backend/danswer/background/update.py
@@ -17,8 +17,6 @@
 from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
 from danswer.configs.app_configs import NUM_INDEXING_WORKERS
 from danswer.db.connector import fetch_connectors
-from danswer.db.connector_credential_pair import mark_all_in_progress_cc_pairs_failed
-from danswer.db.connector_credential_pair import update_connector_credential_pair
 from danswer.db.embedding_model import get_current_db_embedding_model
 from danswer.db.embedding_model import get_secondary_db_embedding_model
 from danswer.db.engine import get_db_current_time
@@ -119,17 +117,6 @@ def _mark_run_failed(
         db_session=db_session,
         failure_reason=failure_reason,
     )
-    if (
-        index_attempt.connector_id is not None
-        and index_attempt.credential_id is not None
-        and index_attempt.embedding_model.status == IndexModelStatus.PRESENT
-    ):
-        update_connector_credential_pair(
-            db_session=db_session,
-            connector_id=index_attempt.connector_id,
-            credential_id=index_attempt.credential_id,
-            attempt_status=IndexingStatus.FAILED,
-        )
 
 
 """Main funcs"""
@@ -192,16 +179,6 @@ def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None:
                 connector.id, credential.id, model.id, db_session
             )
 
-            # CC-Pair will have the status that it should for the primary index
-            # Will be re-sync-ed once the indices are swapped
-            if model.status == IndexModelStatus.PRESENT:
-                update_connector_credential_pair(
-                    db_session=db_session,
-                    connector_id=connector.id,
-                    credential_id=credential.id,
-                    attempt_status=IndexingStatus.NOT_STARTED,
-                )
 
 
 def cleanup_indexing_jobs(
     existing_jobs: dict[int, Future | SimpleJob],
@@ -391,11 +368,6 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> None:

     existing_jobs: dict[int, Future | SimpleJob] = {}
 
-    with Session(engine) as db_session:
-        # Previous version did not always clean up cc-pairs well leaving some connectors undeleteable
-        # This ensures that bad states get cleaned up
-        mark_all_in_progress_cc_pairs_failed(db_session)
-
     while True:
         start = time.time()
         start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
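
With last_attempt_status dropped from the cc-pair table (see the ec85f2b3c544 migration above), there is no denormalized status to repair at startup, hence the removed mark_all_in_progress_cc_pairs_failed call. Status presumably now derives from the index attempts themselves; a hypothetical sketch of that kind of lookup, with every name assumed for illustration (none come from this diff):

# Hypothetical sketch: with no stored attempt_status, a cc-pair's
# current state can be derived from its most recent index attempt.
# The attempt objects and their fields are assumed for illustration.
def latest_attempt_status(attempts: list) -> str | None:
    if not attempts:
        return None
    newest = max(attempts, key=lambda a: a.time_created)  # assumed field
    return newest.status  # assumed field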
24 changes: 18 additions & 6 deletions backend/danswer/chat/process_message.py
@@ -16,6 +16,7 @@
 from danswer.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
 from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
 from danswer.configs.constants import MessageType
+from danswer.db.chat import attach_files_to_chat_message
 from danswer.db.chat import create_db_search_doc
 from danswer.db.chat import create_new_chat_message
 from danswer.db.chat import get_chat_message
@@ -240,6 +241,7 @@ def stream_chat_message_objects(
     else:
         parent_message = root_message
 
+    user_message = None
     if not use_existing_user_message:
         # Create new message at the right place in the tree and update the parent's child pointer
         # Don't commit yet until we verify the chat message chain
@@ -250,10 +252,7 @@
             message=message_text,
             token_count=len(llm_tokenizer_encode_func(message_text)),
             message_type=MessageType.USER,
-            files=[
-                {"id": str(file_id), "type": ChatFileType.IMAGE}
-                for file_id in new_msg_req.file_ids
-            ],
+            files=None,  # Need to attach later for optimization to only load files once in parallel
             db_session=db_session,
             commit=False,
         )
@@ -283,11 +282,24 @@
     )
 
     # load all files needed for this chat chain in memory
-    files = load_all_chat_files(history_msgs, new_msg_req.file_ids, db_session)
+    files = load_all_chat_files(
+        history_msgs, new_msg_req.file_descriptors, db_session
+    )
     latest_query_files = [
-        file for file in files if file.file_id in new_msg_req.file_ids
+        file
+        for file in files
+        if file.file_id in [f["id"] for f in new_msg_req.file_descriptors]
     ]
 
+    if user_message:
+        attach_files_to_chat_message(
+            chat_message=user_message,
+            files=[
+                new_file.to_file_descriptor() for new_file in latest_query_files
+            ],
+            db_session=db_session,
+        )
+
     selected_db_search_docs = None
     selected_llm_docs: list[LlmDoc] | None = None
     if reference_doc_ids:
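
The request now carries full file descriptors rather than bare IDs, and the descriptors are attached to the user message only after the single parallel load. The descriptor shape is not defined in this hunk; a plausible sketch, inferred from the old inline dict and the new f["id"] access:

# Hypothetical sketch of the descriptor shape, inferred from the old
# inline dict ({"id": ..., "type": ChatFileType.IMAGE}) and the new
# f["id"] access; the real definition is not part of this hunk.
from enum import Enum
from typing import TypedDict

class ChatFileType(str, Enum):
    IMAGE = "image"  # only member visible in this diff

class FileDescriptor(TypedDict):
    id: str
    type: ChatFileType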
8 changes: 8 additions & 0 deletions backend/danswer/configs/constants.py
@@ -96,6 +96,8 @@ class DocumentSource(str, Enum):
SHAREPOINT = "sharepoint"
DISCOURSE = "discourse"
AXERO = "axero"
MEDIAWIKI = "mediawiki"
WIKIPEDIA = "wikipedia"


class DocumentIndexType(str, Enum):
Expand Down Expand Up @@ -130,3 +132,9 @@ class TokenRateLimitScope(str, Enum):
USER = "user"
USER_GROUP = "user_group"
GLOBAL = "global"


class FileOrigin(str, Enum):
CHAT_UPLOAD = "chat_upload"
CHAT_IMAGE_GEN = "chat_image_gen"
CONNECTOR = "connector"