
Commit 51f5649

Merge pull request #33 from mindvalley/chore/merge-upstream-20240512501

chore/merge upstream 20240512501

onimsha authored May 26, 2024
2 parents 396d0cb + 8ca20c5
Showing 109 changed files with 5,049 additions and 1,057 deletions.
@@ -14,16 +14,16 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}
 
       - name: Backend Image Docker Build and Push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v5
         with:
           context: ./backend
           file: ./backend/Dockerfile
@@ -14,16 +14,16 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}
 
       - name: Model Server Image Docker Build and Push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v5
         with:
           context: ./backend
           file: ./backend/Dockerfile.model_server
8 changes: 5 additions & 3 deletions .github/workflows/docker-build-push-web-container-on-tag.yml
@@ -14,16 +14,16 @@ jobs:
         uses: actions/checkout@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}
 
       - name: Web Image Docker Build and Push
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v5
         with:
           context: ./web
           file: ./web/Dockerfile
@@ -34,6 +34,8 @@ jobs:
             danswer/danswer-web-server:latest
           build-args: |
             DANSWER_VERSION=${{ github.ref_name }}
+          # needed due to weird interactions with the builds for different platforms
+          no-cache: true
 
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
4 changes: 2 additions & 2 deletions backend/alembic/versions/3879338f8ba1_add_tool_table.py
@@ -11,8 +11,8 @@
 # revision identifiers, used by Alembic.
 revision = "3879338f8ba1"
 down_revision = "f1c6478c3fd8"
-branch_labels = None
-depends_on = None
+branch_labels: None = None
+depends_on: None = None
 
 
 def upgrade() -> None:
@@ -0,0 +1,68 @@
"""More Descriptive Filestore
Revision ID: 70f00c45c0f2
Revises: 3879338f8ba1
Create Date: 2024-05-17 17:51:41.926893
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "70f00c45c0f2"
down_revision = "3879338f8ba1"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column("file_store", sa.Column("display_name", sa.String(), nullable=True))
    op.add_column(
        "file_store",
        sa.Column(
            "file_origin",
            sa.String(),
            nullable=False,
            server_default="connector",  # Default to connector
        ),
    )
    op.add_column(
        "file_store",
        sa.Column(
            "file_type", sa.String(), nullable=False, server_default="text/plain"
        ),
    )
    op.add_column(
        "file_store",
        sa.Column(
            "file_metadata",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
        ),
    )

    op.execute(
        """
        UPDATE file_store
        SET file_origin = CASE
                WHEN file_name LIKE 'chat__%' THEN 'chat_upload'
                ELSE 'connector'
            END,
            file_name = CASE
                WHEN file_name LIKE 'chat__%' THEN SUBSTR(file_name, 7)
                ELSE file_name
            END,
            file_type = CASE
                WHEN file_name LIKE 'chat__%' THEN 'image/png'
                ELSE 'text/plain'
            END
        """
    )


def downgrade() -> None:
    op.drop_column("file_store", "file_metadata")
    op.drop_column("file_store", "file_type")
    op.drop_column("file_store", "file_origin")
    op.drop_column("file_store", "display_name")
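
The backfill above keys everything off the legacy "chat__" file_name prefix. For reference, a rough Python equivalent of the per-row logic, as a hypothetical helper only (not part of the commit):

# Hypothetical sketch of the per-row logic in the UPDATE above. Chat
# uploads were previously stored under a "chat__"-prefixed file_name;
# the migration strips the prefix and records origin and MIME type.
def backfill_row(file_name: str) -> dict[str, str]:
    is_chat_upload = file_name.startswith("chat__")
    return {
        "file_origin": "chat_upload" if is_chat_upload else "connector",
        # SQL's SUBSTR(file_name, 7) is 1-indexed: drop the 6-char prefix
        "file_name": file_name[6:] if is_chat_upload else file_name,
        "file_type": "image/png" if is_chat_upload else "text/plain",
    }

assert backfill_row("chat__abc123")["file_name"] == "abc123"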
@@ -0,0 +1,31 @@
"""Remove Last Attempt Status from CC Pair
Revision ID: ec85f2b3c544
Revises: 3879338f8ba1
Create Date: 2024-05-23 21:39:46.126010
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "ec85f2b3c544"
down_revision = "70f00c45c0f2"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.drop_column("connector_credential_pair", "last_attempt_status")


def downgrade() -> None:
    op.add_column(
        "connector_credential_pair",
        sa.Column(
            "last_attempt_status",
            sa.VARCHAR(),
            autoincrement=False,
            nullable=True,
        ),
    )
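
The two new revisions chain 3879338f8ba1 -> 70f00c45c0f2 -> ec85f2b3c544; note that Alembic follows down_revision ("70f00c45c0f2"), not the stale "Revises:" line in the second docstring. A minimal sketch of applying them with Alembic's Python API, assuming the project's standard alembic.ini (the usual CLI equivalent is `alembic upgrade head`):

# Minimal sketch: apply the new revisions via Alembic's Python API.
# Assumes alembic.ini is present in the working directory.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "head")      # runs 70f00c45c0f2, then ec85f2b3c544
# command.downgrade(cfg, "-1")    # roll back the most recent revision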
4 changes: 3 additions & 1 deletion backend/danswer/background/celery/celery.py
@@ -68,7 +68,9 @@ def cleanup_connector_credential_pair_task(
f"{connector_id} and Credential ID: {credential_id} does not exist."
)

deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(cc_pair)
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(
connector_credential_pair=cc_pair, db_session=db_session
)
if deletion_attempt_disallowed_reason:
raise ValueError(deletion_attempt_disallowed_reason)

Expand Down
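The call site now names its arguments and hands the helper a database session. The helper's body lives in danswer.db and is not shown in this diff; a plausible signature sketch, inferred purely from the call site above:

# Hypothetical signature inferred from the call site; the real
# implementation is not shown in this diff. A truthy return value is
# treated as the human-readable reason deletion is disallowed.
from sqlalchemy.orm import Session

def check_deletion_attempt_is_allowed(
    connector_credential_pair: "ConnectorCredentialPair",  # assumed type name
    db_session: Session,
) -> str | None:
    """Return why deletion is disallowed, or None when it may proceed."""
    ...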
35 changes: 12 additions & 23 deletions backend/danswer/background/indexing/run_indexing.py
@@ -160,19 +160,19 @@ def _run_indexing(
                 source_type=db_connector.source,
             )
         ):
-            window_start = max(
-                window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
-                datetime(1970, 1, 1, tzinfo=timezone.utc),
-            )
-
-            doc_batch_generator, is_listing_complete = _get_document_generator(
-                db_session=db_session,
-                attempt=index_attempt,
-                start_time=window_start,
-                end_time=window_end,
-            )
-
-            try:
+            try:
+                window_start = max(
+                    window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
+                    datetime(1970, 1, 1, tzinfo=timezone.utc),
+                )
+
+                doc_batch_generator, is_listing_complete = _get_document_generator(
+                    db_session=db_session,
+                    attempt=index_attempt,
+                    start_time=window_start,
+                    end_time=window_end,
+                )
+
                 all_connector_doc_ids: set[str] = set()
                 for doc_batch in doc_batch_generator:
                     # Check if connector is disabled mid run and stop if so unless it's the secondary
@@ -263,7 +263,6 @@ def _run_indexing(
                     db_session=db_session,
                     connector_id=db_connector.id,
                     credential_id=db_credential.id,
-                    attempt_status=IndexingStatus.IN_PROGRESS,
                     net_docs=net_doc_change,
                     run_dt=run_end_dt,
                 )
@@ -294,7 +293,6 @@ def _run_indexing(
                 db_session=db_session,
                 connector_id=index_attempt.connector.id,
                 credential_id=index_attempt.credential.id,
-                attempt_status=IndexingStatus.FAILED,
                 net_docs=net_doc_change,
             )
             raise e
@@ -309,7 +307,6 @@ def _run_indexing(
                 db_session=db_session,
                 connector_id=db_connector.id,
                 credential_id=db_credential.id,
-                attempt_status=IndexingStatus.SUCCESS,
                 run_dt=run_end_dt,
             )
 
@@ -343,15 +340,7 @@ def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexAttempt:

     # only commit once, to make sure this all happens in a single transaction
     mark_attempt_in_progress__no_commit(attempt)
-    is_primary = attempt.embedding_model.status == IndexModelStatus.PRESENT
-    if is_primary:
-        update_connector_credential_pair(
-            db_session=db_session,
-            connector_id=attempt.connector.id,
-            credential_id=attempt.credential.id,
-            attempt_status=IndexingStatus.IN_PROGRESS,
-        )
-    else:
+    if attempt.embedding_model.status != IndexModelStatus.PRESENT:
         db_session.commit()
 
     return attempt
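
Taken together, these hunks move the window arithmetic and generator construction inside the try block that consumes the batches, so listing-time failures hit the same handler as mid-run failures, and they drop the per-attempt cc-pair status writes entirely. A generic sketch of the control-flow pattern, with all names hypothetical:

# Generic sketch of the pattern (names hypothetical): build the
# generator inside the same try that consumes it, so setup errors
# (e.g. bad credentials while listing documents) and iteration errors
# share one failure path.
def run_attempt(make_batches, handle_batch, mark_failed):
    try:
        batches = make_batches()   # may raise before yielding anything
        for batch in batches:      # may also raise mid-run
            handle_batch(batch)
    except Exception as e:
        mark_failed(e)             # single place to record the failure
        raise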
28 changes: 0 additions & 28 deletions backend/danswer/background/update.py
@@ -17,8 +17,6 @@
 from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
 from danswer.configs.app_configs import NUM_INDEXING_WORKERS
 from danswer.db.connector import fetch_connectors
-from danswer.db.connector_credential_pair import mark_all_in_progress_cc_pairs_failed
-from danswer.db.connector_credential_pair import update_connector_credential_pair
 from danswer.db.embedding_model import get_current_db_embedding_model
 from danswer.db.embedding_model import get_secondary_db_embedding_model
 from danswer.db.engine import get_db_current_time
@@ -119,17 +117,6 @@ def _mark_run_failed(
         db_session=db_session,
         failure_reason=failure_reason,
     )
-    if (
-        index_attempt.connector_id is not None
-        and index_attempt.credential_id is not None
-        and index_attempt.embedding_model.status == IndexModelStatus.PRESENT
-    ):
-        update_connector_credential_pair(
-            db_session=db_session,
-            connector_id=index_attempt.connector_id,
-            credential_id=index_attempt.credential_id,
-            attempt_status=IndexingStatus.FAILED,
-        )
 
 
 """Main funcs"""
@@ -192,16 +179,6 @@ def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None:
                 connector.id, credential.id, model.id, db_session
             )
 
-            # CC-Pair will have the status that it should for the primary index
-            # Will be re-sync-ed once the indices are swapped
-            if model.status == IndexModelStatus.PRESENT:
-                update_connector_credential_pair(
-                    db_session=db_session,
-                    connector_id=connector.id,
-                    credential_id=credential.id,
-                    attempt_status=IndexingStatus.NOT_STARTED,
-                )
 
 
 def cleanup_indexing_jobs(
     existing_jobs: dict[int, Future | SimpleJob],
@@ -391,11 +368,6 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> None:

     existing_jobs: dict[int, Future | SimpleJob] = {}
 
-    with Session(engine) as db_session:
-        # Previous version did not always clean up cc-pairs well leaving some connectors undeleteable
-        # This ensures that bad states get cleaned up
-        mark_all_in_progress_cc_pairs_failed(db_session)
-
     while True:
         start = time.time()
         start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
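
With last_attempt_status dropped from the cc-pair table (see the ec85f2b3c544 migration above), there is no denormalized status to repair at startup, hence the removed mark_all_in_progress_cc_pairs_failed call. Status presumably now derives from the index attempts themselves; a hypothetical sketch of that kind of lookup, with every name assumed for illustration (none come from this diff):

# Hypothetical sketch: with no stored attempt_status, a cc-pair's
# current state can be derived from its most recent index attempt.
# The attempt objects and their fields are assumed for illustration.
def latest_attempt_status(attempts: list) -> str | None:
    if not attempts:
        return None
    newest = max(attempts, key=lambda a: a.time_created)  # assumed field
    return newest.status  # assumed field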
24 changes: 18 additions & 6 deletions backend/danswer/chat/process_message.py
@@ -16,6 +16,7 @@
 from danswer.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
 from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
 from danswer.configs.constants import MessageType
+from danswer.db.chat import attach_files_to_chat_message
 from danswer.db.chat import create_db_search_doc
 from danswer.db.chat import create_new_chat_message
 from danswer.db.chat import get_chat_message
@@ -240,6 +241,7 @@ def stream_chat_message_objects(
     else:
         parent_message = root_message
 
+    user_message = None
     if not use_existing_user_message:
         # Create new message at the right place in the tree and update the parent's child pointer
         # Don't commit yet until we verify the chat message chain
@@ -250,10 +252,7 @@
             message=message_text,
             token_count=len(llm_tokenizer_encode_func(message_text)),
             message_type=MessageType.USER,
-            files=[
-                {"id": str(file_id), "type": ChatFileType.IMAGE}
-                for file_id in new_msg_req.file_ids
-            ],
+            files=None,  # Need to attach later for optimization to only load files once in parallel
             db_session=db_session,
             commit=False,
         )
@@ -283,11 +282,24 @@
     )
 
     # load all files needed for this chat chain in memory
-    files = load_all_chat_files(history_msgs, new_msg_req.file_ids, db_session)
+    files = load_all_chat_files(
+        history_msgs, new_msg_req.file_descriptors, db_session
+    )
     latest_query_files = [
-        file for file in files if file.file_id in new_msg_req.file_ids
+        file
+        for file in files
+        if file.file_id in [f["id"] for f in new_msg_req.file_descriptors]
     ]
 
+    if user_message:
+        attach_files_to_chat_message(
+            chat_message=user_message,
+            files=[
+                new_file.to_file_descriptor() for new_file in latest_query_files
+            ],
+            db_session=db_session,
+        )
+
     selected_db_search_docs = None
     selected_llm_docs: list[LlmDoc] | None = None
     if reference_doc_ids:
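
The request now carries full file descriptors rather than bare IDs, and the descriptors are attached to the user message only after the single parallel load. The descriptor shape is not defined in this hunk; a plausible sketch, inferred from the old inline dict and the new f["id"] access:

# Hypothetical sketch of the descriptor shape, inferred from the old
# inline dict ({"id": ..., "type": ChatFileType.IMAGE}) and the new
# f["id"] access; the real definition is not part of this hunk.
from enum import Enum
from typing import TypedDict

class ChatFileType(str, Enum):
    IMAGE = "image"  # only member visible in this diff

class FileDescriptor(TypedDict):
    id: str
    type: ChatFileType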
8 changes: 8 additions & 0 deletions backend/danswer/configs/constants.py
@@ -96,6 +96,8 @@ class DocumentSource(str, Enum):
SHAREPOINT = "sharepoint"
DISCOURSE = "discourse"
AXERO = "axero"
MEDIAWIKI = "mediawiki"
WIKIPEDIA = "wikipedia"


class DocumentIndexType(str, Enum):
Expand Down Expand Up @@ -130,3 +132,9 @@ class TokenRateLimitScope(str, Enum):
USER = "user"
USER_GROUP = "user_group"
GLOBAL = "global"


class FileOrigin(str, Enum):
CHAT_UPLOAD = "chat_upload"
CHAT_IMAGE_GEN = "chat_image_gen"
CONNECTOR = "connector"