This repository has been archived by the owner on Dec 11, 2024. It is now read-only.

Merge remote-tracking branch 'upstream/main' into chore/merge-upstream-2024082901
onimsha committed Aug 30, 2024
2 parents 5f8c2df + 1734a4a commit 1620862
Showing 9 changed files with 191 additions and 29 deletions.
21 changes: 20 additions & 1 deletion backend/alembic/env.py
@@ -8,6 +8,7 @@
from sqlalchemy.engine import Connection
from sqlalchemy.ext.asyncio import create_async_engine
from celery.backends.database.session import ResultModelBase # type: ignore
from sqlalchemy.schema import SchemaItem

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
@@ -29,6 +30,20 @@
# my_important_option = config.get_main_option("my_important_option")
# ... etc.

EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}


def include_object(
object: SchemaItem,
name: str,
type_: str,
reflected: bool,
compare_to: SchemaItem | None,
) -> bool:
if type_ == "table" and name in EXCLUDE_TABLES:
return False
return True


def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
@@ -55,7 +70,11 @@ def run_migrations_offline() -> None:


def do_run_migrations(connection: Connection) -> None:
context.configure(connection=connection, target_metadata=target_metadata) # type: ignore
context.configure(
connection=connection,
target_metadata=target_metadata, # type: ignore
include_object=include_object,
) # type: ignore

with context.begin_transaction():
context.run_migrations()
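
For illustration, the same exclusion could also be applied to offline migrations. A minimal sketch, assuming run_migrations_offline() configures the Alembic context in the standard way (this function body is hypothetical, not part of the commit):

def run_migrations_offline() -> None:
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        include_object=include_object,  # mirrors the online path above
    )
    with context.begin_transaction():
        context.run_migrations()
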
12 changes: 12 additions & 0 deletions backend/danswer/configs/danswerbot_configs.py
@@ -73,3 +73,15 @@
DANSWER_BOT_REPHRASE_MESSAGE = (
os.environ.get("DANSWER_BOT_REPHRASE_MESSAGE", "").lower() == "true"
)

# DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD is the number of
# responses DanswerBot can send in a given time period.
# Set to 0 to disable the limit.
DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD = int(
os.environ.get("DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD", "5000")
)
# DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS is the number
# of seconds until the response limit is reset.
DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS = int(
os.environ.get("DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS", "86400")
)
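
For illustration, the new limits could be tuned through the environment before the bot starts; the values below are hypothetical, not the committed defaults of 5000 responses per 86400 seconds:

import os

# Hypothetical tuning: cap DanswerBot at 1000 responses per one-hour window.
os.environ["DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD"] = "1000"
os.environ["DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS"] = "3600"
# Setting DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD to "0" disables the limit.
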
9 changes: 7 additions & 2 deletions backend/danswer/configs/model_configs.py
@@ -80,11 +80,16 @@
GEN_AI_LLM_PROVIDER_TYPE = os.environ.get("GEN_AI_LLM_PROVIDER_TYPE") or None
# Override the auto-detection of LLM max context length
GEN_AI_MAX_TOKENS = int(os.environ.get("GEN_AI_MAX_TOKENS") or 0) or None

# Set this to be enough for an answer + quotes. Also used for Chat
GEN_AI_MAX_OUTPUT_TOKENS = int(os.environ.get("GEN_AI_MAX_OUTPUT_TOKENS") or 1024)
# This is the minimum token context we will leave for the LLM to generate an answer
GEN_AI_NUM_RESERVED_OUTPUT_TOKENS = int(
os.environ.get("GEN_AI_NUM_RESERVED_OUTPUT_TOKENS") or 1024
)

# Typically, GenAI models nowadays are at least 4K tokens
GEN_AI_MODEL_DEFAULT_MAX_TOKENS = 4096
GEN_AI_MODEL_FALLBACK_MAX_TOKENS = 4096

# Number of tokens from chat history to include at maximum
# 3000 should be enough context regardless of use, no need to include as much as possible
# as this drives up the cost unnecessarily
16 changes: 15 additions & 1 deletion backend/danswer/danswerbot/slack/listener.py
@@ -38,6 +38,7 @@
from danswer.danswerbot.slack.handlers.handle_message import schedule_feedback_reminder
from danswer.danswerbot.slack.models import SlackMessageInfo
from danswer.danswerbot.slack.tokens import fetch_tokens
from danswer.danswerbot.slack.utils import check_message_limit
from danswer.danswerbot.slack.utils import decompose_action_id
from danswer.danswerbot.slack.utils import get_channel_name_from_id
from danswer.danswerbot.slack.utils import get_danswer_bot_app_id
@@ -130,9 +131,19 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool

if event_type == "message":
bot_tag_id = get_danswer_bot_app_id(client.web_client)

is_dm = event.get("channel_type") == "im"
is_tagged = bot_tag_id and bot_tag_id in msg
is_danswer_bot_msg = bot_tag_id and bot_tag_id in event.get("user", "")

# DanswerBot should never respond to itself
if is_danswer_bot_msg:
logger.info("Ignoring message from DanswerBot")
return False

# DMs with the bot don't include the @DanswerBot tag, so we still have to
# handle the events caught via the events_api
if bot_tag_id and bot_tag_id in msg and event.get("channel_type") != "im":
if is_tagged and not is_dm:
# Let the tag flow handle this case, don't reply twice
return False

@@ -200,6 +211,9 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool
)
return False

if not check_message_limit():
return False

logger.debug(f"Handling Slack request with Payload: '{req.payload}'")
return True

49 changes: 41 additions & 8 deletions backend/danswer/danswerbot/slack/utils.py
@@ -21,6 +21,12 @@
from danswer.configs.danswerbot_configs import DANSWER_BOT_MAX_QPM
from danswer.configs.danswerbot_configs import DANSWER_BOT_MAX_WAIT_TIME
from danswer.configs.danswerbot_configs import DANSWER_BOT_NUM_RETRIES
from danswer.configs.danswerbot_configs import (
DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD,
)
from danswer.configs.danswerbot_configs import (
DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS,
)
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import SlackTextCleaner
from danswer.danswerbot.slack.constants import FeedbackVisibility
@@ -41,7 +47,41 @@
logger = setup_logger()


DANSWER_BOT_APP_ID: str | None = None
_DANSWER_BOT_APP_ID: str | None = None
_DANSWER_BOT_MESSAGE_COUNT: int = 0
_DANSWER_BOT_COUNT_START_TIME: float = time.time()


def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global _DANSWER_BOT_APP_ID
if _DANSWER_BOT_APP_ID is None:
_DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return _DANSWER_BOT_APP_ID


def check_message_limit() -> bool:
"""
This isn't a perfect solution:
high traffic at the end of one period and the start of the next could
cause the limit to be exceeded.
"""
if DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD == 0:
return True
global _DANSWER_BOT_MESSAGE_COUNT
global _DANSWER_BOT_COUNT_START_TIME
time_since_start = time.time() - _DANSWER_BOT_COUNT_START_TIME
if time_since_start > DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS:
_DANSWER_BOT_MESSAGE_COUNT = 0
_DANSWER_BOT_COUNT_START_TIME = time.time()
if (_DANSWER_BOT_MESSAGE_COUNT + 1) > DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD:
logger.error(
f"DanswerBot has reached the message limit {DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD}"
f" for the time period {DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS} seconds."
" These limits are configurable in backend/danswer/configs/danswerbot_configs.py"
)
return False
_DANSWER_BOT_MESSAGE_COUNT += 1
return True


def rephrase_slack_message(msg: str) -> str:
@@ -96,13 +136,6 @@ def update_emote_react(
logger.error(f"Was not able to react to user message due to: {e}")


def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global DANSWER_BOT_APP_ID
if DANSWER_BOT_APP_ID is None:
DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return DANSWER_BOT_APP_ID


def remove_danswer_bot_tag(message_str: str, client: WebClient) -> str:
bot_tag_id = get_danswer_bot_app_id(web_client=client)
return re.sub(rf"<@{bot_tag_id}>\s", "", message_str)
42 changes: 36 additions & 6 deletions backend/danswer/llm/chat_llm.py
@@ -28,7 +28,6 @@
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
from danswer.configs.model_configs import GEN_AI_API_VERSION
from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_TEMPERATURE
from danswer.llm.interfaces import LLM
from danswer.llm.interfaces import LLMConfig
@@ -193,10 +192,10 @@ def __init__(
timeout: int,
model_provider: str,
model_name: str,
max_output_tokens: int | None = None,
api_base: str | None = GEN_AI_API_ENDPOINT,
api_version: str | None = GEN_AI_API_VERSION,
custom_llm_provider: str | None = GEN_AI_LLM_PROVIDER_TYPE,
max_output_tokens: int = GEN_AI_MAX_OUTPUT_TOKENS,
temperature: float = GEN_AI_TEMPERATURE,
custom_config: dict[str, str] | None = None,
extra_headers: dict[str, str] | None = None,
@@ -209,7 +208,17 @@ def __init__(
self._api_base = api_base
self._api_version = api_version
self._custom_llm_provider = custom_llm_provider
self._max_output_tokens = max_output_tokens

# This can be used to store the maximum output tokens for this model.
# self._max_output_tokens = (
# max_output_tokens
# if max_output_tokens is not None
# else get_llm_max_output_tokens(
# model_map=litellm.model_cost,
# model_name=model_name,
# model_provider=model_provider,
# )
# )
self._custom_config = custom_config

# NOTE: have to set these as environment variables for Litellm since
@@ -228,6 +237,30 @@ def __init__(
def log_model_configs(self) -> None:
logger.debug(f"Config: {self.config}")

# def _calculate_max_output_tokens(self, prompt: LanguageModelInput) -> int:
# # NOTE: This method can be used for calculating the maximum tokens for the stream,
# # but it isn't used in practice due to the computational cost of counting tokens
# # and because LLM providers automatically cut off at the maximum output.
# # The implementation is kept for potential future use or debugging purposes.

# # Get max input tokens for the model
# max_context_tokens = get_max_input_tokens(
# model_name=self.config.model_name, model_provider=self.config.model_provider
# )

# llm_tokenizer = get_tokenizer(
# model_name=self.config.model_name,
# provider_type=self.config.model_provider,
# )
# # Calculate tokens in the input prompt
# input_tokens = sum(len(llm_tokenizer.encode(str(m))) for m in prompt)

# # Calculate available tokens for output
# available_output_tokens = max_context_tokens - input_tokens

# # Return the lesser of available tokens or configured max
# return min(self._max_output_tokens, available_output_tokens)

def _completion(
self,
prompt: LanguageModelInput,
@@ -259,9 +292,6 @@ def _completion(
stream=stream,
# model params
temperature=self._temperature,
max_tokens=self._max_output_tokens
if self._max_output_tokens > 0
else None,
timeout=self._timeout,
# For now, we don't support parallel tool calls
# NOTE: we can't pass this in if tools are not specified
4 changes: 2 additions & 2 deletions backend/danswer/llm/custom_llm.py
@@ -8,7 +8,7 @@
from requests import Timeout

from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_NUM_RESERVED_OUTPUT_TOKENS
from danswer.llm.interfaces import LLM
from danswer.llm.interfaces import ToolChoiceOptions
from danswer.llm.utils import convert_lm_input_to_basic_string
@@ -38,7 +38,7 @@ def __init__(
api_key: str | None,
timeout: int,
endpoint: str | None = GEN_AI_API_ENDPOINT,
max_output_tokens: int = GEN_AI_MAX_OUTPUT_TOKENS,
max_output_tokens: int = GEN_AI_NUM_RESERVED_OUTPUT_TOKENS,
):
if not endpoint:
raise ValueError(
63 changes: 56 additions & 7 deletions backend/danswer/llm/utils.py
@@ -30,10 +30,10 @@
from litellm.exceptions import UnprocessableEntityError # type: ignore

from danswer.configs.constants import MessageType
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_MAX_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_DEFAULT_MAX_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_PROVIDER
from danswer.configs.model_configs import GEN_AI_NUM_RESERVED_OUTPUT_TOKENS
from danswer.db.models import ChatMessage
from danswer.file_store.models import ChatFileType
from danswer.file_store.models import InMemoryChatFile
@@ -336,31 +336,80 @@ def get_llm_max_tokens(
"""Best effort attempt to get the max tokens for the LLM"""
if GEN_AI_MAX_TOKENS:
# This is an override, so always return this
logger.info(f"Using override GEN_AI_MAX_TOKENS: {GEN_AI_MAX_TOKENS}")
return GEN_AI_MAX_TOKENS

try:
model_obj = model_map.get(f"{model_provider}/{model_name}")
if not model_obj:
model_obj = model_map[model_name]
logger.debug(f"Using model object for {model_name}")
else:
logger.debug(f"Using model object for {model_provider}/{model_name}")

if "max_input_tokens" in model_obj:
return model_obj["max_input_tokens"]
max_tokens = model_obj["max_input_tokens"]
logger.info(
f"Max tokens for {model_name}: {max_tokens} (from max_input_tokens)"
)
return max_tokens

if "max_tokens" in model_obj:
return model_obj["max_tokens"]
max_tokens = model_obj["max_tokens"]
logger.info(f"Max tokens for {model_name}: {max_tokens} (from max_tokens)")
return max_tokens

logger.error(f"No max tokens found for LLM: {model_name}")
raise RuntimeError("No max tokens found for LLM")
except Exception:
logger.exception(
f"Failed to get max tokens for LLM with name {model_name}. Defaulting to {GEN_AI_MODEL_DEFAULT_MAX_TOKENS}."
f"Failed to get max tokens for LLM with name {model_name}. Defaulting to {GEN_AI_MODEL_FALLBACK_MAX_TOKENS}."
)
return GEN_AI_MODEL_FALLBACK_MAX_TOKENS
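
# Illustrative lookup with a hypothetical litellm-style model map entry;
# "max_input_tokens" takes precedence over "max_tokens" when both are present:
#   model_map = {"openai/gpt-4": {"max_input_tokens": 8192, "max_tokens": 4096}}
#   get_llm_max_tokens(model_map=model_map, model_name="gpt-4",
#                      model_provider="openai")  # -> 8192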


def get_llm_max_output_tokens(
model_map: dict,
model_name: str,
model_provider: str = GEN_AI_MODEL_PROVIDER,
) -> int:
"""Best effort attempt to get the max output tokens for the LLM"""
try:
model_obj = model_map.get(f"{model_provider}/{model_name}")
if not model_obj:
model_obj = model_map[model_name]
logger.debug(f"Using model object for {model_name}")
else:
logger.debug(f"Using model object for {model_provider}/{model_name}")

if "max_output_tokens" in model_obj:
max_output_tokens = model_obj["max_output_tokens"]
logger.info(f"Max output tokens for {model_name}: {max_output_tokens}")
return max_output_tokens

# Fall back to a fraction of max_tokens if max_output_tokens is not specified
if "max_tokens" in model_obj:
max_output_tokens = int(model_obj["max_tokens"] * 0.1)
logger.info(
f"Fallback max output tokens for {model_name}: {max_output_tokens} (10% of max_tokens)"
)
return max_output_tokens

logger.error(f"No max output tokens found for LLM: {model_name}")
raise RuntimeError("No max output tokens found for LLM")
except Exception:
default_output_tokens = int(GEN_AI_MODEL_FALLBACK_MAX_TOKENS)
logger.exception(
f"Failed to get max output tokens for LLM with name {model_name}. "
f"Defaulting to {default_output_tokens} (fallback max tokens)."
)
return GEN_AI_MODEL_DEFAULT_MAX_TOKENS
return default_output_tokens
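
# Illustrative call, again with a hypothetical map entry: when only
# "max_tokens" is present, the function falls back to 10% of it.
#   model_map = {"openai/gpt-4": {"max_tokens": 8192}}  # no max_output_tokens
#   get_llm_max_output_tokens(model_map=model_map, model_name="gpt-4",
#                             model_provider="openai")  # -> int(8192 * 0.1) = 819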


def get_max_input_tokens(
model_name: str,
model_provider: str,
output_tokens: int = GEN_AI_MAX_OUTPUT_TOKENS,
output_tokens: int = GEN_AI_NUM_RESERVED_OUTPUT_TOKENS,
) -> int:
# NOTE: we previously used `litellm.get_max_tokens()`, but despite the name, this actually
# returns the max OUTPUT tokens. Under the hood, this uses the `litellm.model_cost` dict,