This repository has been archived by the owner on Dec 11, 2024. It is now read-only.

Merge remote-tracking branch 'upstream/main' into chore/merge-upstream-2024082901
onimsha committed Aug 30, 2024
2 parents 5f8c2df + 1734a4a commit 1620862
Showing 9 changed files with 191 additions and 29 deletions.
21 changes: 20 additions & 1 deletion backend/alembic/env.py
@@ -8,6 +8,7 @@
from sqlalchemy.engine import Connection
from sqlalchemy.ext.asyncio import create_async_engine
from celery.backends.database.session import ResultModelBase # type: ignore
from sqlalchemy.schema import SchemaItem

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
@@ -29,6 +30,20 @@
# my_important_option = config.get_main_option("my_important_option")
# ... etc.

EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}


def include_object(
object: SchemaItem,
name: str,
type_: str,
reflected: bool,
compare_to: SchemaItem | None,
) -> bool:
if type_ == "table" and name in EXCLUDE_TABLES:
return False
return True


def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
@@ -55,7 +70,11 @@ def run_migrations_offline() -> None:


def do_run_migrations(connection: Connection) -> None:
context.configure(connection=connection, target_metadata=target_metadata) # type: ignore
context.configure(
connection=connection,
target_metadata=target_metadata, # type: ignore
include_object=include_object,
) # type: ignore

with context.begin_transaction():
context.run_migrations()
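
For illustration, the same exclusion could also be applied to offline migrations. A minimal sketch, assuming run_migrations_offline() configures the Alembic context in the standard way (this function body is hypothetical, not part of the commit):

def run_migrations_offline() -> None:
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        include_object=include_object,  # mirrors the online path above
    )
    with context.begin_transaction():
        context.run_migrations()
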
12 changes: 12 additions & 0 deletions backend/danswer/configs/danswerbot_configs.py
@@ -73,3 +73,15 @@
DANSWER_BOT_REPHRASE_MESSAGE = (
os.environ.get("DANSWER_BOT_REPHRASE_MESSAGE", "").lower() == "true"
)

# DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD is the number of
# responses DanswerBot can send in a given time period.
# Set to 0 to disable the limit.
DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD = int(
os.environ.get("DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD", "5000")
)
# DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS is the number
# of seconds until the response limit is reset.
DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS = int(
os.environ.get("DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS", "86400")
)
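
For illustration, the new limits could be tuned through the environment before the bot starts; the values below are hypothetical, not the committed defaults of 5000 responses per 86400 seconds:

import os

# Hypothetical tuning: cap DanswerBot at 1000 responses per one-hour window.
os.environ["DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD"] = "1000"
os.environ["DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS"] = "3600"
# Setting DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD to "0" disables the limit.
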
9 changes: 7 additions & 2 deletions backend/danswer/configs/model_configs.py
@@ -80,11 +80,16 @@
GEN_AI_LLM_PROVIDER_TYPE = os.environ.get("GEN_AI_LLM_PROVIDER_TYPE") or None
# Override the auto-detection of LLM max context length
GEN_AI_MAX_TOKENS = int(os.environ.get("GEN_AI_MAX_TOKENS") or 0) or None

# Set this to be enough for an answer + quotes. Also used for Chat
GEN_AI_MAX_OUTPUT_TOKENS = int(os.environ.get("GEN_AI_MAX_OUTPUT_TOKENS") or 1024)
# This is the minimum token context we will leave for the LLM to generate an answer
GEN_AI_NUM_RESERVED_OUTPUT_TOKENS = int(
os.environ.get("GEN_AI_NUM_RESERVED_OUTPUT_TOKENS") or 1024
)

# Typically, GenAI models nowadays are at least 4K tokens
GEN_AI_MODEL_DEFAULT_MAX_TOKENS = 4096
GEN_AI_MODEL_FALLBACK_MAX_TOKENS = 4096

# Number of tokens from chat history to include at maximum
# 3000 should be enough context regardless of use, no need to include as much as possible
# as this drives up the cost unnecessarily
16 changes: 15 additions & 1 deletion backend/danswer/danswerbot/slack/listener.py
@@ -38,6 +38,7 @@
from danswer.danswerbot.slack.handlers.handle_message import schedule_feedback_reminder
from danswer.danswerbot.slack.models import SlackMessageInfo
from danswer.danswerbot.slack.tokens import fetch_tokens
from danswer.danswerbot.slack.utils import check_message_limit
from danswer.danswerbot.slack.utils import decompose_action_id
from danswer.danswerbot.slack.utils import get_channel_name_from_id
from danswer.danswerbot.slack.utils import get_danswer_bot_app_id
@@ -130,9 +131,19 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool

if event_type == "message":
bot_tag_id = get_danswer_bot_app_id(client.web_client)

is_dm = event.get("channel_type") == "im"
is_tagged = bot_tag_id and bot_tag_id in msg
is_danswer_bot_msg = bot_tag_id and bot_tag_id in event.get("user", "")

# DanswerBot should never respond to itself
if is_danswer_bot_msg:
logger.info("Ignoring message from DanswerBot")
return False

# DMs with the bot don't include the @DanswerBot tag, so we still have to
# handle the events caught via the events_api
if bot_tag_id and bot_tag_id in msg and event.get("channel_type") != "im":
if is_tagged and not is_dm:
# Let the tag flow handle this case, don't reply twice
return False

@@ -200,6 +211,9 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool
)
return False

if not check_message_limit():
return False

logger.debug(f"Handling Slack request with Payload: '{req.payload}'")
return True

49 changes: 41 additions & 8 deletions backend/danswer/danswerbot/slack/utils.py
@@ -21,6 +21,12 @@
from danswer.configs.danswerbot_configs import DANSWER_BOT_MAX_QPM
from danswer.configs.danswerbot_configs import DANSWER_BOT_MAX_WAIT_TIME
from danswer.configs.danswerbot_configs import DANSWER_BOT_NUM_RETRIES
from danswer.configs.danswerbot_configs import (
DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD,
)
from danswer.configs.danswerbot_configs import (
DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS,
)
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import SlackTextCleaner
from danswer.danswerbot.slack.constants import FeedbackVisibility
@@ -41,7 +47,41 @@
logger = setup_logger()


DANSWER_BOT_APP_ID: str | None = None
_DANSWER_BOT_APP_ID: str | None = None
_DANSWER_BOT_MESSAGE_COUNT: int = 0
_DANSWER_BOT_COUNT_START_TIME: float = time.time()


def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global _DANSWER_BOT_APP_ID
if _DANSWER_BOT_APP_ID is None:
_DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return _DANSWER_BOT_APP_ID


def check_message_limit() -> bool:
"""
This isn't a perfect solution:
high traffic at the end of one period and the start of the next could
cause the limit to be exceeded.
"""
if DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD == 0:
return True
global _DANSWER_BOT_MESSAGE_COUNT
global _DANSWER_BOT_COUNT_START_TIME
time_since_start = time.time() - _DANSWER_BOT_COUNT_START_TIME
if time_since_start > DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS:
_DANSWER_BOT_MESSAGE_COUNT = 0
_DANSWER_BOT_COUNT_START_TIME = time.time()
if (_DANSWER_BOT_MESSAGE_COUNT + 1) > DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD:
logger.error(
f"DanswerBot has reached the message limit {DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD}"
f" for the time period {DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS} seconds."
" These limits are configurable in backend/danswer/configs/danswerbot_configs.py"
)
return False
_DANSWER_BOT_MESSAGE_COUNT += 1
return True


def rephrase_slack_message(msg: str) -> str:
@@ -96,13 +136,6 @@ def update_emote_react(
logger.error(f"Was not able to react to user message due to: {e}")


def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global DANSWER_BOT_APP_ID
if DANSWER_BOT_APP_ID is None:
DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return DANSWER_BOT_APP_ID


def remove_danswer_bot_tag(message_str: str, client: WebClient) -> str:
bot_tag_id = get_danswer_bot_app_id(web_client=client)
return re.sub(rf"<@{bot_tag_id}>\s", "", message_str)
42 changes: 36 additions & 6 deletions backend/danswer/llm/chat_llm.py
@@ -28,7 +28,6 @@
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
from danswer.configs.model_configs import GEN_AI_API_VERSION
from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_TEMPERATURE
from danswer.llm.interfaces import LLM
from danswer.llm.interfaces import LLMConfig
@@ -193,10 +192,10 @@ def __init__(
timeout: int,
model_provider: str,
model_name: str,
max_output_tokens: int | None = None,
api_base: str | None = GEN_AI_API_ENDPOINT,
api_version: str | None = GEN_AI_API_VERSION,
custom_llm_provider: str | None = GEN_AI_LLM_PROVIDER_TYPE,
max_output_tokens: int = GEN_AI_MAX_OUTPUT_TOKENS,
temperature: float = GEN_AI_TEMPERATURE,
custom_config: dict[str, str] | None = None,
extra_headers: dict[str, str] | None = None,
@@ -209,7 +208,17 @@ def __init__(
self._api_base = api_base
self._api_version = api_version
self._custom_llm_provider = custom_llm_provider
self._max_output_tokens = max_output_tokens

# This can be used to store the maximum output tokens for this model.
# self._max_output_tokens = (
# max_output_tokens
# if max_output_tokens is not None
# else get_llm_max_output_tokens(
# model_map=litellm.model_cost,
# model_name=model_name,
# model_provider=model_provider,
# )
# )
self._custom_config = custom_config

# NOTE: have to set these as environment variables for Litellm since
@@ -228,6 +237,30 @@ def __init__(
def log_model_configs(self) -> None:
logger.debug(f"Config: {self.config}")

# def _calculate_max_output_tokens(self, prompt: LanguageModelInput) -> int:
# # NOTE: This method can be used for calculating the maximum tokens for the stream,
# # but it isn't used in practice due to the computational cost of counting tokens
# # and because LLM providers automatically cut off at the maximum output.
# # The implementation is kept for potential future use or debugging purposes.

# # Get max input tokens for the model
# max_context_tokens = get_max_input_tokens(
# model_name=self.config.model_name, model_provider=self.config.model_provider
# )

# llm_tokenizer = get_tokenizer(
# model_name=self.config.model_name,
# provider_type=self.config.model_provider,
# )
# # Calculate tokens in the input prompt
# input_tokens = sum(len(llm_tokenizer.encode(str(m))) for m in prompt)

# # Calculate available tokens for output
# available_output_tokens = max_context_tokens - input_tokens

# # Return the lesser of available tokens or configured max
# return min(self._max_output_tokens, available_output_tokens)

def _completion(
self,
prompt: LanguageModelInput,
@@ -259,9 +292,6 @@ def _completion(
stream=stream,
# model params
temperature=self._temperature,
max_tokens=self._max_output_tokens
if self._max_output_tokens > 0
else None,
timeout=self._timeout,
# For now, we don't support parallel tool calls
# NOTE: we can't pass this in if tools are not specified
4 changes: 2 additions & 2 deletions backend/danswer/llm/custom_llm.py
@@ -8,7 +8,7 @@
from requests import Timeout

from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_NUM_RESERVED_OUTPUT_TOKENS
from danswer.llm.interfaces import LLM
from danswer.llm.interfaces import ToolChoiceOptions
from danswer.llm.utils import convert_lm_input_to_basic_string
@@ -38,7 +38,7 @@ def __init__(
api_key: str | None,
timeout: int,
endpoint: str | None = GEN_AI_API_ENDPOINT,
max_output_tokens: int = GEN_AI_MAX_OUTPUT_TOKENS,
max_output_tokens: int = GEN_AI_NUM_RESERVED_OUTPUT_TOKENS,
):
if not endpoint:
raise ValueError(
63 changes: 56 additions & 7 deletions backend/danswer/llm/utils.py
@@ -30,10 +30,10 @@
from litellm.exceptions import UnprocessableEntityError # type: ignore

from danswer.configs.constants import MessageType
from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
from danswer.configs.model_configs import GEN_AI_MAX_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_DEFAULT_MAX_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
from danswer.configs.model_configs import GEN_AI_MODEL_PROVIDER
from danswer.configs.model_configs import GEN_AI_NUM_RESERVED_OUTPUT_TOKENS
from danswer.db.models import ChatMessage
from danswer.file_store.models import ChatFileType
from danswer.file_store.models import InMemoryChatFile
@@ -336,31 +336,80 @@ def get_llm_max_tokens(
"""Best effort attempt to get the max tokens for the LLM"""
if GEN_AI_MAX_TOKENS:
# This is an override, so always return this
logger.info(f"Using override GEN_AI_MAX_TOKENS: {GEN_AI_MAX_TOKENS}")
return GEN_AI_MAX_TOKENS

try:
model_obj = model_map.get(f"{model_provider}/{model_name}")
if not model_obj:
model_obj = model_map[model_name]
logger.debug(f"Using model object for {model_name}")
else:
logger.debug(f"Using model object for {model_provider}/{model_name}")

if "max_input_tokens" in model_obj:
return model_obj["max_input_tokens"]
max_tokens = model_obj["max_input_tokens"]
logger.info(
f"Max tokens for {model_name}: {max_tokens} (from max_input_tokens)"
)
return max_tokens

if "max_tokens" in model_obj:
return model_obj["max_tokens"]
max_tokens = model_obj["max_tokens"]
logger.info(f"Max tokens for {model_name}: {max_tokens} (from max_tokens)")
return max_tokens

logger.error(f"No max tokens found for LLM: {model_name}")
raise RuntimeError("No max tokens found for LLM")
except Exception:
logger.exception(
f"Failed to get max tokens for LLM with name {model_name}. Defaulting to {GEN_AI_MODEL_DEFAULT_MAX_TOKENS}."
f"Failed to get max tokens for LLM with name {model_name}. Defaulting to {GEN_AI_MODEL_FALLBACK_MAX_TOKENS}."
)
return GEN_AI_MODEL_FALLBACK_MAX_TOKENS
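
# Illustrative lookup with a hypothetical litellm-style model map entry;
# "max_input_tokens" takes precedence over "max_tokens" when both are present:
#   model_map = {"openai/gpt-4": {"max_input_tokens": 8192, "max_tokens": 4096}}
#   get_llm_max_tokens(model_map=model_map, model_name="gpt-4",
#                      model_provider="openai")  # -> 8192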


def get_llm_max_output_tokens(
model_map: dict,
model_name: str,
model_provider: str = GEN_AI_MODEL_PROVIDER,
) -> int:
"""Best effort attempt to get the max output tokens for the LLM"""
try:
model_obj = model_map.get(f"{model_provider}/{model_name}")
if not model_obj:
model_obj = model_map[model_name]
logger.debug(f"Using model object for {model_name}")
else:
logger.debug(f"Using model object for {model_provider}/{model_name}")

if "max_output_tokens" in model_obj:
max_output_tokens = model_obj["max_output_tokens"]
logger.info(f"Max output tokens for {model_name}: {max_output_tokens}")
return max_output_tokens

# Fall back to a fraction of max_tokens if max_output_tokens is not specified
if "max_tokens" in model_obj:
max_output_tokens = int(model_obj["max_tokens"] * 0.1)
logger.info(
f"Fallback max output tokens for {model_name}: {max_output_tokens} (10% of max_tokens)"
)
return max_output_tokens

logger.error(f"No max output tokens found for LLM: {model_name}")
raise RuntimeError("No max output tokens found for LLM")
except Exception:
default_output_tokens = int(GEN_AI_MODEL_FALLBACK_MAX_TOKENS)
logger.exception(
f"Failed to get max output tokens for LLM with name {model_name}. "
f"Defaulting to {default_output_tokens} (fallback max tokens)."
)
return GEN_AI_MODEL_DEFAULT_MAX_TOKENS
return default_output_tokens
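
# Illustrative call, again with a hypothetical map entry: when only
# "max_tokens" is present, the function falls back to 10% of it.
#   model_map = {"openai/gpt-4": {"max_tokens": 8192}}  # no max_output_tokens
#   get_llm_max_output_tokens(model_map=model_map, model_name="gpt-4",
#                             model_provider="openai")  # -> int(8192 * 0.1) = 819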


def get_max_input_tokens(
model_name: str,
model_provider: str,
output_tokens: int = GEN_AI_MAX_OUTPUT_TOKENS,
output_tokens: int = GEN_AI_NUM_RESERVED_OUTPUT_TOKENS,
) -> int:
# NOTE: we previously used `litellm.get_max_tokens()`, but despite the name, this actually
# returns the max OUTPUT tokens. Under the hood, this uses the `litellm.model_cost` dict,