Skip to content

Commit

Permalink
Option to have very verbose LLM logging (#500)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Oct 3, 2023
1 parent ab65b19 commit c2721c7
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 41 deletions.
2 changes: 0 additions & 2 deletions backend/danswer/chat/chat_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def _parse_embedded_json_streamed_response(
yield DanswerAnswerPiece(answer_piece=hold)
hold = ""

logger.debug(model_output)

model_final = extract_embedded_json(model_output)
if "action" not in model_final or "action_input" not in model_final:
raise ValueError("Model did not provide all required action values")
Expand Down
10 changes: 6 additions & 4 deletions backend/danswer/chat/chat_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@

# System message instructing the LLM to condense a chat history into a single
# standalone question suitable for semantic / natural-language search.
# NOTE(review): the diff scrape left both the pre- and post-commit string
# fragments in place; this keeps only the post-commit wording.
DANSWER_SYSTEM_MSG = (
    "Given a conversation (between Human and Assistant) and a final message from Human, "
    "rewrite the last message to be a standalone question which captures required/relevant context "
    "from previous messages. This question must be useful for a semantic search engine. "
    "It is used for a natural language search."
)

TOOL_TEMPLATE = """
Expand Down Expand Up @@ -181,8 +182,9 @@ def build_combined_query(
combined_query_msgs.append(
HumanMessage(
content=(
"Help me rewrite this final query into a standalone question that takes into consideration the "
f"past messages of the conversation. You must ONLY return the rewritten query and nothing else."
"Help me rewrite this final message into a standalone query that takes into consideration the "
f"past messages of the conversation if relevant. This query is used with a semantic search engine to "
f"retrieve documents. You must ONLY return the rewritten query and nothing else."
f"\n\nQuery:\n{query_message.message}"
)
)
Expand Down
5 changes: 5 additions & 0 deletions backend/danswer/configs/app_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,11 @@
# each worker loads the embedding models into memory.
NUM_INDEXING_WORKERS = int(os.environ.get("NUM_INDEXING_WORKERS") or 1)

# Verbose debug switch: when the LOG_ALL_MODEL_INTERACTIONS env var is set to
# "true" (case-insensitive), every model prompt and raw output is logged.
# Mostly used for development or exploration purposes.
_log_all_interactions_raw = os.environ.get("LOG_ALL_MODEL_INTERACTIONS", "")
LOG_ALL_MODEL_INTERACTIONS = _log_all_interactions_raw.lower() == "true"


#####
# Danswer Slack Bot Configs
Expand Down
70 changes: 38 additions & 32 deletions backend/danswer/connectors/hubspot/connector.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import requests
import json
from typing import Any
from datetime import datetime
from hubspot import HubSpot
from typing import Any

import requests
from hubspot import HubSpot # type: ignore

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
Expand All @@ -19,19 +20,22 @@

logger = setup_logger()


class HubSpotConnector(LoadConnector, PollConnector):
def __init__(self, batch_size: int = INDEX_BATCH_SIZE, access_token: str | None = None) -> None:
def __init__(
self, batch_size: int = INDEX_BATCH_SIZE, access_token: str | None = None
) -> None:
self.batch_size = batch_size
self.access_token = access_token
self.portal_id = None
self.portal_id: str | None = None
self.ticket_base_url = HUBSPOT_BASE_URL

def get_portal_id(self) -> str:
headers = {
'Authorization': f'Bearer {self.access_token}',
'Content-Type': 'application/json'
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json",
}

response = requests.get(HUBSPOT_API_URL, headers=headers)
if response.status_code != 200:
raise Exception("Error fetching portal ID")
Expand All @@ -41,7 +45,7 @@ def get_portal_id(self) -> str:

def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.access_token = credentials["hubspot_access_token"]

if self.access_token:
self.portal_id = self.get_portal_id()
self.ticket_base_url = f"{HUBSPOT_BASE_URL}{self.portal_id}/ticket/"
Expand All @@ -53,44 +57,48 @@ def _process_tickets(
) -> GenerateDocumentsOutput:
if self.access_token is None:
raise ConnectorMissingCredentialError("HubSpot")

api_client = HubSpot(access_token=self.access_token)
all_tickets = api_client.crm.tickets.get_all(associations=['contacts', 'notes'])
all_tickets = api_client.crm.tickets.get_all(associations=["contacts", "notes"])

doc_batch: list[Document] = []

for ticket in all_tickets:
updated_at = ticket.updated_at.replace(tzinfo=None)
if start is not None and updated_at < start:
continue
if end is not None and updated_at > end:
continue

title = ticket.properties["subject"]
link = self.ticket_base_url + ticket.id
content_text = title + "\n" + ticket.properties["content"]

associated_emails = []
associated_notes = []
associated_emails: list[str] = []
associated_notes: list[str] = []

if ticket.associations:
contacts = ticket.associations.get("contacts")
notes = ticket.associations.get("notes")

if contacts:
for contact in contacts.results:
contact = api_client.crm.contacts.basic_api.get_by_id(contact_id=contact.id)
contact = api_client.crm.contacts.basic_api.get_by_id(
contact_id=contact.id
)
associated_emails.append(contact.properties["email"])

if notes:
if notes:
for note in notes.results:
note = api_client.crm.objects.notes.basic_api.get_by_id(note_id=note.id, properties=["content", "hs_body_preview"])
note = api_client.crm.objects.notes.basic_api.get_by_id(
note_id=note.id, properties=["content", "hs_body_preview"]
)
associated_notes.append(note.properties["hs_body_preview"])

associated_emails = " ,".join(associated_emails)
associated_notes = " ".join(associated_notes)

content_text = f"{content_text}\n emails: {associated_emails} \n notes: {associated_notes}"
associated_emails_str = " ,".join(associated_emails)
associated_notes_str = " ".join(associated_notes)

content_text = f"{content_text}\n emails: {associated_emails_str} \n notes: {associated_notes_str}"

doc_batch.append(
Document(
Expand All @@ -101,7 +109,7 @@ def _process_tickets(
metadata={},
)
)

if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
Expand All @@ -120,19 +128,17 @@ def poll_source(
return self._process_tickets(start_datetime, end_datetime)



if __name__ == "__main__":
    # Ad-hoc manual test: index all HubSpot tickets, then poll for tickets
    # updated within the last day. Requires HUBSPOT_ACCESS_TOKEN in the env.
    # NOTE(review): the diff scrape contained both the pre- and post-commit
    # form of the load_credentials call; only the post-commit form is kept.
    import os
    import time

    test_connector = HubSpotConnector()
    test_connector.load_credentials(
        {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
    )
    all_docs = test_connector.load_from_state()

    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    latest_docs = test_connector.poll_source(one_day_ago, current)
    print(latest_docs)


2 changes: 1 addition & 1 deletion backend/danswer/direct_qa/qa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def process_model_tokens(
yield DanswerAnswerPiece(answer_piece=hold_quote + token)
hold_quote = ""

logger.debug(f"Raw model output: {model_output}")
logger.debug(f"Raw Model QnA Output: {model_output}")

# for a JSON prompt, make sure that we're only passing through the "JSON part"
# since that is what `extract_quotes_from_completed_token_stream` expects
Expand Down
30 changes: 28 additions & 2 deletions backend/danswer/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from langchain.chat_models.base import BaseChatModel
from langchain.schema.language_model import LanguageModelInput

from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
from danswer.llm.utils import message_generator_to_string_generator
from danswer.utils.logger import setup_logger

Expand Down Expand Up @@ -40,10 +41,35 @@ def _log_model_config(self) -> None:
f"Model Class: {self.llm.__class__.__name__}, Model Config: {self.llm.__dict__}"
)

@staticmethod
def _log_prompt(prompt: LanguageModelInput) -> None:
    """Debug-log the prompt sent to the model.

    A list prompt is logged one message per entry (only the ``.content`` of
    each message); a plain-string prompt is logged whole. Other
    LanguageModelInput shapes are silently ignored, matching the original.
    """
    if isinstance(prompt, str):
        logger.debug(f"Prompt:\n{prompt}")
    elif isinstance(prompt, list):
        for idx, message in enumerate(prompt):
            logger.debug(f"Message {idx}:\n{message.content}")

def invoke(self, prompt: LanguageModelInput) -> str:
    """Run the wrapped LLM synchronously and return its text content.

    When LOG_ALL_MODEL_INTERACTIONS is enabled, both the prompt and the raw
    model output are debug-logged.
    """
    self._log_model_config()
    if LOG_ALL_MODEL_INTERACTIONS:
        self._log_prompt(prompt)

    response_text = self.llm.invoke(prompt).content
    if LOG_ALL_MODEL_INTERACTIONS:
        logger.debug(f"Raw Model Output:\n{response_text}")
    return response_text

def stream(self, prompt: LanguageModelInput) -> Iterator[str]:
    """Stream the wrapped LLM's output token-by-token.

    When LOG_ALL_MODEL_INTERACTIONS is enabled, the prompt is debug-logged
    up front and the full concatenated output is debug-logged once the
    stream is exhausted.

    Fix: the original unconditionally accumulated every token in a list even
    when verbose logging was disabled, holding the entire model output in
    memory for no reason. Tokens are now buffered only when they will
    actually be logged.
    """
    self._log_model_config()
    if LOG_ALL_MODEL_INTERACTIONS:
        self._log_prompt(prompt)

    token_stream = message_generator_to_string_generator(self.llm.stream(prompt))
    if not LOG_ALL_MODEL_INTERACTIONS:
        yield from token_stream
        return

    output_tokens: list[str] = []
    for token in token_stream:
        output_tokens.append(token)
        yield token

    full_output = "".join(output_tokens)
    logger.debug(f"Raw Model Output:\n{full_output}")

1 comment on commit c2721c7

@vercel
Copy link

@vercel vercel bot commented on c2721c7 Oct 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.