diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py
index b1f1dc1d2df..04033b05f3f 100644
--- a/backend/danswer/connectors/freshdesk/connector.py
+++ b/backend/danswer/connectors/freshdesk/connector.py
@@ -20,76 +20,90 @@
 
 logger = setup_logger()
 
+_FRESHDESK_ID_PREFIX = "FRESHDESK_"
+
+
+_TICKET_FIELDS_TO_INCLUDE = {
+    "fr_escalated",
+    "spam",
+    "priority",
+    "source",
+    "status",
+    "type",
+    "is_escalated",
+    "tags",
+    "nr_due_by",
+    "nr_escalated",
+    "cc_emails",
+    "fwd_emails",
+    "reply_cc_emails",
+    "ticket_cc_emails",
+    "support_email",
+    "to_emails",
+}
+
+_SOURCE_NUMBER_TYPE_MAP = {
+    "1": "Email",
+    "2": "Portal",
+    "3": "Phone",
+    "7": "Chat",
+    "9": "Feedback Widget",
+    "10": "Outbound Email",
+}
+
+_PRIORITY_NUMBER_TYPE_MAP = {"1": "low", "2": "medium", "3": "high", "4": "urgent"}
+
+_STATUS_NUMBER_TYPE_MAP = {"2": "open", "3": "pending", "4": "resolved", "5": "closed"}
+
 
 def _create_metadata_from_ticket(ticket: dict) -> dict:
-    included_fields = {
-        "fr_escalated",
-        "spam",
-        "priority",
-        "source",
-        "status",
-        "type",
-        "is_escalated",
-        "tags",
-        "nr_due_by",
-        "nr_escalated",
-        "cc_emails",
-        "fwd_emails",
-        "reply_cc_emails",
-        "ticket_cc_emails",
-        "support_email",
-        "to_emails",
-    }
-
-    metadata = {}
-    email_data = {}
+    metadata: dict[str, str | list[str]] = {}
+    # Combine all emails into a set so there are no repeated emails
+    email_data: set[str] = set()
 
     for key, value in ticket.items():
-        if (
-            key in included_fields
-            and value is not None
-            and value != []
-            and value != {}
-            and value != "[]"
-            and value != ""
-        ):
-            value_to_str = (
-                [str(item) for item in value] if isinstance(value, List) else str(value)
-            )
-            if "email" in key:
-                email_data[key] = value_to_str
+        # Skip fields that aren't useful for embedding
+        if key not in _TICKET_FIELDS_TO_INCLUDE:
+            continue
+
+        # Skip empty fields
+        if not value or value == "[]":
+            continue
+
+        # Convert strings or lists to strings
+        stringified_value: str | list[str]
+        if isinstance(value, list):
+            stringified_value = [str(item) for item in value]
+        else:
+            stringified_value = str(value)
+
+        if "email" in key:
+            if isinstance(stringified_value, list):
+                email_data.update(stringified_value)
             else:
-                metadata[key] = value_to_str
+                email_data.add(stringified_value)
+        else:
+            metadata[key] = stringified_value
 
     if email_data:
-        metadata["email_data"] = str(email_data)
-
-    # Convert source to human-parsable string
-    source_types = {
-        "1": "Email",
-        "2": "Portal",
-        "3": "Phone",
-        "7": "Chat",
-        "9": "Feedback Widget",
-        "10": "Outbound Email",
-    }
-    if ticket.get("source"):
-        metadata["source"] = source_types.get(
-            str(ticket.get("source")), "Unknown Source Type"
+        metadata["emails"] = sorted(email_data)
+
+    # Convert source numbers to human-parsable string
+    if source_number := ticket.get("source"):
+        metadata["source"] = _SOURCE_NUMBER_TYPE_MAP.get(
+            str(source_number), "Unknown Source Type"
         )
 
-    # Convert priority to human-parsable string
-    priority_types = {"1": "low", "2": "medium", "3": "high", "4": "urgent"}
-    if ticket.get("priority"):
-        metadata["priority"] = priority_types.get(
-            str(ticket.get("priority")), "Unknown Priority"
+    # Convert priority numbers to human-parsable string
+    if priority_number := ticket.get("priority"):
+        metadata["priority"] = _PRIORITY_NUMBER_TYPE_MAP.get(
+            str(priority_number), "Unknown Priority"
         )
 
     # Convert status to human-parsable string
-    status_types = {"2": "open", "3": "pending", "4": "resolved", "5": "closed"}
-    if ticket.get("status"):
-        metadata["status"] = status_types.get(
-            str(ticket.get("status")), "Unknown Status"
+    if status_number := ticket.get("status"):
+        metadata["status"] = _STATUS_NUMBER_TYPE_MAP.get(
+            str(status_number), "Unknown Status"
         )
 
     due_by = datetime.fromisoformat(ticket["due_by"].replace("Z", "+00:00"))
@@ -99,17 +113,24 @@ def _create_metadata_from_ticket(ticket: dict) -> dict:
 
 
 def _create_doc_from_ticket(ticket: dict, domain: str) -> Document:
+    # Use the ticket description as the text
+    text = f"Ticket description: {parse_html_page_basic(ticket.get('description_text', ''))}"
+    metadata = _create_metadata_from_ticket(ticket)
+
+    # This is also used in the ID because it is more unique than just the ticket ID
+    link = f"https://{domain}.freshdesk.com/helpdesk/tickets/{ticket['id']}"
+
     return Document(
-        id=str(ticket["id"]),
+        id=_FRESHDESK_ID_PREFIX + link,
         sections=[
             Section(
-                link=f"https://{domain}.freshdesk.com/helpdesk/tickets/{int(ticket['id'])}",
-                text=f"description: {parse_html_page_basic(ticket.get('description_text', ''))}",
+                link=link,
+                text=text,
             )
         ],
         source=DocumentSource.FRESHDESK,
         semantic_identifier=ticket["subject"],
-        metadata=_create_metadata_from_ticket(ticket),
+        metadata=metadata,
         doc_updated_at=datetime.fromisoformat(
             ticket["updated_at"].replace("Z", "+00:00")
         ),
@@ -146,7 +167,7 @@ def _fetch_tickets(
         'include' field available for this endpoint:
         https://developers.freshdesk.com/api/#filter_tickets
         """
-        if any(attr is None for attr in [self.api_key, self.domain, self.password]):
+        if self.api_key is None or self.domain is None or self.password is None:
             raise ConnectorMissingCredentialError("freshdesk")
 
         base_url = f"https://{self.domain}.freshdesk.com/api/v2/tickets"
@@ -187,7 +208,6 @@ def _process_tickets(
 
         for ticket_batch in self._fetch_tickets(start, end):
             for ticket in ticket_batch:
-                logger.info(_create_doc_from_ticket(ticket, self.domain))
                 doc_batch.append(_create_doc_from_ticket(ticket, self.domain))
 
                 if len(doc_batch) >= self.batch_size: