Skip to content

Commit

Permalink
standardized escaping of CQL strings
Browse files Browse the repository at this point in the history
  • Loading branch information
hagen-danswer committed Nov 20, 2024
1 parent 00f8ba1 commit 022d8e6
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 7 deletions.
14 changes: 8 additions & 6 deletions backend/danswer/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import quote

from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
Expand All @@ -12,6 +11,7 @@
from danswer.connectors.confluence.utils import build_confluence_client
from danswer.connectors.confluence.utils import build_confluence_document_id
from danswer.connectors.confluence.utils import datetime_from_string
from danswer.connectors.confluence.utils import escape_cql_string
from danswer.connectors.confluence.utils import extract_text_from_confluence_html
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
Expand Down Expand Up @@ -84,20 +84,22 @@ def __init__(
elif page_id:
# if a cql_query is not provided, we will use the page_id to fetch the page
if index_recursively:
cql_page_query += f" and ancestor='{page_id}'"
cql_page_query += f" and ancestor='{escape_cql_string(page_id)}'"
else:
cql_page_query += f" and id='{page_id}'"
cql_page_query += f" and id='{escape_cql_string(page_id)}'"
elif space:
# if no cql_query or page_id is provided, we will use the space to fetch the pages
cql_page_query += f" and space='{quote(space)}'"
cql_page_query += f" and space='{escape_cql_string(space)}'"

self.cql_page_query = cql_page_query
self.cql_time_filter = ""

self.cql_label_filter = ""
if labels_to_skip:
labels_to_skip = list(set(labels_to_skip))
comma_separated_labels = ",".join(f"'{label}'" for label in labels_to_skip)
comma_separated_labels = ",".join(
f"'{escape_cql_string(label)}'" for label in labels_to_skip
)
self.cql_label_filter = f" and label not in ({comma_separated_labels})"

def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
Expand Down Expand Up @@ -217,7 +219,7 @@ def _fetch_document_batches(self) -> GenerateDocumentsOutput:

# Fetch attachments as Documents
for confluence_page_id in confluence_page_ids:
attachment_cql = f"type=attachment and container='{confluence_page_id}'"
attachment_cql = f"type=attachment and container='{escape_cql_string(confluence_page_id)}'"
attachment_cql += self.cql_label_filter
# TODO: maybe should add time filter as well?
for attachments in self.confluence_client.paginated_cql_page_retrieval(
Expand Down
24 changes: 23 additions & 1 deletion backend/danswer/connectors/confluence/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@
_USER_EMAIL_CACHE: dict[str, str | None] = {}


# Translation table built once at import time. Each key is a single character
# that is rewritten to its escaped form when embedding a value in a CQL query.
_CQL_ESCAPE_TABLE = str.maketrans(
    {
        "\\": "\\\\",  # Backslashes are doubled
        "'": "''",  # Single quotes are doubled
        '"': '\\"',  # Double quotes are backslash-escaped
        "[": "\\[",  # Square brackets are backslash-escaped
        "]": "\\]",
        # NOTE(review): "%" and "_" are escaped as LIKE-style wildcards;
        # confirm this is also correct for values compared with "=".
        "%": "\\%",
        "_": "\\_",
    }
)


def escape_cql_string(value: str) -> str:
    """Escape special characters in a string used inside a CQL query.

    Backslashes and single quotes are doubled; double quotes, square
    brackets, ``%`` and ``_`` are each prefixed with a backslash. All other
    characters are returned unchanged.

    Args:
        value: The raw string to embed in a quoted CQL literal.

    Returns:
        The escaped string.
    """
    # A single translate pass rewrites every original character exactly once,
    # so backslashes introduced by one escape can never be escaped again.
    # This removes the ordering dependency that chained replace() calls have.
    return value.translate(_CQL_ESCAPE_TABLE)


def get_user_email_from_username__server(
confluence_client: OnyxConfluence, user_name: str
) -> str | None:
Expand Down Expand Up @@ -108,7 +128,9 @@ def extract_text_from_confluence_html(
if not page_title:
continue

page_query = f"type=page and title='{page_title}'"
# Escape CQL special characters in the title to prevent CQL syntax errors
escaped_page_title = escape_cql_string(page_title)
page_query = f"type=page and title='{escaped_page_title}'"

page_contents: dict[str, Any] | None = None
# Confluence enforces title uniqueness, so we should only get one result here
Expand Down

0 comments on commit 022d8e6

Please sign in to comment.