standardized escaping of CQL strings

danswer-ai · Nov 20, 2024 · 022d8e6 · 022d8e6
1 parent 00f8ba1
commit 022d8e6
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 7 deletions.
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
@@ -1,7 +1,6 @@
 from datetime import datetime
 from datetime import timezone
 from typing import Any
-from urllib.parse import quote
 
 from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
@@ -12,6 +11,7 @@
 from danswer.connectors.confluence.utils import build_confluence_client
 from danswer.connectors.confluence.utils import build_confluence_document_id
 from danswer.connectors.confluence.utils import datetime_from_string
+from danswer.connectors.confluence.utils import escape_cql_string
 from danswer.connectors.confluence.utils import extract_text_from_confluence_html
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import GenerateSlimDocumentOutput
@@ -84,20 +84,22 @@ def __init__(
         elif page_id:
             # if a cql_query is not provided, we will use the page_id to fetch the page
             if index_recursively:
-                cql_page_query += f" and ancestor='{page_id}'"
+                cql_page_query += f" and ancestor='{escape_cql_string(page_id)}'"
             else:
-                cql_page_query += f" and id='{page_id}'"
+                cql_page_query += f" and id='{escape_cql_string(page_id)}'"
         elif space:
             # if no cql_query or page_id is provided, we will use the space to fetch the pages
-            cql_page_query += f" and space='{quote(space)}'"
+            cql_page_query += f" and space='{escape_cql_string(space)}'"
 
         self.cql_page_query = cql_page_query
         self.cql_time_filter = ""
 
         self.cql_label_filter = ""
         if labels_to_skip:
             labels_to_skip = list(set(labels_to_skip))
-            comma_separated_labels = ",".join(f"'{label}'" for label in labels_to_skip)
+            comma_separated_labels = ",".join(
+                f"'{escape_cql_string(label)}'" for label in labels_to_skip
+            )
             self.cql_label_filter = f" and label not in ({comma_separated_labels})"
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -217,7 +219,7 @@ def _fetch_document_batches(self) -> GenerateDocumentsOutput:
 
         # Fetch attachments as Documents
         for confluence_page_id in confluence_page_ids:
-            attachment_cql = f"type=attachment and container='{confluence_page_id}'"
+            attachment_cql = f"type=attachment and container='{escape_cql_string(confluence_page_id)}'"
             attachment_cql += self.cql_label_filter
             # TODO: maybe should add time filter as well?
             for attachments in self.confluence_client.paginated_cql_page_retrieval(

diff --git a/backend/danswer/connectors/confluence/utils.py b/backend/danswer/connectors/confluence/utils.py
@@ -22,6 +22,26 @@
 _USER_EMAIL_CACHE: dict[str, str | None] = {}
 
 
+def escape_cql_string(value: str) -> str:
+    """Escape special characters in strings used in CQL queries.
+    Escapes backslashes, single and double quotes, square brackets, % and _.
+    """
+    # First escape backslashes
+    value = value.replace("\\", "\\\\")
+    # Then handle other special characters
+    escapes = {
+        "'": "''",  # Single quotes need to be doubled
+        '"': '\\"',  # Double quotes need escaping
+        "[": "\\[",  # Square brackets need escaping
+        "]": "\\]",
+        "%": "\\%",  # For LIKE operators
+        "_": "\\_",  # For LIKE operators
+    }
+    for char, escape_seq in escapes.items():
+        value = value.replace(char, escape_seq)
+    return value
+
+
 def get_user_email_from_username__server(
     confluence_client: OnyxConfluence, user_name: str
 ) -> str | None:
@@ -108,7 +128,9 @@ def extract_text_from_confluence_html(
             if not page_title:
                 continue
 
-            page_query = f"type=page and title='{page_title}'"
+            # Escape special characters in the title to prevent CQL syntax errors
+            escaped_page_title = escape_cql_string(page_title)
+            page_query = f"type=page and title='{escaped_page_title}'"
 
             page_contents: dict[str, Any] | None = None
             # Confluence enforces title uniqueness, so we should only get one result here