Skip to content

Commit

Permalink
Notion Connector Improvements (#2789)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Oct 14, 2024
1 parent a9bcc89 commit ba712d4
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 35 deletions.
171 changes: 137 additions & 34 deletions backend/danswer/connectors/notion/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
_NOTION_CALL_TIMEOUT = 30 # 30 seconds


# TODO: Tables need to be ingested, Pages need to have their metadata ingested


@dataclass
class NotionPage:
"""Represents a Notion Page object"""
Expand All @@ -40,13 +43,26 @@ class NotionPage:
properties: dict[str, Any]
url: str

database_name: str | None # Only applicable to the database type page (wiki)

def __init__(self, **kwargs: dict[str, Any]) -> None:
names = set([f.name for f in fields(self)])
for k, v in kwargs.items():
if k in names:
setattr(self, k, v)


@dataclass
class NotionBlock:
"""Represents a Notion Block object"""

id: str # Used for the URL
text: str
# In a plaintext representation of the page, how this block should be joined
# with the existing text up to this point, separated out from text for clarity
prefix: str


@dataclass
class NotionSearchResponse:
"""Represents the response from the Notion Search API"""
Expand All @@ -62,7 +78,6 @@ def __init__(self, **kwargs: dict[str, Any]) -> None:
setattr(self, k, v)


# TODO - Add the ability to optionally limit to specific Notion databases
class NotionConnector(LoadConnector, PollConnector):
"""Notion Page connector that reads all Notion pages
this integration has been granted access to.
Expand Down Expand Up @@ -126,21 +141,47 @@ def _fetch_child_blocks(

@retry(tries=3, delay=1, backoff=2)
def _fetch_page(self, page_id: str) -> NotionPage:
"""Fetch a page from it's ID via the Notion API."""
"""Fetch a page from its ID via the Notion API, retry with database if page fetch fails."""
logger.debug(f"Fetching page for ID '{page_id}'")
block_url = f"https://api.notion.com/v1/pages/{page_id}"
page_url = f"https://api.notion.com/v1/pages/{page_id}"
res = rl_requests.get(
block_url,
page_url,
headers=self.headers,
timeout=_NOTION_CALL_TIMEOUT,
)
try:
res.raise_for_status()
except Exception as e:
logger.exception(f"Error fetching page - {res.json()}")
raise e
logger.warning(
f"Failed to fetch page, trying database for ID '{page_id}'. Exception: {e}"
)
# Try fetching as a database if page fetch fails, this happens if the page is set to a wiki
# it becomes a database from the notion perspective
return self._fetch_database_as_page(page_id)
return NotionPage(**res.json())

@retry(tries=3, delay=1, backoff=2)
def _fetch_database_as_page(self, database_id: str) -> NotionPage:
"""Attempt to fetch a database as a page."""
logger.debug(f"Fetching database for ID '{database_id}' as a page")
database_url = f"https://api.notion.com/v1/databases/{database_id}"
res = rl_requests.get(
database_url,
headers=self.headers,
timeout=_NOTION_CALL_TIMEOUT,
)
try:
res.raise_for_status()
except Exception as e:
logger.exception(f"Error fetching database as page - {res.json()}")
raise e
database_name = res.json().get("title")
database_name = (
database_name[0].get("text", {}).get("content") if database_name else None
)

return NotionPage(**res.json(), database_name=database_name)

@retry(tries=3, delay=1, backoff=2)
def _fetch_database(
self, database_id: str, cursor: str | None = None
Expand Down Expand Up @@ -171,8 +212,51 @@ def _fetch_database(
raise e
return res.json()

def _read_pages_from_database(self, database_id: str) -> list[str]:
"""Returns a list of all page IDs in the database"""
@staticmethod
def _properties_to_str(properties: dict[str, Any]) -> str:
"""Converts Notion properties to a string"""

def _recurse_properties(inner_dict: dict[str, Any]) -> str:
while "type" in inner_dict:
type_name = inner_dict["type"]
inner_dict = inner_dict[type_name]
if isinstance(inner_dict, list):
return ", ".join([_recurse_properties(item) for item in inner_dict])
# TODO there may be more types to handle here
if "name" in inner_dict:
return inner_dict["name"]
if "content" in inner_dict:
return inner_dict["content"]
start = inner_dict.get("start")
end = inner_dict.get("end")
if start is not None:
if end is not None:
return f"{start} - {end}"
return start
elif end is not None:
return f"Until {end}"

if "id" in inner_dict:
logger.debug("Skipping Notion Id field")
return "Unreadable Property"

logger.debug(f"Unreadable property from innermost prop: {inner_dict}")
return "Unreadable Property"

result = ""
for prop_name, prop in properties.items():
inner_value = _recurse_properties(prop)
# Not a perfect way to format Notion database tables but there's no perfect representation
# since this must be represented as plaintext
result += f"{prop_name}: {inner_value}\t"

return result

def _read_pages_from_database(
self, database_id: str
) -> tuple[list[NotionBlock], list[str]]:
"""Returns a list of top level blocks and all page IDs in the database"""
result_blocks: list[NotionBlock] = []
result_pages: list[str] = []
cursor = None
while True:
Expand All @@ -181,37 +265,41 @@ def _read_pages_from_database(self, database_id: str) -> list[str]:
for result in data["results"]:
obj_id = result["id"]
obj_type = result["object"]
text = self._properties_to_str(result.get("properties", {}))
if text:
result_blocks.append(NotionBlock(id=obj_id, text=text, prefix="\n"))
if obj_type == "page":
logger.debug(
f"Found page with ID '{obj_id}' in database '{database_id}'"
)
result_pages.append(result["id"])
elif obj_type == "database":
# TODO add block for database
logger.debug(
f"Found database with ID '{obj_id}' in database '{database_id}'"
)
result_pages.extend(self._read_pages_from_database(obj_id))
# The inner contents are ignored at this level
_, child_pages = self._read_pages_from_database(obj_id)
result_pages.extend(child_pages)

if data["next_cursor"] is None:
break

cursor = data["next_cursor"]

return result_pages
return result_blocks, result_pages

def _read_blocks(
self, base_block_id: str
) -> tuple[list[tuple[str, str]], list[str]]:
"""Reads all child blocks for the specified block"""
result_lines: list[tuple[str, str]] = []
def _read_blocks(self, base_block_id: str) -> tuple[list[NotionBlock], list[str]]:
"""Reads all child blocks for the specified block, returns a list of blocks and child page ids"""
result_blocks: list[NotionBlock] = []
child_pages: list[str] = []
cursor = None
while True:
data = self._fetch_child_blocks(base_block_id, cursor)

# this happens when a block is not shared with the integration
if data is None:
return result_lines, child_pages
return result_blocks, child_pages

for result in data["results"]:
logger.debug(
Expand Down Expand Up @@ -255,39 +343,49 @@ def _read_blocks(

if result["has_children"]:
if result_type == "child_page":
# Child pages will not be included at this top level, it will be a separate document
child_pages.append(result_block_id)
else:
logger.debug(f"Entering sub-block: {result_block_id}")
subblock_result_lines, subblock_child_pages = self._read_blocks(
subblocks, subblock_child_pages = self._read_blocks(
result_block_id
)
logger.debug(f"Finished sub-block: {result_block_id}")
result_lines.extend(subblock_result_lines)
result_blocks.extend(subblocks)
child_pages.extend(subblock_child_pages)

if result_type == "child_database" and self.recursive_index_enabled:
child_pages.extend(self._read_pages_from_database(result_block_id))

cur_result_text = "\n".join(cur_result_text_arr)
if cur_result_text:
result_lines.append((cur_result_text, result_block_id))
inner_blocks, inner_child_pages = self._read_pages_from_database(
result_block_id
)
result_blocks.extend(inner_blocks)
child_pages.extend(inner_child_pages)

if cur_result_text_arr:
new_block = NotionBlock(
id=result_block_id,
text="\n".join(cur_result_text_arr),
prefix="\n",
)
result_blocks.append(new_block)

if data["next_cursor"] is None:
break

cursor = data["next_cursor"]

return result_lines, child_pages
return result_blocks, child_pages

def _read_page_title(self, page: NotionPage) -> str:
def _read_page_title(self, page: NotionPage) -> str | None:
"""Extracts the title from a Notion page"""
page_title = None
if hasattr(page, "database_name") and page.database_name:
return page.database_name
for _, prop in page.properties.items():
if prop["type"] == "title" and len(prop["title"]) > 0:
page_title = " ".join([t["plain_text"] for t in prop["title"]]).strip()
break
if page_title is None:
page_title = f"Untitled Page [{page.id}]"

return page_title

def _read_pages(
Expand All @@ -304,18 +402,23 @@ def _read_pages(
logger.info(f"Reading page with ID '{page.id}', with url {page.url}")
page_blocks, child_page_ids = self._read_blocks(page.id)
all_child_page_ids.extend(child_page_ids)
page_title = self._read_page_title(page)

if not page_blocks:
continue

page_title = (
self._read_page_title(page) or f"Untitled Page with ID {page.id}"
)

yield (
Document(
id=page.id,
# Will add title to the first section later in processing
sections=[Section(link=page.url, text="")]
+ [
sections=[
Section(
link=f"{page.url}#{block_id.replace('-', '')}",
text=block_text,
link=f"{page.url}#{block.id.replace('-', '')}",
text=block.prefix + block.text,
)
for block_text, block_id in page_blocks
for block in page_blocks
],
source=DocumentSource.NOTION,
semantic_identifier=page_title,
Expand Down
6 changes: 5 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ explicit_package_bases = true
disallow_untyped_defs = true

[[tool.mypy.overrides]]
module = "alembic*.versions.*"
module = "alembic.versions.*"
disable_error_code = ["var-annotated"]

[[tool.mypy.overrides]]
module = "alembic_tenants.versions.*"
disable_error_code = ["var-annotated"]

[tool.ruff]
Expand Down
1 change: 1 addition & 0 deletions backend/requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ types-beautifulsoup4==4.12.0.3
types-html5lib==1.1.11.13
types-oauthlib==3.2.0.9
types-setuptools==68.0.0.3
types-Pillow==10.2.0.20240822
types-passlib==1.7.7.20240106
types-psutil==5.9.5.17
types-psycopg2==2.9.21.10
Expand Down

0 comments on commit ba712d4

Please sign in to comment.