Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add PDF Table Extract Tool #127

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions crewai_tools/tools/pdf_table_extract_tool/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# PDF Table Extract Tool

Extracting tables with complex formatting from PDFs is difficult: the data is often not retrieved properly, which makes it hard to query. This is a specialized
tool for extracting tables from PDF documents and converting them to markdown format.

## Features
- Extracts tables from specified pages of PDF documents
- Converts tables to markdown format for easy integration
- Handles multiple tables per page
- Supports large tables
- Provides both synchronous and asynchronous interfaces

## Installation

```bash
pip install crewai-tools
```

## Dependencies

- PyMuPDF (fitz)
- pandas
- tabulate

## Usage

### With CrewAI

```python
from crewai import Agent
from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool

# Initialize the tool
pdf_tool = PDFTableExtractTool()

# Create an agent with the tool
agent = Agent(
    role='Data Analyst',
    goal='Extract and analyze tables from PDF reports',
    backstory="You are an expert at extracting and analyzing tabular data.",
    tools=[pdf_tool]
)
```

### Direct Usage

```python
from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool

tool = PDFTableExtractTool()

# Extract table from first page
result = tool.run({"pdf_path": "document.pdf", "page_number": 1})
print(result)
```

## Input Format

The tool takes its input as named arguments:

- `pdf_path`: Path to the PDF file (required)
- `page_number`: Page number to extract table from, counted from 1 (optional, defaults to 1)

## Output

Returns a string containing the extracted table in markdown format.

## Error Handling

- Returns error message if PDF file not found
- Returns error message if page number is invalid
- Returns error message if no table is found on the specified page

## Testing

Run the test suite:

```bash
python -m pytest tests/tools/test_pdf_table_extract_tool.py -v
```
4 changes: 4 additions & 0 deletions crewai_tools/tools/pdf_table_extract_tool/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""PDF Table extraction tool for CrewAI."""
# Re-export the tool class so it can be imported straight from this package.
from .tool import PDFTableExtractTool

# Limit the package's star-import surface to the tool class.
__all__ = ["PDFTableExtractTool"]
66 changes: 66 additions & 0 deletions crewai_tools/tools/pdf_table_extract_tool/tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from io import StringIO

import fitz # PyMuPDF
import pandas as pd
from langchain.tools import BaseTool


class PDFTableExtractTool(BaseTool):
    """Tool that turns the text of one PDF page into a markdown table.

    The page's plain text is read with PyMuPDF, every non-empty line becomes
    a row, and each row is split into cells on tabs or runs of two or more
    whitespace characters. The first row is used as the table header.
    """

    name: str = "pdf_table_extract_tool"
    description: str = (
        "Extracts tables from a specific page of a PDF file and converts them to "
        "markdown format. Useful when you need to extract tabular data from PDF "
        "documents for analysis or presentation. Returns the table in markdown "
        "format as a string."
    )

    def _run(self, pdf_path: str, page_number: int = 1, table_number: int = 0) -> str:
        """Extract a table from a PDF page and return it as markdown.

        Args:
            pdf_path: Path of the PDF file to open.
            page_number: 1-based page to read. Defaults to the first page.
            table_number: Accepted for interface compatibility; currently
                unused, the whole page is treated as a single table.

        Returns:
            The markdown table, or a human-readable message when the page
            number is out of range, no rows were found, or extraction failed.
            Failures are reported as text rather than raised so that the
            caller always receives a string.
        """
        import re  # only needed here, for splitting a line into cells

        try:
            # The context manager closes the document on every exit path,
            # including the early return for an invalid page number.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                if page_number < 1 or page_number > page_count:
                    return f"Invalid page number. PDF has {page_count} pages."

                # Pages are 0-indexed in PyMuPDF.
                text = doc[page_number - 1].get_text("text")

            # Simple table detection: one row per non-empty line, cells
            # separated by a tab or by two or more whitespace characters.
            # A single space stays inside a cell so values such as
            # "John Doe" are not torn apart.
            table_data = []
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                cells = [
                    cell.strip()
                    for cell in re.split(r"\t|\s{2,}", line)
                    if cell.strip()
                ]
                if cells:
                    table_data.append(cells)

            if not table_data:
                return "No table found on the specified page"

            # Pad short rows so every row has the same number of columns.
            max_cols = max(len(row) for row in table_data)
            table_data = [row + [""] * (max_cols - len(row)) for row in table_data]

            # First row is the header; the rest are data rows.
            df = pd.DataFrame(table_data[1:], columns=table_data[0])

            # With no buffer given, to_markdown returns the rendered string.
            return df.to_markdown(index=False)

        except Exception as e:
            # Deliberate catch-all: report any failure as text to the caller.
            return f"Error extracting table: {str(e)}"

    async def _arun(
        self, pdf_path: str, page_number: int = 1, table_number: int = 0
    ) -> str:
        """Async entry point; delegates to the synchronous ``_run``.

        The work is not offloaded: ``_run`` executes inline in the calling
        coroutine and its result is returned unchanged.
        """
        return self._run(pdf_path, page_number, table_number)
65 changes: 65 additions & 0 deletions tests/tools/test_pdf_table_extract_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os

import pytest

from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool


def test_pdf_table_extract_tool_initialization():
    """A freshly built tool exposes its registered name and a text description."""
    extractor = PDFTableExtractTool()
    expected_name = "pdf_table_extract_tool"
    assert extractor.name == expected_name
    assert isinstance(extractor.description, str)


def test_pdf_table_extract_tool_invalid_file():
    """Pointing the tool at a missing file yields an error message, not an exception."""
    missing_pdf = "nonexistent.pdf"
    outcome = PDFTableExtractTool()._run(pdf_path=missing_pdf, page_number=1)
    assert "Error" in outcome


def test_pdf_table_extract_tool_with_real_pdf():
    """Render a small table into a throwaway PDF and check the tool reads it back.

    The PDF is generated inside a temporary directory so the test does not
    depend on the current working directory and leaves no generated file
    behind in the repository.
    """
    import tempfile

    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages

    # Sample data: first row is the header.
    data = [
        ["Name", "Age", "Department", "Salary"],
        ["John Doe", "30", "Engineering", "$80,000"],
        ["Jane Smith", "25", "Marketing", "$70,000"],
        ["Bob Johnson", "35", "Sales", "$90,000"],
    ]

    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "sample_table.pdf")

        # Draw the table on an otherwise empty figure and save it as page 1.
        with PdfPages(pdf_path) as pdf:
            fig, ax = plt.subplots(figsize=(12, 4))
            ax.axis("tight")
            ax.axis("off")
            table = ax.table(
                cellText=data[1:], colLabels=data[0], cellLoc="center", loc="center"
            )
            table.auto_set_font_size(False)
            table.set_fontsize(9)
            table.scale(1.2, 1.5)
            pdf.savefig(fig)
            # Close this specific figure rather than whichever is current.
            plt.close(fig)

        # Run the tool while the temporary PDF still exists.
        tool = PDFTableExtractTool()
        result = tool._run(pdf_path=pdf_path, page_number=1)

    print("\nExtracted table:")
    print(result)

    assert isinstance(result, str)
    assert "John" in result or "Name" in result
    assert "No table found" not in result


@pytest.mark.asyncio
async def test_pdf_table_extract_tool_async():
    """The async entry point yields a string even when the PDF does not exist."""
    extractor = PDFTableExtractTool()
    outcome = await extractor._arun(pdf_path="nonexistent.pdf", page_number=1)
    assert isinstance(outcome, str)