crewAIInc · sachinspanicker · Nov 27, 2024
diff --git a/crewai_tools/tools/pdf_table_extract_tool/README.md b/crewai_tools/tools/pdf_table_extract_tool/README.md
@@ -0,0 +1,68 @@
+# PDF Table Extract Tool
+
+Extracting tables with complex formatting from PDFs is difficult: the data is often not retrieved correctly, which makes it hard to query. This is a specialized
+tool for extracting tables from PDF documents and converting them to markdown format.
+
+## Features
+- Extracts tables from specified pages of PDF documents
+- Converts tables to markdown format for easy integration
+- Handles multiple tables per page
+- Supports large tables
+- Provides both synchronous and asynchronous interfaces
+
+## Installation
+
+```bash
+pip install crewai-tools
+```
+
+## Dependencies
+
+- PyMuPDF (fitz)
+- pandas
+- tabulate
+
+## Usage
+
+### With CrewAI
+
+```python
+from crewai import Agent
+from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool
+
+# Initialize the tool
+pdf_tool = PDFTableExtractTool()
+
+# Create an agent with the tool
+agent = Agent(
+    role='Data Analyst',
+    goal='Extract and analyze tables from PDF reports',
+    backstory="You are an expert at extracting and analyzing tabular data.",
+    tools=[pdf_tool]
+)
+```
+
+### Direct Usage
+
+```python
+from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool
+
+tool = PDFTableExtractTool()
+
+# Extract table from first page
+result = tool.run({"pdf_path": "document.pdf", "page_number": 1})
+print(result)
+```
+
+## Input Format
+The tool accepts the following arguments:
+
+- `pdf_path`: Path to the PDF file (required)
+- `page_number`: 1-based page number to extract the table from (optional, defaults to 1)
+
+## Output
+Returns a string containing the extracted table in markdown format.
+
+## Error Handling
+
+- Returns an error message if the PDF file is not found
+- Returns an error message if the page number is invalid
+- Returns an error message if no table is found on the specified page
+
+## Testing
+Run the test suite:
+
+```bash
+python -m pytest tests/tools/test_pdf_table_extract_tool.py -v
+```
diff --git a/crewai_tools/tools/pdf_table_extract_tool/__init__.py b/crewai_tools/tools/pdf_table_extract_tool/__init__.py
@@ -0,0 +1,4 @@
+"""PDF Table extraction tool for CrewAI."""
+from .tool import PDFTableExtractTool
+
+# Names re-exported as the public API of this package.
+__all__ = ["PDFTableExtractTool"]
diff --git a/crewai_tools/tools/pdf_table_extract_tool/tool.py b/crewai_tools/tools/pdf_table_extract_tool/tool.py
@@ -0,0 +1,66 @@
+from io import StringIO
+
+import fitz  # PyMuPDF
+import pandas as pd
+from langchain.tools import BaseTool
+
+
+class PDFTableExtractTool(BaseTool):
+    """Tool that extracts the text of one PDF page as a markdown table.
+
+    The page text is read with PyMuPDF. Every non-empty line becomes a row,
+    each row is split into cells on tabs or runs of two or more spaces, and
+    the first row is used as the table header.
+    """
+
+    name: str = "pdf_table_extract_tool"
+    description: str = (
+        "Extracts tables from a specific page of a PDF file and converts them to "
+        "markdown format. Useful when you need to extract tabular data from PDF "
+        "documents for analysis or presentation. Returns the table in markdown "
+        "format as a string."
+    )
+
+    def _run(
+        self, pdf_path: str, page_number: int = 1, table_number: int = 0
+    ) -> str:
+        """Extract a table from a PDF file and convert it to markdown format.
+
+        Args:
+            pdf_path: Path of the PDF file to read.
+            page_number: 1-based number of the page to extract from. Defaults
+                to the first page.
+            table_number: Accepted for interface compatibility but currently
+                unused; the whole page is treated as a single table.
+
+        Returns:
+            The table as a markdown string. A plain-text message is returned
+            instead when the page number is out of range, when the page
+            yields no rows, or when any exception is raised while reading
+            or converting the file.
+        """
+        try:
+            # The context manager closes the document on every exit path,
+            # including the early return for an invalid page number.
+            with fitz.open(pdf_path) as doc:
+                page_count = len(doc)
+                if page_number < 1 or page_number > page_count:
+                    return f"Invalid page number. PDF has {page_count} pages."
+
+                # Convert the 1-based page number to a 0-based index.
+                text = doc[page_number - 1].get_text("text")
+
+            # Simple table detection: one row per non-empty line of text.
+            table_data = []
+            for line in text.split("\n"):
+                # Treat a tab like a run of spaces so that both delimit cells.
+                pieces = line.replace("\t", "  ").split("  ")
+                cells = [piece.strip() for piece in pieces if piece.strip()]
+                if cells:  # Only add non-empty rows
+                    table_data.append(cells)
+
+            if not table_data:
+                return "No table found on the specified page"
+
+            # Pad short rows so every row has the same number of columns.
+            max_cols = max(len(row) for row in table_data)
+            table_data = [row + [""] * (max_cols - len(row)) for row in table_data]
+
+            # The first row supplies the column names.
+            df = pd.DataFrame(table_data[1:], columns=table_data[0])
+            return df.to_markdown(index=False)
+
+        except Exception as e:
+            # Tool boundary: report the failure as text instead of raising.
+            return f"Error extracting table: {str(e)}"
+
+    async def _arun(
+        self, pdf_path: str, page_number: int = 1, table_number: int = 0
+    ) -> str:
+        """Async version of the tool.
+
+        NOTE: this delegates directly to ``_run``, so the extraction itself
+        runs synchronously inside the coroutine.
+        """
+        return self._run(pdf_path, page_number, table_number)
diff --git a/tests/tools/test_pdf_table_extract_tool.py b/tests/tools/test_pdf_table_extract_tool.py
@@ -0,0 +1,65 @@
+import os
+
+import pytest
+
+from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool
+
+
+def test_pdf_table_extract_tool_initialization():
+    """A default-constructed tool exposes its name and a string description."""
+    extractor = PDFTableExtractTool()
+    expected_name = "pdf_table_extract_tool"
+    assert extractor.name == expected_name
+    assert isinstance(extractor.description, str)
+
+
+def test_pdf_table_extract_tool_invalid_file():
+    """A path that does not exist yields an error message, not an exception."""
+    missing_pdf = "nonexistent.pdf"
+    outcome = PDFTableExtractTool()._run(pdf_path=missing_pdf, page_number=1)
+    assert "Error" in outcome
+
+
+def test_pdf_table_extract_tool_with_real_pdf():
+    """Render a small table to a temporary PDF and extract it with the tool."""
+    import tempfile
+
+    # Create a test PDF with a table
+    import matplotlib.pyplot as plt
+    from matplotlib.backends.backend_pdf import PdfPages
+
+    # Sample data
+    data = [
+        ["Name", "Age", "Department", "Salary"],
+        ["John Doe", "30", "Engineering", "$80,000"],
+        ["Jane Smith", "25", "Marketing", "$70,000"],
+        ["Bob Johnson", "35", "Sales", "$90,000"],
+    ]
+
+    # Write the generated PDF to a temporary directory so the test does not
+    # depend on the current working directory or leave files in the repo.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pdf_path = os.path.join(tmp_dir, "sample_table.pdf")
+        with PdfPages(pdf_path) as pdf:
+            fig, ax = plt.subplots(figsize=(12, 4))
+            ax.axis("tight")
+            ax.axis("off")
+            table = ax.table(
+                cellText=data[1:], colLabels=data[0], cellLoc="center", loc="center"
+            )
+            table.auto_set_font_size(False)
+            table.set_fontsize(9)
+            table.scale(1.2, 1.5)
+            pdf.savefig(fig)
+            # Close this specific figure rather than the implicit current one.
+            plt.close(fig)
+
+        # Test the tool while the temporary file still exists
+        tool = PDFTableExtractTool()
+        result = tool._run(pdf_path=pdf_path, page_number=1)
+
+    print("\nExtracted table:")
+    print(result)
+
+    assert isinstance(result, str)
+    assert "John" in result or "Name" in result
+    assert "No table found" not in result
+
+
+@pytest.mark.asyncio
+async def test_pdf_table_extract_tool_async():
+    """The async entry point reports a missing file the same way ``_run`` does."""
+    tool = PDFTableExtractTool()
+    result = await tool._arun(pdf_path="nonexistent.pdf", page_number=1)
+    assert isinstance(result, str)
+    # Same expectation as the synchronous invalid-file test.
+    assert "Error" in result