From 776e04cbf39ceb5ada989cc9bbfbeb56cb5efc54 Mon Sep 17 00:00:00 2001 From: sachinspanicker Date: Wed, 27 Nov 2024 16:43:30 +0530 Subject: [PATCH] feat: Add PDF Table Extract Tool - Add PDFTableExtractTool for extracting tables from PDFs - Convert extracted tables to markdown format - Add comprehensive test suite - Add documentation and usage examples - Handle edge cases and error conditions - Support both sync and async operations --- .../tools/pdf_table_extract_tool/README.md | 68 +++++++++++++++++++ .../tools/pdf_table_extract_tool/__init__.py | 4 ++ .../tools/pdf_table_extract_tool/tool.py | 66 ++++++++++++++++++ tests/tools/test_pdf_table_extract_tool.py | 65 ++++++++++++++++++ 4 files changed, 203 insertions(+) create mode 100644 crewai_tools/tools/pdf_table_extract_tool/README.md create mode 100644 crewai_tools/tools/pdf_table_extract_tool/__init__.py create mode 100644 crewai_tools/tools/pdf_table_extract_tool/tool.py create mode 100644 tests/tools/test_pdf_table_extract_tool.py diff --git a/crewai_tools/tools/pdf_table_extract_tool/README.md b/crewai_tools/tools/pdf_table_extract_tool/README.md new file mode 100644 index 0000000..8bc46f7 --- /dev/null +++ b/crewai_tools/tools/pdf_table_extract_tool/README.md @@ -0,0 +1,68 @@ +# PDF Table Extract Tool + +Extracting complex formatted tables from PDFs is a very difficult task and it has been observed that data is not retrieved properly and thereby it is difficult to query on that. Here's a specialized +tool for extracting tables from PDF documents and converting them to markdown format. + +## Features +- Extracts tables from specified pages of PDF documents +- Converts tables to markdown format for easy integration +- Handles multiple tables per page +- Supports large tables +- Provides both synchronous and asynchronous interfaces + +## Installation + +```bash +pip install crewai-tools + +Dependencies + +PyMuPDF (fitz) +pandas +tabulate + +Usage +With CrewAI + +from crewai import Agent +from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool + +# Initialize the tool +pdf_tool = PDFTableExtractTool() + +# Create an agent with the tool +agent = Agent( + role='Data Analyst', + goal='Extract and analyze tables from PDF reports', + backstory="You are an expert at extracting and analyzing tabular data.", + tools=[pdf_tool] +) + +Direct Usage + +from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool + +tool = PDFTableExtractTool() + +# Extract table from first page +result = tool.run("document.pdf, 1") +print(result) + +Input Format +The tool accepts input in the format: "pdf_path, page_number" + +pdf_path: Path to the PDF file (required) +page_number: Page number to extract table from (optional, defaults to 1) + +Output +Returns a string containing the extracted table in markdown format. +Error Handling + +Returns error message if PDF file not found +Returns error message if page number is invalid +Returns error message if no table is found on the specified page + +Testing +Run the test suite: + +python -m pytest tests/tools/test_pdf_table_extract_tool.py -v diff --git a/crewai_tools/tools/pdf_table_extract_tool/__init__.py b/crewai_tools/tools/pdf_table_extract_tool/__init__.py new file mode 100644 index 0000000..51d086b --- /dev/null +++ b/crewai_tools/tools/pdf_table_extract_tool/__init__.py @@ -0,0 +1,4 @@ +"""PDF Table extraction tool for CrewAI.""" +from .tool import PDFTableExtractTool + +__all__ = ["PDFTableExtractTool"] diff --git a/crewai_tools/tools/pdf_table_extract_tool/tool.py b/crewai_tools/tools/pdf_table_extract_tool/tool.py new file mode 100644 index 0000000..c48259c --- /dev/null +++ b/crewai_tools/tools/pdf_table_extract_tool/tool.py @@ -0,0 +1,66 @@ +from io import StringIO + +import fitz # PyMuPDF +import pandas as pd +from langchain.tools import BaseTool + + +class PDFTableExtractTool(BaseTool): + """Tool for extracting table from PDF files and convert to markdown format""" + + name: str = "pdf_table_extract_tool" + description: str = ( + "Extracts tables from a specific page of a PDF file and converts them to " + "markdown format. Useful when you need to extract tabular data from PDF " + "documents for analysis or presentation. Returns the table in markdown " + "format as a string." + ) + + def _run(self, pdf_path: str, page_number: int, table_number: int = 0) -> str: + """Extract a table from a PDF file and convert it to markdown format""" + try: + # Open the PDF + doc = fitz.open(pdf_path) + + # Check page number + if page_number < 1 or page_number > len(doc): + return f"Invalid page number. PDF has {len(doc)} pages." + + # Get the specified page (convert to 0-based index) + page = doc[page_number - 1] + + # Extract text and split into lines + text = page.get_text("text") + lines = [line.strip() for line in text.split("\n") if line.strip()] + + # Simple table detection - look for consistent delimiters or spacing + table_data = [] + for line in lines: + # Split by multiple spaces or tabs + cells = [cell.strip() for cell in line.split(" ") if cell.strip()] + if cells: # Only add non-empty rows + table_data.append(cells) + + if not table_data: + return "No table found on the specified page" + + # Ensure all rows have the same number of columns + max_cols = max(len(row) for row in table_data) + table_data = [row + [""] * (max_cols - len(row)) for row in table_data] + + # Convert to DataFrame + df = pd.DataFrame(table_data[1:], columns=table_data[0]) + + # Convert to markdown + buffer = StringIO() + df.to_markdown(buf=buffer, index=False) + return buffer.getvalue() + + except Exception as e: + return f"Error extracting table: {str(e)}" + + async def _arun( + self, pdf_path: str, page_number: int, table_number: int = 0 + ) -> str: + """Async version of the tool""" + return self._run(pdf_path, page_number, table_number) diff --git a/tests/tools/test_pdf_table_extract_tool.py b/tests/tools/test_pdf_table_extract_tool.py new file mode 100644 index 0000000..77af863 --- /dev/null +++ b/tests/tools/test_pdf_table_extract_tool.py @@ -0,0 +1,65 @@ +import os + +import pytest + +from crewai_tools.tools.pdf_table_extract_tool import PDFTableExtractTool + + +def test_pdf_table_extract_tool_initialization(): + tool = PDFTableExtractTool() + assert tool.name == "pdf_table_extract_tool" + assert isinstance(tool.description, str) + + +def test_pdf_table_extract_tool_invalid_file(): + tool = PDFTableExtractTool() + result = tool._run(pdf_path="nonexistent.pdf", page_number=1) + assert "Error" in result + + +def test_pdf_table_extract_tool_with_real_pdf(): + # Ensure the fixtures directory exists + os.makedirs("tests/fixtures", exist_ok=True) + + # Create a test PDF with a table + import matplotlib.pyplot as plt + from matplotlib.backends.backend_pdf import PdfPages + + # Sample data + data = [ + ["Name", "Age", "Department", "Salary"], + ["John Doe", "30", "Engineering", "$80,000"], + ["Jane Smith", "25", "Marketing", "$70,000"], + ["Bob Johnson", "35", "Sales", "$90,000"], + ] + + pdf_path = "tests/fixtures/sample_table.pdf" + with PdfPages(pdf_path) as pdf: + fig, ax = plt.subplots(figsize=(12, 4)) + ax.axis("tight") + ax.axis("off") + table = ax.table( + cellText=data[1:], colLabels=data[0], cellLoc="center", loc="center" + ) + table.auto_set_font_size(False) + table.set_fontsize(9) + table.scale(1.2, 1.5) + pdf.savefig(fig) + plt.close() + + # Test the tool + tool = PDFTableExtractTool() + result = tool._run(pdf_path=pdf_path, page_number=1) + print("\nExtracted table:") + print(result) + + assert isinstance(result, str) + assert "John" in result or "Name" in result + assert "No table found" not in result + + +@pytest.mark.asyncio +async def test_pdf_table_extract_tool_async(): + tool = PDFTableExtractTool() + result = await tool._arun(pdf_path="nonexistent.pdf", page_number=1) + assert isinstance(result, str)