camelot-dev · foarsitter · Apr 29, 2022 · Feb 25, 2024 · Feb 28, 2024 · Feb 28, 2024
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - main
-      - master
 
 jobs:
   labeler:

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - main
-      - master
 
 jobs:
   release:
@@ -23,12 +22,12 @@ jobs:
 
       - name: Upgrade pip
         run: |
-          pip install --constraint=.github/workflows/constraints.txt pip
+          pip install --constraint=${PWD}/.github/workflows/constraints.txt pip
           pip --version
 
       - name: Install Poetry
         run: |
-          pip install --constraint=.github/workflows/constraints.txt poetry
+          pip install --constraint=${PWD}/.github/workflows/constraints.txt poetry
           poetry --version
 
       - name: Check if there is a parent commit
@@ -60,15 +59,7 @@ jobs:
         uses: pypa/gh-action-pypi-publish@v1.8.10
         with:
           user: __token__
-          password: ${{ secrets.PYPI_TOKEN }}
-
-      - name: Publish package on TestPyPI
-        if: "! steps.check-version.outputs.tag"
-        uses: pypa/gh-action-pypi-publish@v1.8.10
-        with:
-          user: __token__
-          password: ${{ secrets.TEST_PYPI_TOKEN }}
-          repository_url: https://test.pypi.org/legacy/
+          password: ${{ secrets.FLIT_PASSWORD }}
 
       - name: Publish the release notes
         uses: release-drafter/release-drafter@v5.24.0

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -1,8 +1,9 @@
 name: Tests
 
 on:
-  - push
-  - pull_request
+  push:
+    branches: [master]
+  pull_request:
 
 jobs:
   tests:
@@ -45,7 +46,7 @@ jobs:
 
       - name: Upgrade pip
         run: |
-          pip install --constraint=.github/workflows/constraints.txt pip
+          pip install --constraint=${PWD}/.github/workflows/constraints.txt pip
           pip --version
 
       - name: Upgrade pip in virtual environments
@@ -59,13 +60,13 @@ jobs:
 
       - name: Install Poetry
         run: |
-          pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry
+          pipx install --pip-args=--constraint=${PWD}/.github/workflows/constraints.txt poetry
           poetry --version
 
       - name: Install Nox
         run: |
-          pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox
-          pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry
+          pipx install --pip-args=--constraint=${PWD}/.github/workflows/constraints.txt nox
+          pipx inject --pip-args=--constraint=${PWD}/.github/workflows/constraints.txt nox nox-poetry
           nox --version
 
       - name: Install ghostscript
@@ -130,18 +131,18 @@ jobs:
 
       - name: Upgrade pip
         run: |
-          pip install --constraint=.github/workflows/constraints.txt pip
+          pip install --constraint=${PWD}/.github/workflows/constraints.txt pip
           pip --version
 
       - name: Install Poetry
         run: |
-          pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry
+          pipx install --pip-args=--constraint=${PWD}/.github/workflows/constraints.txt poetry
           poetry --version
 
       - name: Install Nox
         run: |
-          pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox
-          pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry
+          pipx install --pip-args=--constraint=${PWD}/.github/workflows/constraints.txt nox
+          pipx inject --pip-args=--constraint=${PWD}/.github/workflows/constraints.txt nox nox-poetry
           nox --version
 
       - name: Download coverage data

diff --git a/camelot/backends/poppler_backend.py b/camelot/backends/poppler_backend.py
@@ -1,10 +1,13 @@
+import os
+import sys
 import shutil
 import subprocess
 
+path = os.path.dirname(sys.executable) + os.pathsep + os.environ['PATH']
 
 class PopplerBackend:
     def convert(self, pdf_path, png_path):
-        pdftopng_executable = shutil.which("pdftopng")
+        pdftopng_executable = shutil.which("pdftopng", path=path)
         if pdftopng_executable is None:
             raise OSError(
                 "pdftopng is not installed. You can install it using the 'pip install pdftopng' command."

diff --git a/camelot/cli.py b/camelot/cli.py
@@ -39,6 +39,12 @@ def set_config(self, key, value):
     default="1",
     help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
 )
+@click.option(
+    "--parallel",
+    is_flag=True,
+    default=False,
+    help="Read pdf pages in parallel using all CPU cores.",
+)
 @click.option("-pw", "--password", help="Password for decryption.")
 @click.option("-o", "--output", help="Output file path.")
 @click.option(

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -1,3 +1,4 @@
+import multiprocessing as mp
 import os
 import sys
 from pathlib import Path
@@ -143,7 +144,12 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
             instream.close()
 
     def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
+        self,
+        flavor="lattice",
+        suppress_stdout=False,
+        parallel=False,
+        layout_kwargs=None,
+        **kwargs
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -153,8 +159,10 @@ def parse(
         flavor : str (default: 'lattice')
             The parsing method to use ('lattice' or 'stream').
             Lattice is used by default.
-        suppress_stdout : str (default: False)
+        suppress_stdout : bool (default: False)
             Suppress logs and warnings.
+        parallel : bool (default: False)
+            Process pages in parallel using all available cpu cores.
         layout_kwargs : dict, optional (default: {})
             A dict of `pdfminer.layout.LAParams
             <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -171,14 +179,56 @@ def parse(
             layout_kwargs = {}
 
         tables = []
+        parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
         with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
-            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
-                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
-                )
-                tables.extend(t)
+            cpu_count = mp.cpu_count()
+            # Use multiprocessing only when cpu_count > 1 to avoid stalling
+            # when only a single CPU is available
+            if parallel and len(self.pages) > 1 and cpu_count > 1:
+                with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
+                    jobs = []
+                    for p in self.pages:
+                        j = pool.apply_async(
+                            self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                        )
+                        jobs.append(j)
+
+                    for j in jobs:
+                        t = j.get()
+                        tables.extend(t)
+            else:
+                for p in self.pages:
+                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                    tables.extend(t)
+
         return TableList(sorted(tables))
+
+    def _parse_page(
+        self, page, tempdir, parser, suppress_stdout, layout_kwargs
+    ):
+        """Extracts tables by calling parser.get_tables on a single
+        page PDF.
+
+        Parameters
+        ----------
+        page : str
+            Page number to parse
+        parser : Lattice or Stream
+            The parser to use (Lattice or Stream).
+        suppress_stdout : bool
+            Suppress logs and warnings.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+
+        Returns
+        -------
+        tables : camelot.core.TableList
+            List of tables found in PDF.
+
+        """
+        self._save_page(self.filepath, page, tempdir)
+        page_path = os.path.join(tempdir, f"page-{page}.pdf")
+        tables = parser.extract_tables(
+            page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+        )
+        return tables
diff --git a/camelot/io.py b/camelot/io.py
@@ -15,6 +15,7 @@ def read_pdf(
     password=None,
     flavor="lattice",
     suppress_stdout=False,
+    parallel=False,
     layout_kwargs=None,
     **kwargs
 ):
@@ -37,6 +38,8 @@ def read_pdf(
         Lattice is used by default.
     suppress_stdout : bool, optional (default: True)
         Print all logs and warnings.
+    parallel : bool, optional (default: False)
+        Process pages in parallel using all available cpu cores.
     layout_kwargs : dict, optional (default: {})
         A dict of `pdfminer.layout.LAParams
         <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -122,6 +125,7 @@ def read_pdf(
         tables = p.parse(
             flavor=flavor,
             suppress_stdout=suppress_stdout,
+            parallel=parallel,
             layout_kwargs=layout_kwargs,
             **kwargs
         )

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -888,12 +888,14 @@ def get_page_layout(
         rsrcmgr = PDFResourceManager()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
-        for page in PDFPage.create_pages(document):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            width = layout.bbox[2]
-            height = layout.bbox[3]
-            dim = (width, height)
+        page = next(PDFPage.create_pages(document), None)
+        if page is None:
+            raise PDFTextExtractionNotAllowed
+        interpreter.process_page(page)
+        layout = device.get_result()
+        width = layout.bbox[2]
+        height = layout.bbox[3]
+        dim = (width, height)
         return layout, dim
 
 

diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst
@@ -99,6 +99,26 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp
 
 The ``pages`` keyword argument accepts pages as comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``.
 
+Extract tables in parallel
+--------------------------
+
+Camelot supports extracting tables in parallel using all the available CPU cores.
+
+::
+
+    >>> tables = camelot.read_pdf('foo.pdf', pages='all', parallel=True)
+    >>> tables
+    <TableList n=1>
+
+.. tip::
+    Here's how you can do the same with the :ref:`command-line interface <cli>`.
+    ::
+
+        $ camelot --pages all --parallel lattice foo.pdf
+
+.. note:: Reading of the PDF document is parallelized by distributing its pages across the available CPU cores.
+    Therefore, a document with a low page count could be slower to process in parallel.
+
 Reading encrypted PDFs
 ----------------------
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,7 +4,6 @@ on: @@
       push:
         branches:
           - main
-          - master
     jobs:
       labeler:
@@ Expand Down @@