diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 0cfecb0..1f4295f 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -35,22 +35,15 @@ jobs: python-version: ${{ matrix.python-version }} - name: Upgrade pip - run: | - python -m pip install --upgrade pip + run: python -m pip install --upgrade pip - name: Install requirements (Python 3) - run: | - pip install -r requirements/ci.txt - + run: pip install -r requirements/ci.txt -r requirements/dev.txt - name: Install pdfly - run: | - pip install . + run: pip install . - - name: Test with black - run: black --check . --exclude sample-files + - name: Run tests + run: pytest -vv - - name: Test with mypy - run: | - mypy . --ignore-missing-imports --exclude build codestyle: name: Check code style issues runs-on: ubuntu-20.04 @@ -65,19 +58,22 @@ jobs: with: path: '**/tests/pdf_cache/*' key: cache-downloaded-files + - name: Upgrade pip - run: | - python -m pip install --upgrade pip + run: python -m pip install --upgrade pip - name: Install requirements - run: | - pip install -r requirements/ci.txt + run: pip install -r requirements/ci.txt - name: Install pdfly - run: | - pip install . + run: pip install . + + - name: Lint with black + run: black --check --extend-exclude sample-files . + - name: Lint with mypy + run: mypy . --ignore-missing-imports --exclude build - name: Test with ruff run: | echo `ruff --version` - ruff . + ruff pdfly/ package: name: Build & verify package diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3371ea3..7309ae0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ # pre-commit run --all-files repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-ast - id: check-byte-order-marker @@ -28,12 +28,13 @@ repos: - id: blacken-docs additional_dependencies: [black==22.1.0] - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.290 + rev: v0.0.292 hooks: - id: ruff args: ['--fix'] + exclude: "tests/" - repo: https://github.com/asottile/pyupgrade - rev: v3.12.0 + rev: v3.15.0 hooks: - id: pyupgrade args: [--py36-plus] diff --git a/pdfly/cat.py b/pdfly/cat.py index 36cd071..925b5ec 100644 --- a/pdfly/cat.py +++ b/pdfly/cat.py @@ -43,40 +43,61 @@ import os +import sys import traceback from pathlib import Path -from sys import exit, stderr, stdout -from typing import List +from typing import List, Tuple -from pypdf import PdfMerger, parse_filename_page_ranges +from pypdf import PageRange, PdfMerger, parse_filename_page_ranges def main( filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool ) -> None: - fn_pgrgs_l = list(fn_pgrgs) - fn_pgrgs_l.insert(0, str(filename)) - filename_page_ranges = parse_filename_page_ranges(fn_pgrgs_l) # type: ignore + filename_page_ranges = parse_filepaths_and_pagerange_args( + filename, fn_pgrgs + ) if output: output_fh = open(output, "wb") else: - stdout.flush() - output_fh = os.fdopen(stdout.fileno(), "wb") + sys.stdout.flush() + output_fh = os.fdopen(sys.stdout.fileno(), "wb") merger = PdfMerger() in_fs = {} try: for filename, page_range in filename_page_ranges: # type: ignore if verbose: - print(filename, page_range, file=stderr) + print(filename, page_range, file=sys.stderr) if filename not in in_fs: in_fs[filename] = open(filename, "rb") merger.append(in_fs[filename], pages=page_range) + merger.write(output_fh) except Exception: - print(traceback.format_exc(), file=stderr) - print(f"Error while reading {filename}", file=stderr) - exit(1) - merger.write(output_fh) - output_fh.close() + print(traceback.format_exc(), file=sys.stderr) + print(f"Error while reading {filename}", file=sys.stderr) + sys.exit(1) + finally: + output_fh.close() # In 3.0, input files must stay open until output is written. # Not closing the in_fs because this script exits now. + + +def parse_filepaths_and_pagerange_args( + filename: Path, fn_pgrgs: List[str] +) -> List[Tuple[Path, PageRange]]: + fn_pgrgs_l = list(fn_pgrgs) + fn_pgrgs_l.insert(0, str(filename)) + filename_page_ranges, invalid_filepaths = [], [] + for filename, page_range in parse_filename_page_ranges(fn_pgrgs_l): # type: ignore + if Path(filename).is_file(): + filename_page_ranges.append((filename, page_range)) + else: + invalid_filepaths.append(str(filename)) + if invalid_filepaths: + print( + f"Invalid file path or page range provided: {' '.join(invalid_filepaths)}", + file=sys.stderr, + ) + sys.exit(2) + return filename_page_ranges diff --git a/pyproject.toml b/pyproject.toml index 5e231db..3ad8e41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ pdfly = "pdfly.cli:entry_point" [tool.pytest.ini_options] addopts = "--disable-socket --doctest-modules --cov=. --cov-report html:tests/reports/coverage-html --cov-report term-missing --ignore=docs/ --durations=3 --timeout=30" -filterwarnings = ["error"] doctest_encoding = "utf-8" testpaths = ["tests"] diff --git a/requirements/ci.txt b/requirements/ci.txt index 0f82e26..5d7662d 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.7 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ci.in @@ -8,67 +8,52 @@ astor==0.8.1 # via flake8-simplify attrs==23.1.0 # via flake8-bugbear -black==23.3.0 +black==23.9.1 # via -r requirements/ci.in click==8.1.7 # via black -flake8==5.0.4 +flake8==6.1.0 # via # -r requirements/ci.in # flake8-bugbear # flake8-comprehensions # flake8-isort # flake8-simplify -flake8-bugbear==23.3.12 +flake8-bugbear==23.9.16 # via -r requirements/ci.in -flake8-comprehensions==3.13.0 +flake8-comprehensions==3.14.0 # via -r requirements/ci.in -flake8-isort==6.0.0 +flake8-isort==6.1.0 # via -r requirements/ci.in -flake8-simplify==0.20.0 +flake8-simplify==0.21.0 # via -r requirements/ci.in -importlib-metadata==4.2.0 - # via - # attrs - # click - # flake8 - # flake8-comprehensions - # flake8-simplify -isort==5.11.5 +isort==5.12.0 # via flake8-isort mccabe==0.7.0 # via flake8 -mypy==1.4.1 +mypy==1.5.1 # via -r requirements/ci.in mypy-extensions==1.0.0 # via # black # mypy -packaging==23.1 +packaging==23.2 # via black pathspec==0.11.2 # via black -platformdirs==3.10.0 +platformdirs==3.11.0 # via black -pycodestyle==2.9.1 +pycodestyle==2.11.0 # via flake8 -pyflakes==2.5.0 +pyflakes==3.1.0 # via flake8 -ruff==0.0.287 +ruff==0.0.292 # via -r requirements/ci.in tomli==2.0.1 # via # black # mypy -typed-ast==1.5.5 - # via - # black - # mypy -typing-extensions==4.7.1 +typing-extensions==4.8.0 # via # black - # importlib-metadata # mypy - # platformdirs -zipp==3.15.0 - # via importlib-metadata diff --git a/requirements/dev.in b/requirements/dev.in index 6a2b12f..e1a2640 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -1,6 +1,9 @@ black pip-tools pre-commit +pytest pytest-cov +pytest-socket +pytest-timeout twine wheel diff --git a/requirements/dev.txt b/requirements/dev.txt index 604987d..f91da16 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,30 +1,28 @@ # -# This file is autogenerated by pip-compile with Python 3.7 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/dev.in # -black==23.3.0 +black==23.9.1 # via -r requirements/dev.in -bleach==6.0.0 - # via readme-renderer build==1.0.3 # via pip-tools certifi==2023.7.22 # via requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography -cfgv==3.3.1 +cfgv==3.4.0 # via pre-commit -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via requests click==8.1.7 # via # black # pip-tools -coverage[toml]==7.2.7 +coverage[toml]==7.3.2 # via pytest-cov -cryptography==41.0.3 +cryptography==41.0.4 # via secretstorage distlib==0.3.7 # via virtualenv @@ -32,62 +30,59 @@ docutils==0.20.1 # via readme-renderer exceptiongroup==1.1.3 # via pytest -filelock==3.12.2 +filelock==3.12.4 # via virtualenv -identify==2.5.24 +identify==2.5.30 # via pre-commit idna==3.4 # via requests -importlib-metadata==6.7.0 +importlib-metadata==6.8.0 # via # build - # click # keyring - # pluggy - # pre-commit - # pytest # twine - # virtualenv -importlib-resources==5.12.0 +importlib-resources==6.1.0 # via keyring iniconfig==2.0.0 # via pytest -jaraco-classes==3.2.3 +jaraco-classes==3.3.0 # via keyring jeepney==0.8.0 # via # keyring # secretstorage -keyring==24.1.1 +keyring==24.2.0 # via twine -markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -more-itertools==9.1.0 +more-itertools==10.1.0 # via jaraco-classes mypy-extensions==1.0.0 # via black +nh3==0.2.14 + # via readme-renderer nodeenv==1.8.0 # via pre-commit -packaging==23.1 +packaging==23.2 # via # black # build # pytest pathspec==0.11.2 # via black -pip-tools==6.14.0 +pip-tools==7.3.0 # via -r requirements/dev.in pkginfo==1.9.6 # via twine -platformdirs==3.10.0 +platformdirs==3.11.0 # via # black # virtualenv -pluggy==1.2.0 +pluggy==1.3.0 # via pytest -pre-commit==2.21.0 +pre-commit==3.4.0 # via -r requirements/dev.in pycparser==2.21 # via cffi @@ -98,12 +93,20 @@ pygments==2.16.1 pyproject-hooks==1.0.0 # via build pytest==7.4.2 - # via pytest-cov + # via + # -r requirements/dev.in + # pytest-cov + # pytest-socket + # pytest-timeout pytest-cov==4.1.0 # via -r requirements/dev.in +pytest-socket==0.6.0 + # via -r requirements/dev.in +pytest-timeout==2.2.0 + # via -r requirements/dev.in pyyaml==6.0.1 # via pre-commit -readme-renderer==37.3 +readme-renderer==42.0 # via twine requests==2.31.0 # via @@ -113,12 +116,10 @@ requests-toolbelt==1.0.0 # via twine rfc3986==2.0.0 # via twine -rich==13.5.2 +rich==13.6.0 # via twine secretstorage==3.3.3 # via keyring -six==1.16.0 - # via bleach tomli==2.0.1 # via # black @@ -129,28 +130,21 @@ tomli==2.0.1 # pytest twine==4.0.2 # via -r requirements/dev.in -typed-ast==1.5.5 - # via black -typing-extensions==4.7.1 +typing-extensions==4.8.0 # via # black - # importlib-metadata - # markdown-it-py - # platformdirs # rich -urllib3==2.0.4 +urllib3==2.0.6 # via # requests # twine virtualenv==20.24.5 # via pre-commit -webencodings==0.5.1 - # via bleach wheel==0.41.2 # via # -r requirements/dev.in # pip-tools -zipp==3.15.0 +zipp==3.17.0 # via # importlib-metadata # importlib-resources diff --git a/requirements/docs.txt b/requirements/docs.txt index 19c47cb..9dac485 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.7 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/docs.in @@ -8,11 +8,11 @@ alabaster==0.7.13 # via sphinx attrs==23.1.0 # via -r requirements/docs.in -babel==2.12.1 +babel==2.13.0 # via sphinx certifi==2023.7.22 # via requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via requests docutils==0.17.1 # via @@ -23,10 +23,8 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.7.0 - # via - # attrs - # sphinx +importlib-metadata==6.8.0 + # via sphinx jinja2==3.1.2 # via # myst-parser @@ -43,7 +41,7 @@ mdurl==0.1.2 # via markdown-it-py myst-parser==0.16.1 # via -r requirements/docs.in -packaging==23.1 +packaging==23.2 # via sphinx pygments==2.16.1 # via sphinx @@ -63,11 +61,11 @@ sphinx==4.5.0 # sphinxcontrib-jquery sphinx-rtd-theme==1.3.0 # via -r requirements/docs.in -sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-applehelp==1.0.4 # via sphinx sphinxcontrib-devhelp==1.0.2 # via sphinx -sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-htmlhelp==2.0.1 # via sphinx sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme @@ -77,11 +75,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -typing-extensions==4.7.1 - # via - # importlib-metadata - # markdown-it-py -urllib3==2.0.4 +urllib3==2.0.6 # via requests -zipp==3.15.0 +zipp==3.17.0 # via importlib-metadata diff --git a/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf b/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf new file mode 100644 index 0000000..608056d Binary files /dev/null and b/resources/GeoBase_NHNC1_Data_Model_UML_EN.pdf differ diff --git a/resources/baleines.jpg b/resources/baleines.jpg new file mode 100644 index 0000000..ab20b2a Binary files /dev/null and b/resources/baleines.jpg differ diff --git a/resources/box.pdf b/resources/box.pdf new file mode 100644 index 0000000..a390ea2 Binary files /dev/null and b/resources/box.pdf differ diff --git a/resources/jpeg.pdf b/resources/jpeg.pdf new file mode 100644 index 0000000..07a7fbb Binary files /dev/null and b/resources/jpeg.pdf differ diff --git a/resources/pythonknight.png b/resources/pythonknight.png new file mode 100644 index 0000000..faa31c0 Binary files /dev/null and b/resources/pythonknight.png differ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..60e9800 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,49 @@ +"""Utilities and fixtures that are available automatically for all tests.""" + +import io, os +from pathlib import Path + +from fpdf import FPDF +import pytest + +from pdfly.cli import entry_point + +try: + from contextlib import chdir # type: ignore +except ImportError: # Fallback when not available (< Python 3.11): + from contextlib import contextmanager + + @contextmanager + def chdir(dir_path): + """Non thread-safe context manager to change the current working directory.""" + cwd = Path.cwd() + os.chdir(dir_path) + try: + yield + finally: + os.chdir(cwd) + + +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCES_ROOT = PROJECT_ROOT / "resources" + + +def run_cli(args): + try: + entry_point(args) + except SystemExit as error: + return error.code + + +@pytest.fixture +def two_pages_pdf_filepath(tmp_path): + "A PDF with 2 pages, and a different image on each page" + pdf = FPDF() + pdf.add_page() + pdf.image(RESOURCES_ROOT / "baleines.jpg") + pdf.add_page() + pdf.image(RESOURCES_ROOT / "pythonknight.png") + pdf_filepath = tmp_path / "two_pages.pdf" + pdf.output(pdf_filepath) + return pdf_filepath diff --git a/tests/test_cat.py b/tests/test_cat.py new file mode 100644 index 0000000..ff01413 --- /dev/null +++ b/tests/test_cat.py @@ -0,0 +1,124 @@ +import pytest +from pypdf import PdfReader + +from .conftest import RESOURCES_ROOT, chdir, run_cli + + +def test_cat_incorrect_number_of_args(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli(["cat", str(RESOURCES_ROOT / "box.pdf")]) + assert exit_code == 2 + captured = capsys.readouterr() + assert "Missing argument" in captured.err + + +def test_cat_two_files_ok(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli( + [ + "cat", + str(RESOURCES_ROOT / "box.pdf"), + str(RESOURCES_ROOT / "jpeg.pdf"), + "--output", + "./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 0, captured + assert not captured.err + reader = PdfReader(tmp_path / "out.pdf") + assert len(reader.pages) == 2 + + +def test_cat_subset_ok(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli( + [ + "cat", + str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"), + "13:15", + "--output", + "./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 0, captured + assert not captured.err + reader = PdfReader(tmp_path / "out.pdf") + assert len(reader.pages) == 2 + + +@pytest.mark.parametrize( + "page_range", + ["a", "-", "1-", "1-1-1", "1:1:1:1"], +) +def test_cat_subset_invalid_args(capsys, tmp_path, page_range): + with chdir(tmp_path): + exit_code = run_cli( + [ + "cat", + str(RESOURCES_ROOT / "jpeg.pdf"), + page_range, + "--output", + "./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 2, captured + assert "Invalid file path or page range provided" in captured.err + + +@pytest.mark.skip(reason="This check is not implemented yet") +def test_cat_subset_warn_on_missing_pages(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli( + [ + "cat", + str(RESOURCES_ROOT / "jpeg.pdf"), + "2", + "--output", + "./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 0, captured + assert "WARN" in captured.out + + +@pytest.mark.xfail() # There is currently a bug there +def test_cat_subset_ensure_reduced_size(tmp_path, two_pages_pdf_filepath): + exit_code = run_cli( + [ + "cat", + str(two_pages_pdf_filepath), + "0", + "--output", + str(tmp_path / "page1.pdf"), + ] + ) + assert exit_code == 0 + # The extracted PDF should only contain ONE image: + embedded_images = extract_embedded_images(tmp_path / "page1.pdf") + assert len(embedded_images) == 1 + + exit_code = run_cli( + [ + "cat", + str(two_pages_pdf_filepath), + "1", + "--output", + str(tmp_path / "page2.pdf"), + ] + ) + assert exit_code == 0 + # The extracted PDF should only contain ONE image: + embedded_images = extract_embedded_images(tmp_path / "page2.pdf") + assert len(embedded_images) == 1 + + +def extract_embedded_images(pdf_filepath): + images = [] + reader = PdfReader(pdf_filepath) + for page in reader.pages: + images.extend(page.images) + return images diff --git a/tests/test_cli.py b/tests/test_cli.py index 30a1e96..b441453 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,35 +1,25 @@ -""" -Every CLI command is called here with a typer CliRunner. +import sys +from subprocess import check_output -Here should only be end-to-end tests. -""" +from pypdf import __version__ as pypdf_version -from pathlib import Path +from .conftest import run_cli -from typer.testing import CliRunner -from pdfly.cli import entry_point - -runner = CliRunner() - - -def test_x2pdf(tmp_path: Path) -> None: - # Arrange - output = tmp_path / "out.pdf" - assert not output.exists() - - # Act - result = runner.invoke( - entry_point, - [ - "x2pdf", - "sample-files/003-pdflatex-image/page-0-Im1.jpg", - "--output", - str(output), - ], +def test_pypdf_cli_can_be_invoked_as_a_module(): + stdout = check_output( + [sys.executable, "-m", "pdfly", "--help"] # noqa: S603 + ).decode() + assert "pdfly [OPTIONS] COMMAND [ARGS]..." in stdout + assert ( + "pdfly is a pure-python cli application for manipulating PDF files." + in stdout ) - # Assert - assert result.exit_code == 0, result.stdout - assert result.stdout == "" - assert output.exists() + +def test_pypdf_cli_version(capsys): + exit_code = run_cli(["--version"]) + captured = capsys.readouterr() + assert not captured.err + assert pypdf_version in captured.out + assert exit_code == 0 diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py new file mode 100644 index 0000000..6e8e0c0 --- /dev/null +++ b/tests/test_extract_images.py @@ -0,0 +1,25 @@ +import pytest + +from .conftest import RESOURCES_ROOT, chdir, run_cli + + +def test_extract_images_jpg_png(capsys, tmp_path): + with chdir(tmp_path): + run_cli( + [ + "extract-images", + str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"), + ] + ) + captured = capsys.readouterr() + assert not captured.err + assert "Extracted 3 images" in captured.out + + +@pytest.mark.xfail() # There is currently a bug there +def test_extract_images_monochrome(capsys, tmp_path): + with chdir(tmp_path): + run_cli(["extract-images", str(RESOURCES_ROOT / "box.pdf")]) + captured = capsys.readouterr() + assert not captured.err + assert "Image extracted" in captured.out diff --git a/tests/test_x2pdf.py b/tests/test_x2pdf.py new file mode 100644 index 0000000..aa74f37 --- /dev/null +++ b/tests/test_x2pdf.py @@ -0,0 +1,31 @@ +""" +Every CLI command is called here with a typer CliRunner. + +Here should only be end-to-end tests. +""" + +from pathlib import Path + +from .conftest import run_cli + + +def test_x2pdf(capsys, tmp_path: Path) -> None: + # Arrange + output = tmp_path / "out.pdf" + assert not output.exists() + + # Act + exit_code = run_cli( + [ + "x2pdf", + "sample-files/003-pdflatex-image/page-0-Im1.jpg", + "--output", + str(output), + ] + ) + + # Assert + captured = capsys.readouterr() + assert exit_code == 0, captured + assert captured.out == "" + assert output.exists()