diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index c98fcc7..6175bba 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -36,6 +36,9 @@ jobs: - dockerfile: ./bin/pdf-page-extractor/Dockerfile context: ./bin/pdf-page-extractor image: ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-extractor + - dockerfile: ./bin/pdf-page-joiner/Dockerfile + context: ./bin/pdf-page-joiner + image: ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner runs-on: ubuntu-latest permissions: diff --git a/app/Actions/PdfPagesJoinerAction.php b/app/Actions/PdfPagesJoinerAction.php new file mode 100644 index 0000000..a628e86 --- /dev/null +++ b/app/Actions/PdfPagesJoinerAction.php @@ -0,0 +1,62 @@ +map(function($input_file) { + if(is_string($input_file)) + return new SplFileInfo($input_file); + + return $input_file; + }) + ->values() + ->mapWithKeys(fn(SplFileInfo $file, $index) => ['input-'.str_pad($index, 5, '0', STR_PAD_LEFT).'.pdf' => $file]); + + $temporaryDirectory = $this->tempDirMaker->create(); + + $input_files->each(function(SplFileInfo $input_file, $name) use ($temporaryDirectory) { + copy($input_file->getPathname(), $temporaryDirectory->path($name)); + }); + + $action = $this + ->runDockerContainerAction + ->withTemporaryDirectory($temporaryDirectory); + + $action->execute(dockerImageName: 'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner',output: $output, return: $return); + + if (0 != $return) { + $temporaryDirectory->delete(); + throw new PdfPageContentsExtractorException( + command: $action->getCommand(dockerImageName: 'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner'), + code: $return, + output: $output, + ); + } + + copy($temporaryDirectory->path('output.pdf'), $output_file->getPathname()); + + $temporaryDirectory->delete(); + } +} diff --git a/app/Actions/ScanBarcodes.php b/app/Actions/ScanBarcodes.php index 5880ab6..6e65173 100644 --- a/app/Actions/ScanBarcodes.php +++ 
b/app/Actions/ScanBarcodes.php @@ -47,7 +47,7 @@ public function execute(string|SplFileInfo $input_file): Collection $barcodes = collect(explode("\n", $barcodes)) ->map(fn($barcode) => trim($barcode)) ->filter(fn($barcode) => !empty($barcode)) - ->map(fn($barcode) => explode(':', $barcode)) + ->map(fn($barcode) => explode(':', $barcode, 2)) ->map(fn($barcode) => [ 'type' => $barcode[0], 'value' => $barcode[1], diff --git a/app/Commands/BuildDockerImagesCommand.php b/app/Commands/BuildDockerImagesCommand.php index 6bdd841..3313a38 100644 --- a/app/Commands/BuildDockerImagesCommand.php +++ b/app/Commands/BuildDockerImagesCommand.php @@ -47,6 +47,7 @@ public function handle(BuildDockerImageAction $builder, RunDockerContainerAction collect([ 'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-extractor' => base_path('bin/pdf-page-extractor/'), + 'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner' => base_path('bin/pdf-page-joiner/'), 'ghcr.io/kduma-oss/cli-pdf-scan-splitter/barcode-scanner' => base_path('bin/barcode-scanner/'), ])->each(function ($path, $tag) use ($builder, $runner) { $this->info($builder->getCommand($tag, $path)); diff --git a/app/Commands/ProcessPdfFilesCommand.php b/app/Commands/ProcessPdfFilesCommand.php index 284213d..1d7fe00 100644 --- a/app/Commands/ProcessPdfFilesCommand.php +++ b/app/Commands/ProcessPdfFilesCommand.php @@ -2,10 +2,12 @@ namespace App\Commands; +use App\Actions\PdfPagesJoinerAction; use App\Actions\PdfPagesSplitterAction; use App\Actions\ScanBarcodes; use App\Actions\Tools\TemporaryDirectoryCreatorAction; use Illuminate\Console\Scheduling\Schedule; +use Illuminate\Support\Collection; use Illuminate\Support\Str; use LaravelZero\Framework\Commands\Command; use Spatie\TemporaryDirectory\TemporaryDirectory; @@ -34,7 +36,7 @@ class ProcessPdfFilesCommand extends Command * * @return mixed */ - public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, TemporaryDirectoryCreatorAction $tempDirMaker) + public 
function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, TemporaryDirectoryCreatorAction $tempDirMaker, PdfPagesJoinerAction $joiner) { $inputs = collect($this->argument('pdf')) ->map(fn($path) => realpath($path)) @@ -66,7 +68,7 @@ public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, output_dir: $temporaryDirectory->path(), ); - $file_hash = sha1($file->getRealPath()); + $file_hash = sha1($file->getRealPath()).'_'.Str::random(8); foreach ($pages as $page) { $outputs[] = $output = $temporaryInputsDirectory->path($file_hash.'-'.basename($page)); rename($page, $output); @@ -76,15 +78,16 @@ public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, }); $final = []; + $tagged = collect([]); $outputs = collect($outputs)->map(fn($path) => new SplFileInfo($path)); $this->newLine(2); $this->info("Scanning pages for barcodes"); - $this->withProgressBar($outputs, function (SplFileInfo $page) use ($temporaryOutputDirectory, $scanner, &$final) { - $bc = $scanner->execute( + $this->withProgressBar($outputs, function (SplFileInfo $page) use ($temporaryOutputDirectory, $scanner, &$final, &$tagged) { + $scanned = $scanner->execute( input_file: $page ); - $bc = $bc->filter(fn($barcode) => $barcode['type'] == 'CODE-128'); + $bc = $scanned->filter(fn($barcode) => $barcode['type'] == 'CODE-128'); if($bc->count() > 0) { $bc = $bc->first(); $bc = $bc['value']; @@ -92,10 +95,105 @@ public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, $bc = 'UNKNOWN'; } - $final[] = $output = $this->getOutputPath($temporaryOutputDirectory, $bc); + $page_tags = $scanned + ->filter(fn($barcode) => $barcode['type'] == 'QR-Code') + ->map(function ($barcode) { + if(1 !== preg_match('/^([0-9A-Za-z]*):(\\d+)(:(\\d+))?$/um', $barcode['value'])) { /* preg_match() returns 0 for "no match" and false only on an engine error, so require a positive match; QR values that are not "id:page[:count]" are dropped by the filter() below. The id part may be empty - it is then replaced with the page's Code-128 value further down. */ + return null; + } + [$id, $page, $count] = explode(':', $barcode['value'].':::'); + + $barcode['tag'] = [ + 'id' => $id != "" ? $id : null, + 'page' => $page != "" ? 
$page : null, + 'count' => $count != "" ? $count : null, + ]; + + return $barcode; + }) + ->filter() + ->map(function ($barcode) use ($bc) { + if(is_null($barcode['tag']['id'])) { + $barcode['tag']['id'] = $bc; + } + + return $barcode; + }); + + if($page_tags->count() > 0) { + $t = $page_tags->first(); + $id = $t['tag']['id']; + $page_no = $t['tag']['page']; + + if(!isset($tagged[$id])) { + $tagged[$id] = collect(); + } + + if(!isset($tagged[$id][$page_no])) { + $tagged[$id][$page_no] = collect(); + } + + $output = $this->getOutputPath($temporaryOutputDirectory, 'TG_'.$bc); + + $tagged[$id][$page_no][] = [ + 'file' => $output, + 'tag' => $t['tag'], + 'barcode' => $bc, + ]; + + + } else { + $final[] = $output = $this->getOutputPath($temporaryOutputDirectory, $bc); + } rename($page, $output); }); + if($tagged->count()) { + $this->newLine(2); + $errors = []; + $this->info("Processing multi-page documents"); + $this->withProgressBar($tagged, function (Collection $tag_pages) use ($joiner, $temporaryOutputDirectory, &$errors, &$final) { + $tag_pages = $tag_pages->sortKeys(); + $tag_name = $tag_pages->first()->first()['tag']['id']; + + if($tag_pages->keys()->max() != $tag_pages->keys()->count()) { + $errors[] = $tag_name.' has missing pages - last page is '.$tag_pages->keys()->max(). ' but there are '.$tag_pages->keys()->count().' pages!'; + } + + $tag_pages = $tag_pages->map(function (Collection $tag_page, $page_number) use ($tag_name, &$errors, &$final) { + if($tag_page->count() > 1) { + $errors[] = $tag_name.':'.$page_number.' has been scanned multiple times!'; + } + + $used = $tag_page->pop(); + + foreach ($tag_page as $p) { + $file = $p['file']; + $new_file = pathinfo($file, PATHINFO_DIRNAME).DIRECTORY_SEPARATOR.'IGNORED_'.pathinfo($file, PATHINFO_FILENAME).'_'.$p['tag']['page'].'.'.pathinfo($file, PATHINFO_EXTENSION); + $errors[] = $tag_name.':'.$page_number.' 
- ignored file placed at '.basename($new_file); + + $final[] = $new_file; + rename($file, $new_file); + } + + return $used; + }); + + $barcode = $tag_pages->first()['barcode']; + + + $joiner->execute( + input_files: $tag_pages->pluck('file'), + output_file: $output = $this->getOutputPath($temporaryOutputDirectory, $barcode), + ); + $final[] = $output; + }); + $this->newLine(2); + foreach ($errors as $error) { + $this->error($error); + } + } + $this->newLine(2); $this->info("Moving files to output directory"); $final = collect($final)->map(fn($path) => new SplFileInfo($path)); diff --git a/bin/pdf-page-joiner/Dockerfile b/bin/pdf-page-joiner/Dockerfile new file mode 100644 index 0000000..94d6c64 --- /dev/null +++ b/bin/pdf-page-joiner/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:22.04 +#FROM ubuntu:16.04 + +WORKDIR /src + +RUN apt-get update && apt-get install -y \ + poppler-utils \ + && rm -rf /var/lib/apt/lists/* + +COPY run.sh run.sh + +WORKDIR /data + +ENTRYPOINT ["/src/run.sh"] diff --git a/bin/pdf-page-joiner/run.sh b/bin/pdf-page-joiner/run.sh new file mode 100755 index 0000000..8aafeca --- /dev/null +++ b/bin/pdf-page-joiner/run.sh @@ -0,0 +1,2 @@ +#!/bin/bash +/usr/bin/pdfunite /data/input-*.pdf '/data/output.pdf' || exit 1 diff --git a/test_data/test-randomized.pdf b/test_data/test-randomized.pdf new file mode 100644 index 0000000..588ac05 Binary files /dev/null and b/test_data/test-randomized.pdf differ diff --git a/test_data/test.pages b/test_data/test.pages new file mode 100755 index 0000000..8ea070d Binary files /dev/null and b/test_data/test.pages differ diff --git a/test_data/test.pdf b/test_data/test.pdf new file mode 100644 index 0000000..6b6bbf7 Binary files /dev/null and b/test_data/test.pdf differ