Skip to content

Commit

Permalink
Merge pull request #1 from kduma-OSS/feature/multipage
Browse files Browse the repository at this point in the history
Process tagged multipage documents and output combined PDF files
  • Loading branch information
kduma authored Nov 30, 2022
2 parents 8b38950 + 1f7b841 commit 28021cd
Show file tree
Hide file tree
Showing 10 changed files with 187 additions and 7 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ jobs:
- dockerfile: ./bin/pdf-page-extractor/Dockerfile
context: ./bin/pdf-page-extractor
image: ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-extractor
- dockerfile: ./bin/pdf-page-joiner/Dockerfile
context: ./bin/pdf-page-joiner
image: ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner

runs-on: ubuntu-latest
permissions:
Expand Down
62 changes: 62 additions & 0 deletions app/Actions/PdfPagesJoinerAction.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php

namespace App\Actions;

use App\Actions\Tools\RunDockerContainerAction;
use App\Actions\Tools\Exceptions\PdfPageContentsExtractorException;
use App\Actions\Tools\TemporaryDirectoryCreatorAction;
use Illuminate\Support\Collection;
use Illuminate\Support\Str;
use SplFileInfo;

/**
 * Joins several PDF files into one PDF by running the pdf-page-joiner Docker image.
 *
 * The inputs are copied into a scratch directory under sequential names
 * (input-00000.pdf, input-00001.pdf, ...), the container is executed against that
 * directory, and the resulting output.pdf is copied to the requested destination.
 */
class PdfPagesJoinerAction
{
    /** Docker image that performs the merge. */
    private const DOCKER_IMAGE = 'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner';

    public function __construct(
        protected RunDockerContainerAction $runDockerContainerAction,
        protected TemporaryDirectoryCreatorAction $tempDirMaker
    ) {}

    /**
     * Merge the given PDF files, in iteration order, into a single output file.
     *
     * @param array|Collection   $input_files Paths (string) or SplFileInfo instances of the PDFs to join.
     * @param string|SplFileInfo $output_file Destination of the combined PDF.
     *
     * @throws PdfPageContentsExtractorException When the container exits with a non-zero code.
     * @throws \RuntimeException                 When an input or the result cannot be copied.
     */
    public function execute(array|Collection $input_files, string|SplFileInfo $output_file): void
    {
        if (is_string($output_file)) {
            $output_file = new SplFileInfo($output_file);
        }

        if (is_array($input_files)) {
            $input_files = collect($input_files);
        }

        // Re-index the inputs and give them zero-padded names so that the
        // lexicographic order of the file names equals the requested page order.
        $input_files = $input_files
            ->map(fn ($input_file) => is_string($input_file) ? new SplFileInfo($input_file) : $input_file)
            ->values()
            ->mapWithKeys(fn (SplFileInfo $file, $index) => [
                'input-'.str_pad((string) $index, 5, '0', STR_PAD_LEFT).'.pdf' => $file,
            ]);

        $temporaryDirectory = $this->tempDirMaker->create();

        // try/finally guarantees the scratch directory is removed on every exit
        // path, including exceptions raised while copying or running the container.
        try {
            $input_files->each(function (SplFileInfo $input_file, $name) use ($temporaryDirectory) {
                if (!copy($input_file->getPathname(), $temporaryDirectory->path($name))) {
                    throw new \RuntimeException('Unable to copy "'.$input_file->getPathname().'" to the temporary directory.');
                }
            });

            $action = $this
                ->runDockerContainerAction
                ->withTemporaryDirectory($temporaryDirectory);

            // $output and $return are populated by the runner (passed by reference).
            $action->execute(dockerImageName: self::DOCKER_IMAGE, output: $output, return: $return);

            if (0 != $return) {
                throw new PdfPageContentsExtractorException(
                    command: $action->getCommand(dockerImageName: self::DOCKER_IMAGE),
                    code: $return,
                    output: $output,
                );
            }

            // NOTE(review): assumes the container writes its result as output.pdf
            // inside the shared temporary directory - confirm against the image's run.sh.
            if (!copy($temporaryDirectory->path('output.pdf'), $output_file->getPathname())) {
                throw new \RuntimeException('Unable to copy the joined PDF to "'.$output_file->getPathname().'".');
            }
        } finally {
            $temporaryDirectory->delete();
        }
    }
}
2 changes: 1 addition & 1 deletion app/Actions/ScanBarcodes.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public function execute(string|SplFileInfo $input_file): Collection
$barcodes = collect(explode("\n", $barcodes))
->map(fn($barcode) => trim($barcode))
->filter(fn($barcode) => !empty($barcode))
->map(fn($barcode) => explode(':', $barcode))
->map(fn($barcode) => explode(':', $barcode, 2))
->map(fn($barcode) => [
'type' => $barcode[0],
'value' => $barcode[1],
Expand Down
1 change: 1 addition & 0 deletions app/Commands/BuildDockerImagesCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public function handle(BuildDockerImageAction $builder, RunDockerContainerAction

collect([
'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-extractor' => base_path('bin/pdf-page-extractor/'),
'ghcr.io/kduma-oss/cli-pdf-scan-splitter/pdf-page-joiner' => base_path('bin/pdf-page-joiner/'),
'ghcr.io/kduma-oss/cli-pdf-scan-splitter/barcode-scanner' => base_path('bin/barcode-scanner/'),
])->each(function ($path, $tag) use ($builder, $runner) {
$this->info($builder->getCommand($tag, $path));
Expand Down
110 changes: 104 additions & 6 deletions app/Commands/ProcessPdfFilesCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

namespace App\Commands;

use App\Actions\PdfPagesJoinerAction;
use App\Actions\PdfPagesSplitterAction;
use App\Actions\ScanBarcodes;
use App\Actions\Tools\TemporaryDirectoryCreatorAction;
use Illuminate\Console\Scheduling\Schedule;
use Illuminate\Support\Collection;
use Illuminate\Support\Str;
use LaravelZero\Framework\Commands\Command;
use Spatie\TemporaryDirectory\TemporaryDirectory;
Expand Down Expand Up @@ -34,7 +36,7 @@ class ProcessPdfFilesCommand extends Command
*
* @return mixed
*/
public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, TemporaryDirectoryCreatorAction $tempDirMaker)
public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner, TemporaryDirectoryCreatorAction $tempDirMaker, PdfPagesJoinerAction $joiner)
{
$inputs = collect($this->argument('pdf'))
->map(fn($path) => realpath($path))
Expand Down Expand Up @@ -66,7 +68,7 @@ public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner,
output_dir: $temporaryDirectory->path(),
);

$file_hash = sha1($file->getRealPath());
$file_hash = sha1($file->getRealPath()).'_'.Str::random(8);
foreach ($pages as $page) {
$outputs[] = $output = $temporaryInputsDirectory->path($file_hash.'-'.basename($page));
rename($page, $output);
Expand All @@ -76,26 +78,122 @@ public function handle(PdfPagesSplitterAction $extractor, ScanBarcodes $scanner,
});

$final = [];
$tagged = collect([]);
$outputs = collect($outputs)->map(fn($path) => new SplFileInfo($path));
$this->newLine(2);
$this->info("Scanning pages for barcodes");
$this->withProgressBar($outputs, function (SplFileInfo $page) use ($temporaryOutputDirectory, $scanner, &$final) {
$bc = $scanner->execute(
$this->withProgressBar($outputs, function (SplFileInfo $page) use ($temporaryOutputDirectory, $scanner, &$final, &$tagged) {
$scanned = $scanner->execute(
input_file: $page
);

$bc = $bc->filter(fn($barcode) => $barcode['type'] == 'CODE-128');
$bc = $scanned->filter(fn($barcode) => $barcode['type'] == 'CODE-128');
if($bc->count() > 0) {
$bc = $bc->first();
$bc = $bc['value'];
} else {
$bc = 'UNKNOWN';
}

$final[] = $output = $this->getOutputPath($temporaryOutputDirectory, $bc);
$page_tags = $scanned
->filter(fn($barcode) => $barcode['type'] == 'QR-Code')
->map(function ($barcode) {
    // Parse a QR tag of the form "<id>:<page>[:<count>]".
    // preg_match() returns 1 on a match, 0 on no match and false only on an
    // engine error, so anything other than 1 means the value is not a valid tag.
    // The id group is optional (*) because an empty id is replaced further down
    // the chain with the page's CODE-128 barcode.
    if (1 !== preg_match('/^([0-9A-Za-z]*):(\\d+)(:(\\d+))?$/um', $barcode['value'])) {
        return null;
    }

    // Padding with ':::' guarantees at least three segments for the destructuring.
    [$id, $page, $count] = explode(':', $barcode['value'].':::');

    $barcode['tag'] = [
        'id' => $id != "" ? $id : null,
        'page' => $page != "" ? $page : null,
        'count' => $count != "" ? $count : null,
    ];

    return $barcode;
})
->filter()
->map(function ($barcode) use ($bc) {
if(is_null($barcode['tag']['id'])) {
$barcode['tag']['id'] = $bc;
}

return $barcode;
});

if($page_tags->count() > 0) {
$t = $page_tags->first();
$id = $t['tag']['id'];
$page_no = $t['tag']['page'];

if(!isset($tagged[$id])) {
$tagged[$id] = collect();
}

if(!isset($tagged[$id][$page_no])) {
$tagged[$id][$page_no] = collect();
}

$output = $this->getOutputPath($temporaryOutputDirectory, 'TG_'.$bc);

$tagged[$id][$page_no][] = [
'file' => $output,
'tag' => $t['tag'],
'barcode' => $bc,
];


} else {
$final[] = $output = $this->getOutputPath($temporaryOutputDirectory, $bc);
}
rename($page, $output);
});

if($tagged->count()) {
$this->newLine(2);
$errors = [];
$this->info("Processing multi-page documents");
$this->withProgressBar($tagged, function (Collection $tag_pages) use ($joiner, $temporaryOutputDirectory, &$errors, &$final) {
$tag_pages = $tag_pages->sortKeys();
$tag_name = $tag_pages->first()->first()['tag']['id'];

if($tag_pages->keys()->max() != $tag_pages->keys()->count()) {
$errors[] = $tag_name.' has missing pages - last page is '.$tag_pages->keys()->max(). ' but there are '.$tag_pages->keys()->count().' pages!';
}

$tag_pages = $tag_pages->map(function (Collection $tag_page, $page_number) use ($tag_name, &$errors, &$final) {
if($tag_page->count() > 1) {
$errors[] = $tag_name.':'.$page_number.' has been scanned multiple times!';
}

$used = $tag_page->pop();

foreach ($tag_page as $p) {
$file = $p['file'];
$new_file = pathinfo($file, PATHINFO_DIRNAME).DIRECTORY_SEPARATOR.'IGNORED_'.pathinfo($file, PATHINFO_FILENAME).'_'.$p['tag']['page'].'.'.pathinfo($file, PATHINFO_EXTENSION);
$errors[] = $tag_name.':'.$page_number.' - ignored file placed at '.basename($new_file);

$final[] = $new_file;
rename($file, $new_file);
}

return $used;
});

$barcode = $tag_pages->first()['barcode'];


$joiner->execute(
input_files: $tag_pages->pluck('file'),
output_file: $output = $this->getOutputPath($temporaryOutputDirectory, $barcode),
);
$final[] = $output;
});
$this->newLine(2);
foreach ($errors as $error) {
$this->error($error);
}
}

$this->newLine(2);
$this->info("Moving files to output directory");
$final = collect($final)->map(fn($path) => new SplFileInfo($path));
Expand Down
14 changes: 14 additions & 0 deletions bin/pdf-page-joiner/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Image that merges PDF files with `pdfunite` (invoked by run.sh).
FROM ubuntu:22.04

# /src holds the entrypoint script, kept apart from the /data working directory.
WORKDIR /src

# --no-install-recommends keeps the image small; the apt lists are removed in the
# same layer so they do not end up in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

COPY run.sh run.sh

# run.sh reads /data/input-*.pdf and writes /data/output.pdf.
WORKDIR /data

ENTRYPOINT ["/src/run.sh"]
2 changes: 2 additions & 0 deletions bin/pdf-page-joiner/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Merge every /data/input-*.pdf (in glob order) into /data/output.pdf.
# Any pdfunite failure is reported to the caller as exit status 1.
if ! /usr/bin/pdfunite /data/input-*.pdf '/data/output.pdf'; then
    exit 1
fi
Binary file added test_data/test-randomized.pdf
Binary file not shown.
Binary file added test_data/test.pages
Binary file not shown.
Binary file added test_data/test.pdf
Binary file not shown.

0 comments on commit 28021cd

Please sign in to comment.