diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2577c78..b9ded33 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,6 +1,7 @@ name: Build artifact on: + pull_request: workflow_dispatch: env: @@ -36,5 +37,5 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v1 with: - name: ${{ env.APP_NAME }}.tar.gz - path: ${{ env.APP_NAME }}/build/artifacts/appstore/${{ env.APP_NAME }}.tar.gz + name: ${{ env.APP_NAME }}.tar.gz + path: ${{ env.APP_NAME }}/build/artifacts/appstore/${{ env.APP_NAME }}.tar.gz diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index be9a767..7fca40f 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -42,6 +42,7 @@ jobs: uses: actions/checkout@v2 with: path: apps/${{ env.APP_NAME }} + fetch-depth: 0 - name: Set up php ${{ matrix.php-versions }} uses: shivammathur/setup-php@v2 diff --git a/.github/workflows/toc_generator.yml b/.github/workflows/toc_generator.yml deleted file mode 100644 index 2667613..0000000 --- a/.github/workflows/toc_generator.yml +++ /dev/null @@ -1,13 +0,0 @@ -on: - pull_request: - types: [opened, synchronize] -name: TOC Generator -jobs: - generateTOC: - name: TOC Generator - runs-on: ubuntu-20.04 - steps: - - uses: technote-space/toc-generator@v2 - with: - CREATE_PR: true - TOC_TITLE: "## Table of contents" \ No newline at end of file diff --git a/Makefile b/Makefile index 5885a82..1ceed4e 100644 --- a/Makefile +++ b/Makefile @@ -180,6 +180,7 @@ appstore: --exclude="../$(app_name)/*.json" \ --exclude="../$(app_name)/*.lock" \ --exclude="../$(app_name)/*.cov" \ + --exclude="../$(app_name)/psalm.xml" \ ../$(app_name) \ .PHONY: test diff --git a/README.md b/README.md index 9cf4b25..1ebeee6 100644 --- a/README.md +++ b/README.md @@ -6,36 +6,36 @@ [![Generic badge](https://img.shields.io/github/v/release/R0Wi/workflow_ocr)](https://github.com/R0Wi/workflow_ocr/releases) [![Generic 
badge](https://img.shields.io/badge/Nextcloud-24-orange)](https://github.com/nextcloud/server) - - ## Table of contents -- [Setup](#setup) - - [App installation](#app-installation) - - [Nextcloud background jobs](#nextcloud-background-jobs) - - [Backend](#backend) -- [Usage](#usage) - - [Useful triggers](#useful-triggers) - - [Trigger OCR if file was created or updated](#trigger-ocr-if-file-was-created-or-updated) - - [Trigger OCR on tag assigning](#trigger-ocr-on-tag-assigning) - - [Settings](#settings) - - [Per workflow settings](#per-workflow-settings) - - [Global settings](#global-settings) - - [Testing your configuration](#testing-your-configuration) -- [How it works](#how-it-works) - - [General](#general) - - [PDF](#pdf) - - [Images](#images) -- [Development](#development) - - [Dev setup](#dev-setup) - - [Debugging](#debugging) - - [`docker`-based setup](#docker-based-setup) - - [Executing tests](#executing-tests) - - [Adding a new `OcrProcessor`](#adding-a-new-ocrprocessor) -- [Limitations](#limitations) -- [Used libraries & components](#used-libraries--components) - - +- [Nextcloud Workflow OCR app](#nextcloud-workflow-ocr-app) + - [Table of contents](#table-of-contents) + - [Setup](#setup) + - [App installation](#app-installation) + - [Nextcloud background jobs](#nextcloud-background-jobs) + - [Backend](#backend) + - [Usage](#usage) + - [Useful triggers](#useful-triggers) + - [Trigger OCR if file was created or updated](#trigger-ocr-if-file-was-created-or-updated) + - [Trigger OCR on tag assigning](#trigger-ocr-on-tag-assigning) + - [Settings](#settings) + - [Per workflow settings](#per-workflow-settings) + - [Global settings](#global-settings) + - [Testing your configuration](#testing-your-configuration) + - [How it works](#how-it-works) + - [General](#general) + - [PDF](#pdf) + - [Images](#images) + - [Development](#development) + - [Dev setup](#dev-setup) + - [Debugging](#debugging) + - [`docker`-based setup](#docker-based-setup) + - [Executing 
tests](#executing-tests) + - [Adding a new `OcrProcessor`](#adding-a-new-ocrprocessor) + - [Events emitted by the app](#events-emitted-by-the-app) + - [`TextRecognizedEvent`](#textrecognizedevent) + - [Limitations](#limitations) + - [Used libraries & components](#used-libraries--components) ## Setup ### App installation @@ -334,6 +334,45 @@ public static function registerOcrProcessors(IRegistrationContext $context) : vo That's all. If you now create a new workflow based on your added mimetype, your implementation should be triggered by the app. The return value of `ocrFile(string $fileContent, WorkflowSettings $settings, GlobalSettings $globalSettings)` will be interpreted as the file content of the scanned file. This one is used to create a new file version in Nextcloud. +### Events emitted by the app + +The app currently emits the following events from `lib/Events`. You can use these hooks to extend the app's functionality inside your own app. +Use the following sample code to implement a listener for the events: + +```php +use OCA\WorkflowOcr\Events\TextRecognizedEvent; +use OCP\EventDispatcher\Event; +use OCP\EventDispatcher\IEventListener; + +class TextRecognizedListener implements IEventListener { + public function handle(Event $event): void { + if (!$event instanceof TextRecognizedEvent) { + return; + } + // Do something with the event ... + } +} +``` + +Your implementation should then be registered in your app's `Application.php`: + +```php +public function register(IRegistrationContext $context): void { + $context->registerEventListener(TextRecognizedEvent::class, TextRecognizedListener::class); +} +``` + +#### `TextRecognizedEvent` + +This event will be emitted when an OCR process has finished successfully. It contains the following information: + +| Method | Type | Description | +|--------|-------|------------| +| `getRecognizedText()` | `string` | Contains the text which was recognized by the OCR process. 
| +| `getFile()` | `OCP\Files\File` | The NC file node where the OCR processed file was stored to. | + +> **Note:** this event will be emitted even if the OCR content was empty. + ## Limitations * **Currently only pdf documents (`application/pdf`) can be used as input.** Other mimetypes are currently ignored but might be added in the future. * Pdf metadata (like author, comments, ...) is not available in the converted output pdf document. diff --git a/appinfo/routes.php b/appinfo/routes.php index 79e9b05..0ce5661 100644 --- a/appinfo/routes.php +++ b/appinfo/routes.php @@ -27,7 +27,8 @@ return [ 'routes' => [ - ['name' => 'GlobalSettings#getGlobalSettings', 'url' => '/globalsettings', 'verb' => 'GET'], - ['name' => 'GlobalSettings#setGlobalSettings', 'url' => '/globalsettings', 'verb' => 'PUT'] + ['name' => 'GlobalSettings#getGlobalSettings', 'url' => '/globalSettings', 'verb' => 'GET'], + ['name' => 'GlobalSettings#setGlobalSettings', 'url' => '/globalSettings', 'verb' => 'PUT'], + ['name' => 'OcrBackendInfo#getInstalledLanguages', 'url' => '/ocrBackendInfo/installedLangs', 'verb' => 'GET'] ] ]; diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index ab5ec64..a948da9 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -28,13 +28,19 @@ namespace OCA\WorkflowOcr\AppInfo; use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; +use OCA\WorkflowOcr\Helper\ISidecarFileAccessor; use OCA\WorkflowOcr\Helper\ProcessingFileAccessor; +use OCA\WorkflowOcr\Helper\SidecarFileAccessor; use OCA\WorkflowOcr\Listener\RegisterFlowOperationsListener; use OCA\WorkflowOcr\OcrProcessors\IOcrProcessorFactory; use OCA\WorkflowOcr\OcrProcessors\OcrProcessorFactory; +use OCA\WorkflowOcr\Service\IEventService; +use OCA\WorkflowOcr\Service\EventService; use OCA\WorkflowOcr\Service\GlobalSettingsService; use OCA\WorkflowOcr\Service\IGlobalSettingsService; +use OCA\WorkflowOcr\Service\IOcrBackendInfoService; use OCA\WorkflowOcr\Service\IOcrService; 
+use OCA\WorkflowOcr\Service\OcrBackendInfoService; use OCA\WorkflowOcr\Service\OcrService; use OCA\WorkflowOcr\Wrapper\CommandWrapper; use OCA\WorkflowOcr\Wrapper\Filesystem; @@ -46,7 +52,10 @@ use OCP\AppFramework\Bootstrap\IBootContext; use OCP\AppFramework\Bootstrap\IBootstrap; use OCP\AppFramework\Bootstrap\IRegistrationContext; +use OCP\ITempManager; use OCP\WorkflowEngine\Events\RegisterOperationsEvent; +use Psr\Container\ContainerInterface; +use Psr\Log\LoggerInterface; class Application extends App implements IBootstrap { public const COMPOSER_DIR = __DIR__ . '/../../vendor/'; @@ -68,11 +77,16 @@ public function register(IRegistrationContext $context): void { $context->registerServiceAlias(IViewFactory::class, ViewFactory::class); $context->registerServiceAlias(IFilesystem::class, Filesystem::class); $context->registerServiceAlias(IGlobalSettingsService::class, GlobalSettingsService::class); + $context->registerServiceAlias(IEventService::class, EventService::class); + $context->registerServiceAlias(IOcrBackendInfoService::class, OcrBackendInfoService::class); // BUG #43 $context->registerService(ICommand::class, function () { return new CommandWrapper(); }, false); + $context->registerService(ISidecarFileAccessor::class, function (ContainerInterface $c) { + return new SidecarFileAccessor($c->get(ITempManager::class), $c->get(LoggerInterface::class)); + }, false); $context->registerService(IProcessingFileAccessor::class, function () { return ProcessingFileAccessor::getInstance(); diff --git a/lib/BackgroundJobs/ProcessFileJob.php b/lib/BackgroundJobs/ProcessFileJob.php index 301d145..3a59660 100644 --- a/lib/BackgroundJobs/ProcessFileJob.php +++ b/lib/BackgroundJobs/ProcessFileJob.php @@ -35,6 +35,7 @@ use OCA\WorkflowOcr\Helper\IProcessingFileAccessor; use OCA\WorkflowOcr\Model\WorkflowSettings; use OCA\WorkflowOcr\Service\IOcrService; +use OCA\WorkflowOcr\Service\IEventService; use OCA\WorkflowOcr\Wrapper\IFilesystem; use 
OCA\WorkflowOcr\Wrapper\IViewFactory; use OCP\AppFramework\Utility\ITimeFactory; @@ -57,6 +58,8 @@ class ProcessFileJob extends \OCP\BackgroundJob\QueuedJob { private $rootFolder; /** @var IOcrService */ private $ocrService; + /** @var IEventService */ + private $eventService; /** @var IViewFactory */ private $viewFactory; /** @var IFilesystem */ @@ -72,6 +75,7 @@ public function __construct( LoggerInterface $logger, IRootFolder $rootFolder, IOcrService $ocrService, + IEventService $eventService, IViewFactory $viewFactory, IFilesystem $filesystem, IUserManager $userManager, @@ -82,6 +86,7 @@ public function __construct( $this->logger = $logger; $this->rootFolder = $rootFolder; $this->ocrService = $ocrService; + $this->eventService = $eventService; $this->viewFactory = $viewFactory; $this->filesystem = $filesystem; $this->userManager = $userManager; @@ -179,6 +184,7 @@ private function processFile(string $filePath, WorkflowSettings $settings) : voi return; } + $fileContent = $ocrFile->getFileContent(); $nodeId = $node->getId(); $originalFileExtension = $node->getExtension(); @@ -186,8 +192,10 @@ private function processFile(string $filePath, WorkflowSettings $settings) : voi if ($originalFileExtension === $newFileExtension) { $this->createNewFileVersion($filePath, $fileContent, $nodeId); + $this->eventService->textRecognized($ocrFile, $node); } else { $this->createNewFileVersion($filePath.".pdf", $fileContent, $nodeId); + $this->eventService->textRecognized($ocrFile, $node); } } diff --git a/lib/Controller/ControllerBase.php b/lib/Controller/ControllerBase.php new file mode 100644 index 0000000..eb467fc --- /dev/null +++ b/lib/Controller/ControllerBase.php @@ -0,0 +1,41 @@ + + * + * @author Robin Windey + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of 
the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +namespace OCA\WorkflowOcr\Controller; + +use OCP\AppFramework\Controller; +use OCP\AppFramework\Http\JSONResponse; + +abstract class ControllerBase extends Controller { + protected function tryExecute(callable $function) : JSONResponse { + try { + $result = $function(); + return new JSONResponse($result); + } catch (\Throwable $e) { + return new JSONResponse(['error' => $e->getMessage()], 500); + } + } +} diff --git a/lib/Controller/GlobalSettingsController.php b/lib/Controller/GlobalSettingsController.php index 03ae975..805172c 100644 --- a/lib/Controller/GlobalSettingsController.php +++ b/lib/Controller/GlobalSettingsController.php @@ -28,14 +28,13 @@ use OCA\WorkflowOcr\Model\GlobalSettings; use OCA\WorkflowOcr\Service\IGlobalSettingsService; -use OCP\AppFramework\Controller; use OCP\AppFramework\Http\JSONResponse; use OCP\IRequest; /** * This is the backend API controller for the Admin.vue component. 
*/ -class GlobalSettingsController extends Controller { +class GlobalSettingsController extends ControllerBase { /** @var IGlobalSettingsService */ private $globalSettingsService; @@ -66,13 +65,4 @@ public function setGlobalSettings(array $globalSettings) : JSONResponse { return $this->globalSettingsService->getGlobalSettings(); }); } - - private function tryExecute(callable $function) : JSONResponse { - try { - $result = $function(); - return new JSONResponse($result); - } catch (\Throwable $e) { - return new JSONResponse(['error' => $e->getMessage()], 500); - } - } } diff --git a/lib/Controller/OcrBackendInfoController.php b/lib/Controller/OcrBackendInfoController.php new file mode 100644 index 0000000..ec43fc2 --- /dev/null +++ b/lib/Controller/OcrBackendInfoController.php @@ -0,0 +1,53 @@ + + * + * @author Robin Windey + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +namespace OCA\WorkflowOcr\Controller; + +use OCA\WorkflowOcr\Service\IOcrBackendInfoService; +use OCP\AppFramework\Http\JSONResponse; +use OCP\IRequest; + +/** + * This is the backend API controller which provides information about the OCR backend system. 
+ */ +class OcrBackendInfoController extends ControllerBase { + /** @var IOcrBackendInfoService */ + private $ocrBackendInfoService; + + public function __construct($AppName, IRequest $request, IOcrBackendInfoService $ocrBackendInfoService) { + parent::__construct($AppName, $request); + $this->ocrBackendInfoService = $ocrBackendInfoService; + } + + /** + * @return JSONResponse + */ + public function getInstalledLanguages() : JSONResponse { + return $this->tryExecute(function () { + return $this->ocrBackendInfoService->getInstalledLanguages(); + }); + } +} diff --git a/lib/Events/TextRecognizedEvent.php b/lib/Events/TextRecognizedEvent.php new file mode 100644 index 0000000..97a7245 --- /dev/null +++ b/lib/Events/TextRecognizedEvent.php @@ -0,0 +1,67 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Events; + +use OCP\EventDispatcher\Event; +use OCP\Files\File; + +/** + * Class TextRecognizedEvent + * + * @package OCA\WorkflowOcr\Events + */ +class TextRecognizedEvent extends Event { + + + /** @var string */ + private $recognizedText; + + /** @var File */ + private $file; + + + /** + * TextRecognizedEvent constructor. 
+ */ + public function __construct(string $recognizedText, File $file) { + parent::__construct(); + + $this->recognizedText = $recognizedText; + $this->file = $file; + } + + /** + * @return string $recognizedText + */ + public function getRecognizedText(): string { + return $this->recognizedText; + } + + /** + * @return File $file + */ + public function getFile(): File { + return $this->file; + } +} diff --git a/lib/Exception/CommandException.php b/lib/Exception/CommandException.php new file mode 100644 index 0000000..8e21b56 --- /dev/null +++ b/lib/Exception/CommandException.php @@ -0,0 +1,32 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +namespace OCA\WorkflowOcr\Exception; + +use Exception; + +class CommandException extends Exception { + public function __construct(string $message, string $command) { + $this->message = "The command '$command' produced an error: $message"; + } +} diff --git a/lib/Helper/ISidecarFileAccessor.php b/lib/Helper/ISidecarFileAccessor.php new file mode 100644 index 0000000..fc35ab0 --- /dev/null +++ b/lib/Helper/ISidecarFileAccessor.php @@ -0,0 +1,40 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Helper; + +interface ISidecarFileAccessor { + /** + * Creates a new temporary sidecar file for OCR text content. + * If a file was already created, the path to the existing file is returned. + * + * @return string|bool Path to the sidecar file or false if the file could not be created + */ + public function getOrCreateSidecarFile(); + + /** + * Gets the content of the created sidecar file. File has to be created + * before calling this method. 
+ */ + public function getSidecarFileContent(): string; +} diff --git a/lib/Helper/SidecarFileAccessor.php b/lib/Helper/SidecarFileAccessor.php new file mode 100644 index 0000000..af7d821 --- /dev/null +++ b/lib/Helper/SidecarFileAccessor.php @@ -0,0 +1,64 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Helper; + +use OCP\ITempManager; +use Psr\Log\LoggerInterface; + +class SidecarFileAccessor implements ISidecarFileAccessor { + + /** @var ITempManager */ + private $tempManager; + + /** @var LoggerInterface */ + private $logger; + + /** @var string */ + private $sidecarFilePath; + + public function __construct(ITempManager $tempManager, LoggerInterface $logger) { + $this->tempManager = $tempManager; + $this->logger = $logger; + } + + /** + * {@inheritdoc} + */ + public function getOrCreateSidecarFile() { + if ($this->sidecarFilePath === null) { + $this->sidecarFilePath = $this->tempManager->getTemporaryFile('sidecar'); + if (!$this->sidecarFilePath) { + $this->logger->warning('Could not create temporary sidecar file'); + } + } + return $this->sidecarFilePath; + } + + /** + * {@inheritdoc} + */ + public function getSidecarFileContent(): string { + return $this->sidecarFilePath ? 
file_get_contents($this->sidecarFilePath) : ''; + } +} diff --git a/lib/Migration/Version2404Date20220903071748.php b/lib/Migration/Version2404Date20220903071748.php new file mode 100644 index 0000000..4d6add3 --- /dev/null +++ b/lib/Migration/Version2404Date20220903071748.php @@ -0,0 +1,142 @@ + + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +namespace OCA\WorkflowOcr\Migration; + +use Closure; +use Exception; +use OCP\DB\ISchemaWrapper; +use OCP\IDBConnection; +use OCP\Migration\IOutput; +use OCP\Migration\SimpleMigrationStep; + +class Version2404Date20220903071748 extends SimpleMigrationStep { + + /** @var IDBConnection */ + private $db; + + public function __construct(IDBConnection $db) { + $this->db = $db; + } + + /** + * {@inheritDoc} + */ + public function name(): string { + return 'migrate lang codes'; + } + + /** + * {@inheritDoc} + */ + public function description(): string { + return 'Execute migration of language codes towards tesseract langugage codes (e.g. 
deu instead of de)'; + } + + /** + * {@inheritDoc} + */ + public function changeSchema(IOutput $output, Closure $schemaClosure, array $options): ?ISchemaWrapper { + // 'id' and new 'operation' value will be stored here + $datasetsToMigrate = $this->getDatasetsToMigrate(); + $this->updateDatabase($datasetsToMigrate); + + return null; + } + + private function getDatasetsToMigrate() : array { + $langMapping = [ + 'de' => 'deu', + 'en' => 'eng', + 'fr' => 'fra', + 'it' => 'ita', + 'es' => 'spa', + 'pt' => 'por', + 'ru' => 'rus', + 'chi' => 'chi_sim' + ]; + + $builder = $this->db->getQueryBuilder(); + + $ocrFlowOperations = $builder->select('id', 'operation') + ->from('flow_operations') + ->where($builder->expr()->eq('class', $builder->createNamedParameter('OCA\WorkflowOcr\Operation'))) + ->executeQuery(); + + $datasetsToMigrate = []; + + try { + while ($row = $ocrFlowOperations->fetch()) { + $workflowSettings = json_decode($row['operation'], true); + $foundMapping = false; + $newLangArr = []; + $languagesArr = $workflowSettings['languages']; + + // Check if we need to migrate the languages code. + // If yes, we have to regenerate the whole 'operation' string. 
+ foreach ($languagesArr as $existingLang) { + if (array_key_exists($existingLang, $langMapping)) { + $newLangArr[] = $langMapping[$existingLang]; + $foundMapping = true; + continue; + } + $newLangArr[] = $existingLang; + } + + if ($foundMapping) { + $workflowSettings['languages'] = $newLangArr; + $datasetsToMigrate[] = [ + 'id' => $row['id'], + 'operation' => json_encode($workflowSettings) + ]; + } + } + } finally { + $ocrFlowOperations->closeCursor(); + } + + return $datasetsToMigrate; + } + + private function updateDatabase(array $datasetsToMigrate) : void { + $this->db->beginTransaction(); + + try { + $builder = $this->db->getQueryBuilder(); + $builder->update('flow_operations') + ->set('operation', $builder->createParameter('operation')) + ->where($builder->expr()->eq('id', $builder->createParameter('id'))); + + foreach ($datasetsToMigrate as $dataset) { + $builder->setParameter('id', $dataset['id']); + $builder->setParameter('operation', $dataset['operation']); + $builder->executeStatement(); + } + } catch (Exception $e) { + $this->db->rollBack(); + throw $e; + } + + $this->db->commit(); + } +} diff --git a/lib/OcrProcessors/OcrMyPdfBasedProcessor.php b/lib/OcrProcessors/OcrMyPdfBasedProcessor.php index fdbf2d4..83a8416 100644 --- a/lib/OcrProcessors/OcrMyPdfBasedProcessor.php +++ b/lib/OcrProcessors/OcrMyPdfBasedProcessor.php @@ -25,6 +25,7 @@ use Cocur\Chain\Chain; use OCA\WorkflowOcr\Exception\OcrNotPossibleException; +use OCA\WorkflowOcr\Helper\ISidecarFileAccessor; use OCA\WorkflowOcr\Model\GlobalSettings; use OCA\WorkflowOcr\Model\WorkflowSettings; use OCA\WorkflowOcr\Wrapper\ICommand; @@ -32,21 +33,6 @@ use Psr\Log\LoggerInterface; abstract class OcrMyPdfBasedProcessor implements IOcrProcessor { - /** @var array - * Mapping for VUE frontend lang settings. 
- * See also https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc#languages - */ - private static $langMapping = [ - 'de' => 'deu', - 'en' => 'eng', - 'fr' => 'fra', - 'it' => 'ita', - 'es' => 'spa', - 'pt' => 'por', - 'ru' => 'rus', - 'chi' => 'chi_sim', - 'est' => 'est' - ]; /** @var ICommand */ private $command; @@ -54,9 +40,13 @@ abstract class OcrMyPdfBasedProcessor implements IOcrProcessor { /** @var LoggerInterface */ private $logger; - public function __construct(ICommand $command, LoggerInterface $logger) { + /** @var ISidecarFileAccessor */ + private $sidecarFileAccessor; + + public function __construct(ICommand $command, LoggerInterface $logger, ISidecarFileAccessor $sidecarFileAccessor) { $this->command = $command; $this->logger = $logger; + $this->sidecarFileAccessor = $sidecarFileAccessor; } public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings): OcrProcessorResult { @@ -93,9 +83,15 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $ throw new OcrNotPossibleException('OCRmyPDF did not produce any output'); } + $recognizedText = $this->sidecarFileAccessor->getSidecarFileContent(); + + if (!$recognizedText) { + $this->logger->info('Temporary sidecar file at \'{path}\' was empty', ['path' => $this->sidecarFileAccessor->getOrCreateSidecarFile()]); + } + $this->logger->debug("OCR processing was successful"); - return new OcrProcessorResult($ocrFileContent, "pdf"); + return new OcrProcessorResult($ocrFileContent, "pdf", $recognizedText); } /** @@ -115,14 +111,7 @@ private function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $ // Language settings if ($settings->getLanguages()) { - $langStr = Chain::create($settings->getLanguages()) - ->map(function ($langCode) { - return self::$langMapping[(string)$langCode] ?? 
null; - }) - ->filter(function ($l) { - return $l !== null; - }) - ->join('+'); + $langStr = Chain::create($settings->getLanguages())->join('+'); $args[] = "-l $langStr"; } @@ -138,6 +127,12 @@ private function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $ $args[] = '-j ' . $processorCount; } + // Save recognized text in tempfile + $sidecarFilePath = $this->sidecarFileAccessor->getOrCreateSidecarFile(); + if ($sidecarFilePath) { + $args[] = '--sidecar ' . $sidecarFilePath; + } + $resultArgs = array_merge($args, $this->getAdditionalCommandlineArgs($settings, $globalSettings)); return implode(' ', $resultArgs); diff --git a/lib/OcrProcessors/OcrProcessorFactory.php b/lib/OcrProcessors/OcrProcessorFactory.php index 87c3287..7b150e6 100644 --- a/lib/OcrProcessors/OcrProcessorFactory.php +++ b/lib/OcrProcessors/OcrProcessorFactory.php @@ -24,6 +24,7 @@ namespace OCA\WorkflowOcr\OcrProcessors; use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException; +use OCA\WorkflowOcr\Helper\ISidecarFileAccessor; use OCA\WorkflowOcr\Wrapper\ICommand; use OCP\AppFramework\Bootstrap\IRegistrationContext; use Psr\Container\ContainerInterface; @@ -52,10 +53,10 @@ public static function registerOcrProcessors(IRegistrationContext $context) : vo * under the hood. 
*/ $context->registerService(PdfOcrProcessor::class, function (ContainerInterface $c) { - return new PdfOcrProcessor($c->get(ICommand::class), $c->get(LoggerInterface::class)); + return new PdfOcrProcessor($c->get(ICommand::class), $c->get(LoggerInterface::class), $c->get(ISidecarFileAccessor::class)); }, false); $context->registerService(ImageOcrProcessor::class, function (ContainerInterface $c) { - return new ImageOcrProcessor($c->get(ICommand::class), $c->get(LoggerInterface::class)); + return new ImageOcrProcessor($c->get(ICommand::class), $c->get(LoggerInterface::class), $c->get(ISidecarFileAccessor::class)); }, false); } diff --git a/lib/OcrProcessors/OcrProcessorResult.php b/lib/OcrProcessors/OcrProcessorResult.php index 5e6b9ce..e1d6b07 100644 --- a/lib/OcrProcessors/OcrProcessorResult.php +++ b/lib/OcrProcessors/OcrProcessorResult.php @@ -31,10 +31,14 @@ class OcrProcessorResult { private $fileContent; /** @var string */ private $fileExtension; + /** @var string */ + private $recognizedText; + - public function __construct(string $fileContent, string $fileExtension) { + public function __construct(string $fileContent, string $fileExtension, string $recognizedText) { $this->fileContent = $fileContent; $this->fileExtension = $fileExtension; + $this->recognizedText = $recognizedText; } public function getFileContent(): string { @@ -44,4 +48,8 @@ public function getFileContent(): string { public function getFileExtension(): string { return $this->fileExtension; } + + public function getRecognizedText(): string { + return $this->recognizedText; + } } diff --git a/lib/Service/EventService.php b/lib/Service/EventService.php new file mode 100644 index 0000000..ef62b3a --- /dev/null +++ b/lib/Service/EventService.php @@ -0,0 +1,45 @@ + + * + * @author g-schmitz + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * 
published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ +namespace OCA\WorkflowOcr\Service; + +use OCP\EventDispatcher\IEventDispatcher; +use OCA\WorkflowOcr\OcrProcessors\OcrProcessorResult; +use OCA\WorkflowOcr\Events\TextRecognizedEvent; +use OCP\Files\File; + +class EventService implements IEventService { + /** @var IEventDispatcher */ + private $eventDispatcher; + + public function __construct(IEventDispatcher $eventDispatcher) { + $this->eventDispatcher = $eventDispatcher; + } + + public function textRecognized(OcrProcessorResult $result, File $node) { + $event = new TextRecognizedEvent($result->getRecognizedText(), $node); + $this->eventDispatcher->dispatchTyped($event); + } +} diff --git a/lib/Service/IEventService.php b/lib/Service/IEventService.php new file mode 100644 index 0000000..5f0a177 --- /dev/null +++ b/lib/Service/IEventService.php @@ -0,0 +1,40 @@ + + * + * @author g-schmitz + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +namespace OCA\WorkflowOcr\Service; + +use OCA\WorkflowOcr\OcrProcessors\OcrProcessorResult; +use OCP\Files\File; + +interface IEventService { + /** + * Emits an event for a processed OCR result (the EventService implementation dispatches a TextRecognizedEvent). + * + * @param OcrProcessorResult $result The processed ocr result + * @param File $node The file handed to the event together with the result + */ + public function textRecognized(OcrProcessorResult $result, File $node); +} diff --git a/lib/Service/IOcrBackendInfoService.php b/lib/Service/IOcrBackendInfoService.php new file mode 100644 index 0000000..4f4e997 --- /dev/null +++ b/lib/Service/IOcrBackendInfoService.php @@ -0,0 +1,41 @@ + * + * @author Robin Windey + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +namespace OCA\WorkflowOcr\Service; + +use OCA\WorkflowOcr\Exception\CommandException; + +interface IOcrBackendInfoService { + + /** + * Returns all languages that are supported by the OCR backend. + * Languages will be returned as an array of language-code-strings, + * currently defined at https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc#languages. 
+ * @return string[] + * @throws CommandException + */ + public function getInstalledLanguages() : array; +} diff --git a/lib/Service/OcrBackendInfoService.php b/lib/Service/OcrBackendInfoService.php new file mode 100644 index 0000000..0ecfa82 --- /dev/null +++ b/lib/Service/OcrBackendInfoService.php @@ -0,0 +1,81 @@ + * + * @author Robin Windey + * + * @license GNU AGPL version 3 or any later version + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +namespace OCA\WorkflowOcr\Service; + +use Cocur\Chain\Chain; +use OCA\WorkflowOcr\Exception\CommandException; +use OCA\WorkflowOcr\Wrapper\ICommand; +use Psr\Log\LoggerInterface; + +class OcrBackendInfoService implements IOcrBackendInfoService { /* Queries the tesseract CLI for its installed languages. */ + /** @var ICommand Wrapper used to run the tesseract command line */ + private $command; + + /** @var LoggerInterface Receives a warning when tesseract succeeds but produced error output */ + private $logger; + + public function __construct(ICommand $command, LoggerInterface $logger) { /* Stores the injected command wrapper and logger. */ + $this->command = $command; + $this->logger = $logger; + } + + public function getInstalledLanguages() : array { /* Runs "tesseract --list-langs" and returns its output lines minus the header line and "osd"; throws CommandException on failure or empty output. */ + $commandStr = 'tesseract --list-langs'; + $this->command->setCommand($commandStr); + + $success = $this->command->execute(); + $errorOutput = $this->command->getError(); + $stdErr = $this->command->getStdErr(); + $exitCode = $this->command->getExitCode(); + + if (!$success) { /* Failed run: surface exit code and both error texts to the caller. */ + throw new CommandException('Exited abnormally with exit-code ' . $exitCode . '. Message: ' . $errorOutput . 
' ' . $stdErr, $commandStr); + } + + if ($stdErr !== '' || $errorOutput !== '') { /* Successful run that still wrote error text: log it, do not abort. */ + $this->logger->warning('Tesseract list languages succeeded with warning(s): {stdErr}, {errorOutput}', [ + 'stdErr' => $stdErr, + 'errorOutput' => $errorOutput + ]); + } + + $installedLangsStr = $this->command->getOutput(); + + if (!$installedLangsStr) { /* Any falsy output (e.g. an empty string) is treated as an error. */ + throw new CommandException('No output produced', $commandStr); + } + + $lines = explode("\n", $installedLangsStr); /* NOTE(review): empty lines or lines ending in "\r" are not removed below; presumably getOutput() returns trimmed output - confirm against ICommand. */ + $arr = Chain::create($lines) + ->slice(1) // Skip tesseract header line + ->filter(function ($line) { + return $line !== 'osd'; // Also skip "osd" (OSD is not a language) + }) + ->array; + return array_values($arr); /* array_values() re-indexes the remaining entries from 0. */ + } +} diff --git a/phpunit.integration.xml b/phpunit.integration.xml index 0796947..4b72ad0 100644 --- a/phpunit.integration.xml +++ b/phpunit.integration.xml @@ -21,6 +21,7 @@ ./tests ./vendor ./node_modules + ./lib/Migration ./.php-cs-fixer.dist.php diff --git a/phpunit.xml b/phpunit.xml index 463b03a..1821383 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -21,6 +21,7 @@ ./tests ./vendor ./node_modules + ./lib/Migration ./.php-cs-fixer.dist.php diff --git a/src/components/WorkflowOcr.vue b/src/components/WorkflowOcr.vue index 9aec6a2..818cda2 100644 --- a/src/components/WorkflowOcr.vue +++ b/src/components/WorkflowOcr.vue @@ -55,24 +55,13 @@