diff --git a/Jenkinsfile b/Jenkinsfile index cf07a10..64276f4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,6 +4,6 @@ def repoName = "pdfOcr" def dependencyRegex = "itextcore" def solutionFile = "i7n-ocr.sln" -def csprojFramework = "netcoreapp2.0" +def csprojFramework = "net461" automaticDotnetBuild(repoName, dependencyRegex, solutionFile, csprojFramework) diff --git a/doxyfile b/doxyfile index 014ef9b..e8db83f 100644 --- a/doxyfile +++ b/doxyfile @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "pdfOCR 1.0.3 API" +PROJECT_NAME = "pdfOCR 2.0.0 API" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version diff --git a/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs b/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs index 535049e..22b13f9 100644 --- a/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs +++ b/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs @@ -15,6 +15,6 @@ [assembly: Guid("d6a6ea97-1f23-448f-b700-eff62971d234")] -[assembly: AssemblyVersion("1.0.3.0")] -[assembly: AssemblyFileVersion("1.0.3.0")] -[assembly: AssemblyInformationalVersion("1.0.3")] +[assembly: AssemblyVersion("2.0.0.0")] +[assembly: AssemblyFileVersion("2.0.0.0")] +[assembly: AssemblyInformationalVersion("2.0.0")] diff --git a/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj b/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj index 73d3ee2..8b25ae6 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj +++ b/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj @@ -9,7 +9,7 @@ library - net45 + net461 true @@ -25,7 +25,7 @@ - + diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs index 
aa676a8..9077a3c 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs @@ -23,44 +23,129 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; +using iText.Commons.Actions.Contexts; +using iText.Commons.Utils; using iText.IO.Image; -using iText.IO.Util; using iText.Kernel.Colors; using iText.Kernel.Font; using iText.Kernel.Geom; +using iText.Kernel.Pdf; +using iText.Pdfa; using iText.Pdfocr.Helpers; +using iText.Pdfocr.Logs; using iText.Test; using iText.Test.Attributes; namespace iText.Pdfocr { public class ApiTest : ExtendedITextTest { + public static readonly String DESTINATION_FOLDER = NUnit.Framework.TestContext.CurrentContext.TestDirectory + + "/test/itext/pdfocr"; + + [NUnit.Framework.OneTimeSetUp] + public static void BeforeClass() { + CreateOrClearDestinationFolder(DESTINATION_FOLDER); + } + [NUnit.Framework.Test] - public virtual void TestTextInfo() { - String path = PdfHelper.GetDefaultImagePath(); - IDictionary> result = new CustomOcrEngine().DoImageOcr(new FileInfo(path)); - NUnit.Framework.Assert.AreEqual(1, result.Count); - TextInfo textInfo = new TextInfo(); - textInfo.SetText("text"); - textInfo.SetBboxRect(new Rectangle(204.0f, 158.0f, 538.0f, 136.0f)); - int page = 2; - result.Put(page, JavaCollectionsUtil.SingletonList(textInfo)); - NUnit.Framework.Assert.AreEqual(2, result.Count); - NUnit.Framework.Assert.AreEqual(textInfo.GetText(), result.Get(page)[0].GetText()); + public virtual void CreatePdfWithFileTest() { + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetMetaInfo(new ApiTest.DummyMetaInfo()); + OcrPdfCreator pdfCreator = new OcrPdfCreator(new CustomOcrEngine(), props); + using (PdfDocument pdf = pdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList(new FileInfo(PdfHelper + .GetDefaultImagePath())), PdfHelper.GetPdfWriter(), new 
DocumentProperties().SetEventCountingMetaInfo( + new ApiTest.DummyMetaInfo()))) { + String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding + .UTF8); + NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); + } + } + + [NUnit.Framework.Test] + public virtual void CreatePdfFileWithFileTest() { + String output = DESTINATION_FOLDER + "createPdfFileWithFileTest.pdf"; + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetMetaInfo(new ApiTest.DummyMetaInfo()); + OcrPdfCreator pdfCreator = new OcrPdfCreator(new CustomOcrEngine(), props); + pdfCreator.CreatePdfFile(JavaCollectionsUtil.SingletonList(new FileInfo(PdfHelper.GetDefaultImagePath + ())), new FileInfo(output)); + using (PdfDocument pdf = new PdfDocument(new PdfReader(output))) { + String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding + .UTF8); + NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); + } + } + + [NUnit.Framework.Test] + public virtual void CreatePdfAWithFileTest() { + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetMetaInfo(new ApiTest.DummyMetaInfo()).SetPdfLang + ("en-US"); + OcrPdfCreator pdfCreator = new OcrPdfCreator(new CustomOcrEngine(), props); + using (PdfDocument pdf = pdfCreator.CreatePdfA(JavaCollectionsUtil.SingletonList(new FileInfo(PdfHelper + .GetDefaultImagePath())), PdfHelper.GetPdfWriter(), new DocumentProperties().SetEventCountingMetaInfo( + new ApiTest.DummyMetaInfo()), PdfHelper.GetRGBPdfOutputIntent())) { + String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding + .UTF8); + NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); + NUnit.Framework.Assert.IsTrue(pdf is PdfADocument); + } } [NUnit.Framework.Test] - public virtual void 
TestTextInfoDeprecationMode() { + public virtual void CreatePdfAFileWithFileTest() { + String output = DESTINATION_FOLDER + "createPdfAFileWithFileTest.pdf"; + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetMetaInfo(new ApiTest.DummyMetaInfo()).SetPdfLang + ("en-US"); + OcrPdfCreator pdfCreator = new OcrPdfCreator(new CustomOcrEngine(), props); + pdfCreator.CreatePdfAFile(JavaCollectionsUtil.SingletonList(new FileInfo(PdfHelper.GetDefaultImagePath + ())), new FileInfo(output), PdfHelper.GetRGBPdfOutputIntent()); + using (PdfDocument pdf = new PdfDocument(new PdfReader(output))) { + String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding + .UTF8); + NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); + PdfAConformanceLevel cl = pdf.GetReader().GetPdfAConformanceLevel(); + NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetConformance(), cl.GetConformance()); + NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetPart(), cl.GetPart()); + } + } + + [NUnit.Framework.Test] + public virtual void CreatePdfAFileWithFileNoMetaTest() { + String output = DESTINATION_FOLDER + "createPdfAFileWithFileNoMetaTest.pdf"; + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetPdfLang("en-US"); + OcrPdfCreator pdfCreator = new OcrPdfCreator(new CustomOcrEngine(), props); + pdfCreator.CreatePdfAFile(JavaCollectionsUtil.SingletonList(new FileInfo(PdfHelper.GetDefaultImagePath + ())), new FileInfo(output), PdfHelper.GetRGBPdfOutputIntent()); + using (PdfDocument pdf = new PdfDocument(new PdfReader(output))) { + String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding + .UTF8); + NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); + PdfAConformanceLevel cl = pdf.GetReader().GetPdfAConformanceLevel(); + 
NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetConformance(), cl.GetConformance()); + NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetPart(), cl.GetPart()); + } + } + + [NUnit.Framework.Test] + public virtual void CreatePdfAFileWithFileProductAwareEngineTest() { + String output = DESTINATION_FOLDER + "createPdfAFileWithFileProductAwareEngineTest.pdf"; + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetPdfLang("en-US"); + CustomProductAwareOcrEngine ocrEngine = new CustomProductAwareOcrEngine(); + OcrPdfCreator pdfCreator = new OcrPdfCreator(ocrEngine, props); + pdfCreator.CreatePdfAFile(JavaCollectionsUtil.SingletonList(new FileInfo(PdfHelper.GetDefaultImagePath + ())), new FileInfo(output), PdfHelper.GetRGBPdfOutputIntent()); + NUnit.Framework.Assert.IsTrue(ocrEngine.IsGetMetaInfoContainerTriggered()); + } + + [NUnit.Framework.Test] + public virtual void TestTextInfo() { String path = PdfHelper.GetDefaultImagePath(); - IDictionary> result = new CustomOcrEngine(true).DoImageOcr(new FileInfo(path)); + IDictionary> result = new CustomOcrEngine().DoImageOcr(new FileInfo(path)); NUnit.Framework.Assert.AreEqual(1, result.Count); TextInfo textInfo = new TextInfo(); textInfo.SetText("text"); - textInfo.SetBbox(JavaUtil.ArraysAsList(204.0f, 158.0f, 742.0f, 294.0f)); + textInfo.SetBboxRect(new Rectangle(204.0f, 158.0f, 538.0f, 136.0f)); int page = 2; result.Put(page, JavaCollectionsUtil.SingletonList(textInfo)); NUnit.Framework.Assert.AreEqual(2, result.Count); NUnit.Framework.Assert.AreEqual(textInfo.GetText(), result.Get(page)[0].GetText()); - NUnit.Framework.Assert.AreEqual(textInfo.GetBbox().Count, result.Get(page)[0].GetBbox().Count); } [LogMessage(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, Count = 7)] @@ -113,18 +198,7 @@ public virtual ImageData ApplyRotation(ImageData imageData) { } } - 
[LogMessage(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, Count = 7)] - [NUnit.Framework.Test] - public virtual void TestThaiImageWithNotDefGlyphsDeprecationMode() { - String testName = "testThaiImageWithNotdefGlyphs"; - String path = PdfHelper.GetThaiImagePath(); - String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf"; - PdfHelper.CreatePdf(pdfPath, new FileInfo(path), new OcrPdfCreatorProperties().SetTextColor(DeviceRgb.BLACK - ), true); - ExtractionStrategy strategy = PdfHelper.GetExtractionStrategy(pdfPath); - PdfFont font = strategy.GetPdfFont(); - String fontName = font.GetFontProgram().GetFontNames().GetFontName(); - NUnit.Framework.Assert.IsTrue(fontName.Contains("LiberationSans")); + private class DummyMetaInfo : IMetaInfo { } } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/OcrPdfCreatorEventHelperTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/OcrPdfCreatorEventHelperTest.cs new file mode 100644 index 0000000..8e4a85c --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/OcrPdfCreatorEventHelperTest.cs @@ -0,0 +1,137 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. 
+ +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using System.Collections.Generic; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Data; +using iText.Commons.Actions.Sequence; +using iText.Commons.Utils; +using iText.Kernel.Actions.Data; +using iText.Pdfocr.Statistics; +using iText.Test; + +namespace iText.Pdfocr { + public class OcrPdfCreatorEventHelperTest : ExtendedITextTest { + private static readonly ProductData DUMMY_PRODUCT_DATA = new ProductData("test-product", "inner_product", + "1.0.0", 1900, 2100); + + private OcrPdfCreatorEventHelperTest.StoreEventsHandler storeEventsHandler; + + [NUnit.Framework.SetUp] + public virtual void Before() { + storeEventsHandler = new OcrPdfCreatorEventHelperTest.StoreEventsHandler(); + EventManager.GetInstance().Register(storeEventsHandler); + } + + [NUnit.Framework.TearDown] + public virtual void After() { + EventManager.GetInstance().Unregister(storeEventsHandler); + storeEventsHandler = null; + } + + [NUnit.Framework.Test] + public virtual void ProductContextBasedEventTest() { + OcrPdfCreatorEventHelper helper = new OcrPdfCreatorEventHelper(new SequenceId(), new OcrPdfCreatorEventHelperTest.DummyMetaInfo + ()); + OcrPdfCreatorEventHelperTest.DummyITextEvent @event = new OcrPdfCreatorEventHelperTest.DummyITextEvent(); + helper.OnEvent(@event); + NUnit.Framework.Assert.AreEqual(1, storeEventsHandler.GetEvents().Count); + NUnit.Framework.Assert.AreEqual(@event, storeEventsHandler.GetEvents()[0]); + } + + [NUnit.Framework.Test] + public virtual void PdfOcrStatisticsEventTest() { + OcrPdfCreatorEventHelper helper = new OcrPdfCreatorEventHelper(new SequenceId(), new OcrPdfCreatorEventHelperTest.DummyMetaInfo + ()); + PdfOcrOutputTypeStatisticsEvent e = new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA + ); + 
helper.OnEvent(e); + NUnit.Framework.Assert.AreEqual(0, storeEventsHandler.GetEvents().Count); + } + + [NUnit.Framework.Test] + public virtual void CustomProductEventTest() { + OcrPdfCreatorEventHelper helper = new OcrPdfCreatorEventHelper(new SequenceId(), new OcrPdfCreatorEventHelperTest.DummyMetaInfo + ()); + AbstractProductITextEvent @event = new OcrPdfCreatorEventHelperTest.CustomProductITextEvent(DUMMY_PRODUCT_DATA + ); + helper.OnEvent(@event); + NUnit.Framework.Assert.AreEqual(1, storeEventsHandler.GetEvents().Count); + NUnit.Framework.Assert.AreEqual(@event, storeEventsHandler.GetEvents()[0]); + } + + [NUnit.Framework.Test] + public virtual void CustomStatisticsEventTest() { + OcrPdfCreatorEventHelper helper = new OcrPdfCreatorEventHelper(new SequenceId(), new OcrPdfCreatorEventHelperTest.DummyMetaInfo + ()); + OcrPdfCreatorEventHelperTest.CustomStatisticsEvent @event = new OcrPdfCreatorEventHelperTest.CustomStatisticsEvent + (DUMMY_PRODUCT_DATA); + helper.OnEvent(@event); + NUnit.Framework.Assert.AreEqual(1, storeEventsHandler.GetEvents().Count); + NUnit.Framework.Assert.AreEqual(@event, storeEventsHandler.GetEvents()[0]); + } + + private class DummyMetaInfo : IMetaInfo { + } + + private class DummyITextEvent : AbstractProductProcessITextEvent { + protected internal DummyITextEvent() + : base(ITextCoreProductData.GetInstance(), null, EventConfirmationType.ON_DEMAND) { + } + + public override String GetEventType() { + return "test-event"; + } + } + + private class CustomProductITextEvent : AbstractProductITextEvent { + protected internal CustomProductITextEvent(ProductData productData) + : base(productData) { + } + } + + private class CustomStatisticsEvent : AbstractStatisticsEvent { + protected internal CustomStatisticsEvent(ProductData productData) + : base(productData) { + } + + public override IList GetStatisticsNames() { + return JavaCollectionsUtil.SingletonList("custom-statistics"); + } + } + + private class StoreEventsHandler : IEventHandler { + 
private IList events = new List(); + + public virtual IList GetEvents() { + return events; + } + + public virtual void OnEvent(IEvent @event) { + events.Add(@event); + } + } + } +} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/OcrProcessContextTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/OcrProcessContextTest.cs new file mode 100644 index 0000000..d118333 --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/OcrProcessContextTest.cs @@ -0,0 +1,51 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Sequence; +using iText.Test; + +namespace iText.Pdfocr { + public class OcrProcessContextTest : ExtendedITextTest { + [NUnit.Framework.Test] + public virtual void SetOcrEventHelperTest() { + AbstractPdfOcrEventHelper eventHelper = new OcrProcessContextTest.CustomEventHelper(); + OcrProcessContext context = new OcrProcessContext(eventHelper); + NUnit.Framework.Assert.AreSame(eventHelper, context.GetOcrEventHelper()); + } + + private class CustomEventHelper : AbstractPdfOcrEventHelper { + public override void OnEvent(AbstractProductITextEvent @event) { + } + + // Do nothing + public override SequenceId GetSequenceId() { + return null; + } + + public override EventConfirmationType GetConfirmationType() { + return EventConfirmationType.ON_DEMAND; + } + } + } +} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs index c10b442..5164c08 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs @@ -22,14 +22,16 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; -using iText.IO.Util; -using iText.Kernel; +using iText.Commons.Utils; using iText.Kernel.Colors; +using iText.Kernel.Exceptions; using iText.Kernel.Font; using iText.Kernel.Pdf; using iText.Layout.Font; -using iText.Pdfa; +using iText.Pdfa.Exceptions; +using iText.Pdfocr.Exceptions; using iText.Pdfocr.Helpers; +using iText.Pdfocr.Logs; using iText.Test; using iText.Test.Attributes; @@ -104,7 +106,7 @@ public virtual void TestPdfCustomMetadata() { pdfDocument.Close(); } - [LogMessage(OcrException.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] + [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] [NUnit.Framework.Test] public virtual void 
TestNonCompliantThaiPdfA() { NUnit.Framework.Assert.That(() => { @@ -117,7 +119,7 @@ public virtual void TestNonCompliantThaiPdfA() { PdfHelper.CreatePdfA(pdfPath, new FileInfo(path), ocrPdfCreatorProperties, PdfHelper.GetRGBPdfOutputIntent ()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(OcrException.CANNOT_CREATE_PDF_DOCUMENT, MessageFormatUtil.Format(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, 3611)))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, MessageFormatUtil.Format(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, 3611)))) ; } @@ -147,7 +149,7 @@ public virtual void TestCompliantThaiPdfA() { NUnit.Framework.Assert.IsTrue(font.IsEmbedded()); } - [LogMessage(OcrException.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] + [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] [NUnit.Framework.Test] public virtual void TestPdfACreateWithoutPdfLangProperty() { NUnit.Framework.Assert.That(() => { @@ -157,7 +159,7 @@ public virtual void TestPdfACreateWithoutPdfLangProperty() { PdfHelper.CreatePdfA(pdfPath, new FileInfo(path), new OcrPdfCreatorProperties(), PdfHelper.GetRGBPdfOutputIntent ()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(OcrException.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET))) ; } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs index 89625d6..c8238ee 100644 --- 
a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs @@ -22,9 +22,11 @@ You should have received a copy of the GNU Affero General Public License */ using System.Collections.Generic; using System.IO; +using iText.Commons.Utils; using iText.IO.Image; -using iText.IO.Util; +using iText.Pdfocr.Exceptions; using iText.Pdfocr.Helpers; +using iText.Pdfocr.Logs; using iText.Test; using iText.Test.Attributes; @@ -70,7 +72,7 @@ public virtual void GetImageDataFromNotExistingImageTest() { NUnit.Framework.Assert.That(() => { PdfCreatorUtil.GetImageData(new FileInfo("no such path"), null); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } @@ -80,7 +82,7 @@ public virtual void GetImageDataFromInvalidImageTest() { NUnit.Framework.Assert.That(() => { PdfCreatorUtil.GetImageData(new FileInfo(PdfHelper.GetImagesTestDirectory() + "corrupted.jpg"), null); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(OcrException.CANNOT_READ_INPUT_IMAGE))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_READ_INPUT_IMAGE))) ; } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs index b71d408..4a29c44 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs @@ -22,12 +22,14 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; +using iText.Commons.Utils; using iText.IO.Font; -using iText.IO.Util; using iText.Kernel.Colors; using iText.Kernel.Font; using iText.Layout.Font; +using iText.Pdfocr.Exceptions; using iText.Pdfocr.Helpers; +using iText.Pdfocr.Logs; using iText.Test; using iText.Test.Attributes; @@ -51,7 +53,7 @@ 
public virtual void TestFontColor() { } [LogMessage(PdfOcrLogMessageConstant.PROVIDED_FONT_PROVIDER_IS_INVALID, Count = 1)] - [LogMessage(OcrException.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] + [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] [NUnit.Framework.Test] public virtual void TestInvalidFontWithInvalidDefaultFontFamily() { NUnit.Framework.Assert.That(() => { @@ -69,7 +71,7 @@ public virtual void TestInvalidFontWithInvalidDefaultFontFamily() { NUnit.Framework.Assert.AreEqual(PdfHelper.DEFAULT_TEXT, result); NUnit.Framework.Assert.AreEqual(ScaleMode.SCALE_TO_FIT, properties.GetScaleMode()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(OcrException.CANNOT_CREATE_PDF_DOCUMENT, OcrException.CANNOT_RESOLVE_PROVIDED_FONTS))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrExceptionMessageConstant.CANNOT_RESOLVE_PROVIDED_FONTS))) ; } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfInputImageTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfInputImageTest.cs index 65458df..1358041 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfInputImageTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfInputImageTest.cs @@ -22,7 +22,9 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; +using iText.Pdfocr.Exceptions; using iText.Pdfocr.Helpers; +using iText.Pdfocr.Logs; using iText.Test; using iText.Test.Attributes; @@ -37,7 +39,7 @@ public virtual void TestCorruptedImage() { NUnit.Framework.Assert.IsNotNull(realOutput); NUnit.Framework.Assert.AreEqual("", realOutput); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } @@ -50,7 +52,7 @@ public virtual void TestCorruptedImageWithoutExtension() { NUnit.Framework.Assert.IsNotNull(realOutput); 
NUnit.Framework.Assert.AreEqual("", realOutput); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } @@ -63,7 +65,7 @@ public virtual void TestInvalidImagePathWithoutDot() { NUnit.Framework.Assert.IsNotNull(realOutput); NUnit.Framework.Assert.AreEqual("", realOutput); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } @@ -76,7 +78,7 @@ public virtual void TestInvalidImagePathWithDot() { NUnit.Framework.Assert.IsNotNull(realOutput); NUnit.Framework.Assert.AreEqual("", realOutput); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } @@ -89,7 +91,7 @@ public virtual void TestValidImageWithoutExtension() { NUnit.Framework.Assert.IsNotNull(realOutput); NUnit.Framework.Assert.AreEqual("", realOutput); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfLayersTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfLayersTest.cs index 19eca01..2ba742b 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfLayersTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfLayersTest.cs @@ -23,7 +23,7 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Kernel.Pdf; using iText.Kernel.Pdf.Layer; using iText.Pdfocr.Helpers; diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfOcrMetaInfoContainerTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfOcrMetaInfoContainerTest.cs new file mode 100644 index 0000000..dafb173 --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfOcrMetaInfoContainerTest.cs @@ -0,0 +1,38 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. 
+ +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Commons.Actions.Contexts; +using iText.Test; + +namespace iText.Pdfocr { + public class PdfOcrMetaInfoContainerTest : ExtendedITextTest { + [NUnit.Framework.Test] + public virtual void Test() { + PdfOcrMetaInfoContainerTest.DummyMetaInfo mi = new PdfOcrMetaInfoContainerTest.DummyMetaInfo(); + PdfOcrMetaInfoContainer instance = new PdfOcrMetaInfoContainer(mi); + NUnit.Framework.Assert.AreSame(mi, instance.GetMetaInfo()); + } + + private class DummyMetaInfo : IMetaInfo { + } + } +} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ScaleModeTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ScaleModeTest.cs index 3826278..9a018d5 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ScaleModeTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ScaleModeTest.cs @@ -22,8 +22,8 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; +using iText.Commons.Utils; using iText.IO.Image; -using iText.IO.Util; using iText.Kernel.Geom; using iText.Kernel.Pdf; using iText.Pdfocr.Helpers; diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/events/EventCountingTest.cs 
b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/events/EventCountingTest.cs deleted file mode 100644 index c7fd4cd..0000000 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/events/EventCountingTest.cs +++ /dev/null @@ -1,97 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . 
-*/ -using System; -using System.IO; -using iText.IO.Util; -using iText.Kernel.Pdf; -using iText.Metainfo; -using iText.Pdfocr; -using iText.Pdfocr.Helpers; -using iText.Test; - -namespace iText.Pdfocr.Events { - public class EventCountingTest : ExtendedITextTest { - protected internal static readonly String PROFILE_FOLDER = iText.Test.TestUtil.GetParentProjectDirectory(NUnit.Framework.TestContext - .CurrentContext.TestDirectory) + "/resources/itext/pdfocr/profiles/"; - - protected internal static readonly String SOURCE_FOLDER = iText.Test.TestUtil.GetParentProjectDirectory(NUnit.Framework.TestContext - .CurrentContext.TestDirectory) + "/resources/itext/pdfocr/events/"; - - private IOcrEngine tesseractReader; - - public EventCountingTest() { - tesseractReader = new CustomOcrEngine(); - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingPdfEvent() { - ((CustomOcrEngine)tesseractReader).SetThreadLocalMetaInfo(new TestMetaInfo()); - DoImageToPdfOcr(tesseractReader, GetTestImageFile()); - NUnit.Framework.Assert.IsTrue(((CustomOcrEngine)tesseractReader).GetThreadLocalMetaInfo() is TestMetaInfo); - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingPdfAEvent() { - ((CustomOcrEngine)tesseractReader).SetThreadLocalMetaInfo(new TestMetaInfo()); - DoImageToPdfAOcr(tesseractReader, GetTestImageFile()); - NUnit.Framework.Assert.IsTrue(((CustomOcrEngine)tesseractReader).GetThreadLocalMetaInfo() is TestMetaInfo); - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingImageEvent() { - ((CustomOcrEngine)tesseractReader).SetThreadLocalMetaInfo(new TestMetaInfo()); - DoImageOcr(tesseractReader, GetTestImageFile()); - NUnit.Framework.Assert.IsTrue(((CustomOcrEngine)tesseractReader).GetThreadLocalMetaInfo() is TestMetaInfo); - } - - private static void DoImageOcr(IOcrEngine tesseractReader, FileInfo imageFile) { - tesseractReader.DoImageOcr(imageFile); - } - - private static void DoImageToPdfOcr(IOcrEngine tesseractReader, FileInfo 
imageFile) { - OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); - ocrPdfCreator.CreatePdf(JavaUtil.ArraysAsList(imageFile), new PdfWriter(new MemoryStream())); - } - - private static void DoImageToPdfAOcr(IOcrEngine tesseractReader, FileInfo imageFile) { - OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, new OcrPdfCreatorProperties().SetPdfLang( - "en-US")); - Stream @is = null; - try { - @is = new FileStream(PROFILE_FOLDER + "sRGB_CS_profile.icm", FileMode.Open, FileAccess.Read); - } - catch (FileNotFoundException) { - } - // No expected - PdfOutputIntent outputIntent = new PdfOutputIntent("Custom", "", "http://www.color.org", "sRGB IEC61966-2.1" - , @is); - ocrPdfCreator.CreatePdfA(JavaUtil.ArraysAsList(imageFile), new PdfWriter(new MemoryStream()), outputIntent - ); - } - - private static FileInfo GetTestImageFile() { - String imgPath = SOURCE_FOLDER + "numbers_01.jpg"; - return new FileInfo(imgPath); - } - } -} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/exceptions/PdfOcrExceptionTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/exceptions/PdfOcrExceptionTest.cs new file mode 100644 index 0000000..4c9aea8 --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/exceptions/PdfOcrExceptionTest.cs @@ -0,0 +1,59 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using iText.Test; + +namespace iText.Pdfocr.Exceptions { + public class PdfOcrExceptionTest : ExtendedITextTest { + [NUnit.Framework.Test] + public virtual void OcrExceptionThrowableConstructorTest() { + Exception cause = new System.IO.IOException(); + PdfOcrException exception = new PdfOcrException(cause); + NUnit.Framework.Assert.AreEqual(cause, exception.InnerException); + } + + [NUnit.Framework.Test] + public virtual void OcrInputExceptionThrowableConstructorTest() { + Exception cause = new System.IO.IOException(); + PdfOcrException exception = new PdfOcrInputException(cause); + NUnit.Framework.Assert.AreEqual(cause, exception.InnerException); + } + + [NUnit.Framework.Test] + public virtual void OcrInputExceptionStringConstructorTest() { + String message = "test message"; + PdfOcrException exception = new PdfOcrInputException(message); + NUnit.Framework.Assert.AreEqual(message, exception.Message); + } + + [NUnit.Framework.Test] + public virtual void OcrExceptiongetMessageParamsTest() { + String message = "test message {0}"; + String param = "param"; + String expectedMessage = "test message param"; + PdfOcrException exception = new PdfOcrInputException(message); + exception.SetMessageParams(param); + NUnit.Framework.Assert.AreEqual(expectedMessage, exception.Message); + } + } +} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs index 470d03e..cf157c1 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs +++ 
b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs @@ -23,26 +23,15 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using iText.IO.Util; -using iText.Kernel.Counter.Event; +using iText.Commons.Utils; using iText.Kernel.Geom; using iText.Pdfocr; -using iText.Pdfocr.Events; namespace iText.Pdfocr.Helpers { - public class CustomOcrEngine : IOcrEngine, IThreadLocalMetaInfoAware { + public class CustomOcrEngine : IOcrEngine { private OcrEngineProperties ocrEngineProperties; - private IMetaInfo threadLocalMetaInfo; - - private bool textInfoDeprecationMode = false; - - public CustomOcrEngine() - : this(false) { - } - - public CustomOcrEngine(bool textInfoDeprecationMode) { - this.textInfoDeprecationMode = textInfoDeprecationMode; + public CustomOcrEngine() { } public CustomOcrEngine(OcrEngineProperties ocrEngineProperties) { @@ -55,22 +44,21 @@ public virtual IDictionary> DoImageOcr(FileInfo input) { if (input.FullName.Contains(PdfHelper.THAI_IMAGE_NAME)) { text = PdfHelper.THAI_TEXT; } - TextInfo textInfo = this.textInfoDeprecationMode ? 
new TextInfo(text, JavaUtil.ArraysAsList(204.0f, 158.0f - , 742.0f, 294.0f)) : new TextInfo(text, new Rectangle(204.0f, 158.0f, 538.0f, 136.0f)); + TextInfo textInfo = new TextInfo(text, new Rectangle(204.0f, 158.0f, 538.0f, 136.0f)); result.Put(1, JavaCollectionsUtil.SingletonList(textInfo)); return result; } - public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile) { + public virtual IDictionary> DoImageOcr(FileInfo input, OcrProcessContext ocrProcessContext + ) { + return DoImageOcr(input); } - public virtual IMetaInfo GetThreadLocalMetaInfo() { - return threadLocalMetaInfo; + public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile) { } - public virtual IThreadLocalMetaInfoAware SetThreadLocalMetaInfo(IMetaInfo metaInfo) { - this.threadLocalMetaInfo = metaInfo; - return this; + public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile, OcrProcessContext ocrProcessContext + ) { } public virtual OcrEngineProperties GetOcrEngineProperties() { diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs new file mode 100644 index 0000000..186268a --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs @@ -0,0 +1,73 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System.Collections.Generic; +using System.IO; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Data; +using iText.Commons.Utils; +using iText.Pdfocr; + +namespace iText.Pdfocr.Helpers { + public class CustomProductAwareOcrEngine : IOcrEngine, IProductAware { + private bool getMetaInfoContainerTriggered = false; + + public CustomProductAwareOcrEngine() { + } + + public virtual IDictionary> DoImageOcr(FileInfo input) { + return JavaCollectionsUtil.EmptyMap>(); + } + + public virtual IDictionary> DoImageOcr(FileInfo input, OcrProcessContext ocrProcessContext + ) { + return DoImageOcr(input); + } + + public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile) { + } + + public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile, OcrProcessContext ocrProcessContext + ) { + } + + public virtual OcrEngineProperties GetOcrEngineProperties() { + return null; + } + + public virtual PdfOcrMetaInfoContainer GetMetaInfoContainer() { + getMetaInfoContainerTriggered = true; + return new PdfOcrMetaInfoContainer(new CustomProductAwareOcrEngine.DummyMetaInfo()); + } + + public virtual ProductData GetProductData() { + return null; + } + + public virtual bool IsGetMetaInfoContainerTriggered() { + return getMetaInfoContainerTriggered; + } + + private class DummyMetaInfo : IMetaInfo { + } + } +} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs index 5a84f75..e50ef15 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs +++ 
b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs @@ -22,8 +22,9 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; -using Common.Logging; -using iText.IO.Util; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.Kernel.Pdf; using iText.Kernel.Pdf.Canvas.Parser; using iText.Pdfocr; @@ -46,7 +47,7 @@ public class PdfHelper { public static readonly String TARGET_DIRECTORY = NUnit.Framework.TestContext.CurrentContext.TestDirectory + "/test/resources/itext/pdfocr/"; - private static readonly ILog LOGGER = LogManager.GetLogger(typeof(PdfHelper)); + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(PdfHelper)); /// Returns images test directory. public static String GetImagesTestDirectory() { @@ -123,23 +124,14 @@ public static String GetTextFromPdfLayerUseActualText(String pdfPath, String lay /// of properties and save to the given path. /// public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties) { - CreatePdf(pdfPath, inputFile, properties, false); - } - - /// - /// Perform OCR with custom ocr engine using provided input image and set - /// of properties and save to the given path. 
- /// - public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties, bool - textInfoDeprecationMode) { - OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(textInfoDeprecationMode), properties); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties); try { using (PdfWriter pdfWriter = GetPdfWriter(pdfPath)) { ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList(inputFile), pdfWriter).Close(); } } catch (System.IO.IOException e) { - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } } @@ -157,7 +149,7 @@ public static void CreatePdfA(String pdfPath, FileInfo inputFile, OcrPdfCreatorP } } catch (System.IO.IOException e) { - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } } @@ -171,7 +163,7 @@ public static String GetTextFromPdf(FileInfo file, String testName) { result = GetTextFromPdfLayer(pdfPath, "Text Layer"); } catch (System.IO.IOException e) { - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } return result; } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregatorTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregatorTest.cs new file mode 100644 index 0000000..4a24516 --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregatorTest.cs @@ -0,0 +1,109 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using System.Collections.Generic; +using iText.Commons.Actions; +using iText.Commons.Actions.Data; +using iText.Test; + +namespace iText.Pdfocr.Statistics { + public class PdfOcrOutputTypeStatisticsAggregatorTest : ExtendedITextTest { + private static readonly ProductData DUMMY_PRODUCT_DATA = new ProductData("test-product", "inner_product", + "1.0.0", 1900, 2100); + + [NUnit.Framework.Test] + public virtual void AggregateEventTest() { + PdfOcrOutputTypeStatisticsAggregator aggregator = new PdfOcrOutputTypeStatisticsAggregator(); + aggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA)); + aggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDFA, DUMMY_PRODUCT_DATA)); + aggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.DATA, DUMMY_PRODUCT_DATA)); + aggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDFA, DUMMY_PRODUCT_DATA)); + aggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA)); + IDictionary aggregation = (IDictionary)aggregator.RetrieveAggregation(); + NUnit.Framework.Assert.AreEqual(3, aggregation.Count); + long? 
numberOfOcrProcessesWithGivenOutput = aggregation.Get("data"); + NUnit.Framework.Assert.AreEqual(1L, numberOfOcrProcessesWithGivenOutput); + numberOfOcrProcessesWithGivenOutput = aggregation.Get("pdf"); + NUnit.Framework.Assert.AreEqual(2L, numberOfOcrProcessesWithGivenOutput); + numberOfOcrProcessesWithGivenOutput = aggregation.Get("pdfa"); + NUnit.Framework.Assert.AreEqual(2L, numberOfOcrProcessesWithGivenOutput); + } + + [NUnit.Framework.Test] + public virtual void MergeTest() { + PdfOcrOutputTypeStatisticsAggregator firstAggregator = new PdfOcrOutputTypeStatisticsAggregator(); + PdfOcrOutputTypeStatisticsAggregator secondAggregator = new PdfOcrOutputTypeStatisticsAggregator(); + firstAggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA)); + firstAggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDFA, DUMMY_PRODUCT_DATA)); + secondAggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.DATA, DUMMY_PRODUCT_DATA)); + secondAggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDFA, DUMMY_PRODUCT_DATA)); + secondAggregator.Aggregate(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA)); + firstAggregator.Merge(secondAggregator); + IDictionary aggregation = (IDictionary)firstAggregator.RetrieveAggregation(); + NUnit.Framework.Assert.AreEqual(3, aggregation.Count); + long? 
numberOfOcrProcessesWithGivenOutput = aggregation.Get("data"); + NUnit.Framework.Assert.AreEqual(1L, numberOfOcrProcessesWithGivenOutput); + numberOfOcrProcessesWithGivenOutput = aggregation.Get("pdf"); + NUnit.Framework.Assert.AreEqual(2L, numberOfOcrProcessesWithGivenOutput); + numberOfOcrProcessesWithGivenOutput = aggregation.Get("pdfa"); + NUnit.Framework.Assert.AreEqual(2L, numberOfOcrProcessesWithGivenOutput); + } + + [NUnit.Framework.Test] + public virtual void AggregateInvalidEventTest() { + PdfOcrOutputTypeStatisticsAggregator aggregator = new PdfOcrOutputTypeStatisticsAggregator(); + aggregator.Aggregate(new PdfOcrOutputTypeStatisticsAggregatorTest.DummyAbstractStatisticsEvent(DUMMY_PRODUCT_DATA + )); + NUnit.Framework.Assert.IsTrue(((IDictionary)aggregator.RetrieveAggregation()).IsEmpty()); + } + + [NUnit.Framework.Test] + public virtual void MergeInvalidAggregatorTest() { + PdfOcrOutputTypeStatisticsAggregator aggregator = new PdfOcrOutputTypeStatisticsAggregator(); + aggregator.Merge(new PdfOcrOutputTypeStatisticsAggregatorTest.DummyAbstractStatisticsAggregator()); + NUnit.Framework.Assert.IsTrue(((IDictionary)aggregator.RetrieveAggregation()).IsEmpty()); + } + + private class DummyAbstractStatisticsEvent : AbstractStatisticsEvent { + protected internal DummyAbstractStatisticsEvent(ProductData productData) + : base(productData) { + } + + public override IList GetStatisticsNames() { + return null; + } + } + + private class DummyAbstractStatisticsAggregator : AbstractStatisticsAggregator { + public override void Aggregate(AbstractStatisticsEvent @event) { + } + + public override Object RetrieveAggregation() { + return null; + } + + public override void Merge(AbstractStatisticsAggregator aggregator) { + } + } + } +} diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsEventTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsEventTest.cs new file mode 100644 index 
0000000..48c3889 --- /dev/null +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsEventTest.cs @@ -0,0 +1,52 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/ +using iText.Commons.Actions.Data; +using iText.Commons.Logs; +using iText.Commons.Utils; +using iText.Test; +using iText.Test.Attributes; + +namespace iText.Pdfocr.Statistics { + public class PdfOcrOutputTypeStatisticsEventTest : ExtendedITextTest { + private static readonly ProductData DUMMY_PRODUCT_DATA = new ProductData("test-product", "inner_product", + "1.0.0", 1900, 2100); + + [NUnit.Framework.Test] + public virtual void DefaultEventTest() { + PdfOcrOutputTypeStatisticsEvent @event = new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA + ); + NUnit.Framework.Assert.AreEqual(PdfOcrOutputType.PDF, @event.GetPdfOcrStatisticsEventType()); + NUnit.Framework.Assert.AreEqual(JavaCollectionsUtil.SingletonList("ocrOutput"), @event.GetStatisticsNames( + )); + NUnit.Framework.Assert.AreEqual(typeof(PdfOcrOutputTypeStatisticsAggregator), @event.CreateStatisticsAggregatorFromName + ("ocrOutput").GetType()); + } + + [NUnit.Framework.Test] + [LogMessage(CommonsLogMessageConstant.INVALID_STATISTICS_NAME)] + public virtual void InvalidAggregatorNameTest() { + NUnit.Framework.Assert.IsNull(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, DUMMY_PRODUCT_DATA + ).CreateStatisticsAggregatorFromName("dummy name")); + } + } +} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs b/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs index 930819f..3613622 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs @@ -15,6 +15,6 @@ [assembly: Guid("d6a6ea97-1f23-448f-b700-eff62971d234")] -[assembly: AssemblyVersion("1.0.3.0")] -[assembly: AssemblyFileVersion("1.0.3.0")] -[assembly: AssemblyInformationalVersion("1.0.3")] +[assembly: AssemblyVersion("2.0.0.0")] +[assembly: AssemblyFileVersion("2.0.0.0")] +[assembly: AssemblyInformationalVersion("2.0.0")] diff --git 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj b/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj index 54e5bf9..fabc6dc 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj @@ -9,7 +9,7 @@ library - net45 + net461 true @@ -26,7 +26,7 @@ - + diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationEventHandlingTestHelper.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationEventHandlingTestHelper.cs new file mode 100644 index 0000000..4b7e508 --- /dev/null +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationEventHandlingTestHelper.cs @@ -0,0 +1,186 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using System; +using System.Collections.Generic; +using System.IO; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Processors; +using iText.Commons.Actions.Producer; +using iText.Commons.Actions.Sequence; +using iText.Commons.Utils; +using iText.Kernel.Actions.Events; +using iText.Kernel.Pdf; +using iText.Pdfocr.Statistics; +using iText.Pdfocr.Tesseract4; +using iText.Pdfocr.Tesseract4.Actions.Data; +using iText.Pdfocr.Tesseract4.Actions.Events; + +namespace iText.Pdfocr { + public abstract class IntegrationEventHandlingTestHelper : IntegrationTestHelper { + protected internal readonly AbstractTesseract4OcrEngine tesseractReader; + + protected internal IntegrationEventHandlingTestHelper.StoreEventsHandler eventsHandler; + + public IntegrationEventHandlingTestHelper(IntegrationTestHelper.ReaderType type) { + tesseractReader = GetTesseractReader(type); + } + + [NUnit.Framework.SetUp] + public virtual void Before() { + // init ocr engine + Tesseract4OcrEngineProperties ocrEngineProperties = new Tesseract4OcrEngineProperties(); + ocrEngineProperties.SetPathToTessData(GetTessDataDirectory()); + tesseractReader.SetTesseract4OcrEngineProperties(ocrEngineProperties); + // register event handler + eventsHandler = new IntegrationEventHandlingTestHelper.StoreEventsHandler(); + EventManager.GetInstance().Register(eventsHandler); + } + + [NUnit.Framework.TearDown] + public virtual void After() { + EventManager.GetInstance().Unregister(eventsHandler); + eventsHandler = null; + } + + protected internal static void ValidateUsageEvent(IEvent @event, EventConfirmationType expectedConfirmationType + ) { + NUnit.Framework.Assert.IsTrue(@event is PdfOcrTesseract4ProductEvent); + NUnit.Framework.Assert.AreEqual("process-image", ((PdfOcrTesseract4ProductEvent)@event).GetEventType()); + NUnit.Framework.Assert.AreEqual(expectedConfirmationType, 
((PdfOcrTesseract4ProductEvent)@event).GetConfirmationType + ()); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ProductData.GetInstance(), ((PdfOcrTesseract4ProductEvent) + @event).GetProductData()); + } + + protected internal static void ValidateStatisticEvent(IEvent @event, PdfOcrOutputType outputType) { + NUnit.Framework.Assert.IsTrue(@event is PdfOcrOutputTypeStatisticsEvent); + NUnit.Framework.Assert.AreEqual(outputType, ((PdfOcrOutputTypeStatisticsEvent)@event).GetPdfOcrStatisticsEventType + ()); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ProductData.GetInstance(), ((PdfOcrOutputTypeStatisticsEvent + )@event).GetProductData()); + } + + protected internal static void ValidateConfirmEvent(IEvent @event, IEvent expectedConfirmedEvent) { + NUnit.Framework.Assert.IsTrue(@event is ConfirmEvent); + NUnit.Framework.Assert.AreSame(expectedConfirmedEvent, ((ConfirmEvent)@event).GetConfirmedEvent()); + } + + // we expect core events in case of API methods returning PdfDocument + protected internal static void ValidateCoreConfirmEvent(IEvent @event) { + NUnit.Framework.Assert.IsTrue(@event is ConfirmEvent); + NUnit.Framework.Assert.AreEqual(GetCoreEvent().GetEvent().GetEventType(), ((ConfirmEvent)@event).GetConfirmedEvent + ().GetEventType()); + NUnit.Framework.Assert.AreEqual(GetCoreEvent().GetEvent().GetConfirmationType(), ((ConfirmEvent)@event).GetConfirmedEvent + ().GetConfirmationType()); + } + + protected internal virtual void ValidatePdfProducerLine(String filePath, String expected) { + using (PdfDocument pdfDocument = new PdfDocument(new PdfReader(filePath))) { + NUnit.Framework.Assert.AreEqual(expected, pdfDocument.GetDocumentInfo().GetProducer()); + } + } + + protected internal static String CreateExpectedProducerLine(ConfirmedEventWrapper[] expectedEvents) { + IList listEvents = JavaUtil.ArraysAsList(expectedEvents); + return ProducerBuilder.ModifyProducer(listEvents, null); + } + + protected internal static ConfirmedEventWrapper GetPdfOcrEvent() { 
+ DefaultITextProductEventProcessor processor = new DefaultITextProductEventProcessor(ProductNameConstant.PDF_HTML + ); + return new ConfirmedEventWrapper(PdfOcrTesseract4ProductEvent.CreateProcessImageEvent(new SequenceId(), null + , EventConfirmationType.ON_CLOSE), processor.GetUsageType(), processor.GetProducer()); + } + + protected internal static ConfirmedEventWrapper GetCoreEvent() { + DefaultITextProductEventProcessor processor = new DefaultITextProductEventProcessor(ProductNameConstant.ITEXT_CORE + ); + return new ConfirmedEventWrapper(ITextCoreProductEvent.CreateProcessPdfEvent(new SequenceId(), null, EventConfirmationType + .ON_CLOSE), processor.GetUsageType(), processor.GetProducer()); + } + + protected internal static PdfOutputIntent GetRGBPdfOutputIntent() { + String defaultRGBColorProfilePath = TEST_DIRECTORY + "profiles" + "/sRGB_CS_profile.icm"; + Stream @is = new FileStream(defaultRGBColorProfilePath, FileMode.Open, FileAccess.Read); + return new PdfOutputIntent("", "", "", "sRGB IEC61966-2.1", @is); + } + + /// + /// Creates PDF document with + /// + /// and set event counting meta info. + /// + /// + /// engine to set in the + /// + /// + /// out pdf file + /// image file + /// meta info + protected internal virtual void CreatePdfAndSetEventCountingMetaInfo(IOcrEngine engine, FileInfo outPdfFile + , FileInfo imgFile, IMetaInfo metaInfo) { + using (PdfWriter pdfWriter = new PdfWriter(outPdfFile)) { + PdfDocument pdfDocument = new OcrPdfCreator(engine).CreatePdf(JavaCollectionsUtil.SingletonList(imgFile), + pdfWriter, new DocumentProperties().SetEventCountingMetaInfo(metaInfo)); + pdfDocument.Close(); + } + } + + /// + /// Creates PDF document with + /// + /// and set meta info to + /// . 
+ /// + /// + /// engine to set in the + /// + /// + /// out pdf file + /// image file + /// meta info + protected internal virtual void CreatePdfFileAndSetMetaInfoToProps(IOcrEngine engine, FileInfo outPdfFile, + FileInfo imgFile, IMetaInfo metaInfo) { + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties().SetMetaInfo(metaInfo); + new OcrPdfCreator(engine, properties).CreatePdfFile(JavaCollectionsUtil.SingletonList(imgFile), outPdfFile + ); + } + + protected internal class StoreEventsHandler : IEventHandler { + private readonly IList events = new List(); + + public virtual IList GetEvents() { + return events; + } + + public virtual void OnEvent(IEvent @event) { + if (@event is PdfOcrTesseract4ProductEvent || @event is PdfOcrOutputTypeStatisticsEvent || @event is ConfirmEvent + ) { + events.Add(@event); + } + } + } + } +} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs index 7b6cf0f..a01ec44 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs @@ -23,9 +23,10 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using Common.Logging; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.IO.Font; -using iText.IO.Util; using iText.Kernel.Colors; using iText.Kernel.Font; using iText.Kernel.Geom; @@ -36,11 +37,13 @@ You should have received a copy of the GNU Affero General Public License using iText.Kernel.Pdf.Canvas.Parser.Listener; using iText.Layout.Font; using iText.Pdfocr.Tesseract4; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test; namespace iText.Pdfocr { public class IntegrationTestHelper : ExtendedITextTest { - private static readonly ILog LOGGER = 
LogManager.GetLogger(typeof(iText.Pdfocr.IntegrationTestHelper)); + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.IntegrationTestHelper + )); // directory with test files public static readonly String TEST_DIRECTORY = iText.Test.TestUtil.GetParentProjectDirectory(NUnit.Framework.TestContext @@ -164,7 +167,7 @@ protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tes result = GetTextFromPdfLayer(pdfPath, null, page); } catch (System.IO.IOException e) { - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } return result; } @@ -203,8 +206,7 @@ protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tes /// Get text from layer specified by name from page. protected internal virtual String GetTextFromPdfLayer(String pdfPath, String layerName, int page, bool useActualText ) { - PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath), new DocumentProperties().SetEventCountingMetaInfo - (new PdfOcrMetaInfo())); + PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); IntegrationTestHelper.ExtractionStrategy textExtractionStrategy = new IntegrationTestHelper.ExtractionStrategy (layerName); textExtractionStrategy.SetUseActualText(useActualText); @@ -243,7 +245,7 @@ protected internal virtual String GetRecognizedTextFromTextFile(AbstractTesserac result = GetTextFromTextFile(new FileInfo(txtPath)); } catch (Exception e) { - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } return result; } @@ -338,7 +340,7 @@ protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngin } } catch (System.IO.IOException e) { - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } } @@ -383,11 +385,11 @@ protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngin protected internal virtual String GetTextFromTextFile(FileInfo file) { String content = null; try { - content = 
iText.IO.Util.JavaUtil.GetStringForBytes(File.ReadAllBytes(file.FullName), System.Text.Encoding. - UTF8); + content = iText.Commons.Utils.JavaUtil.GetStringForBytes(File.ReadAllBytes(file.FullName), System.Text.Encoding + .UTF8); } catch (System.IO.IOException e) { - LOGGER.Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_FILE, file.FullName, e.Message + LOGGER.LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_FILE, file.FullName, e.Message )); } return content; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs index 4ad2474..a4f6ac0 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs @@ -22,11 +22,13 @@ You should have received a copy of the GNU Affero General Public License */ using System.IO; using iText.Pdfocr.Tesseract4; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; namespace iText.Pdfocr { public class TesseractExecutableIntegrationTest : IntegrationTestHelper { - [LogMessage(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, Count = 1)] [NUnit.Framework.Test] public virtual void TestNullPathToTesseractExecutable() { NUnit.Framework.Assert.That(() => { @@ -36,23 +38,23 @@ public virtual void TestNullPathToTesseractExecutable() { tesseractExecutableReader.SetPathToExecutable(null); GetTextFromPdf(tesseractExecutableReader, file); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE)) + , 
NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE)) ; } - [LogMessage(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, Count = 1)] [NUnit.Framework.Test] public virtual void TestEmptyPathToTesseractExecutable() { NUnit.Framework.Assert.That(() => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); GetTextFromPdf(new Tesseract4ExecutableOcrEngine("", new Tesseract4OcrEngineProperties()), file); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE)) ; } [LogMessage(Tesseract4LogMessageConstant.COMMAND_FAILED, Count = 1)] - [LogMessage(Tesseract4OcrException.TESSERACT_NOT_FOUND, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND, Count = 1)] [NUnit.Framework.Test] public virtual void TestIncorrectPathToTesseractExecutable() { NUnit.Framework.Assert.That(() => { @@ -60,7 +62,7 @@ public virtual void TestIncorrectPathToTesseractExecutable() { GetTextFromPdf(new Tesseract4ExecutableOcrEngine("path\\to\\executable\\", new Tesseract4OcrEngineProperties ()), file); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.TESSERACT_NOT_FOUND)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND)) ; } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/MultiThreadingExecutableTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingExecutableTest.cs similarity index 85% rename from 
itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/MultiThreadingExecutableTest.cs rename to itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingExecutableTest.cs index b4d66fd..9eef692 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/MultiThreadingExecutableTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingExecutableTest.cs @@ -22,9 +22,9 @@ You should have received a copy of the GNU Affero General Public License */ using iText.Pdfocr; -namespace iText.Pdfocr.Events.Multithreading { - public class MultiThreadingExecutableTest : MultiThreadingTest { - public MultiThreadingExecutableTest() +namespace iText.Pdfocr.Actions { + public class Tesseract4EventHandlingExecutableTest : Tesseract4EventHandlingTest { + public Tesseract4EventHandlingExecutableTest() : base(IntegrationTestHelper.ReaderType.EXECUTABLE) { } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/IMetaInfoWrapper.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingLibTest.cs similarity index 74% rename from itext/itext.pdfocr.api/itext/pdfocr/IMetaInfoWrapper.cs rename to itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingLibTest.cs index b432cc7..7945112 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/IMetaInfoWrapper.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingLibTest.cs @@ -20,13 +20,12 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ -using iText.Kernel.Counter.Event; +using iText.Pdfocr; -namespace iText.Pdfocr { - /// The meta info wrapper that holds some meta info - public interface IMetaInfoWrapper { - /// Gets the wrapped meta info - /// the wrapped meta info - IMetaInfo GetWrappedMetaInfo(); +namespace iText.Pdfocr.Actions { + public class Tesseract4EventHandlingLibTest : Tesseract4EventHandlingTest { + public Tesseract4EventHandlingLibTest() + : base(IntegrationTestHelper.ReaderType.LIB) { + } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingTest.cs new file mode 100644 index 0000000..59aeb95 --- /dev/null +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/Tesseract4EventHandlingTest.cs @@ -0,0 +1,401 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using System; +using System.Collections.Generic; +using System.IO; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Sequence; +using iText.Commons.Utils; +using iText.Kernel.Pdf; +using iText.Pdfocr; +using iText.Pdfocr.Exceptions; +using iText.Pdfocr.Statistics; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; +using iText.Test.Attributes; + +namespace iText.Pdfocr.Actions { + public abstract class Tesseract4EventHandlingTest : IntegrationEventHandlingTestHelper { + public Tesseract4EventHandlingTest(IntegrationTestHelper.ReaderType type) + : base(type) { + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + new OcrPdfCreator(tesseractReader).CreatePdfFile(JavaCollectionsUtil.SingletonList(imgFile), outPdfFile); + // check ocr events + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], ocrUsageEvent); + // check producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetPdfOcrEvent() }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] + public virtual void OcrPdfCreatorCreatePdfFileNoImageTest() { + FileInfo imgFile = new FileInfo("unknown"); + IList images = JavaCollectionsUtil.SingletonList(imgFile); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + OcrPdfCreator ocrPdfCreator = new 
OcrPdfCreator(tesseractReader); + NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => ocrPdfCreator.CreatePdfFile(images, outPdfFile + )); + // check ocr events + NUnit.Framework.Assert.AreEqual(0, eventsHandler.GetEvents().Count); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfFileNoOutputFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + IList images = JavaCollectionsUtil.SingletonList(imgFile); + FileInfo outPdfFile = new FileInfo("no/no_file"); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + NUnit.Framework.Assert.Catch(typeof(System.IO.IOException), () => ocrPdfCreator.CreatePdfFile(images, outPdfFile + )); + NUnit.Framework.Assert.AreEqual(0, eventsHandler.GetEvents().Count); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfFileNullOutputFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + IList images = JavaCollectionsUtil.SingletonList(imgFile); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + NUnit.Framework.Assert.Catch(typeof(NullReferenceException), () => ocrPdfCreator.CreatePdfFile(images, null + )); + NUnit.Framework.Assert.AreEqual(0, eventsHandler.GetEvents().Count); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfFileTwoImagesTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + new OcrPdfCreator(tesseractReader).CreatePdfFile(JavaUtil.ArraysAsList(imgFile, imgFile), outPdfFile); + // check ocr events + NUnit.Framework.Assert.AreEqual(5, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent1 = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent1, EventConfirmationType.ON_CLOSE); + IEvent ocrUsageEvent2 = eventsHandler.GetEvents()[1]; + ValidateUsageEvent(ocrUsageEvent2, EventConfirmationType.ON_CLOSE); + 
ValidateStatisticEvent(eventsHandler.GetEvents()[2], PdfOcrOutputType.PDF); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], ocrUsageEvent1); + ValidateConfirmEvent(eventsHandler.GetEvents()[4], ocrUsageEvent2); + // check producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetPdfOcrEvent() }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfFileTwoRunningsTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + new OcrPdfCreator(tesseractReader).CreatePdfFile(JavaCollectionsUtil.SingletonList(imgFile), outPdfFile); + new OcrPdfCreator(tesseractReader).CreatePdfFile(JavaCollectionsUtil.SingletonList(imgFile), outPdfFile); + NUnit.Framework.Assert.AreEqual(6, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], ocrUsageEvent); + // usage event + ocrUsageEvent = eventsHandler.GetEvents()[3]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[4], PdfOcrOutputType.PDF); + ValidateConfirmEvent(eventsHandler.GetEvents()[5], ocrUsageEvent); + // check producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetPdfOcrEvent() }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + PdfWriter pdfWriter = new 
PdfWriter(outPdfFile); + PdfDocument pdfDocument = new OcrPdfCreator(tesseractReader).CreatePdf(JavaCollectionsUtil.SingletonList(imgFile + ), pdfWriter); + pdfDocument.Close(); + NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateCoreConfirmEvent(eventsHandler.GetEvents()[2]); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], ocrUsageEvent); + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetCoreEvent(), GetPdfOcrEvent + () }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] + public virtual void OcrPdfCreatorCreatePdfNoImageTest() { + IList images = JavaCollectionsUtil.SingletonList(new FileInfo("no_image")); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + PdfWriter pdfWriter = new PdfWriter(outPdfFile); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => ocrPdfCreator.CreatePdf(images, pdfWriter + )); + pdfWriter.Dispose(); + NUnit.Framework.Assert.AreEqual(0, eventsHandler.GetEvents().Count); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfNullWriterTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + IList images = JavaCollectionsUtil.SingletonList(imgFile); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + NUnit.Framework.Assert.Catch(typeof(ArgumentException), () => ocrPdfCreator.CreatePdf(images, null)); + NUnit.Framework.Assert.AreEqual(1, eventsHandler.GetEvents().Count); + ValidateUsageEvent(eventsHandler.GetEvents()[0], EventConfirmationType.ON_CLOSE); + } + + [NUnit.Framework.Test] + 
public virtual void OcrPdfCreatorCreatePdfAFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetPdfLang("en-US"); + new OcrPdfCreator(tesseractReader, props).CreatePdfAFile(JavaCollectionsUtil.SingletonList(imgFile), outPdfFile + , GetRGBPdfOutputIntent()); + // check ocr events + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDFA); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], ocrUsageEvent); + // check producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetPdfOcrEvent() }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void OcrPdfCreatorCreatePdfATest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + PdfWriter pdfWriter = new PdfWriter(outPdfFile); + OcrPdfCreatorProperties props = new OcrPdfCreatorProperties().SetPdfLang("en-US"); + PdfDocument pdfDocument = new OcrPdfCreator(tesseractReader, props).CreatePdfA(JavaCollectionsUtil.SingletonList + (imgFile), pdfWriter, GetRGBPdfOutputIntent()); + pdfDocument.Close(); + // check ocr events + NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDFA); + ValidateCoreConfirmEvent(eventsHandler.GetEvents()[2]); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], ocrUsageEvent); + // check 
producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetCoreEvent(), GetPdfOcrEvent + () }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void DoImageOcrTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + tesseractReader.DoImageOcr(imgFile); + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], usageEvent); + } + + [NUnit.Framework.Test] + [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] + public virtual void DoImageOcrNoImageTest() { + FileInfo imgFile = new FileInfo("uncknown"); + NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => tesseractReader.DoImageOcr(imgFile)); + NUnit.Framework.Assert.AreEqual(0, eventsHandler.GetEvents().Count); + } + + [NUnit.Framework.Test] + public virtual void DoImageOcrTwoRunningsTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + tesseractReader.DoImageOcr(imgFile); + tesseractReader.DoImageOcr(imgFile); + NUnit.Framework.Assert.AreEqual(6, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], usageEvent); + usageEvent = eventsHandler.GetEvents()[3]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[4], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[5], usageEvent); + } + + [NUnit.Framework.Test] + public virtual void 
CreateTxtFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + tesseractReader.CreateTxtFile(JavaUtil.ArraysAsList(imgFile, imgFile), FileUtil.CreateTempFile("test", ".txt" + )); + NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateStatisticEvent(eventsHandler.GetEvents()[2], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], usageEvent); + } + + [NUnit.Framework.Test] + public virtual void CreateTxtFileNullEventHelperTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + tesseractReader.CreateTxtFile(JavaUtil.ArraysAsList(imgFile, imgFile), FileUtil.CreateTempFile("test", ".txt" + ), new OcrProcessContext(null)); + NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateStatisticEvent(eventsHandler.GetEvents()[2], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], usageEvent); + } + + [NUnit.Framework.Test] + [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] + public virtual void CreateTxtFileNoImageTest() { + FileInfo imgFile = new FileInfo("no_image"); + IList images = JavaUtil.ArraysAsList(imgFile, imgFile); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".txt"); + NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => tesseractReader.CreateTxtFile(images, outPdfFile + )); + // only one usage event is expected and it is not confirmed (no confirm event + NUnit.Framework.Assert.AreEqual(1, eventsHandler.GetEvents().Count); + ValidateUsageEvent(eventsHandler.GetEvents()[0], 
EventConfirmationType.ON_DEMAND); + } + + [NUnit.Framework.Test] + public virtual void CreateTxtFileNoFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + IList images = JavaUtil.ArraysAsList(imgFile, imgFile); + FileInfo outPdfFile = new FileInfo("nopath/nofile"); + Exception e = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => tesseractReader.CreateTxtFile + (images, outPdfFile)); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_WRITE_TO_FILE, e.Message); + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateStatisticEvent(eventsHandler.GetEvents()[2], PdfOcrOutputType.DATA); + } + + [NUnit.Framework.Test] + public virtual void CreateTxtFileNullOutFileTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + IList images = JavaUtil.ArraysAsList(imgFile, imgFile); + NUnit.Framework.Assert.Catch(typeof(NullReferenceException), () => tesseractReader.CreateTxtFile(images, null + )); + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateStatisticEvent(eventsHandler.GetEvents()[2], PdfOcrOutputType.DATA); + } + + // set meta info tests + [NUnit.Framework.Test] + public virtual void SetEventCountingMetaInfoTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + CreatePdfAndSetEventCountingMetaInfo(tesseractReader, outPdfFile, imgFile, new Tesseract4EventHandlingTest.TestMetaInfo + ()); + 
NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateCoreConfirmEvent(eventsHandler.GetEvents()[2]); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], ocrUsageEvent); + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetCoreEvent(), GetPdfOcrEvent + () }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void CreatePdfFileTestMetaInfoTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + CreatePdfFileAndSetMetaInfoToProps(tesseractReader, outPdfFile, imgFile, new Tesseract4EventHandlingTest.TestMetaInfo + ()); + // check ocr events + NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateCoreConfirmEvent(eventsHandler.GetEvents()[2]); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], ocrUsageEvent); + // check producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetCoreEvent(), GetPdfOcrEvent + () }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void DoImageOcrCustomEventHelperTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + tesseractReader.DoImageOcr(imgFile, new OcrProcessContext(new Tesseract4EventHandlingTest.CustomEventHelper + ())); + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent usageEvent = 
eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], usageEvent); + } + + [NUnit.Framework.Test] + public virtual void CreateTxtFileCustomEventHelperTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + tesseractReader.CreateTxtFile(JavaUtil.ArraysAsList(imgFile, imgFile), FileUtil.CreateTempFile("test", ".txt" + ), new OcrProcessContext(new Tesseract4EventHandlingTest.CustomEventHelper())); + NUnit.Framework.Assert.AreEqual(4, eventsHandler.GetEvents().Count); + IEvent usageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(usageEvent, EventConfirmationType.ON_DEMAND); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.DATA); + ValidateStatisticEvent(eventsHandler.GetEvents()[2], PdfOcrOutputType.DATA); + ValidateConfirmEvent(eventsHandler.GetEvents()[3], usageEvent); + } + + private class CustomEventHelper : AbstractPdfOcrEventHelper { + public override void OnEvent(AbstractProductITextEvent @event) { + if (@event is AbstractContextBasedITextEvent) { + ((AbstractContextBasedITextEvent)@event).SetMetaInfo(new Tesseract4EventHandlingTest.TestMetaInfo()); + } + EventManager.GetInstance().OnEvent(@event); + } + + public override SequenceId GetSequenceId() { + return new SequenceId(); + } + + public override EventConfirmationType GetConfirmationType() { + return EventConfirmationType.ON_DEMAND; + } + } + + private class TestMetaInfo : IMetaInfo { + } + } +} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/events/PdfOcrTesseract4ProductEventTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/events/PdfOcrTesseract4ProductEventTest.cs new file mode 100644 index 0000000..695a8bf --- /dev/null +++ 
b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/actions/events/PdfOcrTesseract4ProductEventTest.cs @@ -0,0 +1,44 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Sequence; +using iText.Pdfocr.Tesseract4.Actions.Data; +using iText.Pdfocr.Tesseract4.Actions.Events; +using iText.Test; + +namespace iText.Pdfocr.Actions.Events { + public class PdfOcrTesseract4ProductEventTest : ExtendedITextTest { + [NUnit.Framework.Test] + public virtual void EventTypeTest() { + PdfOcrTesseract4ProductEvent e = PdfOcrTesseract4ProductEvent.CreateProcessImageEvent(new SequenceId(), null + , EventConfirmationType.ON_DEMAND); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ProductEvent.PROCESS_IMAGE, e.GetEventType()); + } + + [NUnit.Framework.Test] + public virtual void ProductDataNameTest() { + NUnit.Framework.Assert.AreEqual("pdfOcr-tesseract4", PdfOcrTesseract4ProductData.GetInstance().GetProductName + ()); + } + } +} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingTest.cs deleted file mode 100644 index aa83ae2..0000000 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingTest.cs +++ /dev/null @@ -1,280 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. 
- -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . -*/ -using System; -using System.Collections.Generic; -using System.IO; -using iText.IO.Util; -using iText.Kernel.Counter; -using iText.Kernel.Counter.Event; -using iText.Kernel.Pdf; -using iText.Metainfo; -using iText.Pdfocr; -using iText.Pdfocr.Tesseract4; -using iText.Pdfocr.Tesseract4.Events; - -namespace iText.Pdfocr.Events { - public abstract class EventCountingTest : IntegrationTestHelper { - protected internal static readonly String PROFILE_FOLDER = iText.Test.TestUtil.GetParentProjectDirectory(NUnit.Framework.TestContext - .CurrentContext.TestDirectory) + "/resources/itext/pdfocr/events/"; - - internal AbstractTesseract4OcrEngine tesseractReader; - - internal String testFileTypeName; - - private bool isExecutableReaderType; - - public EventCountingTest(IntegrationTestHelper.ReaderType type) { - isExecutableReaderType = type.Equals(IntegrationTestHelper.ReaderType.EXECUTABLE); - if (isExecutableReaderType) { - testFileTypeName = "executable"; - } - else { - testFileTypeName = "lib"; - } - tesseractReader = GetTesseractReader(type); - } - - [NUnit.Framework.SetUp] - public virtual void InitTesseractProperties() { - Tesseract4OcrEngineProperties ocrEngineProperties = new Tesseract4OcrEngineProperties(); - ocrEngineProperties.SetPathToTessData(GetTessDataDirectory()); - tesseractReader.SetTesseract4OcrEngineProperties(ocrEngineProperties); - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingPdfEvent() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - DoImageToPdfOcr(tesseractReader, JavaUtil.ArraysAsList(file)); - 
NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.GetEvents()[0]); - NUnit.Framework.Assert.IsNull(eventCounter.GetMetaInfos()[0]); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - } - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingSeveralImagesOneImageToPdfEvent() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - DoImageToPdfOcr(tesseractReader, JavaUtil.ArraysAsList(file, file)); - NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.GetEvents()[0]); - NUnit.Framework.Assert.IsNull(eventCounter.GetMetaInfos()[0]); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - } - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingPdfAEvent() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - DoImageToPdfAOcr(tesseractReader, JavaUtil.ArraysAsList(file)); - NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA, eventCounter.GetEvents()[0] - ); - NUnit.Framework.Assert.IsNull(eventCounter.GetMetaInfos()[0]); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - } - } - - [NUnit.Framework.Test] 
- public virtual void TestEventCountingTwoPdfEvents() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - DoImageToPdfOcr(tesseractReader, JavaUtil.ArraysAsList(file)); - DoImageToPdfOcr(tesseractReader, JavaUtil.ArraysAsList(file)); - NUnit.Framework.Assert.AreEqual(2, eventCounter.GetEvents().Count); - for (int i = 0; i < eventCounter.GetEvents().Count; i++) { - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.GetEvents()[i]); - NUnit.Framework.Assert.IsNull(eventCounter.GetMetaInfos()[i]); - } - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - } - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingImageEvent() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - DoImageOcr(tesseractReader, file); - NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, eventCounter.GetEvents()[0]); - NUnit.Framework.Assert.IsNull(eventCounter.GetMetaInfos()[0]); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - } - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingImageEventCustomMetaInfo() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - 
IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - tesseractReader.SetThreadLocalMetaInfo(new TestMetaInfo()); - DoImageOcr(tesseractReader, file); - NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, eventCounter.GetEvents()[0]); - NUnit.Framework.Assert.IsTrue(eventCounter.GetMetaInfos()[0] is TestMetaInfo); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - tesseractReader.SetThreadLocalMetaInfo(null); - } - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingPdfEventCustomMetaInfo() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - tesseractReader.SetThreadLocalMetaInfo(new TestMetaInfo()); - DoImageToPdfOcr(tesseractReader, JavaUtil.ArraysAsList(file)); - NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.GetEvents()[0]); - NUnit.Framework.Assert.IsTrue(eventCounter.GetMetaInfos()[0] is TestMetaInfo); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - tesseractReader.SetThreadLocalMetaInfo(null); - } - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingWithImprovedHocrParsing() { - String imgPath = TEST_IMAGES_DIRECTORY + "thai_03.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - 
Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties(); - properties.SetTextPositioning(TextPositioning.BY_WORDS_AND_LINES); - properties.SetUseTxtToImproveHocrParsing(true); - properties.SetPathToTessData(new FileInfo(LANG_TESS_DATA_DIRECTORY)); - tesseractReader.SetTesseract4OcrEngineProperties(properties); - tesseractReader.DoImageOcr(file); - NUnit.Framework.Assert.AreEqual(1, eventCounter.GetEvents().Count); - NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR.GetEventType(), eventCounter.GetEvents - ()[0].GetEventType()); - EventCounterHandler.GetInstance().Unregister(factory); - } - - public virtual void TestEventCountingCustomMetaInfoError() { - String imgPath = TEST_IMAGES_DIRECTORY + "numbers_101.jpg"; - FileInfo file = new FileInfo(imgPath); - EventCountingTest.TestEventCounter eventCounter = new EventCountingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - IMetaInfo metaInfo = new TestMetaInfo(); - try { - tesseractReader.SetThreadLocalMetaInfo(metaInfo); - DoImageToPdfOcr(tesseractReader, JavaUtil.ArraysAsList(file)); - } - finally { - NUnit.Framework.Assert.AreEqual(metaInfo, tesseractReader.GetThreadLocalMetaInfo()); - EventCounterHandler.GetInstance().Unregister(factory); - tesseractReader.SetThreadLocalMetaInfo(null); - } - } - - private static void DoImageOcr(AbstractTesseract4OcrEngine tesseractReader, FileInfo imageFile) { - tesseractReader.DoImageOcr(imageFile); - } - - private static void DoImageToPdfOcr(AbstractTesseract4OcrEngine tesseractReader, IList imageFiles - ) { - OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); - ocrPdfCreator.CreatePdf(imageFiles, new PdfWriter(new MemoryStream())); - } - - private static void DoImageToPdfAOcr(AbstractTesseract4OcrEngine tesseractReader, IList imageFiles - ) { - OcrPdfCreator ocrPdfCreator = new 
OcrPdfCreator(tesseractReader, new OcrPdfCreatorProperties().SetPdfLang( - "en-US")); - Stream @is = null; - try { - @is = new FileStream(PROFILE_FOLDER + "sRGB_CS_profile.icm", FileMode.Open, FileAccess.Read); - } - catch (FileNotFoundException) { - } - // No expected - PdfOutputIntent outputIntent = new PdfOutputIntent("Custom", "", "http://www.color.org", "sRGB IEC61966-2.1" - , @is); - ocrPdfCreator.CreatePdfA(imageFiles, new PdfWriter(new MemoryStream()), outputIntent); - } - - private class TestEventCounter : EventCounter { - private IList events = new List(); - - private IList metaInfos = new List(); - - public virtual IList GetEvents() { - return events; - } - - public virtual IList GetMetaInfos() { - return metaInfos; - } - - protected override void OnEvent(IEvent @event, IMetaInfo metaInfo) { - this.events.Add(@event); - this.metaInfos.Add(metaInfo); - } - } - } -} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/PdfOcrTesseract4EventTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/PdfOcrTesseract4EventTest.cs deleted file mode 100644 index 4837719..0000000 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/PdfOcrTesseract4EventTest.cs +++ /dev/null @@ -1,52 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. 
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . -*/ -using System; -using iText.Pdfocr; -using iText.Pdfocr.Tesseract4.Events; - -namespace iText.Pdfocr.Events { - public class PdfOcrTesseract4EventTest : IntegrationTestHelper { - private const String PDF_OCR_TESSERACT4_ORIGIN_ID = "iText.Pdfocr.Tesseract4"; - - [NUnit.Framework.Test] - public virtual void TestEventTypes() { - String[] expectedTypes = new String[] { "pdfOcr-tesseract4-image-ocr", "pdfOcr-tesseract4-image-to-pdf", "pdfOcr-tesseract4-image-to-pdfa" - }; - PdfOcrTesseract4Event[] testedEvents = new PdfOcrTesseract4Event[] { PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR - , PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA }; - for (int i = 0; i < testedEvents.Length; i++) { - NUnit.Framework.Assert.AreEqual(expectedTypes[i], testedEvents[i].GetEventType()); - } - } - - [NUnit.Framework.Test] - public virtual void TestOriginId() { - String expected = PDF_OCR_TESSERACT4_ORIGIN_ID; - PdfOcrTesseract4Event[] testedEvents = new PdfOcrTesseract4Event[] { PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF - , PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA }; - foreach (PdfOcrTesseract4Event @event in testedEvents) { - NUnit.Framework.Assert.AreEqual(expected, @event.GetOriginId()); - } - } - } -} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/DoImageOcrRunnable.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/DoImageOcrRunnable.cs deleted file mode 100644 index e4a91e7..0000000 --- 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/DoImageOcrRunnable.cs +++ /dev/null @@ -1,69 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . 
-*/ -using System; -using System.IO; -using iText.IO.Util; -using iText.Kernel.Counter.Event; -using iText.Kernel.Pdf; -using iText.Pdfocr; -using iText.Pdfocr.Tesseract4; - -namespace iText.Pdfocr.Events.Multithreading { - public class DoImageOcrRunnable : Object { - private AbstractTesseract4OcrEngine tesseractReader; - - private FileInfo imgFile; - - private FileInfo outputFile; - - private bool createPdf; - - private IMetaInfo metaInfo; - - internal DoImageOcrRunnable(AbstractTesseract4OcrEngine tesseractReader, IMetaInfo metaInfo, FileInfo imgFile - , FileInfo outputFile, bool createPdf) { - this.tesseractReader = tesseractReader; - this.metaInfo = metaInfo; - this.imgFile = imgFile; - this.outputFile = outputFile; - this.createPdf = createPdf; - } - - public virtual void Run() { - try { - tesseractReader.SetThreadLocalMetaInfo(metaInfo); - if (createPdf) { - new OcrPdfCreator(tesseractReader).CreatePdf(JavaUtil.ArraysAsList(imgFile), new PdfWriter(outputFile)); - } - else { - tesseractReader.DoTesseractOcr(imgFile, outputFile, OutputFormat.TXT); - } - // for test purposes - System.Console.Out.WriteLine(imgFile.Name); - } - catch (Exception e) { - throw new Exception(e.Message); - } - } - } -} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/MultiThreadingTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/MultiThreadingTest.cs deleted file mode 100644 index 9aa2489..0000000 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/multithreading/MultiThreadingTest.cs +++ /dev/null @@ -1,130 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
- -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . -*/ -using System; -using System.Collections.Generic; -using System.IO; -using System.Threading; -using iText.Kernel.Counter; -using iText.Kernel.Counter.Event; -using iText.Metainfo; -using iText.Pdfocr; -using iText.Pdfocr.Tesseract4; -using iText.Pdfocr.Tesseract4.Events; - -namespace iText.Pdfocr.Events.Multithreading { - public abstract class MultiThreadingTest : IntegrationTestHelper { - protected internal static readonly String destinationFolder = NUnit.Framework.TestContext.CurrentContext.TestDirectory - + "/test/itext/pdfocr/events/multithreading/"; - - protected internal static readonly String sourceFolder = iText.Test.TestUtil.GetParentProjectDirectory(NUnit.Framework.TestContext - .CurrentContext.TestDirectory) + "/resources/itext/pdfocr/events/multithreading/"; - - internal AbstractTesseract4OcrEngine tesseractReader; - - public MultiThreadingTest(IntegrationTestHelper.ReaderType type) { - tesseractReader = GetTesseractReader(type); - } - - [NUnit.Framework.OneTimeSetUp] - public static void BeforeClass() { - CreateDestinationFolder(destinationFolder); - } - - [NUnit.Framework.SetUp] - public virtual void InitTesseractProperties() { - Tesseract4OcrEngineProperties ocrEngineProperties = new Tesseract4OcrEngineProperties(); - ocrEngineProperties.SetPathToTessData(new FileInfo(sourceFolder + "../../tessdata")); - 
tesseractReader.SetTesseract4OcrEngineProperties(ocrEngineProperties); - } - - [NUnit.Framework.Test] - public virtual void TestEventCountingPdfEvent() { - MultiThreadingTest.TestEventCounter eventCounter = new MultiThreadingTest.TestEventCounter(); - IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); - EventCounterHandler.GetInstance().Register(factory); - try { - int n = 16; - IMetaInfo metainfo = new TestMetaInfo(); - Thread[] threads = new Thread[n]; - for (int i = 0; i < n; i++) { - // We do not use Runnable as the variable's type because of porting issues - DoImageOcrRunnable runnable = new DoImageOcrRunnable(tesseractReader, metainfo, new FileInfo(sourceFolder - + "numbers_01.jpg"), new FileInfo(destinationFolder + "ocr-result-" + (i + 1) + ".txt"), 0 == i % 2); - threads[i] = GetThread(runnable); - } - for (int i = 0; i < n; i++) { - threads[i].Start(); - } - for (int i = 0; i < n; i++) { - threads[i].Join(); - } - NUnit.Framework.Assert.AreEqual(n, eventCounter.GetEvents().Count); - int expectedPdfEvents = n / 2; - int expectedImageEvents = n - expectedPdfEvents; - int foundPdfEvents = 0; - int foundImageEvents = 0; - for (int i = 0; i < n; i++) { - if (PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF == eventCounter.GetEvents()[i]) { - foundPdfEvents++; - } - else { - if (PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR == eventCounter.GetEvents()[i]) { - foundImageEvents++; - } - } - NUnit.Framework.Assert.AreEqual(metainfo, eventCounter.GetMetaInfos()[i]); - } - NUnit.Framework.Assert.AreEqual(expectedImageEvents, foundImageEvents); - NUnit.Framework.Assert.AreEqual(expectedPdfEvents, foundPdfEvents); - } - finally { - EventCounterHandler.GetInstance().Unregister(factory); - } - } - - private static Thread GetThread(DoImageOcrRunnable runnable) { - return new Thread(new ThreadStart(runnable.Run)); - } - - public class TestEventCounter : EventCounter { - private IList events = new List(); - - private IList metaInfos = new List(); - - 
public virtual IList GetEvents() { - return events; - } - - public virtual IList GetMetaInfos() { - return metaInfos; - } - - [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.Synchronized - )] - protected override void OnEvent(IEvent @event, IMetaInfo metaInfo) { - this.events.Add(@event); - this.metaInfos.Add(metaInfo); - } - } - } -} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingLibTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/exceptions/PdfOcrTesseract4ExceptionTest.cs similarity index 53% rename from itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingLibTest.cs rename to itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/exceptions/PdfOcrTesseract4ExceptionTest.cs index b556ab4..f5d2cd5 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingLibTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/exceptions/PdfOcrTesseract4ExceptionTest.cs @@ -20,27 +20,24 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ -using System.IO; -using iText.IO.Util; -using iText.Pdfocr; -using iText.Pdfocr.Tesseract4; -using iText.Test.Attributes; +using System; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Test; -namespace iText.Pdfocr.Events { - public class EventCountingLibTest : EventCountingTest { - public EventCountingLibTest() - : base(IntegrationTestHelper.ReaderType.LIB) { +namespace iText.Pdfocr.Exceptions { + public class PdfOcrTesseract4ExceptionTest : ExtendedITextTest { + [NUnit.Framework.Test] + public virtual void Tesseract4PdfOcrExceptionThrowableConstructorTest() { + Exception cause = new System.IO.IOException(); + PdfOcrTesseract4Exception exception = new PdfOcrTesseract4Exception(cause); + NUnit.Framework.Assert.AreEqual(cause, exception.InnerException); } [NUnit.Framework.Test] - [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] - public override void TestEventCountingCustomMetaInfoError() { - FileInfo img = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_101.jpg"); - NUnit.Framework.Assert.That(() => { - base.TestEventCountingCustomMetaInfoError(); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, img.FullName))) -; + public virtual void Tesseract4PdfOcrInputExceptionThrowableConstructorTest() { + Exception cause = new System.IO.IOException(); + PdfOcrTesseract4Exception exception = new PdfOcrInputTesseract4Exception(cause); + NUnit.Framework.Assert.AreEqual(cause, exception.InnerException); } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs index 7534ebb..374e143 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs @@ -24,14 +24,16 
@@ You should have received a copy of the GNU Affero General Public License using System.Collections.Generic; using System.IO; using System.Text; +using iText.Commons.Utils; using iText.IO.Source; -using iText.IO.Util; using iText.Kernel.Colors; using iText.Kernel.Geom; using iText.Kernel.Pdf; using iText.Kernel.Pdf.Canvas.Parser; using iText.Pdfocr; using iText.Pdfocr.Tesseract4; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; namespace iText.Pdfocr.General { @@ -129,7 +131,7 @@ public virtual void TestInputInvalidImage() { OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); ocrPdfCreator.CreatePdf(JavaUtil.ArraysAsList(file3, file1, file2, file3), GetPdfWriter()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE, new FileInfo(TEST_IMAGES_DIRECTORY + "example.txt").FullName))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE, new FileInfo(TEST_IMAGES_DIRECTORY + "example.txt").FullName))) ; } @@ -159,7 +161,7 @@ public virtual void TestNullPathToTessData() { (null)); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("eng")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID)) ; } @@ -171,11 +173,11 @@ public virtual void TestPathToTessDataWithoutData() { (new FileInfo("test/"))); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("eng")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID)) + , 
NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID)) ; } - [LogMessage(Tesseract4OcrException.INCORRECT_LANGUAGE)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE)] [NUnit.Framework.Test] public virtual void TestEmptyPathToTessData() { NUnit.Framework.Assert.That(() => { @@ -187,33 +189,33 @@ public virtual void TestEmptyPathToTessData() { NUnit.Framework.Assert.AreEqual(new FileInfo("").FullName, tesseractReader.GetTesseract4OcrEngineProperties ().GetPathToTessData().FullName); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_LANGUAGE, "eng.traineddata", new FileInfo(".").FullName))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "eng.traineddata", new FileInfo(".").FullName))) ; } - [LogMessage(Tesseract4OcrException.INCORRECT_LANGUAGE, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestIncorrectLanguage() { NUnit.Framework.Assert.That(() => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("spa_new")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_LANGUAGE, "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName))) ; } - [LogMessage(Tesseract4OcrException.INCORRECT_LANGUAGE, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] 
[NUnit.Framework.Test] public virtual void TestListOfLanguagesWithOneIncorrectLanguage() { NUnit.Framework.Assert.That(() => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); GetTextFromPdf(tesseractReader, file, JavaUtil.ArraysAsList("spa", "spa_new", "spa_old")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_LANGUAGE, "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName))) ; } - [LogMessage(Tesseract4OcrException.INCORRECT_LANGUAGE, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestIncorrectScriptsName() { NUnit.Framework.Assert.That(() => { @@ -222,11 +224,11 @@ public virtual void TestIncorrectScriptsName() { (new FileInfo(SCRIPT_TESS_DATA_DIRECTORY))); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("English")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_LANGUAGE, "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName))) ; } - [LogMessage(Tesseract4OcrException.INCORRECT_LANGUAGE, Count = 1)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestListOfScriptsWithOneIncorrect() { NUnit.Framework.Assert.That(() => { @@ -235,7 +237,7 @@ public virtual void TestListOfScriptsWithOneIncorrect() { (new 
FileInfo(SCRIPT_TESS_DATA_DIRECTORY))); GetTextFromPdf(tesseractReader, file, JavaUtil.ArraysAsList("Georgian", "Japanese", "English")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_LANGUAGE, "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName))) ; } @@ -267,7 +269,7 @@ public virtual void TestTxtStringOutput() { ); String result = tesseractReader.DoImageOcr(file, OutputFormat.TXT); foreach (String line in expectedOutput) { - NUnit.Framework.Assert.IsTrue(iText.IO.Util.StringUtil.ReplaceAll(result, "\r", "").Contains(line)); + NUnit.Framework.Assert.IsTrue(iText.Commons.Utils.StringUtil.ReplaceAll(result, "\r", "").Contains(line)); } } @@ -280,7 +282,7 @@ public virtual void TestHocrStringOutput() { ); String result = tesseractReader.DoImageOcr(file, OutputFormat.HOCR); foreach (String line in expectedOutput) { - NUnit.Framework.Assert.IsTrue(iText.IO.Util.StringUtil.ReplaceAll(result, "\r", "").Contains(line)); + NUnit.Framework.Assert.IsTrue(iText.Commons.Utils.StringUtil.ReplaceAll(result, "\r", "").Contains(line)); } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs index 7b65a6d..fd99ae9 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs @@ -22,11 +22,13 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using 
iText.Kernel.Colors; using iText.Kernel.Utils; using iText.Pdfocr; using iText.Pdfocr.Tesseract4; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; namespace iText.Pdfocr.Imageformats { @@ -66,8 +68,8 @@ public virtual void TestBMPText() { String expectedOutput = "This is a test message for OCR Scanner Test"; String realOutputHocr = GetTextFromPdf(tesseractReader, new FileInfo(path), JavaCollectionsUtil.SingletonList ("eng")); - realOutputHocr = iText.IO.Util.StringUtil.ReplaceAll(realOutputHocr, "[\n]", " "); - realOutputHocr = iText.IO.Util.StringUtil.ReplaceAll(realOutputHocr, "[‘]", ""); + realOutputHocr = iText.Commons.Utils.StringUtil.ReplaceAll(realOutputHocr, "[\n]", " "); + realOutputHocr = iText.Commons.Utils.StringUtil.ReplaceAll(realOutputHocr, "[‘]", ""); NUnit.Framework.Assert.IsTrue(realOutputHocr.Contains((expectedOutput))); } @@ -90,7 +92,7 @@ public virtual void TestBMPText02() { String expectedOutput = "This is a test message for OCR Scanner Test BMPTest"; String realOutputHocr = GetTextFromPdf(tesseractReader, new FileInfo(path), JavaCollectionsUtil.SingletonList ("eng")); - realOutputHocr = iText.IO.Util.StringUtil.ReplaceAll(realOutputHocr, "[\n]", " "); + realOutputHocr = iText.Commons.Utils.StringUtil.ReplaceAll(realOutputHocr, "[\n]", " "); NUnit.Framework.Assert.IsTrue(realOutputHocr.Contains((expectedOutput))); } @@ -226,7 +228,7 @@ public virtual void TestInputWrongFormat() { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "wierdwords.gif"); GetTextFromPdf(tesseractReader, file); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_INPUT_IMAGE_FORMAT, "wierdwords.gif"))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_INPUT_IMAGE_FORMAT, "wierdwords.gif"))) ; } diff --git 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs index 0dfa64d..20f39a3 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs @@ -22,7 +22,7 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Kernel.Colors; using iText.Kernel.Pdf; using iText.Kernel.Utils; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs index 9ad330a..2a2d450 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs @@ -23,7 +23,7 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Kernel.Pdf; using iText.Kernel.Pdf.Layer; using iText.Pdfocr; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs index 7026623..4d855b8 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs @@ -21,11 +21,13 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ using System; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Kernel.Colors; using iText.Kernel.Utils; using iText.Pdfocr; +using iText.Pdfocr.Logs; using iText.Pdfocr.Tesseract4; +using iText.Pdfocr.Tesseract4.Exceptions; using iText.Test.Attributes; namespace iText.Pdfocr.Tessdata { @@ -34,7 +36,8 @@ public TessDataIntegrationLibTest() : base(IntegrationTestHelper.ReaderType.LIB) { } - [LogMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS + )] [NUnit.Framework.Test] public virtual void TestTessDataWithNonAsciiPath() { NUnit.Framework.Assert.That(() => { @@ -43,7 +46,7 @@ public virtual void TestTessDataWithNonAsciiPath() { NUnit.Framework.Assert.Fail("Should throw exception for the tesseract lib when tess data path contains non ASCII characters" ); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS)) ; } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs index d5dd493..8e91618 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs @@ -23,18 +23,20 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using Common.Logging; -using iText.IO.Util; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.Kernel.Colors; using 
iText.Kernel.Pdf; using iText.Kernel.Utils; using iText.Pdfocr; +using iText.Pdfocr.Logs; using iText.Pdfocr.Tesseract4; using iText.Test.Attributes; namespace iText.Pdfocr.Tessdata { public abstract class TessDataIntegrationTest : IntegrationTestHelper { - private static readonly ILog LOGGER = LogManager.GetLogger(typeof(iText.Pdfocr.Tessdata.TessDataIntegrationTest + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tessdata.TessDataIntegrationTest )); internal AbstractTesseract4OcrEngine tesseractReader; @@ -155,7 +157,7 @@ public virtual void TextJapaneseOutputFromTxtFile() { String expected = "日本語文法"; String result = GetRecognizedTextFromTextFile(tesseractReader, imgPath, JavaCollectionsUtil.SingletonList< String>("jpn")); - result = iText.IO.Util.StringUtil.ReplaceAll(result, "[\f\n]", ""); + result = iText.Commons.Utils.StringUtil.ReplaceAll(result, "[\f\n]", ""); // correct result with specified japanese language NUnit.Framework.Assert.IsTrue(result.Contains(expected)); } @@ -166,8 +168,8 @@ public virtual void TestFrenchOutputFromTxtFile() { String expectedFr = "RESTEZ\nCALME\nPARLEZ EN\nFRANÇAIS"; String result = GetRecognizedTextFromTextFile(tesseractReader, imgPath, JavaCollectionsUtil.SingletonList< String>("fra")); - result = iText.IO.Util.StringUtil.ReplaceAll(result, "(?:\\n\\f)+", "").Trim(); - result = iText.IO.Util.StringUtil.ReplaceAll(result, "\\n\\n", "\n").Trim(); + result = iText.Commons.Utils.StringUtil.ReplaceAll(result, "(?:\\n\\f)+", "").Trim(); + result = iText.Commons.Utils.StringUtil.ReplaceAll(result, "\\n\\n", "\n").Trim(); // correct result with specified spanish language NUnit.Framework.Assert.IsTrue(result.EndsWith(expectedFr)); // incorrect result when languages are not specified @@ -246,7 +248,7 @@ public virtual void TestArabicTextWithEng() { String result = GetTextFromPdf(tesseractReader, file, JavaUtil.ArraysAsList("ara", "eng"), CAIRO_FONT_PATH ); // correct result with specified 
arabic+english languages - NUnit.Framework.Assert.AreEqual(expected, iText.IO.Util.StringUtil.ReplaceAll(result, "[?]", "")); + NUnit.Framework.Assert.AreEqual(expected, iText.Commons.Utils.StringUtil.ReplaceAll(result, "[?]", "")); // incorrect result when languages are not specified // or languages were specified in the wrong order NUnit.Framework.Assert.AreNotEqual(expected, GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList @@ -572,9 +574,9 @@ private bool CompareTxtLines(IList expected, IList result) { } for (int i = 0; i < expected.Count; i++) { String exp = expected[i].Replace("\n", "").Replace("\f", ""); - exp = iText.IO.Util.StringUtil.ReplaceAll(exp, "[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); + exp = iText.Commons.Utils.StringUtil.ReplaceAll(exp, "[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); String res = result[i].Replace("\n", "").Replace("\f", ""); - res = iText.IO.Util.StringUtil.ReplaceAll(res, "[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); + res = iText.Commons.Utils.StringUtil.ReplaceAll(res, "[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); if (expected[i] == null || result[i] == null) { areEqual = false; break; @@ -599,7 +601,7 @@ private bool CompareTxtFiles(String expectedFilePath, String resultFilePath) { } catch (System.IO.IOException e) { areEqual = false; - LOGGER.Error(e.Message); + LOGGER.LogError(e.Message); } return areEqual; } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs index 677a711..75b4695 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs @@ -23,13 +23,15 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Pdfocr; 
+using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; namespace iText.Pdfocr.Tesseract4 { public class ApiTest : IntegrationTestHelper { - [LogMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)] [NUnit.Framework.Test] public virtual void TestDefaultTessDataPathValidationForLib() { NUnit.Framework.Assert.That(() => { @@ -38,11 +40,11 @@ public virtual void TestDefaultTessDataPathValidationForLib() { Tesseract4LibOcrEngine engine = new Tesseract4LibOcrEngine(new Tesseract4OcrEngineProperties()); engine.DoImageOcr(imgFile); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)) ; } - [LogMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)] [NUnit.Framework.Test] public virtual void TestDefaultTessDataPathValidationForExecutable() { NUnit.Framework.Assert.That(() => { @@ -52,7 +54,7 @@ public virtual void TestDefaultTessDataPathValidationForExecutable() { ()); engine.DoImageOcr(imgFile); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)) ; } @@ -66,12 +68,12 @@ public virtual void TestDoTesseractOcrForIncorrectImageForExecutable() { ().SetPathToTessData(GetTessDataDirectory())); engine.DoTesseractOcr(imgFile, null, OutputFormat.HOCR); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE, new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01").FullName))) + , 
NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE, new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01").FullName))) ; } [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] - [LogMessage(Tesseract4OcrException.TESSERACT_FAILED)] + [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED)] [LogMessage(Tesseract4LogMessageConstant.TESSERACT_FAILED)] [NUnit.Framework.Test] public virtual void TestOcrResultForSinglePageForNullImage() { @@ -82,7 +84,7 @@ public virtual void TestOcrResultForSinglePageForNullImage() { tesseract4LibOcrEngine.InitializeTesseract(OutputFormat.TXT); tesseract4LibOcrEngine.DoTesseractOcr(null, null, OutputFormat.HOCR); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(Tesseract4OcrException.TESSERACT_FAILED)) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED)) ; } @@ -113,10 +115,6 @@ public virtual void TestDetectAndFixBrokenBBoxes() { NUnit.Framework.Assert.AreEqual(136.5f, (float)textInfo.GetBboxRect().GetBottom(), 0.1); NUnit.Framework.Assert.AreEqual(385.5, (float)textInfo.GetBboxRect().GetRight(), 0.1); NUnit.Framework.Assert.AreEqual(162.75, (float)textInfo.GetBboxRect().GetTop(), 0.1); - NUnit.Framework.Assert.AreEqual(383.0f, (float)textInfo.GetBbox()[0], 0.1); - NUnit.Framework.Assert.AreEqual(101.0f, (float)textInfo.GetBbox()[1], 0.1); - NUnit.Framework.Assert.AreEqual(514.0f, (float)textInfo.GetBbox()[2], 0.1); - NUnit.Framework.Assert.AreEqual(136.0f, (float)textInfo.GetBbox()[3], 0.1); } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs index 2d26206..62d10de 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs +++ 
b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs @@ -23,14 +23,15 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using Common.Logging; -using iText.IO.Util; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.Kernel.Utils; using iText.Pdfocr; namespace iText.Pdfocr.Tesseract4 { public abstract class ImageIntegrationTest : IntegrationTestHelper { - private static readonly ILog LOGGER = LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImageIntegrationTest + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImageIntegrationTest )); internal AbstractTesseract4OcrEngine tesseractReader; @@ -70,8 +71,8 @@ public virtual void TestHocrRotatedImage() { NUnit.Framework.Assert.AreEqual("degrees", pageData.Get(1)[1].GetText()); NUnit.Framework.Assert.AreEqual("rotated", pageData.Get(1)[2].GetText()); NUnit.Framework.Assert.AreEqual("image", pageData.Get(1)[3].GetText()); - NUnit.Framework.Assert.IsTrue(pageData.Get(1)[1].GetBbox()[2] - pageData.Get(1)[0].GetBbox()[0] > 100); - NUnit.Framework.Assert.IsTrue(pageData.Get(1)[1].GetBbox()[3] - pageData.Get(1)[0].GetBbox()[1] < 100); + NUnit.Framework.Assert.IsTrue(pageData.Get(1)[1].GetBboxRect().GetWidth() > 100); + NUnit.Framework.Assert.IsTrue(pageData.Get(1)[1].GetBboxRect().GetHeight() < 100); } [NUnit.Framework.Test] diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs index 848860f..c163488 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs @@ -22,9 +22,11 @@ You should have received 
a copy of the GNU Affero General Public License */ using System; using System.IO; +using iText.Commons.Utils; using iText.IO.Image; -using iText.IO.Util; using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; namespace iText.Pdfocr.Tesseract4 { @@ -44,7 +46,7 @@ public virtual void TestReadingInvalidImagePath() { FileInfo imgFile = new FileInfo(path); ImagePreprocessingUtil.PreprocessImage(imgFile, 1, new ImagePreprocessingOptions()); } - , NUnit.Framework.Throws.InstanceOf()) + , NUnit.Framework.Throws.InstanceOf()) ; } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelperTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelperTest.cs new file mode 100644 index 0000000..4e4ecb0 --- /dev/null +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelperTest.cs @@ -0,0 +1,73 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using System.Collections.Generic; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Sequence; +using iText.Pdfocr.Statistics; +using iText.Pdfocr.Tesseract4.Actions.Data; +using iText.Pdfocr.Tesseract4.Actions.Events; +using iText.Test; + +namespace iText.Pdfocr.Tesseract4 { + public class Tesseract4FileResultEventHelperTest : ExtendedITextTest { + [NUnit.Framework.Test] + public virtual void DefaultProcessImageEventTest() { + Tesseract4FileResultEventHelperTest.StoreEventsHandler eventsHandler = new Tesseract4FileResultEventHelperTest.StoreEventsHandler + (); + EventManager.GetInstance().Register(eventsHandler); + Tesseract4FileResultEventHelper helper = new Tesseract4FileResultEventHelper(); + helper.OnEvent(PdfOcrTesseract4ProductEvent.CreateProcessImageEvent(new SequenceId(), null, EventConfirmationType + .ON_CLOSE)); + NUnit.Framework.Assert.AreEqual(0, eventsHandler.GetEvents().Count); + EventManager.GetInstance().Unregister(eventsHandler); + } + + [NUnit.Framework.Test] + public virtual void DefaultStatisticsEventTest() { + Tesseract4FileResultEventHelperTest.StoreEventsHandler eventsHandler = new Tesseract4FileResultEventHelperTest.StoreEventsHandler + (); + EventManager.GetInstance().Register(eventsHandler); + Tesseract4FileResultEventHelper helper = new Tesseract4FileResultEventHelper(); + helper.OnEvent(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.PDF, PdfOcrTesseract4ProductData.GetInstance + ())); + NUnit.Framework.Assert.AreEqual(1, eventsHandler.GetEvents().Count); + EventManager.GetInstance().Unregister(eventsHandler); + } + + protected internal class StoreEventsHandler : IEventHandler { + private readonly IList events = new List(); + + public virtual IList GetEvents() { + return events; + } + + public virtual void OnEvent(IEvent @event) { + if (@event is PdfOcrTesseract4ProductEvent || @event is PdfOcrOutputTypeStatisticsEvent || @event is ConfirmEvent + ) { + events.Add(@event); + } 
+ } + } + } +} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/PdfOcrTesseract4ProductInfo.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingExecutableTest.cs similarity index 68% rename from itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/PdfOcrTesseract4ProductInfo.cs rename to itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingExecutableTest.cs index fc1bd5e..8ee2a02 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/PdfOcrTesseract4ProductInfo.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingExecutableTest.cs @@ -20,18 +20,12 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ -using System; +using iText.Pdfocr; namespace iText.Pdfocr.Tesseract4 { - /// Product info about this iText add-on. - public class PdfOcrTesseract4ProductInfo { - /// The product name. - public const String PRODUCT_NAME = "pdfOcr-Tesseract4"; - - /// The major version number. - public const int MAJOR_VERSION = 1; - - /// The minor version number. - public const int MINOR_VERSION = 0; + public class Tesseract4MetaInfoEventHandlingExecutableTest : Tesseract4MetaInfoEventHandlingTest { + public Tesseract4MetaInfoEventHandlingExecutableTest() + : base(IntegrationTestHelper.ReaderType.EXECUTABLE) { + } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingLibTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingLibTest.cs new file mode 100644 index 0000000..f8b63bc --- /dev/null +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingLibTest.cs @@ -0,0 +1,31 @@ +/* +This file is part of the iText (R) project. 
+Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Pdfocr; + +namespace iText.Pdfocr.Tesseract4 { + public class Tesseract4MetaInfoEventHandlingLibTest : Tesseract4MetaInfoEventHandlingTest { + public Tesseract4MetaInfoEventHandlingLibTest() + : base(IntegrationTestHelper.ReaderType.LIB) { + } + } +} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingTest.cs new file mode 100644 index 0000000..c51024e --- /dev/null +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/Tesseract4MetaInfoEventHandlingTest.cs @@ -0,0 +1,74 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using System.IO; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Utils; +using iText.Pdfocr; +using iText.Pdfocr.Statistics; + +namespace iText.Pdfocr.Tesseract4 { + public abstract class Tesseract4MetaInfoEventHandlingTest : IntegrationEventHandlingTestHelper { + public Tesseract4MetaInfoEventHandlingTest(IntegrationTestHelper.ReaderType type) + : base(type) { + } + + // set meta info tests + [NUnit.Framework.Test] + public virtual void SetEventCountingMetaInfoTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + CreatePdfAndSetEventCountingMetaInfo(tesseractReader, outPdfFile, imgFile, new Tesseract4MetaInfoEventHandlingTest.TestMetaInfo + ()); + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], ocrUsageEvent); + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetPdfOcrEvent() }); + ValidatePdfProducerLine(outPdfFile.FullName, 
expectedProdLine); + } + + [NUnit.Framework.Test] + public virtual void CreatePdfFileTestMetaInfoTest() { + FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg"); + FileInfo outPdfFile = FileUtil.CreateTempFile("test", ".pdf"); + CreatePdfFileAndSetMetaInfoToProps(tesseractReader, outPdfFile, imgFile, new Tesseract4MetaInfoEventHandlingTest.TestMetaInfo + ()); + // check ocr events + NUnit.Framework.Assert.AreEqual(3, eventsHandler.GetEvents().Count); + IEvent ocrUsageEvent = eventsHandler.GetEvents()[0]; + ValidateUsageEvent(ocrUsageEvent, EventConfirmationType.ON_CLOSE); + ValidateStatisticEvent(eventsHandler.GetEvents()[1], PdfOcrOutputType.PDF); + ValidateConfirmEvent(eventsHandler.GetEvents()[2], ocrUsageEvent); + // check producer line in the output pdf + String expectedProdLine = CreateExpectedProducerLine(new ConfirmedEventWrapper[] { GetPdfOcrEvent() }); + ValidatePdfProducerLine(outPdfFile.FullName, expectedProdLine); + } + + private class TestMetaInfo : IMetaInfo { + } + } +} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperLibTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperLibTest.cs index c01f347..b5717ed 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperLibTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperLibTest.cs @@ -23,7 +23,7 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Pdfocr; namespace iText.Pdfocr.Tesseract4 { diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs index 978c5f6..33b0cf0 100644 --- 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs @@ -24,13 +24,14 @@ You should have received a copy of the GNU Affero General Public License using System.Collections.Generic; using System.IO; using System.Text; -using Common.Logging; -using iText.IO.Util; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.Pdfocr; namespace iText.Pdfocr.Tesseract4 { public abstract class TesseractHelperTest : IntegrationTestHelper { - private static readonly ILog LOGGER = LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.TesseractHelperTest + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.TesseractHelperTest )); internal AbstractTesseract4OcrEngine tesseractReader; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractOcrUtilTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractOcrUtilTest.cs index 994307b..92f33d1 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractOcrUtilTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractOcrUtilTest.cs @@ -25,6 +25,7 @@ You should have received a copy of the GNU Affero General Public License using Tesseract; using iText.IO.Image; using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; namespace iText.Pdfocr.Tesseract4 { diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs index b8c98a0..fc022d0 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs @@ -23,8 +23,9 @@ You should have received a 
copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using iText.IO.Util; +using iText.Commons.Utils; using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Exceptions; namespace iText.Pdfocr.Tesseract4 { public abstract class UserWordsTest : IntegrationTestHelper { @@ -76,7 +77,7 @@ public virtual void TestCustomUserWordsWithListOfLanguages() { tesseractReader.SetTesseract4OcrEngineProperties(properties); String result = GetRecognizedTextFromTextFile(tesseractReader, imgPath); result = result.Replace("\n", "").Replace("\f", ""); - result = iText.IO.Util.StringUtil.ReplaceAll(result, "[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); + result = iText.Commons.Utils.StringUtil.ReplaceAll(result, "[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); NUnit.Framework.Assert.IsTrue(result.StartsWith(expectedOutput)); NUnit.Framework.Assert.IsTrue(tesseractReader.GetTesseract4OcrEngineProperties().GetPathToUserWordsFile(). EndsWith(".user-words")); @@ -90,7 +91,7 @@ public virtual void TestUserWordsWithLanguageNotInList() { properties.SetUserWords("spa", new FileStream(userWords, FileMode.Open, FileAccess.Read)); properties.SetLanguages(new List()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST, "spa"))) + , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST, "spa"))) ; } @@ -101,7 +102,7 @@ public virtual void TestIncorrectLanguageForUserWordsAsList() { properties.SetUserWords("eng1", JavaUtil.ArraysAsList("word1", "word2")); properties.SetLanguages(new List()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST, "eng1"))) + , 
NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST, "eng1"))) ; } diff --git a/itext/itext.pdfocr.api/PdfOcrExtensions.cs b/itext/itext.pdfocr.api/PdfOcrExtensions.cs index 1bdaacb..c0a1166 100644 --- a/itext/itext.pdfocr.api/PdfOcrExtensions.cs +++ b/itext/itext.pdfocr.api/PdfOcrExtensions.cs @@ -50,4 +50,8 @@ public static TValue Put(this IDictionary col, TKey return oldVal; } + public static bool IsEmpty(this ICollection> collection) { + return collection.Count == 0; + } + } diff --git a/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs b/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs index 8bbd834..f387a9b 100644 --- a/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs +++ b/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs @@ -14,9 +14,9 @@ [assembly: Guid("0c4ceb00-9a56-4547-a925-5974a85a6048")] -[assembly: AssemblyVersion("1.0.3.0")] -[assembly: AssemblyFileVersion("1.0.3.0")] -[assembly: AssemblyInformationalVersion("1.0.3")] +[assembly: AssemblyVersion("2.0.0.0")] +[assembly: AssemblyFileVersion("2.0.0.0")] +[assembly: AssemblyInformationalVersion("2.0.0")] [assembly: InternalsVisibleTo("itext.pdfocr.api.tests, PublicKey=" + "00240000048000009400000006020000002400005253413100040000010001008b21ed5b3fc1c1" + "1996390981fe22bbe71a39a9e11d3c2cefddd6ee92920fa871f9666ae0fa941af0280d0653df04" + diff --git a/itext/itext.pdfocr.api/itext.pdfocr.api.csproj b/itext/itext.pdfocr.api/itext.pdfocr.api.csproj index e46c9c4..00f0717 100644 --- a/itext/itext.pdfocr.api/itext.pdfocr.api.csproj +++ b/itext/itext.pdfocr.api/itext.pdfocr.api.csproj @@ -13,7 +13,7 @@ - net45 + net461 CS1591;CS1570;CS1572;CS1573;CS1574;CS1580;CS1584;CS1658 @@ -30,7 +30,7 @@ - + diff --git a/itext/itext.pdfocr.api/itext/pdfocr/AbstractPdfOcrEventHelper.cs b/itext/itext.pdfocr.api/itext/pdfocr/AbstractPdfOcrEventHelper.cs new file mode 100644 index 0000000..6d47763 --- /dev/null +++ 
b/itext/itext.pdfocr.api/itext/pdfocr/AbstractPdfOcrEventHelper.cs @@ -0,0 +1,43 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Sequence; + +namespace iText.Pdfocr { + /// Helper class for working with events. + /// Helper class for working with events. This class is for internal usage. + public abstract class AbstractPdfOcrEventHelper : AbstractITextEvent { + /// Handles the event. + /// event + public abstract void OnEvent(AbstractProductITextEvent @event); + + /// Returns the sequence id + /// sequence id + public abstract SequenceId GetSequenceId(); + + /// Returns the confirmation type of event. 
+ /// event confirmation type + public abstract EventConfirmationType GetConfirmationType(); + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs b/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs index 751f7bc..5efdc43 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs @@ -63,6 +63,31 @@ public interface IOcrEngine { /// IDictionary> DoImageOcr(FileInfo input); + /// + /// Reads data from the provided input image file and returns retrieved data + /// in the format described below. + /// + /// + /// input image + /// + /// + /// ocr processing context + /// + /// + /// + /// where key is + /// + /// representing the number of the page and value is + /// + /// of + /// + /// elements where each + /// + /// element contains a word or a line and its 4 + /// coordinates(bbox) + /// + IDictionary> DoImageOcr(FileInfo input, OcrProcessContext ocrProcessContext); + /// /// Performs OCR using provided /// @@ -84,5 +109,28 @@ public interface IOcrEngine { /// /// file to be created void CreateTxtFile(IList inputImages, FileInfo txtFile); + + /// + /// Performs OCR using provided + /// + /// for the given list of + /// input images and saves output to a text file using provided path. + /// + /// + /// Performs OCR using provided + /// + /// for the given list of + /// input images and saves output to a text file using provided path. 
+ /// Note that a human reading order is not guaranteed + /// due to possible specifics of input images (multi column layout, tables etc) + /// + /// + /// + /// + /// of images to be OCRed + /// + /// file to be created + /// ocr processing context + void CreateTxtFile(IList inputImages, FileInfo txtFile, OcrProcessContext ocrProcessContext); } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/metainfo/TestMetaInfo.cs b/itext/itext.pdfocr.api/itext/pdfocr/IProductAware.cs similarity index 63% rename from itext.tests/itext.pdfocr.api.tests/itext/metainfo/TestMetaInfo.cs rename to itext/itext.pdfocr.api/itext/pdfocr/IProductAware.cs index 36601e1..b5e6864 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/metainfo/TestMetaInfo.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/IProductAware.cs @@ -20,15 +20,17 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ -using iText.Kernel.Counter.Event; +using iText.Commons.Actions.Data; -namespace iText.Metainfo { - /// This class is used for test purposes. - /// - /// This class is used for test purposes. - /// Please be aware that it's put in the com.itextpdf.metainfo deliberately, - /// so that it belongs neither to com.itextpdf.pdfocr nor com.itextpdf.pdfocr.tesseract4 packages - /// - public class TestMetaInfo : IMetaInfo { +namespace iText.Pdfocr { + /// The interface that holds information about product data and meta info. + public interface IProductAware { + /// Gets the container with meta info. + /// the held meta info container + PdfOcrMetaInfoContainer GetMetaInfoContainer(); + + /// Gets object containing information about the product. 
+ /// product data + ProductData GetProductData(); } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrEngineProperties.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrEngineProperties.cs index 090394b..de0cf6d 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrEngineProperties.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrEngineProperties.cs @@ -22,7 +22,7 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.Collections.Generic; -using iText.IO.Util; +using iText.Commons.Utils; namespace iText.Pdfocr { public class OcrEngineProperties { diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs index 9ca32e8..56c4b92 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs @@ -23,11 +23,14 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using Common.Logging; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Actions; +using iText.Commons.Actions.Sequence; +using iText.Commons.Utils; using iText.IO.Font.Otf; using iText.IO.Image; -using iText.IO.Util; -using iText.Kernel.Counter.Event; +using iText.Kernel.Actions.Events; using iText.Kernel.Font; using iText.Kernel.Geom; using iText.Kernel.Pdf; @@ -38,7 +41,9 @@ You should have received a copy of the GNU Affero General Public License using iText.Layout.Font; using iText.Layout.Properties; using iText.Pdfa; -using iText.Pdfocr.Events; +using iText.Pdfocr.Exceptions; +using iText.Pdfocr.Logs; +using iText.Pdfocr.Statistics; namespace iText.Pdfocr { /// @@ -67,16 +72,7 @@ namespace iText.Pdfocr { /// public class OcrPdfCreator { /// The logger. - private static readonly ILog LOGGER = LogManager.GetLogger(typeof(iText.Pdfocr.OcrPdfCreator)); - - /// Indices in array representing bbox. 
- private const int LEFT_IDX = 0; - - private const int TOP_IDX = 1; - - private const int RIGHT_IDX = 2; - - private const int BOTTOM_IDX = 3; + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.OcrPdfCreator)); /// /// Selected @@ -151,6 +147,8 @@ public void SetOcrPdfCreatorProperties(OcrPdfCreatorProperties ocrPdfCreatorProp /// and /// creates PDF using provided /// + /// , + /// /// and /// . /// @@ -160,12 +158,23 @@ public void SetOcrPdfCreatorProperties(OcrPdfCreatorProperties ocrPdfCreatorProp /// and /// creates PDF using provided /// + /// , + /// /// and - /// . - /// PDF/A-3u document will be created if + /// + /// . PDF/A-3u document will be created if /// provided /// /// is not null. + /// + /// NOTE that after executing this method you will have a product event from + /// the both itextcore and pdfOcr. Therefore, use this method only if you need to work + /// with the generated + /// + /// . If you don't need this, use the + /// + /// method. In this case, only the pdfOcr event will be dispatched. /// /// /// @@ -178,6 +187,7 @@ public void SetOcrPdfCreatorProperties(OcrPdfCreatorProperties ocrPdfCreatorProp /// object /// to write final PDF document to /// + /// document properties /// /// /// @@ -188,34 +198,127 @@ public void SetOcrPdfCreatorProperties(OcrPdfCreatorProperties ocrPdfCreatorProp /// /// object /// - public PdfDocument CreatePdfA(IList inputImages, PdfWriter pdfWriter, PdfOutputIntent pdfOutputIntent - ) { - LOGGER.Info(MessageFormatUtil.Format(PdfOcrLogMessageConstant.START_OCR_FOR_IMAGES, inputImages.Count)); - IMetaInfo storedMetaInfo = null; - if (ocrEngine is IThreadLocalMetaInfoAware) { - storedMetaInfo = ((IThreadLocalMetaInfoAware)ocrEngine).GetThreadLocalMetaInfo(); - ((IThreadLocalMetaInfoAware)ocrEngine).SetThreadLocalMetaInfo(new OcrPdfCreatorMetaInfo(((IThreadLocalMetaInfoAware - )ocrEngine).GetThreadLocalMetaInfo(), Guid.NewGuid(), null != pdfOutputIntent ? 
OcrPdfCreatorMetaInfo.PdfDocumentType - .PDFA : OcrPdfCreatorMetaInfo.PdfDocumentType.PDF)); - } + public PdfDocument CreatePdfA(IList inputImages, PdfWriter pdfWriter, DocumentProperties documentProperties + , PdfOutputIntent pdfOutputIntent) { + LOGGER.LogInformation(MessageFormatUtil.Format(PdfOcrLogMessageConstant.START_OCR_FOR_IMAGES, inputImages. + Count)); + // create event helper + SequenceId pdfSequenceId = new SequenceId(); + OcrPdfCreatorEventHelper ocrEventHelper = new OcrPdfCreatorEventHelper(pdfSequenceId, ocrPdfCreatorProperties + .GetMetaInfo()); + OcrProcessContext ocrProcessContext = new OcrProcessContext(ocrEventHelper); // map contains: // keys: image files // values: // map pageNumber -> retrieved text data(text and its coordinates) IDictionary>> imagesTextData = new LinkedDictionary>>(); - try { - foreach (FileInfo inputImage in inputImages) { - imagesTextData.Put(inputImage, ocrEngine.DoImageOcr(inputImage)); - } - } - finally { - if (ocrEngine is IThreadLocalMetaInfoAware) { - ((IThreadLocalMetaInfoAware)ocrEngine).SetThreadLocalMetaInfo(storedMetaInfo); - } + foreach (FileInfo inputImage in inputImages) { + imagesTextData.Put(inputImage, ocrEngine.DoImageOcr(inputImage, ocrProcessContext)); } // create PdfDocument - return CreatePdfDocument(pdfWriter, pdfOutputIntent, imagesTextData); + return CreatePdfDocument(pdfWriter, pdfOutputIntent, imagesTextData, pdfSequenceId, documentProperties); + } + + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// + /// and + /// . + /// + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// + /// and + /// + /// . PDF/A-3u document will be created if + /// provided + /// + /// is not null. + /// + /// NOTE that after executing this method you will have a product event from + /// the both itextcore and pdfOcr. 
Therefore, use this method only if you need to work + /// with the generated + /// + /// . If you don't need this, use the + /// + /// method. In this case, only the pdfOcr event will be dispatched. + /// + /// + /// + /// + /// of images to be OCRed + /// + /// + /// the + /// + /// object + /// to write final PDF document to + /// + /// + /// + /// + /// for PDF/A-3u document + /// + /// + /// result PDF/A-3u + /// + /// object + /// + public PdfDocument CreatePdfA(IList inputImages, PdfWriter pdfWriter, PdfOutputIntent pdfOutputIntent + ) { + return CreatePdfA(inputImages, pdfWriter, new DocumentProperties(), pdfOutputIntent); + } + + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// . + /// + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// . + /// + /// NOTE that after executing this method you will have a product event from + /// the both itextcore and pdfOcr. Therefore, use this method only if you need to work + /// with the generated + /// + /// . If you don't need this, use the + /// + /// method. In this case, only the pdfOcr event will be dispatched. + /// + /// + /// + /// + /// of images to be OCRed + /// + /// + /// the + /// + /// object + /// to write final PDF document to + /// + /// document properties + /// + /// result + /// + /// object + /// + public PdfDocument CreatePdf(IList inputImages, PdfWriter pdfWriter, DocumentProperties documentProperties + ) { + return CreatePdfA(inputImages, pdfWriter, documentProperties, null); } /// @@ -225,6 +328,21 @@ public PdfDocument CreatePdfA(IList inputImages, PdfWriter pdfWriter, /// creates PDF using provided /// . /// + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// . + /// + /// NOTE that after executing this method you will have a product event from + /// the both itextcore and pdfOcr. 
Therefore, use this method only if you need to work + /// with the generated + /// + /// . If you don't need this, use the + /// + /// method. In this case, only the pdfOcr event will be dispatched. + /// /// /// /// @@ -242,7 +360,82 @@ public PdfDocument CreatePdfA(IList inputImages, PdfWriter pdfWriter, /// object /// public PdfDocument CreatePdf(IList inputImages, PdfWriter pdfWriter) { - return CreatePdfA(inputImages, pdfWriter, null); + return CreatePdfA(inputImages, pdfWriter, new DocumentProperties(), null); + } + + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// . + /// + /// + /// + /// + /// of images to be OCRed + /// + /// + /// the + /// + /// object to write final PDF document to + /// + public virtual void CreatePdfFile(IList inputImages, FileInfo outPdfFile) { + CreatePdfAFile(inputImages, outPdfFile, null); + } + + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// + /// and + /// . + /// + /// + /// Performs OCR with set parameters using provided + /// + /// and + /// creates PDF using provided + /// + /// and + /// . + /// PDF/A-3u document will be created if provided + /// + /// is not null. 
+ /// + /// + /// + /// + /// of images to be OCRed + /// + /// + /// the + /// + /// object to write final PDF document to + /// + /// + /// + /// + /// for PDF/A-3u document + /// + public virtual void CreatePdfAFile(IList inputImages, FileInfo outPdfFile, PdfOutputIntent pdfOutputIntent + ) { + DocumentProperties documentProperties = new DocumentProperties(); + if (ocrPdfCreatorProperties.GetMetaInfo() != null) { + documentProperties.SetEventCountingMetaInfo(ocrPdfCreatorProperties.GetMetaInfo()); + } + else { + if (ocrEngine is IProductAware) { + documentProperties.SetEventCountingMetaInfo(((IProductAware)ocrEngine).GetMetaInfoContainer().GetMetaInfo( + )); + } + } + using (PdfWriter pdfWriter = new PdfWriter(outPdfFile.FullName)) { + PdfDocument pdfDocument = CreatePdfA(inputImages, pdfWriter, documentProperties, pdfOutputIntent); + pdfDocument.Close(); + } } /// @@ -319,59 +512,39 @@ private void AddToCanvas(PdfDocument pdfDocument, Rectangle imageSize, IList - /// Creates a new PDF document using provided properties, adds images with - /// recognized text. 
- /// - /// - /// the - /// - /// object - /// to write final PDF document to - /// - /// - /// - /// - /// for PDF/A-3u document - /// - /// - /// map that contains input image files as keys, - /// and as value: map pageNumber -> text for the page - /// - /// - /// result - /// - /// object - /// private PdfDocument CreatePdfDocument(PdfWriter pdfWriter, PdfOutputIntent pdfOutputIntent, IDictionary>> imagesTextData) { + , IDictionary>> imagesTextData, SequenceId pdfSequenceId, DocumentProperties documentProperties + ) { PdfDocument pdfDocument; bool createPdfA3u = pdfOutputIntent != null; if (createPdfA3u) { - pdfDocument = new PdfADocument(pdfWriter, PdfAConformanceLevel.PDF_A_3U, pdfOutputIntent, new DocumentProperties - ().SetEventCountingMetaInfo(new PdfOcrMetaInfo())); + pdfDocument = new PdfADocument(pdfWriter, PdfAConformanceLevel.PDF_A_3U, pdfOutputIntent, documentProperties + ); } else { - pdfDocument = new PdfDocument(pdfWriter, new DocumentProperties().SetEventCountingMetaInfo(new PdfOcrMetaInfo - ())); + pdfDocument = new PdfDocument(pdfWriter, documentProperties); } + LinkDocumentIdEvent linkDocumentIdEvent = new LinkDocumentIdEvent(pdfDocument, pdfSequenceId); + EventManager.GetInstance().OnEvent(linkDocumentIdEvent); // pdfLang should be set in PDF/A mode bool hasPdfLangProperty = ocrPdfCreatorProperties.GetPdfLang() != null && !ocrPdfCreatorProperties.GetPdfLang ().Equals(""); if (createPdfA3u && !hasPdfLangProperty) { - LOGGER.Error(MessageFormatUtil.Format(OcrException.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET - )); - throw new OcrException(OcrException.CANNOT_CREATE_PDF_DOCUMENT).SetMessageParams(PdfOcrLogMessageConstant. 
- PDF_LANGUAGE_PROPERTY_IS_NOT_SET); + LOGGER.LogError(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrLogMessageConstant + .PDF_LANGUAGE_PROPERTY_IS_NOT_SET)); + throw new PdfOcrException(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT).SetMessageParams(PdfOcrLogMessageConstant + .PDF_LANGUAGE_PROPERTY_IS_NOT_SET); } // add metadata if (hasPdfLangProperty) { @@ -386,6 +559,13 @@ private PdfDocument CreatePdfDocument(PdfWriter pdfWriter, PdfOutputIntent pdfOu // reset passed font provider ocrPdfCreatorProperties.GetFontProvider().Reset(); AddDataToPdfDocument(imagesTextData, pdfDocument, createPdfA3u); + // statistics event about type of created pdf + if (ocrEngine is IProductAware && ((IProductAware)ocrEngine).GetProductData() != null) { + PdfOcrOutputType eventType = createPdfA3u ? PdfOcrOutputType.PDFA : PdfOcrOutputType.PDF; + PdfOcrOutputTypeStatisticsEvent docTypeStatisticsEvent = new PdfOcrOutputTypeStatisticsEvent(eventType, (( + IProductAware)ocrEngine).GetProductData()); + EventManager.GetInstance().OnEvent(docTypeStatisticsEvent); + } return pdfDocument; } @@ -406,8 +586,8 @@ private void AddDataToPdfDocument(IDictionary imageDataList = PdfCreatorUtil.GetImageData(inputImage, ocrPdfCreatorProperties.GetImageRotationHandler ()); - LOGGER.Info(MessageFormatUtil.Format(PdfOcrLogMessageConstant.NUMBER_OF_PAGES_IN_IMAGE, inputImage.ToString - (), imageDataList.Count)); + LOGGER.LogInformation(MessageFormatUtil.Format(PdfOcrLogMessageConstant.NUMBER_OF_PAGES_IN_IMAGE, inputImage + .ToString(), imageDataList.Count)); IDictionary> imageTextData = entry.Value; if (imageTextData.Keys.Count > 0) { for (int page = 0; page < imageDataList.Count; ++page) { @@ -435,14 +615,14 @@ private void AddDataToPdfDocument(IDictionaryGet left bound of text chunk.
private static float GetLeft(TextInfo textInfo, float multiplier) { - if (textInfo.GetBboxRect() == null) { - return textInfo.GetBbox()[LEFT_IDX] * multiplier; - } - else { - return textInfo.GetBboxRect().GetLeft() * multiplier; - } + return textInfo.GetBboxRect().GetLeft() * multiplier; } /// Get right bound of text chunk. private static float GetRight(TextInfo textInfo, float multiplier) { - if (textInfo.GetBboxRect() == null) { - return (textInfo.GetBbox()[RIGHT_IDX] + 1) * multiplier - 1; - } - else { - return (textInfo.GetBboxRect().GetRight() + 1) * multiplier - 1; - } + return (textInfo.GetBboxRect().GetRight() + 1) * multiplier - 1; } /// Get top bound of text chunk. private static float GetTop(TextInfo textInfo, float multiplier) { - if (textInfo.GetBboxRect() == null) { - return textInfo.GetBbox()[TOP_IDX] * multiplier; - } - else { - return textInfo.GetBboxRect().GetTop() * multiplier; - } + return textInfo.GetBboxRect().GetTop() * multiplier; } /// Get bottom bound of text chunk. private static float GetBottom(TextInfo textInfo, float multiplier) { - if (textInfo.GetBboxRect() == null) { - return (textInfo.GetBbox()[BOTTOM_IDX] + 1) * multiplier - 1; - } - else { - return (textInfo.GetBboxRect().GetBottom() + 1) * multiplier - 1; - } + return (textInfo.GetBboxRect().GetBottom() + 1) * multiplier - 1; } /// Check if line is not empty. 
@@ -642,7 +802,7 @@ public override PdfCanvas ShowText(GlyphLine text) { if (this.createPdfA3u) { // exception is thrown only if PDF/A document is // being created - throw new OcrException(message); + throw new PdfOcrException(message); } // setting actual text to NotDef glyph glyphLine.SetActualTextToGlyph(i, glyphLine.ToUnicodeString(i, i + 1)); @@ -656,7 +816,7 @@ public override PdfCanvas ShowText(GlyphLine text) { } // Warning is logged if not PDF/A document is being created if (notDefGlyphsExists) { - LOGGER.Warn(message); + LOGGER.LogWarning(message); } return this.ShowText(glyphLine, new ActualTextIterator(glyphLine)); } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs new file mode 100644 index 0000000..114d4f0 --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs @@ -0,0 +1,61 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Sequence; +using iText.Pdfocr.Statistics; + +namespace iText.Pdfocr { + internal class OcrPdfCreatorEventHelper : AbstractPdfOcrEventHelper { + private readonly SequenceId sequenceId; + + private readonly IMetaInfo metaInfo; + + internal OcrPdfCreatorEventHelper(SequenceId sequenceId, IMetaInfo metaInfo) { + this.sequenceId = sequenceId; + this.metaInfo = metaInfo; + } + + public override void OnEvent(AbstractProductITextEvent @event) { + if (@event is AbstractContextBasedITextEvent) { + ((AbstractContextBasedITextEvent)@event).SetMetaInfo(this.metaInfo); + } + else { + if (@event is PdfOcrOutputTypeStatisticsEvent) { + // do nothing: output type statistics events are not forwarded by this helper + return; + } + } + EventManager.GetInstance().OnEvent(@event); + } + + public override SequenceId GetSequenceId() { + return sequenceId; + } + + public override EventConfirmationType GetConfirmationType() { + return EventConfirmationType.ON_CLOSE; + } + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorMetaInfo.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorMetaInfo.cs deleted file mode 100644 index 4a539da..0000000 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorMetaInfo.cs +++ /dev/null @@ -1,68 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version.
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . -*/ -using System; -using iText.Kernel.Counter.Event; - -namespace iText.Pdfocr { - /// The meta info that is used internally by pdfOcr to pass a wrapped custom meta data - public class OcrPdfCreatorMetaInfo : IMetaInfo, IMetaInfoWrapper { - private IMetaInfo wrappedMetaInfo; - - private Guid uuid; - - private OcrPdfCreatorMetaInfo.PdfDocumentType pdfDocumentType; - - /// Creates an inner meta info wrapper - /// the meta info to be wrapped - /// a unique String which corresponds to the ocr event for which this meta info is passed - /// a type of the document which is created during the corresponding ocr event - public OcrPdfCreatorMetaInfo(IMetaInfo wrappedMetaInfo, Guid uuid, OcrPdfCreatorMetaInfo.PdfDocumentType pdfDocumentType - ) { - this.wrappedMetaInfo = wrappedMetaInfo; - this.uuid = uuid; - this.pdfDocumentType = pdfDocumentType; - } - - /// Gets the unique String which corresponds to the ocr event for which this meta info is passed - /// the unique String which corresponds to the ocr event for which this meta info is passed - public virtual Guid GetDocumentId() { - return uuid; - } - - /// Gets the type of the document which is created during the corresponding ocr event - /// the type of the document which is created during the corresponding ocr event - public virtual OcrPdfCreatorMetaInfo.PdfDocumentType GetPdfDocumentType() { - return pdfDocumentType; - } - - public virtual IMetaInfo GetWrappedMetaInfo() { - return wrappedMetaInfo; - } - - /// The enum which represents types of documents, for which pdfOcr sends different events - public enum PdfDocumentType { - PDF, - PDFA - } - } -} diff --git 
a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs index fdb254f..d1b8610 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs @@ -21,6 +21,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ using System; +using iText.Commons.Actions.Contexts; using iText.Kernel.Colors; using iText.Kernel.Geom; using iText.Layout.Font; @@ -113,6 +114,8 @@ public class OcrPdfCreatorProperties { /// private IImageRotationHandler imageRotationHandler; + private IMetaInfo metaInfo; + /// /// Creates a new /// @@ -465,5 +468,25 @@ public virtual iText.Pdfocr.OcrPdfCreatorProperties SetImageRotationHandler(IIma this.imageRotationHandler = imageRotationDetector; return this; } + + /// + /// Set meta info for this + /// . + /// + /// meta info + /// + /// the instance of the current + /// + /// + public virtual iText.Pdfocr.OcrPdfCreatorProperties SetMetaInfo(IMetaInfo metaInfo) { + this.metaInfo = metaInfo; + return this; + } + + /// Returns meta info + /// meta info + internal virtual IMetaInfo GetMetaInfo() { + return metaInfo; + } } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs new file mode 100644 index 0000000..ec65907 --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs @@ -0,0 +1,49 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +namespace iText.Pdfocr { + /// Class for storing ocr processing context. + public class OcrProcessContext { + private AbstractPdfOcrEventHelper ocrEventHelper; + + /// Creates an instance of ocr process context + /// helper class for working with events + public OcrProcessContext(AbstractPdfOcrEventHelper eventHelper) { + this.ocrEventHelper = eventHelper; + } + + /// Returns helper for working with events. + /// + /// an instance of + /// + /// + public virtual AbstractPdfOcrEventHelper GetOcrEventHelper() { + return ocrEventHelper; + } + + /// Sets ocr event helper. 
+ /// event helper + public virtual void SetOcrEventHelper(AbstractPdfOcrEventHelper eventHelper) { + this.ocrEventHelper = eventHelper; + } + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs b/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs index dbc2704..bcf8279 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs @@ -23,15 +23,18 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using Common.Logging; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.IO.Image; using iText.IO.Source; -using iText.IO.Util; using iText.Kernel.Geom; using iText.Layout; using iText.Layout.Element; using iText.Layout.Layout; using iText.Layout.Renderer; +using iText.Pdfocr.Exceptions; +using iText.Pdfocr.Logs; namespace iText.Pdfocr { internal class PdfCreatorUtil { @@ -42,7 +45,7 @@ internal class PdfCreatorUtil { private const float POINTS_PER_INCH = 72.0f; /// The logger. 
- private static readonly ILog LOGGER = LogManager.GetLogger(typeof(PdfCreatorUtil)); + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(PdfCreatorUtil)); /// /// Calculates font size according to given bbox height, width and selected @@ -83,8 +86,8 @@ internal static float CalculateFontSize(Document document, String line, String f } } catch (InvalidOperationException e) { - LOGGER.Error(PdfOcrLogMessageConstant.PROVIDED_FONT_PROVIDER_IS_INVALID); - throw new OcrException(OcrException.CANNOT_RESOLVE_PROVIDED_FONTS, e); + LOGGER.LogError(PdfOcrLogMessageConstant.PROVIDED_FONT_PROVIDER_IS_INVALID); + throw new PdfOcrInputException(PdfOcrExceptionMessageConstant.CANNOT_RESOLVE_PROVIDED_FONTS, e); } return fontSize; } @@ -176,12 +179,12 @@ internal static IList GetImageData(FileInfo inputImage, IImageRotatio } } catch (System.IO.IOException e) { - LOGGER.Error(MessageFormatUtil.Format(PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); - throw new OcrException(OcrException.CANNOT_READ_INPUT_IMAGE, e); + LOGGER.LogError(MessageFormatUtil.Format(PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); + throw new PdfOcrInputException(PdfOcrExceptionMessageConstant.CANNOT_READ_INPUT_IMAGE, e); } - catch (iText.IO.IOException e) { - LOGGER.Error(MessageFormatUtil.Format(PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); - throw new OcrException(OcrException.CANNOT_READ_INPUT_IMAGE, e); + catch (iText.IO.Exceptions.IOException e) { + LOGGER.LogError(MessageFormatUtil.Format(PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); + throw new PdfOcrInputException(PdfOcrExceptionMessageConstant.CANNOT_READ_INPUT_IMAGE, e); } return images; } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrFontProvider.cs b/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrFontProvider.cs index 2dd729b..ffe336e 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrFontProvider.cs +++ 
b/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrFontProvider.cs @@ -22,10 +22,13 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; -using Common.Logging; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.IO.Font; using iText.IO.Util; using iText.Layout.Font; +using iText.Pdfocr.Logs; namespace iText.Pdfocr { public class PdfOcrFontProvider : FontProvider { @@ -70,7 +73,7 @@ private byte[] GetDefaultFont() { } } catch (System.IO.IOException e) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(PdfOcrLogMessageConstant.CANNOT_READ_DEFAULT_FONT + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(PdfOcrLogMessageConstant.CANNOT_READ_DEFAULT_FONT , e.Message)); return new byte[0]; } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs b/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs new file mode 100644 index 0000000..750cdb9 --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs @@ -0,0 +1,40 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. 
+ +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Commons.Actions.Contexts; + +namespace iText.Pdfocr { + /// Container to keep meta info. + public class PdfOcrMetaInfoContainer { + private readonly IMetaInfo metaInfo; + + /// Creates instance of container to keep passed meta info. + /// meta info + public PdfOcrMetaInfoContainer(IMetaInfo metaInfo) { + this.metaInfo = metaInfo; + } + + internal virtual IMetaInfo GetMetaInfo() { + return metaInfo; + } + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/TextInfo.cs b/itext/itext.pdfocr.api/itext/pdfocr/TextInfo.cs index 1d0a49b..5e57fab 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/TextInfo.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/TextInfo.cs @@ -21,8 +21,6 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ using System; -using System.Collections.Generic; -using iText.IO.Util; using iText.Kernel.Geom; namespace iText.Pdfocr { @@ -40,16 +38,6 @@ public class TextInfo { /// private Rectangle bboxRect; - /// Contains 4 float coordinates: bbox parameters. - /// - /// Contains 4 float coordinates: bbox parameters. - /// Alike bboxRect described by - /// - /// coordinates are upper-left based and expressed in pixels. - /// - [System.ObsoleteAttribute(@"since 1.0.1. Use bboxRect instead")] - private IList bbox = JavaCollectionsUtil.EmptyList(); - /// /// Creates a new /// @@ -67,7 +55,6 @@ public TextInfo() { public TextInfo(iText.Pdfocr.TextInfo textInfo) { this.text = textInfo.text; this.bboxRect = new Rectangle(textInfo.bboxRect); - this.bbox = JavaCollectionsUtil.UnmodifiableList(textInfo.bbox); } /// @@ -86,48 +73,6 @@ public TextInfo(String text, Rectangle bbox) { this.bboxRect = new Rectangle(bbox); } - /// - /// Creates a new - /// - /// instance. 
- /// - /// any text - /// - /// - /// - /// of bbox parameters - /// - [System.ObsoleteAttribute(@"since 1.0.1. Use TextInfo(System.String, iText.Kernel.Geom.Rectangle) instead" - )] - public TextInfo(String text, IList bbox) { - this.text = text; - this.bbox = JavaCollectionsUtil.UnmodifiableList(bbox); - } - - /// - /// Creates a new - /// - /// instance. - /// - /// any text - /// - /// - /// - /// describing text bbox - /// - /// - /// - /// - /// of bbox parameters - /// - [System.ObsoleteAttribute(@"since 1.0.1. Use TextInfo(System.String, iText.Kernel.Geom.Rectangle) instead" - )] - public TextInfo(String text, Rectangle bboxRect, IList bbox) { - this.text = text; - this.bboxRect = bboxRect; - this.bbox = JavaCollectionsUtil.UnmodifiableList(bbox); - } - /// Gets text element. /// String public virtual String GetText() { @@ -158,30 +103,6 @@ public virtual Rectangle GetBboxRect() { /// public virtual void SetBboxRect(Rectangle bbox) { this.bboxRect = new Rectangle(bbox); - this.bbox = JavaCollectionsUtil.EmptyList(); - } - - /// Gets bbox coordinates. - /// - /// - /// - /// of bbox parameters - /// - [System.ObsoleteAttribute(@"since 1.0.1. Use GetBboxRect() instead")] - public virtual IList GetBbox() { - return new List(bbox); - } - - /// Sets bbox coordinates. - /// - /// - /// - /// of bbox parameters - /// - [System.ObsoleteAttribute(@"since 1.0.1. Use SetBboxRect(iText.Kernel.Geom.Rectangle) instead")] - public virtual void SetBbox(IList bbox) { - this.bbox = JavaCollectionsUtil.UnmodifiableList(bbox); - this.bboxRect = null; } } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/events/IThreadLocalMetaInfoAware.cs b/itext/itext.pdfocr.api/itext/pdfocr/events/IThreadLocalMetaInfoAware.cs deleted file mode 100644 index 4a851eb..0000000 --- a/itext/itext.pdfocr.api/itext/pdfocr/events/IThreadLocalMetaInfoAware.cs +++ /dev/null @@ -1,43 +0,0 @@ -/* -This file is part of the iText (R) project. 
-Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . -*/ -using iText.Kernel.Counter.Event; - -namespace iText.Pdfocr.Events { - /// - /// The interface which holds a thread local meta info, - /// meaning different threads operate with independent and different meta infos. - /// - public interface IThreadLocalMetaInfoAware { - /// Gets the meta info which is held by the interface. - /// the held thread local meta info - IMetaInfo GetThreadLocalMetaInfo(); - - /// Sets a thread local meta info. 
- /// a thread local meta info to be held - /// - /// this - /// - /// - IThreadLocalMetaInfoAware SetThreadLocalMetaInfo(IMetaInfo metaInfo); - } -} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrException.cs b/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrException.cs similarity index 71% rename from itext/itext.pdfocr.api/itext/pdfocr/OcrException.cs rename to itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrException.cs index 1d4dc9b..6e8a6fe 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrException.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrException.cs @@ -22,20 +22,18 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.Collections.Generic; -using iText.IO.Util; +using iText.Commons.Exceptions; +using iText.Commons.Utils; -namespace iText.Pdfocr { +namespace iText.Pdfocr.Exceptions { /// Exception class for custom exceptions. - public class OcrException : Exception { - public const String CANNOT_READ_INPUT_IMAGE = "Cannot read input image"; - - public const String CANNOT_RESOLVE_PROVIDED_FONTS = "Cannot resolve " + "any of provided fonts. Please check provided FontProvider."; - - public const String CANNOT_CREATE_PDF_DOCUMENT = "Cannot create " + "PDF document: {0}"; - + public class PdfOcrException : ITextException { private IList messageParams; - /// Creates a new OcrException. + /// + /// Creates a new + /// . + /// /// the detail message. /// /// the cause @@ -44,16 +42,34 @@ public class OcrException : Exception { /// /// method). /// - public OcrException(String msg, Exception e) + public PdfOcrException(String msg, Exception e) : base(msg, e) { } - /// Creates a new OcrException. + /// + /// Creates a new + /// . + /// /// the detail message. - public OcrException(String msg) + public PdfOcrException(String msg) : base(msg) { } + /// + /// Creates a new + /// . + /// + /// + /// the cause + /// which is saved for later retrieval + /// by + /// + /// method). 
+ /// + public PdfOcrException(Exception e) + : base(e) { + } + /// public override String Message { get { @@ -74,7 +90,7 @@ protected internal virtual Object[] GetMessageParams() { /// Sets additional params for Exception message. /// additional params. /// object itself. - public virtual iText.Pdfocr.OcrException SetMessageParams(params String[] messageParams) { + public virtual iText.Pdfocr.Exceptions.PdfOcrException SetMessageParams(params String[] messageParams) { this.messageParams = JavaUtil.ArraysAsList(messageParams); return this; } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrExceptionMessageConstant.cs b/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrExceptionMessageConstant.cs new file mode 100644 index 0000000..e6f883d --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrExceptionMessageConstant.cs @@ -0,0 +1,41 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using System; + +namespace iText.Pdfocr.Exceptions { + public class PdfOcrExceptionMessageConstant { + public const String CANNOT_READ_INPUT_IMAGE = "Cannot read input image"; + + public const String CANNOT_RESOLVE_PROVIDED_FONTS = "Cannot resolve any of provided fonts. Please check provided FontProvider."; + + public const String CANNOT_CREATE_PDF_DOCUMENT = "Cannot create PDF document: {0}"; + + public const String STATISTICS_EVENT_TYPE_CANT_BE_NULL = "Statistics event type can't be null"; + + public const String STATISTICS_EVENT_TYPE_IS_NOT_DETECTED = "Statistics event type is not detected."; + + private PdfOcrExceptionMessageConstant() { + } + //Private constructor will prevent the instantiation of this class directly + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrInputException.cs b/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrInputException.cs new file mode 100644 index 0000000..871f802 --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/exceptions/PdfOcrInputException.cs @@ -0,0 +1,68 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using System; + +namespace iText.Pdfocr.Exceptions { + /// Exception class for input related exceptions. + public class PdfOcrInputException : PdfOcrException { + /// + /// Creates a new + /// . + /// + /// the detail message. + /// + /// the cause + /// (which is saved for later retrieval + /// by + /// + /// method). + /// + public PdfOcrInputException(String msg, Exception e) + : base(msg, e) { + } + + /// + /// Creates a new + /// . + /// + /// the detail message. + public PdfOcrInputException(String msg) + : base(msg) { + } + + /// + /// Creates a new + /// . + /// + /// + /// the cause + /// which is saved for later retrieval + /// by + /// + /// method). + /// + public PdfOcrInputException(Exception e) + : base(e) { + } + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrLogMessageConstant.cs b/itext/itext.pdfocr.api/itext/pdfocr/logs/PdfOcrLogMessageConstant.cs similarity index 90% rename from itext/itext.pdfocr.api/itext/pdfocr/PdfOcrLogMessageConstant.cs rename to itext/itext.pdfocr.api/itext/pdfocr/logs/PdfOcrLogMessageConstant.cs index c01cde3..7b9bc01 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrLogMessageConstant.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/logs/PdfOcrLogMessageConstant.cs @@ -22,12 +22,11 @@ You should have received a copy of the GNU Affero General Public License */ using System; -namespace iText.Pdfocr { +namespace iText.Pdfocr.Logs { public class PdfOcrLogMessageConstant { public const String CANNOT_READ_INPUT_IMAGE = "Cannot read input image {0}"; - public const String PROVIDED_FONT_PROVIDER_IS_INVALID = "Provided FontProvider is invalid. Please check that it contains " - + "valid fonts and default font family name."; + public const String PROVIDED_FONT_PROVIDER_IS_INVALID = "Provided FontProvider is invalid. 
Please check that it contains valid fonts and default font family name."; public const String CANNOT_READ_DEFAULT_FONT = "Cannot default read font: {0}"; @@ -37,12 +36,12 @@ public class PdfOcrLogMessageConstant { public const String NUMBER_OF_PAGES_IN_IMAGE = "Image {0} contains {1} page(s)"; - public const String COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER = "Could not find a glyph corresponding to Unicode character {0} " - + "in any of the fonts"; + public const String COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER = "Could not find a glyph corresponding to Unicode character {0} in any of the fonts"; public const String PDF_LANGUAGE_PROPERTY_IS_NOT_SET = "PDF language property is not set"; private PdfOcrLogMessageConstant() { } + //Private constructor will prevent the instantiation of this class directly } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/metainfo/TestMetaInfo.cs b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputType.cs similarity index 68% rename from itext.tests/itext.pdfocr.tesseract4.tests/itext/metainfo/TestMetaInfo.cs rename to itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputType.cs index 36601e1..0a49a8c 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/metainfo/TestMetaInfo.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputType.cs @@ -20,15 +20,14 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ -using iText.Kernel.Counter.Event; - -namespace iText.Metainfo { - /// This class is used for test purposes. - /// - /// This class is used for test purposes. 
- /// Please be aware that it's put in the com.itextpdf.metainfo deliberately, - /// so that it belongs neither to com.itextpdf.pdfocr nor com.itextpdf.pdfocr.tesseract4 packages - /// - public class TestMetaInfo : IMetaInfo { +namespace iText.Pdfocr.Statistics { + /// pdfOcr output types for statistics. + public enum PdfOcrOutputType { + /// Processing of an image in the engine with data output + DATA, + /// Creating a PDF file + PDF, + /// Creating a PDF-A file + PDFA } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs new file mode 100644 index 0000000..10779a0 --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs @@ -0,0 +1,114 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
+*/ +using System; +using System.Collections.Generic; +using iText.Commons.Actions; +using iText.Commons.Utils; +using iText.Pdfocr.Exceptions; + +namespace iText.Pdfocr.Statistics { + /// Statistics aggregator which aggregates types of ocr processing. + internal class PdfOcrOutputTypeStatisticsAggregator : AbstractStatisticsAggregator { + private const String STRING_FOR_DATA = "data"; + + private const String STRING_FOR_PDF = "pdf"; + + private const String STRING_FOR_PDFA = "pdfa"; + + private static readonly IDictionary OCR_OUTPUT_TYPES; + + static PdfOcrOutputTypeStatisticsAggregator() { + IDictionary temp = new Dictionary(); + temp.Put(PdfOcrOutputType.DATA, STRING_FOR_DATA); + temp.Put(PdfOcrOutputType.PDF, STRING_FOR_PDF); + temp.Put(PdfOcrOutputType.PDFA, STRING_FOR_PDFA); + OCR_OUTPUT_TYPES = JavaCollectionsUtil.UnmodifiableMap(temp); + } + + private readonly Object Lock = new Object(); + + private readonly IDictionary numberOfUsagesPerType = new LinkedDictionary(); + + /// Aggregates pdfOcr event type. + /// + /// + /// + /// instance + /// + public override void Aggregate(AbstractStatisticsEvent @event) { + if (!(@event is PdfOcrOutputTypeStatisticsEvent)) { + return; + } + // the event's properties are required to be not null + PdfOcrOutputType type = ((PdfOcrOutputTypeStatisticsEvent)@event).GetPdfOcrStatisticsEventType(); + String fileTypeKey = GetKeyForType(type); + if (null == fileTypeKey) { + // this line is not expected to be reached, since an exception should have been thrown on event creation + throw new PdfOcrException(PdfOcrExceptionMessageConstant.STATISTICS_EVENT_TYPE_IS_NOT_DETECTED); + } + lock (Lock) { + long? documentsOfThisRange = numberOfUsagesPerType.Get(fileTypeKey); + long? currentValue = documentsOfThisRange == null ? 1L : (documentsOfThisRange + 1L); + numberOfUsagesPerType.Put(fileTypeKey, currentValue); + } + } + + /// Retrieves Map where keys are pdfOcr event types and values are the amounts of such events. 
+ /// + /// aggregated + /// + /// + public override Object RetrieveAggregation() { + return JavaCollectionsUtil.UnmodifiableMap(numberOfUsagesPerType); + } + + /// Merges data about amounts of pdfOcr event types from the provided aggregator into this aggregator. + /// + /// + /// + /// + /// from which data will be taken. + /// + public override void Merge(AbstractStatisticsAggregator aggregator) { + if (!(aggregator is PdfOcrOutputTypeStatisticsAggregator)) { + return; + } + IDictionary otherNumberOfFiles = ((PdfOcrOutputTypeStatisticsAggregator)aggregator).numberOfUsagesPerType; + lock (Lock) { + MapUtil.Merge(this.numberOfUsagesPerType, otherNumberOfFiles, (el1, el2) => { + if (el2 == null) { + return el1; + } + else { + return el1 + el2; + } + } + ); + } + } + + internal static String GetKeyForType(PdfOcrOutputType type) { + return OCR_OUTPUT_TYPES.Get(type); + } + } +} diff --git a/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsEvent.cs b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsEvent.cs new file mode 100644 index 0000000..103a25a --- /dev/null +++ b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsEvent.cs @@ -0,0 +1,74 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using System.Collections.Generic; +using iText.Commons.Actions; +using iText.Commons.Actions.Data; +using iText.Commons.Utils; +using iText.Pdfocr.Exceptions; + +namespace iText.Pdfocr.Statistics { + /// Class which represents an event for specifying type of an ocr processing. + /// + /// Class which represents an event for specifying type of an ocr processing. + /// For internal usage only. + /// + public class PdfOcrOutputTypeStatisticsEvent : AbstractStatisticsEvent { + private const String OCR_OUTPUT_TYPE = "ocrOutput"; + + private readonly PdfOcrOutputType type; + + /// Creates instance of pdfOcr statistics event. + /// pdfCcr output type + /// product data + public PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType type, ProductData productData) + : base(productData) { + if (type == null) { + throw new PdfOcrException(PdfOcrExceptionMessageConstant.STATISTICS_EVENT_TYPE_CANT_BE_NULL); + } + if (null == PdfOcrOutputTypeStatisticsAggregator.GetKeyForType(type)) { + throw new PdfOcrException(PdfOcrExceptionMessageConstant.STATISTICS_EVENT_TYPE_IS_NOT_DETECTED); + } + this.type = type; + } + + /// + public override AbstractStatisticsAggregator CreateStatisticsAggregatorFromName(String statisticsName) { + if (OCR_OUTPUT_TYPE.Equals(statisticsName)) { + return new PdfOcrOutputTypeStatisticsAggregator(); + } + return base.CreateStatisticsAggregatorFromName(statisticsName); + } + + /// + public override IList GetStatisticsNames() { + return JavaCollectionsUtil.SingletonList(OCR_OUTPUT_TYPE); + } + + /// Gets the type of statistic event. 
+ /// the statistics event type + public virtual PdfOcrOutputType GetPdfOcrStatisticsEventType() { + return type; + } + } +} diff --git a/itext/itext.pdfocr.api/pdfocr-api.nuspec b/itext/itext.pdfocr.api/pdfocr-api.nuspec index a488365..ed5bf78 100644 --- a/itext/itext.pdfocr.api/pdfocr-api.nuspec +++ b/itext/itext.pdfocr.api/pdfocr-api.nuspec @@ -2,7 +2,7 @@ itext7.pdfocr.api - 1.0.3 + 2.0.0 iText 7 pdfOcr iText Software iText Software @@ -17,14 +17,14 @@ OCR PDF ligatures text glyphs iText Optical Character Recognition PDF/A ISO-compliant Tesseract open-source opensource English Mandarin Chinese Hindi Spanish French Arabic Bengali Russian Portuguese Indonesian scan image extractable data searchable diacritic sdk c# .net - - + + - - + + diff --git a/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs b/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs index f414d70..3ebb9ec 100644 --- a/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs +++ b/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs @@ -14,9 +14,9 @@ [assembly: Guid("0c4ceb00-9a56-4547-a925-5974a85a6048")] -[assembly: AssemblyVersion("1.0.3.0")] -[assembly: AssemblyFileVersion("1.0.3.0")] -[assembly: AssemblyInformationalVersion("1.0.3")] +[assembly: AssemblyVersion("2.0.0.0")] +[assembly: AssemblyFileVersion("2.0.0.0")] +[assembly: AssemblyInformationalVersion("2.0.0")] [assembly: InternalsVisibleTo("itext.pdfocr.tesseract4.tests, PublicKey=" + "00240000048000009400000006020000002400005253413100040000010001008b21ed5b3fc1c1" + "1996390981fe22bbe71a39a9e11d3c2cefddd6ee92920fa871f9666ae0fa941af0280d0653df04" + @@ -24,5 +24,4 @@ "009746bbdafcb75bcdbcecb7caf1f0f4b6e7d013906ba60b66eb1c8298e4efb052caf6cece4bf1" + "816902cc")] -[assembly: Versions.Attributes.KeyVersion("3.1.5.0")] -[assembly: Versions.Attributes.KernelVersion("7.1.16.0")] \ No newline at end of file +[assembly: Versions.Attributes.KernelVersion("7.2.0.0")] \ No newline at end of file diff --git 
a/itext/itext.pdfocr.tesseract4/Properties/KeyVersionAttribute.cs b/itext/itext.pdfocr.tesseract4/Properties/KeyVersionAttribute.cs deleted file mode 100644 index 9a8a754..0000000 --- a/itext/itext.pdfocr.tesseract4/Properties/KeyVersionAttribute.cs +++ /dev/null @@ -1,54 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License version 3 -as published by the Free Software Foundation with the addition of the -following permission added to Section 15 as permitted in Section 7(a): -FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY -ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT -OF THIRD PARTY RIGHTS - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. -See the GNU Affero General Public License for more details. -You should have received a copy of the GNU Affero General Public License -along with this program; if not, see http://www.gnu.org/licenses or write to -the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, -Boston, MA, 02110-1301 USA, or download the license from the following URL: -http://itextpdf.com/terms-of-use/ - -The interactive user interfaces in modified source and object code versions -of this program must display Appropriate Legal Notices, as required under -Section 5 of the GNU Affero General Public License. - -In accordance with Section 7(b) of the GNU Affero General Public License, -a covered work must retain the producer line in every PDF that is created -or manipulated using iText. - -You can be released from the requirements of the license by purchasing -a commercial license. 
Buying such a license is mandatory as soon as you -develop commercial activities involving the iText software without -disclosing the source code of your own applications. -These activities include: offering paid services to customers as an ASP, -serving PDFs on the fly in a web application, shipping iText with a closed -source product. - -For more information, please contact iText Software Corp. at this -address: sales@itextpdf.com - */ -using System; - -namespace Versions.Attributes { - [AttributeUsage(AttributeTargets.Assembly)] - internal class KeyVersionAttribute : Attribute { - internal string KeyVersion { get; private set; } - - internal KeyVersionAttribute(string keyVersion) { - this.KeyVersion = keyVersion; - } - } -} diff --git a/itext/itext.pdfocr.tesseract4/itext.pdfocr.tesseract4.csproj b/itext/itext.pdfocr.tesseract4/itext.pdfocr.tesseract4.csproj index 0866c86..fe4abf8 100644 --- a/itext/itext.pdfocr.tesseract4/itext.pdfocr.tesseract4.csproj +++ b/itext/itext.pdfocr.tesseract4/itext.pdfocr.tesseract4.csproj @@ -13,7 +13,7 @@ - net45 + net461 CS1591;CS1570;CS1572;CS1573;CS1574;CS1580;CS1584;CS1658 diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs index 5046c68..a17635d 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs @@ -25,14 +25,19 @@ You should have received a copy of the GNU Affero General Public License using System.IO; using System.Text; using System.Threading; -using Common.Logging; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Data; +using iText.Commons.Utils; using iText.IO.Image; -using iText.IO.Util; -using iText.Kernel.Counter; -using 
iText.Kernel.Counter.Event; using iText.Pdfocr; -using iText.Pdfocr.Events; -using iText.Pdfocr.Tesseract4.Events; +using iText.Pdfocr.Statistics; +using iText.Pdfocr.Tesseract4.Actions.Data; +using iText.Pdfocr.Tesseract4.Actions.Events; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; namespace iText.Pdfocr.Tesseract4 { /// @@ -47,7 +52,7 @@ namespace iText.Pdfocr.Tesseract4 { /// Also there are possibilities to use features of "tesseract" /// (optical character recognition engine for various operating systems). /// - public abstract class AbstractTesseract4OcrEngine : IOcrEngine, IThreadLocalMetaInfoAware { + public abstract class AbstractTesseract4OcrEngine : IOcrEngine, IProductAware { /// Supported image formats. private static readonly ICollection SUPPORTED_IMAGE_FORMATS = JavaCollectionsUtil.UnmodifiableSet (new HashSet(JavaUtil.ArraysAsList(ImageType.BMP, ImageType.PNG, ImageType.TIFF, ImageType. @@ -76,7 +81,25 @@ public AbstractTesseract4OcrEngine(Tesseract4OcrEngineProperties tesseract4OcrEn /// for tesseract /// public virtual void DoTesseractOcr(FileInfo inputImage, FileInfo outputFile, OutputFormat outputFormat) { - DoTesseractOcr(inputImage, JavaCollectionsUtil.SingletonList(outputFile), outputFormat, 1); + DoTesseractOcr(inputImage, outputFile, outputFormat, new OcrProcessContext(new Tesseract4EventHelper())); + } + + /// Performs tesseract OCR for the first (or for the only) image page. 
+ /// + /// input image + /// + /// + /// output file for the result for the first page + /// + /// selected + /// + /// for tesseract + /// + /// ocr process context + public virtual void DoTesseractOcr(FileInfo inputImage, FileInfo outputFile, OutputFormat outputFormat, OcrProcessContext + ocrProcessContext) { + DoTesseractOcr(inputImage, JavaCollectionsUtil.SingletonList(outputFile), outputFormat, 1, ocrProcessContext + .GetOcrEventHelper()); } /// @@ -92,14 +115,52 @@ public virtual void DoTesseractOcr(FileInfo inputImage, FileInfo outputFile, Out /// /// file to be created public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile) { - LogManager.GetLogger(GetType()).Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.START_OCR_FOR_IMAGES - , inputImages.Count)); - StringBuilder content = new StringBuilder(); - foreach (FileInfo inputImage in inputImages) { - content.Append(DoImageOcr(inputImage, OutputFormat.TXT)); + CreateTxtFile(inputImages, txtFile, new OcrProcessContext(new Tesseract4EventHelper())); + } + + /// + /// Performs OCR using provided + /// + /// for the given list of + /// input images and saves output to a text file using provided path. + /// + /// + /// + /// + /// of images to be OCRed + /// + /// file to be created + /// ocr process context + public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile, OcrProcessContext ocrProcessContext + ) { + ITextLogManager.GetLogger(GetType()).LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant. 
+ START_OCR_FOR_IMAGES, inputImages.Count)); + AbstractPdfOcrEventHelper storedEventHelper; + if (ocrProcessContext.GetOcrEventHelper() == null) { + storedEventHelper = new Tesseract4EventHelper(); + } + else { + storedEventHelper = ocrProcessContext.GetOcrEventHelper(); + } + PdfOcrTesseract4ProductEvent @event = PdfOcrTesseract4ProductEvent.CreateProcessImageEvent(storedEventHelper + .GetSequenceId(), null, storedEventHelper.GetConfirmationType()); + storedEventHelper.OnEvent(@event); + try { + // set Tesseract4FileResultEventHelper + ocrProcessContext.SetOcrEventHelper(new Tesseract4FileResultEventHelper(storedEventHelper)); + StringBuilder content = new StringBuilder(); + foreach (FileInfo inputImage in inputImages) { + content.Append(DoImageOcr(inputImage, OutputFormat.TXT, ocrProcessContext)); + } + // write to file + TesseractHelper.WriteToTextFile(txtFile.FullName, content.ToString()); + if (@event.GetConfirmationType() == EventConfirmationType.ON_DEMAND) { + storedEventHelper.OnEvent(new ConfirmEvent(@event)); + } + } + finally { + ocrProcessContext.SetOcrEventHelper(storedEventHelper); } - // write to file - TesseractHelper.WriteToTextFile(txtFile.FullName, content.ToString()); } /// @@ -171,7 +232,36 @@ public String GetLanguagesAsString() { public IDictionary> DoImageOcr(FileInfo input) { VerifyImageFormatValidity(input); return ((AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult)ProcessInputFiles(input, OutputFormat.HOCR - )).GetTextInfos(); + , new Tesseract4EventHelper())).GetTextInfos(); + } + + /// + /// Reads data from the provided input image file and returns retrieved + /// data in the format described below. 
+ /// + /// + /// input image + /// + /// + /// ocr process context + /// + /// + /// + /// where key is + /// + /// representing the number of the page and value is + /// + /// of + /// + /// elements where each + /// + /// element contains a word or a line and its 4 + /// coordinates(bbox) + /// + public IDictionary> DoImageOcr(FileInfo input, OcrProcessContext ocrProcessContext) { + VerifyImageFormatValidity(input); + return ((AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult)ProcessInputFiles(input, OutputFormat.HOCR + , ocrProcessContext.GetOcrEventHelper())).GetTextInfos(); } /// @@ -187,16 +277,18 @@ public IDictionary> DoImageOcr(FileInfo input) { /// /// result /// + /// ocr process context /// /// OCR result as a /// /// that is /// returned after processing the given image /// - public String DoImageOcr(FileInfo input, OutputFormat outputFormat) { + public String DoImageOcr(FileInfo input, OutputFormat outputFormat, OcrProcessContext ocrProcessContext) { String result = ""; VerifyImageFormatValidity(input); - AbstractTesseract4OcrEngine.ITesseractOcrResult processedData = ProcessInputFiles(input, outputFormat); + AbstractTesseract4OcrEngine.ITesseractOcrResult processedData = ProcessInputFiles(input, outputFormat, ocrProcessContext + .GetOcrEventHelper()); if (processedData != null) { if (outputFormat.Equals(OutputFormat.TXT)) { result = ((AbstractTesseract4OcrEngine.StringTesseractOcrResult)processedData).GetData(); @@ -220,6 +312,29 @@ public String DoImageOcr(FileInfo input, OutputFormat outputFormat) { return result; } + /// + /// Reads data from the provided input image file and returns retrieved + /// data as string. 
+ /// + /// + /// input image + /// + /// + /// + /// return + /// + /// result + /// + /// + /// OCR result as a + /// + /// that is + /// returned after processing the given image + /// + public String DoImageOcr(FileInfo input, OutputFormat outputFormat) { + return DoImageOcr(input, outputFormat, new OcrProcessContext(new Tesseract4EventHelper())); + } + /// Checks current os type. /// boolean true is current os is windows, otherwise - false public virtual bool IsWindows() { @@ -251,29 +366,27 @@ public virtual void ValidateLanguages(IList languagesList) { if (languagesList.Count == 0) { if (!new FileInfo(GetTessData() + System.IO.Path.DirectorySeparatorChar + GetTesseract4OcrEngineProperties ().GetDefaultLanguage() + suffix).Exists) { - throw new Tesseract4OcrException(Tesseract4OcrException.INCORRECT_LANGUAGE).SetMessageParams(GetTesseract4OcrEngineProperties - ().GetDefaultLanguage() + suffix, GetTessData()); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE).SetMessageParams + (GetTesseract4OcrEngineProperties().GetDefaultLanguage() + suffix, GetTessData()); } } else { foreach (String lang in languagesList) { if (!new FileInfo(GetTessData() + System.IO.Path.DirectorySeparatorChar + lang + suffix).Exists) { - throw new Tesseract4OcrException(Tesseract4OcrException.INCORRECT_LANGUAGE).SetMessageParams(lang + suffix - , GetTessData()); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE).SetMessageParams + (lang + suffix, GetTessData()); } } } } /// - public virtual IMetaInfo GetThreadLocalMetaInfo() { - return threadLocalMetaInfo.Value; + public virtual PdfOcrMetaInfoContainer GetMetaInfoContainer() { + return new PdfOcrMetaInfoContainer(new Tesseract4MetaInfo()); } - /// - public virtual IThreadLocalMetaInfoAware SetThreadLocalMetaInfo(IMetaInfo metaInfo) { - this.threadLocalMetaInfo.Value = metaInfo; - return this; + public virtual ProductData 
GetProductData() { + return PdfOcrTesseract4ProductData.GetInstance(); } /// @@ -307,8 +420,8 @@ public virtual IThreadLocalMetaInfoAware SetThreadLocalMetaInfo(IMetaInfo metaIn /// /// number of page to be processed internal virtual void DoTesseractOcr(FileInfo inputImage, IList outputFiles, OutputFormat outputFormat - , int pageNumber) { - DoTesseractOcr(inputImage, outputFiles, outputFormat, pageNumber, true); + , int pageNumber, AbstractPdfOcrEventHelper eventHelper) { + DoTesseractOcr(inputImage, outputFiles, outputFormat, pageNumber, true, eventHelper); } /// @@ -341,13 +454,10 @@ internal virtual void DoTesseractOcr(FileInfo inputImage, IList output /// for tesseract /// /// number of page to be processed - /// - /// indicates if - /// - /// needs to be dispatched - /// + /// indicates if event needs to be dispatched + /// event helper internal abstract void DoTesseractOcr(FileInfo inputImage, IList outputFiles, OutputFormat outputFormat - , int pageNumber, bool dispatchEvent); + , int pageNumber, bool dispatchEvent, AbstractPdfOcrEventHelper eventHelper); /// Gets path to provided tess data directory. 
/// @@ -356,32 +466,24 @@ internal abstract void DoTesseractOcr(FileInfo inputImage, IList outpu /// internal virtual String GetTessData() { if (GetTesseract4OcrEngineProperties().GetPathToTessData() == null) { - throw new Tesseract4OcrException(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET); } else { return GetTesseract4OcrEngineProperties().GetPathToTessData().FullName; } } - internal virtual void ScheduledCheck() { - ReflectionUtils.ScheduledCheck(); + internal virtual PdfOcrTesseract4ProductEvent OnEvent(AbstractPdfOcrEventHelper eventHelper) { + // usage event + PdfOcrTesseract4ProductEvent @event = PdfOcrTesseract4ProductEvent.CreateProcessImageEvent(eventHelper.GetSequenceId + (), null, eventHelper.GetConfirmationType()); + eventHelper.OnEvent(@event); + return @event; } - internal virtual void OnEvent() { - IMetaInfo metaInfo = this.GetThreadLocalMetaInfo(); - if (!(metaInfo is OcrPdfCreatorMetaInfo)) { - EventCounterHandler.GetInstance().OnEvent(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, this.GetThreadLocalMetaInfo - (), GetType()); - } - else { - Guid uuid = ((OcrPdfCreatorMetaInfo)metaInfo).GetDocumentId(); - if (!processedUUID.Contains(uuid)) { - processedUUID.Add(uuid); - EventCounterHandler.GetInstance().OnEvent(OcrPdfCreatorMetaInfo.PdfDocumentType.PDFA.Equals(((OcrPdfCreatorMetaInfo - )metaInfo).GetPdfDocumentType()) ? PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA : PdfOcrTesseract4Event - .TESSERACT4_IMAGE_TO_PDF, ((OcrPdfCreatorMetaInfo)metaInfo).GetWrappedMetaInfo(), GetType()); - } - } + internal virtual void OnEventStatistics(AbstractPdfOcrEventHelper eventHelper) { + eventHelper.OnEvent(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.DATA, PdfOcrTesseract4ProductData + .GetInstance())); } /// Reads data from the provided input image file. 
@@ -396,6 +498,7 @@ internal virtual void OnEvent() { /// by /// /// + /// event helper /// /// /// @@ -406,7 +509,7 @@ internal virtual void OnEvent() { /// if the output format is HOCR /// private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat - ) { + , AbstractPdfOcrEventHelper eventHelper) { IDictionary> imageData = new LinkedDictionary>(); StringBuilder data = new StringBuilder(); IList tempFiles = new List(); @@ -423,7 +526,7 @@ private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileIn for (int i = 0; i < numOfFiles; i++) { tempFiles.Add(CreateTempFile(extension)); } - DoTesseractOcr(input, tempFiles, outputFormat, page); + DoTesseractOcr(input, tempFiles, outputFormat, page, true, eventHelper); if (outputFormat.Equals(OutputFormat.HOCR)) { IList tempTxtFiles = null; if (GetTesseract4OcrEngineProperties().IsUseTxtToImproveHocrParsing()) { @@ -431,7 +534,7 @@ private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileIn for (int i = 0; i < numOfFiles; i++) { tempTxtFiles.Add(CreateTempFile(".txt")); } - DoTesseractOcr(input, tempTxtFiles, OutputFormat.TXT, page, false); + DoTesseractOcr(input, tempTxtFiles, OutputFormat.TXT, page, false, eventHelper); } IDictionary> pageData = TesseractHelper.ParseHocrFile(tempFiles, tempTxtFiles, GetTesseract4OcrEngineProperties ()); @@ -454,7 +557,7 @@ private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileIn } } catch (System.IO.IOException e) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE , e.Message)); } finally { @@ -495,10 +598,10 @@ private void VerifyImageFormatValidity(FileInfo image) { ImageType type = ImagePreprocessingUtil.GetImageType(image); bool isValid = SUPPORTED_IMAGE_FORMATS.Contains(type); 
if (!isValid) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE , image.FullName)); - throw new Tesseract4OcrException(Tesseract4OcrException.INCORRECT_INPUT_IMAGE_FORMAT).SetMessageParams(image - .Name); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_INPUT_IMAGE_FORMAT + ).SetMessageParams(image.Name); } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs index 856b043..76a7021 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs @@ -22,11 +22,15 @@ You should have received a copy of the GNU Affero General Public License */ using System; using System.IO; -using Common.Logging; +using Microsoft.Extensions.Logging; using Tesseract; +using iText.Commons; +using iText.Commons.Utils; using iText.IO.Image; using iText.IO.Source; using iText.IO.Util; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; namespace iText.Pdfocr.Tesseract4 { /// Utilities class to work with images. 
@@ -83,10 +87,10 @@ internal static ImageType GetImageType(FileInfo inputImage) { } catch (Exception e) { // NOSONAR - LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).Error(MessageFormatUtil.Format - (Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); - throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE).SetMessageParams(inputImage - .FullName); + ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).LogError(MessageFormatUtil + .Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE + ).SetMessageParams(inputImage.FullName); } return type; } @@ -158,8 +162,8 @@ internal static Pix PreprocessImage(FileInfo inputFile, int pageNumber, ImagePre pix = TesseractOcrUtil.ReadPix(inputFile); } if (pix == null) { - throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE).SetMessageParams(inputFile - .FullName); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE + ).SetMessageParams(inputFile.FullName); } return TesseractOcrUtil.PreprocessPix(pix, imagePreprocessingOptions); } @@ -190,12 +194,12 @@ internal static System.Drawing.Bitmap ReadImage(FileInfo inputImage) { bufferedImage = iText.Pdfocr.Tesseract4.ImagePreprocessingUtil.ReadImageFromFile(inputImage); } catch (ArgumentException ex) { - LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).Info(MessageFormatUtil.Format - (Tesseract4LogMessageConstant.CANNOT_CREATE_BUFFERED_IMAGE, ex.Message)); + ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).LogInformation(MessageFormatUtil + .Format(Tesseract4LogMessageConstant.CANNOT_CREATE_BUFFERED_IMAGE, ex.Message)); } catch (System.IO.IOException ex) { - 
LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).Info(MessageFormatUtil.Format - (Tesseract4LogMessageConstant.CANNOT_CREATE_BUFFERED_IMAGE, ex.Message)); + ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).LogInformation(MessageFormatUtil + .Format(Tesseract4LogMessageConstant.CANNOT_CREATE_BUFFERED_IMAGE, ex.Message)); } if (bufferedImage == null) { try { @@ -203,8 +207,8 @@ internal static System.Drawing.Bitmap ReadImage(FileInfo inputImage) { ); } catch (System.IO.IOException ex) { - LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).Info(MessageFormatUtil.Format - (Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, ex.Message)); + ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).LogInformation(MessageFormatUtil + .Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, ex.Message)); } } return bufferedImage; diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ReflectionUtils.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ReflectionUtils.cs deleted file mode 100644 index 7ec8b0f..0000000 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ReflectionUtils.cs +++ /dev/null @@ -1,137 +0,0 @@ -/* - -This file is part of the iText (R) project. - Copyright (c) 1998-2021 iText Group NV -Authors: Bruno Lowagie, Paulo Soares, et al. - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License version 3 -as published by the Free Software Foundation with the addition of the -following permission added to Section 15 as permitted in Section 7(a): -FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY -ITEXT GROUP. 
ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT -OF THIRD PARTY RIGHTS - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. -See the GNU Affero General Public License for more details. -You should have received a copy of the GNU Affero General Public License -along with this program; if not, see http://www.gnu.org/licenses or write to -the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, -Boston, MA, 02110-1301 USA, or download the license from the following URL: -http://itextpdf.com/terms-of-use/ - -The interactive user interfaces in modified source and object code versions -of this program must display Appropriate Legal Notices, as required under -Section 5 of the GNU Affero General Public License. - -In accordance with Section 7(b) of the GNU Affero General Public License, -a covered work must retain the producer line in every PDF that is created -or manipulated using iText. - -You can be released from the requirements of the license by purchasing -a commercial license. Buying such a license is mandatory as soon as you -develop commercial activities involving the iText software without -disclosing the source code of your own applications. -These activities include: offering paid services to customers as an ASP, -serving PDFs on the fly in a web application, shipping iText with a closed -source product. - -For more information, please contact iText Software Corp. at this -address: sales@itextpdf.com -*/ -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Reflection; -using Common.Logging; -using iText.IO.Util; -using iText.Kernel.Counter; -using Versions.Attributes; - -namespace iText.Pdfocr.Tesseract4 { - public sealed class ReflectionUtils { - - private const String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. 
Please use LicenseKey.loadLicense(...) to load one."; - - private ReflectionUtils() { - } - - public static void ScheduledCheck() { - try { - String licenseKeyClassName = "iText.License.LicenseKey, itext.licensekey"; - String licenseKeyProductClassName = "iText.License.LicenseKeyProduct, itext.licensekey"; - String checkLicenseKeyMethodName = "ScheduledCheck"; - Type licenseKeyClass = GetLicenseKeyClass(licenseKeyClassName); - if (licenseKeyClass != null) - { - Type licenseKeyProductClass = GetLicenseKeyClass(licenseKeyProductClassName); - object[] objects = new object[] - { - PdfOcrTesseract4ProductInfo.PRODUCT_NAME, - PdfOcrTesseract4ProductInfo.MAJOR_VERSION.ToString(), - PdfOcrTesseract4ProductInfo.MINOR_VERSION.ToString() - }; - Object productObject = System.Activator.CreateInstance(licenseKeyProductClass, objects); - MethodInfo m = licenseKeyClass.GetMethod(checkLicenseKeyMethodName); - m.Invoke(System.Activator.CreateInstance(licenseKeyClass), new object[] { productObject }); - } - } - catch (Exception e) { - if (null != e && null != e.InnerException) { - String message = e.InnerException.Message; - if (NO_PDFOCR_TESSERACT4.Equals(message)) { - throw new Exception(message, e.InnerException); - } - } - if (!iText.Kernel.Version.IsAGPLVersion()) { - throw; - } - } - } - - private static Type GetLicenseKeyClass(string className) - { - String licenseKeyClassFullName = null; - Assembly assembly = typeof(ReflectionUtils).GetAssembly(); - Attribute keyVersionAttr = assembly.GetCustomAttribute(typeof(KeyVersionAttribute)); - if (keyVersionAttr is KeyVersionAttribute) - { - String keyVersion = ((KeyVersionAttribute)keyVersionAttr).KeyVersion; - String format = "{0}, Version={1}, Culture=neutral, PublicKeyToken=8354ae6d2174ddca"; - licenseKeyClassFullName = String.Format(format, className, keyVersion); - } - Type type = null; - if (licenseKeyClassFullName != null) - { - String fileLoadExceptionMessage = null; - try - { - type = 
System.Type.GetType(licenseKeyClassFullName); - } - catch (FileLoadException fileLoadException) - { - fileLoadExceptionMessage = fileLoadException.Message; - } - if (type == null) - { - try - { - type = System.Type.GetType(className); - } - catch - { - // empty - } - if (type == null && fileLoadExceptionMessage != null) - { - LogManager.GetLogger(typeof(ReflectionUtils)).Error(fileLoadExceptionMessage); - } - } - } - return type; - } - } -} diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingExecutableTest.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs similarity index 52% rename from itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingExecutableTest.cs rename to itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs index bbc6be9..4b33f96 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/events/EventCountingExecutableTest.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs @@ -20,28 +20,31 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ -using System; -using System.IO; -using iText.IO.Util; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Sequence; using iText.Pdfocr; -using iText.Pdfocr.Tesseract4; -using iText.Test.Attributes; -namespace iText.Pdfocr.Events { - public class EventCountingExecutableTest : EventCountingTest { - public EventCountingExecutableTest() - : base(IntegrationTestHelper.ReaderType.EXECUTABLE) { +namespace iText.Pdfocr.Tesseract4 { + /// Helper class for working with events. 
+ internal class Tesseract4EventHelper : AbstractPdfOcrEventHelper { + internal Tesseract4EventHelper() { } - [NUnit.Framework.Test] - [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] - public override void TestEventCountingCustomMetaInfoError() { - String imgPath = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_101.jpg").FullName; - NUnit.Framework.Assert.That(() => { - base.TestEventCountingCustomMetaInfoError(); + // do nothing + public override void OnEvent(AbstractProductITextEvent @event) { + if (@event is AbstractContextBasedITextEvent) { + ((AbstractContextBasedITextEvent)@event).SetMetaInfo(new Tesseract4MetaInfo()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, imgPath))) -; + EventManager.GetInstance().OnEvent(@event); + } + + public override SequenceId GetSequenceId() { + return new SequenceId(); + } + + public override EventConfirmationType GetConfirmationType() { + return EventConfirmationType.ON_DEMAND; } } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs index 21c941f..16a2803 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs @@ -24,9 +24,15 @@ You should have received a copy of the GNU Affero General Public License using System.Collections.Generic; using System.IO; using System.Security; -using Common.Logging; +using Microsoft.Extensions.Logging; using Tesseract; -using iText.IO.Util; +using iText.Commons; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Utils; +using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Actions.Events; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; namespace iText.Pdfocr.Tesseract4 { 
/// @@ -121,23 +127,27 @@ public void SetPathToExecutable(String path) { /// for tesseract /// /// number of page to be processed - /// - /// indicates if - /// - /// needs to be dispatched - /// + /// indicates if event needs to be dispatched + /// event helper internal override void DoTesseractOcr(FileInfo inputImage, IList outputFiles, OutputFormat outputFormat - , int pageNumber, bool dispatchEvent) { - ScheduledCheck(); + , int pageNumber, bool dispatchEvent, AbstractPdfOcrEventHelper eventHelper) { IList @params = new List(); String execPath = null; String imagePath = null; String workingDirectory = null; + PdfOcrTesseract4ProductEvent @event = null; + if (eventHelper == null) { + eventHelper = new Tesseract4EventHelper(); + } + if (dispatchEvent) { + @event = OnEvent(eventHelper); + } try { imagePath = inputImage.FullName; // path to tesseract executable if (GetPathToExecutable() == null || String.IsNullOrEmpty(GetPathToExecutable())) { - throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE + ); } else { if (IsWindows()) { @@ -156,7 +166,7 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu imagePath = PreprocessImage(inputImage, pageNumber); // get the input file parent directory as working directory // as tesseract cannot parse non ascii characters in input path - String imageParentDir = TesseractOcrUtil.GetParentDirectory(imagePath); + String imageParentDir = TesseractOcrUtil.GetParentDirectoryFile(imagePath); String replacement = IsWindows() ? 
"" : "/"; workingDirectory = imageParentDir.Replace("file:///", replacement).Replace("file:/", replacement); // input file @@ -173,15 +183,18 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu AddPreserveInterwordSpaces(@params); // set default user defined dpi AddDefaultDpi(@params); - if (dispatchEvent) { - OnEvent(); - } // run tesseract process TesseractHelper.RunCommand(execPath, @params, workingDirectory); + // statistics event + OnEventStatistics(eventHelper); + // confrim on_demand event + if (@event != null && @event.GetConfirmationType() == EventConfirmationType.ON_DEMAND) { + eventHelper.OnEvent(new ConfirmEvent(@event)); + } } - catch (Tesseract4OcrException e) { - LogManager.GetLogger(GetType()).Error(e.Message); - throw new Tesseract4OcrException(e.Message, e); + catch (PdfOcrTesseract4Exception e) { + ITextLogManager.GetLogger(GetType()).LogError(e.Message); + throw new PdfOcrTesseract4Exception(e.Message, e); } finally { try { @@ -190,7 +203,7 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu } } catch (SecurityException e) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE , imagePath, e.Message)); } try { @@ -200,7 +213,7 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu } } catch (SecurityException e) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE , GetTesseract4OcrEngineProperties().GetPathToUserWordsFile(), e.Message)); } } @@ -307,13 +320,13 @@ private void AddOutputFile(IList command, FileInfo outputFile, OutputFor .FullName; String fileName = new String(filePath.ToCharArray(), 0, 
filePath.IndexOf(extension, StringComparison.Ordinal )); - LogManager.GetLogger(GetType()).Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CREATED_TEMPORARY_FILE - , outputFile.FullName)); + ITextLogManager.GetLogger(GetType()).LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant. + CREATED_TEMPORARY_FILE, outputFile.FullName)); command.Add(AddQuotes(fileName)); } catch (Exception) { // NOSONAR - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } } @@ -364,7 +377,7 @@ private String PreprocessImage(FileInfo inputImage, int pageNumber) { } } catch (System.IO.IOException e) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE , e.Message)); } return path; @@ -379,8 +392,8 @@ private void CheckTesseractInstalled(String execPath) { try { TesseractHelper.RunCommand(execPath, JavaCollectionsUtil.SingletonList("--version")); } - catch (Tesseract4OcrException e) { - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_NOT_FOUND, e); + catch (PdfOcrTesseract4Exception e) { + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND, e); } } @@ -407,8 +420,8 @@ private String GetExtension(FileInfo inputImage) { /// path to the second file /// true if parent directories are equal, otherwise - false private bool AreEqualParentDirectories(String firstPath, String secondPath) { - String firstParentDir = TesseractOcrUtil.GetParentDirectory(firstPath); - String secondParentDir = TesseractOcrUtil.GetParentDirectory(secondPath); + String firstParentDir = TesseractOcrUtil.GetParentDirectoryFile(firstPath); + String secondParentDir = TesseractOcrUtil.GetParentDirectoryFile(secondPath); return 
firstParentDir != null && firstParentDir.Equals(secondParentDir); } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs new file mode 100644 index 0000000..f81437b --- /dev/null +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs @@ -0,0 +1,67 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Sequence; +using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Actions.Events; + +namespace iText.Pdfocr.Tesseract4 { + /// Helper class for working with events. + internal class Tesseract4FileResultEventHelper : AbstractPdfOcrEventHelper { + private AbstractPdfOcrEventHelper wrappedEventHelper; + + internal Tesseract4FileResultEventHelper() + : this(null) { + } + + internal Tesseract4FileResultEventHelper(AbstractPdfOcrEventHelper wrappedEventHelper) { + this.wrappedEventHelper = wrappedEventHelper == null ? 
new Tesseract4EventHelper() : wrappedEventHelper; + } + + public override void OnEvent(AbstractProductITextEvent @event) { + if (!IsProcessImageEvent(@event) && !IsConfirmForProcessImageEvent(@event)) { + wrappedEventHelper.OnEvent(@event); + } + } + + public override SequenceId GetSequenceId() { + return wrappedEventHelper.GetSequenceId(); + } + + public override EventConfirmationType GetConfirmationType() { + return wrappedEventHelper.GetConfirmationType(); + } + + private static bool IsProcessImageEvent(AbstractProductITextEvent @event) { + return @event is PdfOcrTesseract4ProductEvent && PdfOcrTesseract4ProductEvent.PROCESS_IMAGE.Equals(((PdfOcrTesseract4ProductEvent + )@event).GetEventType()); + } + + private static bool IsConfirmForProcessImageEvent(AbstractProductITextEvent @event) { + return @event is ConfirmEvent && ((ConfirmEvent)@event).GetConfirmedEvent() is PdfOcrTesseract4ProductEvent + && PdfOcrTesseract4ProductEvent.PROCESS_IMAGE.Equals(((ConfirmEvent)@event).GetConfirmedEvent().GetEventType + ()); + } + } +} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs index f0ee1b9..2163e1e 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs @@ -24,9 +24,15 @@ You should have received a copy of the GNU Affero General Public License using System.Collections.Generic; using System.IO; using System.Text.RegularExpressions; -using Common.Logging; +using Microsoft.Extensions.Logging; using Tesseract; -using iText.IO.Util; +using iText.Commons; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Utils; +using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Actions.Events; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; namespace iText.Pdfocr.Tesseract4 { /// @@ -56,7 +62,7 @@ 
public class Tesseract4LibOcrEngine : AbstractTesseract4OcrEngine { private TesseractEngine tesseractInstance = null; /// Pattern for matching ASCII string. - private static readonly Regex ASCII_STRING_PATTERN = iText.IO.Util.StringUtil.RegexCompile("^[\\u0000-\\u007F]*$" + private static readonly Regex ASCII_STRING_PATTERN = iText.Commons.Utils.StringUtil.RegexCompile("^[\\u0000-\\u007F]*$" ); /// @@ -144,22 +150,23 @@ public virtual void InitializeTesseract(OutputFormat outputFormat) { /// for tesseract /// /// number of page to be processed - /// - /// indicates if - /// - /// needs to be dispatched - /// + /// indicates if event needs to be dispatched + /// event helper internal override void DoTesseractOcr(FileInfo inputImage, IList outputFiles, OutputFormat outputFormat - , int pageNumber, bool dispatchEvent) { - ScheduledCheck(); + , int pageNumber, bool dispatchEvent, AbstractPdfOcrEventHelper eventHelper) { + PdfOcrTesseract4ProductEvent @event = null; + if (eventHelper == null) { + eventHelper = new Tesseract4EventHelper(); + } + // usage event + if (dispatchEvent) { + @event = OnEvent(eventHelper); + } try { // check tess data path for non ASCII characters ValidateTessDataPath(GetTessData()); ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages()); InitializeTesseract(outputFormat); - if (dispatchEvent) { - OnEvent(); - } // if preprocessing is not needed and provided image is tiff, // the image will be paginated and separate pages will be OCRed IList resultList = new List(); @@ -183,16 +190,20 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu } } catch (System.IO.IOException e) { - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE - , e.Message)); - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_WRITE_TO_FILE, e); } } } + // statistics 
event + OnEventStatistics(eventHelper); + // confirm on_demand event + if (@event != null && @event.GetConfirmationType() == EventConfirmationType.ON_DEMAND) { + eventHelper.OnEvent(new ConfirmEvent(@event)); + } } - catch (Tesseract4OcrException e) { - LogManager.GetLogger(GetType()).Error(e.Message); - throw new Tesseract4OcrException(e.Message, e); + catch (PdfOcrTesseract4Exception e) { + ITextLogManager.GetLogger(GetType()).LogError(e.Message); + throw new PdfOcrTesseract4Exception(e.Message, e); } finally { if (tesseractInstance != null) { @@ -220,9 +231,9 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu /// path to tess data /// private static void ValidateTessDataPath(String tessDataPath) { - Matcher asciiStringMatcher = iText.IO.Util.Matcher.Match(ASCII_STRING_PATTERN, tessDataPath); + Matcher asciiStringMatcher = iText.Commons.Utils.Matcher.Match(ASCII_STRING_PATTERN, tessDataPath); if (!asciiStringMatcher.Matches()) { - throw new Tesseract4OcrException(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS ); } } @@ -263,8 +274,8 @@ private IList GetOcrResultForMultiPage(FileInfo inputImage, OutputFormat } catch (TesseractException e) { String msg = MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message); - LogManager.GetLogger(GetType()).Error(msg); - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + ITextLogManager.GetLogger(GetType()).LogError(msg); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } finally { TesseractOcrUtil.DisposeTesseractInstance(GetTesseractInstance()); @@ -305,8 +316,8 @@ private String GetOcrResultForSinglePage(FileInfo inputImage, OutputFormat outpu } catch (Exception e) { // NOSONAR - 
LogManager.GetLogger(GetType()).Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE - , e.Message)); + ITextLogManager.GetLogger(GetType()).LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant. + CANNOT_PROCESS_IMAGE, e.Message)); } } if (result == null) { @@ -317,9 +328,9 @@ private String GetOcrResultForSinglePage(FileInfo inputImage, OutputFormat outpu } catch (Exception e) { // NOSONAR - LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED + ITextLogManager.GetLogger(GetType()).LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED , e.Message)); - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } return result; } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfo.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs similarity index 88% rename from itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfo.cs rename to itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs index f60da53..268657f 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfo.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs @@ -20,9 +20,9 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ -using iText.Kernel.Counter.Event; +using iText.Commons.Actions.Contexts; -namespace iText.Pdfocr { - public class PdfOcrMetaInfo : IMetaInfo { +namespace iText.Pdfocr.Tesseract4 { + internal class Tesseract4MetaInfo : IMetaInfo { } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs index b5bc93a..e02ad00 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs @@ -23,9 +23,12 @@ You should have received a copy of the GNU Affero General Public License using System; using System.Collections.Generic; using System.IO; -using Common.Logging; -using iText.IO.Util; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; namespace iText.Pdfocr.Tesseract4 { /// @@ -151,7 +154,8 @@ public FileInfo GetPathToTessData() { /// public iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetPathToTessData(FileInfo tessData) { if (tessData == null || !FileUtil.DirectoryExists(tessData.FullName)) { - throw new Tesseract4OcrException(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID + ); } this.tessDataDir = tessData; return this; @@ -291,7 +295,7 @@ internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWo SetUserWords(language, inputStream); } catch (System.IO.IOException e) { - LogManager.GetLogger(GetType()).Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS + ITextLogManager.GetLogger(GetType()).LogWarning(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS , 
e.Message)); } } @@ -339,8 +343,8 @@ internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWo SetLanguages(languagesList); } else { - throw new Tesseract4OcrException(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST).SetMessageParams(language - ); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST + ).SetMessageParams(language); } } String userWordsFileName = TesseractOcrUtil.GetTempFilePath(language, "." + DEFAULT_USER_WORDS_SUFFIX); @@ -357,7 +361,7 @@ internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWo } catch (System.IO.IOException e) { SetPathToUserWordsFile(null); - LogManager.GetLogger(GetType()).Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS + ITextLogManager.GetLogger(GetType()).LogWarning(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS , e.Message)); } return this; diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs index fba8a58..c82f609 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs @@ -25,11 +25,15 @@ You should have received a copy of the GNU Affero General Public License using System.IO; using System.Linq; using System.Security; +using System.Text; using System.Text.RegularExpressions; -using Common.Logging; -using iText.IO.Util; +using Microsoft.Extensions.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.Kernel.Geom; using iText.Pdfocr; +using iText.Pdfocr.Tesseract4.Exceptions; +using iText.Pdfocr.Tesseract4.Logs; using iText.StyledXmlParser.Jsoup.Nodes; using iText.StyledXmlParser.Jsoup.Select; @@ -37,17 +41,17 @@ namespace iText.Pdfocr.Tesseract4 { /// Helper class. public class TesseractHelper { /// The logger. 
- private static readonly ILog LOGGER = LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.TesseractHelper) - ); + private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.TesseractHelper + )); /// Patterns for matching hOCR element bboxes. - private static readonly Regex BBOX_PATTERN = iText.IO.Util.StringUtil.RegexCompile(".*bbox(\\s+\\d+){4}.*" + private static readonly Regex BBOX_PATTERN = iText.Commons.Utils.StringUtil.RegexCompile(".*bbox(\\s+\\d+){4}.*" ); - private static readonly Regex BBOX_COORDINATE_PATTERN = iText.IO.Util.StringUtil.RegexCompile(".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*" + private static readonly Regex BBOX_COORDINATE_PATTERN = iText.Commons.Utils.StringUtil.RegexCompile(".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*" ); - private static readonly Regex WCONF_PATTERN = iText.IO.Util.StringUtil.RegexCompile("^.*(x_wconf *\\d+).*$" + private static readonly Regex WCONF_PATTERN = iText.Commons.Utils.StringUtil.RegexCompile("^.*(x_wconf *\\d+).*$" ); /// Size of the array containing bbox. @@ -93,37 +97,6 @@ public class TesseractHelper { private TesseractHelper() { } - /// - /// Parses each hocr file from the provided list, retrieves text, and - /// returns data in the format described below. - /// - /// list of input files - /// - /// - /// - /// - /// - /// - /// - /// where key is - /// - /// representing the number of the page and value is - /// - /// of - /// - /// elements where each - /// - /// element contains a word or a line and its 4 - /// coordinates(bbox) - /// - [System.ObsoleteAttribute(@"since 1.0.2. 
Use ParseHocrFile(System.Collections.Generic.IList{E}, System.Collections.Generic.IList{E}, Tesseract4OcrEngineProperties) instead" - )] - public static IDictionary> ParseHocrFile(IList inputFiles, TextPositioning - textPositioning) { - return ParseHocrFile(inputFiles, null, new Tesseract4OcrEngineProperties().SetTextPositioning(textPositioning - )); - } - /// /// Parses each hocr file from the provided list, retrieves text, and /// returns data in the format described below. @@ -171,7 +144,7 @@ internal static IDictionary> ParseHocrFile(IList inputFile.FullName); Elements pages = doc.GetElementsByClass(OCR_PAGE); foreach (iText.StyledXmlParser.Jsoup.Nodes.Element page in pages) { - String[] pageNum = iText.IO.Util.StringUtil.Split(page.Id(), PAGE_PREFIX_PATTERN); + String[] pageNum = iText.Commons.Utils.StringUtil.Split(page.Id(), PAGE_PREFIX_PATTERN); int pageNumber = Convert.ToInt32(pageNum[pageNum.Length - 1], System.Globalization.CultureInfo.InvariantCulture ); IList textData = GetTextData(page, tesseract4OcrEngineProperties, txt, unparsedBBoxes); @@ -186,8 +159,8 @@ internal static IDictionary> ParseHocrFile(IList } } foreach (iText.StyledXmlParser.Jsoup.Nodes.Node node in unparsedBBoxes.Values) { - LOGGER.Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, node.ToString()) - ); + LOGGER.LogWarning(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, node.ToString + ())); } return imageData; } @@ -216,9 +189,10 @@ internal static Rectangle GetAlignedBBox(iText.StyledXmlParser.Jsoup.Nodes.Eleme internal static Rectangle ParseBBox(iText.StyledXmlParser.Jsoup.Nodes.Node node, Rectangle pageBBox, IDictionary unparsedBBoxes) { IList bbox = new List(); - Matcher bboxMatcher = iText.IO.Util.Matcher.Match(BBOX_PATTERN, node.Attr(TITLE)); + Matcher bboxMatcher = iText.Commons.Utils.Matcher.Match(BBOX_PATTERN, node.Attr(TITLE)); if (bboxMatcher.Matches()) { - Matcher bboxCoordinateMatcher = 
iText.IO.Util.Matcher.Match(BBOX_COORDINATE_PATTERN, bboxMatcher.Group()); + Matcher bboxCoordinateMatcher = iText.Commons.Utils.Matcher.Match(BBOX_COORDINATE_PATTERN, bboxMatcher.Group + ()); if (bboxCoordinateMatcher.Matches()) { for (int i = 0; i < BBOX_ARRAY_SIZE; i++) { String coord = bboxCoordinateMatcher.Group(i + 1); @@ -293,12 +267,12 @@ internal static void DeleteFile(String pathToFile) { } } catch (System.IO.IOException e) { - LOGGER.Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, pathToFile, e.Message - )); + LOGGER.LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, pathToFile + , e.Message)); } catch (SecurityException e) { - LOGGER.Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, pathToFile, e.Message - )); + LOGGER.LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, pathToFile + , e.Message)); } } @@ -316,12 +290,12 @@ internal static void DeleteFile(String pathToFile) { internal static String ReadTxtFile(FileInfo txtFile) { String content = null; try { - content = iText.IO.Util.JavaUtil.GetStringForBytes(File.ReadAllBytes(txtFile.FullName), System.Text.Encoding + content = iText.Commons.Utils.JavaUtil.GetStringForBytes(File.ReadAllBytes(txtFile.FullName), System.Text.Encoding .UTF8); } catch (System.IO.IOException e) { - LOGGER.Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_FILE, txtFile.FullName, e.Message - )); + LOGGER.LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_FILE, txtFile.FullName, + e.Message)); } return content; } @@ -349,7 +323,7 @@ internal static void WriteToTextFile(String path, String data) { } } catch (System.IO.IOException e) { - LOGGER.Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE, path, e.Message)); + throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_WRITE_TO_FILE, e); } 
} @@ -377,19 +351,19 @@ internal static void RunCommand(String execPath, IList paramsList, Strin String @params = String.Join(" ", paramsList); bool cmdSucceeded = SystemUtil.RunProcessAndWait(execPath, @params, workingDirPath); if (!cmdSucceeded) { - LOGGER.Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.COMMAND_FAILED, execPath + " " + @params + LOGGER.LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.COMMAND_FAILED, execPath + " " + @params )); - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } } catch (System.IO.IOException e) { // NOSONAR - LOGGER.Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.COMMAND_FAILED, e.Message)); - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + LOGGER.LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.COMMAND_FAILED, e.Message)); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } catch (Exception e) { - LOGGER.Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.COMMAND_FAILED, e.Message)); - throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED); + LOGGER.LogError(MessageFormatUtil.Format(Tesseract4LogMessageConstant.COMMAND_FAILED, e.Message)); + throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } } @@ -451,7 +425,7 @@ private static bool IsElementConfident(iText.StyledXmlParser.Jsoup.Nodes.Element foreach (iText.StyledXmlParser.Jsoup.Nodes.Node node in lineOrCaption.ChildNodes()) { if (node is iText.StyledXmlParser.Jsoup.Nodes.Element) { String title = ((iText.StyledXmlParser.Jsoup.Nodes.Element)node).Attr(TITLE); - Matcher matcher = iText.IO.Util.Matcher.Match(WCONF_PATTERN, title); + Matcher matcher = iText.Commons.Utils.Matcher.Match(WCONF_PATTERN, title); if (matcher.Matches()) { String wconf = null; try { 
@@ -461,7 +435,7 @@ private static bool IsElementConfident(iText.StyledXmlParser.Jsoup.Nodes.Element } //No need to do anything here if (wconf != null) { - wconf = iText.IO.Util.StringUtil.ReplaceAll(wconf, X_WCONF, "").Trim(); + wconf = iText.Commons.Utils.StringUtil.ReplaceAll(wconf, X_WCONF, "").Trim(); wconfTotal += Convert.ToInt32(wconf, System.Globalization.CultureInfo.InvariantCulture); wconfCount++; } @@ -485,21 +459,21 @@ private static IList GetTextDataForWords(iText.StyledXmlParser.Jsoup.N if (txtLine == null) { foreach (iText.StyledXmlParser.Jsoup.Nodes.Element word in lineOrCaption.GetElementsByClass(OCRX_WORD)) { Rectangle bboxRect = GetAlignedBBox(word, textPositioning, pageBbox, unparsedBBoxes); - AddToTextData(textData, word.Text(), bboxRect, pageBbox); + AddToTextData(textData, word.Text(), bboxRect); } } else { IList textInfos = new List(); - String txtLine1 = iText.IO.Util.StringUtil.ReplaceAll(txtLine, NEW_LINE_PATTERN, ""); - String txtLine2 = iText.IO.Util.StringUtil.ReplaceAll(txtLine1, SPACE_PATTERN, " "); - String[] lineItems = iText.IO.Util.StringUtil.Split(txtLine2, " "); + String txtLine1 = iText.Commons.Utils.StringUtil.ReplaceAll(txtLine, NEW_LINE_PATTERN, ""); + String txtLine2 = iText.Commons.Utils.StringUtil.ReplaceAll(txtLine1, SPACE_PATTERN, " "); + String[] lineItems = iText.Commons.Utils.StringUtil.Split(txtLine2, " "); foreach (iText.StyledXmlParser.Jsoup.Nodes.Element word in lineOrCaption.GetElementsByClass(OCRX_WORD)) { Rectangle bboxRect = GetAlignedBBox(word, textPositioning, pageBbox, unparsedBBoxes); textInfos.Add(new TextInfo(word.Text(), bboxRect)); - if (iText.IO.Util.StringUtil.ReplaceAll(lineItems[0], NEW_LINE_OR_SPACE_PATTERN, "").Equals(iText.IO.Util.StringUtil.ReplaceAll + if (iText.Commons.Utils.StringUtil.ReplaceAll(lineItems[0], NEW_LINE_OR_SPACE_PATTERN, "").Equals(iText.Commons.Utils.StringUtil.ReplaceAll (GetTextInfosText(textInfos), SPACE_PATTERN, ""))) { lineItems = JavaUtil.ArraysCopyOfRange(lineItems, 
1, lineItems.Length); - AddToTextData(textData, MergeTextInfos(textInfos), pageBbox); + AddToTextData(textData, MergeTextInfos(textInfos)); textInfos.Clear(); } } @@ -514,37 +488,34 @@ private static IList GetTextDataForLines(iText.StyledXmlParser.Jsoup.N IList textData = new List(); Rectangle bboxRect = GetAlignedBBox(lineOrCaption, TextPositioning.BY_LINES, pageBbox, unparsedBBoxes); if (txtLine == null) { - AddToTextData(textData, lineOrCaption.Text(), bboxRect, pageBbox); + AddToTextData(textData, lineOrCaption.Text(), bboxRect); } else { - AddToTextData(textData, txtLine, bboxRect, pageBbox); + AddToTextData(textData, txtLine, bboxRect); } return textData; } /// Add text chunk represented by text and bbox to list of text infos. - private static void AddToTextData(IList textData, String text, Rectangle bboxRect, Rectangle pageBbox - ) { - IList bbox = JavaUtil.ArraysAsList(ToPixels(bboxRect.GetLeft()), ToPixels(pageBbox.GetTop() - bboxRect - .GetTop()), ToPixels(bboxRect.GetRight()), ToPixels(pageBbox.GetTop() - bboxRect.GetBottom())); - TextInfo textInfo = new TextInfo(text, bboxRect, bbox); + private static void AddToTextData(IList textData, String text, Rectangle bboxRect) { + TextInfo textInfo = new TextInfo(text, bboxRect); textData.Add(textInfo); } /// Add text chunk represented by text info to list of text infos. - private static void AddToTextData(IList textData, TextInfo textInfo, Rectangle pageBbox) { + private static void AddToTextData(IList textData, TextInfo textInfo) { String text = textInfo.GetText(); Rectangle bboxRect = textInfo.GetBboxRect(); - AddToTextData(textData, text, bboxRect, pageBbox); + AddToTextData(textData, text, bboxRect); } /// Gets common text for list of text infos. 
private static String GetTextInfosText(IList textInfos) { - String text = ""; + StringBuilder text = new StringBuilder(); foreach (TextInfo textInfo in textInfos) { - text = text + textInfo.GetText(); + text.Append(textInfo.GetText()); } - return text; + return text.ToString(); } /// Merges text infos. @@ -568,12 +539,12 @@ private static String FindHocrLineInTxt(iText.StyledXmlParser.Jsoup.Nodes.Elemen if (txt == null) { return null; } - String hocrLineText = iText.IO.Util.StringUtil.ReplaceAll(line.Text(), SPACE_PATTERN, ""); + String hocrLineText = iText.Commons.Utils.StringUtil.ReplaceAll(line.Text(), SPACE_PATTERN, ""); if (String.IsNullOrEmpty(hocrLineText)) { return null; } foreach (String txtLine in txt) { - if (iText.IO.Util.StringUtil.ReplaceAll(txtLine, SPACE_PATTERN, "").Equals(hocrLineText)) { + if (iText.Commons.Utils.StringUtil.ReplaceAll(txtLine, SPACE_PATTERN, "").Equals(hocrLineText)) { return txtLine; } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs index 5a3341d..f4b9786 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs @@ -28,9 +28,13 @@ You should have received a copy of the GNU Affero General Public License using System.IO; using System.Linq; using System.Runtime.InteropServices; -using Common.Logging; +using iText.Commons; +using iText.Commons.Utils; using iText.IO.Image; using iText.IO.Util; +using iText.Pdfocr.Tesseract4.Logs; +using Microsoft.Extensions.Logging; +using iText.Pdfocr.Tesseract4.Exceptions; using Tesseract; namespace iText.Pdfocr.Tesseract4 { @@ -153,8 +157,8 @@ internal static Pix ConvertToGrayscale(Pix pix) { } else { - LogManager.GetLogger(typeof(TesseractOcrUtil)) - .Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_CONVERT_IMAGE_TO_GRAYSCALE, depth)); + 
ITextLogManager.GetLogger(typeof(TesseractOcrUtil)) + .LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_CONVERT_IMAGE_TO_GRAYSCALE, depth)); return pix; } } @@ -207,16 +211,16 @@ internal static Pix OtsuImageThresholding(Pix pix, ImagePreprocessingOptions ima } else { - LogManager.GetLogger(typeof(TesseractOcrUtil)) - .Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE, pix.Depth)); + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)) + .LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE, pix.Depth)); DestroyPix(thresholdPix); return pix; } } else { - LogManager.GetLogger(typeof(TesseractOcrUtil)) - .Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE, pix.Depth)); + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)) + .LogInformation(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE, pix.Depth)); return pix; } } @@ -324,7 +328,7 @@ internal static void SetTesseractProperties(TesseractEngine tesseractInstance, S /// method. In .Net all these properties /// are needed to be provided in tesseract constructor in order to /// initialize tesseract instance.Thus, tesseract initialization takes - /// place in constructor in + /// place in constructor in /// java, but in .Net it happens only after all properties are validated, /// i.e. just before OCR process. /// @@ -355,9 +359,9 @@ internal static TesseractEngine InitializeTesseractInstance(bool isWindows, Stri } catch (Exception e) { - throw new Tesseract4OcrException(isWindows ? - Tesseract4OcrException.TESSERACT_LIB_NOT_INSTALLED_WIN : - Tesseract4OcrException.TESSERACT_LIB_NOT_INSTALLED, e); + throw new PdfOcrTesseract4Exception(isWindows ? 
+ PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_LIB_NOT_INSTALLED_WIN : + PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_LIB_NOT_INSTALLED, e); } } } @@ -437,7 +441,7 @@ internal static String GetTempFilePath(string name, string suffix) { /// Returns parent directory for the passed path. /// path path to file /// parent directory where the file is located - internal static String GetParentDirectory(string path) + internal static String GetParentDirectoryFile(string path) { return Directory.GetParent(path).FullName; } @@ -475,8 +479,8 @@ internal void InitializeImagesListFromTiff(FileInfo inputFile) SetListOfPages(bitmapList); } catch (Exception e) { - LogManager.GetLogger(typeof(TesseractOcrUtil)) - .Error(MessageFormatUtil.Format( + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)) + .LogError(MessageFormatUtil.Format( Tesseract4LogMessageConstant.CANNOT_RETRIEVE_PAGES_FROM_IMAGE, inputFile.FullName, e.Message)); @@ -505,8 +509,8 @@ internal static Bitmap GetImagePage(FileInfo input, int page) int pages = image.GetFrameCount(FrameDimension.Page); if (page >= pages) { - LogManager.GetLogger(typeof(TesseractOcrUtil)) - .Warn(MessageFormatUtil.Format( + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)) + .LogWarning(MessageFormatUtil.Format( Tesseract4LogMessageConstant.PAGE_NUMBER_IS_INCORRECT, page, input.FullName)); @@ -516,8 +520,8 @@ internal static Bitmap GetImagePage(FileInfo input, int page) img = new Bitmap(image); } catch (Exception e) { - LogManager.GetLogger(typeof(TesseractOcrUtil)) - .Error(MessageFormatUtil.Format( + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)) + .LogError(MessageFormatUtil.Format( Tesseract4LogMessageConstant.CANNOT_RETRIEVE_PAGES_FROM_IMAGE, input.FullName, e.Message)); @@ -727,7 +731,7 @@ internal static void SaveImageToTempPngFile(string tmpFileName, Bitmap image) } catch (Exception e) { - LogManager.GetLogger(typeof(TesseractOcrUtil)).Error(MessageFormatUtil.Format( + 
ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogError(MessageFormatUtil.Format( Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, e.Message)); } @@ -754,7 +758,7 @@ internal static void SavePixToPngFile(string filename, Pix pix) } catch (Exception e) { - LogManager.GetLogger(typeof(TesseractOcrUtil)).Info(MessageFormatUtil.Format( + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogInformation(MessageFormatUtil.Format( Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, e.Message)); } @@ -835,7 +839,7 @@ internal static Pix ReadPix(FileInfo inputFile) catch (Exception e) { // NOSONAR - LogManager.GetLogger(typeof(TesseractOcrUtil)).Error(MessageFormatUtil.Format + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogError(MessageFormatUtil.Format (Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); } if (pix != null) @@ -870,7 +874,7 @@ internal static Pix ReadPix(byte[] imageBytes) catch (Exception e) { // NOSONAR - LogManager.GetLogger(typeof(TesseractOcrUtil)).Error(MessageFormatUtil.Format + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogError(MessageFormatUtil.Format (Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); return null; } @@ -895,7 +899,7 @@ internal static int DetectRotation(FileInfo inputFile) catch (Exception e) { // NOSONAR - LogManager.GetLogger(typeof(TesseractOcrUtil)).Error(MessageFormatUtil.Format + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogError(MessageFormatUtil.Format (Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); return ROTATION_0; } @@ -935,7 +939,7 @@ internal static int DetectRotation(byte[] imageBytes) catch (Exception e) { // NOSONAR - LogManager.GetLogger(typeof(TesseractOcrUtil)).Error(MessageFormatUtil.Format + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogError(MessageFormatUtil.Format (Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message)); return ROTATION_0; } @@ -970,7 +974,7 @@ internal static int 
ReadRotationFromMetadata(System.Drawing.Image image) case EXIF_ROTATION_270: return ROTATION_270; default: - LogManager.GetLogger(typeof(TesseractOcrUtil)).Warn(MessageFormatUtil.Format( + ITextLogManager.GetLogger(typeof(TesseractOcrUtil)).LogWarning(MessageFormatUtil.Format( Tesseract4LogMessageConstant.UNSUPPORTED_EXIF_ORIENTATION_VALUE, orientation)); return ROTATION_0; diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs new file mode 100644 index 0000000..070ad1e --- /dev/null +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs @@ -0,0 +1,56 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using iText.Commons.Actions.Data; + +namespace iText.Pdfocr.Tesseract4.Actions.Data { + /// + /// Stores an instance of + /// + /// related to iText pdfOcr Tesseract4 module. 
+ /// + public class PdfOcrTesseract4ProductData { + private const String PDF_OCR_TESSERACT4_PRODUCT_NAME = "pdfOcr-tesseract4"; + + private const String PDF_OCR_TESSERACT4_PUBLIC_PRODUCT_NAME = "pdfOCR-Tesseract4"; + + private const String PDF_OCR_VERSION = "2.0.0"; + + private const int PDF_OCR_COPYRIGHT_SINCE = 2000; + + private const int PDF_OCR_COPYRIGHT_TO = 2021; + + private static readonly ProductData PDF_OCR_PRODUCT_DATA = new ProductData(PDF_OCR_TESSERACT4_PUBLIC_PRODUCT_NAME + , PDF_OCR_TESSERACT4_PRODUCT_NAME, PDF_OCR_VERSION, PDF_OCR_COPYRIGHT_SINCE, PDF_OCR_COPYRIGHT_TO); + + /// + /// Getter for an instance of + /// + /// related to iText pdfOcr Tesseract4 module. + /// + /// iText pdfOcr Tesseract4 product description + public static ProductData GetInstance() { + return PDF_OCR_PRODUCT_DATA; + } + } +} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/events/PdfOcrTesseract4ProductEvent.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/events/PdfOcrTesseract4ProductEvent.cs new file mode 100644 index 0000000..14ef410 --- /dev/null +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/events/PdfOcrTesseract4ProductEvent.cs @@ -0,0 +1,64 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using iText.Commons.Actions; +using iText.Commons.Actions.Confirmations; +using iText.Commons.Actions.Contexts; +using iText.Commons.Actions.Sequence; +using iText.Pdfocr.Tesseract4.Actions.Data; + +namespace iText.Pdfocr.Tesseract4.Actions.Events { + /// Class represents events registered in iText pdfOcr Tesseract4 module. + public class PdfOcrTesseract4ProductEvent : AbstractProductProcessITextEvent { + /// Process image event type. + public const String PROCESS_IMAGE = "process-image"; + + private readonly String eventType; + + /// Creates an event associated with a general identifier and additional meta data. + /// is an identifier associated with the event + /// is an additional meta info + /// is a string description of the event + /// is an event confirmation type + private PdfOcrTesseract4ProductEvent(SequenceId sequenceId, IMetaInfo metaInfo, String eventType, EventConfirmationType + eventConfirmationType) + : base(sequenceId, PdfOcrTesseract4ProductData.GetInstance(), metaInfo, eventConfirmationType) { + this.eventType = eventType; + } + + /// Creates process-image event. 
+ /// is an identifier associated with the event + /// is an additional meta info + /// is an event confirmation type + /// process-image event + public static iText.Pdfocr.Tesseract4.Actions.Events.PdfOcrTesseract4ProductEvent CreateProcessImageEvent( + SequenceId sequenceId, IMetaInfo metaInfo, EventConfirmationType eventConfirmationType) { + return new iText.Pdfocr.Tesseract4.Actions.Events.PdfOcrTesseract4ProductEvent(sequenceId, metaInfo, PROCESS_IMAGE + , eventConfirmationType); + } + + public override String GetEventType() { + return eventType; + } + } +} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/events/PdfOcrTesseract4Event.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/events/PdfOcrTesseract4Event.cs deleted file mode 100644 index 0e2eb7f..0000000 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/events/PdfOcrTesseract4Event.cs +++ /dev/null @@ -1,54 +0,0 @@ -/* -This file is part of the iText (R) project. -Copyright (c) 1998-2021 iText Group NV -Authors: iText Software. - -This program is offered under a commercial and under the AGPL license. -For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. - -AGPL licensing: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . 
-*/ -using System; -using iText.Kernel.Counter.Event; - -namespace iText.Pdfocr.Tesseract4.Events { - /// Class for ocr events - public class PdfOcrTesseract4Event : IGenericEvent { - public static readonly iText.Pdfocr.Tesseract4.Events.PdfOcrTesseract4Event TESSERACT4_IMAGE_OCR = new iText.Pdfocr.Tesseract4.Events.PdfOcrTesseract4Event - ("tesseract4-image-ocr"); - - public static readonly iText.Pdfocr.Tesseract4.Events.PdfOcrTesseract4Event TESSERACT4_IMAGE_TO_PDF = new - iText.Pdfocr.Tesseract4.Events.PdfOcrTesseract4Event("tesseract4-image-to-pdf"); - - public static readonly iText.Pdfocr.Tesseract4.Events.PdfOcrTesseract4Event TESSERACT4_IMAGE_TO_PDFA = new - iText.Pdfocr.Tesseract4.Events.PdfOcrTesseract4Event("tesseract4-image-to-pdfa"); - - private const String PDF_OCR_TESSERACT4_ORIGIN_ID = "iText.Pdfocr.Tesseract4"; - - private readonly String subtype; - - private PdfOcrTesseract4Event(String subtype) { - this.subtype = subtype; - } - - public virtual String GetEventType() { - return "pdfOcr-" + subtype; - } - - public virtual String GetOriginId() { - return PDF_OCR_TESSERACT4_ORIGIN_ID; - } - } -} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrInputTesseract4Exception.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrInputTesseract4Exception.cs new file mode 100644 index 0000000..d951176 --- /dev/null +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrInputTesseract4Exception.cs @@ -0,0 +1,67 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; + +namespace iText.Pdfocr.Tesseract4.Exceptions { + public class PdfOcrInputTesseract4Exception : PdfOcrTesseract4Exception { + /// + /// Creates a new + /// . + /// + /// the detail message. + /// + /// the cause + /// (which is saved for later retrieval + /// by + /// + /// method). + /// + public PdfOcrInputTesseract4Exception(String msg, Exception e) + : base(msg, e) { + } + + /// + /// Creates a new + /// . + /// + /// the detail message. + public PdfOcrInputTesseract4Exception(String msg) + : base(msg) { + } + + /// + /// Creates a new + /// . + /// + /// + /// the cause + /// which is saved for later retrieval + /// by + /// + /// method). + /// + public PdfOcrInputTesseract4Exception(Exception e) + : base(e) { + } + } +} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrTesseract4Exception.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrTesseract4Exception.cs new file mode 100644 index 0000000..a159fd6 --- /dev/null +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrTesseract4Exception.cs @@ -0,0 +1,68 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2021 iText Group NV +Authors: iText Software. + +This program is offered under a commercial and under the AGPL license. 
+For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System; +using iText.Pdfocr.Exceptions; + +namespace iText.Pdfocr.Tesseract4.Exceptions { + public class PdfOcrTesseract4Exception : PdfOcrException { + /// <summary> + /// Creates a new + /// <see cref="PdfOcrTesseract4Exception"/>. + /// </summary> + /// <param name="msg">the detail message.</param> + /// <param name="e"> + /// the cause + /// (which is saved for later retrieval + /// by + /// <see cref="System.Exception.InnerException"/> + /// property). + /// </param> + public PdfOcrTesseract4Exception(String msg, Exception e) + : base(msg, e) { + } + + /// <summary> + /// Creates a new + /// <see cref="PdfOcrTesseract4Exception"/>. + /// </summary> + /// <param name="msg">the detail message.</param> + public PdfOcrTesseract4Exception(String msg) + : base(msg) { + } + + /// <summary> + /// Creates a new + /// <see cref="PdfOcrTesseract4Exception"/>. + /// </summary> + /// <param name="e"> + /// the cause + /// (which is saved for later retrieval + /// by + /// <see cref="System.Exception.InnerException"/> + /// property).
+ /// </param> + public PdfOcrTesseract4Exception(Exception e) + : base(e) { + } + } +} diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrException.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrTesseract4ExceptionMessageConstant.cs similarity index 64% rename from itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrException.cs rename to itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrTesseract4ExceptionMessageConstant.cs index 4994688..31aecb1 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrException.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/exceptions/PdfOcrTesseract4ExceptionMessageConstant.cs @@ -21,13 +21,9 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ using System; -using iText.Pdfocr; - -namespace iText.Pdfocr.Tesseract4 { - public class Tesseract4OcrException : OcrException { - public const String TESSERACT_LIB_NOT_INSTALLED_WIN = "Tesseract failed. " - + "Please ensure you have at least Visual C++ 2015 Redistributable installed"; +namespace iText.Pdfocr.Tesseract4.Exceptions { + public class PdfOcrTesseract4ExceptionMessageConstant { public const String INCORRECT_INPUT_IMAGE_FORMAT = "{0} format is not supported."; public const String INCORRECT_LANGUAGE = "{0} does not exist in {1}"; @@ -36,40 +32,27 @@ public class Tesseract4OcrException : OcrException { public const String CANNOT_READ_PROVIDED_IMAGE = "Cannot read input image {0}"; - public const String TESSERACT_FAILED = "Tesseract failed. " + "Please check provided parameters"; + public const String CANNOT_WRITE_TO_FILE = "Cannot write to file {0}: {1}"; + + public const String TESSERACT_FAILED = "Tesseract failed. Please check provided parameters"; + + public const String TESSERACT_LIB_NOT_INSTALLED = "Tesseract failed.
Please ensure you have tesseract library installed"; - public const String TESSERACT_LIB_NOT_INSTALLED = "Tesseract failed. " + "Please ensure you have tesseract library installed"; + public const String TESSERACT_LIB_NOT_INSTALLED_WIN = "Tesseract failed. Please ensure you have latest Visual C++ Redistributable installed"; - public const String TESSERACT_NOT_FOUND = "Tesseract failed. " + "Please check that tesseract is installed and provided path to " + public const String TESSERACT_NOT_FOUND = "Tesseract failed. Please check that tesseract is installed and provided path to " + "tesseract executable directory is correct"; public const String CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE = "Cannot find path to tesseract executable."; - public const String PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID = "Provided path to tess data directory does not exist or it is " - + "an invalid directory"; + public const String PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID = "Provided path to tess data directory does not exist or it is an invalid directory"; - public const String PATH_TO_TESS_DATA_IS_NOT_SET = "Path to tess data directory cannot be null and must be set " - + "to a valid directory"; + public const String PATH_TO_TESS_DATA_IS_NOT_SET = "Path to tess data directory cannot be null and must be set to a valid directory"; public const String PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS = "Path to tess data should contain only ASCII characters"; - /// Creates a new TesseractException. - /// the detail message. - /// - /// the cause - /// (which is saved for later retrieval - /// by - /// - /// method). - /// - public Tesseract4OcrException(String msg, Exception e) - : base(msg, e) { - } - - /// Creates a new TesseractException. - /// the detail message. 
- public Tesseract4OcrException(String msg) - : base(msg) { + private PdfOcrTesseract4ExceptionMessageConstant() { } + //Private constructor will prevent the instantiation of this class directly } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LogMessageConstant.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/logs/Tesseract4LogMessageConstant.cs similarity index 83% rename from itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LogMessageConstant.cs rename to itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/logs/Tesseract4LogMessageConstant.cs index fa8de35..e9df364 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LogMessageConstant.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/logs/Tesseract4LogMessageConstant.cs @@ -22,7 +22,7 @@ You should have received a copy of the GNU Affero General Public License */ using System; -namespace iText.Pdfocr.Tesseract4 { +namespace iText.Pdfocr.Tesseract4.Logs { public class Tesseract4LogMessageConstant { public const String TESSERACT_FAILED = "Tesseract failed: {0}"; @@ -30,7 +30,7 @@ public class Tesseract4LogMessageConstant { public const String CANNOT_READ_FILE = "Cannot read file {0}: {1}"; - public const String CANNOT_OCR_INPUT_FILE = "Cannot ocr input file: {1}"; + public const String CANNOT_OCR_INPUT_FILE = "Cannot ocr input file: {0}"; public const String CANNOT_USE_USER_WORDS = "Cannot use custom user words: {0}"; @@ -40,14 +40,11 @@ public class Tesseract4LogMessageConstant { public const String CANNOT_DELETE_FILE = "File {0} cannot be deleted: {1}"; - public const String CANNOT_PROCESS_IMAGE = "Cannot process " + "image: {0}"; - - public const String CANNOT_WRITE_TO_FILE = "Cannot write to file {0}: {1}"; + public const String CANNOT_PROCESS_IMAGE = "Cannot process image: {0}"; public const String CREATED_TEMPORARY_FILE = "Created temp file {0}"; - /// Constant is not used. 
- [System.ObsoleteAttribute(@"since 1.0.1. Will be removed in 2.0.0")] + // Constant is used only in .NET version, but it's kept here for the sake of consistency and autoporting. public const String CANNOT_CONVERT_IMAGE_TO_GRAYSCALE = "Cannot convert to gray image with depth {0}"; public const String CANNOT_BINARIZE_IMAGE = "Cannot binarize image with depth {0}"; @@ -58,11 +55,7 @@ public class Tesseract4LogMessageConstant { public const String CANNOT_READ_INPUT_IMAGE = "Cannot read input image {0}"; - public const String CANNOT_GET_TEMPORARY_DIRECTORY = "Cannot get " + "temporary directory: {0}"; - - /// Constant is not used. - [System.ObsoleteAttribute(@"since 1.0.1. Will be removed in 2.0.0")] - public const String CANNOT_CONVERT_IMAGE_TO_PIX = "Cannot convert image to pix: {0}"; + public const String CANNOT_GET_TEMPORARY_DIRECTORY = "Cannot get temporary directory: {0}"; public const String CANNOT_PARSE_NODE_BBOX = "Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}"; @@ -72,5 +65,6 @@ public class Tesseract4LogMessageConstant { private Tesseract4LogMessageConstant() { } + //Private constructor will prevent the instantiation of this class directly } } diff --git a/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec b/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec index 113211e..9358518 100644 --- a/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec +++ b/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec @@ -2,7 +2,7 @@ itext7.pdfocr.tesseract4 - 1.0.3 + 2.0.0 iText 7 pdfOcr iText Software iText Software @@ -17,8 +17,8 @@ OCR PDF ligatures text glyphs iText Optical Character Recognition PDF/A ISO-compliant Tesseract open-source opensource English Mandarin Chinese Hindi Spanish French Arabic Bengali Russian Portuguese Indonesian scan image extractable data searchable diacritic sdk c# .net - - + + @@ -27,10 +27,10 @@ - - + + - + diff --git a/port-hash b/port-hash index 1fdc3a4..381754d 100644 --- a/port-hash +++ b/port-hash @@ -1 +1 @@ 
-c438260f7e5f29ec0bfe0306e06fb1a5ce0bd6db +fb9aa93bb391504fd844c0010192124ce0d7fc49