Skip to content

Commit

Permalink
[RELEASE] iText 7 pdfOcr - 1.0.2
Browse files Browse the repository at this point in the history
https://git.itextsupport.com/

* release/1.0.2:
  [RELEASE] 1.0.2-SNAPSHOT -> 1.0.2
  Update port-hash
  thai_03 test fails in .NET. Might be related to reading UTF-8 files issue
  Combine HOCR and TXT outputs for more precise text recognition
  Deprecate unused log message constant
  Stabilize test on different Tesseract versions
  Add possibility to set image preprocessing properties
  Tesseract does not respect image rotation when doing OCR
  Update port-hash
  Use tesseract executable from path instead of tesseractDir in tests
  If path to tessdata contains non ASCII characters, code unexpectedly fails
  TextInfo: move from List<Float> to Rectangle
  Use generalized Jenkinsfile in the pipeline-library
  Deprecate Tesseract4LogMessageConstant#CANNOT_CONVERT_IMAGE_TO_GRAYSCALE
  Update autoported files
  Non-Ascii characters support for the output file
  Use ImageTypeDetector from io module to detect image types
  Use new SystemUtil#runProcessAndWait overload from 7.1.12-SNAPSHOT accepting working directory
  Update port-hash
  Update port-hash after release
  [RELEASE] Update dependency versions
  • Loading branch information
iText-CI committed Oct 8, 2020
2 parents 0400706 + 6c3e305 commit 6386958
Show file tree
Hide file tree
Showing 72 changed files with 2,227 additions and 727 deletions.
219 changes: 4 additions & 215 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,219 +1,8 @@
#!/usr/bin/env groovy
@Library('pipeline-library')_

def schedule = env.BRANCH_NAME.contains('master') ? '@monthly' : env.BRANCH_NAME == 'develop' ? '@midnight' : ''
def repoName = "pdfOcr"
def dependencyRegex = "itextcore"
def solutionFile = "i7n-ocr.sln"

// Declarative Jenkins pipeline for the .NET build: cleans the workspace, restores and
// builds i7n-ocr.sln, packs the NuGet packages, runs the NUnit test DLLs, uploads the
// resulting *.nupkg files with the JFrog CLI and archives them.
pipeline {

// All stages run on an agent labelled 'windows'.
agent { label 'windows' }

environment {
// Resolves the 'Tesseract' custom tool and exposes the result as tesseractDir.
tesseractDir = tool name: 'Tesseract', type: 'com.cloudbees.jenkins.plugins.customtools.CustomTool'
}

options {
ansiColor('xterm')
buildDiscarder(logRotator(artifactNumToKeepStr: '1'))
parallelsAlwaysFailFast()
retry(1)
skipStagesAfterUnstable()
// Overall limit for the whole run; individual stages set their own timeouts below.
timeout(time: 60, unit: 'MINUTES')
timestamps()
}

triggers {
// `schedule` is defined outside this block, before the pipeline declaration.
cron(schedule)
}

stages {
stage('Abort possible previous builds') {
steps {
script {
// abortPreviousBuilds is not defined in this file; presumably supplied by the shared pipeline library - confirm.
abortPreviousBuilds()
}
}
}
stage('Wait for blocking jobs') {
steps {
script {
// NOTE(review): this is written as a subscript expression (`properties[[...]]`), not as a
// call to the `properties` step (`properties([[...]])`) - confirm that the build blocker
// configuration is actually applied.
properties[[
$class : 'BuildBlockerProperty',
blockLevel : 'GLOBAL',
blockingJobs : "^iText_7_Java/itextcore/$env.JOB_BASE_NAME\$",
scanQueueFor : 'ALL',
useBuildBlocker: true
]]
}
}
}
stage('Clean workspace') {
options {
timeout(time: 5, unit: 'MINUTES')
}
steps {
// Only the listed NuGet caches, build output folders and package files are deleted.
cleanWs deleteDirs: true, patterns: [
[pattern: 'packages', type: 'INCLUDE'],
[pattern: 'global-packages', type: 'INCLUDE'],
[pattern: 'tmp/NuGetScratch', type: 'INCLUDE'],
[pattern: 'http-cache', type: 'INCLUDE'],
[pattern: 'plugins-cache', type: 'INCLUDE'],
[pattern: '**/obj', type: 'INCLUDE'],
[pattern: '**/bin', type: 'INCLUDE'],
[pattern: '**/*.nupkg', type: 'INCLUDE']
]
}
}
stage('Compile') {
options {
timeout(time: 20, unit: 'MINUTES')
}
steps {
echo "Tesseract directory is ${tesseractDir}"

// Points the NuGet caches at workspace-local folders (the ones removed by 'Clean workspace').
// gsExec and compareExec are not defined in this block; presumably they come from the
// agent environment - confirm.
withEnv(["NUGET_PACKAGES=${env.WORKSPACE}/global-packages",
"temp=${env.WORKSPACE}/tmp/NuGetScratch",
"NUGET_HTTP_CACHE_PATH=${env.WORKSPACE}/http-cache", "NUGET_PLUGINS_CACHE_PATH=${env.WORKSPACE}/plugins-cache", "gsExec=${gsExec}", "compareExec=${compareExec}", "tesseractDir=${tesseractDir}"]) {
bat "\"${env.NuGet}\" restore i7n-ocr.sln"
bat "dotnet restore i7n-ocr.sln"
bat "dotnet build i7n-ocr.sln --configuration Release --source ${env.WORKSPACE}/packages"
script {
// Generates packAll.groovy (one `nuget pack` command per .nuspec file) and then runs it.
createPackAllFile(findFiles(glob: '**/*.nuspec'))
load 'packAll.groovy'
}
}
}
}
stage('Run Tests') {
options {
timeout(time: 60, unit: 'MINUTES')
}
steps {
echo "Tesseract directory is ${tesseractDir}"

// Same environment as the 'Compile' stage.
withEnv(["NUGET_PACKAGES=${env.WORKSPACE}/global-packages",
"temp=${env.WORKSPACE}/tmp/NuGetScratch",
"NUGET_HTTP_CACHE_PATH=${env.WORKSPACE}/http-cache", "NUGET_PLUGINS_CACHE_PATH=${env.WORKSPACE}/plugins-cache", "gsExec=${gsExec}", "compareExec=${compareExec}", "tesseractDir=${tesseractDir}"]) {
script {
// Generates runTestDlls.groovy (one NUnit console command per test DLL) and then runs it.
createRunTestDllsFile(findFiles(glob: '**/itext.*.tests.dll'))
load 'runTestDlls.groovy'
}
}
}
}
stage('Artifactory Deploy') {
options {
timeout(time: 5, unit: 'MINUTES')
}
// Runs only for the master and develop branches.
when {
anyOf {
branch "master"
branch "develop"
}
}
steps {
script {
getAndConfigureJFrogCLI()
findFiles(glob: '*.nupkg').each { item ->
upload(item)
}
}
}
}
stage('Branch Artifactory Deploy') {
options {
timeout time: 5, unit: 'MINUTES'
}
// Runs for every branch except master and develop.
when {
not {
anyOf {
branch "master"
branch "develop"
}
}
}
steps {
script {
getAndConfigureJFrogCLI()
if (env.GIT_URL) {
// Takes the repository name from the last path segment of GIT_URL, without the ".git" suffix.
repoName = ("$env.GIT_URL" =~ /(.*\/)(.*)(\.git)/)[0][2]
findFiles(glob: '*.nupkg').each { item ->
// NOTE(review): this uses `sh` although the agent is labelled 'windows' - confirm a shell is available there.
sh "./jfrog rt u \"$item.path\" branch-artifacts/$env.BRANCH_NAME/$repoName/dotnet/ --recursive=false --build-name $env.BRANCH_NAME --build-number $env.BUILD_NUMBER --props \"vcs.revision=$env.GIT_COMMIT;repo.name=$repoName\""
}
}
}
}
}
stage('Archive Artifacts') {
options {
timeout(time: 5, unit: 'MINUTES')
}
steps {
archiveArtifacts allowEmptyArchive: true, artifacts: '*.nupkg'
}
}
}

// Status messages per build result; the workspace is wiped only after a successful build.
post {
always {
echo 'One way or another, I have finished \uD83E\uDD16'
}
success {
echo 'I succeeeded! \u263A'
cleanWs deleteDirs: true
}
unstable {
echo 'I am unstable \uD83D\uDE2E'
}
failure {
echo 'I failed \uD83D\uDCA9'
}
changed {
echo 'Things were different before... \uD83E\uDD14'
}
}

}

@NonCPS // has to be NonCPS or the build breaks on the closure iteration
def createPackAllFile(list) {
    // The pack commands are written to packAll.groovy instead of being executed here,
    // because calling bat directly breaks the loop.
    // Files whose path contains "packages" are left out.
    def packable = list.findAll { candidate -> !candidate.path.contains("packages") }
    def commands = packable.collect { nuspec ->
        def nugetExe = env.NuGet.replace('\\','\\\\')
        def nuspecPath = nuspec.path.replace('\\','\\\\')
        "bat '\"${nugetExe}\" pack \"${nuspecPath}\"'\n"
    }
    writeFile file: 'packAll.groovy', text: commands.join('')
}

@NonCPS // has to be NonCPS or the build breaks on the closure iteration
def createRunTestDllsFile(list) {
    // The test commands are written to runTestDlls.groovy instead of being executed here,
    // because calling bat directly breaks the loop.
    def workspaceDir = env.WORKSPACE.replace('\\','\\\\')
    def nunitConsole = env.'Nunit3-console'.replace('\\','\\\\')
    // Skip DLLs under a path containing "netcoreapp1.0" or "obj".
    def testDlls = list.findAll { dll ->
        !(dll.path.contains("netcoreapp1.0") || dll.path.contains("obj"))
    }
    def commands = testDlls.collect { dll ->
        def dllPath = dll.path.replace('\\','\\\\')
        "bat '\"${nunitConsole}\" \"${workspaceDir}\\\\${dllPath}\" --result=${dll.name}-TestResult.xml'\n"
    }
    writeFile file: 'runTestDlls.groovy', text: commands.join('')
}

@NonCPS // has to be NonCPS or the build breaks on the closure iteration
def createRunTestCsProjsFile(list) {
    // The `dotnet test` commands are written to runTestCsProjs.groovy instead of being
    // executed here, because calling bat directly breaks the loop.
    def workspaceDir = env.WORKSPACE.replace('\\','\\\\')
    def commands = list.collect { project ->
        def projectPath = project.path.replace('\\','\\\\')
        "bat 'dotnet test ${workspaceDir}\\\\${projectPath} --configuration Release --no-build --logger \"trx;LogFileName=results.trx\"'\n"
    }
    writeFile file: 'runTestCsProjs.groovy', text: commands.join('')
}

@NonCPS
def upload(item) {
    // Uploads one NuGet package with the JFrog CLI into nuget/<package id>/.
    // The regex splits the item's string form into the package id (group 1) and the
    // ".<major>.<minor>.<patch>[-SNAPSHOT].nupkg" suffix; only group 1 is used.
    // NOTE(review): an item that does not match the pattern makes itemArray[0] fail - confirm
    // that every uploaded *.nupkg follows this naming scheme.
    def itemArray = (item =~ /(.*?)(\.[0-9]*\.[0-9]*\.[0-9]*(-SNAPSHOT)?\.nupkg)/)
    def dir = itemArray[ 0 ][ 1 ]
    // The quotes around the build name are escaped; left unescaped they ended the string
    // literal early and the command line was malformed.
    sh "./jfrog rt u \"${item.path}\" nuget/${dir}/ --flat=false --build-name=\"${env.BRANCH_NAME}\" --build-number=${env.BUILD_NUMBER}"
}
automaticDotnetBuild(repoName, dependencyRegex, solutionFile)
2 changes: 1 addition & 1 deletion doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.

PROJECT_NAME = "pdfOCR 1.0.1 API"
PROJECT_NAME = "pdfOCR 1.0.2 API"

# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
Expand Down
6 changes: 3 additions & 3 deletions itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@

[assembly: Guid("d6a6ea97-1f23-448f-b700-eff62971d234")]

[assembly: AssemblyVersion("1.0.1.0")]
[assembly: AssemblyFileVersion("1.0.1.0")]
[assembly: AssemblyInformationalVersion("1.0.1")]
[assembly: AssemblyVersion("1.0.2.0")]
[assembly: AssemblyFileVersion("1.0.2.0")]
[assembly: AssemblyInformationalVersion("1.0.2")]
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj" Condition="Exists('..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj')" />
<PackageReference Include="itext7.pdftest" Version="7.1.12" Condition="!Exists('..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj')" />
<PackageReference Include="itext7.pdftest" Version="7.1.13" Condition="!Exists('..\..\..\itextcore\itext\itext.pdftest\itext.pdftest.netstandard.csproj')" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.6.0" />
<PackageReference Include="NUnit" Version="3.12.0" />
<PackageReference Include="NUnit3TestAdapter" Version="3.16.1">
Expand Down
66 changes: 66 additions & 0 deletions itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ You should have received a copy of the GNU Affero General Public License
using System;
using System.Collections.Generic;
using System.IO;
using iText.IO.Image;
using iText.IO.Util;
using iText.Kernel.Colors;
using iText.Kernel.Font;
using iText.Kernel.Geom;
using iText.Pdfocr.Helpers;
using iText.Test;
using iText.Test.Attributes;
Expand All @@ -39,6 +41,20 @@ public virtual void TestTextInfo() {
NUnit.Framework.Assert.AreEqual(1, result.Count);
TextInfo textInfo = new TextInfo();
textInfo.SetText("text");
textInfo.SetBboxRect(new Rectangle(204.0f, 158.0f, 538.0f, 136.0f));
int page = 2;
result.Put(page, JavaCollectionsUtil.SingletonList<TextInfo>(textInfo));
NUnit.Framework.Assert.AreEqual(2, result.Count);
NUnit.Framework.Assert.AreEqual(textInfo.GetText(), result.Get(page)[0].GetText());
}

[NUnit.Framework.Test]
public virtual void TestTextInfoDeprecationMode() {
String path = PdfHelper.GetDefaultImagePath();
IDictionary<int, IList<TextInfo>> result = new CustomOcrEngine(true).DoImageOcr(new FileInfo(path));
NUnit.Framework.Assert.AreEqual(1, result.Count);
TextInfo textInfo = new TextInfo();
textInfo.SetText("text");
textInfo.SetBbox(JavaUtil.ArraysAsList(204.0f, 158.0f, 742.0f, 294.0f));
int page = 2;
result.Put(page, JavaCollectionsUtil.SingletonList<TextInfo>(textInfo));
Expand All @@ -60,5 +76,55 @@ public virtual void TestThaiImageWithNotDefGlyphs() {
String fontName = font.GetFontProgram().GetFontNames().GetFontName();
NUnit.Framework.Assert.IsTrue(fontName.Contains("LiberationSans"));
}

[NUnit.Framework.Test]
public virtual void TestImageRotationHandler() {
    // PDF creation for a rotated JPEG is expected to fail with the message thrown by
    // NotImplementedImageRotationHandler.
    NUnit.Framework.TestDelegate createPdfWithThrowingHandler = () => {
        OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
        creatorProperties.SetImageRotationHandler(new ApiTest.NotImplementedImageRotationHandler());
        String imagePath = PdfHelper.GetImagesTestDirectory() + "90_degrees_rotated.jpg";
        String outputPdfPath = PdfHelper.GetTargetDirectory() + "testSetAndGetImageRotationHandler" + ".pdf";
        PdfHelper.CreatePdf(outputPdfPath, new FileInfo(imagePath), creatorProperties);
        NUnit.Framework.Assert.IsNotNull(creatorProperties.GetImageRotationHandler());
    };
    NUnit.Framework.Assert.That(
        createPdfWithThrowingHandler,
        NUnit.Framework.Throws.InstanceOf<Exception>().With.Message.EqualTo("applyRotation is not implemented"));
}

[NUnit.Framework.Test]
public virtual void TestImageRotationHandlerForTiff() {
    // PDF creation for a multipage TIFF is expected to fail with the message thrown by
    // NotImplementedImageRotationHandler.
    // NOTE(review): the output file name is the same as in TestImageRotationHandler, so both
    // tests write to the same target path - confirm this is intended.
    NUnit.Framework.TestDelegate createPdfWithThrowingHandler = () => {
        OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
        creatorProperties.SetImageRotationHandler(new ApiTest.NotImplementedImageRotationHandler());
        String imagePath = PdfHelper.GetImagesTestDirectory() + "multipage.tiff";
        String outputPdfPath = PdfHelper.GetTargetDirectory() + "testSetAndGetImageRotationHandler" + ".pdf";
        PdfHelper.CreatePdf(outputPdfPath, new FileInfo(imagePath), creatorProperties);
        NUnit.Framework.Assert.IsNotNull(creatorProperties.GetImageRotationHandler());
    };
    NUnit.Framework.Assert.That(
        createPdfWithThrowingHandler,
        NUnit.Framework.Throws.InstanceOf<Exception>().With.Message.EqualTo("applyRotation is not implemented"));
}

/// <summary>
/// Rotation handler stub whose <c>ApplyRotation</c> always throws; the rotation handler
/// tests in this class match on the exception message.
/// </summary>
internal class NotImplementedImageRotationHandler : IImageRotationHandler {
    private const String NOT_IMPLEMENTED_MESSAGE = "applyRotation is not implemented";

    /// <summary>Always throws; the image data is never inspected.</summary>
    public virtual ImageData ApplyRotation(ImageData imageData) {
        throw new Exception(NOT_IMPLEMENTED_MESSAGE);
    }
}

[LogMessage(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, Count = 7)]
[NUnit.Framework.Test]
public virtual void TestThaiImageWithNotDefGlyphsDeprecationMode() {
    // Creates a PDF from the Thai test image with textInfoDeprecationMode enabled and checks
    // that the font found in the result is a LiberationSans font.
    String thaiImagePath = PdfHelper.GetThaiImagePath();
    String outputPdfPath = PdfHelper.GetTargetDirectory() + "testThaiImageWithNotdefGlyphs" + ".pdf";
    FileInfo thaiImage = new FileInfo(thaiImagePath);
    OcrPdfCreatorProperties blackTextProperties = new OcrPdfCreatorProperties().SetTextColor(DeviceRgb.BLACK);
    PdfHelper.CreatePdf(outputPdfPath, thaiImage, blackTextProperties, true);

    PdfFont extractedFont = PdfHelper.GetExtractionStrategy(outputPdfPath).GetPdfFont();
    String extractedFontName = extractedFont.GetFontProgram().GetFontNames().GetFontName();
    NUnit.Framework.Assert.IsTrue(extractedFontName.Contains("LiberationSans"));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ You should have received a copy of the GNU Affero General Public License
using System.IO;
using iText.IO.Util;
using iText.Kernel.Counter.Event;
using iText.Kernel.Geom;
using iText.Pdfocr;
using iText.Pdfocr.Events;

Expand All @@ -34,7 +35,14 @@ public class CustomOcrEngine : IOcrEngine, IThreadLocalMetaInfoAware {

private IMetaInfo threadLocalMetaInfo;

public CustomOcrEngine() {
private bool textInfoDeprecationMode = false;

/// <summary>
/// Creates an engine with <c>textInfoDeprecationMode</c> set to <c>false</c>.
/// </summary>
public CustomOcrEngine()
: this(false) {
}

/// <summary>
/// Creates an engine and stores the given <c>textInfoDeprecationMode</c> flag.
/// </summary>
/// <param name="textInfoDeprecationMode">value stored in the field of the same name</param>
public CustomOcrEngine(bool textInfoDeprecationMode) {
this.textInfoDeprecationMode = textInfoDeprecationMode;
}

public CustomOcrEngine(OcrEngineProperties ocrEngineProperties) {
Expand All @@ -47,7 +55,8 @@ public virtual IDictionary<int, IList<TextInfo>> DoImageOcr(FileInfo input) {
if (input.FullName.Contains(PdfHelper.THAI_IMAGE_NAME)) {
text = PdfHelper.THAI_TEXT;
}
TextInfo textInfo = new TextInfo(text, JavaUtil.ArraysAsList(204.0f, 158.0f, 742.0f, 294.0f));
TextInfo textInfo = this.textInfoDeprecationMode ? new TextInfo(text, JavaUtil.ArraysAsList(204.0f, 158.0f
, 742.0f, 294.0f)) : new TextInfo(text, new Rectangle(204.0f, 158.0f, 538.0f, 136.0f));
result.Put(1, JavaCollectionsUtil.SingletonList<TextInfo>(textInfo));
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,16 @@ public static String GetTextFromPdfLayerUseActualText(String pdfPath, String lay
/// of properties and save to the given path.
/// </summary>
public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties) {
    // Delegates to the four-argument overload with textInfoDeprecationMode set to false.
    // The OcrPdfCreator that used to be built here was never used and has been removed.
    CreatePdf(pdfPath, inputFile, properties, false);
}

/// <summary>
/// Perform OCR with custom ocr engine using provided input image and set
/// of properties and save to the given path.
/// </summary>
public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties, bool
textInfoDeprecationMode) {
OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(textInfoDeprecationMode), properties);
try {
using (PdfWriter pdfWriter = GetPdfWriter(pdfPath)) {
ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(inputFile), pdfWriter).Close();
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 6386958

Please sign in to comment.