Skip to content

Commit cec7cc4

Browse files
alexander.kischuk, Snipx
authored and committed
Add possibility to set image preprocessing properties
PDFOC-102
1 parent 3006e1c commit cec7cc4

12 files changed

Lines changed: 279 additions & 33 deletions

File tree

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
This file is part of the iText (R) project.
3+
Copyright (c) 1998-2020 iText Group NV
4+
Authors: iText Software.
5+
6+
This program is offered under a commercial and under the AGPL license.
7+
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
8+
9+
AGPL licensing:
10+
This program is free software: you can redistribute it and/or modify
11+
it under the terms of the GNU Affero General Public License as published by
12+
the Free Software Foundation, either version 3 of the License, or
13+
(at your option) any later version.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU Affero General Public License for more details.
19+
20+
You should have received a copy of the GNU Affero General Public License
21+
along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
*/
23+
package com.itextpdf.pdfocr.tesseract4;
24+
25+
/**
 * Additional options applied on image preprocessing step.
 *
 * <p>Holds the adaptive threshold tile dimensions and the smoothing flag.
 * All setters return the same instance so that calls can be chained.
 */
public class ImagePreprocessingOptions {

    /**
     * Adaptive threshold tile width as described here: http://www.leptonica.org/binarization.html.
     * Default value of 0 is considered as full image width which means no tiling.
     */
    private int tileWidth;

    /**
     * Adaptive threshold tile height as described here: http://www.leptonica.org/binarization.html.
     * Default value of 0 is considered as full image height which means no tiling.
     */
    private int tileHeight;

    /**
     * Adaptive threshold smoothing as described here: http://www.leptonica.org/binarization.html.
     * Enabled by default.
     */
    private boolean smoothTiling = true;

    /**
     * Creates a new {@link ImagePreprocessingOptions} instance with default
     * values: tile width and tile height of 0 and smoothing enabled.
     */
    public ImagePreprocessingOptions() {
    }

    /**
     * Creates a new {@link ImagePreprocessingOptions} instance as a copy of
     * the provided one.
     *
     * @param imagePreprocessingOptions options to copy the values from;
     *                                  must not be {@code null}
     */
    public ImagePreprocessingOptions(ImagePreprocessingOptions imagePreprocessingOptions) {
        this.tileWidth = imagePreprocessingOptions.tileWidth;
        this.tileHeight = imagePreprocessingOptions.tileHeight;
        this.smoothTiling = imagePreprocessingOptions.smoothTiling;
    }

    /**
     * Gets the adaptive threshold tile width.
     *
     * @return tile width
     */
    public final int getTileWidth() {
        return tileWidth;
    }

    /**
     * Sets the adaptive threshold tile width.
     *
     * @param tileWidth tile width
     * @return this {@link ImagePreprocessingOptions} instance
     */
    public final ImagePreprocessingOptions setTileWidth(int tileWidth) {
        this.tileWidth = tileWidth;
        return this;
    }

    /**
     * Gets the adaptive threshold tile height.
     *
     * @return tile height
     */
    public final int getTileHeight() {
        return tileHeight;
    }

    /**
     * Sets the adaptive threshold tile height.
     *
     * @param tileHeight tile height
     * @return this {@link ImagePreprocessingOptions} instance
     */
    public final ImagePreprocessingOptions setTileHeight(int tileHeight) {
        this.tileHeight = tileHeight;
        return this;
    }

    /**
     * Gets the adaptive threshold smoothing flag.
     *
     * @return smooth tiling flag
     */
    public final boolean isSmoothTiling() {
        return smoothTiling;
    }

    /**
     * Sets the adaptive threshold smoothing flag.
     *
     * @param smoothTiling smooth tiling flag
     * @return this {@link ImagePreprocessingOptions} instance
     */
    public final ImagePreprocessingOptions setSmoothTiling(boolean smoothTiling) {
        this.smoothTiling = smoothTiling;
        return this;
    }
}

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtil.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,17 @@ static BufferedImage readAsPixAndConvertToBufferedImage(
142142
* Performs basic image preprocessing using buffered image (if provided).
143143
* Preprocessed image will be saved in temporary directory.
144144
*
145-
* @param inputFile input image {@link java.io.File}
145+
* @param inputFile input image {@link File}
146146
* @param pageNumber number of page to be preprocessed
147+
* @param imagePreprocessingOptions {@link ImagePreprocessingOptions}
147148
* @return created preprocessed image as {@link net.sourceforge.lept4j.Pix}
148149
* @throws Tesseract4OcrException if it was not possible to read or convert
149150
* input file
150151
*/
151152
static Pix preprocessImage(final File inputFile,
152-
final int pageNumber) throws Tesseract4OcrException {
153+
final int pageNumber,
154+
final ImagePreprocessingOptions imagePreprocessingOptions)
155+
throws Tesseract4OcrException {
153156
Pix pix = null;
154157
// read image
155158
if (isTiffImage(inputFile)) {
@@ -163,7 +166,7 @@ static Pix preprocessImage(final File inputFile,
163166
Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE)
164167
.setMessageParams(inputFile.getAbsolutePath());
165168
}
166-
return TesseractOcrUtil.preprocessPix(pix);
169+
return TesseractOcrUtil.preprocessPix(pix, imagePreprocessingOptions);
167170
}
168171

169172
/**

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,8 +366,9 @@ private String preprocessImage(final File inputImage,
366366
try {
367367
if (getTesseract4OcrEngineProperties().isPreprocessingImages()) {
368368
Pix pix = ImagePreprocessingUtil
369-
.preprocessImage(inputImage, pageNumber);
370-
TesseractOcrUtil.savePixToTempPngFile(tmpFileName, pix);
369+
.preprocessImage(inputImage, pageNumber,
370+
getTesseract4OcrEngineProperties().getImagePreprocessingOptions());
371+
TesseractOcrUtil.savePixToPngFile(tmpFileName, pix);
371372
if (!Files.exists(Paths.get(tmpFileName))) {
372373
BufferedImage img = TesseractOcrUtil.convertPixToImage(pix);
373374
if (img != null) {

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,8 @@ private String getOcrResultForSinglePage(final File inputImage,
283283
result = new TesseractOcrUtil().getOcrResultAsString(
284284
getTesseractInstance(),
285285
ImagePreprocessingUtil
286-
.preprocessImage(inputImage, pageNumber),
286+
.preprocessImage(inputImage, pageNumber,
287+
getTesseract4OcrEngineProperties().getImagePreprocessingOptions()),
287288
outputFormat);
288289
}
289290
if (result == null) {

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,6 @@ public class Tesseract4OcrEngineProperties extends OcrEngineProperties {
6666
*/
6767
private Integer pageSegMode = 3;
6868

69-
/**
70-
* "True" - if images need to be preprocessed, otherwise - false.
71-
* True by default.
72-
*/
73-
private boolean preprocessingImages = true;
74-
7569
/**
7670
* Defines the way text is retrieved from tesseract output.
7771
* Default text positioning is by lines.
@@ -90,6 +84,11 @@ public class Tesseract4OcrEngineProperties extends OcrEngineProperties {
9084
*/
9185
private boolean isUserWordsFileTemporary = false;
9286

87+
/**
88+
* Settings for image preprocessing.
89+
*/
90+
private ImagePreprocessingOptions imagePreprocessingOptions = new ImagePreprocessingOptions();
91+
9392
/**
9493
* Creates a new {@link Tesseract4OcrEngineProperties} instance.
9594
*/
@@ -107,9 +106,9 @@ public Tesseract4OcrEngineProperties(Tesseract4OcrEngineProperties other) {
107106
super(other);
108107
this.tessDataDir = other.tessDataDir;
109108
this.pageSegMode = other.pageSegMode;
110-
this.preprocessingImages = other.preprocessingImages;
111109
this.textPositioning = other.textPositioning;
112110
this.pathToUserWordsFile = other.pathToUserWordsFile;
111+
this.imagePreprocessingOptions = other.imagePreprocessingOptions;
113112
}
114113

115114
/**
@@ -194,7 +193,7 @@ public final Tesseract4OcrEngineProperties setPageSegMode(
194193
* @return true if images need to be preprocessed, otherwise - false
195194
*/
196195
public final boolean isPreprocessingImages() {
197-
return preprocessingImages;
196+
return imagePreprocessingOptions != null;
198197
}
199198

200199
/**
@@ -206,7 +205,13 @@ public final boolean isPreprocessingImages() {
206205
*/
207206
public final Tesseract4OcrEngineProperties setPreprocessingImages(
208207
final boolean preprocess) {
209-
preprocessingImages = preprocess;
208+
if (preprocess) {
209+
if (imagePreprocessingOptions == null) {
210+
imagePreprocessingOptions = new ImagePreprocessingOptions();
211+
}
212+
} else {
213+
imagePreprocessingOptions = null;
214+
}
210215
return this;
211216
}
212217

@@ -387,4 +392,25 @@ final boolean isUserWordsFileTemporary() {
387392
return isUserWordsFileTemporary;
388393
}
389394

395+
/**
396+
* Gets {@link #imagePreprocessingOptions}.
397+
* @return {@link ImagePreprocessingOptions}
398+
*/
399+
final public ImagePreprocessingOptions getImagePreprocessingOptions() {
400+
return imagePreprocessingOptions;
401+
}
402+
403+
/**
404+
* Sets {@link #imagePreprocessingOptions}.
405+
* @param imagePreprocessingOptions {@link ImagePreprocessingOptions}
406+
* @return the {@link Tesseract4OcrEngineProperties} instance
407+
*/
408+
final public Tesseract4OcrEngineProperties setImagePreprocessingOptions(
409+
ImagePreprocessingOptions imagePreprocessingOptions) {
410+
this.imagePreprocessingOptions = imagePreprocessingOptions;
411+
return this;
412+
}
413+
390414
}
415+
416+

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtil.java

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,15 @@ static Pix readPixPageFromTiff(final File inputFile,
126126
* converting to grayscale,
127127
* thresholding.
128128
*
129-
* @param pix {@link net.sourceforge.lept4j.Pix} object to be processed
129+
* @param pix {@link Pix} object to be processed
130+
* @param imagePreprocessingOptions {@link ImagePreprocessingOptions}
130131
* @return preprocessed {@link net.sourceforge.lept4j.Pix} object
131132
*/
132-
static Pix preprocessPix(Pix pix) {
133-
pix = convertToGrayscale(pix);
134-
pix = otsuImageThresholding(pix);
135-
return pix;
133+
static Pix preprocessPix(final Pix pix,
134+
final ImagePreprocessingOptions imagePreprocessingOptions) {
135+
Pix pix1 = convertToGrayscale(pix);
136+
pix1 = otsuImageThresholding(pix1, imagePreprocessingOptions);
137+
return pix1;
136138
}
137139

138140
/**
@@ -165,18 +167,27 @@ static Pix convertToGrayscale(final Pix pix) {
165167
* {@link net.sourceforge.lept4j.Leptonica#pixOtsuAdaptiveThreshold}
166168
* method.
167169
*
168-
* @param pix {@link net.sourceforge.lept4j.Pix} object to be processed
170+
* @param pix {@link Pix} object to be processed
171+
* @param imagePreprocessingOptions {@link ImagePreprocessingOptions}
169172
* @return {@link net.sourceforge.lept4j.Pix} object after thresholding
170173
*/
171-
static Pix otsuImageThresholding(final Pix pix) {
174+
static Pix otsuImageThresholding(final Pix pix,
175+
final ImagePreprocessingOptions imagePreprocessingOptions) {
172176
if (pix != null) {
173177
Pix thresholdPix = null;
174178
if (pix.d == 8) {
175179
PointerByReference pointer = new PointerByReference();
176180
Leptonica.INSTANCE
177-
.pixOtsuAdaptiveThreshold(pix, pix.w, pix.h,
178-
0, 0, 0,
179-
null, pointer);
181+
.pixOtsuAdaptiveThreshold(pix,
182+
getOtsuAdaptiveThresholdTileSize(pix.w,
183+
imagePreprocessingOptions.getTileWidth()),
184+
getOtsuAdaptiveThresholdTileSize(pix.h,
185+
imagePreprocessingOptions.getTileHeight()),
186+
getOtsuAdaptiveThresholdSmoothingTileSize(pix.w,
187+
imagePreprocessingOptions.isSmoothTiling()),
188+
getOtsuAdaptiveThresholdSmoothingTileSize(pix.h,
189+
imagePreprocessingOptions.isSmoothTiling()),
190+
0,null, pointer);
180191
thresholdPix = new Pix(pointer.getValue());
181192
if (thresholdPix.w > 0 && thresholdPix.h > 0) {
182193
// destroying original pix
@@ -201,6 +212,36 @@ static Pix otsuImageThresholding(final Pix pix) {
201212
}
202213
}
203214

215+
/**
216+
* Gets adaptive threshold tile size.
217+
*/
218+
static int getOtsuAdaptiveThresholdTileSize(int imageSize, int tileSize) {
219+
if (tileSize == 0) {
220+
return imageSize;
221+
} else {
222+
return tileSize;
223+
}
224+
}
225+
226+
/**
227+
* Gets adaptive threshold smoothing tile size.
228+
* Can be either equal to page size or 0.
229+
*/
230+
static int getOtsuAdaptiveThresholdSmoothingTileSize(int imageSize, boolean smoothTiling) {
231+
if (smoothTiling) {
232+
return imageSize;
233+
} else {
234+
return 0;
235+
}
236+
}
237+
238+
/**
239+
* Gets an integer pixel in the default RGB color model.
240+
*/
241+
static int getImagePixelColor(BufferedImage image, int x, int y) {
242+
return image.getRGB(x, y);
243+
}
244+
204245
/**
205246
* Destroys {@link net.sourceforge.lept4j.Pix} object.
206247
*
@@ -403,15 +444,15 @@ static void saveImageToTempPngFile(final String tmpFileName,
403444
/**
404445
* Saves passed {@link net.sourceforge.lept4j.Pix} to given path
405446
*
406-
* @param tmpFileName provided file path to save the
447+
* @param filename provided file path to save the
407448
* {@link net.sourceforge.lept4j.Pix}
408449
* @param pix provided {@link net.sourceforge.lept4j.Pix} to be saved
409450
*/
410-
static void savePixToTempPngFile(final String tmpFileName,
411-
final Pix pix) {
451+
static void savePixToPngFile(final String filename,
452+
final Pix pix) {
412453
if (pix != null) {
413454
try {
414-
Leptonica.INSTANCE.pixWritePng(tmpFileName, pix,
455+
Leptonica.INSTANCE.pixWritePng(filename, pix,
415456
ILeptonica.IFF_PNG);
416457
} catch (Exception e) { // NOSONAR
417458
LOGGER.info(MessageFormatUtil.format(

0 commit comments

Comments
 (0)