Skip to content

Commit b5ec35b

Browse files
committed
Combine HOCR and TXT outputs for more precise text recognition
PDFOC-103
1 parent fe61188 commit b5ec35b

16 files changed

Lines changed: 815 additions & 181 deletions

File tree

pdfocr-api/src/main/java/com/itextpdf/pdfocr/TextInfo.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,17 @@ public class TextInfo {
6060
public TextInfo() {
6161
}
6262

63+
/**
64+
* Creates a new {@link TextInfo} instance as a copy of an existing one.
65+
*
66+
* @param textInfo the {@link TextInfo} instance to copy
67+
*/
68+
public TextInfo(final TextInfo textInfo) {
69+
this.text = textInfo.text;
70+
this.bboxRect = new Rectangle(textInfo.bboxRect);
71+
this.bbox = Collections.<Float>unmodifiableList(textInfo.bbox);
72+
}
73+
6374
/**
6475
* Creates a new {@link TextInfo} instance.
6576
*

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,9 +294,32 @@ public IThreadLocalMetaInfoAware setThreadLocalMetaInfo(IMetaInfo metaInfo) {
294294
* @param outputFormat selected {@link OutputFormat} for tesseract
295295
* @param pageNumber number of page to be processed
296296
*/
297-
abstract void doTesseractOcr(File inputImage,
297+
void doTesseractOcr(File inputImage,
298298
List<File> outputFiles, OutputFormat outputFormat,
299-
int pageNumber);
299+
int pageNumber) {
300+
doTesseractOcr(inputImage, outputFiles, outputFormat, pageNumber, true);
301+
}
302+
303+
/**
304+
* Performs tesseract OCR using command line tool
305+
* or a wrapper for Tesseract OCR API.
306+
*
307+
* Please note that a list of output files is accepted instead of a single file because
308+
* the page number parameter is not respected for TIFF images that do not require preprocessing.
309+
* In other words, if the passed image is a TIFF image and, according to the {@link Tesseract4OcrEngineProperties},
310+
* no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
311+
* is expected to be the same as the number of pages in the image; otherwise, only one file is expected.
312+
*
313+
* @param inputImage input image {@link java.io.File}
314+
* @param outputFiles {@link java.util.List} of output files
315+
* (one per each page)
316+
* @param outputFormat selected {@link OutputFormat} for tesseract
317+
* @param pageNumber number of page to be processed
318+
* @param dispatchEvent indicates if {@link PdfOcrTesseract4Event} needs to be dispatched
319+
*/
320+
abstract void doTesseractOcr(File inputImage,
321+
List<File> outputFiles, OutputFormat outputFormat,
322+
int pageNumber, boolean dispatchEvent);
300323

301324
/**
302325
* Gets path to provided tess data directory.
@@ -374,10 +397,17 @@ private ITesseractOcrResult processInputFiles(
374397

375398
doTesseractOcr(input, tempFiles, outputFormat, page);
376399
if (outputFormat.equals(OutputFormat.HOCR)) {
400+
List<File> tempTxtFiles = null;
401+
if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
402+
tempTxtFiles = new ArrayList<>();
403+
for (int i = 0; i < numOfFiles; i++) {
404+
tempTxtFiles.add(createTempFile(".txt"));
405+
}
406+
doTesseractOcr(input, tempTxtFiles, OutputFormat.TXT, page, false);
407+
}
377408
Map<Integer, List<TextInfo>> pageData = TesseractHelper
378-
.parseHocrFile(tempFiles,
379-
getTesseract4OcrEngineProperties()
380-
.getTextPositioning());
409+
.parseHocrFile(tempFiles, tempTxtFiles,
410+
getTesseract4OcrEngineProperties());
381411

382412
if (getTesseract4OcrEngineProperties()
383413
.isPreprocessingImages()) {

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ This file is part of the iText (R) project.
3333
import java.util.Collections;
3434
import java.util.List;
3535
import java.util.UUID;
36+
37+
import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event;
3638
import net.sourceforge.lept4j.Pix;
3739
import org.slf4j.LoggerFactory;
3840

@@ -109,10 +111,11 @@ public final void setPathToExecutable(final String path) {
109111
* (one per each page)
110112
* @param outputFormat selected {@link OutputFormat} for tesseract
111113
* @param pageNumber number of page to be processed
114+
* @param dispatchEvent indicates if {@link PdfOcrTesseract4Event} needs to be dispatched
112115
*/
113116
void doTesseractOcr(final File inputImage,
114117
final List<File> outputFiles, final OutputFormat outputFormat,
115-
final int pageNumber) {
118+
final int pageNumber, final boolean dispatchEvent) {
116119
scheduledCheck();
117120
List<String> params = new ArrayList<String>();
118121
String execPath = null;
@@ -162,13 +165,17 @@ void doTesseractOcr(final File inputImage,
162165
addUserWords(params, imagePath);
163166
// required languages
164167
addLanguages(params);
165-
if (outputFormat.equals(OutputFormat.HOCR)) {
166-
// path to hocr script
167-
setHocrOutput(params);
168-
}
168+
169+
addOutputFormat(params, outputFormat);
170+
171+
addPreserveInterwordSpaces(params);
172+
169173
// set default user defined dpi
170174
addDefaultDpi(params);
171-
onEvent();
175+
176+
if (dispatchEvent) {
177+
onEvent();
178+
}
172179

173180
// run tesseract process
174181
TesseractHelper.runCommand(execPath, params, workingDirectory);
@@ -217,6 +224,30 @@ private void setHocrOutput(final List<String> command) {
217224
command.add("tessedit_create_hocr=1");
218225
}
219226

227+
/**
228+
* Sets preserve_interword_spaces option.
229+
*
230+
* @param command result command as list of strings
231+
*/
232+
private void addPreserveInterwordSpaces(final List<String> command) {
233+
if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
234+
command.add("-c");
235+
command.add("preserve_interword_spaces=1");
236+
}
237+
}
238+
239+
/**
240+
* Add output format.
241+
*
242+
* @param command result command as list of strings
243+
* @param outputFormat output format
244+
*/
245+
private void addOutputFormat(final List<String> command, OutputFormat outputFormat) {
246+
if (outputFormat == OutputFormat.HOCR) {
247+
setHocrOutput(command);
248+
}
249+
}
250+
220251
/**
221252
* Add path to user-words file for tesseract executable.
222253
*

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ This file is part of the iText (R) project.
3535
import java.util.List;
3636
import java.util.regex.Matcher;
3737
import java.util.regex.Pattern;
38+
39+
import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event;
3840
import net.sourceforge.tess4j.ITesseract;
3941
import net.sourceforge.tess4j.TesseractException;
4042
import org.slf4j.LoggerFactory;
@@ -103,6 +105,11 @@ public void initializeTesseract(final OutputFormat outputFormat) {
103105
getTesseractInstance()
104106
.setTessVariable("tessedit_create_hocr",
105107
outputFormat.equals(OutputFormat.HOCR) ? "1" : "0");
108+
109+
if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
110+
getTesseractInstance().setTessVariable("preserve_interword_spaces", "1");
111+
}
112+
106113
getTesseractInstance().setTessVariable("user_defined_dpi", "300");
107114
if (getTesseract4OcrEngineProperties()
108115
.getPathToUserWordsFile() != null) {
@@ -141,18 +148,21 @@ public void initializeTesseract(final OutputFormat outputFormat) {
141148
* (one per each page)
142149
* @param outputFormat selected {@link OutputFormat} for tesseract
143150
* @param pageNumber number of page to be processed
151+
* @param dispatchEvent indicates if {@link PdfOcrTesseract4Event} needs to be dispatched
144152
*/
145153
void doTesseractOcr(final File inputImage,
146154
final List<File> outputFiles, final OutputFormat outputFormat,
147-
final int pageNumber) {
155+
final int pageNumber, final boolean dispatchEvent) {
148156
scheduledCheck();
149157
try {
150158
// check tess data path for non ASCII characters
151159
validateTessDataPath(getTessData());
152160
validateLanguages(getTesseract4OcrEngineProperties()
153161
.getLanguages());
154162
initializeTesseract(outputFormat);
155-
onEvent();
163+
if (dispatchEvent) {
164+
onEvent();
165+
}
156166
// if preprocessing is not needed and provided image is tiff,
157167
// the image will be paginated and separate pages will be OCRed
158168
List<String> resultList = new ArrayList<String>();

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,27 @@ public class Tesseract4OcrEngineProperties extends OcrEngineProperties {
8484
*/
8585
private boolean isUserWordsFileTemporary = false;
8686

87+
/**
88+
* Used to make HOCR recognition result more precise.
89+
* This is needed for cases of Thai language or some Chinese dialects
90+
* where every character is interpreted as a single word.
91+
* For more information see https://github.com/tesseract-ocr/tesseract/issues/2702
92+
*/
93+
private boolean useTxtToImproveHocrParsing;
94+
8795
/**
8896
* Settings for image preprocessing.
8997
*/
9098
private ImagePreprocessingOptions imagePreprocessingOptions = new ImagePreprocessingOptions();
9199

100+
/**
101+
* Minimal confidence level for HOCR line to be considered as properly recognized.
102+
* If the actual confidence level is lower, the line is ignored.
103+
* The default value is 0, which means that everything is considered properly recognized.
104+
* The value may vary in the range 0-100.
105+
*/
106+
private int minimalConfidenceLevel;
107+
92108
/**
93109
* Creates a new {@link Tesseract4OcrEngineProperties} instance.
94110
*/
@@ -108,7 +124,9 @@ public Tesseract4OcrEngineProperties(Tesseract4OcrEngineProperties other) {
108124
this.pageSegMode = other.pageSegMode;
109125
this.textPositioning = other.textPositioning;
110126
this.pathToUserWordsFile = other.pathToUserWordsFile;
127+
this.useTxtToImproveHocrParsing = other.useTxtToImproveHocrParsing;
111128
this.imagePreprocessingOptions = other.imagePreprocessingOptions;
129+
this.minimalConfidenceLevel = other.minimalConfidenceLevel;
112130
}
113131

114132
/**
@@ -392,6 +410,33 @@ final boolean isUserWordsFileTemporary() {
392410
return isUserWordsFileTemporary;
393411
}
394412

413+
/**
414+
* Gets {@link #useTxtToImproveHocrParsing}.
415+
* Used to make HOCR recognition result more precise.
416+
* This is needed for cases of Thai language or some Chinese dialects
417+
* where every character is interpreted as a single word.
418+
* For more information see https://github.com/tesseract-ocr/tesseract/issues/2702
419+
*
420+
* @return {@link #useTxtToImproveHocrParsing}
421+
*/
422+
final public boolean isUseTxtToImproveHocrParsing() {
423+
return useTxtToImproveHocrParsing;
424+
}
425+
426+
/**
427+
* Sets {@link #useTxtToImproveHocrParsing}.
428+
* Used to make HOCR recognition result more precise.
429+
* This is needed for cases of Thai language or some Chinese dialects
430+
* where every character is interpreted as a single word.
431+
* For more information see https://github.com/tesseract-ocr/tesseract/issues/2702
432+
*
433+
* @param useTxtToImproveHocrParsing {@link #useTxtToImproveHocrParsing}
434+
*/
435+
final public Tesseract4OcrEngineProperties setUseTxtToImproveHocrParsing(boolean useTxtToImproveHocrParsing) {
436+
this.useTxtToImproveHocrParsing = useTxtToImproveHocrParsing;
437+
return this;
438+
}
439+
395440
/**
396441
* Gets {@link #imagePreprocessingOptions}.
397442
* @return {@link ImagePreprocessingOptions}
@@ -411,6 +456,26 @@ final public Tesseract4OcrEngineProperties setImagePreprocessingOptions(
411456
return this;
412457
}
413458

414-
}
459+
/**
460+
* Gets minimal confidence level for HOCR line to be considered as properly recognized.
461+
* If the actual confidence level is lower, the line is ignored.
462+
* The default value is 0, which means that everything is considered properly recognized.
463+
* The value may vary in the range 0-100.
464+
*/
465+
final public int getMinimalConfidenceLevel() {
466+
return minimalConfidenceLevel;
467+
}
468+
469+
/**
470+
* Sets minimal confidence level for HOCR line to be considered as properly recognized.
471+
* If the actual confidence level is lower, the line is ignored.
472+
* The default value is 0, which means that everything is considered properly recognized.
473+
* The value may vary in the range 0-100.
474+
*/
475+
final public Tesseract4OcrEngineProperties setMinimalConfidenceLevel(int minimalConfidenceLevel) {
476+
this.minimalConfidenceLevel = minimalConfidenceLevel;
477+
return this;
478+
}
415479

416480

481+
}

0 commit comments

Comments
 (0)