itext
diff --git a/‎pdfocr-api/src/main/java/com/itextpdf/pdfocr/TextInfo.java‎
Lines changed: 11 additions & 0 deletions b/‎pdfocr-api/src/main/java/com/itextpdf/pdfocr/TextInfo.java‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java‎
Lines changed: 35 additions & 5 deletions b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java‎
Lines changed: 35 additions & 5 deletions
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java‎
Lines changed: 37 additions & 6 deletions b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java‎
Lines changed: 37 additions & 6 deletions
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java‎
Lines changed: 12 additions & 2 deletions b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java‎
Lines changed: 66 additions & 1 deletion b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java‎
Lines changed: 66 additions & 1 deletion
@@ -60,6 +60,17 @@ public class TextInfo {
     public TextInfo() {
     }
 
+    /**
+     * Creates a new {@link TextInfo} instance as a copy of an existing one; unset (null) fields stay null.
+     *
+     * @param textInfo instance to copy; its bbox list is wrapped as a read-only view, not duplicated
+     */
+    public TextInfo(final TextInfo textInfo) {
+        this.text = textInfo.text;
+        this.bboxRect = textInfo.bboxRect == null ? null : new Rectangle(textInfo.bboxRect);
+        this.bbox = textInfo.bbox == null ? null : Collections.<Float>unmodifiableList(textInfo.bbox);
+    }
+
     /**
      * Creates a new {@link TextInfo} instance.
      *
 
@@ -294,9 +294,32 @@ public IThreadLocalMetaInfoAware setThreadLocalMetaInfo(IMetaInfo metaInfo) {
      * @param outputFormat selected {@link OutputFormat} for tesseract
      * @param pageNumber number of page to be processed
      */
-    abstract void doTesseractOcr(File inputImage,
+    void doTesseractOcr(File inputImage,
             List<File> outputFiles, OutputFormat outputFormat,
-            int pageNumber);
+            int pageNumber) {
+        doTesseractOcr(inputImage, outputFiles, outputFormat, pageNumber, true); // dispatchEvent = true
+    }
+
+    /**
+     * Performs tesseract OCR using command line tool
+     * or a wrapper for Tesseract OCR API.
+     * <p>
+     * Please note that a list of output files is accepted instead of a single file because
+     * the page number parameter is not respected in case of TIFF images not requiring preprocessing.
+     * In other words, if the passed image is a TIFF image and according to the {@link Tesseract4OcrEngineProperties}
+     * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
+     * is expected to be the same as the number of pages in the image; otherwise, only one file is expected.
+     *
+     * @param inputImage input image {@link java.io.File}
+     * @param outputFiles {@link java.util.List} of output files
+     *                                          (one per each page)
+     * @param outputFormat selected {@link OutputFormat} for tesseract
+     * @param pageNumber number of page to be processed
+     * @param dispatchEvent indicates if {@link PdfOcrTesseract4Event} needs to be dispatched
+     */
+    abstract void doTesseractOcr(File inputImage,
+                        List<File> outputFiles, OutputFormat outputFormat,
+                        int pageNumber, boolean dispatchEvent);
 
     /**
      * Gets path to provided tess data directory.
@@ -374,10 +397,17 @@ private ITesseractOcrResult processInputFiles(
 
                 doTesseractOcr(input, tempFiles, outputFormat, page);
                 if (outputFormat.equals(OutputFormat.HOCR)) {
+                    List<File> tempTxtFiles = null;
+                    if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
+                        tempTxtFiles = new ArrayList<>();
+                        for (int i = 0; i < numOfFiles; i++) {
+                            tempTxtFiles.add(createTempFile(".txt"));
+                        }
+                        doTesseractOcr(input, tempTxtFiles, OutputFormat.TXT, page, false);
+                    }
                     Map<Integer, List<TextInfo>> pageData = TesseractHelper
-                            .parseHocrFile(tempFiles,
-                                    getTesseract4OcrEngineProperties()
-                                            .getTextPositioning());
+                            .parseHocrFile(tempFiles, tempTxtFiles,
+                                    getTesseract4OcrEngineProperties());
 
                     if (getTesseract4OcrEngineProperties()
                             .isPreprocessingImages()) {
 
@@ -33,6 +33,8 @@ This file is part of the iText (R) project.
 import java.util.Collections;
 import java.util.List;
 import java.util.UUID;
+
+import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event;
 import net.sourceforge.lept4j.Pix;
 import org.slf4j.LoggerFactory;
 
@@ -109,10 +111,11 @@ public final void setPathToExecutable(final String path) {
      *                                          (one per each page)
      * @param outputFormat selected {@link OutputFormat} for tesseract
      * @param pageNumber number of page to be processed
+     * @param dispatchEvent indicates if {@link PdfOcrTesseract4Event} needs to be dispatched
      */
     void doTesseractOcr(final File inputImage,
             final List<File> outputFiles, final OutputFormat outputFormat,
-            final int pageNumber) {
+            final int pageNumber, final boolean dispatchEvent) {
         scheduledCheck();
         List<String> params = new ArrayList<String>();
         String execPath = null;
@@ -162,13 +165,17 @@ void doTesseractOcr(final File inputImage,
             addUserWords(params, imagePath);
             // required languages
             addLanguages(params);
-            if (outputFormat.equals(OutputFormat.HOCR)) {
-                // path to hocr script
-                setHocrOutput(params);
-            }
+
+            addOutputFormat(params, outputFormat);
+
+            addPreserveInterwordSpaces(params);
+
             // set default user defined dpi
             addDefaultDpi(params);
-            onEvent();
+
+            if (dispatchEvent) {
+                onEvent();
+            }
 
             // run tesseract process
             TesseractHelper.runCommand(execPath, params, workingDirectory);
@@ -217,6 +224,30 @@ private void setHocrOutput(final List<String> command) {
         command.add("tessedit_create_hocr=1");
     }
 
+    /**
+     * Appends {@code -c preserve_interword_spaces=1} when TXT-assisted HOCR parsing is enabled.
+     *
+     * @param command result command as list of strings; left untouched when the option is disabled
+     */
+    private void addPreserveInterwordSpaces(final List<String> command) {
+        final boolean txtAssisted = getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing();
+        if (txtAssisted) {
+            Collections.addAll(command, "-c", "preserve_interword_spaces=1");
+        }
+    }
+
+    /**
+     * Adds the output format option; only HOCR needs one, any other format adds nothing.
+     *
+     * @param command result command as list of strings
+     * @param outputFormat output format
+     */
+    private void addOutputFormat(final List<String> command, final OutputFormat outputFormat) {
+        if (outputFormat == OutputFormat.HOCR) {
+            setHocrOutput(command);
+        }
+    }
+
     /**
      * Add path to user-words file for tesseract executable.
      *
 
@@ -35,6 +35,8 @@ This file is part of the iText (R) project.
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+
+import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event;
 import net.sourceforge.tess4j.ITesseract;
 import net.sourceforge.tess4j.TesseractException;
 import org.slf4j.LoggerFactory;
@@ -103,6 +105,11 @@ public void initializeTesseract(final OutputFormat outputFormat) {
         getTesseractInstance()
                 .setTessVariable("tessedit_create_hocr",
                         outputFormat.equals(OutputFormat.HOCR) ? "1" : "0");
+
+        if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
+            getTesseractInstance().setTessVariable("preserve_interword_spaces", "1");
+        }
+
         getTesseractInstance().setTessVariable("user_defined_dpi", "300");
         if (getTesseract4OcrEngineProperties()
                 .getPathToUserWordsFile() != null) {
@@ -141,18 +148,21 @@ public void initializeTesseract(final OutputFormat outputFormat) {
      *                                          (one per each page)
      * @param outputFormat selected {@link OutputFormat} for tesseract
      * @param pageNumber number of page to be processed
+     * @param dispatchEvent indicates if {@link PdfOcrTesseract4Event} needs to be dispatched
      */
     void doTesseractOcr(final File inputImage,
             final List<File> outputFiles, final OutputFormat outputFormat,
-            final int pageNumber) {
+            final int pageNumber, final boolean dispatchEvent) {
         scheduledCheck();
         try {
             // check tess data path for non ASCII characters
             validateTessDataPath(getTessData());
             validateLanguages(getTesseract4OcrEngineProperties()
                     .getLanguages());
             initializeTesseract(outputFormat);
-            onEvent();
+            if (dispatchEvent) {
+                onEvent();
+            }
             // if preprocessing is not needed and provided image is tiff,
             // the image will be paginated and separate pages will be OCRed
             List<String> resultList = new ArrayList<String>();
 
@@ -84,11 +84,27 @@ public class Tesseract4OcrEngineProperties extends OcrEngineProperties {
      */
     private boolean isUserWordsFileTemporary = false;
 
+    /**
+     * Indicates whether an additional TXT recognition result is used to make HOCR parsing more precise.
+     * This is needed for cases of Thai language or some Chinese dialects
+     * where every character is interpreted as a single word.
+     * For more information see https://github.com/tesseract-ocr/tesseract/issues/2702
+     */
+    private boolean useTxtToImproveHocrParsing;
+
     /**
      * Settings for image preprocessing.
      */
     private ImagePreprocessingOptions imagePreprocessingOptions = new ImagePreprocessingOptions();
 
+    /**
+     * Minimal confidence level for an HOCR line to be considered properly recognized.
+     * If the actual confidence level is lower, the line is ignored.
+     * The default value is 0, which means that everything is considered properly recognized.
+     * The value may vary in the range 0-100.
+     */
+    private int minimalConfidenceLevel;
+
     /**
      * Creates a new {@link Tesseract4OcrEngineProperties} instance.
      */
@@ -108,7 +124,9 @@ public Tesseract4OcrEngineProperties(Tesseract4OcrEngineProperties other) {
         this.pageSegMode = other.pageSegMode;
         this.textPositioning = other.textPositioning;
         this.pathToUserWordsFile = other.pathToUserWordsFile;
+        this.useTxtToImproveHocrParsing = other.useTxtToImproveHocrParsing;
         this.imagePreprocessingOptions = other.imagePreprocessingOptions;
+        this.minimalConfidenceLevel = other.minimalConfidenceLevel;
     }
 
     /**
@@ -392,6 +410,33 @@ final boolean isUserWordsFileTemporary() {
         return isUserWordsFileTemporary;
     }
 
+    /**
+     * Gets {@link #useTxtToImproveHocrParsing}.
+     * The flag is used to make the HOCR recognition result more precise.
+     * This is needed for cases of Thai language or some Chinese dialects
+     * where every character is interpreted as a single word.
+     * For more information see https://github.com/tesseract-ocr/tesseract/issues/2702
+     *
+     * @return {@code true} if a TXT result is used to improve HOCR parsing, {@code false} otherwise
+     */
+    final public boolean isUseTxtToImproveHocrParsing() {
+        return useTxtToImproveHocrParsing;
+    }
+
+    /**
+     * Sets {@link #useTxtToImproveHocrParsing}.
+     * When enabled, an extra TXT recognition pass is used to make the HOCR result more precise,
+     * e.g. for Thai or some Chinese dialects where every character is interpreted as a single word.
+     * For more information see https://github.com/tesseract-ocr/tesseract/issues/2702
+     *
+     * @param useTxtToImproveHocrParsing {@code true} to enable the improvement
+     * @return this {@link Tesseract4OcrEngineProperties} instance
+     */
+    final public Tesseract4OcrEngineProperties setUseTxtToImproveHocrParsing(boolean useTxtToImproveHocrParsing) {
+        this.useTxtToImproveHocrParsing = useTxtToImproveHocrParsing;
+        return this;
+    }
+
     /**
      * Gets {@link #imagePreprocessingOptions}.
      * @return {@link ImagePreprocessingOptions}
@@ -411,6 +456,26 @@ final public Tesseract4OcrEngineProperties setImagePreprocessingOptions(
         return this;
     }
 
-}
+    /**
+     * Gets minimal confidence level for an HOCR line to be considered properly recognized.
+     * If the actual confidence level is lower, the line is ignored. The default value is 0, which means
+     * that everything is considered properly recognized. The value may vary in the range 0-100.
+     * @return minimal confidence level
+     */
+    final public int getMinimalConfidenceLevel() {
+        return minimalConfidenceLevel;
+    }
+
+    /**
+     * Sets minimal confidence level (0-100, default 0 accepts everything) for an HOCR line to be considered
+     * properly recognized; lines with a lower actual confidence level are ignored.
+     * @param minimalConfidenceLevel minimal confidence level; stored as is, without range validation
+     * @return this {@link Tesseract4OcrEngineProperties} instance
+     */
+    final public Tesseract4OcrEngineProperties setMinimalConfidenceLevel(int minimalConfidenceLevel) {
+        this.minimalConfidenceLevel = minimalConfidenceLevel;
+        return this;
+    }
 
 
+}