itext
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java‎
Lines changed: 4 additions & 1 deletion b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java‎
Lines changed: 121 additions & 29 deletions b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java‎
Lines changed: 121 additions & 29 deletions
diff --git a/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java‎
Lines changed: 6 additions & 2 deletions b/‎pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java‎
Lines changed: 15 additions & 12 deletions b/‎pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java‎
Lines changed: 15 additions & 12 deletions
@@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant {
             + "temporary directory: {0}";
     public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
             "Cannot convert image to pix: {0}";
+    public static final String CANNOT_PARSE_NODE_BBOX =
+            "Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}";
+
 
     private Tesseract4LogMessageConstant() {
     }
-}
+}
@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
 import com.itextpdf.styledxmlparser.jsoup.Jsoup;
 import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
 import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
+import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
 import com.itextpdf.styledxmlparser.jsoup.select.Elements;
 
 import java.io.File;
@@ -60,6 +61,27 @@ public class TesseractHelper {
     private static final Logger LOGGER = LoggerFactory
             .getLogger(TesseractHelper.class);
 
+    /**
+     * Patterns for matching hOCR element bboxes.
+     *
+     * <p>
+     * BBOX_PATTERN accepts a string that contains the word "bbox" followed by
+     * four whitespace-separated groups of digits, with arbitrary text before and after.
+     * BBOX_COORDINATE_PATTERN captures four consecutive whitespace-separated groups
+     * of digits as groups 1 to 4; it does not itself require the word "bbox", so it is
+     * meant to be applied to text that BBOX_PATTERN has already accepted.
+     */
+    private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
+    private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
+            .compile(
+                    ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
+
+    /**
+     * Indices in array representing bbox.
+     *
+     * <p>
+     * The values follow the order in which the four numbers are captured from the bbox text:
+     * the first captured number is stored at LEFT_IDX, the second at BOTTOM_IDX,
+     * the third at RIGHT_IDX and the fourth at TOP_IDX.
+     */
+    private static final int LEFT_IDX = 0;
+    private static final int BOTTOM_IDX = 1;
+    private static final int RIGHT_IDX = 2;
+    private static final int TOP_IDX = 3;
+
+    /**
+     * Size of the array containing bbox.
+     * Also the number of coordinate groups read from BBOX_COORDINATE_PATTERN.
+     */
+    private static final int BBOX_ARRAY_SIZE = 4;
+
     /**
      * Creates a new {@link TesseractHelper} instance.
      */
@@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
             throws IOException {
         Map<Integer, List<TextInfo>> imageData =
                 new LinkedHashMap<Integer, List<TextInfo>>();
+        Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();
 
         for (File inputFile : inputFiles) {
             if (inputFile != null
                     && Files.exists(
-                            java.nio.file.Paths
-                                    .get(inputFile.getAbsolutePath()))) {
+                    java.nio.file.Paths
+                            .get(inputFile.getAbsolutePath()))) {
                 FileInputStream fileInputStream =
                         new FileInputStream(inputFile.getAbsolutePath());
                 Document doc = Jsoup.parse(fileInputStream,
                         java.nio.charset.StandardCharsets.UTF_8.name(),
                         inputFile.getAbsolutePath());
                 Elements pages = doc.getElementsByClass("ocr_page");
 
-                Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
-                Pattern bboxCoordinatePattern = Pattern
-                        .compile(
-                                ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
                 List<String> searchedClasses = TextPositioning.BY_LINES
                         .equals(textPositioning)
                         ? Arrays.<String>asList("ocr_line", "ocr_caption")
@@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
                             }
                         }
                         for (Element obj : objects) {
-                            String value = obj.attr("title");
-                            Matcher bboxMatcher = bboxPattern.matcher(value);
-                            if (bboxMatcher.matches()) {
-                                Matcher bboxCoordinateMatcher =
-                                        bboxCoordinatePattern
-                                                .matcher(bboxMatcher.group());
-                                if (bboxCoordinateMatcher.matches()) {
-                                    List<Float> coordinates =
-                                            new ArrayList<Float>();
-                                    for (int i = 0; i < 4; i++) {
-                                        String coord = bboxCoordinateMatcher
-                                                .group(i + 1);
-                                        coordinates
-                                                .add(Float.parseFloat(coord));
-                                    }
-
-                                    textData.add(new TextInfo(obj.text(),
-                                            coordinates));
-                                }
-                            }
+                            List<Float> coordinates = getAlignedBBox(obj,
+                                    textPositioning,
+                                    unparsedBBoxes);
+                            textData.add(new TextInfo(obj.text(),
+                                    coordinates));
                         }
                     }
                     if (textData.size() > 0) {
@@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
                 fileInputStream.close();
             }
         }
+        for (Node node : unparsedBBoxes.values()) {
+            LOGGER.warn(MessageFormatUtil.format(
+                    Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX,
+                    node.toString()
+            ));
+        }
         return imageData;
     }
 
+    /**
+     * Get and align (if needed) bbox of the element.
+     *
+     * <p>
+     * The bbox of the element itself is always parsed first. For
+     * {@link TextPositioning#BY_WORDS} and {@link TextPositioning#BY_WORDS_AND_LINES}
+     * the bbox of the element's parent node is parsed as well and is used to detect and
+     * fix left/right coordinates that fall outside of the parent bbox. For
+     * {@link TextPositioning#BY_WORDS_AND_LINES} the bottom and top coordinates of the
+     * element are additionally replaced with those of the parent node before that fix
+     * is applied. For any other mode the parsed bbox is returned unchanged.
+     *
+     * @param object element whose bbox is requested
+     * @param textPositioning positioning mode deciding whether the parent bbox is taken
+     *                        into account
+     * @param unparsedBBoxes map collecting nodes whose bbox could not be parsed,
+     *                       keyed by the value of the node "id" attribute
+     * @return list of four coordinates addressed by LEFT_IDX, BOTTOM_IDX,
+     *         RIGHT_IDX and TOP_IDX
+     */
+    static List<Float> getAlignedBBox(Element object,
+                                      TextPositioning textPositioning,
+                                      Map<String, Node> unparsedBBoxes) {
+        final List<Float> coordinates = parseBBox(object, unparsedBBoxes);
+        if (TextPositioning.BY_WORDS_AND_LINES == textPositioning
+                || TextPositioning.BY_WORDS == textPositioning) {
+            Node line = object.parent();
+            final List<Float> lineCoordinates = parseBBox(line, unparsedBBoxes);
+            if (TextPositioning.BY_WORDS_AND_LINES == textPositioning) {
+                coordinates.set(BOTTOM_IDX, lineCoordinates.get(BOTTOM_IDX));
+                coordinates.set(TOP_IDX, lineCoordinates.get(TOP_IDX));
+            }
+            detectAndFixBrokenBBoxes(object, coordinates,
+                    lineCoordinates, unparsedBBoxes);
+        }
+        return coordinates;
+    }
+
+    /**
+     * Parses element bbox from the "title" attribute of the node.
+     *
+     * <p>
+     * The attribute value has to match BBOX_PATTERN and then BBOX_COORDINATE_PATTERN;
+     * the four captured groups are converted to floats in capture order. If nothing could
+     * be parsed, a bbox of four zeros is returned instead and the node is remembered in
+     * {@code unparsedBBoxes} under the value of its "id" attribute, unless an entry with
+     * that key already exists.
+     *
+     * <p>
+     * NOTE(review): if {@code attr("id")} yields an empty string rather than null for a
+     * missing attribute, all nodes without an id would share a single map entry and only
+     * the first of them would be recorded - confirm against the Node#attr contract.
+     *
+     * @param node element containing bbox
+     * @param unparsedBBoxes map from element id to element, collecting the elements
+     *                       whose bboxes could not be parsed
+     * @return parsed bbox as a list of four values, or four zeros if parsing failed
+     */
+    static List<Float> parseBBox(Node node, Map<String, Node> unparsedBBoxes) {
+        List<Float> bbox = new ArrayList<>();
+        Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr("title"));
+        if (bboxMatcher.matches()) {
+            Matcher bboxCoordinateMatcher =
+                    BBOX_COORDINATE_PATTERN
+                            .matcher(bboxMatcher.group());
+            if (bboxCoordinateMatcher.matches()) {
+                for (int i = 0; i < BBOX_ARRAY_SIZE; i++) {
+                    String coord = bboxCoordinateMatcher
+                            .group(i + 1);
+                    bbox.add(Float.parseFloat(coord));
+                }
+            }
+        }
+        if (bbox.size() == 0) {
+            bbox = Arrays.asList(0f, 0f, 0f, 0f);
+            String id = node.attr("id");
+            if (id != null && !unparsedBBoxes.containsKey(id)) {
+                unparsedBBoxes.put(id, node);
+            }
+        }
+        return bbox;
+    }
+
+    /**
+     * Sometimes hOCR file contains broken character bboxes which are equal to page bbox.
+     * This method attempts to detect and fix them.
+     *
+     * <p>
+     * A left coordinate lying outside of the [left, right] range of the line is replaced
+     * with the right coordinate of the previous sibling element, or with the left
+     * coordinate of the line if there is no previous sibling. A right coordinate lying
+     * outside of that range is replaced with the left coordinate of the next sibling
+     * element, or with the right coordinate of the line if there is no next sibling.
+     * The {@code coordinates} list is modified in place; nothing is returned.
+     *
+     * @param object element whose bbox is being checked
+     * @param coordinates bbox of the element, updated in place when a fix is applied
+     * @param lineCoordinates bbox of the line the element is compared against
+     * @param unparsedBBoxes map collecting nodes whose bbox could not be parsed; passed
+     *                       on when sibling bboxes are parsed
+     */
+    static void detectAndFixBrokenBBoxes(Element object, List<Float> coordinates,
+                                         List<Float> lineCoordinates,
+                                         Map<String, Node> unparsedBBoxes) {
+        if (coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX)
+                || coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX)) {
+            if (object.previousElementSibling() == null) {
+                coordinates.set(LEFT_IDX, lineCoordinates.get(LEFT_IDX));
+            } else {
+                Element sibling = object.previousElementSibling();
+                List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
+                coordinates.set(LEFT_IDX, siblingBBox.get(RIGHT_IDX));
+            }
+        }
+        if (coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX)
+                || coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX)) {
+            if (object.nextElementSibling() == null) {
+                coordinates.set(RIGHT_IDX, lineCoordinates.get(RIGHT_IDX));
+            } else {
+                Element sibling = object.nextElementSibling();
+                List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
+                coordinates.set(RIGHT_IDX, siblingBBox.get(LEFT_IDX));
+            }
+        }
+    }
+
     /**
      * Deletes file using provided path.
      *
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
      * @param data text data in required format as {@link java.lang.String}
      */
     static void writeToTextFile(final String path,
-            final String data) {
+                                final String data) {
         try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
                 StandardCharsets.UTF_8)) {
             writer.write(data);
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
      * @throws Tesseract4OcrException if provided command failed
      */
     static void runCommand(final String execPath,
-            final List<String> paramsList) throws Tesseract4OcrException {
+                           final List<String> paramsList) throws Tesseract4OcrException {
         try {
             String params = String.join(" ", paramsList);
             boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
                             .TESSERACT_FAILED);
         }
     }
-}
+}
@@ -39,5 +39,9 @@ public enum TextPositioning {
     /**
      * Text will be located by words retrieved from hocr file.
      */
-    BY_WORDS
-}
+    BY_WORDS,
+    /**
+     * Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line.
+     */
+    BY_WORDS_AND_LINES,
+}
@@ -87,6 +87,8 @@ public class IntegrationTestHelper extends ExtendedITextTest {
 
     // path to font for hindi
     protected static final String NOTO_SANS_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSans-Regular.ttf";
+    // path to font for thai
+    protected static final String NOTO_SANS_THAI_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSansThai-Regular.ttf";
     // path to font for japanese
     protected static final String KOSUGI_FONT_PATH = TEST_FONTS_DIRECTORY + "Kosugi-Regular.ttf";
     // path to font for chinese
@@ -101,13 +103,14 @@ public class IntegrationTestHelper extends ExtendedITextTest {
     static {
         Map<String, String> fontPathToNameMap = new HashMap<>();
         fontPathToNameMap.put(NOTO_SANS_FONT_PATH, "NotoSans");
+        fontPathToNameMap.put(NOTO_SANS_THAI_FONT_PATH, "NotoSansThai");
         fontPathToNameMap.put(KOSUGI_FONT_PATH, "Kosugi");
         fontPathToNameMap.put(NOTO_SANS_SC_FONT_PATH, "NotoSansSC");
         fontPathToNameMap.put(CAIRO_FONT_PATH, "Cairo");
         fontPathToNameMap.put(FREE_SANS_FONT_PATH, "FreeSans");
         FONT_PATH_TO_FONT_NAME_MAP = Collections.unmodifiableMap(fontPathToNameMap);
     }
-    
+
     public enum ReaderType {
         LIB,
         EXECUTABLE
@@ -164,7 +167,7 @@ protected static File getTessDataDirectory() {
      * Retrieve text from specified page from given PDF document.
      */
     protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
-            File file, int page, List<String> languages, List<String> fonts) {
+                                    File file, int page, List<String> languages, List<String> fonts) {
         String result = null;
         String pdfPath = null;
         try {
@@ -183,7 +186,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
      * Retrieve text from specified page from given PDF document.
      */
     protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
-            File file, int page, List<String> languages, String fontPath) {
+                                    File file, int page, List<String> languages, String fontPath) {
         return getTextFromPdf(tesseractReader, file, page, languages,
                 Collections.<String>singletonList(fontPath));
     }
@@ -192,15 +195,15 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
      * Retrieve text from the first page of given PDF document setting font.
      */
     protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file,
-            List<String> languages, String fontPath) {
+                                    List<String> languages, String fontPath) {
         return getTextFromPdf(tesseractReader, file, 1, languages, fontPath);
     }
 
     /**
      * Retrieve text from the first page of given PDF document.
      */
     protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file,
-            List<String> languages) {
+                                    List<String> languages) {
         return getTextFromPdf(tesseractReader, file, 1, languages,
                 new ArrayList<String>());
     }
@@ -209,7 +212,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
      * Retrieve text from the required page of given PDF document.
      */
     protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, int page,
-            List<String> languages) {
+                                    List<String> languages) {
         return getTextFromPdf(tesseractReader, file, page, languages, new ArrayList<String>());
     }
 
@@ -224,7 +227,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
      * Get text from layer specified by name from page.
      */
     protected String getTextFromPdfLayer(String pdfPath, String layerName,
-            int page, boolean useActualText) throws IOException {
+                                         int page, boolean useActualText) throws IOException {
         PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath),
                 new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo()));
 
@@ -243,7 +246,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName,
      * Get text from layer specified by name from page.
      */
     protected String getTextFromPdfLayer(String pdfPath, String layerName,
-            int page) throws IOException {
+                                         int page) throws IOException {
         return getTextFromPdfLayer(pdfPath, layerName, page, false);
     }
 
@@ -253,7 +256,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName,
      * {@link LocationTextExtractionStrategy#getResultantText()}.
      */
     protected String getTextFromPdfLayerUsingActualText(String pdfPath,
-            String layerName, int page) throws IOException {
+                                                        String layerName, int page) throws IOException {
         return getTextFromPdfLayer(pdfPath, layerName, page, true)
                 .replace(" ", "");
     }
@@ -378,7 +381,7 @@ protected void doOcrAndSavePdfToPath(
      * (Text will be invisible)
      */
     protected void doOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath,
-            String pdfPath, List<String> languages, List<String> fonts) {
+                                         String pdfPath, List<String> languages, List<String> fonts) {
         doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath,
                 languages, fonts, null);
     }
@@ -469,7 +472,7 @@ public void setImageBBoxRectangle(com.itextpdf.kernel.geom.Rectangle imageBBoxRe
 
         @Override
         protected boolean isChunkAtWordBoundary(TextChunk chunk,
-                TextChunk previousChunk) {
+                                                TextChunk previousChunk) {
             ITextChunkLocation curLoc = chunk.getLocation();
             ITextChunkLocation prevLoc = previousChunk.getLocation();
 
@@ -522,4 +525,4 @@ else if (type.equals(EventType.RENDER_IMAGE)) {
                     : tagHierarchy.get(0).getProperties().get(PdfName.Name).toString();
         }
     }
-}
+}