Skip to content

Commit 57c6536

Browse files
committed
Improvements in word bbox calculation
PDFOC-96
1 parent e632d89 commit 57c6536

16 files changed

Lines changed: 320 additions & 61 deletions

File tree

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant {
5959
+ "temporary directory: {0}";
6060
public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
6161
"Cannot convert image to pix: {0}";
62+
public static final String CANNOT_PARSE_NODE_BBOX =
63+
"Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}";
64+
6265

6366
private Tesseract4LogMessageConstant() {
6467
}
65-
}
68+
}

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java

Lines changed: 121 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
2828
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
2929
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
3030
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
31+
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
3132
import com.itextpdf.styledxmlparser.jsoup.select.Elements;
3233

3334
import java.io.File;
@@ -60,6 +61,27 @@ public class TesseractHelper {
6061
private static final Logger LOGGER = LoggerFactory
6162
.getLogger(TesseractHelper.class);
6263

64+
/**
65+
* Patterns for matching hOCR element bboxes.
66+
*/
67+
private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
68+
private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
69+
.compile(
70+
".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
71+
72+
/**
73+
* Indices in array representing bbox.
74+
*/
75+
private static final int LEFT_IDX = 0;
76+
private static final int BOTTOM_IDX = 1;
77+
private static final int RIGHT_IDX = 2;
78+
private static final int TOP_IDX = 3;
79+
80+
/**
81+
* Size of the array containing bbox.
82+
*/
83+
private static final int BBOX_ARRAY_SIZE = 4;
84+
6385
/**
6486
* Creates a new {@link TesseractHelper} instance.
6587
*/
@@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
86108
throws IOException {
87109
Map<Integer, List<TextInfo>> imageData =
88110
new LinkedHashMap<Integer, List<TextInfo>>();
111+
Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();
89112

90113
for (File inputFile : inputFiles) {
91114
if (inputFile != null
92115
&& Files.exists(
93-
java.nio.file.Paths
94-
.get(inputFile.getAbsolutePath()))) {
116+
java.nio.file.Paths
117+
.get(inputFile.getAbsolutePath()))) {
95118
FileInputStream fileInputStream =
96119
new FileInputStream(inputFile.getAbsolutePath());
97120
Document doc = Jsoup.parse(fileInputStream,
98121
java.nio.charset.StandardCharsets.UTF_8.name(),
99122
inputFile.getAbsolutePath());
100123
Elements pages = doc.getElementsByClass("ocr_page");
101124

102-
Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
103-
Pattern bboxCoordinatePattern = Pattern
104-
.compile(
105-
".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
106125
List<String> searchedClasses = TextPositioning.BY_LINES
107126
.equals(textPositioning)
108127
? Arrays.<String>asList("ocr_line", "ocr_caption")
@@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
124143
}
125144
}
126145
for (Element obj : objects) {
127-
String value = obj.attr("title");
128-
Matcher bboxMatcher = bboxPattern.matcher(value);
129-
if (bboxMatcher.matches()) {
130-
Matcher bboxCoordinateMatcher =
131-
bboxCoordinatePattern
132-
.matcher(bboxMatcher.group());
133-
if (bboxCoordinateMatcher.matches()) {
134-
List<Float> coordinates =
135-
new ArrayList<Float>();
136-
for (int i = 0; i < 4; i++) {
137-
String coord = bboxCoordinateMatcher
138-
.group(i + 1);
139-
coordinates
140-
.add(Float.parseFloat(coord));
141-
}
142-
143-
textData.add(new TextInfo(obj.text(),
144-
coordinates));
145-
}
146-
}
146+
List<Float> coordinates = getAlignedBBox(obj,
147+
textPositioning,
148+
unparsedBBoxes);
149+
textData.add(new TextInfo(obj.text(),
150+
coordinates));
147151
}
148152
}
149153
if (textData.size() > 0) {
@@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
157161
fileInputStream.close();
158162
}
159163
}
164+
for (Node node : unparsedBBoxes.values()) {
165+
LOGGER.warn(MessageFormatUtil.format(
166+
Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX,
167+
node.toString()
168+
));
169+
}
160170
return imageData;
161171
}
162172

173+
/**
174+
* Get and align (if needed) bbox of the element.
175+
*/
176+
static List<Float> getAlignedBBox(Element object,
177+
TextPositioning textPositioning,
178+
Map<String, Node> unparsedBBoxes) {
179+
final List<Float> coordinates = parseBBox(object, unparsedBBoxes);
180+
if (TextPositioning.BY_WORDS_AND_LINES == textPositioning
181+
|| TextPositioning.BY_WORDS == textPositioning) {
182+
Node line = object.parent();
183+
final List<Float> lineCoordinates = parseBBox(line, unparsedBBoxes);
184+
if (TextPositioning.BY_WORDS_AND_LINES == textPositioning) {
185+
coordinates.set(BOTTOM_IDX, lineCoordinates.get(BOTTOM_IDX));
186+
coordinates.set(TOP_IDX, lineCoordinates.get(TOP_IDX));
187+
}
188+
detectAndFixBrokenBBoxes(object, coordinates,
189+
lineCoordinates, unparsedBBoxes);
190+
}
191+
return coordinates;
192+
}
193+
194+
/**
195+
* Parses element bbox.
196+
*
197+
* @param node element containing bbox
198+
* @param unparsedBBoxes list of element ids with bboxes which could not be parsed
199+
* @return parsed bbox
200+
*/
201+
static List<Float> parseBBox(Node node, Map<String, Node> unparsedBBoxes) {
202+
List<Float> bbox = new ArrayList<>();
203+
Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr("title"));
204+
if (bboxMatcher.matches()) {
205+
Matcher bboxCoordinateMatcher =
206+
BBOX_COORDINATE_PATTERN
207+
.matcher(bboxMatcher.group());
208+
if (bboxCoordinateMatcher.matches()) {
209+
for (int i = 0; i < BBOX_ARRAY_SIZE; i++) {
210+
String coord = bboxCoordinateMatcher
211+
.group(i + 1);
212+
bbox.add(Float.parseFloat(coord));
213+
}
214+
}
215+
}
216+
if (bbox.size() == 0) {
217+
bbox = Arrays.asList(0f, 0f, 0f, 0f);
218+
String id = node.attr("id");
219+
if (id != null && !unparsedBBoxes.containsKey(id)) {
220+
unparsedBBoxes.put(id, node);
221+
}
222+
}
223+
return bbox;
224+
}
225+
226+
/**
227+
* Sometimes hOCR file contains broke character bboxes which are equal to page bbox.
228+
* This method attempts to detect and fix them.
229+
*/
230+
static void detectAndFixBrokenBBoxes(Element object, List<Float> coordinates,
231+
List<Float> lineCoordinates,
232+
Map<String, Node> unparsedBBoxes) {
233+
if (coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX)
234+
|| coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX)) {
235+
if (object.previousElementSibling() == null) {
236+
coordinates.set(LEFT_IDX, lineCoordinates.get(LEFT_IDX));
237+
} else {
238+
Element sibling = object.previousElementSibling();
239+
List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
240+
coordinates.set(LEFT_IDX, siblingBBox.get(RIGHT_IDX));
241+
}
242+
}
243+
if (coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX)
244+
|| coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX)) {
245+
if (object.nextElementSibling() == null) {
246+
coordinates.set(RIGHT_IDX, lineCoordinates.get(RIGHT_IDX));
247+
} else {
248+
Element sibling = object.nextElementSibling();
249+
List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
250+
coordinates.set(RIGHT_IDX, siblingBBox.get(LEFT_IDX));
251+
}
252+
}
253+
}
254+
163255
/**
164256
* Deletes file using provided path.
165257
*
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
208300
* @param data text data in required format as {@link java.lang.String}
209301
*/
210302
static void writeToTextFile(final String path,
211-
final String data) {
303+
final String data) {
212304
try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
213305
StandardCharsets.UTF_8)) {
214306
writer.write(data);
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
228320
* @throws Tesseract4OcrException if provided command failed
229321
*/
230322
static void runCommand(final String execPath,
231-
final List<String> paramsList) throws Tesseract4OcrException {
323+
final List<String> paramsList) throws Tesseract4OcrException {
232324
try {
233325
String params = String.join(" ", paramsList);
234326
boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
251343
.TESSERACT_FAILED);
252344
}
253345
}
254-
}
346+
}

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,9 @@ public enum TextPositioning {
3939
/**
4040
* Text will be located by words retrieved from hocr file.
4141
*/
42-
BY_WORDS
43-
}
42+
BY_WORDS,
43+
/**
44+
* Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line.
45+
*/
46+
BY_WORDS_AND_LINES,
47+
}

pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ public class IntegrationTestHelper extends ExtendedITextTest {
8787

8888
// path to font for hindi
8989
protected static final String NOTO_SANS_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSans-Regular.ttf";
90+
// path to font for thai
91+
protected static final String NOTO_SANS_THAI_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSansThai-Regular.ttf";
9092
// path to font for japanese
9193
protected static final String KOSUGI_FONT_PATH = TEST_FONTS_DIRECTORY + "Kosugi-Regular.ttf";
9294
// path to font for chinese
@@ -101,13 +103,14 @@ public class IntegrationTestHelper extends ExtendedITextTest {
101103
static {
102104
Map<String, String> fontPathToNameMap = new HashMap<>();
103105
fontPathToNameMap.put(NOTO_SANS_FONT_PATH, "NotoSans");
106+
fontPathToNameMap.put(NOTO_SANS_THAI_FONT_PATH, "NotoSansThai");
104107
fontPathToNameMap.put(KOSUGI_FONT_PATH, "Kosugi");
105108
fontPathToNameMap.put(NOTO_SANS_SC_FONT_PATH, "NotoSansSC");
106109
fontPathToNameMap.put(CAIRO_FONT_PATH, "Cairo");
107110
fontPathToNameMap.put(FREE_SANS_FONT_PATH, "FreeSans");
108111
FONT_PATH_TO_FONT_NAME_MAP = Collections.unmodifiableMap(fontPathToNameMap);
109112
}
110-
113+
111114
public enum ReaderType {
112115
LIB,
113116
EXECUTABLE
@@ -164,7 +167,7 @@ protected static File getTessDataDirectory() {
164167
* Retrieve text from specified page from given PDF document.
165168
*/
166169
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
167-
File file, int page, List<String> languages, List<String> fonts) {
170+
File file, int page, List<String> languages, List<String> fonts) {
168171
String result = null;
169172
String pdfPath = null;
170173
try {
@@ -183,7 +186,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
183186
* Retrieve text from specified page from given PDF document.
184187
*/
185188
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
186-
File file, int page, List<String> languages, String fontPath) {
189+
File file, int page, List<String> languages, String fontPath) {
187190
return getTextFromPdf(tesseractReader, file, page, languages,
188191
Collections.<String>singletonList(fontPath));
189192
}
@@ -192,15 +195,15 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
192195
* Retrieve text from the first page of given PDF document setting font.
193196
*/
194197
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file,
195-
List<String> languages, String fontPath) {
198+
List<String> languages, String fontPath) {
196199
return getTextFromPdf(tesseractReader, file, 1, languages, fontPath);
197200
}
198201

199202
/**
200203
* Retrieve text from the first page of given PDF document.
201204
*/
202205
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file,
203-
List<String> languages) {
206+
List<String> languages) {
204207
return getTextFromPdf(tesseractReader, file, 1, languages,
205208
new ArrayList<String>());
206209
}
@@ -209,7 +212,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
209212
* Retrieve text from the required page of given PDF document.
210213
*/
211214
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, int page,
212-
List<String> languages) {
215+
List<String> languages) {
213216
return getTextFromPdf(tesseractReader, file, page, languages, new ArrayList<String>());
214217
}
215218

@@ -224,7 +227,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
224227
* Get text from layer specified by name from page.
225228
*/
226229
protected String getTextFromPdfLayer(String pdfPath, String layerName,
227-
int page, boolean useActualText) throws IOException {
230+
int page, boolean useActualText) throws IOException {
228231
PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath),
229232
new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo()));
230233

@@ -243,7 +246,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName,
243246
* Get text from layer specified by name from page.
244247
*/
245248
protected String getTextFromPdfLayer(String pdfPath, String layerName,
246-
int page) throws IOException {
249+
int page) throws IOException {
247250
return getTextFromPdfLayer(pdfPath, layerName, page, false);
248251
}
249252

@@ -253,7 +256,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName,
253256
* {@link LocationTextExtractionStrategy#getResultantText()}.
254257
*/
255258
protected String getTextFromPdfLayerUsingActualText(String pdfPath,
256-
String layerName, int page) throws IOException {
259+
String layerName, int page) throws IOException {
257260
return getTextFromPdfLayer(pdfPath, layerName, page, true)
258261
.replace(" ", "");
259262
}
@@ -378,7 +381,7 @@ protected void doOcrAndSavePdfToPath(
378381
* (Text will be invisible)
379382
*/
380383
protected void doOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath,
381-
String pdfPath, List<String> languages, List<String> fonts) {
384+
String pdfPath, List<String> languages, List<String> fonts) {
382385
doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath,
383386
languages, fonts, null);
384387
}
@@ -469,7 +472,7 @@ public void setImageBBoxRectangle(com.itextpdf.kernel.geom.Rectangle imageBBoxRe
469472

470473
@Override
471474
protected boolean isChunkAtWordBoundary(TextChunk chunk,
472-
TextChunk previousChunk) {
475+
TextChunk previousChunk) {
473476
ITextChunkLocation curLoc = chunk.getLocation();
474477
ITextChunkLocation prevLoc = previousChunk.getLocation();
475478

@@ -522,4 +525,4 @@ else if (type.equals(EventType.RENDER_IMAGE)) {
522525
: tagHierarchy.get(0).getProperties().get(PdfName.Name).toString();
523526
}
524527
}
525-
}
528+
}

0 commit comments

Comments
 (0)