Skip to content

Commit a934d79

Browse files
committed
Get rid of deprecated OcrPdfCreator#getLogicalTree
DEVSIX-8570
1 parent c82915e commit a934d79

10 files changed

Lines changed: 63 additions & 31 deletions

File tree

pdfocr-api/src/main/java/com/itextpdf/pdfocr/IOcrEngine.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,11 @@ public interface IOcrEngine {
8585
* @param ocrProcessContext ocr processing context
8686
*/
8787
void createTxtFile(List<File> inputImages, File txtFile, OcrProcessContext ocrProcessContext);
88+
89+
/**
90+
* Checks whether tagging is supported by the OCR engine.
91+
*
92+
* @return {@code true} if tagging is supported by the engine, {@code false} otherwise
93+
*/
94+
boolean isTaggingSupported();
8895
}

pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreator.java

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ public OcrPdfCreator(final IOcrEngine ocrEngine) {
129129
*/
130130
public OcrPdfCreator(final IOcrEngine ocrEngine,
131131
final OcrPdfCreatorProperties ocrPdfCreatorProperties) {
132+
if (ocrPdfCreatorProperties.isTagged() && !ocrEngine.isTaggingSupported()) {
133+
throw new PdfOcrException(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED);
134+
}
132135
setOcrEngine(ocrEngine);
133136
setOcrPdfCreatorProperties(ocrPdfCreatorProperties);
134137
}
@@ -471,11 +474,7 @@ private void addToCanvas(final PdfDocument pdfDocument,
471474
// Logical tree: a list of top-level items; children can be retrieved from them
472475
List<LogicalStructureTreeItem> logicalTree = new ArrayList<>();
473476
// A map from each leaf LogicalStructureTreeItem to the list of TextInfo objects attached to that leaf
474-
Map<LogicalStructureTreeItem, List<TextInfo>> leavesTextInfos = new HashMap<>();
475-
final boolean taggedSupported = getLogicalTree(pageText, logicalTree, leavesTextInfos);
476-
if (!taggedSupported) {
477-
throw new PdfOcrException(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED);
478-
}
477+
Map<LogicalStructureTreeItem, List<TextInfo>> leavesTextInfos = getLogicalTree(pageText, logicalTree);
479478
pdfDocument.setTagged();
480479

481480
// Create a map of TextInfo to tag pointers meanwhile creating the required tags.
@@ -634,19 +633,12 @@ private void addImageToCanvas(final ImageData imageData,
634633
}
635634
}
636635

637-
/**
638-
* @return {@code true} if tagging supported by the engine.
639-
* @deprecated In next major version we need to add boolean taggingSupported() method into IOcrEngine
640-
* and throw exception in OcrPdfCreator constructor if taggingSupported() returns false but
641-
* OcrPdfCreatorProperties.getTagged returns true.
642-
*/
643-
@Deprecated
644-
private static boolean getLogicalTree(List<TextInfo> textInfos,
645-
List<LogicalStructureTreeItem> logicalStructureTreeItems,
646-
Map<LogicalStructureTreeItem, List<TextInfo>> leavesTextInfos) {
647-
boolean taggedSupported = false;
636+
private static Map<LogicalStructureTreeItem, List<TextInfo>> getLogicalTree(
637+
List<TextInfo> textInfos, List<LogicalStructureTreeItem> logicalStructureTreeItems) {
638+
639+
Map<LogicalStructureTreeItem, List<TextInfo>> leavesTextInfos = new HashMap<>();
648640
if (textInfos == null) {
649-
return taggedSupported;
641+
return leavesTextInfos;
650642
}
651643

652644
for (TextInfo textInfo : textInfos) {
@@ -656,7 +648,6 @@ private static boolean getLogicalTree(List<TextInfo> textInfos,
656648
continue;
657649
} else if (structTreeItem != null) {
658650
topParent = getTopParent(structTreeItem);
659-
taggedSupported = true;
660651
} else {
661652
structTreeItem = new LogicalStructureTreeItem();
662653
textInfo.setLogicalStructureTreeItem(structTreeItem);
@@ -675,7 +666,7 @@ private static boolean getLogicalTree(List<TextInfo> textInfos,
675666
}
676667
}
677668

678-
return taggedSupported;
669+
return leavesTextInfos;
679670
}
680671

681672
private static LogicalStructureTreeItem getTopParent(LogicalStructureTreeItem structInfo) {

pdfocr-api/src/test/java/com/itextpdf/pdfocr/ApiTest.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -269,17 +269,14 @@ public void testTableStructureTree() throws IOException, InterruptedException {
269269
}
270270

271271
@Test
272-
@LogMessages(messages = @LogMessage(messageTemplate = PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT,
273-
logLevel = LogLevelConstants.ERROR))
274272
public void testTaggingNotSupported() {
275273
String input = PdfHelper.getImagesTestDirectory() + "numbers_01.jpg";
276274
String pdfPath = PdfHelper.getTargetDirectory() + "taggingNotSupported.pdf";
277275

278276
Exception e = Assertions.assertThrows(PdfOcrException.class,
279277
() -> PdfHelper.createPdf(pdfPath, new File(input), new OcrPdfCreatorProperties().setTagged(true))
280278
);
281-
Assertions.assertEquals(MessageFormatUtil.format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT,
282-
PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED), e.getMessage());
279+
Assertions.assertEquals(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED, e.getMessage());
283280
}
284281

285282
static class NotImplementedImageRotationHandler implements IImageRotationHandler {

pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/CustomOcrEngine.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ public CustomOcrEngine(OcrEngineProperties ocrEngineProperties) {
4747

4848
@Override
4949
public Map<Integer, List<TextInfo>> doImageOcr(File input) {
50-
Map<Integer, List<TextInfo>> result =
51-
new HashMap<Integer, List<TextInfo>>();
50+
Map<Integer, List<TextInfo>> result = new HashMap<Integer, List<TextInfo>>();
5251
String text = PdfHelper.DEFAULT_TEXT;
5352
if (input.getAbsolutePath().contains(PdfHelper.THAI_IMAGE_NAME)) {
5453
text = PdfHelper.THAI_TEXT;
@@ -72,6 +71,10 @@ public void createTxtFile(List<File> inputImages, File txtFile, OcrProcessContex
7271

7372
}
7473

74+
@Override
75+
public boolean isTaggingSupported() {
76+
return false;
77+
}
7578

7679
public OcrEngineProperties getOcrEngineProperties() {
7780
return ocrEngineProperties;

pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/CustomProductAwareOcrEngine.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ This file is part of the iText (R) project.
2424

2525
import com.itextpdf.commons.actions.contexts.IMetaInfo;
2626
import com.itextpdf.commons.actions.data.ProductData;
27-
import com.itextpdf.kernel.geom.Rectangle;
2827
import com.itextpdf.pdfocr.IOcrEngine;
2928
import com.itextpdf.pdfocr.IProductAware;
3029
import com.itextpdf.pdfocr.OcrEngineProperties;
@@ -34,7 +33,6 @@ This file is part of the iText (R) project.
3433

3534
import java.io.File;
3635
import java.util.Collections;
37-
import java.util.HashMap;
3836
import java.util.List;
3937
import java.util.Map;
4038

@@ -64,6 +62,10 @@ public void createTxtFile(List<File> inputImages, File txtFile, OcrProcessContex
6462

6563
}
6664

65+
@Override
66+
public boolean isTaggingSupported() {
67+
return true;
68+
}
6769

6870
public OcrEngineProperties getOcrEngineProperties() {
6971
return null;

pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/TestStructureDetectionOcrEngine.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ This file is part of the iText (R) project.
3838
import java.util.HashMap;
3939
import java.util.List;
4040
import java.util.Map;
41-
import org.junit.jupiter.api.Test;
4241

4342
public class TestStructureDetectionOcrEngine implements IOcrEngine {
4443

@@ -101,4 +100,9 @@ public void createTxtFile(List<File> inputImages, File txtFile) {
101100
@Override
102101
public void createTxtFile(List<File> inputImages, File txtFile, OcrProcessContext ocrProcessContext) {
103102
}
103+
104+
@Override
105+
public boolean isTaggingSupported() {
106+
return true;
107+
}
104108
}

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,10 @@ This file is part of the iText (R) project.
6060
/**
6161
* The implementation of {@link IOcrEngine}.
6262
*
63+
* <p>
6364
* This class provides possibilities to perform OCR, to read data from input
6465
* files and to return contained text in the required format.
65-
* Also there are possibilities to use features of "tesseract"
66+
* Also, there are possibilities to use features of "tesseract"
6667
* (optical character recognition engine for various operating systems).
6768
*/
6869
public abstract class AbstractTesseract4OcrEngine implements IOcrEngine, IProductAware {
@@ -379,6 +380,11 @@ public ProductData getProductData() {
379380
return PdfOcrTesseract4ProductData.getInstance();
380381
}
381382

383+
@Override
384+
public boolean isTaggingSupported() {
385+
return false;
386+
}
387+
382388
/**
383389
* Performs tesseract OCR using command line tool
384390
* or a wrapper for Tesseract OCR API.

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.pdfocr.tesseract4;
2424

25-
import com.itextpdf.commons.actions.EventManager;
2625
import com.itextpdf.commons.actions.confirmations.ConfirmEvent;
2726
import com.itextpdf.commons.actions.confirmations.EventConfirmationType;
2827
import com.itextpdf.commons.utils.MessageFormatUtil;
@@ -49,6 +48,7 @@ This file is part of the iText (R) project.
4948
/**
5049
* The implementation of {@link AbstractTesseract4OcrEngine} for tesseract OCR.
5150
*
51+
* <p>
5252
* This class provides possibilities to use features of "tesseract" CL tool
5353
* (optical character recognition engine for various operating systems).
5454
* Please note that it's assumed that "tesseract" has already been

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.pdfocr.tesseract4;
2424

25-
import com.itextpdf.commons.actions.EventManager;
2625
import com.itextpdf.commons.actions.confirmations.ConfirmEvent;
2726
import com.itextpdf.commons.actions.confirmations.EventConfirmationType;
2827
import com.itextpdf.commons.utils.MessageFormatUtil;
@@ -53,9 +52,11 @@ This file is part of the iText (R) project.
5352
/**
5453
* The implementation of {@link AbstractTesseract4OcrEngine} for tesseract OCR.
5554
*
55+
* <p>
5656
* This class provides possibilities to use features of "tesseract"
5757
* using tess4j.
5858
*
59+
* <p>
5960
* Please note that this class is not thread-safe; in other words, this Tesseract engine cannot
6061
* be used for multithreaded processing. You should create one instance per thread.
6162
*/

pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ This file is part of the iText (R) project.
2424

2525
import com.itextpdf.commons.utils.MessageFormatUtil;
2626
import com.itextpdf.pdfocr.IntegrationTestHelper;
27+
import com.itextpdf.pdfocr.OcrPdfCreator;
28+
import com.itextpdf.pdfocr.OcrPdfCreatorProperties;
2729
import com.itextpdf.pdfocr.TextInfo;
30+
import com.itextpdf.pdfocr.exceptions.PdfOcrException;
31+
import com.itextpdf.pdfocr.exceptions.PdfOcrExceptionMessageConstant;
2832
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4Exception;
2933
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4ExceptionMessageConstant;
3034
import com.itextpdf.pdfocr.tesseract4.logs.Tesseract4LogMessageConstant;
@@ -152,4 +156,21 @@ public void testDetectAndFixBrokenBBoxes() throws IOException {
152156
Assertions.assertEquals(162.75, (float)textInfo.getBboxRect().getTop(), 0.1);
153157
}
154158

159+
@Test
160+
public void testTaggingNotSupportedForTesseract4ExecutableOcrEngine() {
161+
Exception e = Assertions.assertThrows(PdfOcrException.class,
162+
() -> new OcrPdfCreator(new Tesseract4ExecutableOcrEngine(new Tesseract4OcrEngineProperties()),
163+
new OcrPdfCreatorProperties().setTagged(true))
164+
);
165+
Assertions.assertEquals(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED, e.getMessage());
166+
}
167+
168+
@Test
169+
public void testTaggingNotSupportedForTesseract4LibOcrEngine() {
170+
Exception e = Assertions.assertThrows(PdfOcrException.class,
171+
() -> new OcrPdfCreator(new Tesseract4LibOcrEngine(new Tesseract4OcrEngineProperties()),
172+
new OcrPdfCreatorProperties().setTagged(true))
173+
);
174+
Assertions.assertEquals(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED, e.getMessage());
175+
}
155176
}

0 commit comments

Comments
 (0)