Skip to content

Commit 44df9b9

Browse files
committed
The thai_03 test fails in .NET. This might be related to an issue with reading UTF-8 files.
1 parent b5ec35b commit 44df9b9

1 file changed

Lines changed: 6 additions & 6 deletions

File tree

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -161,10 +161,10 @@ static Map<Integer, List<TextInfo>> parseHocrFile(
161161

162162
for (int inputFileIdx = 0; inputFileIdx < inputFiles.size(); inputFileIdx++) {
163163
final File inputFile = inputFiles.get(inputFileIdx);
164-
String txt = null;
164+
List<String> txt = null;
165165
if (txtInputFiles != null) {
166166
final File txtInputFile = txtInputFiles.get(inputFileIdx);
167-
txt = readTxtFile(txtInputFile);
167+
txt = Files.readAllLines(txtInputFile.toPath(), StandardCharsets.UTF_8);
168168
}
169169
if (inputFile != null
170170
&& Files.exists(
@@ -432,7 +432,7 @@ static void runCommand(final String execPath,
432432
*/
433433
private static List<TextInfo> getTextData(Element page,
434434
Tesseract4OcrEngineProperties tesseract4OcrEngineProperties,
435-
String txt,
435+
List<String> txt,
436436
Map<String, Node> unparsedBBoxes) {
437437
final Rectangle pageBbox = parseBBox(page, null, unparsedBBoxes);
438438
final List<String> searchedClasses = Arrays.<String>asList(OCR_LINE, OCR_CAPTION);
@@ -457,7 +457,7 @@ private static List<TextInfo> getTextData(Element page,
457457
*/
458458
private static List<TextInfo> getTextData(List<Element> pageObjects,
459459
Tesseract4OcrEngineProperties tesseract4OcrEngineProperties,
460-
String txt,
460+
List<String> txt,
461461
Rectangle pageBbox,
462462
Map<String, Node> unparsedBBoxes) {
463463
List<TextInfo> textData = new ArrayList<TextInfo>();
@@ -645,15 +645,15 @@ private static TextInfo mergeTextInfos(List<TextInfo> textInfos) {
645645
*
646646
* @return text line if found, otherwise null
647647
*/
648-
private static String findHocrLineInTxt(Element line, String txt) {
648+
private static String findHocrLineInTxt(Element line, List<String> txt) {
649649
if (txt == null) {
650650
return null;
651651
}
652652
String hocrLineText = line.text().replaceAll(SPACE_PATTERN, "");
653653
if (hocrLineText.isEmpty()) {
654654
return null;
655655
}
656-
for (String txtLine : txt.split("\n")) {
656+
for (String txtLine : txt) {
657657
if (txtLine.replaceAll(SPACE_PATTERN, "").equals(hocrLineText)) {
658658
return txtLine;
659659
}

0 commit comments

Comments
 (0)