@@ -161,10 +161,10 @@ static Map<Integer, List<TextInfo>> parseHocrFile(
161161
162162 for (int inputFileIdx = 0 ; inputFileIdx < inputFiles .size (); inputFileIdx ++) {
163163 final File inputFile = inputFiles .get (inputFileIdx );
164- String txt = null ;
164+ List < String > txt = null ;
165165 if (txtInputFiles != null ) {
166166 final File txtInputFile = txtInputFiles .get (inputFileIdx );
167- txt = readTxtFile (txtInputFile );
167+ txt = Files . readAllLines (txtInputFile . toPath (), StandardCharsets . UTF_8 );
168168 }
169169 if (inputFile != null
170170 && Files .exists (
@@ -432,7 +432,7 @@ static void runCommand(final String execPath,
432432 */
433433 private static List <TextInfo > getTextData (Element page ,
434434 Tesseract4OcrEngineProperties tesseract4OcrEngineProperties ,
435- String txt ,
435+ List < String > txt ,
436436 Map <String , Node > unparsedBBoxes ) {
437437 final Rectangle pageBbox = parseBBox (page , null , unparsedBBoxes );
438438 final List <String > searchedClasses = Arrays .<String >asList (OCR_LINE , OCR_CAPTION );
@@ -457,7 +457,7 @@ private static List<TextInfo> getTextData(Element page,
457457 */
458458 private static List <TextInfo > getTextData (List <Element > pageObjects ,
459459 Tesseract4OcrEngineProperties tesseract4OcrEngineProperties ,
460- String txt ,
460+ List < String > txt ,
461461 Rectangle pageBbox ,
462462 Map <String , Node > unparsedBBoxes ) {
463463 List <TextInfo > textData = new ArrayList <TextInfo >();
@@ -645,15 +645,15 @@ private static TextInfo mergeTextInfos(List<TextInfo> textInfos) {
645645 *
646646 * @return text line if found, otherwise null
647647 */
648- private static String findHocrLineInTxt (Element line , String txt ) {
648+ private static String findHocrLineInTxt (Element line , List < String > txt ) {
649649 if (txt == null ) {
650650 return null ;
651651 }
652652 String hocrLineText = line .text ().replaceAll (SPACE_PATTERN , "" );
653653 if (hocrLineText .isEmpty ()) {
654654 return null ;
655655 }
656- for (String txtLine : txt . split ( " \n " ) ) {
656+ for (String txtLine : txt ) {
657657 if (txtLine .replaceAll (SPACE_PATTERN , "" ).equals (hocrLineText )) {
658658 return txtLine ;
659659 }
0 commit comments