Skip to content

Commit cd524d8

Browse files
axherrmwelschsn
authored and committed
undo language recognition to move to another branch
1 parent 6289aad commit cd524d8

4 files changed

Lines changed: 19 additions & 57 deletions

File tree

pom.xml

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -320,13 +320,6 @@
320320
<version>${pdfbox.version}</version>
321321
</dependency>
322322

323-
<!-- Language recognition -->
324-
<dependency>
325-
<groupId>com.github.pemistahl</groupId>
326-
<artifactId>lingua</artifactId>
327-
<version>1.2.2</version>
328-
</dependency>
329-
330323
<!-- for pdf signature validation -->
331324
<dependency>
332325
<groupId>org.bouncycastle</groupId>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 0 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -21,8 +21,6 @@
2121
import javax.xml.transform.stream.StreamResult;
2222
import javax.xml.transform.stream.StreamSource;
2323

24-
import com.github.pemistahl.lingua.api.LanguageDetector;
25-
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2624
import org.apache.pdfbox.Loader;
2725
import org.apache.pdfbox.io.IOUtils;
2826
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -267,8 +265,6 @@ private static PDEmbeddedFile getEmbeddedFile(final PDComplexFileSpecification f
267265
* <li>{@link #CONTAINS_TEXT_KEY} whether the whole document contains any text (without line breaks)</li>
268266
* <li>{@link #TEXT_LENGTH_PER_PAGE_KEY} list of integers that indicate how long the text in each page is (only set if there is text at all)</li>
269267
* <li>{@link #TEXT_LENGTH_KEY} length of the text of the whole document (only set if there is text at all)</li>
270-
* <li>{@link #MOST_LIKELY_TEXT_LANGUAGE} detected language (only set if there is text at all)</li>
271-
* <li>{@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} map of all possible languages, sorted by their confidence value (only set if there is text at all)</li>
272268
* </ul>
273269
*
274270
* @param pdfDetails map to which the results get added
@@ -292,32 +288,9 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
292288
final String pdfText = new PDFTextStripper().getText(doc);
293289
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
294290
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
295-
addLanguageInformation(pdfDetails, pdfText);
296291
}
297292
}
298293

299-
/**
300-
* Adds information about the given text to the given map.
301-
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
302-
* language detection is not reliably possible.
303-
*
304-
* @param pdfDetails map to which the results get added
305-
* @param text text to analyze
306-
*/
307-
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
308-
LanguageDetectorBuilder languageDetectorBuilder =
309-
LanguageDetectorBuilder
310-
.fromAllLanguages()
311-
.withMinimumRelativeDistance(0.1);
312-
if (text.length() > 120)
313-
languageDetectorBuilder.withLowAccuracyMode();
314-
final LanguageDetector languageDetector = languageDetectorBuilder.build();
315-
final long startTime = System.currentTimeMillis();
316-
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
317-
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
318-
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
319-
}
320-
321294
/**
322295
* Checks if the PDF is an electronic invoice.
323296
*

src/test/java/TestPDFMatcher.java

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -177,10 +177,6 @@ void testContainsText(final String filePath, final boolean expected, final Strin
177177
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
178178
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
179179
assertEquals(totalTextLength, sum);
180-
if (!language.equals("null")) {
181-
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
182-
}
183-
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
184180
}
185181
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
186182
}
Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,19 @@
1-
path,contains-text,language(ignored if null)
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN
1+
path,contains-text
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true

0 commit comments

Comments
 (0)