Skip to content

Commit 6289aad

Browse files
axherrmwelschsn
authored and committed
feat(JF-466): add language recognition
1 parent 1c7e81f commit 6289aad

4 files changed

Lines changed: 67 additions & 20 deletions

File tree

pom.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,13 @@
320320
<version>${pdfbox.version}</version>
321321
</dependency>
322322

323+
<!-- Language recognition -->
324+
<dependency>
325+
<groupId>com.github.pemistahl</groupId>
326+
<artifactId>lingua</artifactId>
327+
<version>1.2.2</version>
328+
</dependency>
329+
323330
<!-- for pdf signature validation -->
324331
<dependency>
325332
<groupId>org.bouncycastle</groupId>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import javax.xml.transform.stream.StreamResult;
2222
import javax.xml.transform.stream.StreamSource;
2323

24+
import com.github.pemistahl.lingua.api.LanguageDetector;
25+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2426
import org.apache.pdfbox.Loader;
2527
import org.apache.pdfbox.io.IOUtils;
2628
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -77,6 +79,15 @@ public class PDFMatcher extends Matcher {
7779
public static final String TEXT_LENGTH_KEY = "text-length";
7880
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";
7981

82+
/**
83+
* Most likely language of the text of the PDF, analyzed with <a href="https://github.com/pemistahl/lingua">lingua</a>
84+
*/
85+
public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
86+
/**
87+
* All possible languages of the PDF's text, sorted by their confidence value, analyzed with <a href="https://github.com/pemistahl/lingua">lingua</a>
88+
*/
89+
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
90+
8091
private static boolean lookForText() {
8192
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false"));
8293
}
@@ -256,6 +267,8 @@ private static PDEmbeddedFile getEmbeddedFile(final PDComplexFileSpecification f
256267
* <li>{@link #CONTAINS_TEXT_KEY} whether the whole document contains any text (without line breaks)</li>
257268
* <li>{@link #TEXT_LENGTH_PER_PAGE_KEY} list of integers that indicate how long the text in each page is (only set if there is text at all)</li>
258269
* <li>{@link #TEXT_LENGTH_KEY} length of the text of the whole document (only set if there is text at all)</li>
270+
* <li>{@link #MOST_LIKELY_TEXT_LANGUAGE} detected language (only set if there is text at all)</li>
271+
* <li>{@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} map of all possible languages, sorted by their confidence value (only set if there is text at all)</li>
259272
* </ul>
260273
*
261274
* @param pdfDetails map to which the results get added
@@ -279,9 +292,32 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
279292
final String pdfText = new PDFTextStripper().getText(doc);
280293
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
281294
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
295+
addLanguageInformation(pdfDetails, pdfText);
282296
}
283297
}
284298

299+
/**
300+
* Adds information about the given text to the given map.
301+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
302+
* language detection is not reliably possible.
303+
*
304+
* @param pdfDetails map to which the results get added
305+
* @param text text to analyze
306+
*/
307+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
308+
LanguageDetectorBuilder languageDetectorBuilder =
309+
LanguageDetectorBuilder
310+
.fromAllLanguages()
311+
.withMinimumRelativeDistance(0.1);
312+
if (text.length() > 120)
313+
languageDetectorBuilder.withLowAccuracyMode();
314+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
315+
final long startTime = System.currentTimeMillis();
316+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
317+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
318+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
319+
}
320+
285321
/**
286322
* Checks if the PDF is an electronic invoice.
287323
*

src/test/java/TestPDFMatcher.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount, fi
161161
@SuppressWarnings("unchecked")
162162
@ParameterizedTest
163163
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
164-
void testContainsText(final String filePath, final boolean expected) throws IOException {
164+
void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
165165
System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true");
166166
Map<String, Object> result = ANALYZER.analyze(new File(filePath));
167167
assertNotNull(result);
@@ -177,6 +177,10 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
177177
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
178178
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
179179
assertEquals(totalTextLength, sum);
180+
if (!language.equals("null")) {
181+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
182+
}
183+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
180184
}
181185
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
182186
}
src/test/resources/pdf/contains-text.csv

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)