Skip to content

Commit f3ba2ba

Browse files
committed
feat(JF-466): add language recognition
1 parent 5258c29 commit f3ba2ba

4 files changed

Lines changed: 68 additions & 21 deletions

File tree

pom.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,13 @@
320320
<version>${pdfbox.version}</version>
321321
</dependency>
322322

323+
<!-- Language recognition -->
324+
<dependency>
325+
<groupId>com.github.pemistahl</groupId>
326+
<artifactId>lingua</artifactId>
327+
<version>1.2.2</version>
328+
</dependency>
329+
323330
<!-- Using Hamcrest in a Maven Project see http://hamcrest.org/JavaHamcrest/distributables#maven-upgrade-example -->
324331
<dependency>
325332
<groupId>org.hamcrest</groupId>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import javax.xml.transform.stream.StreamResult;
1616
import javax.xml.transform.stream.StreamSource;
1717

18+
import com.github.pemistahl.lingua.api.LanguageDetector;
19+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
1820
import org.apache.pdfbox.pdmodel.PDDocument;
1921
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
2022
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@@ -39,7 +41,7 @@
3941

4042
/**
4143
* A {@link Matcher} for PDF documents.
42-
*
44+
* <p>
4345
* Caveat: for performance reasons, this should only be called from a context where the stream has
4446
* already been identified as a PDF file/stream.
4547
*/
@@ -68,6 +70,15 @@ public class PDFMatcher extends Matcher {
6870
public static final String TEXT_LENGTH_KEY = "text-length";
6971
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";
7072

73+
/**
74+
* Most likely language of the text of the PDF, analyzed with <a href="https://github.com/pemistahl/lingua">lingua</a>
75+
*/
76+
public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
77+
/**
78+
* All possible languages of the PDF's text, sorted by their confidence value, analyzed with <a href="https://github.com/pemistahl/lingua">lingua</a>
79+
*/
80+
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
81+
7182
private static boolean lookForText() {
7283
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false"));
7384
}
@@ -251,6 +262,8 @@ private static PDEmbeddedFile getEmbeddedFile(final PDComplexFileSpecification f
251262
* <li>{@link #CONTAINS_TEXT_KEY} whether the whole document contains any text (without line breaks)</li>
252263
* <li>{@link #TEXT_LENGTH_PER_PAGE_KEY} list of integers that indicate how long the text in each page is (only set if there is text at all)</li>
253264
* <li>{@link #TEXT_LENGTH_KEY} length of the text of the whole document (only set if there is text at all)</li>
265+
* <li>{@link #MOST_LIKELY_TEXT_LANGUAGE} detected language (only set if there is text at all)</li>
266+
* <li>{@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} map of all possible languages, sorted by their confidence value (only set if there is text at all)</li>
254267
* </ul>
255268
*
256269
* @param pdfDetails map to which the results get added
@@ -274,9 +287,32 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
274287
final String pdfText = new PDFTextStripper().getText(doc);
275288
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
276289
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
290+
addLanguageInformation(pdfDetails, pdfText);
277291
}
278292
}
279293

294+
/**
295+
* Adds information about the given text to the given map.
296+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
297+
* language detection is not reliably possible.
298+
*
299+
* @param pdfDetails map to which the results get added
300+
* @param text text to analyze
301+
*/
302+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
303+
LanguageDetectorBuilder languageDetectorBuilder =
304+
LanguageDetectorBuilder
305+
.fromAllLanguages()
306+
.withMinimumRelativeDistance(0.1);
307+
if (text.length() > 120)
308+
languageDetectorBuilder.withLowAccuracyMode();
309+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
310+
final long startTime = System.currentTimeMillis();
311+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
312+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
313+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
314+
}
315+
280316
/**
281317
* Reads the whole stream to determine the length of it.
282318
*

src/test/java/TestPDFMatcher.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount) th
158158
@SuppressWarnings("unchecked")
159159
@ParameterizedTest
160160
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
161-
void testContainsText(final String filePath, final boolean expected) throws IOException {
161+
void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
162162
System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true");
163163
Map<String, Object> result = ANALYZER.analyze(new File(filePath));
164164
assertNotNull(result);
@@ -174,6 +174,10 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
174174
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
175175
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
176176
assertEquals(totalTextLength, sum);
177+
if (!language.equals("null")) {
178+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
179+
}
180+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
177181
}
178182
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
179183
}
Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)