Skip to content

Commit c8d10b8

Browse files
axherrmwelschsn
authored and committed
feat(JF-466): add language recognition
1 parent 1c7e81f commit c8d10b8

4 files changed

Lines changed: 88 additions & 24 deletions

File tree

pom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
<jaxb-api.version>4.0.4</jaxb-api.version>
5656
<jaxb-core.version>4.0.6</jaxb-core.version>
5757
<jaxb-impl.version>4.0.6</jaxb-impl.version>
58+
<lingua.version>1.2.2</lingua.version>
5859
<opennlp-tools.version>1.9.3</opennlp-tools.version>
5960
<pdfbox.version>3.0.6</pdfbox.version>
6061
<poi.version>5.5.0</poi.version>
@@ -385,6 +386,13 @@
385386
<version>${opennlp-tools.version}</version>
386387
<scope>compile</scope>
387388
</dependency>
389+
390+
<!-- Language recognition -->
391+
<dependency>
392+
<groupId>com.github.pemistahl</groupId>
393+
<artifactId>lingua</artifactId>
394+
<version>${lingua.version}</version>
395+
</dependency>
388396
</dependencies>
389397

390398
<profiles>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import javax.xml.transform.stream.StreamResult;
2222
import javax.xml.transform.stream.StreamSource;
2323

24+
import com.github.pemistahl.lingua.api.LanguageDetector;
25+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2426
import org.apache.pdfbox.Loader;
2527
import org.apache.pdfbox.io.IOUtils;
2628
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -77,6 +79,20 @@ public class PDFMatcher extends Matcher {
7779
public static final String TEXT_LENGTH_KEY = "text-length";
7880
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";
7981

82+
/**
83+
* Key under which the most likely language of the PDF's text is stored (as a string), as detected with <a href="https://github.com/pemistahl/lingua">lingua</a>.
84+
*/
85+
public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
86+
/**
87+
* Key under which the confidence values of all possible languages of the PDF's text are stored, sorted by their confidence value, as computed with <a href="https://github.com/pemistahl/lingua">lingua</a>.
88+
*/
89+
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
90+
91+
92+
private static boolean checkLanguage() {
93+
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false"));
94+
}
95+
8096
private static boolean lookForText() {
8197
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false"));
8298
}
@@ -279,9 +295,33 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
279295
final String pdfText = new PDFTextStripper().getText(doc);
280296
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
281297
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
298+
if (checkLanguage())
299+
addLanguageInformation(pdfDetails, pdfText);
282300
}
283301
}
284302

303+
/**
304+
* Adds information about the given text to the given map.
305+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
306+
* language detection is not reliably possible.
307+
*
308+
* @param pdfDetails map to which the results get added
309+
* @param text text to analyze
310+
*/
311+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
312+
LanguageDetectorBuilder languageDetectorBuilder =
313+
LanguageDetectorBuilder
314+
.fromAllLanguages()
315+
.withMinimumRelativeDistance(0.1);
316+
if (text.length() > 120)
317+
languageDetectorBuilder.withLowAccuracyMode();
318+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
319+
final long startTime = System.currentTimeMillis();
320+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
321+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
322+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
323+
}
324+
285325
/**
286326
* Checks if the PDF is an electronic invoice.
287327
*

src/test/java/TestPDFMatcher.java

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import static org.hamcrest.CoreMatchers.containsString;
22
import static org.hamcrest.CoreMatchers.hasItems;
33
import static org.hamcrest.MatcherAssert.assertThat;
4-
import static org.hamcrest.Matchers.*;
4+
import static org.hamcrest.Matchers.equalTo;
5+
import static org.hamcrest.Matchers.hasEntry;
6+
import static org.hamcrest.Matchers.hasKey;
7+
import static org.hamcrest.Matchers.notNullValue;
58
import static org.junit.jupiter.api.Assertions.assertEquals;
69
import static org.junit.jupiter.api.Assertions.assertFalse;
710
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -19,8 +22,9 @@
1922
import org.jadice.filetype.database.MimeTypeAction;
2023
import org.jadice.filetype.matchers.PDFMatcher;
2124
import org.jadice.filetype.pdfutil.SignatureUtil;
22-
import org.junit.jupiter.api.BeforeAll;
25+
import org.junit.jupiter.api.BeforeEach;
2326
import org.junit.jupiter.api.Test;
27+
import org.junit.jupiter.api.TestInfo;
2428
import org.junit.jupiter.params.ParameterizedTest;
2529
import org.junit.jupiter.params.provider.CsvFileSource;
2630

@@ -37,8 +41,14 @@ class TestPDFMatcher {
3741

3842
private static Analyzer ANALYZER;
3943

40-
@BeforeAll
41-
public static void init() throws AnalyzerException {
44+
@BeforeEach
45+
public void init(TestInfo testInfo) throws AnalyzerException {
46+
try {
47+
if (testInfo.getTestMethod().get().getName().equals("testContainsText"))
48+
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
49+
} catch (Exception e) {
50+
e.printStackTrace();
51+
}
4252
ANALYZER = Analyzer.getInstance("/magic.xml");
4353
}
4454

@@ -161,7 +171,8 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount, fi
161171
@SuppressWarnings("unchecked")
162172
@ParameterizedTest
163173
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
164-
void testContainsText(final String filePath, final boolean expected) throws IOException {
174+
void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
175+
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
165176
System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true");
166177
Map<String, Object> result = ANALYZER.analyze(new File(filePath));
167178
assertNotNull(result);
@@ -177,7 +188,12 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
177188
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
178189
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
179190
assertEquals(totalTextLength, sum);
191+
if (!language.equals("null")) {
192+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
193+
}
194+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
180195
}
196+
System.clearProperty(PDFMatcher.class.getName() + ".languageCheck");
181197
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
182198
}
183199

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)