Skip to content

Commit ba95d4a

Browse files
axherrmwelschsn
authored and committed
add language detection
1 parent cd524d8 commit ba95d4a

4 files changed

Lines changed: 73 additions & 23 deletions

File tree

pom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
<jaxb-api.version>4.0.4</jaxb-api.version>
5656
<jaxb-core.version>4.0.6</jaxb-core.version>
5757
<jaxb-impl.version>4.0.6</jaxb-impl.version>
58+
<lingua.version>1.2.2</lingua.version>
5859
<opennlp-tools.version>1.9.3</opennlp-tools.version>
5960
<pdfbox.version>3.0.6</pdfbox.version>
6061
<poi.version>5.5.0</poi.version>
@@ -385,6 +386,13 @@
385386
<version>${opennlp-tools.version}</version>
386387
<scope>compile</scope>
387388
</dependency>
389+
390+
<!-- Language recognition -->
391+
<dependency>
392+
<groupId>com.github.pemistahl</groupId>
393+
<artifactId>lingua</artifactId>
394+
<version>${lingua.version}</version>
395+
</dependency>
388396
</dependencies>
389397

390398
<profiles>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import javax.xml.transform.stream.StreamResult;
2222
import javax.xml.transform.stream.StreamSource;
2323

24+
import com.github.pemistahl.lingua.api.LanguageDetector;
25+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2426
import org.apache.pdfbox.Loader;
2527
import org.apache.pdfbox.io.IOUtils;
2628
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -86,6 +88,13 @@ public class PDFMatcher extends Matcher {
8688
*/
8789
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
8890

91+
92+
private final static boolean languageCheck;
93+
94+
static {
95+
languageCheck = "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false"));
96+
}
97+
8998
private static boolean lookForText() {
9099
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false"));
91100
}
@@ -288,9 +297,33 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
288297
final String pdfText = new PDFTextStripper().getText(doc);
289298
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
290299
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
300+
if (languageCheck)
301+
addLanguageInformation(pdfDetails, pdfText);
291302
}
292303
}
293304

305+
/**
306+
* Adds information about the given text to the given map.
307+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
308+
* language detection is not reliably possible.
309+
*
310+
* @param pdfDetails map to which the results get added
311+
* @param text text to analyze
312+
*/
313+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
314+
LanguageDetectorBuilder languageDetectorBuilder =
315+
LanguageDetectorBuilder
316+
.fromAllLanguages()
317+
.withMinimumRelativeDistance(0.1);
318+
if (text.length() > 120)
319+
languageDetectorBuilder.withLowAccuracyMode();
320+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
321+
final long startTime = System.currentTimeMillis();
322+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
323+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
324+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
325+
}
326+
294327
/**
295328
* Checks if the PDF is an electronic invoice.
296329
*

src/test/java/TestPDFMatcher.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@
1919
import org.jadice.filetype.database.MimeTypeAction;
2020
import org.jadice.filetype.matchers.PDFMatcher;
2121
import org.jadice.filetype.pdfutil.SignatureUtil;
22-
import org.junit.jupiter.api.BeforeAll;
23-
import org.junit.jupiter.api.Test;
22+
import org.junit.jupiter.api.*;
2423
import org.junit.jupiter.params.ParameterizedTest;
2524
import org.junit.jupiter.params.provider.CsvFileSource;
2625

@@ -37,8 +36,14 @@ class TestPDFMatcher {
3736

3837
private static Analyzer ANALYZER;
3938

40-
@BeforeAll
41-
public static void init() throws AnalyzerException {
39+
@BeforeEach
40+
public void init(TestInfo testInfo) throws AnalyzerException {
41+
try {
42+
if (testInfo.getTestMethod().get().getName().equals("testContainsText"))
43+
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
44+
} catch (Exception e) {
45+
e.printStackTrace();
46+
}
4247
ANALYZER = Analyzer.getInstance("/magic.xml");
4348
}
4449

@@ -177,6 +182,10 @@ void testContainsText(final String filePath, final boolean expected, final Strin
177182
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
178183
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
179184
assertEquals(totalTextLength, sum);
185+
if (!language.equals("null")) {
186+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
187+
}
188+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
180189
}
181190
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
182191
}
Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)