Skip to content

Commit 4d11dd4

Browse files
committed
add language detection
1 parent 4ef0a68 commit 4d11dd4

4 files changed

Lines changed: 74 additions & 23 deletions

File tree

pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@
7373
<sonar.projectKey>levigo_filetype-analyzer</sonar.projectKey>
7474
<sonar.organization>levigo</sonar.organization>
7575
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
76+
77+
<lingua.version>1.2.2</lingua.version>
7678
</properties>
7779

7880
<build>
@@ -373,6 +375,13 @@
373375
<version>${opennlp-tools.version}</version>
374376
<scope>compile</scope>
375377
</dependency>
378+
379+
<!-- Language recognition -->
380+
<dependency>
381+
<groupId>com.github.pemistahl</groupId>
382+
<artifactId>lingua</artifactId>
383+
<version>${lingua.version}</version>
384+
</dependency>
376385
</dependencies>
377386

378387
<distributionManagement>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import javax.xml.transform.stream.StreamResult;
1616
import javax.xml.transform.stream.StreamSource;
1717

18+
import com.github.pemistahl.lingua.api.LanguageDetector;
19+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
1820
import org.apache.pdfbox.pdmodel.PDDocument;
1921
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
2022
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@@ -77,6 +79,13 @@ public class PDFMatcher extends Matcher {
7779
*/
7880
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
7981

82+
83+
private final static boolean languageCheck;
84+
85+
static {
86+
languageCheck = "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false"));
87+
}
88+
8089
private static boolean lookForText() {
8190
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false"));
8291
}
@@ -283,9 +292,33 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
283292
final String pdfText = new PDFTextStripper().getText(doc);
284293
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
285294
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
295+
if (languageCheck)
296+
addLanguageInformation(pdfDetails, pdfText);
286297
}
287298
}
288299

300+
/**
301+
* Adds information about the given text to the given map.
302+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
303+
* language detection is not reliably possible.
304+
*
305+
* @param pdfDetails map to which the results get added
306+
* @param text text to analyze
307+
*/
308+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
309+
LanguageDetectorBuilder languageDetectorBuilder =
310+
LanguageDetectorBuilder
311+
.fromAllLanguages()
312+
.withMinimumRelativeDistance(0.1);
313+
if (text.length() > 120)
314+
languageDetectorBuilder.withLowAccuracyMode();
315+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
316+
final long startTime = System.currentTimeMillis();
317+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
318+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
319+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
320+
}
321+
289322
/**
290323
* Reads the whole stream to determine the length of it.
291324
*

src/test/java/TestPDFMatcher.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@
1919
import org.jadice.filetype.database.MimeTypeAction;
2020
import org.jadice.filetype.matchers.PDFMatcher;
2121
import org.jadice.filetype.pdfutil.SignatureUtil;
22-
import org.junit.jupiter.api.BeforeAll;
23-
import org.junit.jupiter.api.Test;
22+
import org.junit.jupiter.api.*;
2423
import org.junit.jupiter.params.ParameterizedTest;
2524
import org.junit.jupiter.params.provider.CsvFileSource;
2625

@@ -37,8 +36,14 @@ class TestPDFMatcher {
3736

3837
private static Analyzer ANALYZER;
3938

40-
@BeforeAll
41-
public static void init() throws AnalyzerException {
39+
@BeforeEach
40+
public void init(TestInfo testInfo) throws AnalyzerException {
41+
try {
42+
if (testInfo.getTestMethod().get().getName().equals("testContainsText"))
43+
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
44+
} catch (Exception e) {
45+
e.printStackTrace();
46+
}
4247
ANALYZER = Analyzer.getInstance("/magic.xml");
4348
}
4449

@@ -174,6 +179,10 @@ void testContainsText(final String filePath, final boolean expected, final Strin
174179
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
175180
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
176181
assertEquals(totalTextLength, sum);
182+
if (!language.equals("null")) {
183+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
184+
}
185+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
177186
}
178187
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
179188
}
Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)