Skip to content

Commit 76ebf14

Browse files
committed
add language detection
1 parent 3d4a33d commit 76ebf14

4 files changed

Lines changed: 84 additions & 24 deletions

File tree

pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@
7373
<sonar.projectKey>levigo_filetype-analyzer</sonar.projectKey>
7474
<sonar.organization>levigo</sonar.organization>
7575
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
76+
77+
<lingua.version>1.2.2</lingua.version>
7678
</properties>
7779

7880
<build>
@@ -373,6 +375,13 @@
373375
<version>${opennlp-tools.version}</version>
374376
<scope>compile</scope>
375377
</dependency>
378+
379+
<!-- Language recognition -->
380+
<dependency>
381+
<groupId>com.github.pemistahl</groupId>
382+
<artifactId>lingua</artifactId>
383+
<version>${lingua.version}</version>
384+
</dependency>
376385
</dependencies>
377386

378387
<distributionManagement>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import javax.xml.transform.stream.StreamResult;
1616
import javax.xml.transform.stream.StreamSource;
1717

18+
import com.github.pemistahl.lingua.api.LanguageDetector;
19+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
1820
import org.apache.pdfbox.pdmodel.PDDocument;
1921
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
2022
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@@ -68,6 +70,22 @@ public class PDFMatcher extends Matcher {
6870
public static final String TEXT_LENGTH_KEY = "text-length";
6971
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";
7072

73+
/**
74+
* Key for the most likely language of the PDF's text, as detected by <a href="https://github.com/pemistahl/lingua">lingua</a>.
75+
*/
76+
public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
77+
/**
78+
* Key for all possible languages of the PDF's text together with their confidence values, sorted by confidence, as computed by <a href="https://github.com/pemistahl/lingua">lingua</a>.
79+
*/
80+
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
81+
82+
83+
private final static boolean languageCheck;
84+
85+
static {
86+
languageCheck = "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false"));
87+
}
88+
7189
private static boolean lookForText() {
7290
return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false"));
7391
}
@@ -274,9 +292,33 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
274292
final String pdfText = new PDFTextStripper().getText(doc);
275293
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
276294
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
295+
if (languageCheck)
296+
addLanguageInformation(pdfDetails, pdfText);
277297
}
278298
}
279299

300+
/**
301+
* Adds information about the given text to the given map.
302+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
303+
* language detection is not reliably possible.
304+
*
305+
* @param pdfDetails map to which the results get added
306+
* @param text text to analyze
307+
*/
308+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
309+
LanguageDetectorBuilder languageDetectorBuilder =
310+
LanguageDetectorBuilder
311+
.fromAllLanguages()
312+
.withMinimumRelativeDistance(0.1);
313+
if (text.length() > 120)
314+
languageDetectorBuilder.withLowAccuracyMode();
315+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
316+
final long startTime = System.currentTimeMillis();
317+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
318+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
319+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
320+
}
321+
280322
/**
281323
* Reads the whole stream to determine the length of it.
282324
*

src/test/java/TestPDFMatcher.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@
1919
import org.jadice.filetype.database.MimeTypeAction;
2020
import org.jadice.filetype.matchers.PDFMatcher;
2121
import org.jadice.filetype.pdfutil.SignatureUtil;
22-
import org.junit.jupiter.api.BeforeAll;
23-
import org.junit.jupiter.api.Test;
22+
import org.junit.jupiter.api.*;
2423
import org.junit.jupiter.params.ParameterizedTest;
2524
import org.junit.jupiter.params.provider.CsvFileSource;
2625

@@ -37,8 +36,14 @@ class TestPDFMatcher {
3736

3837
private static Analyzer ANALYZER;
3938

40-
@BeforeAll
41-
public static void init() throws AnalyzerException {
39+
@BeforeEach
40+
public void init(TestInfo testInfo) throws AnalyzerException {
41+
try {
42+
if (testInfo.getTestMethod().get().getName().equals("testContainsText"))
43+
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
44+
} catch (Exception e) {
45+
e.printStackTrace();
46+
}
4247
ANALYZER = Analyzer.getInstance("/magic.xml");
4348
}
4449

@@ -158,7 +163,7 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount) th
158163
@SuppressWarnings("unchecked")
159164
@ParameterizedTest
160165
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
161-
void testContainsText(final String filePath, final boolean expected) throws IOException {
166+
void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
162167
System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true");
163168
Map<String, Object> result = ANALYZER.analyze(new File(filePath));
164169
assertNotNull(result);
@@ -174,6 +179,10 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
174179
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
175180
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
176181
assertEquals(totalTextLength, sum);
182+
if (!language.equals("null")) {
183+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
184+
}
185+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
177186
}
178187
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
179188
}
Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)