Skip to content

Commit 258160c

Browse files
committed
add language detection
1 parent 187d261 commit 258160c

4 files changed

Lines changed: 87 additions & 25 deletions

File tree

pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@
7373
<sonar.projectKey>levigo_filetype-analyzer</sonar.projectKey>
7474
<sonar.organization>levigo</sonar.organization>
7575
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
76+
77+
<lingua.version>1.2.2</lingua.version>
7678
</properties>
7779

7880
<build>
@@ -373,6 +375,13 @@
373375
<version>${opennlp-tools.version}</version>
374376
<scope>compile</scope>
375377
</dependency>
378+
379+
<!-- Language recognition -->
380+
<dependency>
381+
<groupId>com.github.pemistahl</groupId>
382+
<artifactId>lingua</artifactId>
383+
<version>${lingua.version}</version>
384+
</dependency>
376385
</dependencies>
377386

378387
<distributionManagement>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import javax.xml.transform.stream.StreamResult;
1616
import javax.xml.transform.stream.StreamSource;
1717

18+
import com.github.pemistahl.lingua.api.LanguageDetector;
19+
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
1820
import org.apache.pdfbox.pdmodel.PDDocument;
1921
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
2022
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@@ -68,6 +70,22 @@ public class PDFMatcher extends Matcher {
6870
public static final String TEXT_LENGTH_KEY = "text-length";
6971
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";
7072

73+
/**
74+
* Most likely language of the text of the PDF, analyzed with <a href="https://github.com/pemistahl/lingua">lingua</a>
75+
*/
76+
public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
77+
/**
78+
* All possible languages of the PDF's text, sorted by their confidence value, analyzed with <a href="https://github.com/pemistahl/lingua">lingua</a>
79+
*/
80+
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
81+
82+
83+
private final static boolean languageCheck; // whether addTextInfo also runs language detection; assigned once in the static initializer
84+
85+
static { // opt-in switch: true only if the system property "<PDFMatcher class name>.languageCheck" equals "true" (case-insensitive); defaults to "false"
86+
languageCheck = "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false"));
87+
}
88+
7189
/*
7290
* (non-Javadoc)
7391
*
@@ -247,10 +265,12 @@ private static PDEmbeddedFile getEmbeddedFile(final PDComplexFileSpecification f
247265
* <li>{@link #CONTAINS_TEXT_KEY} whether the whole document contains any text (without line breaks)</li>
248266
* <li>{@link #TEXT_LENGTH_PER_PAGE_KEY} list of integers that indicate how long the text in each page is (only set if there is text at all)</li>
249267
* <li>{@link #TEXT_LENGTH_KEY} length of the text of the whole document (only set if there is text at all)</li>
268+
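* <li>{@link #MOST_LIKELY_TEXT_LANGUAGE} detected language (only set if there is text at all and the system property {@code org.jadice.filetype.matchers.PDFMatcher.languageCheck} is {@code true})</li>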
269+
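* <li>{@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} map of all possible languages, sorted by their confidence value (only set if there is text at all and the system property {@code org.jadice.filetype.matchers.PDFMatcher.languageCheck} is {@code true})</li>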
250270
* </ul>
251271
*
252272
* @param pdfDetails map to which the results get added
253-
* @param doc document
273+
* @param doc document
254274
*/
255275
private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDocument doc) throws IOException {
256276
boolean containsText = false;
@@ -270,9 +290,33 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
270290
final String pdfText = new PDFTextStripper().getText(doc);
271291
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
272292
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
293+
if (languageCheck)
294+
addLanguageInformation(pdfDetails, pdfText);
273295
}
274296
}
275297

298+
/**
299+
* Detects the language of the given text with lingua and stores the results in the given map: the confidence values of all candidate languages under {@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} and the string representation of the most likely language under {@link #MOST_LIKELY_TEXT_LANGUAGE}.
300+
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
301+
* language detection is not reliably possible.
302+
* A new detector over all languages is built on every call; low accuracy mode is enabled for texts longer than 120 characters.
303+
* @param pdfDetails map to which the results get added
304+
* @param text text to analyze, must not be {@code null}
305+
*/
306+
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
307+
LanguageDetectorBuilder languageDetectorBuilder = // built from scratch on every invocation
308+
LanguageDetectorBuilder
309+
.fromAllLanguages()
310+
.withMinimumRelativeDistance(0.1); // NOTE(review): presumably the confidence gap required before a language is reported instead of UNKNOWN - confirm against the lingua docs
311+
if (text.length() > 120) // only longer texts switch to low accuracy mode
312+
languageDetectorBuilder.withLowAccuracyMode();
313+
final LanguageDetector languageDetector = languageDetectorBuilder.build();
314+
final long startTime = System.currentTimeMillis(); // timer starts after the detector is built, so the logged duration covers only the two detection calls below
315+
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
316+
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
317+
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
318+
}
319+
276320
/**
277321
* Reads the whole stream to determine the length of it.
278322
*

src/test/java/TestPDFMatcher.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@
1919
import org.jadice.filetype.database.MimeTypeAction;
2020
import org.jadice.filetype.matchers.PDFMatcher;
2121
import org.jadice.filetype.pdfutil.SignatureUtil;
22-
import org.junit.jupiter.api.BeforeAll;
23-
import org.junit.jupiter.api.Test;
22+
import org.junit.jupiter.api.*;
2423
import org.junit.jupiter.params.ParameterizedTest;
2524
import org.junit.jupiter.params.provider.CsvFileSource;
2625

@@ -37,8 +36,14 @@ class TestPDFMatcher {
3736

3837
private static Analyzer ANALYZER;
3938

40-
@BeforeAll
41-
public static void init() throws AnalyzerException {
39+
@BeforeEach
40+
public void init(TestInfo testInfo) throws AnalyzerException {
41+
try {
42+
if (testInfo.getTestMethod().get().getName().equals("testContainsText"))
43+
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
44+
} catch (Exception e) {
45+
e.printStackTrace();
46+
}
4247
ANALYZER = Analyzer.getInstance("/magic.xml");
4348
}
4449

@@ -158,7 +163,7 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount) th
158163
@SuppressWarnings("unchecked")
159164
@ParameterizedTest
160165
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
161-
void testContainsText(final String filePath, final boolean expected) throws IOException {
166+
void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
162167
Map<String, Object> result = ANALYZER.analyze(new File(filePath));
163168
assertNotNull(result);
164169
assertTrue(result.containsKey(PDFMatcher.DETAILS_KEY));
@@ -173,6 +178,10 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
173178
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
174179
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
175180
assertEquals(totalTextLength, sum);
181+
if (!language.equals("null")) {
182+
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
183+
}
184+
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
176185
}
177186
}
178187

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
1+
path,contains-text,language(ignored if null)
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN

0 commit comments

Comments
 (0)