Skip to content

Commit 4ef0a68

Browse files
committed
undo language recognition to move to another branch
1 parent f3ba2ba commit 4ef0a68

4 files changed

Lines changed: 19 additions & 57 deletions

File tree

pom.xml

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -320,13 +320,6 @@
320320
<version>${pdfbox.version}</version>
321321
</dependency>
322322

323-
<!-- Language recognition -->
324-
<dependency>
325-
<groupId>com.github.pemistahl</groupId>
326-
<artifactId>lingua</artifactId>
327-
<version>1.2.2</version>
328-
</dependency>
329-
330323
<!-- Using Hamcrest in a Maven Project see http://hamcrest.org/JavaHamcrest/distributables#maven-upgrade-example -->
331324
<dependency>
332325
<groupId>org.hamcrest</groupId>

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
import javax.xml.transform.stream.StreamResult;
1616
import javax.xml.transform.stream.StreamSource;
1717

18-
import com.github.pemistahl.lingua.api.LanguageDetector;
19-
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2018
import org.apache.pdfbox.pdmodel.PDDocument;
2119
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
2220
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@@ -262,8 +260,6 @@ private static PDEmbeddedFile getEmbeddedFile(final PDComplexFileSpecification f
262260
* <li>{@link #CONTAINS_TEXT_KEY} whether the whole document contains any text (without line breaks)</li>
263261
* <li>{@link #TEXT_LENGTH_PER_PAGE_KEY} list of integers that indicate how long the text in each page is (only set if there is text at all)</li>
264262
* <li>{@link #TEXT_LENGTH_KEY} length of the text of the whole document (only set if there is text at all)</li>
265-
* <li>{@link #MOST_LIKELY_TEXT_LANGUAGE} detected language (only set if there is text at all)</li>
266-
* <li>{@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} map of all possible languages, sorted by their confidence value (only set if there is text at all)</li>
267263
* </ul>
268264
*
269265
* @param pdfDetails map to which the results get added
@@ -287,32 +283,9 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
287283
final String pdfText = new PDFTextStripper().getText(doc);
288284
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
289285
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
290-
addLanguageInformation(pdfDetails, pdfText);
291286
}
292287
}
293288

294-
/**
295-
* Adds information about the given text to the given map.
296-
* The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
297-
* language detection is not reliably possible.
298-
*
299-
* @param pdfDetails map to which the results get added
300-
* @param text text to analyze
301-
*/
302-
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
303-
LanguageDetectorBuilder languageDetectorBuilder =
304-
LanguageDetectorBuilder
305-
.fromAllLanguages()
306-
.withMinimumRelativeDistance(0.1);
307-
if (text.length() > 120)
308-
languageDetectorBuilder.withLowAccuracyMode();
309-
final LanguageDetector languageDetector = languageDetectorBuilder.build();
310-
final long startTime = System.currentTimeMillis();
311-
pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
312-
pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
313-
LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime);
314-
}
315-
316289
/**
317290
* Reads the whole stream to determine the length of it.
318291
*

src/test/java/TestPDFMatcher.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,6 @@ void testContainsText(final String filePath, final boolean expected, final Strin
174174
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
175175
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
176176
assertEquals(totalTextLength, sum);
177-
if (!language.equals("null")) {
178-
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
179-
}
180-
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
181177
}
182178
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
183179
}
Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
path,contains-text,language(ignored if null)
2-
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
3-
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
4-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
5-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
6-
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
7-
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
8-
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
9-
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
10-
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
11-
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
12-
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
13-
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
14-
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
15-
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
16-
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
17-
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
18-
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
19-
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN
1+
path,contains-text
2+
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
3+
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
4+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
5+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
6+
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
7+
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
8+
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
9+
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
10+
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
11+
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
12+
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
13+
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
14+
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
15+
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
16+
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
17+
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
18+
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
19+
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true

0 commit comments

Comments (0)