Skip to content

Commit 75205d5

Browse files
committed
feat(JS-2477): recognize x-rechnung PDF
1 parent ab6bde8 commit 75205d5

7 files changed

Lines changed: 659 additions & 59 deletions

File tree

src/main/java/org/jadice/filetype/matchers/PDFMatcher.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
package org.jadice.filetype.matchers;
22

3+
import static org.jadice.filetype.matchers.XMLMatcher.X_RECHNUNG_KEY;
4+
35
import java.io.IOException;
46
import java.io.InputStream;
57
import java.io.StringWriter;
8+
import java.nio.charset.StandardCharsets;
69
import java.util.*;
710
import java.util.Map.Entry;
811

@@ -32,6 +35,7 @@
3235
import org.apache.pdfbox.text.PDFTextStripper;
3336
import org.jadice.filetype.Context;
3437
import org.jadice.filetype.database.MimeTypeAction;
38+
import org.jadice.filetype.io.MemoryInputStream;
3539
import org.jadice.filetype.io.SeekableInputStream;
3640
import org.jadice.filetype.pdfutil.PDFBoxSignatureUtil;
3741
import org.slf4j.Logger;
@@ -85,7 +89,7 @@ public boolean matches(final Context context) {
8589
try (PDDocument document = PDDocument.load(sis)) {
8690
context.setProperty(MimeTypeAction.KEY, PDF_MIME_TYPE);
8791

88-
Map<String, Object> pdfDetails = new HashMap<String, Object>();
92+
Map<String, Object> pdfDetails = new HashMap<>();
8993
context.setProperty(DETAILS_KEY, pdfDetails);
9094

9195
pdfDetails.put(NUMBER_OF_PAGES_KEY, Integer.valueOf(document.getNumberOfPages()));
@@ -101,6 +105,7 @@ public boolean matches(final Context context) {
101105
PDMetadata meta = catalog.getMetadata();
102106
if (null != meta) {
103107
provideXMPMetadata(pdfDetails, meta);
108+
checkIfXRechnung(pdfDetails);
104109
}
105110

106111
PDEncryption encryption = document.getEncryption();
@@ -273,6 +278,29 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
273278
}
274279
}
275280

281+
/**
282+
* Checks if the PDF is an electronic invoice.
283+
*
284+
* @param pdfDetails the map of PDF details with the metadata XML
285+
*/
286+
private static void checkIfXRechnung(final Map<String, Object> pdfDetails) {
287+
final Object metadata = pdfDetails.get(METADATA_KEY);
288+
if (metadata instanceof String) {
289+
try {
290+
final XMLMatcher xmlMatcher = new XMLMatcher();
291+
final Context xmlContext = new Context(
292+
new MemoryInputStream(((String) metadata).getBytes(StandardCharsets.UTF_8)),
293+
new HashMap<>(), null, Locale.ENGLISH, "");
294+
final boolean isXRechnung = xmlMatcher.matches(xmlContext);
295+
if (isXRechnung) {
296+
pdfDetails.put(X_RECHNUNG_KEY, true);
297+
}
298+
} catch (IOException e) {
299+
LOGGER.error("Failed to parse metadata XML", e);
300+
}
301+
}
302+
}
303+
276304
/**
277305
* Reads the whole stream to determine the length of it.
278306
*
Lines changed: 118 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,118 @@
1-
import static org.junit.jupiter.api.Assertions.assertEquals;
2-
import static org.junit.jupiter.api.Assertions.assertNotNull;
3-
4-
import java.io.File;
5-
import java.io.IOException;
6-
import java.util.Map;
7-
8-
import org.jadice.filetype.Analyzer;
9-
import org.jadice.filetype.AnalyzerException;
10-
import org.jadice.filetype.database.DescriptionAction;
11-
import org.jadice.filetype.database.ExtensionAction;
12-
import org.jadice.filetype.database.MimeTypeAction;
13-
import org.jadice.filetype.io.MemoryInputStream;
14-
import org.junit.jupiter.api.BeforeAll;
15-
import org.junit.jupiter.api.Test;
16-
import org.slf4j.Logger;
17-
import org.slf4j.LoggerFactory;
18-
19-
class TestVariousTypes {
20-
21-
private static final Logger LOGGER = LoggerFactory.getLogger(TestVariousTypes.class);
22-
23-
private static Analyzer analyzer;
24-
25-
@BeforeAll
26-
public static void createAnalyzer() throws AnalyzerException {
27-
analyzer = Analyzer.getInstance("/magic.xml");
28-
}
29-
30-
@Test
31-
void testVariousTypes() throws IOException {
32-
final File[] files = new File("src/test/resources/various_types").listFiles(
33-
pathname -> pathname.isFile() && pathname.canRead());
34-
assert files != null;
35-
36-
for (final File file : files) {
37-
LOGGER.info("File: " + file);
38-
final Map<String, Object> results = analyzer.analyze(file);
39-
assertNotNull(results, file + " could not be analyzed");
40-
assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing for " + file);
41-
assertNotNull(results.get(DescriptionAction.KEY), "description missing for" + file);
42-
// extension can be null
43-
// assertNotNull(results.get(ExtensionAction.KEY), file + " could not be analyzed");
44-
for (final Map.Entry<String, Object> e : results.entrySet())
45-
LOGGER.info(" " + e.getKey() + "=" + e.getValue());
46-
LOGGER.info("\n-------------------");
47-
}
48-
}
49-
50-
@Test
51-
void testEmptyStream() throws Exception {
52-
Map<String, Object> results = analyzer.analyze(new MemoryInputStream(new byte[0]));
53-
assertNotNull(results, "empty stream could not be analyzed");
54-
assertEquals("text/plain", results.get(MimeTypeAction.KEY));
55-
assertEquals("txt", results.get(ExtensionAction.KEY));
56-
assertEquals("Binary data, ASCII Text Document", results.get(DescriptionAction.KEY));
57-
}
58-
}
1+
import static org.junit.jupiter.api.Assertions.assertEquals;
2+
import static org.junit.jupiter.api.Assertions.assertNotNull;
3+
import static org.junit.jupiter.api.Assertions.assertTrue;
4+
import static org.junit.jupiter.api.Assertions.fail;
5+
import static org.junit.jupiter.params.provider.Arguments.arguments;
6+
7+
import java.io.File;
8+
import java.io.IOException;
9+
import java.net.URL;
10+
import java.util.Map;
11+
import java.util.stream.Stream;
12+
13+
import org.jadice.filetype.Analyzer;
14+
import org.jadice.filetype.AnalyzerException;
15+
import org.jadice.filetype.database.DescriptionAction;
16+
import org.jadice.filetype.database.ExtensionAction;
17+
import org.jadice.filetype.database.MimeTypeAction;
18+
import org.jadice.filetype.io.MemoryInputStream;
19+
import org.jadice.filetype.matchers.PDFMatcher;
20+
import org.jadice.filetype.matchers.XMLMatcher;
21+
import org.junit.jupiter.api.BeforeAll;
22+
import org.junit.jupiter.api.Test;
23+
import org.junit.jupiter.params.ParameterizedTest;
24+
import org.junit.jupiter.params.provider.Arguments;
25+
import org.junit.jupiter.params.provider.MethodSource;
26+
import org.slf4j.Logger;
27+
import org.slf4j.LoggerFactory;
28+
29+
class TestVariousTypes {
30+
31+
private static final Logger LOGGER = LoggerFactory.getLogger(TestVariousTypes.class);
32+
33+
private static Analyzer analyzer;
34+
35+
@BeforeAll
36+
public static void createAnalyzer() throws AnalyzerException {
37+
analyzer = Analyzer.getInstance("/magic.xml");
38+
}
39+
40+
@Test
41+
void testVariousTypes() throws IOException {
42+
final File[] files = new File("src/test/resources/various_types").listFiles(
43+
pathname -> pathname.isFile() && pathname.canRead());
44+
assert files != null;
45+
46+
for (final File file : files) {
47+
LOGGER.info("File: " + file);
48+
final Map<String, Object> results = analyzer.analyze(file);
49+
assertNotNull(results, file + " could not be analyzed");
50+
assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing for " + file);
51+
assertNotNull(results.get(DescriptionAction.KEY), "description missing for" + file);
52+
// extension can be null
53+
// assertNotNull(results.get(ExtensionAction.KEY), file + " could not be analyzed");
54+
printResult(results);
55+
}
56+
}
57+
58+
@Test
59+
void testEmptyStream() throws Exception {
60+
Map<String, Object> results = analyzer.analyze(new MemoryInputStream(new byte[0]));
61+
assertNotNull(results, "empty stream could not be analyzed");
62+
assertEquals("text/plain", results.get(MimeTypeAction.KEY));
63+
assertEquals("txt", results.get(ExtensionAction.KEY));
64+
assertEquals("Binary data, ASCII Text Document", results.get(DescriptionAction.KEY));
65+
}
66+
67+
public static Stream<Arguments> dataProvider() {
68+
return Stream.of(
69+
arguments("/various_types/BASIC_Einfach.pdf", "application/pdf"),
70+
arguments("/various_types/EN16931_Einfach.pdf", "application/pdf"),
71+
arguments("/various_types/EN16931_Einfach.cii.xml", "application/xml;charset=UTF-8;x-rechnung=true"),
72+
arguments("/various_types/EN16931_Einfach.ubl.xml", "application/xml;charset=UTF-8;x-rechnung=true"),
73+
arguments("/various_types/ZUGFeRD-invoice_rabatte_3_abschlag_duepayableamount.xml", "application/xml;charset=UTF-8;x-rechnung=true")
74+
);
75+
}
76+
77+
@ParameterizedTest
78+
@MethodSource("dataProvider")
79+
void testXRechnung(String resource, String expectedMimeType) throws Exception {
80+
final URL url = getClass().getResource(resource);
81+
assertNotNull(url);
82+
final File file = new File(url.toURI());
83+
final Map<String, Object> results = analyzer.analyze(file);
84+
assertNotNull(results, file + " could not be analyzed");
85+
assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing");
86+
assertEquals(expectedMimeType, results.get(MimeTypeAction.KEY), "wrong mimeType");
87+
assertNotNull(results.get(DescriptionAction.KEY), "description missing");
88+
assertNotNull(results.get(ExtensionAction.KEY), "could not be analyzed");
89+
checkForDetails(results);
90+
printResult(results);
91+
}
92+
93+
private void checkForDetails(final Map<String, Object> results) {
94+
final String mimeType = (String)results.get(MimeTypeAction.KEY);
95+
switch (mimeType) {
96+
case "application/pdf": ensureXRechnungIsTrue(results, PDFMatcher.DETAILS_KEY); break;
97+
case "application/xml;charset=UTF-8;x-rechnung=true": ensureXRechnungIsTrue(results, XMLMatcher.DETAILS_KEY); break;
98+
default: fail("unexpected mime type");
99+
}
100+
}
101+
102+
@SuppressWarnings("unchecked")
103+
private void ensureXRechnungIsTrue(final Map<String, Object> results, final String detailsKey) {
104+
final Object details = results.get(detailsKey);
105+
assertNotNull(details, "details are missing");
106+
final Map<String, Object> detailsMap = (Map<String, Object>) details;
107+
final boolean isXRechnung = (Boolean)detailsMap.get(XMLMatcher.X_RECHNUNG_KEY);
108+
assertTrue(isXRechnung, "x_rechnung should be true");
109+
}
110+
111+
112+
private static void printResult(final Map<String, Object> results) {
113+
for (final Map.Entry<String, Object> e : results.entrySet()) {
114+
LOGGER.info(" {}={}", e.getKey(), e.getValue());
115+
}
116+
LOGGER.info("\n-------------------");
117+
}
118+
}
142 KB
Binary file not shown.

0 commit comments

Comments
 (0)