1+ <!--
2+ Copyright 2016 DLR - German Aerospace Center
3+
4+ Licensed under the Apache License, Version 2.0 (the "License");
5+ you may not use this file except in compliance with the License.
6+ You may obtain a copy of the License at
7+
8+ http://www.apache.org/licenses/LICENSE-2.0
9+
10+ Unless required by applicable law or agreed to in writing, software
11+ distributed under the License is distributed on an "AS IS" BASIS,
12+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+ See the License for the specific language governing permissions and
14+ limitations under the License.
15+ -->
16+ <dataConfig >
17+ <script ><![CDATA[
/**
 * Script transformer: expands the comma-separated text columns of a row
 * into list columns.
 *
 *   keywordsText   -> keywords
 *   categoriesText -> categories
 *
 * Each source value is split on ',' and every piece is trimmed. Blank pieces
 * (for example from "a,,b" or from an empty source value) are skipped, so no
 * empty entry ends up in the target column. A source column that is missing,
 * or present with a null value, leaves its target column untouched.
 *
 * @param row the current row; only containsKey/get/put are used on it
 * @return the same row object, with the list columns (re)written where a
 *         source value was available
 */
function split(row) {
  // Shared implementation for both columns; closes over `row`.
  function expand(sourceColumn, targetColumn) {
    if (!row.containsKey(sourceColumn)) {
      return;
    }
    var raw = row.get(sourceColumn);
    if (raw == null) {
      // Key is present but carries no value: nothing to split.
      return;
    }
    var pieces = raw.split(',');
    var values = new java.util.ArrayList();
    for (var i = 0; i < pieces.length; i++) {
      var token = pieces[i].trim();
      // Coerce before comparing so the emptiness check also holds when the
      // script engine hands back a Java string object instead of a script string.
      if (String(token) !== "") {
        values.add(token);
      }
    }
    row.put(targetColumn, values);
  }

  expand("keywordsText", "keywords");
  expand("categoriesText", "categories");
  return row;
}
39+ ]]> </script >
<!-- Default (unnamed) data source for the entities below. The type value must
     not carry surrounding whitespace: it is used as a class name. -->
<dataSource type="FileDataSource"/>
<document>
  <!-- The same transformers need to be defined for all entities. -->
  <!-- Transformers are executed in the order in which they are listed.
       The order of the fields is not important. -->

  <!-- Outer entity: lists the metadata files below baseDir whose name matches
       the fileName pattern. It is not a root entity, so it produces no
       documents itself; the nested entities do. -->
  <entity name="file" processor="FileListEntityProcessor"
          baseDir="path/to/metadata/files"
          fileName=".*xml" rootEntity="false" dataSource="null"
          transformer="de.dlr.knowledgefinder.dataimport.utils.transformer.FilePathTransformer">

    <!-- Custom attributes read by FilePathTransformer.
         NOTE(review): presumably derives the path of the .pdf document from
         the metadata file path by swapping prefix and suffix; confirm
         against the transformer implementation. -->
    <field column="filePath"
           filePrefix="path/to/document/files"
           fileSuffix=".pdf" oldFileSuffix=".xml" srcColName="file"/>

    <!-- import file content -->
    <entity name="metadataImport" processor="XPathEntityProcessor"
            forEach="/documents/document" url="${file.fileAbsolutePath}"
            transformer="TemplateTransformer,
                         script:split,
                         DateFormatTransformer,
                         de.dlr.knowledgefinder.dataimport.utils.transformer.CategoriesSeparatedTransformer,
                         de.dlr.knowledgefinder.dataimport.utils.transformer.ArrayToStringTransformer">

      <field column="id" xpath="/documents/document/id"/>
      <field column="title" xpath="/documents/document/title"/>
      <!-- FormatingDictTransformer -->
      <field column="authors" xpath="/documents/document/authors"/>
      <field column="description" xpath="/documents/document/description"/>
      <field column="externalLink" xpath="/documents/document/externalLink"/>
      <field column="publisher" xpath="/documents/document/publisher"/>
      <!-- categories/keywords are read from the same xpath as their *Text
           counterparts; the script:split transformer then rewrites them as
           lists built from the comma-separated *Text values. -->
      <field column="categories" xpath="/documents/document/categoriesText"/>
      <field column="categoriesText" xpath="/documents/document/categoriesText"/>
      <field column="keywords" xpath="/documents/document/keywordsText"/>
      <field column="keywordsText" xpath="/documents/document/keywordsText"/>
      <field column="license" xpath="/documents/document/license"/>
      <field column="publishDate" xpath="/documents/document/publishDate"
             dateTimeFormat="MMMM dd, yyyy" locale="en"/>

      <!-- Second declaration of the categories column: carries the custom
           attributes for the project's category transformers. -->
      <field column="categories" categories="import/categories.json"
             categories_split_prefix="category_" concatArrayFromSource="contentCategories"/>

      <!-- Extracts the text of the document file located via ${file.filePath}.
           NOTE(review): no data source named "files" is declared in this
           file; confirm that it is defined elsewhere or that the default
           data source is the intended one. -->
      <entity name="documentImport" processor="CustomTikaEntityProcessor"
              url="${file.filePath}" format="text" dataSource="files">
        <field column="text" name="content"/>
      </entity>
    </entity>
  </entity>
</document>
91+ </dataConfig >
0 commit comments