Skip to content

Commit b5bb556

Browse files
author
Manuel Schaaf
committed
feat(abbyy): Updated type description
1 parent 5385180 commit b5bb556

1 file changed

Lines changed: 62 additions & 36 deletions

File tree

src/main/resources/desc/type/AbbyyFineReaderTypeSystem.xml

Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,38 @@
99
<types>
1010
<typeDescription>
1111
<name>org.texttechnologylab.annotation.ocr.abbyy.StructuralElement</name>
12-
<description/>
12+
<description>Meta-Type for structural elements with pixel offsets in the ABBYY FineReader schema.</description>
1313
<supertypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div</supertypeName>
1414

1515
<features>
1616
<featureDescription>
1717
<name>top</name>
18-
<description/>
18+
<description>Coordinate of the top border, in pixels.</description>
1919
<rangeTypeName>uima.cas.Integer</rangeTypeName>
2020
</featureDescription>
2121

2222
<featureDescription>
2323
<name>bottom</name>
24-
<description/>
24+
<description>Coordinate of the bottom border, in pixels.</description>
2525
<rangeTypeName>uima.cas.Integer</rangeTypeName>
2626
</featureDescription>
2727

2828
<featureDescription>
2929
<name>left</name>
30-
<description/>
30+
<description>Coordinate of the left border, in pixels.</description>
3131
<rangeTypeName>uima.cas.Integer</rangeTypeName>
3232
</featureDescription>
3333

3434
<featureDescription>
3535
<name>right</name>
36-
<description/>
36+
<description>Coordinate of the right border, in pixels.</description>
3737
<rangeTypeName>uima.cas.Integer</rangeTypeName>
3838
</featureDescription>
3939
</features>
4040
</typeDescription>
4141
<typeDescription>
4242
<name>org.texttechnologylab.annotation.ocr.abbyy.Block</name>
43-
<description/>
43+
<description>A recognized block.</description>
4444
<supertypeName>org.texttechnologylab.annotation.ocr.abbyy.StructuralElement</supertypeName>
4545

4646
<features>
@@ -59,7 +59,7 @@
5959
</typeDescription>
6060
<typeDescription>
6161
<name>org.texttechnologylab.annotation.ocr.abbyy.BlockType</name>
62-
<description/>
62+
<description>The type of a block. It can be one of the following values: Text, Table, Picture, Barcode, Separator, SeparatorsBox, Checkmark, GroupCheckmark</description>
6363
<supertypeName>uima.cas.String</supertypeName>
6464

6565
<allowedValues>
@@ -91,56 +91,61 @@
9191
</typeDescription>
9292
<typeDescription>
9393
<name>org.texttechnologylab.annotation.ocr.abbyy.Line</name>
94-
<description/>
94+
<description>Line of a paragraph.</description>
9595
<supertypeName>org.texttechnologylab.annotation.ocr.abbyy.StructuralElement</supertypeName>
9696

9797
<features>
9898
<featureDescription>
9999
<name>baseline</name>
100-
<description/>
100+
<description>The distance from the baseline to the top edge of the page, in pixels.</description>
101101
<rangeTypeName>uima.cas.Integer</rangeTypeName>
102102
</featureDescription>
103103

104104
<featureDescription>
105105
<name>format</name>
106-
<description/>
106+
<description>
107+
If present, this line denotes a group of characters with uniform formatting.
108+
</description>
107109
<rangeTypeName>org.texttechnologylab.annotation.ocr.abbyy.Format</rangeTypeName>
108110
</featureDescription>
109111
</features>
110112
</typeDescription>
111113
<typeDescription>
112114
<name>org.texttechnologylab.annotation.ocr.abbyy.Paragraph</name>
113-
<description/>
115+
<description>Paragraph of a recognized text.</description>
114116
<supertypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph</supertypeName>
115117

116118
<features>
117119
<featureDescription>
118120
<name>leftIndent</name>
119-
<description/>
121+
<description>The left paragraph indent (optional, default value is 0)</description>
120122
<rangeTypeName>uima.cas.Integer</rangeTypeName>
121123
</featureDescription>
122124

123125
<featureDescription>
124126
<name>rightIndent</name>
125-
<description/>
127+
<description>The right paragraph indent (optional, default value is 0)</description>
126128
<rangeTypeName>uima.cas.Integer</rangeTypeName>
127129
</featureDescription>
128130

129131
<featureDescription>
130132
<name>startIndent</name>
131-
<description/>
133+
<description>The indent of the first line of the paragraph (optional, default value is 0)</description>
132134
<rangeTypeName>uima.cas.Integer</rangeTypeName>
133135
</featureDescription>
134136

135137
<featureDescription>
136138
<name>lineSpacing</name>
137-
<description/>
139+
<description>The spacing between lines (optional, default value is 0)</description>
138140
<rangeTypeName>uima.cas.Integer</rangeTypeName>
139141
</featureDescription>
140142

141143
<featureDescription>
142144
<name>alignment</name>
143-
<description/>
145+
<description>
146+
The paragraph alignment (optional, default value is Left).
147+
It can be one of the following values: Left, Center, Right, Justified
148+
</description>
144149
<rangeTypeName>org.texttechnologylab.annotation.ocr.abbyy.ParagraphAlignment</rangeTypeName>
145150
</featureDescription>
146151
</features>
@@ -165,25 +170,25 @@
165170
</typeDescription>
166171
<typeDescription>
167172
<name>org.texttechnologylab.annotation.ocr.abbyy.Format</name>
168-
<description/>
173+
<description>Attributes of a line with uniform formatting.</description>
169174
<supertypeName>uima.tcas.Annotation</supertypeName>
170175

171176
<features>
172177
<featureDescription>
173178
<name>lang</name>
174-
<description/>
179+
<description>Name of the language.</description>
175180
<rangeTypeName>uima.cas.String</rangeTypeName>
176181
</featureDescription>
177182

178183
<featureDescription>
179184
<name>ff</name>
180-
<description/>
185+
<description>The name of the font.</description>
181186
<rangeTypeName>uima.cas.String</rangeTypeName>
182187
</featureDescription>
183188

184189
<featureDescription>
185190
<name>fs</name>
186-
<description/>
191+
<description>The size of the font.</description>
187192
<rangeTypeName>uima.cas.Float</rangeTypeName>
188193
</featureDescription>
189194

@@ -232,56 +237,69 @@
232237
</typeDescription>
233238
<typeDescription>
234239
<name>org.texttechnologylab.annotation.ocr.abbyy.Token</name>
235-
<description/>
240+
<description>Token type that denotes recognized words.</description>
236241
<supertypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token</supertypeName>
237242

238243
<features>
239244
<featureDescription>
240245
<name>subTokenList</name>
241-
<description/>
246+
<description>Present if a linebreak hyphen was recognized, contains the individual words.</description>
242247
<rangeTypeName>uima.cas.StringList</rangeTypeName>
243248
<multipleReferencesAllowed>false</multipleReferencesAllowed>
244249
</featureDescription>
245250

246251
<featureDescription>
247252
<name>isWordFromDictionary</name>
248-
<description/>
253+
<description>Specifies whether the word was found in the ABBYY FineReader dictionary.</description>
249254
<rangeTypeName>uima.cas.Boolean</rangeTypeName>
250255
</featureDescription>
251256

252257
<featureDescription>
253258
<name>isWordNormal</name>
254-
<description/>
259+
<description>
260+
Specifies whether the word was recognized with either a standard or user-defined language,
261+
and that it is not a number or an identifier.
262+
</description>
255263
<rangeTypeName>uima.cas.Boolean</rangeTypeName>
256264
</featureDescription>
257265

258266
<featureDescription>
259267
<name>isWordNumeric</name>
260-
<description/>
268+
<description>Specifies whether the word is a number.</description>
261269
<rangeTypeName>uima.cas.Boolean</rangeTypeName>
262270
</featureDescription>
263271

264272
<featureDescription>
265273
<name>containsHyphen</name>
266-
<description/>
274+
<description>Specifies if the word contains a recognized linebreak hyphen.</description>
267275
<rangeTypeName>uima.cas.Boolean</rangeTypeName>
268276
</featureDescription>
269277

270278
<featureDescription>
271279
<name>suspiciousChars</name>
272-
<description/>
280+
<description>The number of characters that were recognized uncertainly.</description>
273281
<rangeTypeName>uima.cas.Integer</rangeTypeName>
274282
</featureDescription>
275283

276284
<featureDescription>
277285
<name>minCharConfidence</name>
278-
<description/>
286+
<description>
287+
The minimum character recognition confidence of all characters in this word.
288+
Use with caution, as these numbers are not guaranteed to be positive and, according to the
289+
ABBYY FineReader documentation, the only meaningful use of confidence is to compare different
290+
recognition variants of the same character.
291+
</description>
279292
<rangeTypeName>uima.cas.Short</rangeTypeName>
280293
</featureDescription>
281294

282295
<featureDescription>
283296
<name>meanCharConfidence</name>
284-
<description/>
297+
<description>
298+
The average character recognition confidence of all characters in this word.
299+
Use with caution, as these numbers are not guaranteed to be positive and, according to the
300+
ABBYY FineReader documentation, the only meaningful use of confidence is to compare different
301+
recognition variants of the same character.
302+
</description>
285303
<rangeTypeName>uima.cas.Float</rangeTypeName>
286304
</featureDescription>
287305
</features>
@@ -308,27 +326,30 @@
308326
</featureDescription>
309327
<featureDescription>
310328
<name>uri</name>
311-
<description/>
329+
<description>URI of this page, i.e. a Visual Library link.</description>
312330
<rangeTypeName>uima.cas.String</rangeTypeName>
313331
</featureDescription>
314332
<featureDescription>
315333
<name>width</name>
316-
<description/>
334+
<description>The image width in pixels.</description>
317335
<rangeTypeName>uima.cas.Integer</rangeTypeName>
318336
</featureDescription>
319337
<featureDescription>
320338
<name>height</name>
321-
<description/>
339+
<description>The image height in pixels.</description>
322340
<rangeTypeName>uima.cas.Integer</rangeTypeName>
323341
</featureDescription>
324342
<featureDescription>
325343
<name>resolution</name>
326-
<description/>
344+
<description>The image resolution in pixels per inch.</description>
327345
<rangeTypeName>uima.cas.Integer</rangeTypeName>
328346
</featureDescription>
329347
<featureDescription>
330348
<name>rotation</name>
331-
<description/>
349+
<description>
350+
The type of rotation applied to the original page image before processing (optional).
351+
It can be one of the following values: Normal, RotatedClockwise, RotatedUpsideDown, RotatedCounterclockwise
352+
</description>
332353
<rangeTypeName>org.texttechnologylab.annotation.ocr.abbyy.Orientation</rangeTypeName>
333354
</featureDescription>
334355
</features>
@@ -345,38 +366,43 @@
345366
</typeDescription>
346367
<typeDescription>
347368
<name>org.texttechnologylab.annotation.ocr.abbyy.Document</name>
348-
<description/>
369+
<description>Document metadata.</description>
349370
<supertypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document</supertypeName>
350371

351372
<features>
352373
<featureDescription>
353374
<name>documentName</name>
354-
<description/>
375+
<description>The name of this document in the XML files.</description>
355376
<rangeTypeName>uima.cas.String</rangeTypeName>
356377
</featureDescription>
357378

358379
<featureDescription>
359380
<name>version</name>
381+
<description>The version of the source XML files.</description>
360382
<rangeTypeName>uima.cas.String</rangeTypeName>
361383
</featureDescription>
362384

363385
<featureDescription>
364386
<name>producer</name>
387+
<description>The producer of the source XML files.</description>
365388
<rangeTypeName>uima.cas.String</rangeTypeName>
366389
</featureDescription>
367390

368391
<featureDescription>
369392
<name>pagesCount</name>
393+
<description>The number of pages in this document (optional).</description>
370394
<rangeTypeName>uima.cas.Integer</rangeTypeName>
371395
</featureDescription>
372396

373397
<featureDescription>
374398
<name>mainLanguage</name>
399+
<description>The main language of this document.</description>
375400
<rangeTypeName>uima.cas.String</rangeTypeName>
376401
</featureDescription>
377402

378403
<featureDescription>
379404
<name>languages</name>
405+
<description>All languages used in this document (optional).</description>
380406
<rangeTypeName>uima.cas.String</rangeTypeName>
381407
</featureDescription>
382408
</features>

0 commit comments

Comments
 (0)