Skip to content

Commit 3d2b1d7

Browse files
author
Jonas Widmer
committed
Update model and concurrency; remove callback (direct DB access); adapt logging; add min-width for lines; change status management; begin segmentation implementation (using YOLO with htrflow)
1 parent 1139373 commit 3d2b1d7

7 files changed

Lines changed: 231 additions & 120 deletions

File tree

flow_preprocessor/preprocessing_logic/models.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
from datetime import datetime
55
from typing import Optional, List
66
import enum
7+
from bson import ObjectId
78

8-
from pydantic import BaseModel, Field
9+
from pydantic import BaseModel, Field, ConfigDict
910

1011

1112
class StateEnum(enum.Enum):
@@ -21,9 +22,9 @@ class PreprocessState(BaseModel):
2122
"""
2223
The state of a preprocessing job
2324
"""
24-
process_id: str = Field(alias="process_id",
25-
description="The uniqueid of the preprocess status.",
26-
title="ID")
25+
id: ObjectId = Field(alias="_id",
26+
description="The uniqueid of the preprocess status.",
27+
title="ID")
2728
created_at: datetime = Field(alias="created_at",
2829
description="The timestamp of the preprocess status creation.",
2930
title="Created-At",
@@ -35,10 +36,10 @@ class PreprocessState(BaseModel):
3536
description="Folder in the repository the files are fetched from.",
3637
title="Repository-Folder",
3738
examples=["xml", "page"])
38-
abbreviation: bool = Field(default=False,
39-
alias="abbreviation",
40-
description="Whether to expand abbreviations in text.",
41-
title="Abbreviation")
39+
abbrev: bool = Field(default=False,
40+
alias="abbrev",
41+
description="Whether to expand abbreviations in text.",
42+
title="Abbreviation")
4243
crop: bool = Field(default=False,
4344
alias="crop",
4445
description="Whether to crop images to their linemask.",
@@ -87,12 +88,26 @@ class PreprocessState(BaseModel):
8788
description="The names of the lines images processed.",
8889
title="Filenames-Line-Images-Processed",
8990
default=[])
90-
runtime: Optional[int] = Field(alias="runtime",
91-
description="The runtime of the preprocess status.",
92-
title="Runtime",
93-
default=0)
91+
runtime_seconds: Optional[int] = Field(alias="runtime_seconds",
92+
description="The runtime_seconds of the preprocess status.",
93+
title="Runtime",
94+
default=0)
9495
segment: Optional[bool] = Field(alias="segment",
9596
description="Whether the images have to be segmented before processing.",
9697
title="Segment",
9798
default=False
9899
)
100+
minwidth: Optional[int] = Field(alias="min-width",
101+
description="The minimum width of an image needed to be processed.",
102+
title="Min-Width",
103+
default=None
104+
)
105+
106+
model_config = ConfigDict(
107+
populate_by_name=True,
108+
arbitrary_types_allowed=True,
109+
str_strip_whitespace=True,
110+
json_encoders={
111+
ObjectId: str
112+
},
113+
)

flow_preprocessor/preprocessing_logic/parse_textlines.py

Lines changed: 70 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from flow_preprocessor.exceptions.exceptions import ParseTextLinesException
1515
from flow_preprocessor.utils.logging.preprocessing_logger import logger
16+
# from flow_preprocessor.preprocessing_logic.segmentation import SegmenterYOLO
1617

1718

1819
# ===============================================================================
@@ -33,8 +34,8 @@ def __init__(self, x: float, y: float):
3334
:param x: x coordinate
3435
:param y: y coordinate
3536
"""
36-
self.x = x
37-
self.y = y
37+
self.x = float(x)
38+
self.y = float(y)
3839

3940
def __eq__(self, other) -> bool:
4041
"""Override the equality operator.
@@ -48,14 +49,41 @@ def __eq__(self, other) -> bool:
4849
return False
4950
return self.x == other.x and self.y == other.y
5051

51-
@classmethod
52-
def min_x(cls, coordinates: List['Coordinate']) -> float:
52+
@staticmethod
53+
def get_width(coordinates: List['Coordinate']) -> float:
54+
"""
55+
get the width (x_max - x_min)
56+
57+
:param coordinates: list of coordinates
58+
:return: width
59+
"""
60+
min_x = Coordinate.min_x(coordinates)
61+
max_x = Coordinate.max_x(coordinates)
62+
63+
return float(max_x - min_x)
64+
65+
@staticmethod
66+
def get_bbox(coordinates: List['Coordinate']) -> tuple[float, float, float, float]:
67+
"""
68+
get the bounding box of the coordinates.
69+
70+
:param coordinates: list of coordinates
71+
:return: bounding box as (min_x, min_y, max_x, max_y)
72+
"""
73+
min_x = Coordinate.min_x(coordinates)
74+
max_x = Coordinate.max_x(coordinates)
75+
min_y = Coordinate.min_y(coordinates)
76+
max_y = Coordinate.max_y(coordinates)
77+
return min_x, min_y, max_x, max_y
78+
79+
@staticmethod
80+
def min_x(coordinates: List['Coordinate']) -> float:
5381
"""Get the minimum x coordinate.
5482
5583
:param coordinates: list of coordinates.
5684
:return: minimum x coordinate.
5785
"""
58-
return min(coord.x for coord in coordinates)
86+
return min([coord.x for coord in coordinates])
5987

6088
@staticmethod
6189
def max_x(coordinates: List['Coordinate']) -> float:
@@ -64,17 +92,17 @@ def max_x(coordinates: List['Coordinate']) -> float:
6492
:param coordinates: list of coordinates.
6593
:return: maximum x coordinate.
6694
"""
67-
return max(coord.x for coord in coordinates)
95+
return max([coord.x for coord in coordinates])
6896

6997
@staticmethod
7098
def min_y(coordinates: List['Coordinate']) -> float:
7199
"""Get the minimum y coordinate."""
72-
return min(coord.y for coord in coordinates)
100+
return min([coord.y for coord in coordinates])
73101

74102
@staticmethod
75103
def max_y(coordinates: List['Coordinate']) -> float:
76104
"""Get the maximum y coordinate."""
77-
return max(coord.y for coord in coordinates)
105+
return max([coord.y for coord in coordinates])
78106

79107

80108
# ===============================================================================
@@ -198,7 +226,11 @@ class Page:
198226
An XML Page.
199227
"""
200228

201-
def __init__(self, image_file_name, lines, metadata):
229+
def __init__(self,
230+
image_file_name: str,
231+
lines: List[Line],
232+
metadata: Metadata
233+
) -> None:
202234
"""
203235
initialise class parameters.
204236
@@ -252,20 +284,29 @@ def parse_xml_file(self, xml_file: str, segment: bool = False) -> None:
252284
self.namespace_uri = self.root.tag.split('}')[0][1:]
253285
self.namespace = {'prefix': self.namespace_uri}
254286
self.xmlns = {'ns': self.namespace_uri}
287+
image_filename = self.get_image_file_name()
255288

256-
# TODO: Write Segmenter package
257289
"""
258290
if segment:
259291
existing_segmentation = self.check_segmentation()
260292
if existing_segmentation == 'ground_truth':
261293
pass
294+
else:
295+
segmenter = SegmenterYOLO(
296+
models=['Riksarkivet/yolov9-regions-1', 'Riksarkivet/yolov9-lines-within-regions-1'],
297+
batch_sizes=4,
298+
order_lines=True,
299+
)
300+
self.tree = segmenter.segment(self.tree, image_filename)
301+
self.root = self.tree.getroot()
302+
262303
elif existing_segmentation == 'segmented':
263304
segmenter = Segmenter('linemasks')
264305
self.root = segmenter.segment(self.root)
265306
else:
266307
segmenter = Segmenter('yolo')
267308
self.root = segmenter.segment(self.root)
268-
"""
309+
"""
269310

270311
except (et.XMLSyntaxError, et.ParseError) as e:
271312
self.failed_processing.append(xml_file)
@@ -437,13 +478,13 @@ def get_line_text_string(self, text_line: et.Element) -> str:
437478

438479
unicode_text = text_line.find('./ns:TextEquiv/ns:Unicode', namespaces=self.xmlns)
439480
if unicode_text is not None and hasattr(unicode_text, 'text'):
440-
logger.info('%s - Got Unicode text: %s', self.__class__.__name__, unicode_text.text)
481+
logger.debug('%s - Got Unicode text: %s', self.__class__.__name__, unicode_text.text)
441482
if unicode_text.text is not None:
442483
text: str = unicode_text.text.strip()
443484
else:
444485
text: str = ''
445486
else:
446-
logger.info('%s - No Unicode text found', self.__class__.__name__)
487+
logger.debug('%s - No Unicode text found', self.__class__.__name__)
447488
text: str = ''
448489
return text
449490

@@ -543,8 +584,19 @@ def get_page(self):
543584
"""
544585
Get the Page-object from the XML file.
545586
"""
546-
return Page(
547-
self.get_image_file_name(),
548-
self.process_lines_from_xml_file(),
549-
self.get_metadata()
550-
)
587+
try:
588+
return Page(
589+
self.get_image_file_name(),
590+
self.process_lines_from_xml_file(),
591+
self.get_metadata()
592+
)
593+
except ParseTextLinesException as e:
594+
logger.error(
595+
'%s - Error parsing file %s',
596+
self.__class__.__name__,
597+
self.xml_filename,
598+
exc_info=True,
599+
)
600+
self.failed_processing.append(self.xml_filename)
601+
raise ParseTextLinesException(f'Error parsing file {self.xml_filename}: {e}') from e
602+

0 commit comments

Comments
 (0)