Skip to content

Commit 40e36f0

Browse files
committed
prepare for segmenter use and add logs
1 parent bcd54c4 commit 40e36f0

2 files changed

Lines changed: 30 additions & 24 deletions

File tree

flow_preprocessor/preprocessing_logic/parse_textlines.py

Lines changed: 22 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313

1414
from flow_preprocessor.exceptions.exceptions import ParseTextLinesException
1515
from flow_preprocessor.utils.logging.preprocessing_logger import logger
16-
# from flow_preprocessor.preprocessing_logic.segmentation import SegmenterYOLO
1716

1817

1918
# ===============================================================================
@@ -284,29 +283,35 @@ def parse_xml_file(self, xml_file: str, segment: bool = False) -> None:
284283
self.namespace_uri = self.root.tag.split('}')[0][1:]
285284
self.namespace = {'prefix': self.namespace_uri}
286285
self.xmlns = {'ns': self.namespace_uri}
287-
image_filename = self.get_image_file_name()
286+
# image_filename = self.get_image_file_name()
288287

289-
"""
290288
if segment:
291289
existing_segmentation = self.check_segmentation()
292290
if existing_segmentation == 'ground_truth':
293-
pass
294-
else:
295-
segmenter = SegmenterYOLO(
296-
models=['Riksarkivet/yolov9-regions-1', 'Riksarkivet/yolov9-lines-within-regions-1'],
297-
batch_sizes=4,
298-
order_lines=True,
291+
# If the XML file already contains ground truth segmentation, do not segment again.
292+
logger.info(
293+
'%s - XML file %s already contains ground truth segmentation, skipping segmentation.',
294+
self.__class__.__name__,
295+
xml_file,
299296
)
300-
self.tree = segmenter.segment(self.tree, image_filename)
301-
self.root = self.tree.getroot()
302-
297+
pass
303298
elif existing_segmentation == 'segmented':
304-
segmenter = Segmenter('linemasks')
305-
self.root = segmenter.segment(self.root)
299+
# If the XML file is already segmented, do not segment again - maybe linemasks recognition?
300+
logger.info(
301+
'%s - XML file %s is already segmented, skipping segmentation.',
302+
self.__class__.__name__,
303+
xml_file,
304+
)
305+
pass
306306
else:
307-
segmenter = Segmenter('yolo')
308-
self.root = segmenter.segment(self.root)
309-
"""
307+
# If the XML file is not segmented, segment it.
308+
logger.info(
309+
'%s - XML file %s is not segmented, processing lines.',
310+
self.__class__.__name__,
311+
xml_file,
312+
)
313+
pass
314+
310315

311316
except (et.XMLSyntaxError, et.ParseError) as e:
312317
self.failed_processing.append(xml_file)

flow_preprocessor/preprocessing_logic/preprocess.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ def __init__(
4545
abbrev: bool = False,
4646
stop_on_fail: bool = True,
4747
minwidth: Union[int, float] = None,
48-
# segment: bool = False,
48+
segment: bool = False,
4949
**kwargs,
5050
) -> None:
5151
"""
@@ -86,12 +86,13 @@ def __init__(
8686
else:
8787
self.minwidth = None
8888
self.kwargs = kwargs
89-
# TODO: Change as soon as Segmenter is implemented
90-
# self.segment = segment
91-
if 'segment' in kwargs:
92-
self.segment = kwargs['segment']
93-
del kwargs['segment']
89+
if segment and 'segmentation_models' in kwargs:
90+
self.segmentation_models = kwargs['segmentation_models']
91+
del kwargs['segmentation_models']
9492
else:
93+
self.segmentation_models = None
94+
95+
if not segment:
9596
self.segment = False
9697

9798
self.image_processor = ImageProcessor()
@@ -215,7 +216,7 @@ def preprocess_single_xml_file(self, xml_file: str) -> None:
215216
page = page_parser.get_page()
216217
gt_dict = {}
217218
except ParseTextLinesException as e:
218-
logger.error("Error parsing XML file %s: %s", xml_file, exc_info=True)
219+
logger.error("Error parsing XML file %s: %s", xml_file, e, exc_info=True)
219220
if self.stop_on_fail:
220221
raise
221222
return

0 commit comments

Comments
 (0)