Skip to content

Commit 908c875

Browse files
committed
better state handling
1 parent 9083e31 commit 908c875

1 file changed

Lines changed: 14 additions & 1 deletion

File tree

src/flow_preprocessor/preprocessing_logic/preprocess.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ def __init__(
6767
(default http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15).
6868
"""
6969

70+
self.state = 'in_progress'
71+
7072
# Hugging Face repository parameters
7173
self.huggingface_new_repo_name: Optional[
7274
str] = None if huggingface_new_repo_name is None else huggingface_new_repo_name
@@ -94,6 +96,7 @@ def __init__(
9496
logger.error(
9597
"Preprocessor.__init__(): min_width_line must be a positive integer or None."
9698
)
99+
self.state = 'failed'
97100
raise ValueError("min_width_line must be a positive integer or None.")
98101

99102
self.min_height_line: Optional[int] = int(min_height_line) if min_height_line is not None \
@@ -102,6 +105,7 @@ def __init__(
102105
logger.error(
103106
"Preprocessor.__init__(): min_height_line must be a positive integer or None."
104107
)
108+
self.state = 'failed'
105109
raise ValueError("min_height_line must be a positive integer or None.")
106110

107111
# Split parameters
@@ -110,6 +114,7 @@ def __init__(
110114
logger.error(
111115
"Preprocessor.__init__(): split_train_ratio must be between 0.0 and 1.0."
112116
)
117+
self.state = 'failed'
113118
raise ValueError("split_train_ratio must be between 0.0 and 1.0.")
114119
self.split_train_ratio: Optional[float] = split_train_ratio
115120
self.split_seed: int = split_seed
@@ -120,14 +125,14 @@ def __init__(
120125

121126
self.pages = None
122127
self.stats = None
123-
self.state = 'in_progress'
124128
self.converter: XmlConverter = self.create_xmlconverter()
125129

126130
if isinstance(segmenter_config, dict):
127131
try:
128132
self.segmenter_config = SegmenterConfig(**segmenter_config)
129133
except ValidationError as e:
130134
logger.error("Preprocessor.__init__(): Error creating SegmenterConfig: %s", e)
135+
self.state = 'failed'
131136
raise ValidationError("Invalid segmenter_config provided.") from e
132137
else:
133138
self.segmenter_config: Optional[SegmenterConfig] = segmenter_config
@@ -150,6 +155,7 @@ async def segment_images(self) -> datasets.Dataset | None:
150155
"""
151156
if self.segmenter_config is None:
152157
logger.error("Preprocessor.segment_images(): segmenter_config is None.")
158+
self.state = 'failed'
153159
raise ValueError("segmenter_config must be provided when segment is True.")
154160
self.segmentation_models: Optional[Union[List[str], str]] = \
155161
self.segmenter_config.model_names
@@ -181,6 +187,7 @@ async def preprocess(self) -> None:
181187
)
182188
self.stats = self.converter.get_stats()
183189
if self.dataset is None:
190+
self.state = 'failed'
184191
raise RuntimeError("Preprocessor.preprocess(): Dataset is None.")
185192
if self.huggingface_new_repo_name:
186193
logger.info("Pushing to Hugging Face repo: %s", self.huggingface_new_repo_name)
@@ -236,6 +243,7 @@ def create_xmlconverter(self) -> XmlConverter:
236243
logger.info("XmlParser created successfully.")
237244
else:
238245
logger.error("Failed to create XmlParser.")
246+
self.state = 'failed'
239247
raise ValueError("Failed to create XmlParser.")
240248
if self.dataset is not None:
241249
logger.info("Using dataset for XML conversion.")
@@ -252,6 +260,7 @@ def create_xmlconverter(self) -> XmlConverter:
252260
logger.info("Parsed %s pages successfully.", len(pages))
253261
else:
254262
logger.error("Failed to parse pages.")
263+
self.state = 'failed'
255264
raise ValueError("Failed to parse pages.")
256265
self.pages = pages
257266
converter = XmlConverter(pages, source_path=self.input_path, source_type=source_type)
@@ -260,6 +269,7 @@ def create_xmlconverter(self) -> XmlConverter:
260269
return converter
261270
else:
262271
logger.error("Failed to create XmlConverter.")
272+
self.state = 'failed'
263273
raise ValueError("Failed to create XmlConverter.")
264274

265275
async def preprocess(self) -> None:
@@ -308,6 +318,7 @@ def create_xmlconverter(self) -> XmlConverter:
308318
logger.info("XmlParser created successfully.")
309319
else:
310320
logger.error("Failed to create XmlParser.")
321+
self.state = 'failed'
311322
raise ValueError("Failed to create XmlParser.")
312323
source_type = 'huggingface'
313324

@@ -321,6 +332,7 @@ def create_xmlconverter(self) -> XmlConverter:
321332
logger.info(f"Parsed {len(pages)} pages successfully.")
322333
else:
323334
logger.error("Failed to parse pages.")
335+
self.state = 'failed'
324336
raise ValueError("Failed to parse pages.")
325337
self.pages = pages
326338
converter = XmlConverter(pages, source_path=self.input_path, source_type=source_type)
@@ -329,6 +341,7 @@ def create_xmlconverter(self) -> XmlConverter:
329341
return converter
330342
else:
331343
logger.error("Failed to create XmlConverter.")
344+
self.state = 'failed'
332345
raise ValueError("Failed to create XmlConverter.")
333346

334347
async def preprocess(self) -> None:

0 commit comments

Comments
 (0)