@@ -67,6 +67,8 @@ def __init__(
6767 (default http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15).
6868 """
6969
70+ self .state = 'in_progress'
71+
7072 # Hugging Face repository parameters
7173 self .huggingface_new_repo_name : Optional [
7274 str ] = None if huggingface_new_repo_name is None else huggingface_new_repo_name
@@ -94,6 +96,7 @@ def __init__(
9496 logger .error (
9597 "Preprocessor.__init__(): min_width_line must be a positive integer or None."
9698 )
99+ self .state = 'failed'
97100 raise ValueError ("min_width_line must be a positive integer or None." )
98101
99102 self .min_height_line : Optional [int ] = int (min_height_line ) if min_height_line is not None \
@@ -102,6 +105,7 @@ def __init__(
102105 logger .error (
103106 "Preprocessor.__init__(): min_height_line must be a positive integer or None."
104107 )
108+ self .state = 'failed'
105109 raise ValueError ("min_height_line must be a positive integer or None." )
106110
107111 # Split parameters
@@ -110,6 +114,7 @@ def __init__(
110114 logger .error (
111115 "Preprocessor.__init__(): split_train_ratio must be between 0.0 and 1.0."
112116 )
117+ self .state = 'failed'
113118 raise ValueError ("split_train_ratio must be between 0.0 and 1.0." )
114119 self .split_train_ratio : Optional [float ] = split_train_ratio
115120 self .split_seed : int = split_seed
@@ -120,14 +125,14 @@ def __init__(
120125
121126 self .pages = None
122127 self .stats = None
123- self .state = 'in_progress'
124128 self .converter : XmlConverter = self .create_xmlconverter ()
125129
126130 if isinstance (segmenter_config , dict ):
127131 try :
128132 self .segmenter_config = SegmenterConfig (** segmenter_config )
129133 except ValidationError as e :
130134 logger .error ("Preprocessor.__init__(): Error creating SegmenterConfig: %s" , e )
135+ self .state = 'failed'
131136 raise ValidationError ("Invalid segmenter_config provided." ) from e
132137 else :
133138 self .segmenter_config : Optional [SegmenterConfig ] = segmenter_config
@@ -150,6 +155,7 @@ async def segment_images(self) -> datasets.Dataset | None:
150155 """
151156 if self .segmenter_config is None :
152157 logger .error ("Preprocessor.segment_images(): segmenter_config is None." )
158+ self .state = 'failed'
153159 raise ValueError ("segmenter_config must be provided when segment is True." )
154160 self .segmentation_models : Optional [Union [List [str ], str ]] = \
155161 self .segmenter_config .model_names
@@ -181,6 +187,7 @@ async def preprocess(self) -> None:
181187 )
182188 self .stats = self .converter .get_stats ()
183189 if self .dataset is None :
190+ self .state = 'failed'
184191 raise RuntimeError ("Preprocessor.preprocess(): Dataset is None." )
185192 if self .huggingface_new_repo_name :
186193 logger .info ("Pushing to Hugging Face repo: %s" , self .huggingface_new_repo_name )
@@ -236,6 +243,7 @@ def create_xmlconverter(self) -> XmlConverter:
236243 logger .info ("XmlParser created successfully." )
237244 else :
238245 logger .error ("Failed to create XmlParser." )
246+ self .state = 'failed'
239247 raise ValueError ("Failed to create XmlParser." )
240248 if self .dataset is not None :
241249 logger .info ("Using dataset for XML conversion." )
@@ -252,6 +260,7 @@ def create_xmlconverter(self) -> XmlConverter:
252260 logger .info ("Parsed %s pages successfully." , len (pages ))
253261 else :
254262 logger .error ("Failed to parse pages." )
263+ self .state = 'failed'
255264 raise ValueError ("Failed to parse pages." )
256265 self .pages = pages
257266 converter = XmlConverter (pages , source_path = self .input_path , source_type = source_type )
@@ -260,6 +269,7 @@ def create_xmlconverter(self) -> XmlConverter:
260269 return converter
261270 else :
262271 logger .error ("Failed to create XmlConverter." )
272+ self .state = 'failed'
263273 raise ValueError ("Failed to create XmlConverter." )
264274
265275 async def preprocess (self ) -> None :
@@ -308,6 +318,7 @@ def create_xmlconverter(self) -> XmlConverter:
308318 logger .info ("XmlParser created successfully." )
309319 else :
310320 logger .error ("Failed to create XmlParser." )
321+ self .state = 'failed'
311322 raise ValueError ("Failed to create XmlParser." )
312323 source_type = 'huggingface'
313324
@@ -321,6 +332,7 @@ def create_xmlconverter(self) -> XmlConverter:
321332 logger .info (f"Parsed { len (pages )} pages successfully." )
322333 else :
323334 logger .error ("Failed to parse pages." )
335+ self .state = 'failed'
324336 raise ValueError ("Failed to parse pages." )
325337 self .pages = pages
326338 converter = XmlConverter (pages , source_path = self .input_path , source_type = source_type )
@@ -329,6 +341,7 @@ def create_xmlconverter(self) -> XmlConverter:
329341 return converter
330342 else :
331343 logger .error ("Failed to create XmlConverter." )
344+ self .state = 'failed'
332345 raise ValueError ("Failed to create XmlConverter." )
333346
334347 async def preprocess (self ) -> None :
0 commit comments