@@ -38,7 +38,7 @@ def __init__(
3838 min_width_line : Optional [Union [int , float ]] = None ,
3939 min_height_line : Optional [Union [int , float ]] = None ,
4040 allow_empty_lines : bool = False ,
41- huggingface_repo_private : bool = False ,
41+ huggingface_new_repo_private : bool = False ,
4242 split_train_ratio : Optional [float ] = None ,
4343 split_seed : int = 42 ,
4444 split_shuffle : bool = True ,
@@ -57,7 +57,8 @@ def __init__(
5757 :param min_width_line: Minimum width of the line to be processed.
5858 :param min_height_line: Minimum height of the line to be processed.
5959 :param allow_empty_lines: Whether to allow empty lines extracted.
60- :param huggingface_repo_private: Whether the Hugging Face repository is private (token needed).
60+ :param huggingface_new_repo_private: Whether the Hugging Face \
61+ repository is private (token needed).
6162 :param split_train_ratio: Ratio of training data to be split - if None, there is no split.
6263 :param split_seed: Seed for the random split.
6364 :param split_shuffle: Whether to shuffle the data before splitting.
@@ -69,8 +70,9 @@ def __init__(
6970 # Hugging Face repository parameters
7071 self .huggingface_new_repo_name : Optional [
7172 str ] = None if huggingface_new_repo_name is None else huggingface_new_repo_name
72- self .huggingface_token : Optional [str ] = None if huggingface_token is None else huggingface_token
73- self .huggingface_repo_private : bool = huggingface_repo_private
73+ self .huggingface_token : Optional [str ] = None if huggingface_token is None \
74+ else huggingface_token
75+ self .huggingface_repo_private : bool = huggingface_new_repo_private
7476 self .dataset : Optional [datasets .Dataset ] = None
7577
7678 # Data handling
@@ -82,22 +84,32 @@ def __init__(
8284 if namespace is not None :
8385 self .namespace = {'pc' : namespace }
8486 else :
85- self .namespace = {'pc' : 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15' }
87+ self .namespace = {
88+ 'pc' : 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
89+ }
8690
87- self .min_width_line : Optional [int ] = int (min_width_line ) if min_width_line is not None else None
91+ self .min_width_line : Optional [int ] = int (min_width_line ) if min_width_line is not None \
92+ else None
8893 if self .min_width_line is not None and self .min_width_line < 0 :
89- logger .error ("Preprocessor.__init__(): min_width_line must be a positive integer or None." )
94+ logger .error (
95+ "Preprocessor.__init__(): min_width_line must be a positive integer or None."
96+ )
9097 raise ValueError ("min_width_line must be a positive integer or None." )
9198
92- self .min_height_line : Optional [int ] = int (min_height_line ) if min_height_line is not None else None
99+ self .min_height_line : Optional [int ] = int (min_height_line ) if min_height_line is not None \
100+ else None
93101 if self .min_height_line is not None and self .min_height_line < 0 :
94- logger .error ("Preprocessor.__init__(): min_height_line must be a positive integer or None." )
102+ logger .error (
103+ "Preprocessor.__init__(): min_height_line must be a positive integer or None."
104+ )
95105 raise ValueError ("min_height_line must be a positive integer or None." )
96106
97107 # Split parameters
98108 if split_train_ratio is not None :
99109 if split_train_ratio > 1.0 or split_train_ratio <= 0.0 :
100- logger .error ("Preprocessor.__init__(): split_train_ratio must be between 0.0 and 1.0." )
110+ logger .error (
111+ "Preprocessor.__init__(): split_train_ratio must be between 0.0 and 1.0."
112+ )
101113 raise ValueError ("split_train_ratio must be between 0.0 and 1.0." )
102114 self .split_train_ratio : Optional [float ] = split_train_ratio
103115 self .split_seed : int = split_seed
@@ -107,11 +119,13 @@ def __init__(
107119 self .segment : bool = segment
108120
109121 self .pages = None
122+ self .stats = None
123+ self .state = 'in_progress'
110124 self .converter : XmlConverter = self .create_xmlconverter ()
111125
112126 if isinstance (segmenter_config , dict ):
113127 try :
114- self .segmenter_config : SegmenterConfig = SegmenterConfig (** segmenter_config )
128+ self .segmenter_config = SegmenterConfig (** segmenter_config )
115129 except ValidationError as e :
116130 logger .error ("Preprocessor.__init__(): Error creating SegmenterConfig: %s" , e )
117131 raise ValidationError ("Invalid segmenter_config provided." ) from e
@@ -127,15 +141,18 @@ def create_xmlconverter(self) -> XmlConverter:
127141
128142 :return: An instance of XmlConverter.
129143 """
130- pass
131144
132- async def segment_images (self ) -> datasets .Dataset :
145+ async def segment_images (self ) -> datasets .Dataset | None :
133146 """
134147 Segment images in the dataset using the specified segmenter configuration.
135148
136149 :return: A new Hugging Face dataset with segmented images.
137150 """
138- self .segmentation_models : Optional [Union [List [str ], str ]] = self .segmenter_config .model_names
151+ if self .segmenter_config is None :
152+ logger .error ("Preprocessor.segment_images(): segmenter_config is None." )
153+ raise ValueError ("segmenter_config must be provided when segment is True." )
154+ self .segmentation_models : Optional [Union [List [str ], str ]] = \
155+ self .segmenter_config .model_names
139156 segmenter = SegmenterYOLO (config = self .segmenter_config )
140157 segmented_dataset = self .converter .convert (
141158 export_mode = 'raw_xml' ,
@@ -162,18 +179,24 @@ async def preprocess(self) -> None:
162179 min_height = self .min_height_line ,
163180 allow_empty = self .allow_empty_lines ,
164181 )
182+ self .stats = self .converter .get_stats ()
183+ if self .dataset is None :
184+ raise RuntimeError ("Preprocessor.preprocess(): Dataset is None." )
165185 if self .huggingface_new_repo_name :
166- logger .info (f "Pushing to Hugging Face repo: { self .huggingface_new_repo_name } " )
186+ logger .info ("Pushing to Hugging Face repo: %s" , self .huggingface_new_repo_name )
167187 repo_url = self .converter .upload_to_hub (
168188 dataset = self .dataset ,
169189 repo_id = self .huggingface_new_repo_name ,
170190 token = self .huggingface_token ,
171191 private = self .huggingface_repo_private ,
172192 )
173193 logger .info ('%s - HuggingFace repo URL: %s' , self .__class__ .__name__ , repo_url )
194+ self .state = 'completed'
174195 except Exception as e :
175- logger .error (f "Error during preprocessing/converting: { e } " )
196+ logger .error ("Error during preprocessing/converting: %s" , e )
176197 if self .stop_on_fail :
198+ self .state = 'failed'
199+ logger .error ("Stopping processing due to stop_on_fail=True." )
177200 raise e
178201
179202
@@ -192,8 +215,10 @@ def __init__(
192215 Initialize parameters for file preprocessing.
193216
194217 :param input_path: URL to fetch the ZIP-File or local path to the ZIP-File.
195- :param huggingface_new_repo_name: Name of the new Hugging Face repository, where the result is pushed to.
196- :param kwargs: Additional keyword arguments for the base Preprocessor class arguments with default values.
218+ :param huggingface_new_repo_name: Name of the new Hugging Face repository, \
219+ where the result is pushed to.
220+ :param kwargs: Additional keyword arguments for the \
221+ base Preprocessor class arguments with default values.
197222 """
198223 self .input_path : str = input_path
199224
@@ -206,7 +231,7 @@ def create_xmlconverter(self) -> XmlConverter:
206231 :return: An instance of XmlConverter configured with LineExporter.
207232 """
208233 parser = XmlParser (namespace = self .namespace ['pc' ])
209- logger .info (f "Creating XmlConverter for input path: { self .input_path } " )
209- logger .info (f "Creating XmlConverter for input path: { self .input_path } ")
234+ logger .info ("Creating XmlConverter for input path: %s" , self .input_path )
210235 if parser :
211236 logger .info ("XmlParser created successfully." )
212237 else :
@@ -224,7 +249,7 @@ def create_xmlconverter(self) -> XmlConverter:
224249 source_type = 'zip'
225250 pages = parser .parse_zip (self .input_path )
226251 if pages is not None :
227- logger .info (f "Parsed { len ( pages ) } pages successfully." )
252+ logger .info ("Parsed %s pages successfully." , len ( pages ) )
228253 else :
229254 logger .error ("Failed to parse pages." )
230255 raise ValueError ("Failed to parse pages." )
@@ -239,11 +264,12 @@ def create_xmlconverter(self) -> XmlConverter:
239264
240265 async def preprocess (self ) -> None :
241266 """
242- Perform preprocessing steps on files in the input directory and save to the output directory.
267+ Perform preprocessing steps on files in the input directory
268+ and save to the output directory.
243269 """
244- logger .info (f "Preprocessing/converting { self .input_path } " )
270+ logger .info ("Preprocessing/converting %s" , self .input_path )
245271 await super ().preprocess ()
246- logger .info (f "Preprocessing/converting { self . input_path } completed." )
272+ logger .info ("Preprocessing/converting %s completed." , self . input_path )
247273
248274
249275class HuggingFacePreprocessor (Preprocessor ):
@@ -253,16 +279,18 @@ class HuggingFacePreprocessor(Preprocessor):
253279
254280 def __init__ (
255281 self ,
256- input_path : Optional [ Union [ str , datasets . Dataset ]] ,
282+ input_path : str ,
257283 huggingface_new_repo_name : Optional [str ] = None ,
258284 ** kwargs
259285 ) -> None :
260286 """
261287 Initialize parameters for file preprocessing.
262288
263289 :param input_path: Hugging Face dataset ID to fetch the XML files from.
264- :param huggingface_new_repo_name: Name of the new Hugging Face repository, where the result is pushed to.
265- :param kwargs: Additional keyword arguments for the base Preprocessor class arguments with default values.
290+ :param huggingface_new_repo_name: Name of the new Hugging Face repository, \
291+ where the result is pushed to.
292+ :param kwargs: Additional keyword arguments for the base Preprocessor class \
293+ arguments with default values.
266294 """
267295 self .input_path : str = input_path
268296
0 commit comments