Skip to content

Commit 08989dc

Browse files
committed
linting
1 parent 0e05dde commit 08989dc

1 file changed

Lines changed: 54 additions & 26 deletions

File tree

src/flow_preprocessor/preprocessing_logic/preprocess.py

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(
3838
min_width_line: Optional[Union[int, float]] = None,
3939
min_height_line: Optional[Union[int, float]] = None,
4040
allow_empty_lines: bool = False,
41-
huggingface_repo_private: bool = False,
41+
huggingface_new_repo_private: bool = False,
4242
split_train_ratio: Optional[float] = None,
4343
split_seed: int = 42,
4444
split_shuffle: bool = True,
@@ -57,7 +57,8 @@ def __init__(
5757
:param min_width_line: Minimum width of the line to be processed.
5858
:param min_height_line: Minimum height of the line to be processed.
5959
:param allow_empty_lines: Whether to allow empty lines extracted.
60-
:param huggingface_repo_private: Whether the Hugging Face repository is private (token needed).
60+
:param huggingface_new_repo_private: Whether the Hugging Face
61+
repository is private (token needed).
6162
:param split_train_ratio: Ratio of training data to be split - if None, there is no split.
6263
:param split_seed: Seed for the random split.
6364
:param split_shuffle: Whether to shuffle the data before splitting.
@@ -69,8 +70,9 @@ def __init__(
6970
# Hugging Face repository parameters
7071
self.huggingface_new_repo_name: Optional[
7172
str] = None if huggingface_new_repo_name is None else huggingface_new_repo_name
72-
self.huggingface_token: Optional[str] = None if huggingface_token is None else huggingface_token
73-
self.huggingface_repo_private: bool = huggingface_repo_private
73+
self.huggingface_token: Optional[str] = None if huggingface_token is None \
74+
else huggingface_token
75+
self.huggingface_repo_private: bool = huggingface_new_repo_private
7476
self.dataset: Optional[datasets.Dataset] = None
7577

7678
# Data handling
@@ -82,22 +84,32 @@ def __init__(
8284
if namespace is not None:
8385
self.namespace = {'pc': namespace}
8486
else:
85-
self.namespace = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
87+
self.namespace = {
88+
'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
89+
}
8690

87-
self.min_width_line: Optional[int] = int(min_width_line) if min_width_line is not None else None
91+
self.min_width_line: Optional[int] = int(min_width_line) if min_width_line is not None \
92+
else None
8893
if self.min_width_line is not None and self.min_width_line < 0:
89-
logger.error("Preprocessor.__init__(): min_width_line must be a positive integer or None.")
94+
logger.error(
95+
"Preprocessor.__init__(): min_width_line must be a positive integer or None."
96+
)
9097
raise ValueError("min_width_line must be a positive integer or None.")
9198

92-
self.min_height_line: Optional[int] = int(min_height_line) if min_height_line is not None else None
99+
self.min_height_line: Optional[int] = int(min_height_line) if min_height_line is not None \
100+
else None
93101
if self.min_height_line is not None and self.min_height_line < 0:
94-
logger.error("Preprocessor.__init__(): min_height_line must be a positive integer or None.")
102+
logger.error(
103+
"Preprocessor.__init__(): min_height_line must be a positive integer or None."
104+
)
95105
raise ValueError("min_height_line must be a positive integer or None.")
96106

97107
# Split parameters
98108
if split_train_ratio is not None:
99109
if split_train_ratio > 1.0 or split_train_ratio <= 0.0:
100-
logger.error("Preprocessor.__init__(): split_train_ratio must be between 0.0 and 1.0.")
110+
logger.error(
111+
"Preprocessor.__init__(): split_train_ratio must be between 0.0 and 1.0."
112+
)
101113
raise ValueError("split_train_ratio must be between 0.0 and 1.0.")
102114
self.split_train_ratio: Optional[float] = split_train_ratio
103115
self.split_seed: int = split_seed
@@ -107,11 +119,13 @@ def __init__(
107119
self.segment: bool = segment
108120

109121
self.pages = None
122+
self.stats = None
123+
self.state = 'in_progress'
110124
self.converter: XmlConverter = self.create_xmlconverter()
111125

112126
if isinstance(segmenter_config, dict):
113127
try:
114-
self.segmenter_config: SegmenterConfig = SegmenterConfig(**segmenter_config)
128+
self.segmenter_config = SegmenterConfig(**segmenter_config)
115129
except ValidationError as e:
116130
logger.error("Preprocessor.__init__(): Error creating SegmenterConfig: %s", e)
117131
raise ValidationError("Invalid segmenter_config provided.") from e
@@ -127,15 +141,18 @@ def create_xmlconverter(self) -> XmlConverter:
127141
128142
:return: An instance of XmlConverter.
129143
"""
130-
pass
131144

132-
async def segment_images(self) -> datasets.Dataset:
145+
async def segment_images(self) -> datasets.Dataset | None:
133146
"""
134147
Segment images in the dataset using the specified segmenter configuration.
135148
136149
:return: A new Hugging Face dataset with segmented images.
137150
"""
138-
self.segmentation_models: Optional[Union[List[str], str]] = self.segmenter_config.model_names
151+
if self.segmenter_config is None:
152+
logger.error("Preprocessor.segment_images(): segmenter_config is None.")
153+
raise ValueError("segmenter_config must be provided when segment is True.")
154+
self.segmentation_models: Optional[Union[List[str], str]] = \
155+
self.segmenter_config.model_names
139156
segmenter = SegmenterYOLO(config=self.segmenter_config)
140157
segmented_dataset = self.converter.convert(
141158
export_mode='raw_xml',
@@ -162,18 +179,24 @@ async def preprocess(self) -> None:
162179
min_height=self.min_height_line,
163180
allow_empty=self.allow_empty_lines,
164181
)
182+
self.stats = self.converter.get_stats()
183+
if self.dataset is None:
184+
raise RuntimeError("Preprocessor.preprocess(): Dataset is None.")
165185
if self.huggingface_new_repo_name:
166-
logger.info(f"Pushing to Hugging Face repo: {self.huggingface_new_repo_name}")
186+
logger.info("Pushing to Hugging Face repo: %s", self.huggingface_new_repo_name)
167187
repo_url = self.converter.upload_to_hub(
168188
dataset=self.dataset,
169189
repo_id=self.huggingface_new_repo_name,
170190
token=self.huggingface_token,
171191
private=self.huggingface_repo_private,
172192
)
173193
logger.info('%s - HuggingFace repo URL: %s', self.__class__.__name__, repo_url)
194+
self.state = 'completed'
174195
except Exception as e:
175-
logger.error(f"Error during preprocessing/converting: {e}")
196+
logger.error("Error during preprocessing/converting: %s", e)
176197
if self.stop_on_fail:
198+
self.state = 'failed'
199+
logger.error("Stopping processing due to stop_on_fail=True.")
177200
raise e
178201

179202

@@ -192,8 +215,10 @@ def __init__(
192215
Initialize parameters for file preprocessing.
193216
194217
:param input_path: URL to fetch the ZIP-File or local path to the ZIP-File.
195-
:param huggingface_new_repo_name: Name of the new Hugging Face repository, where the result is pushed to.
196-
:param kwargs: Additional keyword arguments for the base Preprocessor class arguments with default values.
218+
:param huggingface_new_repo_name: Name of the new Hugging Face repository,
219+
where the result is pushed to.
220+
:param kwargs: Additional keyword arguments for the
221+
base Preprocessor class arguments with default values.
197222
"""
198223
self.input_path: str = input_path
199224

@@ -206,7 +231,7 @@ def create_xmlconverter(self) -> XmlConverter:
206231
:return: An instance of XmlConverter configured with LineExporter.
207232
"""
208233
parser = XmlParser(namespace=self.namespace['pc'])
209-
logger.info(f"Creating XmlConverter for input path: {self.input_path}")
234+
logger.info("Creating XmlConverter for input path: %s", self.input_path)
210235
if parser:
211236
logger.info("XmlParser created successfully.")
212237
else:
@@ -224,7 +249,7 @@ def create_xmlconverter(self) -> XmlConverter:
224249
source_type = 'zip'
225250
pages = parser.parse_zip(self.input_path)
226251
if pages is not None:
227-
logger.info(f"Parsed {len(pages)} pages successfully.")
252+
logger.info("Parsed %s pages successfully.", len(pages))
228253
else:
229254
logger.error("Failed to parse pages.")
230255
raise ValueError("Failed to parse pages.")
@@ -239,11 +264,12 @@ def create_xmlconverter(self) -> XmlConverter:
239264

240265
async def preprocess(self) -> None:
241266
"""
242-
Perform preprocessing steps on files in the input directory and save to the output directory.
267+
Perform preprocessing steps on files in the input directory
268+
and save to the output directory.
243269
"""
244-
logger.info(f"Preprocessing/converting {self.input_path}")
270+
logger.info("Preprocessing/converting %s", self.input_path)
245271
await super().preprocess()
246-
logger.info(f"Preprocessing/converting {self.input_path} completed.")
272+
logger.info("Preprocessing/converting %s completed.", self.input_path)
247273

248274

249275
class HuggingFacePreprocessor(Preprocessor):
@@ -253,16 +279,18 @@ class HuggingFacePreprocessor(Preprocessor):
253279

254280
def __init__(
255281
self,
256-
input_path: Optional[Union[str, datasets.Dataset]],
282+
input_path: str,
257283
huggingface_new_repo_name: Optional[str] = None,
258284
**kwargs
259285
) -> None:
260286
"""
261287
Initialize parameters for Hugging Face dataset preprocessing.
262288
263289
:param input_path: Hugging Face dataset ID to fetch the XML files from.
264-
:param huggingface_new_repo_name: Name of the new Hugging Face repository, where the result is pushed to.
265-
:param kwargs: Additional keyword arguments for the base Preprocessor class arguments with default values.
290+
:param huggingface_new_repo_name: Name of the new Hugging Face repository,
291+
where the result is pushed to.
292+
:param kwargs: Additional keyword arguments for the base Preprocessor class
293+
arguments with default values.
266294
"""
267295
self.input_path: str = input_path
268296

0 commit comments

Comments
 (0)