@@ -38,6 +38,7 @@ def __init__(
3838 min_width_line : Optional [Union [int , float ]] = None ,
3939 min_height_line : Optional [Union [int , float ]] = None ,
4040 allow_empty_lines : bool = False ,
41+ batch_size : int = 32 ,
4142 huggingface_new_repo_private : bool = False ,
4243 split_train_ratio : Optional [float ] = None ,
4344 split_seed : int = 42 ,
@@ -57,6 +58,7 @@ def __init__(
5758 :param min_width_line: Minimum width of the line to be processed.
5859 :param min_height_line: Minimum height of the line to be processed.
5960 :param allow_empty_lines: Whether to allow empty lines extracted.
61+ :param batch_size: Batch size for dataset mapping (default 32).
6062 :param huggingface_new_repo_private: Whether the Hugging Face \
6163 repository is private (token needed).
6264 :param split_train_ratio: Ratio of training data to be split - if None, there is no split.
@@ -82,6 +84,7 @@ def __init__(
8284 self .abbrev : bool = abbrev
8385 self .stop_on_fail : bool = stop_on_fail
8486 self .allow_empty_lines : bool = allow_empty_lines
87+ self .batch_size : int = batch_size
8588 self .export_mode : str = export_mode
8689 if namespace is not None :
8790 self .namespace = {'pc' : namespace }
@@ -164,6 +167,7 @@ async def segment_images(self) -> datasets.Dataset | None:
164167 export_mode = 'raw_xml' ,
165168 split_train = None ,
166169 allow_empty = self .allow_empty_lines ,
170+ batch_size = self .batch_size ,
167171 )
168172 self .dataset = segmenter .segment_dataset (segmented_dataset , new_column_name = 'xml' )
169173 self .converter = self .create_xmlconverter ()
@@ -181,6 +185,7 @@ async def preprocess(self) -> None:
181185 split_seed = self .split_seed ,
182186 split_shuffle = self .split_shuffle ,
183187 mask_crop = self .crop ,
188+ batch_size = self .batch_size ,
184189 min_width = self .min_width_line ,
185190 min_height = self .min_height_line ,
186191 allow_empty = self .allow_empty_lines ,
0 commit comments