Skip to content

Commit aea92b8

Browse files
committed
added batch size
1 parent fb3e7c0 commit aea92b8

1 file changed

Lines changed: 5 additions & 0 deletions

File tree

src/flow_preprocessor/preprocessing_logic/preprocess.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ def __init__(
3838
min_width_line: Optional[Union[int, float]] = None,
3939
min_height_line: Optional[Union[int, float]] = None,
4040
allow_empty_lines: bool = False,
41+
batch_size: int = 32,
4142
huggingface_new_repo_private: bool = False,
4243
split_train_ratio: Optional[float] = None,
4344
split_seed: int = 42,
@@ -57,6 +58,7 @@ def __init__(
5758
:param min_width_line: Minimum width of the line to be processed.
5859
:param min_height_line: Minimum height of the line to be processed.
5960
:param allow_empty_lines: Whether to allow empty lines extracted.
61+
:param batch_size: Batch size for dataset mapping (default 32).
6062
:param huggingface_new_repo_private: Whether the Hugging Face \
6163
repository is private (token needed).
6264
:param split_train_ratio: Ratio of training data to be split - if None, there is no split.
@@ -82,6 +84,7 @@ def __init__(
8284
self.abbrev: bool = abbrev
8385
self.stop_on_fail: bool = stop_on_fail
8486
self.allow_empty_lines: bool = allow_empty_lines
87+
self.batch_size: int = batch_size
8588
self.export_mode: str = export_mode
8689
if namespace is not None:
8790
self.namespace = {'pc': namespace}
@@ -164,6 +167,7 @@ async def segment_images(self) -> datasets.Dataset | None:
164167
export_mode='raw_xml',
165168
split_train=None,
166169
allow_empty=self.allow_empty_lines,
170+
batch_size=self.batch_size,
167171
)
168172
self.dataset = segmenter.segment_dataset(segmented_dataset, new_column_name='xml')
169173
self.converter = self.create_xmlconverter()
@@ -181,6 +185,7 @@ async def preprocess(self) -> None:
181185
split_seed=self.split_seed,
182186
split_shuffle=self.split_shuffle,
183187
mask_crop=self.crop,
188+
batch_size=self.batch_size,
184189
min_width=self.min_width_line,
185190
min_height=self.min_height_line,
186191
allow_empty=self.allow_empty_lines,

0 commit comments

Comments
 (0)