
Commit e74f5fb

chore: Merge remote-tracking branch 'origin/main' into conversion_modalities_to_huggingface
2 parents: 000a9fa + 5525864

16 files changed: 629 additions, 26 deletions


.github/workflows/linting.yml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.11"]
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

src/modalities/__main__.py

Lines changed: 132 additions & 0 deletions
@@ -20,9 +20,11 @@
     convert_pytorch_to_hf_checkpoint,
     create_raw_data_index,
     create_shuffled_dataset_chunk,
+    create_shuffled_jsonl_dataset_chunk,
     generate_text,
     merge_packed_data_files,
     pack_encoded_data,
+    shuffle_jsonl_data,
     shuffle_tokenized_data,
 )
 from modalities.batch import EvaluationResultBatch
@@ -308,6 +310,88 @@ def CMD_create_shuffled_dataset_chunk(
     )


+@data.command(name="create_shuffled_jsonl_chunk")
+@click.option(
+    "--input_file_list_path",
+    type=Path,
+    required=True,
+    help="Path to the file containing the list of jsonl files to be chunked.",
+)
+@click.option(
+    "--input_data_root_path",
+    type=Path,
+    required=True,
+    help="Directory path to the root of the input data.",
+)
+@click.option(
+    "--output_chunk_file_path",
+    type=Path,
+    required=True,
+    help="Path where the chunked jsonl dataset will be saved.",
+)
+@click.option(
+    "--chunk_id",
+    type=int,
+    required=True,
+    help="The id of the chunk to be created.",
+)
+@click.option(
+    "--num_chunks",
+    type=int,
+    required=True,
+    help="The number of chunks to create.",
+)
+@click.option(
+    "--file_existence_policy",
+    type=click.Choice([policy.value for policy in FileExistencePolicy]),
+    default=FileExistencePolicy.ERROR.value,
+    help="Policy for handling existing files.",
+)
+@click.option(
+    "--global_seed",
+    type=int,
+    default=None,
+    help="The global seed to use for shuffling.",
+)
+def CMD_create_shuffled_jsonl_dataset_chunk(
+    input_file_list_path: Path,
+    input_data_root_path: Path,
+    output_chunk_file_path: Path,
+    chunk_id: int,
+    num_chunks: int,
+    file_existence_policy: FileExistencePolicy,
+    global_seed: Optional[int],
+):
+    """Utility to create a shuffled jsonl dataset chunk from a list of jsonl files.
+
+    Args:
+        input_file_list_path (Path): Path to file that contains relative paths of
+            jsonl files to be chunked and shuffled (one per line).
+        input_data_root_path (Path): Path to the root directory that contains the jsonl files to be chunked.
+        output_chunk_file_path (Path): File path to the chunked jsonl dataset.
+        chunk_id (int): The id of the chunk to be created.
+        num_chunks (int): Number of chunks in total.
+        file_existence_policy (FileExistencePolicy): Policy for handling existing files.
+        global_seed (Optional[int]): The global seed to use for shuffling.
+    """
+    file_existence_policy = FileExistencePolicy(file_existence_policy)
+
+    with open(input_file_list_path, "r", encoding="utf-8") as f:
+        file_path_list = f.readlines()
+    file_path_list = [
+        input_data_root_path / Path(file_path.strip()).with_suffix(".jsonl") for file_path in file_path_list
+    ]
+
+    create_shuffled_jsonl_dataset_chunk(
+        file_path_list=file_path_list,
+        output_chunk_file_path=output_chunk_file_path,
+        chunk_id=chunk_id,
+        num_chunks=num_chunks,
+        file_existence_policy=file_existence_policy,
+        global_seed=global_seed,
+    )
+
+
 @data.command(name="merge_packed_data")
 @click.argument("src_paths", type=click.types.Path(exists=True, path_type=Path), nargs=-1, required=True)
 @click.argument("target_path", type=click.types.Path(file_okay=False, dir_okay=False, path_type=Path))
@@ -379,6 +463,54 @@ def CMD_shuffle_tokenized_data(
     )


+@data.command(name="shuffle_jsonl_data")
+@click.option(
+    "--input_data_path",
+    type=click_pathlib.Path(exists=True),
+    required=True,
+    help="Path to a jsonl file (.jsonl).",
+)
+@click.option(
+    "--output_data_path",
+    type=click_pathlib.Path(exists=False),
+    required=True,
+    help="Path to write the shuffled jsonl data (.jsonl).",
+)
+@click.option(
+    "--file_existence_policy",
+    type=click.Choice([policy.value for policy in FileExistencePolicy]),
+    default=FileExistencePolicy.ERROR.value,
+    help="Policy for handling existing files.",
+)
+@click.option(
+    "--seed",
+    type=int,
+    default=None,
+    help="The seed for shuffling the data.",
+)
+def CMD_shuffle_jsonl_data(
+    input_data_path: Path, output_data_path: Path, file_existence_policy, seed: Optional[int]
+) -> None:
+    """Entrypoint for shuffling jsonl data.
+
+    Args:
+        input_data_path (Path): The path to the input jsonl data (.jsonl).
+        output_data_path (Path): File path to write the shuffled jsonl data (.jsonl).
+        file_existence_policy (FileExistencePolicy): Policy for handling existing files.
+        seed (Optional[int]): The seed for shuffling the data. Default is None.
+    Returns:
+        None
+    """
+    file_existence_policy = FileExistencePolicy(file_existence_policy)
+
+    shuffle_jsonl_data(
+        input_data_path=input_data_path,
+        output_data_path=output_data_path,
+        file_existence_policy=file_existence_policy,
+        seed=seed,
+    )
+
+
 class Main:
     """Main class that orchestrates the training process."""
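
Note (not part of the commit): the two new `data` subcommands registered above can be exercised programmatically with click's test runner, roughly as sketched below. The sketch assumes the `data` click group referenced by the `@data.command` decorators is importable from `modalities.__main__`; all file paths are hypothetical placeholders.

# Minimal sketch, assuming the `data` group is importable from modalities.__main__.
# Paths are hypothetical placeholders and would need to exist for the commands to succeed.
from click.testing import CliRunner

from modalities.__main__ import data

runner = CliRunner()

# Shuffle a single jsonl file with a fixed seed.
result = runner.invoke(
    data,
    [
        "shuffle_jsonl_data",
        "--input_data_path", "/tmp/example/train.jsonl",
        "--output_data_path", "/tmp/example/train_shuffled.jsonl",
        "--seed", "42",
    ],
)
print(result.exit_code, result.output)

# Build chunk 0 of 4 from a list of jsonl files (one relative path per line in the list file).
result = runner.invoke(
    data,
    [
        "create_shuffled_jsonl_chunk",
        "--input_file_list_path", "/tmp/example/file_list.txt",
        "--input_data_root_path", "/tmp/example/data",
        "--output_chunk_file_path", "/tmp/example/chunks/chunk_0.jsonl",
        "--chunk_id", "0",
        "--num_chunks", "4",
        "--global_seed", "42",
    ],
)
print(result.exit_code, result.output)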

src/modalities/api.py

Lines changed: 136 additions & 8 deletions
@@ -1,9 +1,10 @@
 #!/usr/bin/env python

+import itertools
 import os
 from enum import Enum
 from pathlib import Path
-from typing import Optional
+from typing import Any, Callable, Optional

 import numpy as np
 import tqdm
@@ -17,10 +18,10 @@
 from modalities.dataloader.create_packed_data import EmbeddedStreamData, PackedDataGenerator, join_embedded_stream_data
 from modalities.dataloader.dataset import PackedMemMapDatasetBase
 from modalities.dataloader.large_file_lines_reader import LargeFileLinesReader
-from modalities.dataloader.preprocessing.chunking.create_chunks import Chunking
 from modalities.dataloader.preprocessing.tokenization.tokenized_file_writer import TokenizedFileWriter
-from modalities.dataloader.shuffle_tokenized_data import TokenizedDataShuffler
 from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter
+from modalities.preprocessing.create_chunks import Chunking
+from modalities.preprocessing.shuffle_data import DataShuffler
 from modalities.registry.components import COMPONENTS
 from modalities.registry.registry import Registry
 from modalities.utils.logging import get_logger
@@ -77,6 +78,8 @@ def create_raw_data_index(
     Raises:
         ValueError: If the index file already exists.
     """
+    if src_path == index_path:
+        raise ValueError("Input and output index paths must be different.")
     index_path = LargeFileLinesReader.default_index_path(src_path, index_path)
     if index_path.exists():
        stop_process = enforce_file_existence_policy(index_path, file_existence_policy)
@@ -136,15 +139,77 @@ def shuffle_tokenized_data(
         file_existence_policy (FileExistencePolicy): Policy to apply when the output file already exists.
         seed (Optional[int]): The seed to use for shuffling.
     """
+    if input_data_path == output_data_path:
+        raise ValueError("Input and output file paths must be different.")
     if output_data_path.exists():
-        if not enforce_file_existence_policy(output_data_path, file_existence_policy):
+        stop_process = enforce_file_existence_policy(output_data_path, file_existence_policy)
+        if stop_process:
             return

-    TokenizedDataShuffler.shuffle_tokenized_data(
+    DataShuffler.shuffle_tokenized_data(
         input_data_path=input_data_path, output_data_path=output_data_path, batch_size=batch_size, seed=seed
     )


+def shuffle_jsonl_data(
+    input_data_path: Path,
+    output_data_path: Path,
+    file_existence_policy: FileExistencePolicy,
+    seed: Optional[int] = None,
+):
+    """Shuffles a JSONL file (.jsonl) and stores it on disc.
+
+    Args:
+        input_data_path (Path): File path to the jsonl data (.jsonl).
+        output_data_path (Path): File path to write the shuffled jsonl data.
+        file_existence_policy (FileExistencePolicy): Policy to apply when the output file already exists.
+        seed (Optional[int]): The seed to use for shuffling.
+    """
+    if input_data_path == output_data_path:
+        raise ValueError("Input and output file paths must be different.")
+    if output_data_path.exists():
+        stop_process = enforce_file_existence_policy(output_data_path, file_existence_policy)
+        if stop_process:
+            return
+
+    DataShuffler.shuffle_jsonl_data(input_data_path=input_data_path, output_data_path=output_data_path, seed=seed)
+
+
+def create_filtered_tokenized_dataset(
+    input_data_path: Path,
+    filter_routine: Callable[[int], bool],
+    output_data_path: Path,
+    file_existence_policy: FileExistencePolicy,
+):
+    if input_data_path == output_data_path:
+        raise ValueError("Input and output file paths must be different.")
+    if output_data_path.exists():
+        stop_process = enforce_file_existence_policy(output_data_path, file_existence_policy)
+        if stop_process:
+            return
+
+    sample_key = "text"
+
+    dataset = PackedMemMapDatasetBase(raw_data_path=input_data_path, sample_key=sample_key, load_index=True)
+
+    # Both generators below run lazily.
+    filter_generator = (filter_routine(i) for i in range(len(dataset)))
+    # We lazily extract the samples, as the TokenizedFileWriter.write_tokenized_dataset
+    # expects an iterator of numpy arrays.
+    samples_extrator = (dataset[i][sample_key] for i in range(len(dataset)))
+    # Automatically skips samples for which the filter_routine returns False.
+    # Also evaluates lazily.
+    dataset_filtered = itertools.compress(samples_extrator, filter_generator)
+
+    get_logger(name="main").info(f"Writing filtered dataset to {str(output_data_path)} ...")
+    TokenizedFileWriter.write_tokenized_dataset(
+        tokenized_dataset=dataset_filtered,
+        tokenized_dataset_file_path=output_data_path,
+        token_size_in_bytes=dataset.token_size_in_bytes,
+    )
+    get_logger(name="main").info(f"Filtered dataset was successfully written to {str(output_data_path)}.")
+
+
 def create_shuffled_dataset_chunk(
     file_path_list: list[Path],
     output_chunk_file_path: Path,
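
As a rough illustration (not part of the commit), the two helpers added in this hunk could be called directly from Python as sketched below. The paths are placeholders, the filter simply keeps every second sample by index, and the sketch assumes `FileExistencePolicy` is importable from `modalities.api` alongside the functions shown above.

# Minimal sketch, assuming FileExistencePolicy is exposed from modalities.api as used in the signatures above.
from pathlib import Path

from modalities.api import FileExistencePolicy, create_filtered_tokenized_dataset, shuffle_jsonl_data

# Shuffle a jsonl file deterministically; ERROR raises if the output already exists.
shuffle_jsonl_data(
    input_data_path=Path("/tmp/example/train.jsonl"),
    output_data_path=Path("/tmp/example/train_shuffled.jsonl"),
    file_existence_policy=FileExistencePolicy.ERROR,
    seed=42,
)

# Keep every second sample of a packed/tokenized dataset; the filter receives the sample index.
create_filtered_tokenized_dataset(
    input_data_path=Path("/tmp/example/train.pbin"),
    filter_routine=lambda idx: idx % 2 == 0,
    output_data_path=Path("/tmp/example/train_filtered.pbin"),
    file_existence_policy=FileExistencePolicy.ERROR,
)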
@@ -171,19 +236,22 @@ def create_shuffled_dataset_chunk(
         ValueError: If the chunk has no samples.
     """
     if output_chunk_file_path.exists():
-        if not enforce_file_existence_policy(output_chunk_file_path, file_existence_policy):
+        stop_process = enforce_file_existence_policy(output_chunk_file_path, file_existence_policy)
+        if stop_process:
             return

     samples = []
     token_size_in_bytes = None
     for file_path in tqdm.tqdm(file_path_list, desc=f"Loading file chunks of {chunk_id=}"):
+        if file_path == output_chunk_file_path:
+            raise ValueError("Input and output chunk file paths must be different.")
         dataset = PackedMemMapDatasetBase(raw_data_path=file_path, sample_key="text", load_index=True)
         if token_size_in_bytes is None:
             token_size_in_bytes = dataset.token_size_in_bytes
         elif token_size_in_bytes != dataset.token_size_in_bytes:
             raise ValueError("All datasets must have the same token size in bytes.")

-        file_samples: list[np.ndarray] = Chunking.get_file_chunk(
+        file_samples: list[np.ndarray] = Chunking.get_tokenized_file_chunk(
             dataset=dataset, num_chunks=num_chunks, chunk_id=chunk_id
         )
         samples.extend(file_samples)
@@ -207,6 +275,65 @@ def create_shuffled_dataset_chunk(
     get_logger(name="main").info(f"Chunk {chunk_id} was successfully written to {str(output_chunk_file_path)}.")


+def create_shuffled_jsonl_dataset_chunk(
+    file_path_list: list[Path],
+    output_chunk_file_path: Path,
+    chunk_id: int,
+    num_chunks: int,
+    file_existence_policy: FileExistencePolicy,
+    global_seed: Optional[int] = None,
+):
+    """Creates a shuffled jsonl dataset chunk.
+    Given a dataset consisting of multiple jsonl files, this function
+    creates a shuffled dataset chunk for a given chunk id.
+    From each jsonl file, the respective chunk is extracted, shuffled
+    and written to a new jsonl file.
+
+    Args:
+        file_path_list (list[Path]): List of paths to the input jsonl files.
+        output_chunk_file_path (Path): Path to the output chunk which will be stored in jsonl format.
+        chunk_id (int): The id of the chunk to create.
+        num_chunks (int): The total number of chunks to create.
+        file_existence_policy (FileExistencePolicy): Policy to apply when the output chunk file already exists.
+        global_seed (Optional[int]): The global seed to use for shuffling.
+
+    Raises:
+        ValueError: If the chunk has no samples.
+    """
+    if output_chunk_file_path.exists():
+        stop_process = enforce_file_existence_policy(output_chunk_file_path, file_existence_policy)
+        if stop_process:
+            return
+
+    samples = []
+    for file_path in tqdm.tqdm(file_path_list, desc=f"Loading file chunks of {chunk_id=}"):
+        if file_path == output_chunk_file_path:
+            raise ValueError("Input and output chunk file paths must be different.")
+        with open(file_path, "rb") as f:
+            dataset = f.readlines()
+
+        file_samples: list[Any] = Chunking.get_jsonl_file_chunk(
+            dataset=dataset, num_chunks=num_chunks, chunk_id=chunk_id
+        )
+        samples.extend(file_samples)
+
+    if len(samples) == 0:
+        raise ValueError(
+            f"Chunk {chunk_id} has no samples. Please decrease the number of chunks to less than {chunk_id}."
+        )
+
+    # samples are shuffled in place
+    get_logger(name="main").info(f"Shuffling chunk {chunk_id} ...")
+    seed = calculate_hashed_seed(input_data=[str(global_seed), str(chunk_id)]) if global_seed is not None else None
+    Chunking.shuffle_file_chunks_in_place(file_chunks=samples, seed=seed)
+
+    get_logger(name="main").info(f"Writing chunk {chunk_id} to {str(output_chunk_file_path)} ...")
+    with open(output_chunk_file_path, "wb") as f:
+        for sample in samples:
+            f.write(sample)
+    get_logger(name="main").info(f"Chunk {chunk_id} was successfully written to {str(output_chunk_file_path)}.")
+
+
 def pack_encoded_data(
     config_dict: dict,
     file_existence_policy: FileExistencePolicy,
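
To make the intended use of this helper concrete, a minimal sketch (not part of the commit) that builds every chunk of a small jsonl dataset might look as follows; the file locations and the `FileExistencePolicy` import path are assumptions.

# Minimal sketch: split three jsonl files into 4 shuffled chunks by calling the helper once per chunk id.
from pathlib import Path

from modalities.api import FileExistencePolicy, create_shuffled_jsonl_dataset_chunk

jsonl_files = [Path(f"/tmp/example/data/part_{i}.jsonl") for i in range(3)]
num_chunks = 4

for chunk_id in range(num_chunks):
    # Each call extracts the chunk_id-th slice from every input file, shuffles it
    # (seeded per chunk via the global seed), and writes one jsonl chunk file.
    create_shuffled_jsonl_dataset_chunk(
        file_path_list=jsonl_files,
        output_chunk_file_path=Path(f"/tmp/example/chunks/chunk_{chunk_id}.jsonl"),
        chunk_id=chunk_id,
        num_chunks=num_chunks,
        file_existence_policy=FileExistencePolicy.ERROR,
        global_seed=42,
    )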
@@ -234,7 +361,8 @@ def pack_encoded_data(
234361
)
235362

236363
if components.settings.dst_path.exists():
237-
if not enforce_file_existence_policy(components.settings.dst_path, file_existence_policy):
364+
stop_process = enforce_file_existence_policy(components.settings.dst_path, file_existence_policy)
365+
if stop_process:
238366
return
239367

240368
generator = PackedDataGenerator(

src/modalities/config/instantiation_models.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ def _validate_all_paths(cls, values: dict[str, Any]) -> dict[str, Any]:

     class WarmstartCheckpointPaths(BaseModel):
         model_checkpoint_path: Path
-        optimizer_checkpoint_path: Path
+        optimizer_checkpoint_path: Optional[Path] = None

     experiment_id: str
     config_file_path: FilePath
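
The effect of this change is that a warmstart configuration no longer has to name an optimizer checkpoint. A self-contained sketch of the same two fields (a mirror for illustration, not the actual modalities class) shows the validation behavior:

# Standalone mirror of the diffed fields; optimizer_checkpoint_path may now be omitted and defaults to None.
from pathlib import Path
from typing import Optional

from pydantic import BaseModel


class WarmstartCheckpointPaths(BaseModel):
    model_checkpoint_path: Path
    optimizer_checkpoint_path: Optional[Path] = None


paths = WarmstartCheckpointPaths(model_checkpoint_path=Path("/tmp/example/model.bin"))
print(paths.optimizer_checkpoint_path)  # None -- previously this field was required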

src/modalities/dataloader/create_packed_data.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def __init__(
             processing_batch_size (int): Size of the batches that the workers process.
             raw_samples_queue_size (int): Maximum size of the raw samples queue.
             processed_samples_queue_size (int): Maximum size of the processed samples queue.
-            index_path (Optional[FilePath], optional): Path to an index file,
+            index_path (Optional[FilePath]): Path to an index file,
                 which indicates the start character position
                 and length of samples given in `src_path`. If not defined, an index file next to `src_path` is picked,
                 by replacing its suffix with ".idx". Defaults to None.

src/modalities/dataloader/preprocessing/shuffling/__init__.py

Whitespace-only changes.
