#!/usr/bin/env python

+import itertools
import os
from enum import Enum
from pathlib import Path
-from typing import Optional
+from typing import Any, Callable, Optional

import numpy as np
import tqdm
from modalities.dataloader.create_packed_data import EmbeddedStreamData, PackedDataGenerator, join_embedded_stream_data
from modalities.dataloader.dataset import PackedMemMapDatasetBase
from modalities.dataloader.large_file_lines_reader import LargeFileLinesReader
-from modalities.dataloader.preprocessing.chunking.create_chunks import Chunking
from modalities.dataloader.preprocessing.tokenization.tokenized_file_writer import TokenizedFileWriter
-from modalities.dataloader.shuffle_tokenized_data import TokenizedDataShuffler
from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter
+from modalities.preprocessing.create_chunks import Chunking
+from modalities.preprocessing.shuffle_data import DataShuffler
from modalities.registry.components import COMPONENTS
from modalities.registry.registry import Registry
from modalities.utils.logging import get_logger
@@ -77,6 +78,8 @@ def create_raw_data_index(
    Raises:
        ValueError: If the index file already exists.
    """
+    if src_path == index_path:
+        raise ValueError("Input and output index paths must be different.")
    index_path = LargeFileLinesReader.default_index_path(src_path, index_path)
    if index_path.exists():
        stop_process = enforce_file_existence_policy(index_path, file_existence_policy)
@@ -136,15 +139,77 @@ def shuffle_tokenized_data(
        file_existence_policy (FileExistencePolicy): Policy to apply when the output file already exists.
        seed (Optional[int]): The seed to use for shuffling.
    """
+    if input_data_path == output_data_path:
+        raise ValueError("Input and output file paths must be different.")
    if output_data_path.exists():
-        if not enforce_file_existence_policy(output_data_path, file_existence_policy):
+        stop_process = enforce_file_existence_policy(output_data_path, file_existence_policy)
+        if stop_process:
            return

-    TokenizedDataShuffler.shuffle_tokenized_data(
+    DataShuffler.shuffle_tokenized_data(
        input_data_path=input_data_path, output_data_path=output_data_path, batch_size=batch_size, seed=seed
    )


+def shuffle_jsonl_data(
+    input_data_path: Path,
+    output_data_path: Path,
+    file_existence_policy: FileExistencePolicy,
+    seed: Optional[int] = None,
+):
+    """Shuffles a JSONL file (.jsonl) and stores it on disc.
+
+    Args:
+        input_data_path (Path): File path to the jsonl data (.jsonl).
+        output_data_path (Path): File path to write the shuffled jsonl data.
+        file_existence_policy (FileExistencePolicy): Policy to apply when the output file already exists.
+        seed (Optional[int]): The seed to use for shuffling.
+    """
+    if input_data_path == output_data_path:
+        raise ValueError("Input and output file paths must be different.")
+    if output_data_path.exists():
+        stop_process = enforce_file_existence_policy(output_data_path, file_existence_policy)
+        if stop_process:
+            return
+
+    DataShuffler.shuffle_jsonl_data(input_data_path=input_data_path, output_data_path=output_data_path, seed=seed)
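+
+# Illustrative usage sketch (not part of this change): both shuffle entry points are called with
+# keyword arguments. The file paths, batch size and FileExistencePolicy member below are
+# assumptions for the example only.
+#
+#   shuffle_tokenized_data(
+#       input_data_path=Path("data/train.pbin"),
+#       output_data_path=Path("data/train_shuffled.pbin"),
+#       batch_size=1024,
+#       file_existence_policy=FileExistencePolicy.ERROR,
+#       seed=42,
+#   )
+#   shuffle_jsonl_data(
+#       input_data_path=Path("data/train.jsonl"),
+#       output_data_path=Path("data/train_shuffled.jsonl"),
+#       file_existence_policy=FileExistencePolicy.ERROR,
+#       seed=42,
+#   )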
+
+
+def create_filtered_tokenized_dataset(
+    input_data_path: Path,
+    filter_routine: Callable[[int], bool],
+    output_data_path: Path,
+    file_existence_policy: FileExistencePolicy,
+):
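+    """Filters a tokenized dataset and writes the surviving samples to a new file.
+
+    The filter_routine is called with each sample index; samples for which it returns
+    False are dropped from the output.
+
+    Args:
+        input_data_path (Path): File path to the tokenized input data.
+        filter_routine (Callable[[int], bool]): Predicate deciding for a sample index whether the sample is kept.
+        output_data_path (Path): File path to write the filtered tokenized data.
+        file_existence_policy (FileExistencePolicy): Policy to apply when the output file already exists.
+    """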
+    if input_data_path == output_data_path:
+        raise ValueError("Input and output file paths must be different.")
+    if output_data_path.exists():
+        stop_process = enforce_file_existence_policy(output_data_path, file_existence_policy)
+        if stop_process:
+            return
+
+    sample_key = "text"
+
+    dataset = PackedMemMapDatasetBase(raw_data_path=input_data_path, sample_key=sample_key, load_index=True)
+
+    # Both generators below run lazily.
+    filter_generator = (filter_routine(i) for i in range(len(dataset)))
+    # We lazily extract the samples, as TokenizedFileWriter.write_tokenized_dataset
+    # expects an iterator of numpy arrays.
+    samples_extractor = (dataset[i][sample_key] for i in range(len(dataset)))
+    # itertools.compress skips samples for which the filter_routine returns False.
+    # It also evaluates lazily.
+    dataset_filtered = itertools.compress(samples_extractor, filter_generator)
+
+    get_logger(name="main").info(f"Writing filtered dataset to {str(output_data_path)}...")
+    TokenizedFileWriter.write_tokenized_dataset(
+        tokenized_dataset=dataset_filtered,
+        tokenized_dataset_file_path=output_data_path,
+        token_size_in_bytes=dataset.token_size_in_bytes,
+    )
+    get_logger(name="main").info(f"Filtered dataset was successfully written to {str(output_data_path)}.")
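+
+# Illustrative usage sketch (not part of this change): any index-based predicate can be passed
+# as filter_routine. The paths and the FileExistencePolicy member below are assumptions for the
+# example only; a hypothetical routine keeping every second sample could look like this:
+#
+#   create_filtered_tokenized_dataset(
+#       input_data_path=Path("data/train.pbin"),
+#       filter_routine=lambda sample_idx: sample_idx % 2 == 0,
+#       output_data_path=Path("data/train_filtered.pbin"),
+#       file_existence_policy=FileExistencePolicy.ERROR,
+#   )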
+
+
def create_shuffled_dataset_chunk(
    file_path_list: list[Path],
    output_chunk_file_path: Path,
@@ -171,19 +236,22 @@ def create_shuffled_dataset_chunk(
        ValueError: If the chunk has no samples.
    """
    if output_chunk_file_path.exists():
-        if not enforce_file_existence_policy(output_chunk_file_path, file_existence_policy):
+        stop_process = enforce_file_existence_policy(output_chunk_file_path, file_existence_policy)
+        if stop_process:
            return

    samples = []
    token_size_in_bytes = None
    for file_path in tqdm.tqdm(file_path_list, desc=f"Loading file chunks of {chunk_id=}"):
+        if file_path == output_chunk_file_path:
+            raise ValueError("Input and output chunk file paths must be different.")
        dataset = PackedMemMapDatasetBase(raw_data_path=file_path, sample_key="text", load_index=True)
        if token_size_in_bytes is None:
            token_size_in_bytes = dataset.token_size_in_bytes
        elif token_size_in_bytes != dataset.token_size_in_bytes:
            raise ValueError("All datasets must have the same token size in bytes.")

-        file_samples: list[np.ndarray] = Chunking.get_file_chunk(
+        file_samples: list[np.ndarray] = Chunking.get_tokenized_file_chunk(
            dataset=dataset, num_chunks=num_chunks, chunk_id=chunk_id
        )
        samples.extend(file_samples)
@@ -207,6 +275,65 @@ def create_shuffled_dataset_chunk(
    get_logger(name="main").info(f"Chunk {chunk_id} was successfully written to {str(output_chunk_file_path)}.")


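+# Conceptual sketch (an assumption, not part of this change): the Chunking helpers used above and
+# below are expected to cut each file into num_chunks contiguous slices, roughly along the lines of
+#
+#   def get_chunk(items: list, num_chunks: int, chunk_id: int) -> list:
+#       chunk_size = (len(items) + num_chunks - 1) // num_chunks  # ceiling division
+#       return items[chunk_id * chunk_size : (chunk_id + 1) * chunk_size]
+#
+# which would also explain why a too large num_chunks can leave a trailing chunk empty and
+# trigger the "Chunk ... has no samples" error.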
+def create_shuffled_jsonl_dataset_chunk(
+    file_path_list: list[Path],
+    output_chunk_file_path: Path,
+    chunk_id: int,
+    num_chunks: int,
+    file_existence_policy: FileExistencePolicy,
+    global_seed: Optional[int] = None,
+):
+    """Creates a shuffled jsonl dataset chunk.
+
+    Given a dataset consisting of multiple jsonl files, this function
+    creates a shuffled dataset chunk for a given chunk id.
+    From each jsonl file, the respective chunk is extracted, shuffled
+    and written to a new jsonl file.
+
+    Args:
+        file_path_list (list[Path]): List of paths to the input jsonl files.
+        output_chunk_file_path (Path): Path to the output chunk which will be stored in jsonl format.
+        chunk_id (int): The id of the chunk to create.
+        num_chunks (int): The total number of chunks to create.
+        file_existence_policy (FileExistencePolicy): Policy to apply when the output chunk file already exists.
+        global_seed (Optional[int]): The global seed to use for shuffling.
+
+    Raises:
+        ValueError: If the chunk has no samples.
+    """
+    if output_chunk_file_path.exists():
+        stop_process = enforce_file_existence_policy(output_chunk_file_path, file_existence_policy)
+        if stop_process:
+            return
+
+    samples = []
+    for file_path in tqdm.tqdm(file_path_list, desc=f"Loading file chunks of {chunk_id=}"):
+        if file_path == output_chunk_file_path:
+            raise ValueError("Input and output chunk file paths must be different.")
+        with open(file_path, "rb") as f:
+            dataset = f.readlines()
+
+        file_samples: list[Any] = Chunking.get_jsonl_file_chunk(
+            dataset=dataset, num_chunks=num_chunks, chunk_id=chunk_id
+        )
+        samples.extend(file_samples)
+
+    if len(samples) == 0:
+        raise ValueError(
+            f"Chunk {chunk_id} has no samples. Please decrease the number of chunks to less than {chunk_id}."
+        )
+
+    # samples are shuffled in place
+    get_logger(name="main").info(f"Shuffling chunk {chunk_id}...")
+    seed = calculate_hashed_seed(input_data=[str(global_seed), str(chunk_id)]) if global_seed is not None else None
+    Chunking.shuffle_file_chunks_in_place(file_chunks=samples, seed=seed)
+
+    get_logger(name="main").info(f"Writing chunk {chunk_id} to {str(output_chunk_file_path)}...")
+    with open(output_chunk_file_path, "wb") as f:
+        for sample in samples:
+            f.write(sample)
+    get_logger(name="main").info(f"Chunk {chunk_id} was successfully written to {str(output_chunk_file_path)}.")
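+
+# Illustrative usage sketch (not part of this change): chunk creation is typically swept over all
+# chunk ids so that every output file receives its shuffled share of each input file. The paths,
+# chunk count and FileExistencePolicy member below are assumptions for the example only.
+#
+#   num_chunks = 8
+#   for chunk_id in range(num_chunks):
+#       create_shuffled_jsonl_dataset_chunk(
+#           file_path_list=[Path("data/part_0.jsonl"), Path("data/part_1.jsonl")],
+#           output_chunk_file_path=Path(f"chunks/chunk_{chunk_id}.jsonl"),
+#           chunk_id=chunk_id,
+#           num_chunks=num_chunks,
+#           file_existence_policy=FileExistencePolicy.SKIP,
+#           global_seed=42,
+#       )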
+
+
def pack_encoded_data(
    config_dict: dict,
    file_existence_policy: FileExistencePolicy,
@@ -234,7 +361,8 @@ def pack_encoded_data(
    )

    if components.settings.dst_path.exists():
-        if not enforce_file_existence_policy(components.settings.dst_path, file_existence_policy):
+        stop_process = enforce_file_existence_policy(components.settings.dst_path, file_existence_policy)
+        if stop_process:
            return

    generator = PackedDataGenerator(