|
| 1 | +# Copyright 2026 The Orbax Authors. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +"""Benchmark for safetensors layout.""" |
| 16 | + |
import asyncio
import json
import time

from absl import app
from absl import flags
from absl import logging
from etils import epath
import jax
from jax.experimental import multihost_utils
import jax.sharding
import numpy as np
from orbax.checkpoint._src.arrays import numpy_utils
from orbax.checkpoint._src.path import async_path
from orbax.checkpoint.experimental.v1._src.layout import safetensors_layout
| 31 | + |
| 32 | + |
# Short aliases for the JAX sharding types used throughout this benchmark.
Mesh = jax.sharding.Mesh
NamedSharding = jax.sharding.NamedSharding
PartitionSpec = jax.sharding.PartitionSpec

# Fixed leading dimension of every benchmark tensor; the trailing dimension
# is derived from the requested tensor size in MB.
_ROWS = 128

FLAGS = flags.FLAGS

# Sizes (in MB) of the float32 tensors packed into the benchmark file.
_TENSOR_SIZES_MB = flags.DEFINE_list(
    "tensor_sizes_mb",
    ["256"],
    "List of tensor sizes in MB to include in the file.",
)
# Destination directory on GCS where the benchmark file is created and read.
_GCS_DIR = flags.DEFINE_string(
    "gcs_dir",
    None,
    "GCS directory for benchmark.",
    required=True,
)
# When set, skips the manual-read baseline and runs only the layout path.
_DISABLE_OLD_BENCHMARK = flags.DEFINE_boolean(
    "disable_old_benchmark",
    False,
    "If true, only run the new benchmark (layout).",
)
| 57 | + |
| 58 | + |
class TrackingFile:
  """Async file wrapper that counts total bytes read while doing real IO.

  Used by the manual-read baseline to report how many bytes it actually
  pulled from storage, for comparison against the layout implementation.
  """

  def __init__(self, f):
    # `f` is any async file-like object exposing `seek` and `read`.
    self.f = f
    self.bytes_read = 0

  async def seek(self, offset):
    """Seeks the wrapped file; propagates the wrapped `seek`'s return value."""
    # Return the underlying result (typically the new position) instead of
    # silently discarding it, matching the usual file-object contract.
    return await self.f.seek(offset)

  async def read(self, size=-1):
    """Reads up to `size` bytes (all remaining if -1), tracking the count."""
    data = await self.f.read(size)
    self.bytes_read += len(data)
    return data
| 74 | + |
| 75 | + |
async def _read_non_contiguous_slice(
    f, idx, stored_shape, stored_dtype, tensor_file_offset
):
  """Reads a non-contiguous slice of a stored tensor from a file.

  Args:
    f: Async file-like object supporting `seek` and `read`.
    idx: Tuple of resolved `slice` objects, one per dimension of
      `stored_shape`, selecting the shard to read. An empty tuple selects a
      single scalar element. NOTE(review): assumes every slice has step 1
      (i.e. output of `numpy_utils.resolve_slice`) — confirm.
    stored_shape: Shape of the full tensor as stored on disk (row-major).
    stored_dtype: NumPy dtype of the stored tensor.
    tensor_file_offset: Byte offset of the tensor's data within the file.

  Returns:
    A NumPy array holding the requested slice. In the empty-`idx` case the
    result is a flat 1-element array; callers reshape as needed.
  """
  if not idx:
    # Scalar case: read exactly one element from the tensor's start.
    await f.seek(tensor_file_offset)
    num_bytes = np.dtype(stored_dtype).itemsize
    data = await f.read(num_bytes)
    return np.frombuffer(data, dtype=stored_dtype)

  # Calculate global (row-major) byte strides for the stored shape.
  itemsize = np.dtype(stored_dtype).itemsize
  global_strides = [itemsize] * len(stored_shape)
  for i in range(len(stored_shape) - 2, -1, -1):
    global_strides[i] = global_strides[i + 1] * stored_shape[i + 1]

  shard_shape = numpy_utils.slice_shape(idx)
  out_array = np.empty(shard_shape, dtype=stored_dtype)

  # Recursively walk the leading dimensions; only the innermost dimension is
  # contiguous on disk, so each innermost run becomes a single seek + read.
  async def _read_slice_recursively(
      dim: int, base_offset: int, out_idx: tuple[int, ...]
  ):
    s = idx[dim]
    if dim == len(stored_shape) - 1:
      # Innermost dimension: one contiguous run of (stop - start) elements.
      start = base_offset + s.start * global_strides[dim]
      num_bytes = (s.stop - s.start) * itemsize
      await f.seek(tensor_file_offset + start)
      data = await f.read(num_bytes)

      # Assign the chunk of bytes into the correct slice of the output array
      out_array[out_idx] = np.frombuffer(data, dtype=stored_dtype)
      return

    # Recursively read the slice for each dimension.
    for out_i, i in enumerate(range(s.start, s.stop)):
      offset = base_offset + i * global_strides[dim]
      await _read_slice_recursively(dim + 1, offset, out_idx + (out_i,))

  # Start the recursive reading process from the first dimension.
  await _read_slice_recursively(dim=0, base_offset=0, out_idx=())
  return out_array
| 118 | + |
| 119 | + |
async def _benchmark_old(file_path, sharding, tensor_sizes: list[int]):
  """Benchmarks the manual (baseline) safetensors read path.

  Opens the file once and, for each tensor and each addressable device,
  reads only the byte ranges belonging to that device's shard, then
  assembles the shards into global arrays.

  Args:
    file_path: Path to the safetensors file (layout produced by
      `_create_file_if_needed`: float32 tensors laid out back-to-back).
    sharding: `NamedSharding` applied to every tensor.
    tensor_sizes: Sizes in MB of the tensors, in file order.

  Returns:
    Tuple of (list of restored global jax.Arrays, bytes read by this host as
    np.int64). The byte count also includes the 8-byte header-size read.
  """
  logging.info("Starting _benchmark_old for %s", file_path)
  async with async_path.open_file(file_path, mode="rb") as raw_f:
    f = TrackingFile(raw_f)
    target_dtype = np.float32

    # Read header size from file: first 8 bytes are a little-endian integer
    # (safetensors format); the data section starts right after the header.
    header_size_bytes = await f.read(8)
    header_size = int.from_bytes(header_size_bytes, byteorder="little")
    start_data_offset = 8 + header_size

    current_offset = 0
    restored_tensors = []
    for i, size_mb in enumerate(tensor_sizes):
      # Tensor shape is reconstructed from the size: float32 (4 bytes) with
      # a fixed leading dimension of _ROWS.
      num_elements = size_mb * 1024 * 1024 // 4
      rows = _ROWS
      cols = num_elements // rows
      target_shape = (rows, cols)
      tensor_size_bytes = num_elements * 4
      tensor_offset = start_data_offset + current_offset
      current_offset += tensor_size_bytes

      device_indices_map = sharding.addressable_devices_indices_map(
          target_shape
      )
      logging.info(
          "Reading shards for tensor_%d for %d addressable devices",
          i,
          len(sharding.addressable_devices),
      )
      device_map = []
      # Guarantee strict iteration order matching addressable_devices
      for device in sharding.addressable_devices:
        idx = device_indices_map[device]
        resolved_idx = numpy_utils.resolve_slice(idx, target_shape)
        shard_shape = numpy_utils.slice_shape(resolved_idx)

        shard_np = await _read_non_contiguous_slice(
            f, resolved_idx, target_shape, target_dtype, tensor_offset
        )
        # Reshape also covers the scalar case, which returns a flat array.
        shard_np = shard_np.reshape(shard_shape)
        device_map.append(jax.device_put(shard_np, device))

      logging.info(
          "Assembling device arrays into global array for tensor_%d", i
      )
      restored = jax.make_array_from_single_device_arrays(
          target_shape, sharding, device_map
      )
      restored_tensors.append(restored)

    logging.info("Blocking until ready (old)")
    for restored in restored_tensors:
      jax.block_until_ready(restored)
    logging.info("Finished _benchmark_old")

    return restored_tensors, np.int64(f.bytes_read)
| 178 | + |
| 179 | + |
async def _benchmark_current(file_path, sharding, tensor_sizes: list[int]):
  """Benchmarks the new SafetensorsLayout implementation."""
  logging.info("Starting _benchmark_current (new) for %s", file_path)
  layout = safetensors_layout.SafetensorsLayout()

  def _abstract_array(size_mb):
    # float32 => 4 bytes per element; leading dimension fixed at _ROWS.
    cols = (size_mb * 1024 * 1024 // 4) // _ROWS
    return jax.ShapeDtypeStruct(
        shape=(_ROWS, cols), dtype=np.float32, sharding=sharding
    )

  abstract_pytree = {
      f"tensor_{i}": _abstract_array(size_mb)
      for i, size_mb in enumerate(tensor_sizes)
  }

  restore_fn = await layout.load_pytree(
      file_path, abstract_pytree=abstract_pytree
  )
  restored_pytree = await restore_fn

  logging.info("Blocking until ready (current)")
  for key in abstract_pytree:
    jax.block_until_ready(restored_pytree[key])
  logging.info("Finished _benchmark_current")

  # Per-host byte estimate: total payload divided evenly across hosts.
  total_size_bytes = sum(size * 1024 * 1024 for size in tensor_sizes)
  bytes_read = total_size_bytes // jax.process_count()

  return restored_pytree, np.int64(bytes_read)
| 209 | + |
| 210 | + |
async def _create_file_if_needed(
    path: epath.Path,
    tensor_sizes: list[int],
):
  """Creates a dummy safetensors file at `path` if it doesn't already exist.

  Only runs on process 0; all other processes return immediately and must
  synchronize with process 0 before reading the file.

  Args:
    path: Destination path for the safetensors file.
    tensor_sizes: Sizes in MB of the float32 tensors to include, in order.
  """
  if jax.process_index() != 0:
    return

  # Build the safetensors header: one float32 entry per requested size,
  # with the data sections laid out back-to-back.
  header_dict = {}
  current_offset = 0
  for i, size_mb in enumerate(tensor_sizes):
    num_elements = size_mb * 1024 * 1024 // 4
    rows = _ROWS
    cols = num_elements // rows
    shape = [rows, cols]
    size_bytes = num_elements * 4
    header_dict[f"tensor_{i}"] = {
        "dtype": "F32",
        "shape": shape,
        "data_offsets": [current_offset, current_offset + size_bytes],
    }
    current_offset += size_bytes

  header_json = json.dumps(header_dict).encode("utf-8")

  # Pad header to multiple of 8 bytes; trailing spaces are valid JSON
  # whitespace, which safetensors uses for alignment padding.
  padding_len = (8 - len(header_json) % 8) % 8
  header_json += b" " * padding_len

  header_size = len(header_json)
  header_size_bytes = header_size.to_bytes(8, byteorder="little")

  total_bytes_to_write = current_offset
  expected_file_size = 8 + header_size + total_bytes_to_write

  # Skip the (potentially very large) write if a correctly-sized file is
  # already in place from a previous run.
  if path.exists() and path.stat().length == expected_file_size:
    logging.info(
        "File %s already exists with correct size, skipping creation.", path
    )
    return

  logging.info("Creating dummy file %s with size %d", path, expected_file_size)
  with path.open("wb") as f:
    f.write(header_size_bytes)
    f.write(header_json)
    # Write the data section as zeros in fixed-size chunks. Allocate the
    # zero buffer once and reuse it, rather than re-allocating up to 100 MB
    # on every loop iteration as a fresh `b"\0" * n` would.
    chunk_size = 1024 * 1024 * 100
    zero_chunk = b"\0" * min(chunk_size, total_bytes_to_write)
    bytes_written = 0
    while bytes_written < total_bytes_to_write:
      write_size = min(chunk_size, total_bytes_to_write - bytes_written)
      f.write(zero_chunk[:write_size])
      bytes_written += write_size
| 262 | + |
| 263 | + |
async def run_benchmarks(sharding_type, tensor_sizes: list[int]):
  """Runs the old and new benchmarks for one sharding type and size list.

  Args:
    sharding_type: "leading" shards rows across the "data" mesh axis; any
      other value shards columns across the "model" axis.
    tensor_sizes: Sizes in MB of the tensors in the benchmark file.
  """
  if not _GCS_DIR.value:
    return

  dir_path = epath.Path(_GCS_DIR.value)
  if jax.process_index() == 0 and not dir_path.exists():
    dir_path.mkdir(parents=True, exist_ok=True)

  # Ensure directory is created by rank 0 before others proceed.
  # NOTE: `jax.experimental.multihost_utils` is not exposed by a plain
  # `import jax`, so it must be imported explicitly (see module imports) —
  # attribute access on `jax.experimental` would raise AttributeError.
  multihost_utils.sync_global_devices("mkdir")

  # Build a 2-D device mesh: half the devices on "data", two on "model".
  devices = jax.devices()
  mesh_shape = (len(devices) // 2, 2)
  mesh = Mesh(np.array(devices).reshape(mesh_shape), ("data", "model"))

  if sharding_type == "leading":
    sharding_spec = PartitionSpec("data", None)
  else:
    sharding_spec = PartitionSpec(None, "model")

  sharding = NamedSharding(mesh, sharding_spec)

  sizes_str = "_".join(map(str, tensor_sizes))
  file_path = (
      dir_path / f"benchmark_v2_{sharding_type}_{sizes_str}mb.safetensors"
  )
  await _create_file_if_needed(file_path, tensor_sizes)
  multihost_utils.sync_global_devices("create_file")

  t_old = 0.0
  bytes_old_total = 0
  num_hosts = jax.process_count()

  if not _DISABLE_OLD_BENCHMARK.value:
    t0 = time.time()
    _, bytes_old = await _benchmark_old(file_path, sharding, tensor_sizes)
    t_old = time.time() - t0
    # Assumes each host reads a symmetric share; scale to a global total.
    bytes_old_total = int(bytes_old) * num_hosts

  multihost_utils.sync_global_devices("sync_between_benchmarks")

  t0 = time.time()
  _, bytes_new = await _benchmark_current(file_path, sharding, tensor_sizes)
  t_new = time.time() - t0

  bytes_new_total = int(bytes_new) * num_hosts

  # Only rank 0 reports the results.
  if jax.process_index() == 0:
    res = "\n=======================================================\n"
    res += (
        f"Results for {sharding_type} sharding, sizes: {tensor_sizes} MB, "
        f"{num_hosts} hosts, gcs storage\n"
    )
    if not _DISABLE_OLD_BENCHMARK.value:
      res += (
          f"Old (Manual): {t_old*1000:.2f}ms, Bytes read:"
          f" {bytes_old_total / 1024 / 1024:.2f}MB\n"
      )
    res += (
        f"New (Layout): {t_new*1000:.2f}ms, Bytes read:"
        f" {bytes_new_total / 1024 / 1024:.2f}MB\n"
    )
    res += "=======================================================\n"
    logging.info(res)
| 331 | + |
| 332 | + |
def main(_):
  """Entry point: runs the benchmark for both sharding variants."""
  sizes = [int(s) for s in _TENSOR_SIZES_MB.value]
  for kind in ("leading", "trailing"):
    asyncio.run(run_benchmarks(kind, sizes))


if __name__ == "__main__":
  app.run(main)