diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index e9b8bc85a..f992636d8 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -431,6 +431,18 @@ ArgOptions SDContextParams::get_options() {
          "--rpc-servers",
          "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
          &rpc_servers},
+        {"",
+         "--multi-gpu-mode",
+         "how to split a too-large DiT across GPUs (auto-fit): "
+         "row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off "
+         "(default: row)",
+         &multi_gpu_mode},
+        {"",
+         "--fit-compute-reserve",
+         "auto-fit: per-component compute-buffer reserve in MiB as a component "
+         "map, e.g. dit=2048,vae=1024,cond=512 (missing keys keep the built-in "
+         "defaults)",
+         &fit_compute_reserve},
         {"",
          "--max-vram",
          "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
@@ -447,6 +459,10 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-t5-mask-pad",
          "t5 mask pad size of chroma",
          &chroma_t5_mask_pad},
+        {"",
+         "--fit-target",
+         "auto-fit: MiB of free memory to leave on each GPU (default: 512)",
+         &auto_fit_target_mb},
     };
 
     options.bool_options = {
@@ -518,6 +534,24 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-enable-t5-mask",
          "enable t5 mask for chroma",
          true, &chroma_use_t5_mask},
+        {"",
+         "--auto-fit",
+         "automatically pick DiT/VAE/Conditioner device placements based on "
+         "free GPU memory (default ON)",
+         true, &auto_fit},
+        {"",
+         "--no-auto-fit",
+         "disable auto-fit and use the explicit --backend / --params-backend flags",
+         false, &auto_fit},
+        {"",
+         "--no-multi-gpu",
+         "auto-fit: keep all components on a single GPU when they fit "
+         "(by default, multi-GPU placements are preferred to balance load)",
+         false, &auto_multi_gpu},
+        {"",
+         "--fit-dry-run",
+         "auto-fit: print the computed plan and exit without loading models",
+         true, &auto_fit_dry_run},
     };
 
     auto on_type_arg = [&](int argc, const char** argv, int index) {
@@ -616,6 +650,15 @@ ArgOptions SDContextParams::get_options() {
          "but it usually offers faster inference speed and, in some cases, lower memory usage. "
          "The at_runtime mode, on the other hand, is exactly the opposite.",
          on_lora_apply_mode_arg},
+        {"",
+         "--list-devices",
+         "list available ggml backend devices (one per line, "
+         "name<TAB>description) and exit",
+         [](int /*argc*/, const char** /*argv*/, int /*index*/) {
+             sd_list_devices();
+             std::exit(0);
+             return 0;
+         }},
     };
 
     return options;
@@ -760,9 +803,12 @@ std::string SDContextParams::to_string() const {
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
-        << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
-        << "  clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
-        << "  vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
+        << "  auto_fit: " << (auto_fit ? "true" : "false") << ",\n"
+        << "  auto_fit_target_mb: " << auto_fit_target_mb << ",\n"
+        << "  auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n"
+        << "  fit_compute_reserve: \"" << fit_compute_reserve << "\",\n"
+        << "  auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n"
+        << "  multi_gpu_mode: \"" << multi_gpu_mode << "\",\n"
         << "  flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
@@ -837,6 +883,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
     sd_ctx_params.stream_layers                   = stream_layers;
     sd_ctx_params.backend                         = effective_backend.c_str();
     sd_ctx_params.params_backend                  = effective_params_backend.c_str();
+    sd_ctx_params.auto_fit                        = auto_fit;
+    sd_ctx_params.auto_fit_target_mb              = auto_fit_target_mb;
+    sd_ctx_params.auto_fit_dry_run                = auto_fit_dry_run;
+    sd_ctx_params.auto_fit_compute_reserve        = fit_compute_reserve.c_str();
+    sd_ctx_params.auto_multi_gpu                  = auto_multi_gpu;
+    sd_ctx_params.multi_gpu_mode                  = multi_gpu_mode.c_str();
     sd_ctx_params.rpc_servers                     = rpc_servers.c_str();
     return sd_ctx_params;
 }
diff --git a/examples/common/common.h b/examples/common/common.h
index 55fa5ac0a..1549ca9c1 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -152,9 +152,6 @@ struct SDContextParams {
     std::string effective_backend;
     std::string effective_params_backend;
     bool enable_mmap           = false;
-    bool control_net_cpu       = false;
-    bool clip_on_cpu           = false;
-    bool vae_on_cpu            = false;
     bool flash_attn            = false;
     bool diffusion_flash_attn  = false;
     bool diffusion_conv_direct = false;
@@ -170,6 +167,23 @@ struct SDContextParams {
 
     bool qwen_image_zero_cond_t = false;
 
+    // Auto-fit defaults — placement is computed automatically based on free
+    // VRAM. Pass --no-auto-fit to disable and use explicit --backend specs.
+    bool auto_fit           = true;
+    int  auto_fit_target_mb = 512;
+    bool auto_fit_dry_run   = false;
+    // Per-component compute-buffer reserve in MiB as a component map,
+    // e.g. "dit=2048,vae=1024,cond=512"; missing keys keep built-in defaults.
+    std::string fit_compute_reserve;
+    bool auto_multi_gpu = true;
+    std::string multi_gpu_mode = "row";
+
+    // Deprecated aliases for --backend <component>=cpu (kept for
+    // backwards compatibility with the pre-auto-fit CLI).
+    bool control_net_cpu = false;
+    bool clip_on_cpu     = false;
+    bool vae_on_cpu      = false;
+
     prediction_t prediction           = PREDICTION_COUNT;
     lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
 
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 00f3e4e97..41f561e38 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -220,6 +220,35 @@ typedef struct {
     bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
     const char* backend;
     const char* params_backend;
+
+    // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
+    // When `auto_fit` is true (default), `backend` / `params_backend` are
+    // ignored and the placement is computed automatically (the plan is fed
+    // into the same backend assignment that `backend` / `params_backend` use).
+    // `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
+    // `auto_fit_dry_run` prints the plan and aborts init before loading.
+    // `auto_fit_compute_reserve` tunes the per-component compute-buffer
+    // reserve in MiB as a component map, e.g. "dit=2048,vae=1024,cond=512"
+    // (same component-key style as `backend`); missing keys / NULL keep the
+    // built-in defaults.
+    bool auto_fit;
+    int  auto_fit_target_mb;
+    bool auto_fit_dry_run;
+    const char* auto_fit_compute_reserve;
+
+    // When more than one GPU device is present, prefer placing different
+    // components on different GPUs to balance load and fit larger total
+    // working sets. Set false to keep all components on a single GPU when
+    // they fit. Defaults to true. Each component still lives entirely on
+    // one device unless multi_gpu_mode splits it (see below).
+    bool auto_multi_gpu;
+
+    // How to split a single component (currently only the DiT) across GPUs
+    // when it doesn't fit on one but fits across several: "row" (matmul rows
+    // split via the backend's stock split buffer type, CUDA/SYCL),
+    // "layer" (whole blocks per GPU, routed by a scheduler, backend-generic),
+    // or "off" (never split a single component). NULL / empty => "row".
+    const char* multi_gpu_mode;
     const char* rpc_servers;
 } sd_ctx_params_t;
 
@@ -485,6 +514,11 @@ SD_API bool preprocess_canny(sd_image_t image,
 SD_API const char* sd_commit(void);
 SD_API const char* sd_version(void);
 
+// List available ggml backend devices to stdout, in `name<TAB>description<NL>`
+// per-line format. The output is intended to be parsed by tools and used as
+// device names in the --backend / --params-backend assignment specs.
+SD_API void sd_list_devices(void);
+
 // for C API, caller needs to call free_sd_images to free the memory after use
 // This helps avoid CRT problems on Windows when memory is allocated in the library but freed in the caller, which may use a different CRT.
 SD_API void free_sd_images(sd_image_t* result_images, int num_images);
diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp
new file mode 100644
index 000000000..c8a4ff2a0
--- /dev/null
+++ b/src/backend_fit.hpp
@@ -0,0 +1,729 @@
+#ifndef __SD_BACKEND_FIT_HPP__
+#define __SD_BACKEND_FIT_HPP__
+
+// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the
+// available GPU devices and system RAM.
+//
+// By default each component is a single atomic unit that lives entirely on
+// one device (plus its compute buffer on the same device), so cross-device
+// parallelism comes from placing different components on different GPUs.
+// Depending on MultiGpuMode the planner may instead split one component
+// across GPUs: by matmul rows (GPU_TENSOR_SPLIT) or by whole blocks
+// (GPU_LAYER_SPLIT, akin to llama.cpp's LLAMA_SPLIT_MODE_LAYER).
+//
+// Placement priority: DiT + compute buffer -> Conditioner -> VAE.
+// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that
+// support streaming params from RAM at compute time).
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <numeric>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include "model_loader.h"
+#include "core/util.h"
+
+namespace backend_fit {
+
+constexpr int64_t MiB           = 1024 * 1024;  // bytes per mebibyte
+constexpr int     DEVICE_ID_CPU = -1;  // default Device::id / Decision::device_id; marks a CPU placement
+
+// Population count: every pass clears the lowest set bit of `value`.
+static inline int bit_count(unsigned int value) {
+    int set_bits = 0;
+    for (; value != 0U; value &= value - 1U) {
+        ++set_bits;
+    }
+    return set_bits;
+}
+
+enum class ComponentKind {
+    DIT,  // = 0; estimate_components() uses int(kind) to index its 3-entry byte table, so keep values 0..2
+    VAE,  // = 1
+    CONDITIONER,  // = 2
+};
+
+enum class Placement {
+    CPU,  // component stays on the host; gpu_peak() charges no GPU for it
+    GPU,  // params + compute buffer on one GPU
+    GPU_OFFLOAD_PARAMS,    // params in RAM, compute on GPU (gpu_peak() still budgets params + compute on that GPU)
+    GPU_LAYER_SPLIT,       // params split across multiple GPUs at block boundaries (sched-based)
+    GPU_TENSOR_SPLIT,      // matmul weights row-split across GPUs (CUDA split-buft, single backend)
+};
+
+struct Component {
+    ComponentKind kind;
+    std::string   name;  // display name; estimate_components() uses "DiT", "VAE", "Conditioner"
+    int64_t       params_bytes     = 0;  // estimated weight bytes
+    int64_t       compute_bytes    = 0;  // compute-buffer reserve budgeted next to the weights
+    bool          supports_offload = false;  // true => compute_plan() also tries GPU_OFFLOAD_PARAMS
+};
+
+struct Device {
+    int                id = DEVICE_ID_CPU;  // enumerate_gpu_devices() numbers GPUs 0,1,2,... in enumeration order
+    std::string        name;  // from ggml_backend_dev_name()
+    std::string        description;  // from ggml_backend_dev_description()
+    int64_t            free_bytes  = 0;  // free memory reported by ggml_backend_dev_memory() at enumeration time
+    int64_t            total_bytes = 0;  // total memory; the largest value selects the "main" GPU of a row-split
+    ggml_backend_dev_t dev         = nullptr;  // backing ggml device handle (GPU only)
+};
+
+struct Decision {
+    ComponentKind kind;
+    std::string   name;
+    Placement     placement       = Placement::CPU;
+    int           device_id       = DEVICE_ID_CPU;  // stays DEVICE_ID_CPU for a CPU placement
+    int64_t       on_device_bytes = 0;
+    int64_t       on_host_bytes   = 0;  // params + compute reserve when the component is placed on the CPU
+
+    // Populated when placement == GPU_LAYER_SPLIT. Contains the device IDs
+    // that share this component (in order) and each device's estimated share
+    // of the params. The order also defines block-range partitioning: the
+    // i-th device gets a contiguous range of blocks proportional to share[i].
+    std::vector<int>     split_device_ids;
+    std::vector<int64_t> split_share_bytes;
+};
+
+struct Plan {
+    std::vector<Decision>  decisions;  // presumably one entry per input component, in input order — TODO confirm
+    std::map<int, int64_t> device_bytes;  // presumably planned bytes keyed by Device::id — TODO confirm in compute_plan()
+    int64_t                host_bytes  = 0;
+    bool                   any_changes = false;  // compute_plan() sets this when a component lands on the CPU
+};
+
+struct ComputeReserves {
+    int64_t dit_bytes         = int64_t(2048) * MiB;  // default DiT compute-buffer reserve: 2048 MiB
+    int64_t vae_bytes         = int64_t(1024) * MiB;  // default VAE compute-buffer reserve: 1024 MiB
+    int64_t conditioner_bytes = int64_t(512) * MiB;  // default conditioner compute-buffer reserve: 512 MiB
+};
+
+enum class MultiGpuMode {
+    OFF,    // never split a single component across GPUs
+    ROW,    // row-split matmul weights via the backend's split buffer type (CLI help lists CUDA/SYCL)
+    LAYER,  // generic: assign block-indexed tensors to per-block backends + sched
+};
+
+// Canonical lower-case spelling of a MultiGpuMode, as accepted by
+// str_to_multi_gpu_mode(); "?" for a value outside the three named modes.
+inline const char* multi_gpu_mode_str(MultiGpuMode m) {
+    if (m == MultiGpuMode::OFF) return "off";
+    if (m == MultiGpuMode::ROW) return "row";
+    if (m == MultiGpuMode::LAYER) return "layer";
+    return "?";
+}
+
+// Parse a mode name (exact, case-sensitive); any other text, including "", yields ROW.
+inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) {
+    const bool is_off   = s.compare("off") == 0;
+    const bool is_layer = s.compare("layer") == 0;
+    return is_off ? MultiGpuMode::OFF : (is_layer ? MultiGpuMode::LAYER : MultiGpuMode::ROW);
+}
+
+// --- Classification -------------------------------------------------------
+
+// Maps a tensor name to its owning component; returns false (`out` untouched) when nothing matches.
+inline bool classify_tensor(const std::string& name, ComponentKind& out) {
+    const auto has    = [&name](const char* needle) { return name.find(needle) != std::string::npos; };
+    const auto begins = [&name](const char* prefix) { return name.rfind(prefix, 0) == 0; };
+
+    const bool is_dit = has("model.diffusion_model.") || has("unet.");
+    if (is_dit) {
+        out = ComponentKind::DIT;
+        return true;
+    }
+
+    const bool is_vae = has("first_stage_model.") || begins("vae.") || begins("tae.");
+    if (is_vae) {
+        out = ComponentKind::VAE;
+        return true;
+    }
+
+    // The last two patterns cover connector / text projection layers that run
+    // on the conditioner backend (e.g. LTX-2's text_embedding_projection:
+    // video/audio aggregate embeds + projection that map LLM hidden states
+    // into DiT-input space).
+    const bool is_cond = has("text_encoders") || has("cond_stage_model") ||
+                         has("te.text_model.") || has("conditioner") ||
+                         begins("text_encoder.") || begins("text_embedding_projection.") ||
+                         has(".aggregate_embed.");
+    if (is_cond) {
+        out = ComponentKind::CONDITIONER;
+        return true;
+    }
+
+    return false;
+}
+
+// --- Memory estimation ----------------------------------------------------
+
+// Sums the estimated weight bytes of every classified tensor into one Component per kind (DiT, VAE, Conditioner).
+inline std::vector<Component> estimate_components(ModelLoader&           loader,
+                                                  ggml_type              override_wtype,
+                                                  int64_t                alignment,
+                                                  const ComputeReserves& reserves) {
+    // Weight bytes per component, indexed by int(ComponentKind).
+    int64_t totals[3] = {0, 0, 0};
+
+    auto& tensor_map = loader.get_tensor_storage_map();
+    for (auto& [key, stored] : tensor_map) {
+        // Work on a copy: the type is patched below for the size estimate
+        // only, leaving the entry in the loader's map as it was.
+        TensorStorage tensor = stored;
+        ComponentKind kind;
+        if (is_unused_tensor(tensor.name) || !classify_tensor(tensor.name, kind)) {
+            continue;
+        }
+
+        const bool has_override = override_wtype != GGML_TYPE_COUNT;
+        if (has_override && loader.tensor_should_be_converted(tensor, override_wtype)) {
+            tensor.type = override_wtype;
+        } else if (tensor.expected_type != GGML_TYPE_COUNT && tensor.expected_type != tensor.type) {
+            tensor.type = tensor.expected_type;
+        }
+
+        // Every tensor is padded by `alignment` bytes in the estimate.
+        totals[int(kind)] += tensor.nbytes() + alignment;
+    }
+
+    const auto params_of = [&totals](ComponentKind kind) { return totals[int(kind)]; };
+
+    std::vector<Component> result;
+    result.reserve(3);
+    result.push_back({ComponentKind::DIT, "DiT", params_of(ComponentKind::DIT), reserves.dit_bytes, true});
+    result.push_back({ComponentKind::VAE, "VAE", params_of(ComponentKind::VAE), reserves.vae_bytes, false});
+    result.push_back({ComponentKind::CONDITIONER, "Conditioner",
+                      params_of(ComponentKind::CONDITIONER), reserves.conditioner_bytes, true});
+    return result;
+}
+
+// --- Device enumeration ---------------------------------------------------
+
+inline std::vector<Device> enumerate_gpu_devices() {
+    // Register the dynamically-loaded backends before the device list is
+    // queried. This runs before SDBackendManager initializes any backend, so
+    // nothing else has triggered the (file-local) lazy load yet. Safe to call
+    // once here: the manager's own load-all-once guard short circuits
+    // afterwards because the device count is already non-zero.
+    ggml_backend_load_all();
+
+    std::vector<Device> gpus;
+    for (size_t dev_index = 0; dev_index < ggml_backend_dev_count(); ++dev_index) {
+        ggml_backend_dev_t handle = ggml_backend_dev_get(dev_index);
+        const bool is_gpu = ggml_backend_dev_type(handle) == GGML_BACKEND_DEVICE_TYPE_GPU;
+        if (is_gpu) {
+            size_t free_mem  = 0;
+            size_t total_mem = 0;
+            ggml_backend_dev_memory(handle, &free_mem, &total_mem);
+            Device gpu;
+            gpu.id          = int(gpus.size());  // dense GPU index; non-GPU devices are skipped
+            gpu.dev         = handle;
+            gpu.name        = ggml_backend_dev_name(handle);
+            gpu.description = ggml_backend_dev_description(handle);
+            gpu.free_bytes  = int64_t(free_mem);
+            gpu.total_bytes = int64_t(total_mem);
+            gpus.push_back(gpu);
+        }
+    }
+    return gpus;
+}
+
+// --- Core algorithm -------------------------------------------------------
+
+// Per-GPU footprint for a layer-split component: a share of `params_bytes`
+// weighted by each device's usable room, plus the full `compute_bytes` on
+// every participating device (the sched allocates a compute buffer per backend).
+inline std::vector<int64_t> layer_split_shares(int64_t                    params_bytes,
+                                               int64_t                    compute_bytes,
+                                               const std::vector<Device>& devices,
+                                               const std::vector<size_t>& gpu_idxs,
+                                               int64_t                    margin_bytes = 0) {
+    // Weight by what remains AFTER compute + margin so that share_k + compute
+    // <= free_k - margin whenever the total fits at all; weighting by raw free
+    // overcommits the smaller GPU (observed: 22B DiT fell to CPU).
+    const size_t         n = gpu_idxs.size();
+    std::vector<int64_t> room(n, 0);
+    int64_t              total_room = 0;
+    for (size_t k = 0; k < n; k++) {
+        room[k] = std::max<int64_t>(0, devices[gpu_idxs[k]].free_bytes - compute_bytes - margin_bytes);
+        total_room += room[k];
+    }
+    std::vector<int64_t> out(n, 0);
+    for (size_t k = 0; k < n; k++) {
+        // No device has room beyond its compute buffer: report an even split
+        // (not zeros) so feasibility checks see the real, unsatisfiable demand.
+        const int64_t params_share = total_room > 0
+                                         ? int64_t(double(params_bytes) * (double(room[k]) / double(total_room)))
+                                         : (params_bytes + int64_t(n) - 1) / int64_t(n);
+        out[k] = params_share + compute_bytes;
+    }
+    return out;
+}
+
+// Per-GPU PARAM share for a row (tensor) split. Unlike layer-split, the graph
+// runs on a single MAIN backend (gpu_idxs[main_pos]), so ONLY the main device
+// also hosts the compute buffer. `compute_bytes` is therefore taken off the
+// main device's free VRAM before weighting, so the main doesn't get so many
+// matmul rows that its compute buffer no longer fits. The caller adds
+// compute_bytes back when computing the main device's peak. Returns param
+// bytes per device (no compute folded in) — these become the split ratios.
+inline std::vector<int64_t> row_split_shares(int64_t                    params_bytes,
+                                             int64_t                    compute_bytes,
+                                             const std::vector<Device>& devices,
+                                             const std::vector<size_t>& gpu_idxs,
+                                             size_t                     main_pos) {
+    const size_t         n = gpu_idxs.size();
+    std::vector<int64_t> room(n, 0);
+    int64_t              total_room = 0;
+    for (size_t k = 0; k < n; k++) {
+        const int64_t reserved = (k == main_pos) ? compute_bytes : 0;
+        room[k] = std::max<int64_t>(0, std::max<int64_t>(0, devices[gpu_idxs[k]].free_bytes) - reserved);
+        total_room += room[k];
+    }
+    std::vector<int64_t> out(n, 0);
+    for (size_t k = 0; k < n; k++) {
+        // With no room anywhere, use an even split instead of all zeros: zero shares
+        // would hide the params from feasibility checks and give degenerate ratios.
+        out[k] = total_room > 0 ? int64_t(double(params_bytes) * double(room[k]) / double(total_room))
+                                : (params_bytes + int64_t(n) - 1) / int64_t(n);
+    }
+    return out;
+}
+
+// Peak bytes on GPU `gpu_idx` under placements `pl`/`dev` = MAX over the
+// components' footprints on that device (not the sum): components free their
+// params between phases (free_params_immediately; the split runners load lazily
+// and free after each phase too), so they time-share VRAM rather than coexist.
+inline int64_t gpu_peak(int                           gpu_idx,
+                        const std::vector<Placement>& pl,
+                        const std::vector<int>&       dev,
+                        const std::vector<Component>& components,
+                        const std::vector<Device>&    devices = {}) {  // with the empty default, split placements contribute nothing
+    int64_t peak = 0;
+    for (size_t i = 0; i < components.size(); i++) {
+        int64_t footprint = 0;  // stays 0 for a CPU placement
+        if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
+            if (dev[i] != gpu_idx) continue;  // here dev[i] is a plain index into devices
+            footprint = components[i].params_bytes + components[i].compute_bytes;
+        } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) {
+            // Row-split: dev[i] is a bitmask over indices into devices[]. Every
+            // GPU in the mask gets a free-VRAM-weighted share of params; the
+            // compute reserve lands on the GPU with the largest total_bytes.
+            const int mask = dev[i];
+            if (!(mask & (1 << gpu_idx))) continue;
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < devices.size(); k++) {
+                if (mask & (1 << k)) gpu_idxs.push_back(k);
+            }
+            int slot = -1;  // position of gpu_idx inside gpu_idxs
+            int biggest_slot = 0;  // position of the largest-total_bytes GPU (first one wins ties)
+            int64_t biggest_mem = -1;
+            for (size_t k = 0; k < gpu_idxs.size(); k++) {
+                if (int(gpu_idxs[k]) == gpu_idx) slot = int(k);
+                if (devices[gpu_idxs[k]].total_bytes > biggest_mem) {
+                    biggest_mem  = devices[gpu_idxs[k]].total_bytes;
+                    biggest_slot = int(k);
+                }
+            }
+            if (slot < 0) continue;
+            // The graph runs on the main (= biggest) GPU, which reserves its
+            // compute buffer; param rows are weighted by the remaining free.
+            auto shares = row_split_shares(components[i].params_bytes,
+                                           components[i].compute_bytes,
+                                           devices, gpu_idxs, size_t(biggest_slot));
+            footprint = shares[slot];
+            if (slot == biggest_slot) {
+                footprint += components[i].compute_bytes;  // shares carry params only; add the compute buffer on the main GPU
+            }
+        } else if (pl[i] == Placement::GPU_LAYER_SPLIT) {
+            // dev[i] holds the bitmask of participating GPU indices into the
+            // devices[] vector (encoded by the planner). Look up our slot.
+            const int mask = dev[i];
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < devices.size(); k++) {
+                if (mask & (1 << k)) gpu_idxs.push_back(k);
+            }
+            // Find this gpu's slot in gpu_idxs; -1 means it does not participate.
+            int slot = -1;
+            for (size_t k = 0; k < gpu_idxs.size(); k++) {
+                if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; }
+            }
+            if (slot < 0) continue;
+            auto shares = layer_split_shares(components[i].params_bytes,
+                                             components[i].compute_bytes,
+                                             devices, gpu_idxs);  // margin_bytes left at its default of 0
+            footprint = shares[slot];  // already includes this device's compute reserve
+        }
+        peak = std::max(peak, footprint);
+    }
+    return peak;
+}
+
+inline Plan compute_plan(const std::vector<Component>& components,
+                         const std::vector<Device>&    devices,
+                         int64_t                       margin_bytes,
+                         bool                          allow_multi_gpu = true,
+                         MultiGpuMode                  mode = MultiGpuMode::ROW) {
+    const size_t nC = components.size();
+    const size_t nG = devices.size();
+    if (!allow_multi_gpu) {
+        mode = MultiGpuMode::OFF;
+    }
+
+    std::vector<int64_t> cap(nG, 0);
+    for (size_t g = 0; g < nG; g++) {
+        cap[g] = std::max<int64_t>(0, devices[g].free_bytes - margin_bytes);
+    }
+
+    struct OptionSlot {
+        Placement placement;
+        int       device_idx;
+    };
+
+    // ROW-split is DiT-exclusive. Keeping a single homogeneous row-split
+    // component (same tensor sizes every phase/generation) lets the driver
+    // reuse freed split-buffer chunks, which is what avoids the
+    // cuda_split_buffer fragmentation a ggml patch would otherwise be needed
+    // for. The DiT is also the per-step bottleneck, where row-split's small
+    // compute buffer matters most.
+    auto supports_tensor_split = [](ComponentKind k) {
+        return k == ComponentKind::DIT;
+    };
+    // LAYER-split (regular per-device buffers routed by a scheduler) is
+    // general and fragmentation-free, so any block-structured component can
+    // use it. The Conditioner (e.g. Gemma) splits this way when it is too big
+    // for one GPU; its (larger) cross-backend compute buffer is acceptable
+    // because it runs once at encode time and frees before the DiT loop.
+    auto supports_layer_split = [](ComponentKind k) {
+        return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER;
+    };
+
+    auto build_options = [&](const Component& c) {
+        std::vector<OptionSlot> opts;
+        for (size_t g = 0; g < nG; g++) {
+            opts.push_back({Placement::GPU, int(g)});
+            if (c.supports_offload) {
+                opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)});
+            }
+        }
+        if (nG >= 2) {
+            // ROW-split: DiT only, in row mode. Spans all GPUs (one option).
+            if (mode == MultiGpuMode::ROW && supports_tensor_split(c.kind)) {
+                opts.push_back({Placement::GPU_TENSOR_SPLIT, (1 << nG) - 1});
+            }
+            // LAYER-split: the DiT in layer mode, and any OTHER layer-split
+            // candidate (the Conditioner) regardless of mode — non-DiT
+            // components never row-split, preserving the single-row invariant.
+            const bool want_layer = supports_layer_split(c.kind) &&
+                                    (mode == MultiGpuMode::LAYER ||
+                                     (mode == MultiGpuMode::ROW && !supports_tensor_split(c.kind)));
+            if (want_layer) {
+                const int max_mask = 1 << nG;
+                for (int mask = 1; mask < max_mask; mask++) {
+                    if (bit_count(static_cast<unsigned int>(mask)) < 2) continue;
+                    opts.push_back({Placement::GPU_LAYER_SPLIT, mask});
+                }
+            }
+        }
+        opts.push_back({Placement::CPU, -1});
+        return opts;
+    };
+
+    std::vector<std::vector<OptionSlot>> options;
+    options.reserve(nC);
+    for (const Component& c : components) {
+        options.push_back(build_options(c));
+    }
+
+    auto priority_weight = [](ComponentKind k) -> int {
+        switch (k) {
+            case ComponentKind::DIT:         return 300;
+            case ComponentKind::CONDITIONER: return 120;
+            case ComponentKind::VAE:         return 60;
+        }
+        return 1;
+    };
+
+    auto score = [&](const std::vector<Placement>& pl, const std::vector<int>& dev) {
+        int64_t       s = 0;
+        std::set<int> gpus_used;
+        for (size_t i = 0; i < nC; i++) {
+            const int pw = priority_weight(components[i].kind);
+            if (pl[i] == Placement::GPU) {
+                s += 10 * pw;
+                gpus_used.insert(dev[i]);
+            } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
+                s += 5 * pw;
+                gpus_used.insert(dev[i]);
+            } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) {
+                // Row-split: cheaper than layer-split (no sched cross-
+                // backend doubling) but pays per-matmul cross-device
+                // reductions. Score it slightly above LAYER_SPLIT so the
+                // planner prefers it when both fit.
+                s += 8 * pw;
+                for (size_t g = 0; g < nG; g++) {
+                    if (dev[i] & (1 << g)) gpus_used.insert(int(g));
+                }
+            } else if (pl[i] == Placement::GPU_LAYER_SPLIT) {
+                // Better than CPU but worse than fitting on a single GPU
+                // (cross-GPU traffic between blocks).
+                s += 7 * pw;
+                for (size_t g = 0; g < nG; g++) {
+                    if (dev[i] & (1 << g)) gpus_used.insert(int(g));
+                }
+            } else {
+                s -= 10 * pw;
+            }
+        }
+        if (allow_multi_gpu) {
+            s += 2 * int64_t(gpus_used.size());
+        }
+        return s;
+    };
+
+    std::vector<size_t>    idx(nC, 0);
+    std::vector<Placement> best_pl;
+    std::vector<int>       best_dev;
+    int64_t                best_score = std::numeric_limits<int64_t>::min();
+    bool                   found_any  = false;
+
+    while (true) {
+        std::vector<Placement> pl(nC);
+        std::vector<int>       dev(nC);
+        for (size_t i = 0; i < nC; i++) {
+            pl[i]  = options[i][idx[i]].placement;
+            dev[i] = options[i][idx[i]].device_idx;
+        }
+        // Constraint: when multi-GPU is disabled, all GPU placements must
+        // share the same device index.
+        if (!allow_multi_gpu) {
+            int common = -1;
+            bool ok = true;
+            for (size_t i = 0; i < nC; i++) {
+                if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) {
+                    if (common < 0) common = dev[i];
+                    else if (dev[i] != common) { ok = false; break; }
+                }
+            }
+            if (ok) {
+                bool feasible = true;
+                for (size_t g = 0; g < nG; g++) {
+                    if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; }
+                }
+                if (feasible) {
+                    int64_t sc = score(pl, dev);
+                    if (sc > best_score) {
+                        best_score = sc; best_pl = pl; best_dev = dev; found_any = true;
+                    }
+                }
+            }
+        } else {
+            bool feasible = true;
+            for (size_t g = 0; g < nG; g++) {
+                if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; }
+            }
+            if (feasible) {
+                int64_t sc = score(pl, dev);
+                if (sc > best_score) {
+                    best_score = sc; best_pl = pl; best_dev = dev; found_any = true;
+                }
+            }
+        }
+
+        size_t pos = 0;
+        while (pos < nC) {
+            idx[pos]++;
+            if (idx[pos] < options[pos].size()) break;
+            idx[pos] = 0;
+            pos++;
+        }
+        if (pos >= nC) break;
+    }
+
+    Plan plan;
+    if (!found_any) {
+        best_pl.assign(nC, Placement::CPU);
+        best_dev.assign(nC, -1);
+    }
+
+    for (size_t i = 0; i < nC; i++) {
+        const Component& c = components[i];
+        Decision         d;
+        d.kind      = c.kind;
+        d.name      = c.name;
+        d.placement = best_pl[i];
+        if (best_pl[i] == Placement::CPU) {
+            d.device_id      = DEVICE_ID_CPU;
+            d.on_host_bytes  = c.params_bytes + c.compute_bytes;
+            plan.any_changes = true;
+        } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) {
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < nG; k++) {
+                if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k);
+            }
+            // Sort participating GPUs by descending TOTAL memory so the
+            // largest device is the "main" (runs the graph + hosts the compute
+            // buffer + sub-runners that don't get their own spec). This matches
+            // the user's preference: always use the bigger GPU as main.
+            std::vector<size_t> order(gpu_idxs.size());
+            std::iota(order.begin(), order.end(), 0);
+            std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
+                return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes;
+            });
+            // PARAM shares for the split ratio: the main (order[0]) reserves its
+            // compute buffer first so it doesn't get over-loaded with rows.
+            auto shares = row_split_shares(c.params_bytes, c.compute_bytes,
+                                           devices, gpu_idxs, order[0]);
+
+            int64_t max_share = 0;
+            for (size_t pos = 0; pos < order.size(); pos++) {
+                size_t k = order[pos];
+                d.split_device_ids.push_back(devices[gpu_idxs[k]].id);
+                // split_share_bytes drives the row ratio in apply_dit -> keep it
+                // param-only. The main device's peak (params + compute) is folded
+                // into on_device_bytes for the plan display / feasibility.
+                d.split_share_bytes.push_back(shares[k]);
+                int64_t peak = shares[k] + (pos == 0 ? c.compute_bytes : 0);
+                max_share    = std::max(max_share, peak);
+            }
+            d.device_id        = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0];
+            d.on_device_bytes  = max_share;
+            plan.any_changes   = true;
+        } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) {
+            std::vector<size_t> gpu_idxs;
+            for (size_t k = 0; k < nG; k++) {
+                if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k);
+            }
+            auto shares = layer_split_shares(c.params_bytes, c.compute_bytes,
+                                             devices, gpu_idxs);
+            // Sort participating GPUs by descending TOTAL memory so the
+            // physically bigger GPU is listed first (and becomes the runner's
+            // main backend). Sub-runners that don't get the layer-split spec
+            // (e.g. the LTX-2 text projection) follow the main backend.
+            std::vector<size_t> order(gpu_idxs.size());
+            std::iota(order.begin(), order.end(), 0);
+            std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
+                return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes;
+            });
+
+            int64_t max_share = 0;
+            for (size_t pos = 0; pos < order.size(); pos++) {
+                size_t k = order[pos];
+                d.split_device_ids.push_back(devices[gpu_idxs[k]].id);
+                d.split_share_bytes.push_back(shares[k]);
+                max_share = std::max(max_share, shares[k]);
+            }
+            d.device_id        = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0];
+            d.on_device_bytes  = max_share;
+            plan.any_changes   = true;
+        } else {
+            d.device_id = devices[best_dev[i]].id;
+            if (best_pl[i] == Placement::GPU) {
+                d.on_device_bytes = c.params_bytes + c.compute_bytes;
+            } else {
+                d.on_device_bytes = c.params_bytes + c.compute_bytes;
+                d.on_host_bytes   = c.params_bytes;
+                plan.any_changes  = true;
+            }
+        }
+        plan.decisions.push_back(d);
+        plan.host_bytes += d.on_host_bytes;
+    }
+
+    for (size_t g = 0; g < nG; g++) {
+        plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices);
+    }
+    return plan;
+}
+
+inline const char* placement_str(Placement p) {
+    // Short human-readable tag for a placement, as printed in the plan log.
+    if (p == Placement::CPU) return "CPU";
+    if (p == Placement::GPU) return "GPU";
+    if (p == Placement::GPU_OFFLOAD_PARAMS) return "GPU(params->RAM)";
+    if (p == Placement::GPU_LAYER_SPLIT) return "GPU(layer-split)";
+    if (p == Placement::GPU_TENSOR_SPLIT) return "GPU(row-split)";
+    // Value outside the enumerators handled above: keep the log printable.
+    return "?";
+}
+
+inline void print_plan(const Plan&                   plan,
+                       const std::vector<Component>& components,
+                       const std::vector<Device>&    devices,
+                       int64_t                       margin_bytes) {
+    // Byte count -> whole MiB, cast to the type the %lld conversions expect.
+    auto mib = [](auto bytes) { return (long long)(bytes / MiB); };
+
+    LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", mib(margin_bytes));
+    LOG_INFO("  available devices:");
+    if (devices.empty()) {
+        LOG_INFO("    (no GPU devices detected — all components will run on CPU)");
+    }
+    for (const Device& gpu : devices) {
+        LOG_INFO("    %-12s %-32s free %6lld / %6lld MiB",
+                 gpu.name.c_str(), gpu.description.c_str(), mib(gpu.free_bytes), mib(gpu.total_bytes));
+    }
+
+    LOG_INFO("  components:");
+    for (const Component& comp : components) {
+        LOG_INFO("    %-12s params %6lld MiB, compute reserve %6lld MiB",
+                 comp.name.c_str(), mib(comp.params_bytes), mib(comp.compute_bytes));
+    }
+
+    LOG_INFO("  decisions:");
+    for (const Decision& dec : plan.decisions) {
+        const char* who = dec.name.c_str();
+        switch (dec.placement) {
+            case Placement::CPU:
+                LOG_INFO("    %-12s -> CPU                (RAM %lld MiB)", who, mib(dec.on_host_bytes));
+                break;
+            case Placement::GPU:
+                LOG_INFO("    %-12s -> GPU %d              (VRAM %lld MiB)",
+                         who, dec.device_id, mib(dec.on_device_bytes));
+                break;
+            case Placement::GPU_LAYER_SPLIT:
+            case Placement::GPU_TENSOR_SPLIT: {
+                // One "GPU<id>(<share>MiB)" entry per participating device, joined by '+'.
+                std::string ids;
+                for (size_t k = 0; k < dec.split_device_ids.size(); k++) {
+                    ids += (k > 0 ? "+" : "");
+                    ids += "GPU" + std::to_string(dec.split_device_ids[k]);
+                    ids += "(" + std::to_string(dec.split_share_bytes[k] / MiB) + "MiB)";
+                }
+                LOG_INFO("    %-12s -> %s-split %s",
+                         who, dec.placement == Placement::GPU_TENSOR_SPLIT ? "row" : "layer", ids.c_str());
+                break;
+            }
+            default:  // every other placement is reported as GPU with params in RAM
+                LOG_INFO("    %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)",
+                         who, dec.device_id, mib(dec.on_device_bytes), mib(dec.on_host_bytes));
+                break;
+        }
+    }
+
+    LOG_INFO("  projected per-device peak:");
+    for (const Device& gpu : devices) {
+        const auto    found = plan.device_bytes.find(gpu.id);
+        const int64_t peak  = (found != plan.device_bytes.end()) ? found->second : 0;
+        LOG_INFO("    %-12s peak %6lld / %6lld MiB free  (remaining %lld MiB)",
+                 gpu.name.c_str(), mib(peak), mib(gpu.free_bytes), mib(gpu.free_bytes - peak));
+    }
+    LOG_INFO("    %-12s host RAM additional %lld MiB", "CPU", mib(plan.host_bytes));
+}
+
+inline const Decision* find_decision(const Plan& plan, ComponentKind kind) {
+    // Returns the first decision recorded for `kind`, or nullptr if there is none.
+    const auto& all = plan.decisions;
+    const auto  hit = std::find_if(all.begin(), all.end(), [kind](const Decision& d) { return d.kind == kind; });
+    return hit == all.end() ? nullptr : &*hit;
+}
+
+}  // namespace backend_fit
+
+#endif  // __SD_BACKEND_FIT_HPP__
diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp
index b5dda4c0e..f193e0704 100644
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@@ -116,6 +116,13 @@ struct Conditioner {
     virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors)           = 0;
     virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
     virtual void set_stream_layers_enabled(bool enabled) {}
+    // Multi-GPU, lazy-load and weight-manager hooks. The defaults are no-ops;
+    // LLM-backed conditioners forward them to their (heavy) LLM sub-runner so
+    // it can be split across GPUs (layer-split) and/or have its params
+    // alloc+load deferred to the first compute, time-sharing VRAM with the DiT.
+    virtual void set_lazy_load(std::function<bool()> fn) {}
+    virtual void set_multi_backend_spec(const MultiBackendSpec& spec) {}
+    virtual void set_weight_manager(std::shared_ptr<RunnerWeightManager> manager) {}
     virtual void set_flash_attention_enabled(bool enabled) = 0;
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
     virtual void runner_done() {}
@@ -1410,6 +1417,18 @@ struct AnimaConditioner : public Conditioner {
         llm->set_stream_layers_enabled(enabled);
     }
 
+    void set_lazy_load(std::function<bool()> fn) override {  // forwards to llm (dereferenced unchecked)
+        llm->set_lazy_load(std::move(fn));
+    }
+
+    void set_multi_backend_spec(const MultiBackendSpec& spec) override {  // forwards to llm (dereferenced unchecked)
+        llm->set_multi_backend_spec(spec);
+    }
+
+    void set_weight_manager(std::shared_ptr<RunnerWeightManager> manager) override {  // forwards to llm (dereferenced unchecked)
+        llm->set_weight_manager(std::move(manager));
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
@@ -1555,6 +1574,20 @@ struct LLMEmbedder : public Conditioner {
         llm->set_stream_layers_enabled(enabled);
     }
 
+    void set_lazy_load(std::function<bool()> fn) override {
+        if (llm) llm->set_lazy_load(std::move(fn));  // no-op without an LLM, same as set_weight_manager
+    }
+
+    void set_multi_backend_spec(const MultiBackendSpec& spec) override {
+        if (llm) llm->set_multi_backend_spec(spec);  // no-op without an LLM, same as set_weight_manager
+    }
+
+    void set_weight_manager(std::shared_ptr<RunnerWeightManager> manager) override {
+        if (llm) {
+            llm->set_weight_manager(std::move(manager));
+        }
+    }
+
     void set_flash_attention_enabled(bool enabled) override {
         llm->set_flash_attention_enabled(enabled);
     }
@@ -2137,6 +2170,20 @@ struct LTXAVEmbedder : public Conditioner {
         projector->set_flash_attention_enabled(enabled);
     }
 
+    // Split, lazy-load and weight-manager hooks apply to the heavy LLM only; the
+    // small projector stays on the main backend and loads eagerly.
+    void set_lazy_load(std::function<bool()> fn) override {
+        llm->set_lazy_load(std::move(fn));
+    }
+
+    void set_multi_backend_spec(const MultiBackendSpec& spec) override {
+        llm->set_multi_backend_spec(spec);
+    }
+
+    void set_weight_manager(std::shared_ptr<RunnerWeightManager> manager) override {
+        llm->set_weight_manager(std::move(manager));
+    }
+
     void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
         llm->set_max_graph_vram_bytes(max_vram_bytes);
         projector->set_max_graph_vram_bytes(max_vram_bytes);
@@ -2180,6 +2227,7 @@ struct LTXAVEmbedder : public Conditioner {
 
         std::vector<float> mask;
         tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength);
+
         return {tokens, weights, mask};
     }
 
@@ -2220,6 +2268,7 @@ struct LTXAVEmbedder : public Conditioner {
                                           true,
                                           true);
         GGML_ASSERT(!hidden_states.empty());
+
         hidden_states = apply_token_weights(std::move(hidden_states), weights);
 
         int64_t valid_tokens = 0;
diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
index a3dda16b2..0d2f9b27b 100644
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@@ -1690,6 +1690,39 @@ struct GGMLRunnerContext {
     }
 };
 
+// Multi-GPU split of a single runner across several GPU backends, on stock
+// ggml (no ggml patch needed). Two modes:
+//   LAYER_SPLIT: whole transformer blocks are assigned to per-block backends
+//                and a ggml_backend_sched routes cross-device ops. Works on
+//                any multi-GPU set.
+//   ROW_SPLIT:   matmul weights are split row-wise via the backend's stock
+//                split buffer type (CUDA/SYCL `ggml_backend_split_buffer_type`),
+//                non-matmul weights live on the main GPU; sched still wires the
+//                extra backends so it can route the cross-device reductions.
+// The split params are allocated once and kept resident (the runner is not
+// freed+realloc'd between generations), which is what lets us avoid the
+// split-buffer fragmentation a ggml patch would otherwise be needed for.
+enum class MultiBackendMode {
+    LAYER_SPLIT,  // whole transformer blocks per backend, routed by a ggml_backend_sched
+    ROW_SPLIT,    // matmul weights split row-wise via the backend's split buffer type
+};
+
+struct MultiBackendSpec {
+    MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT;  // layer split unless overridden
+    // Extra GPU backends beyond the runner's main (runtime) backend. The main
+    // backend is implicit and is NOT listed here. Borrowed handles — owned by
+    // the SDBackendManager, never freed by the runner.
+    std::vector<ggml_backend_t> additional_backends;
+    // LAYER_SPLIT: map a param tensor to the backend that should hold it (the
+    // main backend, or one of additional_backends). nullptr => main. Keyed by
+    // tensor POINTER, not name: param tensors are unnamed at alloc time.
+    std::function<ggml_backend_t(ggml_tensor*)> tensor_backend_fn;  // empty (unset) by default
+    // ROW_SPLIT: per-device weight ratios (length = the backend registry's
+    // device count) and the main device index that owns the non-split portion.
+    std::vector<float> tensor_split_ratios;
+    int                main_device = 0;  // device index 0 unless overridden
+};
+
 struct GGMLRunner {
 protected:
     typedef std::function<ggml_cgraph*()> get_graph_cb_t;
@@ -1710,6 +1743,34 @@ struct GGMLRunner {
     bool stream_layers_enabled            = false;
     size_t observed_max_effective_budget_ = 0;
 
+    // --- multi-GPU split state (layer-split via sched OR row-split via the
+    //     stock split buffer type). Inactive unless set_multi_backend_spec()
+    //     was called before alloc_params_buffer(). ---
+    ggml_backend_t params_backend         = nullptr;
+    ggml_backend_buffer_t params_buffer   = nullptr;
+    bool                                        multi_backend_mode = false;
+    MultiBackendMode                            multi_backend_kind = MultiBackendMode::LAYER_SPLIT;
+    std::vector<ggml_backend_t>                 additional_backends;  // borrowed (manager-owned)
+    std::function<ggml_backend_t(ggml_tensor*)> tensor_backend_fn    = nullptr;
+    ggml_backend_sched_t                        sched                = nullptr;  // owned
+    bool                                        sched_reserved       = false;
+    ggml_backend_t                              cpu_fallback_backend = nullptr;
+    bool                                        owns_cpu_fallback_backend = false;
+    // LAYER_SPLIT: one resident params buffer per participating backend.
+    std::vector<ggml_backend_buffer_t>          multi_params_buffers;  // owned
+    // ROW_SPLIT: resident split + main buffers and the split buft (buft is
+    // backend-cached, not owned).
+    std::vector<float>                          row_split_ratios;
+    int                                         row_main_device  = 0;
+    ggml_backend_buffer_type_t                  row_split_buft   = nullptr;
+    ggml_backend_buffer_t                       row_split_buffer = nullptr;  // owned
+    ggml_backend_buffer_t                       row_main_buffer  = nullptr;  // owned
+
+    // Lazy-load: when set, params alloc + tensor-data load is deferred to the
+    // first compute() (ensure_params_loaded) and freed after each phase, so
+    // components time-share VRAM instead of all coexisting at init.
+    std::function<bool()> lazy_load_fn = nullptr;
+
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
     std::weak_ptr<RunnerWeightManager> weight_manager;
     std::unordered_set<const ggml_tensor*> kept_compute_param_tensor_set;
@@ -1877,9 +1938,13 @@ struct GGMLRunner {
         }
         auto manager = weight_manager.lock();
         if (manager == nullptr) {
-            if (!params_to_prepare.empty()) {
-                LOG_ERROR("%s weight manager is not set for graph params", get_desc().c_str());
-                return false;
+            for (ggml_tensor* param : params_to_prepare) {
+                if (param != nullptr && param->data == nullptr) {
+                    LOG_ERROR("%s param '%s' is not loaded and weight manager is not set",
+                              get_desc().c_str(),
+                              ggml_get_name(param));
+                    return false;
+                }
             }
             return true;
         }
@@ -1977,7 +2042,167 @@ struct GGMLRunner {
         return true;
     }
 
+    // Lazily build the multi-backend scheduler; returns false (after logging) on
+    // failure. Backends in priority order: main runtime backend, the additional
+    // GPU backends, then a CPU fallback (ggml_backend_sched_new requires CPU last).
+    bool ensure_sched() {
+        if (sched != nullptr) {
+            return true;
+        }
+        if (cpu_fallback_backend == nullptr) {
+            cpu_fallback_backend = sd_backend_cpu_init();
+            if (cpu_fallback_backend == nullptr) {
+                LOG_ERROR("%s: failed to init CPU fallback backend", get_desc().c_str());
+                return false;  // never hand a null backend to the sched / buft lookup below
+            }
+            owns_cpu_fallback_backend = true;  // created here, so the destructor frees it
+        }
+        std::vector<ggml_backend_t> backends;
+        backends.reserve(1 + additional_backends.size() + 1);
+        backends.push_back(runtime_backend);
+        backends.insert(backends.end(), additional_backends.begin(), additional_backends.end());
+        backends.push_back(cpu_fallback_backend);
+        // Pass explicit per-backend buffer types instead of nullptr. The sched uses
+        // them in buffer_supported() to decide whether a cross-backend src needs a
+        // copy; synthesized defaults let CUDA devices spuriously report supporting
+        // each other's buffers -> a needed copy is skipped and a node (e.g. a cont in
+        // attention) reads another device's memory -> illegal access. The CPU slot uses
+        // device-0's pinned host buffer type, as llama.cpp does (llama-context.cpp).
+        std::vector<ggml_backend_buffer_type_t> bufts;
+        bufts.reserve(backends.size());
+        ggml_backend_dev_t dev0 = ggml_backend_get_device(runtime_backend);
+        for (auto* b : backends) {
+            if (b == cpu_fallback_backend && dev0 != nullptr) {
+                ggml_backend_buffer_type_t host = ggml_backend_dev_host_buffer_type(dev0);
+                bufts.push_back(host != nullptr ? host : ggml_backend_get_default_buffer_type(b));
+            } else {
+                bufts.push_back(ggml_backend_get_default_buffer_type(b));
+            }
+        }
+        sched = ggml_backend_sched_new(backends.data(),
+                                       bufts.data(),
+                                       (int)backends.size(),
+                                       MAX_GRAPH_SIZE,
+                                       /*parallel=*/false,
+                                       /*op_offload=*/false);
+        if (sched == nullptr) {
+            LOG_ERROR("%s: failed to create backend sched", get_desc().c_str());
+            return false;
+        }
+        return true;
+    }
+
+    // Layer split: backend that holds weight `t`, or nullptr if `t` is not a
+    // weight living in one of multi_params_buffers.
+    ggml_backend_t backend_of_weight(ggml_tensor* t) const {
+        const bool is_weight = t != nullptr && t->buffer != nullptr &&
+                               ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS;
+        if (!is_weight) {
+            return nullptr;
+        }
+        size_t slot = 0;
+        for (ggml_backend_buffer_t buf : multi_params_buffers) {
+            const size_t here = slot++;
+            if (buf != t->buffer) {
+                continue;
+            }
+            // Slot 0 is the runtime backend; slot i maps to additional_backends[i - 1].
+            if (here == 0) return runtime_backend;
+            if (here - 1 < additional_backends.size()) return additional_backends[here - 1];
+        }
+        return nullptr;
+    }
+
+    // LAYER split only: pin compute nodes to the device of their layer. Stock
+    // ggml_backend_sched anchors weight-bearing ops (matmuls) to the weight's
+    // device; weightless ops (norm, residual add, cont) have no anchor and its
+    // heuristic can put e.g. the attention `cont` on the wrong device, which then
+    // reads without a cross-device copy -> CUDA illegal access. llama.cpp pins
+    // layer-boundary norms for the same reason (llama-context.cpp). Generalised
+    // here: walk nodes in execution order, remember the backend of the most
+    // recently consumed weight (= current layer's device) and pin each node to
+    // it, so sched only copies the residual stream across layer boundaries.
+    void pin_layer_split_nodes(ggml_cgraph* gf) {
+        const bool layer_split = multi_backend_mode && multi_backend_kind == MultiBackendMode::LAYER_SPLIT;
+        if (!layer_split || sched == nullptr || multi_params_buffers.empty() || gf == nullptr) {
+            return;
+        }
+        // Nodes seen before any weight start out on the main runtime backend.
+        ggml_backend_t layer_backend = runtime_backend;
+        for (int idx = 0, total = ggml_graph_n_nodes(gf); idx < total; idx++) {
+            ggml_tensor* op_node = ggml_graph_node(gf, idx);
+            // The last weight found among this node's srcs sets the current device.
+            for (ggml_tensor* input : op_node->src) {
+                if (ggml_backend_t owner = backend_of_weight(input)) {
+                    layer_backend = owner;
+                }
+            }
+            // View ops (view/reshape/permute/transpose) are NEVER pinned: a view
+            // assigned to a different backend than its view_src's data makes the
+            // sched skip the cross-device copy for consumers (the copy decision
+            // trusts the assigned id), and a kernel then dereferences the other
+            // device's pointer. Left alone, the sched places views by view_src.
+            switch (op_node->op) {
+                case GGML_OP_VIEW:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_PERMUTE:
+                case GGML_OP_TRANSPOSE:
+                    continue;
+                default:
+                    break;
+            }
+            if (layer_backend != nullptr && ggml_backend_supports_op(layer_backend, op_node)) {
+                ggml_backend_sched_set_tensor_backend(sched, op_node, layer_backend);
+            }
+        }
+    }
+
+    // Pin un-allocated graph-input leaves (rope pe tables, timesteps, latents…)
+    // to the MAIN backend before sched alloc. Left to its own heuristics the
+    // sched places them on the CPU/host slot and emits per-split host->device
+    // input copies; those copies were observed landing LATE (first pass reads
+    // zeros / stale pool garbage, second pass reads the first pass's data).
+    // Pinning them to the main backend makes our copy_data_to_backend_tensor
+    // fill a device-resident tensor directly (synchronous H2D) and removes the
+    // cross-backend input copies entirely.
+    void pin_input_leaves(ggml_cgraph* gf) {
+        // ROW_SPLIT only: the whole graph computes on the main backend, so its
+        // inputs belong there. (Layer-split graphs span devices; sched routes them.)
+        if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::ROW_SPLIT ||
+            sched == nullptr || gf == nullptr || runtime_backend == nullptr) {
+            return;
+        }
+        const int n_nodes = ggml_graph_n_nodes(gf);
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor* node = ggml_graph_node(gf, i);
+            // Check every slot: a null src can be a gap (unused operand), not the end.
+            for (int s = 0; s < GGML_MAX_SRC; s++) {
+                ggml_tensor* t = node->src[s];
+                if (t == nullptr) continue;
+                while (t->view_src != nullptr) {
+                    t = t->view_src;
+                }
+                // op NONE + no buffer yet = a graph input the sched will
+                // allocate (weights already sit in params buffers).
+                if (t->op == GGML_OP_NONE && t->buffer == nullptr) {
+                    ggml_backend_sched_set_tensor_backend(sched, t, runtime_backend);
+                }
+            }
+        }
+    }
+
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        if (multi_backend_mode) {
+            // Do NOT ggml_backend_sched_reserve(gf) here: reserve runs
+            // split_graph, which REWIRES gf's src pointers to sched-internal
+            // copy tensors. execute_graph then sched_alloc_graph's the SAME gf,
+            // and the second split sees the stale reserve-epoch copies (measure
+            // layout) as valid inputs — silently corrupting every cross-backend
+            // input (garbage rope pe, garbage Gemma stack) or crashing. A graph
+            // must be split at most once; the first sched_alloc_graph in
+            // execute_graph performs the real allocation instead.
+            return ensure_sched();
+        }
         if (compute_allocr != nullptr) {
             return true;
         }
@@ -2193,12 +2418,14 @@ struct GGMLRunner {
                plan.valid &&
                max_graph_vram_bytes > 0 &&
                plan.segments.size() > 1 &&
-               !sd_backend_is_cpu(runtime_backend);
+               !sd_backend_is_cpu(runtime_backend) &&
+               !multi_backend_mode;
     }
 
     bool can_attempt_graph_cut_segmented_compute() const {
         return max_graph_vram_bytes > 0 &&
-               !sd_backend_is_cpu(runtime_backend);
+               !sd_backend_is_cpu(runtime_backend) &&
+               !multi_backend_mode;
     }
 
     bool resolve_graph_cut_plan(ggml_cgraph* gf,
@@ -2454,7 +2681,15 @@ struct GGMLRunner {
         };
         ComputeBufferGuard compute_buffer_guard(this, free_compute_buffer);
 
-        if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
+        if (multi_backend_mode) {
+            ggml_backend_sched_reset(sched);
+            pin_layer_split_nodes(gf);  // reset clears pins; re-apply before alloc
+            pin_input_leaves(gf);
+            if (!ggml_backend_sched_alloc_graph(sched, gf)) {
+                LOG_ERROR("%s sched alloc compute graph failed", get_desc().c_str());
+                return std::nullopt;
+            }
+        } else if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
             LOG_ERROR("%s alloc compute graph failed", get_desc().c_str());
             return std::nullopt;
         }
@@ -2463,8 +2698,19 @@ struct GGMLRunner {
         if (sd_backend_is_cpu(runtime_backend)) {
             sd_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
+        if (multi_backend_mode && cpu_fallback_backend != nullptr && sd_backend_is_cpu(cpu_fallback_backend)) {
+            sd_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads);
+        }
 
-        ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
+        ggml_status status;
+        if (multi_backend_mode) {
+            status = ggml_backend_sched_graph_compute(sched, gf);
+            if (status == GGML_STATUS_SUCCESS) {
+                ggml_backend_sched_synchronize(sched);
+            }
+        } else {
+            status = ggml_backend_graph_compute(runtime_backend, gf);
+        }
         if (status != GGML_STATUS_SUCCESS) {
             LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
             return std::nullopt;
@@ -2623,6 +2869,9 @@ struct GGMLRunner {
         kept_compute_param_tensor_set.clear();
         free_compute_backend_param_tensors(tensors_to_release);
         free_params_backend_param_tensors(tensors_to_release);
+        if (lazy_load_fn) {
+            free_params_buffer();
+        }
     }
 
 public:
@@ -2631,6 +2880,7 @@ struct GGMLRunner {
     GGMLRunner(ggml_backend_t backend,
                std::shared_ptr<RunnerWeightManager> manager = nullptr)
         : runtime_backend(backend),
+          params_backend(backend),
           weight_manager(manager) {
         GGML_ASSERT(runtime_backend != nullptr);
         alloc_params_ctx();
@@ -2638,9 +2888,20 @@ struct GGMLRunner {
 
     virtual ~GGMLRunner() {
         free_compute_buffer();
+        free_params_buffer();
         free_params_ctx();
         free_compute_ctx();
         free_cache_ctx_and_buffer();
+        // Multi-GPU split teardown. additional_backends are owned by the
+        // SDBackendManager (not freed here); row_split_buft is backend-cached.
+        if (sched != nullptr) {
+            ggml_backend_sched_free(sched);
+            sched = nullptr;
+        }
+        if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) {
+            ggml_backend_free(cpu_fallback_backend);
+            cpu_fallback_backend = nullptr;
+        }
     }
 
     virtual GGMLRunnerContext get_context() {
@@ -2670,6 +2931,311 @@ struct GGMLRunner {
         alloc_compute_ctx();
     }
 
+    // Row-split eligibility: contiguous, rank-2, both dims >= 256, not a view.
+    // 1D biases/norms, small embeddings/projections and views fall back to the
+    // main GPU's regular per-device buft. Excluding views respects the split
+    // buft's documented contract (GGML_ASSERT(view_src == nullptr)) so we never
+    // need to patch ggml.
+    static bool is_row_split_eligible(const ggml_tensor* t) {
+        if (t->view_src != nullptr) return false;
+        if (!ggml_is_contiguous(t)) return false;
+        if (ggml_n_dims(t) != 2) return false;
+        if (t->ne[0] < 256 || t->ne[1] < 256) return false;  // NOTE(review): shape-only test; a large 2D embedding table also passes -- confirm intended
+        return true;
+    }
+
+    // ROW_SPLIT: matmul-eligible weights -> row_split_buft (split row-wise
+    // across GPUs by the CUDA/SYCL backend), everything else -> the main GPU's
+    // default buft. Each is allocated ONCE into a single resident buffer and
+    // suballocated via ggml_tallocr — no per-tensor churn, no free->realloc.
+    bool alloc_params_buffer_row_split() {
+        if (row_split_buft == nullptr) {
+            LOG_ERROR("%s row-split buft not initialized (backend lacks ggml_backend_split_buffer_type)",
+                      get_desc().c_str());
+            return false;
+        }
+        ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend);
+        const size_t main_align  = ggml_backend_buft_get_alignment(main_buft);
+        const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft);
+
+        size_t main_size = 0, split_size = 0;  // bytes; each tensor's alloc size is padded to its buft's alignment
+        size_t main_count = 0, split_count = 0;
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {  // pass 1: size both buffers
+            if (is_row_split_eligible(t)) {
+                split_size += GGML_PAD(ggml_backend_buft_get_alloc_size(row_split_buft, t), split_align);
+                split_count++;
+            } else {
+                main_size += GGML_PAD(ggml_backend_buft_get_alloc_size(main_buft, t), main_align);
+                main_count++;
+            }
+        }
+
+        if (main_size > 0) {
+            row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size);
+            if (row_main_buffer == nullptr) {
+                LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", get_desc().c_str(), main_size / (1024.f * 1024.f));
+                return false;
+            }
+        }
+        if (split_size > 0) {
+            row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size);
+            if (row_split_buffer == nullptr) {
+                LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", get_desc().c_str(), split_size / (1024.f * 1024.f));
+                return false;  // row_main_buffer (if allocated) is not released here; free_params_buffer() frees it
+            }
+        }
+
+        ggml_tallocr main_alloc{};
+        ggml_tallocr split_alloc{};
+        if (row_main_buffer != nullptr) main_alloc = ggml_tallocr_new(row_main_buffer);
+        if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer);
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {  // pass 2: place tensors, same classification as pass 1
+            ggml_status st = is_row_split_eligible(t) ? ggml_tallocr_alloc(&split_alloc, t) : ggml_tallocr_alloc(&main_alloc, t);
+            if (st != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s row-split tallocr_alloc failed", get_desc().c_str());
+                return false;  // both buffers are not released here; free_params_buffer() frees them
+            }
+        }
+        if (row_main_buffer != nullptr) ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        if (row_split_buffer != nullptr) ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        rebuild_params_tensor_set();
+        LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)",
+                 get_desc().c_str(), main_size / (1024.f * 1024.f), main_count, split_size / (1024.f * 1024.f), split_count);
+        return true;
+    }
+
+    // LAYER_SPLIT: assign each param tensor to a backend (via tensor_backend_fn,
+    // keyed by tensor pointer), allocate one resident buffer per backend on its
+    // default buft, and suballocate via ggml_tallocr.
+    bool alloc_params_buffer_layer_split() {
+        std::vector<ggml_backend_t> backends;
+        backends.push_back(runtime_backend);  // index 0 = runtime backend, which is also the fallback target
+        for (auto* b : additional_backends) backends.push_back(b);
+
+        std::vector<ggml_backend_buffer_type_t> bufts(backends.size());
+        std::vector<size_t> aligns(backends.size());
+        std::vector<size_t> sizes(backends.size(), 0);
+        std::vector<size_t> counts(backends.size(), 0);
+        for (size_t i = 0; i < backends.size(); i++) {
+            bufts[i]  = ggml_backend_get_default_buffer_type(backends[i]);
+            aligns[i] = ggml_backend_buft_get_alignment(bufts[i]);
+        }
+
+        std::map<ggml_tensor*, int> tensor_backend_idx;
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            int idx = 0;  // stays 0 when there is no fn, the fn returns nullptr, or the target is not in `backends`
+            if (tensor_backend_fn) {
+                ggml_backend_t target = tensor_backend_fn(t);
+                if (target != nullptr) {
+                    for (size_t i = 0; i < backends.size(); i++) {
+                        if (backends[i] == target) { idx = int(i); break; }
+                    }
+                }
+            }
+            tensor_backend_idx[t] = idx;
+            sizes[idx] += GGML_PAD(ggml_backend_buft_get_alloc_size(bufts[idx], t), aligns[idx]);
+            counts[idx] += 1;
+        }
+
+        multi_params_buffers.assign(backends.size(), nullptr);
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (sizes[i] == 0) continue;  // nothing assigned to this backend; its slot stays nullptr
+            ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]);
+            size_t free_pre = 0, total_pre = 0;
+            if (dev) ggml_backend_dev_memory(dev, &free_pre, &total_pre);  // before/after free memory is only used for the debug log below
+            multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]);
+            if (multi_params_buffers[i] == nullptr) {
+                LOG_ERROR("%s layer-split alloc on %s failed (%.1f MB)", get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f));
+                return false;  // buffers allocated so far stay in multi_params_buffers; free_params_buffer() frees them
+            }
+            size_t free_post = 0, total_post = 0;
+            if (dev) ggml_backend_dev_memory(dev, &free_post, &total_post);
+            LOG_DEBUG("%s layer-split alloc[%zu] %s req=%.1f MB dev_free %.1f -> %.1f MB is_host=%d",
+                      get_desc().c_str(), i, ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f),
+                      free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f),
+                      (int)ggml_backend_buffer_is_host(multi_params_buffers[i]));
+        }
+
+        std::vector<ggml_tallocr> tallocs(backends.size());
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (multi_params_buffers[i] != nullptr) tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]);
+        }
+        for (auto& kv : tensor_backend_idx) {  // std::map iterates in pointer order, not params_ctx order
+            if (ggml_tallocr_alloc(&tallocs[kv.second], kv.first) != GGML_STATUS_SUCCESS) {
+                LOG_ERROR("%s layer-split tallocr_alloc failed", get_desc().c_str());
+                return false;
+            }
+        }
+        for (auto* buf : multi_params_buffers) {
+            if (buf != nullptr) ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        }
+        rebuild_params_tensor_set();
+        for (size_t i = 0; i < backends.size(); i++) {
+            if (counts[i] == 0) continue;
+            LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)",
+                     get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), counts[i]);
+        }
+        return true;
+    }
+
+    // Lazy mode: defer alloc + tensor-data load until the first compute().
+    // The caller still runs alloc_params_buffer + get_param_tensors at init,
+    // but for a lazy runner alloc_params_buffer is a no-op and the bulk loader
+    // skips this runner's tensors (they have no buffer yet); ensure_params_loaded()
+    // then allocates and invokes lazy_load_fn() on demand, and the params are
+    // freed after the phase (free_params_immediately) so components time-share VRAM.
+    void set_lazy_load(std::function<bool()> fn) {
+        lazy_load_fn = std::move(fn);  // an empty std::function leaves the runner non-lazy (callers test `if (lazy_load_fn)`)
+    }
+
+    // True once a (non-lazy) buffer exists OR a lazy load has materialized one.
+    bool params_loaded() const {
+        return params_buffer != nullptr || !multi_params_buffers.empty() ||
+               row_split_buffer != nullptr || row_main_buffer != nullptr;  // mmap-resident params own no buffer here, so they report false
+    }
+
+    // Make this runner's params resident before compute(): no-op when already
+    // loaded or non-lazy, otherwise allocate + run lazy_load_fn. False on failure.
+    bool ensure_params_loaded() {
+        if (params_loaded()) return true;
+        if (!lazy_load_fn) {
+            // Non-lazy runner with no buffer: either it had no tensors, or its
+            // params are mmap-resident (data already set). Nothing to do.
+            return true;
+        }
+        int64_t t0 = ggml_time_ms();
+        // Roll back on failure: a partial allocation or an unfilled buffer would
+        // make params_loaded() true and let the next compute() run on bad weights.
+        if (!do_alloc_params_buffer() || !lazy_load_fn()) {
+            LOG_ERROR("%s: lazy params load failed", get_desc().c_str());
+            free_params_buffer();
+            return false;
+        }
+        LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (ggml_time_ms() - t0) / 1000.f);
+        return true;
+    }
+
+    bool alloc_params_buffer() {
+        // Defer to first compute() for lazy runners (see set_lazy_load).
+        if (lazy_load_fn) {
+            return true;  // reports success without allocating; ensure_params_loaded() allocates later
+        }
+        return do_alloc_params_buffer();
+    }
+
+    // Allocate the params buffer now: split buffers in multi-backend mode, none
+    // when every tensor already has data (mmap), else one buffer. False on failure.
+    bool do_alloc_params_buffer() {
+        if (multi_backend_mode) {
+            // Split allocation bypasses the mmap fast-path: the params must land
+            // in the GPU split buffers, not stay mmap'd.
+            if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) {
+                return alloc_params_buffer_row_split();
+            }
+            return alloc_params_buffer_layer_split();
+        }
+        size_t num_tensors = ggml_tensor_num(params_ctx);
+        if (num_tensors > 0) {
+            // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated
+            // (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch.
+            bool all_have_data = true;
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+                if (t->data == nullptr) {
+                    all_have_data = false;
+                    break;
+                }
+            }
+            if (all_have_data) {
+                LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str());
+                params_buffer = nullptr;
+                rebuild_params_tensor_set();
+                return true;
+            }
+        } else {
+            LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
+            return true;
+        }
+        // Pinned host buffer when CPU-offloaded for DMA-direct H2D.
+        ggml_backend_buffer_type_t params_buft = nullptr;
+        if (params_backend != runtime_backend) {
+            ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend);
+            if (runtime_dev != nullptr) {
+                params_buft = ggml_backend_dev_host_buffer_type(runtime_dev);
+            }
+        }
+        if (params_buft == nullptr) {
+            params_buft = ggml_backend_get_default_buffer_type(params_backend);
+        }
+        params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft);
+        if (params_buffer == nullptr) {
+            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %zu",
+                      get_desc().c_str(), num_tensors);
+            return false;
+        }
+        rebuild_params_tensor_set();
+        ggml_backend_buffer_set_usage(params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%zu tensors)",
+                  get_desc().c_str(),
+                  params_buffer_size / (1024.f * 1024.f),
+                  sd_backend_is_cpu(params_backend) ? "RAM" : "VRAM", num_tensors);
+        return true;
+    }
+
+    void free_params_buffer() {  // safe to call repeatedly: every handle is null-checked and reset
+        if (params_buffer != nullptr) {
+            ggml_backend_buffer_free(params_buffer);
+            params_buffer = nullptr;
+        }
+        // Multi-GPU split buffers (layer-split: one per backend; row-split:
+        // split + main). The split buft itself is backend-cached, not freed.
+        for (auto* buf : multi_params_buffers) {
+            if (buf != nullptr) {
+                ggml_backend_buffer_free(buf);
+            }
+        }
+        multi_params_buffers.clear();  // empty again, so params_loaded() no longer sees layer-split buffers
+        if (row_split_buffer != nullptr) {
+            ggml_backend_buffer_free(row_split_buffer);
+            row_split_buffer = nullptr;
+        }
+        if (row_main_buffer != nullptr) {
+            ggml_backend_buffer_free(row_main_buffer);
+            row_main_buffer = nullptr;
+        }
+        // Release the multi-backend scheduler as well. Its reserved compute
+        // buffers can be GBs on each device, and free_compute_buffer only
+        // sched_reset()s them (kept alive across the sampling loop to avoid a
+        // per-step rebuild). free_params_buffer is the end-of-phase release, so
+        // here we actually free the sched so the next component can claim that
+        // VRAM (time-share). It is recreated lazily on the next compute().
+        if (sched != nullptr) {
+            ggml_backend_sched_free(sched);
+            sched          = nullptr;
+            sched_reserved = false;
+        }
+        observed_max_effective_budget_ = 0;  // NOTE(review): presumably a per-phase high-water mark -- confirm
+    }
+
+    size_t get_params_buffer_size() {  // total bytes over whichever params buffers currently exist; 0 when none
+        size_t total = 0;
+        if (params_buffer != nullptr) {
+            total += ggml_backend_buffer_get_size(params_buffer);
+        }
+        for (auto* buf : multi_params_buffers) {  // layer-split: one slot per backend, unused slots are nullptr
+            if (buf != nullptr) {
+                total += ggml_backend_buffer_get_size(buf);
+            }
+        }
+        if (row_split_buffer != nullptr) {
+            total += ggml_backend_buffer_get_size(row_split_buffer);
+        }
+        if (row_main_buffer != nullptr) {
+            total += ggml_backend_buffer_get_size(row_main_buffer);
+        }
+        return total;
+    }
+
 public:
     void free_cache_ctx_and_buffer() {
         free_cache_buffer();
@@ -2681,10 +3247,23 @@ struct GGMLRunner {
             ggml_gallocr_free(compute_allocr);
             compute_allocr = nullptr;
         }
+        if (sched != nullptr) {
+            // Reset (not free): keeping the sched alive across the sampling
+            // loop's compute() calls avoids a per-step rebuild. It is freed in
+            // free_params_buffer() (end of phase) and in the destructor.
+            ggml_backend_sched_reset(sched);
+            sched_reserved = false;
+        }
     }
 
     // do copy after alloc graph
     void set_backend_tensor_data(ggml_tensor* tensor, const void* data) {
+        // In multi-backend mode, sched needs the tensor flagged as input so it
+        // gets a concrete backend assignment (tensors with no producers and no
+        // consumers otherwise stay at backend_id = -1 and never get a buffer).
+        if (multi_backend_mode) {
+            ggml_set_input(tensor);
+        }
         backend_tensor_data_map[tensor] = data;  // records the pointer only; no copy happens in this call
     }
 
@@ -2768,6 +3347,11 @@ struct GGMLRunner {
         };
         RunnerDoneGuard runner_done_guard(this, auto_free);
 
+        // Lazy split runners allocate and load params on first use of the phase.
+        if (!ensure_params_loaded()) {
+            return std::nullopt;
+        }
+
         ggml_cgraph* gf = nullptr;
         if (!prepare_compute_graph(get_graph, &gf)) {
             return std::nullopt;
@@ -2815,6 +3399,10 @@ struct GGMLRunner {
         weight_adapter = adapter;
     }
 
+    void set_weight_manager(std::shared_ptr<RunnerWeightManager> manager) {
+        weight_manager = std::move(manager);  // replaces the manager given at construction
+    }
+
     void set_max_graph_vram_bytes(size_t max_vram_bytes) {
         max_graph_vram_bytes = max_vram_bytes;
     }
@@ -2822,6 +3410,53 @@ struct GGMLRunner {
     void set_stream_layers_enabled(bool enabled) {
         stream_layers_enabled = enabled;
     }
+
+    // Configure a multi-GPU split for this runner. Must be called AFTER
+    // construction + get_param_tensors() and BEFORE alloc_params_buffer().
+    // For ROW_SPLIT, resolves the backend's stock split buffer type; if the
+    // backend has none (non-CUDA/SYCL), it cleanly falls back to single-GPU.
+    void set_multi_backend_spec(const MultiBackendSpec& spec) {
+        if (params_buffer != nullptr || !multi_params_buffers.empty() ||
+            row_split_buffer != nullptr || row_main_buffer != nullptr) {  // same condition as params_loaded()
+            LOG_ERROR("%s set_multi_backend_spec called after params were allocated; ignoring",
+                      get_desc().c_str());
+            return;
+        }
+        multi_backend_mode  = true;
+        multi_backend_kind  = spec.mode;
+        additional_backends = spec.additional_backends;
+        tensor_backend_fn   = spec.tensor_backend_fn;
+        row_split_ratios    = spec.tensor_split_ratios;
+        row_main_device     = spec.main_device;
+        if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) {
+            row_split_buft = sd_backend_split_buffer_type(
+                runtime_backend,
+                row_main_device,
+                row_split_ratios.empty() ? nullptr : row_split_ratios.data());  // empty ratios -> nullptr is passed through
+            if (row_split_buft == nullptr) {
+                LOG_WARN("%s row-split unavailable on this backend; falling back to single-GPU",
+                         get_desc().c_str());
+                multi_backend_mode = false;  // multi_backend_kind, row_split_ratios and row_main_device keep the spec's values
+                additional_backends.clear();
+                tensor_backend_fn = nullptr;
+                return;  // early return: stream_layers_enabled is left untouched on fallback
+            }
+        }
+        // Streaming (graph-cut param offload) is mutually exclusive with split.
+        stream_layers_enabled = false;
+    }
+
+    bool is_multi_backend() const {  // true only while a split spec is active (cleared again on row-split fallback)
+        return multi_backend_mode;
+    }
+
+    ggml_backend_t get_runtime_backend() {  // non-owning handle; the backend passed to the constructor
+        return runtime_backend;
+    }
+
+    ggml_backend_t get_params_backend() {  // non-owning handle; initialized to the constructor's backend
+        return params_backend;
+    }
 };
 
 class GGMLBlock {
diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp
index f3e2cceba..73a28df10 100644
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@@ -544,6 +544,10 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
     return init_cached_backend(name);
 }
 
+ggml_backend_t SDBackendManager::ensure_backend(const std::string& device_name) {
+    return init_cached_backend(device_name);  // thin public wrapper; the result is passed through unchanged
+}
+
 bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) {
     return sd_backend_is_cpu(runtime_backend(module));
 }
@@ -687,3 +691,22 @@ const char* sd_backend_module_name(SDBackendModule module) {
     }
     return "unknown";
 }
+
+ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) {
+    if (backend == nullptr) {
+        return nullptr;
+    }
+    ggml_backend_dev_t dev = ggml_backend_get_device(backend);  // walk backend -> device -> registry; any missing link yields nullptr
+    if (dev == nullptr) {
+        return nullptr;
+    }
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    if (reg == nullptr) {
+        return nullptr;
+    }
+    auto fn = (ggml_backend_split_buffer_type_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");  // looked up by name at runtime, no link-time dependency
+    if (fn == nullptr) {
+        return nullptr;  // backend has no row-split support (non-CUDA/SYCL)
+    }
+    return fn(main_device, tensor_split);  // tensor_split is forwarded as-is (may be nullptr)
+}
diff --git a/src/core/ggml_extend_backend.h b/src/core/ggml_extend_backend.h
index 9aecf97c0..d1a492750 100644
--- a/src/core/ggml_extend_backend.h
+++ b/src/core/ggml_extend_backend.h
@@ -57,6 +57,12 @@ class SDBackendManager {
     ggml_backend_t runtime_backend(SDBackendModule module);
     ggml_backend_t params_backend(SDBackendModule module);
 
+    // Return (creating + caching on first use) the backend for an explicit
+    // ggml device name (e.g. "CUDA1"). Used to obtain the additional GPU
+    // backends a multi-GPU split needs; the manager owns the handle and frees
+    // it once at teardown, so callers only borrow it.
+    ggml_backend_t ensure_backend(const std::string& device_name);
+
     bool runtime_backend_is_cpu(SDBackendModule module);
     bool params_backend_is_cpu(SDBackendModule module);
     bool params_backend_is_disk(SDBackendModule module) const;
@@ -74,5 +80,13 @@ bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 std::string sd_backend_resolve_name(const std::string& name);
 const char* sd_backend_module_name(SDBackendModule module);
 void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
+
+// Runtime lookup of a backend's row-split buffer type, published by the CUDA
+// and SYCL backends as the "ggml_backend_split_buffer_type" proc. Returns
+// nullptr when the backend does not support row-split (the caller then falls
+// back to a non-split single-GPU path). `tensor_split` is a per-device weight
+// array of length = the backend registry's device count; `main_device` is the
+// index of the device that owns the non-split portion.
+ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split);
 bool add_rpc_devices(const std::string& servers);
 #endif  // __SD_CORE_GGML_EXTEND_BACKEND_H__
diff --git a/src/core/util.cpp b/src/core/util.cpp
index 7325607e0..b10e53ed7 100644
--- a/src/core/util.cpp
+++ b/src/core/util.cpp
@@ -25,6 +25,7 @@
 #include <unistd.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml.h"
 #include "stable-diffusion.h"
 
@@ -972,3 +973,12 @@ std::vector<std::pair<std::string, float>> split_quotation_attention(
     }
     return result;
 }
+
+void sd_list_devices(void) {  // prints one "name<TAB>description" line per registered ggml device to stdout
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        const char* name       = ggml_backend_dev_name(dev);
+        const char* desc       = ggml_backend_dev_description(dev);
+        printf("%s\t%s\n", name ? name : "", desc ? desc : "");  // null strings are printed as empty fields
+    }
+}
diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp
index b89ff32c6..8ef2ffe81 100644
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
@@ -1606,8 +1606,13 @@ namespace LTXV {
             if (config.cross_attention_adaln) {
                 auto prompt_adaln_single       = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["prompt_adaln_single"]);
                 auto audio_prompt_adaln_single = std::dynamic_pointer_cast<AdaLayerNormSingle>(blocks["audio_prompt_adaln_single"]);
-                v_prompt_timestep_mod          = prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
-                a_prompt_timestep_mod          = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
+                // The reference feeds modality.sigma (the RAW per-batch sigma) to
+                // both prompt adalns. effective_audio_timestep is exactly that:
+                // audio timesteps are never denoise-masked, so it carries the
+                // unmasked sigma even in i2v. The VIDEO timestep tensor is the
+                // denoise-masked per-token one and must NOT be used here.
+                v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
+                a_prompt_timestep_mod = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first;
             }
 
             auto av_ca_video_timestep = repeat_scalar_timestep_like(ctx, effective_audio_timestep, timestep);
diff --git a/src/model/te/llm.hpp b/src/model/te/llm.hpp
index 74dc232e5..685af7502 100644
--- a/src/model/te/llm.hpp
+++ b/src/model/te/llm.hpp
@@ -1,4 +1,4 @@
-﻿#ifndef __SD_MODEL_TE_LLM_HPP__
+#ifndef __SD_MODEL_TE_LLM_HPP__
 #define __SD_MODEL_TE_LLM_HPP__
 
 #include <algorithm>
diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp
index 23da08222..745442aee 100644
--- a/src/model/te/t5.hpp
+++ b/src/model/te/t5.hpp
@@ -1,608 +1,608 @@
-﻿#ifndef __SD_MODEL_TE_T5_HPP__
-#define __SD_MODEL_TE_T5_HPP__
-
-#include <cfloat>
-#include <limits>
-#include <map>
-#include <memory>
-#include <regex>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-
-#include "core/ggml_extend.hpp"
-#include "model_loader.h"
-#include "model_manager.h"
-#include "tokenizers/t5_unigram_tokenizer.h"
-
-struct T5Config {
-    int64_t num_layers      = 24;
-    int64_t model_dim       = 4096;
-    int64_t ff_dim          = 10240;
-    int64_t num_heads       = 64;
-    int64_t vocab_size      = 32128;
-    bool relative_attention = true;
-
-    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
-                                        const std::string& prefix,
-                                        bool is_umt5 = false) {
-        (void)tensor_storage_map;
-        (void)prefix;
-        T5Config config;
-        if (is_umt5) {
-            config.vocab_size         = 256384;
-            config.relative_attention = false;
-        }
-        return config;
-    }
-};
-
-class T5LayerNorm : public UnaryBlock {
-protected:
-    int64_t hidden_size;
-    float eps;
-
-    void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
-        enum ggml_type wtype = GGML_TYPE_F32;
-        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
-    }
-
-public:
-    T5LayerNorm(int64_t hidden_size,
-                float eps = 1e-06f)
-        : hidden_size(hidden_size),
-          eps(eps) {}
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        ggml_tensor* w = params["weight"];
-        x              = ggml_rms_norm(ctx->ggml_ctx, x, eps);
-        x              = ggml_mul(ctx->ggml_ctx, x, w);
-        return x;
-    }
-};
-
-struct T5DenseActDense : public UnaryBlock {
-public:
-    T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
-        blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        // x: [N, n_token, model_dim]
-        auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
-        auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
-
-        x = wi->forward(ctx, x);
-        x = ggml_relu_inplace(ctx->ggml_ctx, x);
-        x = wo->forward(ctx, x);
-        return x;
-    }
-};
-
-struct T5DenseGatedActDense : public UnaryBlock {
-public:
-    T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
-        blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        float scale    = 1.f / 32.f;
-        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
-        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        // x: [N, n_token, model_dim]
-        auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
-        auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
-        auto wo   = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
-
-        auto hidden_gelu   = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true);
-        auto hidden_linear = wi_1->forward(ctx, x);
-        x                  = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);
-        x                  = wo->forward(ctx, x);
-        return x;
-    }
-};
-
-struct T5LayerFF : public UnaryBlock {
-public:
-    T5LayerFF(int64_t model_dim, int64_t ff_dim) {
-        blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim));
-        blocks["layer_norm"]     = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
-        // x: [N, n_token, model_dim]
-        auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
-        auto layer_norm     = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
-
-        auto forwarded_states = layer_norm->forward(ctx, x);
-        forwarded_states      = DenseReluDense->forward(ctx, forwarded_states);
-        x                     = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x);
-        return x;
-    }
-};
-
-class T5Attention : public GGMLBlock {
-protected:
-    int64_t model_dim;
-    int64_t inner_dim;
-    int64_t num_heads;
-    bool using_relative_attention_bias;
-    int64_t relative_attention_num_buckets  = 32;
-    int64_t relative_attention_max_distance = 128;
-
-public:
-    T5Attention(int64_t model_dim,
-                int64_t inner_dim,
-                int64_t num_heads,
-                bool using_relative_attention_bias = false)
-        : model_dim(model_dim),
-          inner_dim(inner_dim),
-          num_heads(num_heads),
-          using_relative_attention_bias(using_relative_attention_bias) {
-        blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
-        blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
-        blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
-        blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
-        if (using_relative_attention_bias) {
-            blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
-        }
-    }
-
-    ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
-                              ggml_tensor* relative_position_bucket) {
-        auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
-
-        auto values = relative_attention_bias->forward(ctx, relative_position_bucket);            // shape (query_length, key_length, num_heads)
-        values      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3));  // shape (1, num_heads, query_length, key_length)
-        return values;
-    }
-
-    // x: [N, n_token, model_dim]
-    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                  ggml_tensor* x,
-                                                  ggml_tensor* past_bias                = nullptr,
-                                                  ggml_tensor* mask                     = nullptr,
-                                                  ggml_tensor* relative_position_bucket = nullptr) {
-        auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks["q"]);
-        auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks["k"]);
-        auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks["v"]);
-        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);
-
-        int64_t n_head = num_heads;
-        int64_t d_head = inner_dim / n_head;
-
-        auto q = q_proj->forward(ctx, x);
-        auto k = k_proj->forward(ctx, x);
-        auto v = v_proj->forward(ctx, x);
-
-        if (using_relative_attention_bias && relative_position_bucket != nullptr) {
-            past_bias = compute_bias(ctx, relative_position_bucket);
-        }
-        if (past_bias != nullptr) {
-            if (mask != nullptr) {
-                mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);
-                mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
-            } else {
-                mask = past_bias;
-            }
-        }
-
-        k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast<float>(d_head)), true);
-
-        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
-
-        x = out_proj->forward(ctx, x);  // [N, n_token, model_dim]
-        return {x, past_bias};
-    }
-};
-
-struct T5LayerSelfAttention : public GGMLBlock {
-public:
-    T5LayerSelfAttention(int64_t model_dim,
-                         int64_t inner_dim,
-                         int64_t ff_dim,
-                         int64_t num_heads,
-                         bool using_relative_attention_bias) {
-        blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
-        blocks["layer_norm"]    = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
-    }
-
-    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                  ggml_tensor* x,
-                                                  ggml_tensor* past_bias                = nullptr,
-                                                  ggml_tensor* mask                     = nullptr,
-                                                  ggml_tensor* relative_position_bucket = nullptr) {
-        // x: [N, n_token, model_dim]
-        auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
-        auto layer_norm    = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
-
-        auto normed_hidden_state = layer_norm->forward(ctx, x);
-        auto ret                 = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
-        auto output              = ret.first;
-        past_bias                = ret.second;
-
-        x = ggml_add_inplace(ctx->ggml_ctx, output, x);
-        return {x, past_bias};
-    }
-};
-
-struct T5Block : public GGMLBlock {
-public:
-    T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
-        blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
-        blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
-    }
-
-    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
-                                                  ggml_tensor* x,
-                                                  ggml_tensor* past_bias                = nullptr,
-                                                  ggml_tensor* mask                     = nullptr,
-                                                  ggml_tensor* relative_position_bucket = nullptr) {
-        // x: [N, n_token, model_dim]
-        auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
-        auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
-
-        auto ret  = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
-        x         = ret.first;
-        past_bias = ret.second;
-        x         = layer_1->forward(ctx, x);
-        return {x, past_bias};
-    }
-};
-
-struct T5Stack : public GGMLBlock {
-    int64_t num_layers;
-
-public:
-    T5Stack(int64_t num_layers,
-            int64_t model_dim,
-            int64_t inner_dim,
-            int64_t ff_dim,
-            int64_t num_heads,
-            bool relative_attention = true)
-        : num_layers(num_layers) {
-        for (int i = 0; i < num_layers; i++) {
-            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0)));
-        }
-
-        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* x,
-                         ggml_tensor* past_bias                = nullptr,
-                         ggml_tensor* attention_mask           = nullptr,
-                         ggml_tensor* relative_position_bucket = nullptr,
-                         const std::string& graph_cut_prefix   = "") {
-        // x: [N, n_token, model_dim]
-        for (int i = 0; i < num_layers; i++) {
-            auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
-
-            auto ret  = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
-            x         = ret.first;
-            past_bias = ret.second;
-            if (!graph_cut_prefix.empty()) {
-                sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".block." + std::to_string(i), "x");
-            }
-        }
-
-        auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
-
-        x = final_layer_norm->forward(ctx, x);
-        return x;
-    }
-};
-
-struct T5 : public GGMLBlock {
-    T5Config config;
-
-public:
-    T5() {}
-    T5(T5Config config)
-        : config(config) {
-        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(config.num_layers,
-                                                                   config.model_dim,
-                                                                   config.model_dim,
-                                                                   config.ff_dim,
-                                                                   config.num_heads,
-                                                                   config.relative_attention));
-        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(config.vocab_size,
-                                                                     config.model_dim));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* input_ids,
-                         ggml_tensor* past_bias                = nullptr,
-                         ggml_tensor* attention_mask           = nullptr,
-                         ggml_tensor* relative_position_bucket = nullptr) {
-        // input_ids: [N, n_token]
-
-        auto shared  = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
-        auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
-
-        auto x = shared->forward(ctx, input_ids);
-        sd::ggml_graph_cut::mark_graph_cut(x, "t5.prelude", "x");
-        x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket, "t5");
-        return x;
-    }
-};
-
-struct T5Runner : public GGMLRunner {
-    T5Config config;
-    T5 model;
-    std::vector<int> relative_position_bucket_vec;
-
-    T5Runner(ggml_backend_t backend,
-             const String2TensorStorage& tensor_storage_map,
-             const std::string prefix,
-             bool is_umt5                                        = false,
-             std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : GGMLRunner(backend, weight_manager),
-          config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
-        model = T5(config);
-        model.init(params_ctx, tensor_storage_map, prefix);
-    }
-
-    std::string get_desc() override {
-        return "t5";
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
-        model.get_param_tensors(tensors, prefix);
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor* input_ids,
-                         ggml_tensor* relative_position_bucket,
-                         ggml_tensor* attention_mask = nullptr) {
-        size_t N       = input_ids->ne[1];
-        size_t n_token = input_ids->ne[0];
-
-        auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
-        return hidden_states;
-    }
-
-    ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
-                             const sd::Tensor<float>& attention_mask_tensor = {}) {
-        ggml_cgraph* gf             = ggml_new_graph(compute_ctx);
-        ggml_tensor* input_ids      = make_input(input_ids_tensor);
-        ggml_tensor* attention_mask = attention_mask_tensor.empty() ? nullptr : make_input(attention_mask_tensor);
-
-        relative_position_bucket_vec = compute_relative_position_bucket(static_cast<int>(input_ids->ne[0]), static_cast<int>(input_ids->ne[0]));
-
-        // for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
-        //     if (i % 77 == 0) {
-        //         printf("\n");
-        //     }
-        //     printf("%d ", relative_position_bucket_vec[i]);
-        // }
-
-        auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
-                                                           GGML_TYPE_I32,
-                                                           input_ids->ne[0],
-                                                           input_ids->ne[0]);
-        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
-
-        auto runner_ctx            = get_context();
-        ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
-
-        ggml_build_forward_expand(gf, hidden_states);
-
-        return gf;
-    }
-
-    sd::Tensor<float> compute(const int n_threads,
-                              const sd::Tensor<int32_t>& input_ids,
-                              const sd::Tensor<float>& attention_mask,
-                              bool auto_free           = true,
-                              bool free_compute_buffer = true,
-                              bool free_compute_params = true) {
-        auto get_graph = [&]() -> ggml_cgraph* {
-            return build_graph(input_ids, attention_mask);
-        };
-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params), 3);
-    }
-
-    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
-                                                      bool bidirectional = true,
-                                                      int num_buckets    = 32,
-                                                      int max_distance   = 128) {
-        std::vector<int> relative_buckets(relative_position.size(), 0);
-        std::vector<int> abs_relative_position = relative_position;
-
-        if (bidirectional) {
-            num_buckets = num_buckets / 2;
-            for (size_t i = 0; i < relative_position.size(); ++i) {
-                if (relative_position[i] > 0) {
-                    relative_buckets[i] += num_buckets;
-                }
-                abs_relative_position[i] = std::abs(relative_position[i]);
-            }
-        } else {
-            for (size_t i = 0; i < relative_position.size(); ++i) {
-                abs_relative_position[i] = std::max(-relative_position[i], 0);
-            }
-        }
-
-        int max_exact = num_buckets / 2;
-        std::vector<int> relative_position_if_large(relative_position.size(), 0);
-
-        for (size_t i = 0; i < relative_position.size(); ++i) {
-            if (abs_relative_position[i] < max_exact) {
-                relative_buckets[i] += abs_relative_position[i];
-            } else {
-                float log_pos                 = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
-                float log_base                = std::log(static_cast<float>(max_distance) / max_exact);
-                relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
-                relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1);
-                relative_buckets[i] += relative_position_if_large[i];
-            }
-        }
-
-        return relative_buckets;
-    }
-
-    std::vector<int> compute_relative_position_bucket(int query_length,
-                                                      int key_length) {
-        std::vector<int> context_position(query_length);
-        std::vector<int> memory_position(key_length);
-
-        for (int i = 0; i < query_length; ++i) {
-            context_position[i] = i;
-        }
-        for (int i = 0; i < key_length; ++i) {
-            memory_position[i] = i;
-        }
-
-        std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0));
-        for (int i = 0; i < query_length; ++i) {
-            for (int j = 0; j < key_length; ++j) {
-                relative_position[i][j] = memory_position[j] - context_position[i];
-            }
-        }
-
-        std::vector<int> relative_position_bucket;
-        for (int i = 0; i < query_length; ++i) {
-            std::vector<int> result = _relative_position_bucket(relative_position[i], true);
-            relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end());
-        }
-
-        return relative_position_bucket;
-    }
-};
-
-struct T5Embedder {
-    T5UniGramTokenizer tokenizer;
-    T5Runner model;
-
-    T5Embedder(ggml_backend_t backend,
-               const String2TensorStorage& tensor_storage_map      = {},
-               const std::string prefix                            = "",
-               bool is_umt5                                        = false,
-               std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
-        : model(backend, tensor_storage_map, prefix, is_umt5, weight_manager), tokenizer(is_umt5) {
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
-        model.get_param_tensors(tensors, prefix);
-    }
-
-    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
-                                                                                  size_t max_length = 0,
-                                                                                  bool padding      = false) {
-        auto parsed_attention = parse_prompt_attention(text);
-
-        {
-            std::stringstream ss;
-            ss << "[";
-            for (const auto& item : parsed_attention) {
-                ss << "['" << item.first << "', " << item.second << "], ";
-            }
-            ss << "]";
-            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
-        }
-
-        std::vector<int> tokens;
-        std::vector<float> weights;
-        for (const auto& item : parsed_attention) {
-            const std::string& curr_text = item.first;
-            float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer.encode(curr_text);
-            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
-            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
-        }
-
-        std::vector<float> attention_mask;
-
-        tokenizer.pad_tokens(tokens, &weights, &attention_mask, padding ? max_length : 0, padding ? max_length : 100000000, padding);
-        for (auto& mask_value : attention_mask) {
-            mask_value = mask_value > 0.0f ? 0.0f : -HUGE_VALF;
-        }
-
-        // for (int i = 0; i < tokens.size(); i++) {
-        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
-        // }
-        // std::cout << std::endl;
-
-        return {tokens, weights, attention_mask};
-    }
-
-    void test() {
-        ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-        params.mem_buffer = nullptr;
-        params.no_alloc   = false;
-
-        ggml_context* ctx = ggml_init(params);
-        GGML_ASSERT(ctx != nullptr);
-
-        {
-            std::string text("a lovely cat");
-            auto tokens_and_weights     = tokenize(text, 512, true);
-            std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
-            std::vector<float>& weights = std::get<1>(tokens_and_weights);
-            std::vector<float>& masks   = std::get<2>(tokens_and_weights);
-            for (auto token : tokens) {
-                printf("%d ", token);
-            }
-            printf("\n");
-            auto input_ids      = sd::Tensor<int32_t>::from_vector(tokens);
-            auto attention_mask = sd::Tensor<float>::from_vector(masks);
-            sd::Tensor<float> out;
-
-            int64_t t0   = ggml_time_ms();
-            auto out_opt = model.compute(8, input_ids, attention_mask);
-            int64_t t1   = ggml_time_ms();
-
-            GGML_ASSERT(!out_opt.empty());
-            out = std::move(out_opt);
-            print_sd_tensor(out);
-            LOG_DEBUG("t5 test done in %lldms", t1 - t0);
-        }
-    }
-
-    static void load_from_file_and_test(const std::string& file_path) {
-        // cpu f16: pass
-        // cpu f32: pass
-        // cuda f16: pass
-        // cuda f32: pass
-        // cuda q8_0: pass
-        // ggml_backend_t backend = ggml_backend_cuda_init(0);
-        ggml_backend_t backend    = sd_backend_cpu_init();
-        ggml_type model_data_type = GGML_TYPE_F16;
-
-        auto model_manager        = std::make_shared<ModelManager>();
-        ModelLoader& model_loader = model_manager->loader();
-        if (!model_loader.init_from_file_and_convert_name(file_path)) {
-            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
-            return;
-        }
-
-        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
-        for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (ends_with(name, "weight")) {
-                tensor_storage.expected_type = model_data_type;
-            }
-        }
-
-        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, tensor_storage_map, "", true, model_manager);
-
-        if (!model_manager->register_runner_params("T5 test",
-                                                   *t5,
-                                                   "",
-                                                   ModelManager::ResidencyMode::ParamBackend,
-                                                   backend,
-                                                   backend) ||
-            !model_manager->validate_registered_tensors()) {
-            LOG_ERROR("register t5 tensors with model manager failed");
-            return;
-        }
-
-        LOG_INFO("t5 model loaded");
-        t5->test();
-    }
-};
-
-#endif  // __SD_MODEL_TE_T5_HPP__
+#ifndef __SD_MODEL_TE_T5_HPP__
+#define __SD_MODEL_TE_T5_HPP__
+
+#include <cfloat>
+#include <limits>
+#include <map>
+#include <memory>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include "core/ggml_extend.hpp"
+#include "model_loader.h"
+#include "model_manager.h"
+#include "tokenizers/t5_unigram_tokenizer.h"
+
+struct T5Config {  // Hyperparameters consumed by the T5 constructor to build the encoder stack and token embedding.
+    int64_t num_layers      = 24;     // number of T5Block entries created by T5Stack
+    int64_t model_dim       = 4096;   // hidden width; T5 also passes it as T5Stack's inner_dim
+    int64_t ff_dim          = 10240;  // feed-forward width
+    int64_t num_heads       = 64;     // attention head count
+    int64_t vocab_size      = 32128;  // first argument of the "shared" Embedding
+    bool relative_attention = true;   // forwarded to T5Stack; when false every block builds its own bias table
+
+    static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map,  // Returns the defaults above, adjusted only by is_umt5.
+                                        const std::string& prefix,
+                                        bool is_umt5 = false) {
+        (void)tensor_storage_map;  // not inspected: nothing is actually read from the weights here
+        (void)prefix;              // not inspected either
+        T5Config config;
+        if (is_umt5) {
+            config.vocab_size         = 256384;  // UMT5 override of the default vocabulary size
+            config.relative_attention = false;   // UMT5 override; see the T5Stack constructor for its effect
+        }
+        return config;
+    }
+};
+
+class T5LayerNorm : public UnaryBlock {  // RMS-normalizes the input and multiplies it by the "weight" parameter; no bias is added.
+protected:
+    int64_t hidden_size;  // length of the 1-D "weight" parameter
+    float eps;            // epsilon handed to ggml_rms_norm
+
+    void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {  // Registers the single "weight" tensor.
+        enum ggml_type wtype = GGML_TYPE_F32;  // always F32; tensor_storage_map and prefix are not consulted
+        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
+    }
+
+public:
+    T5LayerNorm(int64_t hidden_size,  // Stores the sizes only; the tensor itself is created in init_params.
+                float eps = 1e-06f)
+        : hidden_size(hidden_size),
+          eps(eps) {}
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {  // Returns ggml_rms_norm(x, eps) multiplied by weight.
+        ggml_tensor* w = params["weight"];
+        x              = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+        x              = ggml_mul(ctx->ggml_ctx, x, w);
+        return x;
+    }
+};
+
+struct T5DenseActDense : public UnaryBlock {  // Ungated feed-forward: wi -> ReLU -> wo. T5LayerFF in this file uses the gated variant instead.
+public:
+    T5DenseActDense(int64_t model_dim, int64_t ff_dim) {  // Builds the up- and down-projection Linear layers.
+        blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));  // third argument presumably disables bias - TODO confirm against Linear
+        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {  // Returns wo(relu(wi(x))).
+        // x: [N, n_token, model_dim]
+        auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
+        auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+        x = wi->forward(ctx, x);
+        x = ggml_relu_inplace(ctx->ggml_ctx, x);  // in-place variant applied to the wi output
+        x = wo->forward(ctx, x);
+        return x;
+    }
+};
+
+struct T5DenseGatedActDense : public UnaryBlock {  // Gated feed-forward: wo(gelu(wi_0(x)) * wi_1(x)).
+public:
+    T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {  // Builds the two input projections and the scaled output projection.
+        blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));  // branch that goes through GELU
+        blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));  // branch that stays linear
+        float scale    = 1.f / 32.f;  // passed as the last Linear argument for "wo" only
+        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
+        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));  // meaning of the three flags is defined by Linear - TODO confirm
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {  // Returns the gated feed-forward output for x.
+        // x: [N, n_token, model_dim]
+        auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
+        auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
+        auto wo   = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+        auto hidden_gelu   = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true);  // trailing `true` is presumably an in-place flag - TODO confirm
+        auto hidden_linear = wi_1->forward(ctx, x);
+        x                  = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);  // element-wise gate of the two branches
+        x                  = wo->forward(ctx, x);
+        return x;
+    }
+};
+
+struct T5LayerFF : public UnaryBlock {  // Feed-forward sub-layer: x + DenseReluDense(layer_norm(x)).
+public:
+    T5LayerFF(int64_t model_dim, int64_t ff_dim) {  // Builds the norm and the gated feed-forward block.
+        blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim));  // key says "Relu" but the block built is the gated GELU variant
+        blocks["layer_norm"]     = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {  // Returns x plus the feed-forward of its normalized copy.
+        // x: [N, n_token, model_dim]
+        auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
+        auto layer_norm     = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
+
+        auto forwarded_states = layer_norm->forward(ctx, x);  // normalization happens before the feed-forward
+        forwarded_states      = DenseReluDense->forward(ctx, forwarded_states);
+        x                     = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x);  // residual connection with the un-normalized input
+        return x;
+    }
+};
+
+class T5Attention : public GGMLBlock {  // Self-attention over q/k/v/o Linear projections with an optional relative-position bias table.
+protected:
+    int64_t model_dim;  // input width of q/k/v and output width of o
+    int64_t inner_dim;  // output width of q/k/v; divided by num_heads to get the per-head size
+    int64_t num_heads;
+    bool using_relative_attention_bias;  // true when this instance owns a "relative_attention_bias" Embedding
+    int64_t relative_attention_num_buckets  = 32;   // first argument of the bias Embedding
+    int64_t relative_attention_max_distance = 128;  // not referenced anywhere inside this class
+
+public:
+    T5Attention(int64_t model_dim,  // Builds the four projections and, if requested, the bias table.
+                int64_t inner_dim,
+                int64_t num_heads,
+                bool using_relative_attention_bias = false)
+        : model_dim(model_dim),
+          inner_dim(inner_dim),
+          num_heads(num_heads),
+          using_relative_attention_bias(using_relative_attention_bias) {
+        blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
+        blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
+        blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
+        blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
+        if (using_relative_attention_bias) {
+            blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
+        }
+    }
+
+    ggml_tensor* compute_bias(GGMLRunnerContext* ctx,  // Looks up the bias for each bucket index and permutes it into a contiguous tensor.
+                              ggml_tensor* relative_position_bucket) {
+        auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);  // only created when using_relative_attention_bias is true; null otherwise
+
+        auto values = relative_attention_bias->forward(ctx, relative_position_bucket);            // shape (query_length, key_length, num_heads)
+        values      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3));  // shape (1, num_heads, query_length, key_length)
+        return values;
+    }
+
+    // x: [N, n_token, model_dim]
+    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,  // Returns {attention output, bias actually used} so callers can pass the bias on.
+                                                  ggml_tensor* x,
+                                                  ggml_tensor* past_bias                = nullptr,
+                                                  ggml_tensor* mask                     = nullptr,
+                                                  ggml_tensor* relative_position_bucket = nullptr) {
+        auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks["q"]);
+        auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks["k"]);
+        auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks["v"]);
+        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);
+
+        int64_t n_head = num_heads;
+        int64_t d_head = inner_dim / n_head;  // per-head width (integer division)
+
+        auto q = q_proj->forward(ctx, x);
+        auto k = k_proj->forward(ctx, x);
+        auto v = v_proj->forward(ctx, x);
+
+        if (using_relative_attention_bias && relative_position_bucket != nullptr) {
+            past_bias = compute_bias(ctx, relative_position_bucket);  // replaces any bias handed in by the caller
+        }
+        if (past_bias != nullptr) {
+            if (mask != nullptr) {
+                mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);  // expand the mask to the bias shape before summing
+                mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
+            } else {
+                mask = past_bias;  // no explicit mask: the bias alone is used as the mask
+            }
+        }
+
+        k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast<float>(d_head)), true);  // NOTE(review): presumably cancels a 1/sqrt(d_head) factor applied inside ggml_ext_attention_ext - confirm
+
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
+
+        x = out_proj->forward(ctx, x);  // [N, n_token, model_dim]
+        return {x, past_bias};
+    }
+};
+
+struct T5LayerSelfAttention : public GGMLBlock {  // Attention sub-layer: x + SelfAttention(layer_norm(x)).
+public:
+    T5LayerSelfAttention(int64_t model_dim,  // Builds the attention block and its norm.
+                         int64_t inner_dim,
+                         int64_t ff_dim,  // accepted but not used by this constructor
+                         int64_t num_heads,
+                         bool using_relative_attention_bias) {
+        blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
+        blocks["layer_norm"]    = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
+    }
+
+    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,  // Returns {residual output, bias returned by the attention block}.
+                                                  ggml_tensor* x,
+                                                  ggml_tensor* past_bias                = nullptr,
+                                                  ggml_tensor* mask                     = nullptr,
+                                                  ggml_tensor* relative_position_bucket = nullptr) {
+        // x: [N, n_token, model_dim]
+        auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
+        auto layer_norm    = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
+
+        auto normed_hidden_state = layer_norm->forward(ctx, x);  // normalization happens before attention
+        auto ret                 = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
+        auto output              = ret.first;
+        past_bias                = ret.second;  // may be a freshly computed bias or the one passed in
+
+        x = ggml_add_inplace(ctx->ggml_ctx, output, x);  // residual connection with the un-normalized input
+        return {x, past_bias};
+    }
+};
+
+struct T5Block : public GGMLBlock {  // One encoder block: self-attention sub-layer ("layer.0") followed by feed-forward sub-layer ("layer.1").
+public:
+    T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {  // Builds both sub-layers.
+        blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
+        blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
+    }
+
+    std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,  // Returns {block output, bias returned by the attention sub-layer}.
+                                                  ggml_tensor* x,
+                                                  ggml_tensor* past_bias                = nullptr,
+                                                  ggml_tensor* mask                     = nullptr,
+                                                  ggml_tensor* relative_position_bucket = nullptr) {
+        // x: [N, n_token, model_dim]
+        auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
+        auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
+
+        auto ret  = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
+        x         = ret.first;
+        past_bias = ret.second;  // handed back unchanged so the caller can feed it to the next block
+        x         = layer_1->forward(ctx, x);  // the feed-forward sub-layer takes only x
+        return {x, past_bias};
+    }
+};
+
+struct T5Stack : public GGMLBlock {  // Sequence of num_layers T5Block entries followed by a final T5LayerNorm.
+    int64_t num_layers;  // number of "block.<i>" entries registered in the constructor
+
+public:
+    T5Stack(int64_t num_layers,  // Builds "block.0" .. "block.<num_layers-1>" and "final_layer_norm".
+            int64_t model_dim,
+            int64_t inner_dim,
+            int64_t ff_dim,
+            int64_t num_heads,
+            bool relative_attention = true)
+        : num_layers(num_layers) {
+        for (int i = 0; i < num_layers; i++) {
+            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0)));  // only block 0 owns a bias table, unless relative_attention is false, in which case every block does
+        }
+
+        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
+    }
+
+    ggml_tensor* forward(GGMLRunnerContext* ctx,  // Runs every block in order, then the final norm, and returns the result.
+                         ggml_tensor* x,
+                         ggml_tensor* past_bias                = nullptr,
+                         ggml_tensor* attention_mask           = nullptr,
+                         ggml_tensor* relative_position_bucket = nullptr,
+                         const std::string& graph_cut_prefix   = "") {  // empty prefix disables graph-cut marking
+        // x: [N, n_token, model_dim]
+        for (int i = 0; i < num_layers; i++) {
+            auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
+
+            auto ret  = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
+            x         = ret.first;
+            past_bias = ret.second;  // bias returned by this block becomes the next block's past_bias
+            if (!graph_cut_prefix.empty()) {
+                sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".block." + std::to_string(i), "x");  // tags each block's output under "<prefix>.block.<i>"
+            }
+        }
+
+        auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
+
+        x = final_layer_norm->forward(ctx, x);
+        return x;
+    }
+};
+
+// T5 encoder model: a token embedding ("shared") feeding a T5Stack ("encoder").
+struct T5 : public GGMLBlock {
+    T5Config config;
+
+public:
+    T5() {}
+    T5(T5Config config)
+        : config(config) {
+        // model_dim is passed twice: once as model_dim and once as inner_dim.
+        auto encoder_stack   = std::make_shared<T5Stack>(config.num_layers,
+                                                         config.model_dim,
+                                                         config.model_dim,
+                                                         config.ff_dim,
+                                                         config.num_heads,
+                                                         config.relative_attention);
+        auto token_embedding = std::make_shared<Embedding>(config.vocab_size,
+                                                           config.model_dim);
+        blocks["encoder"]    = encoder_stack;
+        blocks["shared"]     = token_embedding;
+    }
+
+    // Embeds input_ids, marks the embedding as the "t5.prelude" graph cut and
+    // runs the encoder stack with graph-cut prefix "t5".
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* input_ids,
+                         ggml_tensor* past_bias                = nullptr,
+                         ggml_tensor* attention_mask           = nullptr,
+                         ggml_tensor* relative_position_bucket = nullptr) {
+        // input_ids: [N, n_token]
+
+        auto token_embedding = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
+        auto encoder_stack   = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
+
+        auto embedded = token_embedding->forward(ctx, input_ids);
+        sd::ggml_graph_cut::mark_graph_cut(embedded, "t5.prelude", "x");
+        return encoder_stack->forward(ctx, embedded, past_bias, attention_mask, relative_position_bucket, "t5");
+    }
+};
+
+// GGML runner for the T5 encoder: detects the config from the weights, builds
+// the compute graph for a set of token ids and executes it.
+struct T5Runner : public GGMLRunner {
+    T5Config config;
+    T5 model;
+    // Host-side data for the relative-position bucket tensor created in build_graph().
+    // NOTE(review): held as a member, presumably so the buffer stays valid until the
+    // backend upload happens - confirm against set_backend_tensor_data().
+    std::vector<int> relative_position_bucket_vec;
+
+    T5Runner(ggml_backend_t backend,
+             const String2TensorStorage& tensor_storage_map,
+             const std::string prefix,
+             bool is_umt5                                        = false,
+             std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+        : GGMLRunner(backend, weight_manager),
+          config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
+        model = T5(config);
+        model.init(params_ctx, tensor_storage_map, prefix);
+    }
+
+    std::string get_desc() override {
+        return "t5";
+    }
+
+    // Collects the model's parameter tensors into `tensors`, keyed under `prefix`.
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        model.get_param_tensors(tensors, prefix);
+    }
+
+    // Adds the encoder to the graph and returns its output tensor.
+    // No past_bias is supplied (nullptr is passed to the model).
+    ggml_tensor* forward(GGMLRunnerContext* ctx,
+                         ggml_tensor* input_ids,
+                         ggml_tensor* relative_position_bucket,
+                         ggml_tensor* attention_mask = nullptr) {
+        return model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
+    }
+
+    // Builds the forward graph for the given token ids. The attention mask is
+    // optional: an empty tensor means "no mask". The relative-position bucket
+    // table is an n_token x n_token I32 tensor, n_token being input_ids->ne[0].
+    ggml_cgraph* build_graph(const sd::Tensor<int32_t>& input_ids_tensor,
+                             const sd::Tensor<float>& attention_mask_tensor = {}) {
+        ggml_cgraph* gf             = ggml_new_graph(compute_ctx);
+        ggml_tensor* input_ids      = make_input(input_ids_tensor);
+        ggml_tensor* attention_mask = attention_mask_tensor.empty() ? nullptr : make_input(attention_mask_tensor);
+
+        relative_position_bucket_vec = compute_relative_position_bucket(static_cast<int>(input_ids->ne[0]), static_cast<int>(input_ids->ne[0]));
+
+        // for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
+        //     if (i % 77 == 0) {
+        //         printf("\n");
+        //     }
+        //     printf("%d ", relative_position_bucket_vec[i]);
+        // }
+
+        auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
+                                                           GGML_TYPE_I32,
+                                                           input_ids->ne[0],
+                                                           input_ids->ne[0]);
+        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
+
+        auto runner_ctx            = get_context();
+        ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
+
+        ggml_build_forward_expand(gf, hidden_states);
+
+        return gf;
+    }
+
+    // Builds and runs the graph, then restores trailing singleton dimensions so
+    // the result has 3 dimensions.
+    sd::Tensor<float> compute(const int n_threads,
+                              const sd::Tensor<int32_t>& input_ids,
+                              const sd::Tensor<float>& attention_mask,
+                              bool auto_free           = true,
+                              bool free_compute_buffer = true,
+                              bool free_compute_params = true) {
+        auto get_graph = [&]() -> ggml_cgraph* {
+            return build_graph(input_ids, attention_mask);
+        };
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params), 3);
+    }
+
+    // Maps each relative position (key index minus query index) to a bucket id.
+    //  - bidirectional: the bucket range is halved and positive offsets are
+    //    shifted into the upper half; the distance is |relative_position|.
+    //  - otherwise: the distance is max(-relative_position, 0).
+    // Distances below max_exact (half of the remaining buckets) get one bucket
+    // each; larger distances are binned logarithmically relative to
+    // max_distance and clamped to the last bucket.
+    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
+                                                      bool bidirectional = true,
+                                                      int num_buckets    = 32,
+                                                      int max_distance   = 128) {
+        if (bidirectional) {
+            num_buckets = num_buckets / 2;
+        }
+        const int max_exact = num_buckets / 2;
+        // Loop-invariant denominator of the logarithmic binning.
+        const float log_base = std::log(static_cast<float>(max_distance) / max_exact);
+
+        std::vector<int> relative_buckets(relative_position.size(), 0);
+        for (size_t i = 0; i < relative_position.size(); ++i) {
+            const int offset = relative_position[i];
+            int bucket       = 0;
+            int distance     = 0;
+            if (bidirectional) {
+                if (offset > 0) {
+                    bucket = num_buckets;
+                }
+                distance = std::abs(offset);
+            } else {
+                distance = std::max(-offset, 0);
+            }
+
+            if (distance < max_exact) {
+                bucket += distance;
+            } else {
+                const float log_pos = std::log(static_cast<float>(distance) / max_exact);
+                const int large     = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
+                bucket += std::min(large, num_buckets - 1);
+            }
+            relative_buckets[i] = bucket;
+        }
+
+        return relative_buckets;
+    }
+
+    // Returns the bidirectional bucket table for a query_length x key_length
+    // attention window, flattened row-major (one row per query position).
+    // Non-positive lengths yield an empty table.
+    std::vector<int> compute_relative_position_bucket(int query_length,
+                                                      int key_length) {
+        std::vector<int> relative_position_bucket;
+        if (query_length <= 0 || key_length <= 0) {
+            return relative_position_bucket;
+        }
+        relative_position_bucket.reserve(static_cast<size_t>(query_length) * static_cast<size_t>(key_length));
+
+        // One row of offsets (key position - query position) is reused for every query.
+        std::vector<int> relative_row(static_cast<size_t>(key_length), 0);
+        for (int query = 0; query < query_length; ++query) {
+            for (int key = 0; key < key_length; ++key) {
+                relative_row[key] = key - query;
+            }
+            const std::vector<int> row_buckets = _relative_position_bucket(relative_row, true);
+            relative_position_bucket.insert(relative_position_bucket.end(), row_buckets.begin(), row_buckets.end());
+        }
+
+        return relative_position_bucket;
+    }
+};
+
+// Pairs a T5 unigram tokenizer with a T5Runner, plus self-test helpers.
+struct T5Embedder {
+    T5UniGramTokenizer tokenizer;
+    T5Runner model;
+
+    // Members are initialized in declaration order (tokenizer, then model); the
+    // initializer list is written in that same order.
+    T5Embedder(ggml_backend_t backend,
+               const String2TensorStorage& tensor_storage_map      = {},
+               const std::string prefix                            = "",
+               bool is_umt5                                        = false,
+               std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+        : tokenizer(is_umt5), model(backend, tensor_storage_map, prefix, is_umt5, weight_manager) {
+    }
+
+    // Collects the runner's parameter tensors into `tensors`, keyed under `prefix`.
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        model.get_param_tensors(tensors, prefix);
+    }
+
+    // Tokenizes a prompt that may carry attention weights.
+    // Returns {token ids, per-token weights, additive attention mask}. After
+    // padding, mask entries > 0 become 0 and all others become -HUGE_VALF.
+    // When `padding` is false, max_length is not used: the tokenizer is called
+    // with 0 and 100000000 instead.
+    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
+                                                                                  size_t max_length = 0,
+                                                                                  bool padding      = false) {
+        auto parsed_attention = parse_prompt_attention(text);
+
+        {
+            std::stringstream ss;
+            ss << "[";
+            for (const auto& item : parsed_attention) {
+                ss << "['" << item.first << "', " << item.second << "], ";
+            }
+            ss << "]";
+            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+        }
+
+        std::vector<int> tokens;
+        std::vector<float> weights;
+        for (const auto& item : parsed_attention) {
+            const std::string& curr_text = item.first;
+            float curr_weight            = item.second;
+            std::vector<int> curr_tokens = tokenizer.encode(curr_text);
+            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+            // Every token of a segment inherits that segment's weight.
+            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+        }
+
+        std::vector<float> attention_mask;
+
+        tokenizer.pad_tokens(tokens, &weights, &attention_mask, padding ? max_length : 0, padding ? max_length : 100000000, padding);
+        for (auto& mask_value : attention_mask) {
+            mask_value = mask_value > 0.0f ? 0.0f : -HUGE_VALF;
+        }
+
+        // for (int i = 0; i < tokens.size(); i++) {
+        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
+        // }
+        // std::cout << std::endl;
+
+        // Move the locals into the result instead of copying three vectors.
+        return {std::move(tokens), std::move(weights), std::move(attention_mask)};
+    }
+
+    // Smoke test: encodes a fixed prompt padded to 512 tokens, prints the token
+    // ids and the resulting tensor, and logs the elapsed time.
+    void test() {
+        ggml_init_params params;
+        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+        params.mem_buffer = nullptr;
+        params.no_alloc   = false;
+
+        // NOTE(review): ctx is not used by the code below; it is kept in case
+        // ggml_init() is relied upon for global setup (TODO confirm) and is
+        // released before returning so the context no longer leaks.
+        ggml_context* ctx = ggml_init(params);
+        GGML_ASSERT(ctx != nullptr);
+
+        {
+            std::string text("a lovely cat");
+            auto tokens_and_weights   = tokenize(text, 512, true);
+            std::vector<int>& tokens  = std::get<0>(tokens_and_weights);
+            std::vector<float>& masks = std::get<2>(tokens_and_weights);
+            for (auto token : tokens) {
+                printf("%d ", token);
+            }
+            printf("\n");
+            auto input_ids      = sd::Tensor<int32_t>::from_vector(tokens);
+            auto attention_mask = sd::Tensor<float>::from_vector(masks);
+            sd::Tensor<float> out;
+
+            int64_t t0   = ggml_time_ms();
+            auto out_opt = model.compute(8, input_ids, attention_mask);
+            int64_t t1   = ggml_time_ms();
+
+            GGML_ASSERT(!out_opt.empty());
+            out = std::move(out_opt);
+            print_sd_tensor(out);
+            // int64_t is not necessarily long long; cast to match %lld.
+            LOG_DEBUG("t5 test done in %lldms", static_cast<long long>(t1 - t0));
+        }
+
+        ggml_free(ctx);
+    }
+
+    // Loads a model file on the CPU backend, forces every tensor whose name ends
+    // in "weight" to F16, registers the tensors with a ModelManager and runs test().
+    static void load_from_file_and_test(const std::string& file_path) {
+        // cpu f16: pass
+        // cpu f32: pass
+        // cuda f16: pass
+        // cuda f32: pass
+        // cuda q8_0: pass
+        // ggml_backend_t backend = ggml_backend_cuda_init(0);
+        ggml_backend_t backend    = sd_backend_cpu_init();
+        ggml_type model_data_type = GGML_TYPE_F16;
+
+        auto model_manager        = std::make_shared<ModelManager>();
+        ModelLoader& model_loader = model_manager->loader();
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
+            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+            return;
+        }
+
+        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            if (ends_with(name, "weight")) {
+                tensor_storage.expected_type = model_data_type;
+            }
+        }
+
+        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, tensor_storage_map, "", true, model_manager);
+
+        if (!model_manager->register_runner_params("T5 test",
+                                                   *t5,
+                                                   "",
+                                                   ModelManager::ResidencyMode::ParamBackend,
+                                                   backend,
+                                                   backend) ||
+            !model_manager->validate_registered_tensors()) {
+            LOG_ERROR("register t5 tensors with model manager failed");
+            return;
+        }
+
+        LOG_INFO("t5 model loaded");
+        t5->test();
+    }
+};
+
+#endif  // __SD_MODEL_TE_T5_HPP__
diff --git a/src/model_loader.h b/src/model_loader.h
index 4dc700f20..529f3e890 100644
--- a/src/model_loader.h
+++ b/src/model_loader.h
@@ -27,6 +27,8 @@ struct MmapTensorStore {
     std::shared_ptr<struct ggml_backend_buffer> mmbuffer;
 };
 
+bool is_unused_tensor(const std::string& name);
+
 class ModelLoader {
 protected:
     SDVersion version_ = VERSION_COUNT;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 836b0f85b..13a5e14ed 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -12,6 +12,8 @@
 #include "core/rng_mt19937.hpp"
 #include "core/rng_philox.hpp"
 #include "core/util.h"
+
+#include "backend_fit.hpp"
 #include "model_loader.h"
 #include "model_manager.h"
 #include "stable-diffusion.h"
@@ -194,6 +196,27 @@ class StableDiffusionGGML {
     std::string backend_spec;
     std::string params_backend_spec;
 
+    // DiT multi-GPU split decision captured from the auto-fit plan and applied
+    // to the diffusion runner(s) before param load. OFF when the DiT is not
+    // split. device_ids[0] is the "main" GPU (largest); share_bytes is the
+    // per-device VRAM share (same order as device_ids).
+    backend_fit::MultiGpuMode fit_dit_split_mode = backend_fit::MultiGpuMode::OFF;
+    std::vector<std::string>  fit_dit_split_device_names;  // ggml device names, [0] = main
+    std::vector<int64_t>      fit_dit_split_share_bytes;
+    // Conditioner (LLM) split decision — always layer-split when it splits
+    // (only the DiT ever row-splits; see backend_fit::supports_tensor_split).
+    backend_fit::MultiGpuMode fit_cond_split_mode = backend_fit::MultiGpuMode::OFF;
+    std::vector<std::string>  fit_cond_split_device_names;
+    std::vector<int64_t>      fit_cond_split_share_bytes;
+
+    // Auto-fit decided the components can't all be resident at once (the
+    // per-component MAX plan only fits if they time-share), so defer the heavy
+    // components' param alloc+load to their compute phase and free after.
+    bool auto_lazy_load = false;
+    // auto-fit is on: when a VAE decode OOMs we may auto-enable tiling and retry
+    // (temporal for LTX video, spatial otherwise) instead of failing.
+    bool auto_fit_enabled = false;
+
     bool is_using_v_parameterization     = false;
     bool is_using_edm_v_parameterization = false;
 
@@ -233,6 +256,12 @@ class StableDiffusionGGML {
         return params_backend_for(module) != nullptr;
     }
 
+    // NOTE(review): this comment appears to describe init_backend(), not the
+    // template that follows it. init_backend() initializes the backend manager
+    // from backend_spec / params_backend_spec. These hold the user's --backend /
+    // --params-backend by default, but when auto-fit is enabled they are
+    // overwritten with the computed plan before init_backend() runs. The
+    // keep_*_on_cpu shortcuts were replaced by the spec mechanism (e.g.
+    // "vae=cpu"), so they are always false there.
+
     template <typename T>
     bool register_runner_params(const std::string& desc,
                                 const std::shared_ptr<T>& model,
@@ -265,6 +294,338 @@ class StableDiffusionGGML {
         return ensure_backend_pair(SDBackendModule::DIFFUSION);
     }
 
+    // Extract the transformer block index embedded in a weight name.
+    // The keywords are tried in the order listed; for each one only its first
+    // occurrence in `name` is examined, and the run of decimal digits that
+    // immediately follows it (if any) is returned as the index.
+    // Returns -1 when no keyword is directly followed by digits.
+    static int dit_block_index_of(const std::string& name) {
+        static const char* block_keywords[] = {"transformer_blocks.", "joint_blocks.", "double_blocks.",
+                                               "single_blocks.", "blocks.", "layers."};
+        for (const char* keyword : block_keywords) {
+            const size_t hit = name.find(keyword);
+            if (hit != std::string::npos) {
+                const size_t digits_begin = hit + strlen(keyword);
+                size_t digits_end         = name.find_first_not_of("0123456789", digits_begin);
+                if (digits_end == std::string::npos) {
+                    digits_end = name.size();  // digits (if any) run to the end of the name
+                }
+                if (digits_end > digits_begin) {
+                    return atoi(name.substr(digits_begin, digits_end - digits_begin).c_str());
+                }
+            }
+        }
+        return -1;
+    }
+
+    // Build a MultiBackendSpec from the auto-fit DiT split decision and apply it
+    // to a diffusion runner BEFORE its params are allocated. No-op when the DiT
+    // is not split. Always returns true (any failure falls back to single-GPU,
+    // i.e. the runner is left without a multi-backend spec).
+    //
+    //  - ROW:   one main backend; per-device ratios are derived from
+    //           fit_dit_split_share_bytes and indexed by registry device index.
+    //  - LAYER: transformer blocks are partitioned into contiguous ranges, one
+    //           per device, sized in proportion to the per-device shares.
+    //
+    // fit_dit_split_device_names and fit_dit_split_share_bytes are parallel
+    // arrays; a length mismatch is treated as a failure (single GPU), because
+    // the ROW path indexes the shares by device position.
+    bool apply_dit_multi_gpu_split(const std::shared_ptr<DiffusionModelRunner>& runner,
+                                   ModelLoader& model_loader) {
+        if (!runner || fit_dit_split_mode == backend_fit::MultiGpuMode::OFF ||
+            fit_dit_split_device_names.size() < 2) {
+            return true;
+        }
+        const auto& devnames = fit_dit_split_device_names;
+        const auto& shares   = fit_dit_split_share_bytes;
+        if (shares.size() != devnames.size()) {
+            LOG_WARN("DiT split: %zu devices but %zu shares; using single GPU", devnames.size(), shares.size());
+            return true;
+        }
+        ggml_backend_t main_backend = runner->get_runtime_backend();
+        MultiBackendSpec spec;
+
+        if (fit_dit_split_mode == backend_fit::MultiGpuMode::ROW) {
+            // ROW: one main backend; matmul rows are split across the devices by
+            // the stock split buft. sched still needs the extra backends so it
+            // can route the cross-device reductions.
+            // The registry name is taken to be everything before the first digit
+            // of the device name.
+            auto reg_prefix_of = [](const std::string& n) -> std::string {
+                size_t i = 0;
+                while (i < n.size() && !(n[i] >= '0' && n[i] <= '9')) {
+                    i++;
+                }
+                return n.substr(0, i);
+            };
+            std::string        reg_name = reg_prefix_of(devnames[0]);
+            ggml_backend_reg_t reg      = ggml_backend_reg_by_name(reg_name.c_str());
+            if (reg == nullptr) {
+                LOG_WARN("row-split: backend registry '%s' not found; using single GPU", reg_name.c_str());
+                return true;
+            }
+            int dev_count = (int)ggml_backend_reg_dev_count(reg);
+            if (dev_count <= 0) {
+                return true;
+            }
+            // Device index within the registry: the number that follows the
+            // registry name, or -1 if the name has a different prefix or no
+            // parsable number.
+            auto reg_index_of = [&](const std::string& n) -> int {
+                if (n.rfind(reg_name, 0) != 0) {
+                    return -1;
+                }
+                try {
+                    return std::stoi(n.substr(reg_name.size()));
+                } catch (...) {
+                    return -1;
+                }
+            };
+            int64_t total = 0;
+            for (auto b : shares) {
+                total += b;
+            }
+            if (total <= 0) {
+                return true;
+            }
+            // One ratio slot per registry device; devices that are not part of
+            // the split keep a ratio of 0.
+            std::vector<float> ratios(dev_count, 0.f);
+            for (size_t k = 0; k < devnames.size(); k++) {
+                int idx = reg_index_of(devnames[k]);
+                if (idx < 0 || idx >= dev_count) {
+                    continue;
+                }
+                ratios[idx] = float(double(shares[k]) / double(total));
+            }
+            // The main device must be the runner's runtime backend, which the
+            // planner set to devnames[0] (the largest-VRAM GPU, listed first).
+            // Keeping these aligned ensures the split buft's non-split portion
+            // and the runner's compute buffer live on the same device.
+            int main_dev = reg_index_of(devnames[0]);
+            if (main_dev < 0 || main_dev >= dev_count) {
+                return true;
+            }
+            for (size_t k = 0; k < devnames.size(); k++) {
+                int idx = reg_index_of(devnames[k]);
+                if (idx == main_dev || idx < 0) {
+                    continue;
+                }
+                ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+                if (b != nullptr) {
+                    spec.additional_backends.push_back(b);
+                } else {
+                    // NOTE(review): the device keeps its non-zero ratio even
+                    // though its backend is missing - confirm this is intended.
+                    LOG_WARN("row-split: failed to init backend %s", devnames[k].c_str());
+                }
+            }
+            spec.mode                = MultiBackendMode::ROW_SPLIT;
+            spec.tensor_split_ratios = ratios;
+            spec.main_device         = main_dev;
+            LOG_INFO("DiT row-split across %zu devices (main reg-index %d)", devnames.size(), main_dev);
+        } else {
+            // LAYER: assign contiguous block ranges to per-device backends.
+            std::vector<ggml_backend_t> all_backends;
+            all_backends.push_back(main_backend);
+            for (size_t k = 1; k < devnames.size(); k++) {
+                ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+                if (b == nullptr) {
+                    LOG_WARN("layer-split: failed to init backend %s; using single GPU", devnames[k].c_str());
+                    return true;
+                }
+                spec.additional_backends.push_back(b);
+                all_backends.push_back(b);
+            }
+            // Tally weight bytes per block index and the bytes of weights that
+            // belong to no block, over the tensors under tensor_prefix.
+            const std::string tensor_prefix = "model.diffusion_model.";
+            std::map<int, int64_t> block_bytes;
+            int64_t                non_block_bytes = 0;
+            int                    max_block_idx   = -1;
+            for (const auto& kv : model_loader.get_tensor_storage_map()) {
+                if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                    continue;
+                }
+                int64_t bytes = (int64_t)kv.second.nbytes();
+                int     idx   = dit_block_index_of(kv.first);
+                if (idx >= 0) {
+                    block_bytes[idx] += bytes;
+                    if (idx > max_block_idx) {
+                        max_block_idx = idx;
+                    }
+                } else {
+                    non_block_bytes += bytes;
+                }
+            }
+            if (max_block_idx < 0) {
+                LOG_WARN("layer-split: no transformer blocks found; using single GPU");
+                return true;
+            }
+            const int n_blocks    = max_block_idx + 1;
+            int64_t   total_share = 0, total_block = 0;
+            for (auto s : shares) {
+                total_share += s;
+            }
+            for (const auto& kv : block_bytes) {
+                total_block += kv.second;
+            }
+            if (total_share <= 0) {
+                return true;
+            }
+            // Per-device byte budget: the device's share of all weight bytes.
+            std::vector<int64_t> budgets(shares.size(), 0);
+            for (size_t k = 0; k < shares.size(); k++) {
+                int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share));
+                if (k == 0) {
+                    b = std::max<int64_t>(b - non_block_bytes, 0);  // backend 0 also holds non-block weights
+                }
+                budgets[k] = b;
+            }
+            // boundaries[k] is the exclusive end block of device k. Blocks are
+            // packed greedily: move to the next device when the current one
+            // already holds something and the next block would exceed its
+            // budget. The last device takes whatever remains.
+            std::vector<int> boundaries(shares.size(), 0);
+            size_t           cur     = 0;
+            int64_t          cur_use = 0;
+            for (int b = 0; b < n_blocks; b++) {
+                int64_t bb = block_bytes[b];
+                if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) {
+                    boundaries[cur] = b;
+                    cur++;
+                    cur_use = 0;
+                }
+                cur_use += bb;
+            }
+            for (size_t k = cur; k < boundaries.size(); k++) {
+                boundaries[k] = n_blocks;
+            }
+            // Raise any boundary that is not past its predecessor, capped at n_blocks.
+            for (size_t k = 0; k < boundaries.size(); k++) {
+                int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1;
+                if (boundaries[k] < min_bound) {
+                    boundaries[k] = std::min(min_bound, n_blocks);
+                }
+            }
+            // Map each param tensor pointer to its backend (block range -> device).
+            // Tensors without the prefix or without a block index stay on backend 0.
+            auto ptr_backend = std::make_shared<std::map<ggml_tensor*, ggml_backend_t>>();
+            std::map<std::string, ggml_tensor*> dit_map;
+            runner->get_param_tensors(dit_map);
+            for (const auto& kv : dit_map) {
+                ggml_backend_t target = all_backends[0];
+                if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) == 0) {
+                    int idx = dit_block_index_of(kv.first);
+                    if (idx >= 0) {
+                        for (size_t k = 0; k < boundaries.size(); k++) {
+                            if (idx < boundaries[k]) {
+                                target = all_backends[std::min(k, all_backends.size() - 1)];
+                                break;
+                            }
+                        }
+                    }
+                }
+                (*ptr_backend)[kv.second] = target;
+            }
+            spec.mode              = MultiBackendMode::LAYER_SPLIT;
+            // The lookup table is shared with the closure so it outlives this call;
+            // unknown tensors resolve to the main backend.
+            spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t {
+                auto it = ptr_backend->find(t);
+                return it != ptr_backend->end() ? it->second : main_backend;
+            };
+            LOG_INFO("DiT layer-split: %d blocks across %zu devices", n_blocks, all_backends.size());
+        }
+
+        runner->set_multi_backend_spec(spec);
+        return true;
+    }
+
+    // Conditioner (LLM) layer-split: same block-partition approach as the DiT
+    // layer-split, but applied to the conditioner's LLM sub-runner (tensors
+    // under "text_encoders.llm."). LAYER only — the conditioner never row-splits
+    // (only the DiT does, preserving the single-row-component invariant). The
+    // conditioner's small projector stays on the main backend.
+    //
+    // No-op when `cond` is null, the conditioner split mode is OFF, or fewer
+    // than two devices are listed. Always returns true; on any failure no spec
+    // is applied to the conditioner.
+    // NOTE(review): assumes fit_cond_split_share_bytes has one entry per device
+    // name; the partition is sized by the shares, the backends by the names.
+    bool apply_cond_multi_gpu_split(const std::shared_ptr<Conditioner>& cond, ModelLoader& model_loader) {
+        if (!cond || fit_cond_split_mode == backend_fit::MultiGpuMode::OFF ||
+            fit_cond_split_device_names.size() < 2) {
+            return true;
+        }
+        ggml_backend_t main_backend = backend_for(SDBackendModule::TE);
+        if (main_backend == nullptr) {
+            return true;
+        }
+        const auto& devnames = fit_cond_split_device_names;
+        const auto& shares   = fit_cond_split_share_bytes;
+        // all_backends[0] is the main backend; devnames[1..] supply the others.
+        std::vector<ggml_backend_t> all_backends;
+        all_backends.push_back(main_backend);
+        MultiBackendSpec spec;
+        for (size_t k = 1; k < devnames.size(); k++) {
+            ggml_backend_t b = backend_manager.ensure_backend(devnames[k]);
+            if (b == nullptr) {
+                LOG_WARN("cond layer-split: failed to init backend %s; using single GPU", devnames[k].c_str());
+                return true;
+            }
+            spec.additional_backends.push_back(b);
+            all_backends.push_back(b);
+        }
+        // Tally weight bytes per block index and the bytes of weights that
+        // belong to no block, over the tensors under tensor_prefix.
+        const std::string tensor_prefix = "text_encoders.llm.";
+        std::map<int, int64_t> block_bytes;
+        int64_t                non_block_bytes = 0;
+        int                    max_block_idx   = -1;
+        for (const auto& kv : model_loader.get_tensor_storage_map()) {
+            if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                continue;
+            }
+            int64_t bytes = (int64_t)kv.second.nbytes();
+            int     idx   = dit_block_index_of(kv.first);
+            if (idx >= 0) {
+                block_bytes[idx] += bytes;
+                if (idx > max_block_idx) {
+                    max_block_idx = idx;
+                }
+            } else {
+                non_block_bytes += bytes;
+            }
+        }
+        if (max_block_idx < 0) {
+            LOG_WARN("cond layer-split: no transformer blocks under '%s'; using single GPU", tensor_prefix.c_str());
+            return true;
+        }
+        const int n_blocks    = max_block_idx + 1;
+        int64_t   total_share = 0, total_block = 0;
+        for (auto s : shares) {
+            total_share += s;
+        }
+        for (const auto& kv : block_bytes) {
+            total_block += kv.second;
+        }
+        if (total_share <= 0) {
+            return true;
+        }
+        // Per-device byte budget: the device's share of all weight bytes.
+        // Device 0 also carries the non-block weights, so they are deducted
+        // from its block budget (never below zero).
+        std::vector<int64_t> budgets(shares.size(), 0);
+        for (size_t k = 0; k < shares.size(); k++) {
+            int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share));
+            if (k == 0) {
+                b = std::max<int64_t>(b - non_block_bytes, 0);
+            }
+            budgets[k] = b;
+        }
+        // boundaries[k] is the exclusive end block of device k. Blocks are
+        // packed greedily: move to the next device when the current one already
+        // holds something and the next block would exceed its budget. The last
+        // device takes whatever remains.
+        std::vector<int> boundaries(shares.size(), 0);
+        size_t           cur     = 0;
+        int64_t          cur_use = 0;
+        for (int b = 0; b < n_blocks; b++) {
+            int64_t bb = block_bytes[b];
+            if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) {
+                boundaries[cur] = b;
+                cur++;
+                cur_use = 0;
+            }
+            cur_use += bb;
+        }
+        // The device reached last, and any device after it, ends at n_blocks.
+        for (size_t k = cur; k < boundaries.size(); k++) {
+            boundaries[k] = n_blocks;
+        }
+        // Raise any boundary that is not past its predecessor, capped at n_blocks.
+        for (size_t k = 0; k < boundaries.size(); k++) {
+            int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1;
+            if (boundaries[k] < min_bound) {
+                boundaries[k] = std::min(min_bound, n_blocks);
+            }
+        }
+        // Map each LLM param tensor pointer to its backend (block range -> device).
+        // LLM tensors without a block index stay on the main backend.
+        auto ptr_backend = std::make_shared<std::map<ggml_tensor*, ggml_backend_t>>();
+        std::map<std::string, ggml_tensor*> cond_map;
+        cond->get_param_tensors(cond_map);
+        for (const auto& kv : cond_map) {
+            if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) {
+                continue;  // only the LLM tensors are split; projector stays on main
+            }
+            ggml_backend_t target = all_backends[0];
+            int            idx    = dit_block_index_of(kv.first);
+            if (idx >= 0) {
+                for (size_t k = 0; k < boundaries.size(); k++) {
+                    if (idx < boundaries[k]) {
+                        target = all_backends[std::min(k, all_backends.size() - 1)];
+                        break;
+                    }
+                }
+            }
+            (*ptr_backend)[kv.second] = target;
+        }
+        spec.mode              = MultiBackendMode::LAYER_SPLIT;
+        // The lookup table is shared with the closure so it outlives this call;
+        // tensors absent from the table resolve to the main backend.
+        spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t {
+            auto it = ptr_backend->find(t);
+            return it != ptr_backend->end() ? it->second : main_backend;
+        };
+        cond->set_multi_backend_spec(spec);
+        LOG_INFO("Conditioner LLM layer-split: %d blocks across %zu devices", n_blocks, all_backends.size());
+        return true;
+    }
+
     std::shared_ptr<RNG> get_rng(rng_type_t rng_type) {
         if (rng_type == STD_DEFAULT_RNG) {
             return std::make_shared<STDDefaultRNG>();
@@ -347,21 +708,10 @@ class StableDiffusionGGML {
 
         ggml_log_set(ggml_log_callback_default, nullptr);
 
-        if (!init_backend()) {
-            return false;
-        }
-        {
-            std::string error;
-            if (!max_vram_assignment.canonicalize_backend_keys(&error)) {
-                LOG_ERROR("%s", error.c_str());
-                return false;
-            }
-        }
-        if (stream_layers && !backend_manager.params_backend_is_cpu(SDBackendModule::DIFFUSION)) {
-            LOG_WARN("--stream-layers has no effect unless diffusion params backend is cpu; ignoring");
-            stream_layers = false;
-        }
-
+        // Backend initialization is deferred until after the model metadata is
+        // loaded, so auto-fit can size the components and choose device
+        // placements before the backends are created (see the auto-fit block
+        // below, which feeds its plan into init_backend()).
         model_manager = std::make_shared<ModelManager>();
         model_manager->set_n_threads(n_threads);
         model_manager->set_enable_mmap(enable_mmap);
@@ -523,6 +873,185 @@ class StableDiffusionGGML {
             return oss.str();
         };
 
+        auto_fit_enabled = sd_ctx_params->auto_fit;
+        if (sd_ctx_params->auto_fit) {
+            if (!backend_spec.empty() || !params_backend_spec.empty()) {
+                LOG_WARN("auto-fit is enabled; ignoring --backend / --params-backend "
+                         "(pass --no-auto-fit to set device placement manually)");
+            }
+
+            backend_fit::ComputeReserves reserves;
+            // Parse the per-component reserve map ("dit=2048,vae=1024,cond=512").
+            // Missing or rejected keys keep the built-in defaults.
+            if (sd_ctx_params->auto_fit_compute_reserve != nullptr) {
+                std::string spec(sd_ctx_params->auto_fit_compute_reserve);
+                size_t      pos = 0;
+                while (pos < spec.size()) {
+                    size_t      comma = spec.find(',', pos);
+                    std::string entry = spec.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+                    pos               = comma == std::string::npos ? spec.size() : comma + 1;
+                    size_t eq         = entry.find('=');
+                    if (eq == std::string::npos) {
+                        LOG_WARN("auto-fit: ignoring malformed compute-reserve entry '%s' (expected component=MiB)", entry.c_str());
+                        continue;
+                    }
+                    std::string key = entry.substr(0, eq);
+                    int64_t     mib = std::atoll(entry.c_str() + eq + 1);  // 0 when the value is not numeric
+                    if (mib <= 0 || mib > INT64_MAX / int64_t(backend_fit::MiB)) {  // upper bound keeps mib * MiB from overflowing
+                        LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (value must be a positive, in-range MiB count)", entry.c_str());
+                        continue;
+                    }
+                    backend_fit::ComponentKind kind;
+                    if (key == "dit" || key == "diffusion" || key == "model" || key == "unet") {
+                        kind = backend_fit::ComponentKind::DIT;
+                    } else if (key == "vae") {
+                        kind = backend_fit::ComponentKind::VAE;
+                    } else if (key == "cond" || key == "conditioner" || key == "te" || key == "clip") {
+                        kind = backend_fit::ComponentKind::CONDITIONER;
+                    } else {
+                        LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (unknown component, expected dit/vae/cond)", entry.c_str());
+                        continue;
+                    }
+                    switch (kind) {
+                        case backend_fit::ComponentKind::DIT:
+                            reserves.dit_bytes = mib * backend_fit::MiB;
+                            break;
+                        case backend_fit::ComponentKind::VAE:
+                            reserves.vae_bytes = mib * backend_fit::MiB;
+                            break;
+                        case backend_fit::ComponentKind::CONDITIONER:
+                            reserves.conditioner_bytes = mib * backend_fit::MiB;
+                            break;
+                    }
+                }
+            }
+            auto components = backend_fit::estimate_components(
+                model_loader, wtype, /*alignment=*/64, reserves);
+            auto    devices = backend_fit::enumerate_gpu_devices();
+            int64_t margin_bytes =
+                int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB;
+            backend_fit::MultiGpuMode multi_gpu_mode =
+                backend_fit::str_to_multi_gpu_mode(SAFE_STR(sd_ctx_params->multi_gpu_mode));
+            auto plan = backend_fit::compute_plan(
+                components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu, multi_gpu_mode);
+            backend_fit::print_plan(plan, components, devices, margin_bytes);
+
+            if (sd_ctx_params->auto_fit_dry_run) {
+                LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models");
+                return false;
+            }
+
+            // Translate the plan into the backend-assignment specs consumed by
+            // SDBackendManager. A non-split component lives entirely on one device:
+            //   GPU                -> runtime=<dev>             (params follow runtime)
+            //   GPU_OFFLOAD_PARAMS -> runtime=<dev>, params=cpu (params streamed from RAM)
+            //   CPU                -> runtime=cpu               (params follow runtime)
+            // Modules the planner doesn't cover (clip_vision, control_net,
+            // photomaker, upscaler) fall back to the default backend.
+            std::string runtime_spec;
+            std::string params_spec;
+            // Appends one "key=value" entry to a comma-separated assignment
+            // list, emitting the separator only between entries.
+            auto append_assignment = [](std::string& spec, const char* key, const std::string& value) {
+                if (!spec.empty()) {
+                    spec.push_back(',');
+                }
+                spec.append(key).append("=").append(value);
+            };
+            // Maps a planner device id to its device name; yields "" when no
+            // enumerated device carries that id.
+            auto dev_name_by_id = [&](int id) -> std::string {
+                auto hit = std::find_if(devices.begin(), devices.end(), [id](const auto& candidate) {
+                    return candidate.id == id;
+                });
+                return hit == devices.end() ? std::string() : std::string(hit->name);
+            };
+            auto apply_decision = [&](const backend_fit::Decision* d, const char* module_key) {  // turn one planner decision into spec entries for module_key
+                if (d == nullptr) {  // no decision for this component: leave both specs untouched
+                    return;
+                }
+                if (d->placement == backend_fit::Placement::CPU) {
+                    append_assignment(runtime_spec, module_key, "cpu");
+                    return;
+                }
+                // Multi-GPU split (DiT or conditioner): the runner's main backend
+                // is the largest participating GPU (split_device_ids[0]); the actual
+                // per-tensor distribution is applied later via a MultiBackendSpec
+                // (see prepare_*_split_spec). Record the decision for that step.
+                if (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT ||
+                    d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) {
+                    std::string main_dev = d->split_device_ids.empty() ? "" : dev_name_by_id(d->split_device_ids[0]);
+                    if (main_dev.empty()) {
+                        return;  // fall back to default backend
+                    }
+                    append_assignment(runtime_spec, module_key, main_dev);
+                    backend_fit::MultiGpuMode m = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT)
+                                                      ? backend_fit::MultiGpuMode::ROW
+                                                      : backend_fit::MultiGpuMode::LAYER;
+                    std::vector<std::string> names;
+                    for (int id : d->split_device_ids) {
+                        names.push_back(dev_name_by_id(id));  // NOTE(review): pushes "" for an id with no matching device
+                    }
+                    if (std::string(module_key) == "diffusion") {
+                        fit_dit_split_mode         = m;
+                        fit_dit_split_device_names = names;
+                        fit_dit_split_share_bytes  = d->split_share_bytes;
+                    } else if (std::string(module_key) == "te") {
+                        fit_cond_split_mode         = m;
+                        fit_cond_split_device_names = names;
+                        fit_cond_split_share_bytes  = d->split_share_bytes;
+                    }  // any other module keeps only the main-device runtime assignment
+                    return;  // split placements add no params-backend entry
+                }
+                std::string dev_name = dev_name_by_id(d->device_id);
+                if (dev_name.empty()) {
+                    return;  // no matching device; fall back to the default backend
+                }
+                append_assignment(runtime_spec, module_key, dev_name);
+                if (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) {
+                    append_assignment(params_spec, module_key, "cpu");
+                }
+            };
+            apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), "diffusion");
+            apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), "te");
+            apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), "vae");
+
+            backend_spec        = runtime_spec;
+            params_backend_spec = params_spec;
+            LOG_INFO("auto-fit: backend spec '%s', params backend spec '%s'",
+                     backend_spec.empty() ? "(default)" : backend_spec.c_str(),
+                     params_backend_spec.empty() ? "(none)" : params_backend_spec.c_str());
+
+            // When a component is split across GPUs the working set is tight:
+            // the split component (and the others sharing those GPUs) cannot all
+            // be resident at once. Enable lazy-load so the DiT / conditioner /
+            // VAE defer their param alloc+load to their compute phase and free
+            // after, time-sharing VRAM (the per-component MAX plan assumes this).
+            if (fit_dit_split_mode != backend_fit::MultiGpuMode::OFF ||
+                fit_cond_split_mode != backend_fit::MultiGpuMode::OFF) {
+                auto_lazy_load = true;
+                LOG_INFO("auto-fit: enabling lazy-load (components time-share VRAM across phases)");
+            }
+        }
+
+        // Create the backends now that the placement (manual or auto-fit) is
+        // settled, then canonicalize graph-cut VRAM budget assignments against
+        // the initialized backend registry.
+        if (!init_backend()) {
+            return false;
+        }
+        {
+            std::string error;
+            if (!max_vram_assignment.canonicalize_backend_keys(&error)) {
+                LOG_ERROR("%s", error.c_str());
+                return false;
+            }
+        }
+        if (stream_layers && !backend_manager.params_backend_is_cpu(SDBackendModule::DIFFUSION)) {
+            LOG_WARN("--stream-layers has no effect unless diffusion params backend is cpu; ignoring");
+            stream_layers = false;
+        }
+
         LOG_INFO("Weight type stat:                 %s", wtype_stat_to_str(wtype_stat).c_str());
         LOG_INFO("Conditioner weight type stat:     %s", wtype_stat_to_str(conditioner_wtype_stat).c_str());
         LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str());
@@ -810,8 +1339,19 @@ class StableDiffusionGGML {
                 return false;
             }
 
+            // When the DiT is split across GPUs its params live resident in the
+            // (per-device) split buffers, so it must not be mmap'd and must not
+            // use the RAM-streaming path (mutually exclusive with split).
+            const bool dit_split = fit_dit_split_mode != backend_fit::MultiGpuMode::OFF &&
+                                   fit_dit_split_device_names.size() >= 2;
+            if (dit_split && stream_layers) {
+                LOG_WARN("--stream-layers is ignored for the diffusion model when it is "
+                         "split across GPUs (--multi-gpu-mode=%s)",
+                         backend_fit::multi_gpu_mode_str(fit_dit_split_mode));
+            }
+
             diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes_for_module(SDBackendModule::DIFFUSION));
-            diffusion_model->set_stream_layers_enabled(stream_layers);
+            diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers);
             if (!register_runner_params("Diffusion model",
                                         diffusion_model,
                                         SDBackendModule::DIFFUSION,
@@ -821,7 +1361,7 @@ class StableDiffusionGGML {
 
             if (high_noise_diffusion_model) {
                 high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes_for_module(SDBackendModule::DIFFUSION));
-                high_noise_diffusion_model->set_stream_layers_enabled(stream_layers);
+                high_noise_diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers);
                 if (!register_runner_params("High noise diffusion model",
                                             high_noise_diffusion_model,
                                             SDBackendModule::DIFFUSION,
@@ -1099,6 +1639,59 @@ class StableDiffusionGGML {
             ignore_tensors.insert("model.visual.deepstack_merger_list.");
         }
 
+        // --- Multi-GPU split + lazy-load (auto-fit) ------------------------
+        // Apply split specs before any params are prepared. Split runners use
+        // runner-owned buffers, so their weight manager is disabled and their
+        // tensors are loaded directly by a lazy callback at first compute.
+        apply_dit_multi_gpu_split(diffusion_model, model_loader);
+        apply_dit_multi_gpu_split(high_noise_diffusion_model, model_loader);
+        apply_cond_multi_gpu_split(cond_stage_model, model_loader);
+
+        if (auto_lazy_load) {
+            const bool   lazy_mmap  = sd_ctx_params->enable_mmap;
+            ModelLoader* loader_ptr = &model_loader;  // NOTE(review): captured by the lazy callback below — confirm model_loader outlives it
+            auto make_lazy = [&](auto&& component,
+                                 const std::function<void(std::map<std::string, ggml_tensor*>&)>& collect,
+                                 const std::string& only_prefix) {
+                if (!component) {
+                    return;
+                }
+                std::map<std::string, ggml_tensor*> all;
+                collect(all);
+                auto sub = std::make_shared<std::map<std::string, ggml_tensor*>>();
+                for (const auto& kv : all) {
+                    if (!only_prefix.empty() &&
+                        kv.first.compare(0, only_prefix.size(), only_prefix) != 0) {
+                        continue;  // name does not start with only_prefix
+                    }
+                    (*sub)[kv.first] = kv.second;
+                }
+                if (sub->empty()) {  // nothing selected: leave the component untouched
+                    return;
+                }
+                component->set_weight_manager(nullptr);
+                component->set_lazy_load([loader_ptr, sub, lazy_mmap]() -> bool {
+                    auto local = *sub;  // copy of the name->tensor map for this call
+                    return loader_ptr->load_tensors(local, {}, lazy_mmap);
+                });
+                LOG_INFO("auto-fit: deferring %zu split tensors to first compute (lazy-load)", sub->size());
+            };
+            if (fit_dit_split_mode != backend_fit::MultiGpuMode::OFF) {
+                make_lazy(diffusion_model,
+                          [&](std::map<std::string, ggml_tensor*>& m) { diffusion_model->get_param_tensors(m); },
+                          "");  // empty prefix selects every param tensor
+                make_lazy(high_noise_diffusion_model,
+                          [&](std::map<std::string, ggml_tensor*>& m) { high_noise_diffusion_model->get_param_tensors(m); },
+                          "");
+            }
+            if (fit_cond_split_mode != backend_fit::MultiGpuMode::OFF) {
+                make_lazy(cond_stage_model,
+                          [&](std::map<std::string, ggml_tensor*>& m) { cond_stage_model->get_param_tensors(m); },
+                          "text_encoders.llm.");  // only tensors under this prefix are deferred
+            }
+        }
+        // ------------------------------------------------------------------
+
         model_manager->set_common_ignore_tensors(ignore_tensors);
         if (!model_manager->validate_registered_tensors()) {
             LOG_ERROR("model metadata validation failed");
@@ -2294,7 +2887,35 @@ class StableDiffusionGGML {
         }
         auto latents = first_stage_model->diffusion_to_vae_latents(x);
         first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
-        return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
+        auto decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
+        // Auto-fit tiling fallback: a full-frame video decode can need ~10 GB of
+        // compute buffer and OOM (a graceful failure -> empty result, not an
+        // abort). Under auto-fit, enable tiling and retry once instead of failing.
+        // Temporal tiling is LTX-only (its 3D VAE supports temporal_tile_frames);
+        // every other architecture falls back to ordinary spatial tiling.
+        if (decoded.empty() && auto_fit_enabled) {
+            bool changed = false;
+            if (version == VERSION_LTXAV) {
+                if (!vae_tiling_params.temporal_tiling) {
+                    vae_tiling_params.temporal_tiling = true;
+                    changed                           = true;
+                }
+            } else if (!vae_tiling_params.enabled) {
+                vae_tiling_params.enabled = true;
+                // Reasonable default tile if the user didn't set one.
+                if (vae_tiling_params.tile_size_x <= 0) vae_tiling_params.tile_size_x = 256;
+                if (vae_tiling_params.tile_size_y <= 0) vae_tiling_params.tile_size_y = 256;
+                changed = true;
+            }
+            if (changed) {
+                LOG_WARN("auto-fit: VAE decode failed (likely OOM); retrying with %s tiling",
+                         version == VERSION_LTXAV ? "temporal" : "spatial");
+                first_stage_model->free_compute_buffer();
+                first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling);
+                decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
+            }
+        }
+        return decoded;
     }
 
     sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {
@@ -2641,6 +3262,12 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->vae_format           = SD_VAE_FORMAT_AUTO;
     sd_ctx_params->backend              = nullptr;
     sd_ctx_params->params_backend       = nullptr;
+    sd_ctx_params->auto_fit             = true;
+    sd_ctx_params->auto_fit_target_mb   = 512;
+    sd_ctx_params->auto_fit_dry_run     = false;
+    sd_ctx_params->auto_fit_compute_reserve = nullptr;
+    sd_ctx_params->auto_multi_gpu       = true;
+    sd_ctx_params->multi_gpu_mode       = "row";
     sd_ctx_params->rpc_servers          = nullptr;
 }
 
@@ -2677,6 +3304,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "stream_layers: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
+             "auto_fit: %s\n"
+             "auto_fit_target_mb: %d\n"
+             "auto_fit_dry_run: %s\n"
+             "auto_fit_compute_reserve: %s\n"
+             "auto_multi_gpu: %s\n"
+             "multi_gpu_mode: %s\n"
+             "rpc_servers: %s\n"
              "flash_attn: %s\n"
              "diffusion_flash_attn: %s\n"
              "circular_x: %s\n"
@@ -2711,6 +3345,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->stream_layers),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
+             BOOL_STR(sd_ctx_params->auto_fit),
+             sd_ctx_params->auto_fit_target_mb,
+             BOOL_STR(sd_ctx_params->auto_fit_dry_run),
+             SAFE_STR(sd_ctx_params->auto_fit_compute_reserve),
+             BOOL_STR(sd_ctx_params->auto_multi_gpu),
+             SAFE_STR(sd_ctx_params->multi_gpu_mode),
+             SAFE_STR(sd_ctx_params->rpc_servers),
              BOOL_STR(sd_ctx_params->flash_attn),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
              BOOL_STR(sd_ctx_params->circular_x),