diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 3ae5faba7..bc40f86d1 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -449,6 +449,10 @@ ArgOptions SDContextParams::get_options() {
          "--stream-layers",
          "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
          true, &stream_layers},
+        {"",
+         "--multi-gpu",
+         "distribute model params across multiple GPUs for parallel computation (defaults to false)",
+         true, &multi_gpu},
         {"",
          "--force-sdxl-vae-conv-scale",
          "force use of conv scale on sdxl vae",
@@ -733,6 +737,7 @@ std::string SDContextParams::to_string() const {
         << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
         << "  max_vram: " << max_vram << ",\n"
         << "  stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
+        << "  multi_gpu: " << (multi_gpu ? "true" : "false") << ",\n"
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
@@ -815,6 +820,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         str_to_vae_format(vae_format),
         max_vram,
         stream_layers,
+        multi_gpu,
         backend.c_str(),
         params_backend.c_str(),
     };
diff --git a/examples/common/common.h b/examples/common/common.h
index a90a33132..885a8af9a 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -146,6 +146,7 @@ struct SDContextParams {
     bool offload_params_to_cpu  = false;
     float max_vram              = 0.f;
     bool stream_layers          = false;
+    bool multi_gpu              = false;
     std::string backend;
     std::string params_backend;
     bool enable_mmap           = false;
diff --git a/ggml b/ggml
index 0ce7ad348..37050baa1 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421
+Subproject commit 37050baa178eeb5f518871a013f12f35db35d6e0
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 2175f895a..9560fe675 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -224,6 +224,7 @@ typedef struct {
     enum sd_vae_format_t vae_format;
     float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
     bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
+    bool multi_gpu;      // Distribute model params across multiple GPUs for parallel computation
     const char* backend;
     const char* params_backend;
 } sd_ctx_params_t;
diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
index 70703d244..9f8c61ada 100644
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@@ -1699,6 +1699,13 @@ struct GGMLRunner {
     ggml_backend_t params_backend  = nullptr;
     ggml_backend_t runtime_backend = nullptr;
 
+    // Multi-GPU support: additional GPU backends for parallel parameter distribution
+    std::vector<ggml_backend_t> extra_gpu_backends;
+    ggml_backend_sched_t multi_gpu_sched = nullptr;
+    bool multi_gpu_enabled               = false;
+    std::vector<ggml_backend_buffer_t> multi_gpu_params_buffers;
+    std::vector<ggml_context*> multi_gpu_params_ctxs;
+
     ggml_context* params_ctx            = nullptr;
     ggml_backend_buffer_t params_buffer = nullptr;
 
@@ -2147,6 +2154,9 @@ struct GGMLRunner {
             graph_tensor_set.insert(ggml_graph_node(gf, i));
         }
 
+        int copied_count = 0;
+        int skipped_no_buf = 0;
+        int skipped_not_in_graph = 0;
         for (auto& kv : backend_tensor_data_map) {
             auto tensor = kv.first;
             auto data   = kv.second;
@@ -2155,9 +2165,11 @@ struct GGMLRunner {
             }
             const char* name = ggml_get_name(tensor);
             if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) {
+                skipped_not_in_graph++;
                 continue;
             }
             if (tensor->buffer == nullptr) {
+                skipped_no_buf++;
                 LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
                          get_desc().c_str(),
                          name != nullptr ? name : "",
@@ -2182,8 +2194,13 @@ struct GGMLRunner {
             }
 
             ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
+            copied_count++;
         }
 
+        LOG_INFO("%s copy_data_to_backend_tensor: copied=%d skipped_no_buf=%d skipped_not_in_graph=%d total_in_map=%zu graph_leafs=%d graph_nodes=%d",
+                 get_desc().c_str(), copied_count, skipped_no_buf, skipped_not_in_graph,
+                 backend_tensor_data_map.size(), n_leafs, n_nodes);
+
         if (clear_after_copy) {
             backend_tensor_data_map.clear();
         }
@@ -2434,6 +2451,21 @@ struct GGMLRunner {
         };
         GraphWeightDoneGuard graph_weight_done_guard(this, &params_to_prepare);
 
+        // Multi-GPU execution path using ggml_backend_sched
+        if (multi_gpu_enabled && !extra_gpu_backends.empty() && !sd_backend_is_cpu(runtime_backend)) {
+            auto result = execute_graph_multi_gpu<T>(gf,
+                                                     n_threads,
+                                                     free_compute_buffer,
+                                                     free_compute_params,
+                                                     preserve_backend_tensor_data_map,
+                                                     no_return,
+                                                     cache_keep_names,
+                                                     params_to_prepare);
+            // Dismiss the guard since multi-GPU path handles cleanup internally
+            graph_weight_done_guard.dismiss();
+            return result;
+        }
+
         if (!alloc_compute_buffer(gf)) {
             LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
             return std::nullopt;
@@ -2538,6 +2570,181 @@ struct GGMLRunner {
         return output;
     }
 
+    // Execute `gf` through a ggml backend scheduler that spans runtime_backend, every backend
+    // in extra_gpu_backends and a CPU fallback backend (always registered last).
+    //
+    // The scheduler is created on the first call and reused afterwards. Returns the tensor
+    // named final_result_name, an empty sd::Tensor when no_return is set, or std::nullopt on
+    // failure. free_compute_buffer and preserve_backend_tensor_data_map are not consulted: the
+    // backend tensor data map is always kept (copy_data_to_backend_tensor(gf, false)).
+    // NOTE(review): the scheduler is sized from the first graph only, and the CPU backend
+    // created here is not released in this function -- TODO confirm both are handled elsewhere.
+    template <typename T>
+    std::optional<sd::Tensor<T>> execute_graph_multi_gpu(
+        ggml_cgraph* gf,
+        int n_threads,
+        bool free_compute_buffer,
+        bool free_compute_params,
+        bool preserve_backend_tensor_data_map,
+        bool no_return,
+        const std::unordered_set<std::string>* cache_keep_names,
+        const std::vector<ggml_tensor*>& params_to_prepare) {
+        // Reuse the CPU backend already registered with the scheduler. Creating a new one on
+        // every call leaks a backend per graph and applies n_threads to a backend that the
+        // reused scheduler never runs.
+        ggml_backend_t cpu_backend = nullptr;
+        if (multi_gpu_sched == nullptr) {
+            std::vector<ggml_backend_t> all_backends;
+            all_backends.push_back(runtime_backend);
+            all_backends.insert(all_backends.end(), extra_gpu_backends.begin(), extra_gpu_backends.end());
+            cpu_backend = sd_backend_cpu_init();
+            if (cpu_backend == nullptr) {
+                LOG_ERROR("%s multi-GPU: failed to create CPU fallback backend", get_desc().c_str());
+                return std::nullopt;
+            }
+            all_backends.push_back(cpu_backend);  // CPU fallback is always the last backend
+
+            std::vector<ggml_backend_buffer_type_t> bufts;
+            bufts.reserve(all_backends.size());
+            for (ggml_backend_t backend : all_backends) {
+                bufts.push_back(ggml_backend_get_default_buffer_type(backend));
+            }
+
+            // Node + leaf count of this graph plus a fixed margin
+            const size_t graph_size = (size_t)ggml_graph_n_nodes(gf) +
+                                      (size_t)sd::ggml_graph_cut::leaf_count(gf) + 256;
+            multi_gpu_sched = ggml_backend_sched_new(all_backends.data(), bufts.data(), (int)all_backends.size(),
+                                                     graph_size, /*parallel=*/false, /*op_offload=*/true);
+            if (multi_gpu_sched == nullptr) {
+                LOG_ERROR("%s multi-GPU: failed to create backend scheduler", get_desc().c_str());
+                ggml_backend_free(cpu_backend);
+                return std::nullopt;
+            }
+        } else {
+            const int last_slot = ggml_backend_sched_get_n_backends(multi_gpu_sched) - 1;
+            cpu_backend         = ggml_backend_sched_get_backend(multi_gpu_sched, last_slot);
+        }
+        const int n_backends = ggml_backend_sched_get_n_backends(multi_gpu_sched);
+        const int leaf_count = sd::ggml_graph_cut::leaf_count(gf);
+
+        // Reset the scheduler for this iteration
+        ggml_backend_sched_reset(multi_gpu_sched);
+
+        // After reset, pin leaf tensors without buffers (graph inputs like latent, timestep,
+        // positional encoding, etc.) to the primary GPU
+        int no_buffer_count = 0;
+        for (int i = 0; i < leaf_count; i++) {
+            ggml_tensor* t = sd::ggml_graph_cut::leaf_tensor(gf, i);
+            if (t == nullptr) continue;
+            ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+            if (buf == nullptr) {
+                no_buffer_count++;
+                ggml_backend_sched_set_tensor_backend(multi_gpu_sched, t, runtime_backend);
+            }
+        }
+        LOG_INFO("%s multi-GPU: assigned %d leaf tensors without buffer to primary GPU",
+                 get_desc().c_str(), no_buffer_count);
+
+        if (!ggml_backend_sched_alloc_graph(multi_gpu_sched, gf)) {
+            LOG_ERROR("%s multi-GPU: failed to allocate graph", get_desc().c_str());
+            return std::nullopt;
+        }
+
+        // Debug: count leaves per scheduler slot. Indexing by slot keeps the CPU fallback (last
+        // slot) from being reported as a GPU, whatever the number of GPUs.
+        {
+            std::vector<int> per_backend(n_backends, 0);
+            int unassigned = 0;
+            for (int i = 0; i < leaf_count; i++) {
+                ggml_tensor* t = sd::ggml_graph_cut::leaf_tensor(gf, i);
+                if (t == nullptr) continue;
+                ggml_backend_t t_backend = ggml_backend_sched_get_tensor_backend(multi_gpu_sched, t);
+                int slot                 = -1;
+                for (int b = 0; t_backend != nullptr && b < n_backends && slot < 0; b++) {
+                    if (t_backend == ggml_backend_sched_get_backend(multi_gpu_sched, b)) slot = b;
+                }
+                int& counter = slot >= 0 ? per_backend[slot] : unassigned;
+                counter++;
+            }
+            std::string dist;
+            for (int b = 0; b + 1 < n_backends; b++) {
+                dist += "gpu" + std::to_string(b) + "=" + std::to_string(per_backend[b]) + " ";
+            }
+            dist += "cpu=" + std::to_string(per_backend[n_backends - 1]);
+            LOG_INFO("%s multi-GPU: leaf backend dist: %s null=%d", get_desc().c_str(), dist.c_str(), unassigned);
+        }
+
+        // Debug: log the first values of up to five F32 leaf tensors. Other tensor types are
+        // skipped because their raw bytes cannot be interpreted as floats.
+        {
+            int checked = 0;
+            for (int i = 0; i < leaf_count && checked < 5; i++) {
+                ggml_tensor* t = sd::ggml_graph_cut::leaf_tensor(gf, i);
+                if (t == nullptr || t->op != GGML_OP_NONE || t->type != GGML_TYPE_F32) continue;
+                ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+                if (buf == nullptr || t->data == nullptr) continue;
+                float vals[4]        = {0};
+                const size_t to_read = std::min(ggml_nbytes(t), sizeof(vals));
+                ggml_backend_tensor_get(t, vals, 0, to_read);
+                bool has_nan = false;
+                for (size_t k = 0; k < to_read / sizeof(float); k++) {
+                    if (std::isnan(vals[k]) || std::isinf(vals[k])) has_nan = true;
+                }
+                LOG_INFO("%s multi-GPU: leaf[%d] '%s' type=%s ne=[%lld,%lld,%lld] vals=[%.4f,%.4f,%.4f,%.4f] nan=%d buf=%p",
+                         get_desc().c_str(), i, t->name, ggml_type_name(t->type),
+                         (long long)t->ne[0], (long long)t->ne[1], (long long)t->ne[2],
+                         vals[0], vals[1], vals[2], vals[3], has_nan, (void*)buf);
+                checked++;
+            }
+        }
+
+        // Copy data to backend tensors AFTER alloc_graph so tensors have buffers. The data map
+        // is always preserved: the scheduler is reused and leaves need their data on every run.
+        copy_data_to_backend_tensor(gf, false);
+
+        // Apply the thread count to the CPU backend the scheduler actually uses
+        sd_backend_cpu_set_n_threads(cpu_backend, n_threads);
+
+        LOG_INFO("%s multi-GPU: executing graph with %d splits across %d backends",
+                 get_desc().c_str(), ggml_backend_sched_get_n_splits(multi_gpu_sched), n_backends);
+
+        // Execute the graph
+        ggml_status status = ggml_backend_sched_graph_compute(multi_gpu_sched, gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            LOG_ERROR("%s multi-GPU: compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
+            return std::nullopt;
+        }
+
+        ggml_backend_sched_synchronize(multi_gpu_sched);
+
+        LOG_INFO("%s multi-GPU: graph compute completed with %d splits",
+                 get_desc().c_str(), ggml_backend_sched_get_n_splits(multi_gpu_sched));
+
+        // Read output
+        if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) {
+            return std::nullopt;
+        }
+        auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
+        std::optional<sd::Tensor<T>> output;
+        if (!no_return) {
+            output = read_graph_tensor<T>(result, "output");
+            if (!output.has_value()) {
+                return std::nullopt;
+            }
+        } else {
+            output = sd::Tensor<T>();
+        }
+
+        if (!free_compute_params) {
+            for (ggml_tensor* param : params_to_prepare) {
+                if (param == nullptr) continue;
+                kept_compute_param_tensor_set.insert(param);
+            }
+        }
+
+        return output;
+    }
+
     template <typename T>
     std::optional<sd::Tensor<T>> compute_graph_cut_segments(ggml_cgraph* gf,
                                                             const GraphCutPlan& plan,
@@ -2645,6 +2852,16 @@ struct GGMLRunner {
         free_params_ctx();
         free_compute_ctx();
         free_cache_ctx_and_buffer();
+        if (multi_gpu_sched != nullptr) {
+            ggml_backend_sched_free(multi_gpu_sched);
+            multi_gpu_sched = nullptr;
+        }
+        for (auto& backend : extra_gpu_backends) {
+            if (backend != nullptr) {
+                ggml_backend_free(backend);
+            }
+        }
+        extra_gpu_backends.clear();
     }
 
     virtual GGMLRunnerContext get_context() {
@@ -2696,6 +2913,12 @@ struct GGMLRunner {
             LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
             return true;
         }
+
+        // Multi-GPU: distribute params across multiple GPUs
+        if (multi_gpu_enabled && !extra_gpu_backends.empty() && !sd_backend_is_cpu(params_backend)) {
+            return alloc_params_buffer_multi_gpu(num_tensors);
+        }
+
         // Pinned host buffer when CPU-offloaded for DMA-direct H2D.
         ggml_backend_buffer_type_t params_buft = nullptr;
         if (params_backend != runtime_backend) {
@@ -2725,12 +2948,149 @@ struct GGMLRunner {
         return true;
     }
 
+    // Extract the block index N that follows "transformer_blocks.", "single_blocks." or "double_blocks."
+    // in a tensor name (used for round-robin GPU assignment). Returns -1 when no index is present.
+    int extract_block_index(const std::string& name) const {
+        static const std::vector<std::string> block_prefixes = {"transformer_blocks.", "single_blocks.", "double_blocks."};
+        const int max_before_append = 99999999;  // appending one more digit still fits in a 32-bit int
+        for (const auto& prefix : block_prefixes) {
+            const size_t pos = name.find(prefix);
+            if (pos == std::string::npos) continue;
+            // Parse the digits by hand: std::isdigit on a negative char is undefined behaviour and
+            // std::stoi throws on overflow; a malformed tensor name must not abort model loading.
+            const size_t first_digit = pos + prefix.size();
+            size_t cursor            = first_digit;
+            int index                = 0;
+            for (; cursor < name.size() && name[cursor] >= '0' && name[cursor] <= '9'; cursor++) {
+                if (index > max_before_append) {
+                    return -1;  // implausibly large index: treat the tensor as a non-block tensor
+                }
+                index = index * 10 + (name[cursor] - '0');
+            }
+            if (cursor > first_digit) return index;
+        }
+        return -1;  // No block index found
+    }
+
+    // Allocate parameter storage spread over runtime_backend and every extra GPU backend.
+    //
+    // Tensors whose name carries a block index go to GPU `index % n_gpus`; all others go to
+    // GPU 0. Each GPU gets a metadata-only context holding duplicates of its tensors and one
+    // backend buffer; the resulting buffer/data/extra pointers are written back into the
+    // params_ctx tensors. Buffers and contexts are recorded in multi_gpu_params_buffers and
+    // multi_gpu_params_ctxs so that free_params_buffer() can release them.
+    // num_tensors is not used. Returns false when a buffer allocation fails or the weight
+    // manager's prepare_params() fails; anything already recorded stays recorded.
+    bool alloc_params_buffer_multi_gpu(size_t num_tensors) {
+        // Collect all GPU backends (primary + extra)
+        std::vector<ggml_backend_t> all_gpus;
+        all_gpus.push_back(runtime_backend);
+        all_gpus.insert(all_gpus.end(), extra_gpu_backends.begin(), extra_gpu_backends.end());
+        const int n_gpus = (int)all_gpus.size();
+
+        LOG_INFO("%s multi-GPU: distributing params across %d GPUs", get_desc().c_str(), n_gpus);
+
+        // Group tensors by GPU assignment using round-robin based on block index
+        std::vector<std::vector<ggml_tensor*>> gpu_tensors(n_gpus);
+        for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+            const int block_idx = extract_block_index(t->name);
+            const int gpu_idx   = block_idx >= 0 ? block_idx % n_gpus : 0;  // non-block tensors -> GPU 0
+            gpu_tensors[gpu_idx].push_back(t);
+        }
+
+        // Allocate buffers on each GPU
+        for (int g = 0; g < n_gpus; g++) {
+            const std::vector<ggml_tensor*>& originals = gpu_tensors[g];
+            if (originals.empty()) continue;
+
+            // Create a metadata-only context for this GPU's tensors
+            ggml_init_params init_params;
+            init_params.mem_size   = originals.size() * ggml_tensor_overhead();
+            init_params.mem_buffer = nullptr;
+            init_params.no_alloc   = true;
+            ggml_context* gpu_ctx  = ggml_init(init_params);
+            GGML_ASSERT(gpu_ctx != nullptr);
+
+            // Duplicate tensor metadata into this GPU's context. The duplicates are kept in the
+            // same order as `originals` so the write-back below can pair them by position;
+            // pairing by name is quadratic and picks the wrong tensor when two names collide
+            // (names live in a fixed-size field of ggml_tensor).
+            std::vector<ggml_tensor*> duplicates;
+            duplicates.reserve(originals.size());
+            for (ggml_tensor* src : originals) {
+                ggml_tensor* dst = ggml_dup_tensor(gpu_ctx, src);
+                ggml_set_name(dst, src->name);
+                // Mirror any placement the original already has onto the duplicate
+                dst->buffer = src->buffer;
+                dst->data   = src->data;
+                dst->extra  = src->extra;
+                duplicates.push_back(dst);
+            }
+
+            // Allocate on this GPU
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(all_gpus[g]);
+            ggml_backend_buffer_t buf       = ggml_backend_alloc_ctx_tensors_from_buft(gpu_ctx, buft);
+            if (buf == nullptr) {
+                LOG_ERROR("%s multi-GPU: alloc params buffer failed on GPU %d", get_desc().c_str(), g);
+                // Free only this GPU's context: contexts of earlier GPUs are already tracked in
+                // multi_gpu_params_ctxs, so freeing them here as well would be a double free.
+                ggml_free(gpu_ctx);
+                return false;
+            }
+            ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Point the original params_ctx tensors at the newly allocated storage
+            for (size_t i = 0; i < originals.size(); i++) {
+                originals[i]->buffer = duplicates[i]->buffer;
+                originals[i]->data   = duplicates[i]->data;
+                originals[i]->extra  = duplicates[i]->extra;
+            }
+
+            size_t buf_size = ggml_backend_buffer_get_size(buf);
+            LOG_INFO("%s multi-GPU: GPU %d params buffer = %.2f MB (%zu tensors)",
+                     get_desc().c_str(), g, buf_size / (1024.f * 1024.f), originals.size());
+
+            // Track the buffer and context so free_params_buffer() releases them
+            multi_gpu_params_buffers.push_back(buf);
+            multi_gpu_params_ctxs.push_back(gpu_ctx);
+        }
+
+        // Load data from model files into the GPU buffers
+        auto manager = weight_manager.lock();
+        if (manager != nullptr) {
+            std::vector<ggml_tensor*> all_tensors;
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+                all_tensors.push_back(t);
+            }
+            if (!manager->prepare_params(all_tensors)) {
+                LOG_ERROR("%s multi-GPU: prepare params failed", get_desc().c_str());
+                return false;
+            }
+        }
+
+        rebuild_params_tensor_set();
+        LOG_INFO("%s multi-GPU: params distributed across %d GPUs successfully", get_desc().c_str(), n_gpus);
+        return true;
+    }
+
 protected:
     void free_params_buffer() {
         if (params_buffer != nullptr) {
             ggml_backend_buffer_free(params_buffer);
             params_buffer = nullptr;
         }
+        // Release every per-GPU weight buffer recorded by the multi-GPU allocation path
+        for (ggml_backend_buffer_t gpu_buffer : multi_gpu_params_buffers) {
+            if (gpu_buffer == nullptr) continue;
+            ggml_backend_buffer_free(gpu_buffer);
+        }
+        multi_gpu_params_buffers.clear();
+        // Then free the per-GPU contexts that were recorded alongside those buffers
+        for (ggml_context* gpu_ctx : multi_gpu_params_ctxs) {
+            if (gpu_ctx == nullptr) continue;
+            ggml_free(gpu_ctx);
+        }
+        multi_gpu_params_ctxs.clear();
         observed_max_effective_budget_ = 0;
     }
 
@@ -2903,6 +3263,20 @@ struct GGMLRunner {
         stream_layers_enabled = enabled;
     }
 
+    void set_multi_gpu_enabled(bool enabled) {  // the multi-GPU paths additionally require an extra GPU backend
+        this->multi_gpu_enabled = enabled;
+    }
+
+    void add_extra_gpu_backend(ggml_backend_t backend) {
+        // Null handles are ignored. Stored backends are later released by this runner with ggml_backend_free().
+        if (backend == nullptr) return;
+        extra_gpu_backends.push_back(backend);
+    }
+
+    bool has_extra_gpu_backends() const {  // true once add_extra_gpu_backend() has stored a backend
+        return extra_gpu_backends.size() > 0;
+    }
+
     ggml_backend_t get_runtime_backend() {
         return runtime_backend;
     }
diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp
index 455dc4b2e..f128ca3d3 100644
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
@@ -918,6 +918,10 @@ namespace LTXV {
             }
 
             auto regs = ggml_reshape_3d(ctx->ggml_ctx, params["learnable_registers"], hidden_size, num_learnable_registers, 1);
+            // Cast regs to match hidden_states type for concat compatibility
+            if (regs->type != hidden_states->type) {
+                regs = ggml_cast(ctx->ggml_ctx, regs, hidden_states->type);
+            }
             auto temp = ggml_new_tensor_3d(ctx->ggml_ctx, regs->type, regs->ne[0], regs->ne[1], hidden_states->ne[2]);
             regs      = ggml_repeat(ctx->ggml_ctx, regs, temp);
 
diff --git a/src/model/vae/ltx_vae.hpp b/src/model/vae/ltx_vae.hpp
index 59e38c32d..8caf501be 100644
--- a/src/model/vae/ltx_vae.hpp
+++ b/src/model/vae/ltx_vae.hpp
@@ -1068,7 +1068,9 @@ namespace LTXVAE {
               timestep_conditioning(timestep_conditioning),
               patch_size(patch_size),
               decode_only(decode_only) {
+            LOG_INFO("VideoVAE: version=%d, decode_only=%d, prefix=%s", version, decode_only, prefix.c_str());
             if (!decode_only) {
+                LOG_INFO("VideoVAE: creating Encoder with version=%d", version);
                 blocks["encoder"] = std::make_shared<Encoder>(version,
                                                               tensor_storage_map,
                                                               prefix,
diff --git a/src/model_io/safetensors_io.cpp b/src/model_io/safetensors_io.cpp
index 39131dbd8..fa73aaafb 100644
--- a/src/model_io/safetensors_io.cpp
+++ b/src/model_io/safetensors_io.cpp
@@ -72,9 +72,9 @@ static ggml_type safetensors_dtype_to_ggml_type(const std::string& dtype) {
     } else if (dtype == "F64") {
         ttype = GGML_TYPE_F32;
     } else if (dtype == "F8_E4M3") {
-        ttype = GGML_TYPE_F16;
+        ttype = GGML_TYPE_BF16;
     } else if (dtype == "F8_E5M2") {
-        ttype = GGML_TYPE_F16;
+        ttype = GGML_TYPE_BF16;
     } else if (dtype == "I32") {
         ttype = GGML_TYPE_I32;
     } else if (dtype == "I64") {
diff --git a/src/model_loader.cpp b/src/model_loader.cpp
index 8d37d39a2..d2341cdf6 100644
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@@ -4,6 +4,7 @@
 #include <cinttypes>
 #include <cstdarg>
 #include <cstdlib>
+#include <dirent.h>
 #include <fstream>
 #include <functional>
 #include <mutex>
@@ -15,6 +16,7 @@
 #include <vector>
 
 #include "core/util.h"
+#include "json.hpp"
 #include "model_io/gguf_io.h"
 #include "model_io/safetensors_io.h"
 #include "model_io/torch_legacy_io.h"
@@ -120,10 +122,99 @@ uint16_t f8_e4m3_to_f16(uint8_t f8) {
     return ggml_fp32_to_fp16(*reinterpret_cast<const float*>(&result));
 }
 
+uint16_t f8_e4m3_to_bf16(uint8_t f8) {
+    // Widen an FP8 E4M3 value (1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits)
+    // to BF16. The value is assembled as an IEEE-754 binary32 bit pattern and the upper
+    // 16 bits of that pattern are returned.
+    //
+    // Cases, in the order handled below: NaN, normal numbers, subnormals and signed zero.
+    // This decoder produces no infinities; 0x7f and 0xff are the only inputs mapped to NaN.
+    if (f8 == 0x7f) {
+        return 0x7FC0;  // positive NaN: sign=0, exponent=0xFF, mantissa=0x40
+    }
+    if (f8 == 0xff) {
+        return 0xFFC0;  // negative NaN: sign=1, exponent=0xFF, mantissa=0x40
+    }
+
+    const uint32_t fp32_bias_delta = 0x7f - 7;  // rebias from E4M3 (7) to binary32 (127)
+    const uint32_t sign_bit        = static_cast<uint32_t>(f8 & 0x80) << 24;
+    const uint32_t raw_exponent    = (f8 >> 3) & 0x0f;
+    uint32_t mantissa              = f8 & 0x07;
+
+    uint32_t fp32_bits = sign_bit;
+    if (raw_exponent != 0) {
+        // Normal number: the 3 mantissa bits become the top 3 fraction bits (22..20)
+        fp32_bits |= mantissa << 20;
+        fp32_bits |= (raw_exponent + fp32_bias_delta) << 23;
+    } else if (mantissa != 0) {
+        // Subnormal: shift left until the leading 1 reaches bit 2 (at most two shifts),
+        // lowering the exponent once per shift
+        uint32_t exponent = fp32_bias_delta;
+        while ((mantissa & 0x04) == 0) {
+            mantissa <<= 1;
+            exponent -= 1;
+        }
+
+        // Bit 2 is the implicit leading 1 and is dropped;
+        // bits 1..0 become fraction bits 22..21
+        fp32_bits |= (mantissa & 0x03) << 21;
+        fp32_bits |= exponent << 23;
+    }
+    // An all-zero exponent and mantissa falls through as a signed zero
+
+    // binary32 -> BF16 by truncation; exact, because at most the top 3 fraction bits are set
+    return static_cast<uint16_t>(fp32_bits >> 16);
+}
+
 uint16_t f8_e5m2_to_f16(uint8_t fp8) {
     return static_cast<uint16_t>(fp8) << 8;
 }
 
+uint16_t f8_e5m2_to_bf16(uint8_t fp8) {
+    // Widen an FP8 E5M2 value (1 sign bit, 5 exponent bits with bias 15, 2 mantissa bits)
+    // to BF16. The value is assembled as an IEEE-754 binary32 bit pattern and the upper
+    // 16 bits of that pattern are returned; no float object is ever materialised, so no
+    // pointer type punning is needed.
+    //
+    // Encodings:
+    //   exponent == 0x1f, mantissa == 0  -> +/- infinity
+    //   exponent == 0x1f, mantissa != 0  -> NaN (returned as a quiet NaN with the input sign)
+    //   exponent == 0,    mantissa == 0  -> +/- zero
+    //   exponent == 0,    mantissa != 0  -> subnormal, value = mantissa / 4 * 2^-14
+    //   otherwise                        -> normal,    value = (1 + mantissa / 4) * 2^(exponent - 15)
+    const uint32_t fp32_bias_delta = 0x7f - 15;  // rebias from E5M2 (15) to binary32 (127)
+    const uint32_t sign_bit        = static_cast<uint32_t>(fp8 & 0x80) << 24;
+    const uint32_t raw_exponent    = (fp8 >> 2) & 0x1f;
+    uint32_t mantissa              = fp8 & 0x03;
+
+    uint32_t fp32_bits = sign_bit;
+    if (raw_exponent == 0x1f) {
+        // All exponent bits set: infinity or NaN
+        fp32_bits |= (mantissa == 0) ? 0x7f800000u : 0x7fc00000u;
+    } else if (raw_exponent != 0) {
+        // Normal number: the 2 mantissa bits become the top 2 fraction bits (22..21)
+        fp32_bits |= (raw_exponent + fp32_bias_delta) << 23;
+        fp32_bits |= mantissa << 21;
+    } else if (mantissa != 0) {
+        // Subnormal: normalise so that the leading 1 sits in bit 1 (one shift at most)
+        uint32_t exponent = fp32_bias_delta;
+        if ((mantissa & 0x02) == 0) {
+            mantissa <<= 1;
+            exponent -= 1;
+        }
+        // Bit 1 is now the implicit leading 1 and must NOT be stored; only bit 0 remains
+        // and it is the most significant fraction bit (bit 22). Storing the leading 1 as
+        // well, as `mantissa << 21` would, makes every subnormal too large
+        // (e.g. 0x01 would decode to 1.5 * 2^-16 instead of 2^-16).
+        fp32_bits |= exponent << 23;
+        fp32_bits |= (mantissa & 0x01) << 22;
+    }
+
+    // binary32 -> BF16 by truncation; exact, because at most the top 2 fraction bits (and
+    // the quiet-NaN bit) are ever set
+    return static_cast<uint16_t>(fp32_bits >> 16);
+}
+
 void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     // support inplace op
     for (int64_t i = n - 1; i >= 0; i--) {
@@ -138,6 +229,20 @@ void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     }
 }
 
+void f8_e4m3_to_bf16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
+    // Converts back to front so dst may alias src: a 2-byte store never clobbers an unread input byte
+    for (int64_t idx = n; idx-- > 0;) {
+        dst[idx] = f8_e4m3_to_bf16(src[idx]);
+    }
+}
+
+void f8_e5m2_to_bf16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
+    // Converts back to front so dst may alias src: a 2-byte store never clobbers an unread input byte
+    for (int64_t idx = n; idx-- > 0;) {
+        dst[idx] = f8_e5m2_to_bf16(src[idx]);
+    }
+}
+
 void f64_to_f32_vec(double* src, float* dst, int64_t n) {
     // support inplace op
     for (int64_t i = 0; i < n; i++) {
@@ -406,22 +511,156 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
     std::string clip_path   = path_join(file_path, "text_encoder/model.safetensors");
     std::string clip_g_path = path_join(file_path, "text_encoder_2/model.safetensors");
 
-    if (!init_from_safetensors_file(unet_path, "unet.")) {
+    // Check for standard diffusers directory structure first
+    if (file_exists(unet_path) || file_exists(vae_path) || file_exists(clip_path) || file_exists(clip_g_path)) {
+        if (!init_from_safetensors_file(unet_path, "unet.")) {
+            return false;
+        }
+
+        if (!init_from_safetensors_file(vae_path, "vae.")) {
+            LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
+            // return false;
+        }
+        if (!init_from_safetensors_file(clip_path, "te.")) {
+            LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
+            // return false;
+        }
+        if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
+            LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
+        }
+        return true;
+    }
+
+    // HuggingFace sharded safetensors format: load model-*.safetensors files
+    // Look for model.safetensors.index.json or model-*.safetensors files
+    std::string index_path = path_join(file_path, "model.safetensors.index.json");
+    std::set<std::string> shard_files;
+
+    if (file_exists(index_path)) {
+        // Parse the index JSON to find which shard files exist
+        std::ifstream ifs(index_path);
+        if (ifs.is_open()) {
+            try {
+                nlohmann::json index_json = nlohmann::json::parse(ifs);
+                if (index_json.contains("weight_map")) {
+                    for (auto& [key, val] : index_json["weight_map"].items()) {
+                        if (val.is_string()) {
+                            shard_files.insert(val.get<std::string>());
+                        }
+                    }
+                }
+            } catch (...) {
+                LOG_WARN("Failed to parse model.safetensors.index.json in %s", file_path.c_str());
+            }
+        }
+    }
+
+    // If no index found, glob for model-*.safetensors files
+    if (shard_files.empty()) {
+        // Try single model.safetensors file
+        std::string single_model_path = path_join(file_path, "model.safetensors");
+        if (file_exists(single_model_path)) {
+            return init_from_safetensors_file(single_model_path, prefix);
+        }
+
+        // Try to find model-*.safetensors shard files
+        DIR* dir = opendir(file_path.c_str());
+        if (dir) {
+            struct dirent* entry;
+            while ((entry = readdir(dir)) != nullptr) {
+                std::string name = entry->d_name;
+                if (name.find("model-") == 0 && name.find(".safetensors") != std::string::npos) {
+                    shard_files.insert(name);
+                }
+            }
+            closedir(dir);
+        }
+    }
+
+    if (shard_files.empty()) {
+        LOG_WARN("No safetensors model files found in %s", file_path.c_str());
         return false;
     }
 
-    if (!init_from_safetensors_file(vae_path, "vae.")) {
-        LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
-        // return false;
+    LOG_INFO("Loading %zu shard files from %s", shard_files.size(), file_path.c_str());
+    bool any_loaded = false;
+    for (const auto& shard_name : shard_files) {
+        std::string shard_path = path_join(file_path, shard_name);
+        if (init_from_safetensors_file(shard_path, prefix)) {
+            any_loaded = true;
+        } else {
+            LOG_WARN("Failed to load shard %s", shard_path.c_str());
+        }
     }
-    if (!init_from_safetensors_file(clip_path, "te.")) {
-        LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
-        // return false;
+
+    // HuggingFace gemma models have a "language_model." prefix that needs to be stripped
+    // e.g. "language_model.model.embed_tokens.weight" -> "model.embed_tokens.weight"
+    // This is needed because the C++ code expects "text_encoders.llm.model.embed_tokens.weight"
+    // but the safetensors files contain "language_model.model.embed_tokens.weight"
+    bool has_language_model_prefix = false;
+    for (auto& [name, ts] : tensor_storage_map) {
+        if (starts_with(name, prefix + "language_model.")) {
+            has_language_model_prefix = true;
+            break;
+        }
     }
-    if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
-        LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
+    if (has_language_model_prefix) {
+        LOG_INFO("Stripping 'language_model.' prefix from tensors in %s", file_path.c_str());
+        String2TensorStorage new_map;
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            if (starts_with(name, prefix + "language_model.")) {
+                std::string new_name = prefix + name.substr(prefix.size() + strlen("language_model."));
+                tensor_storage.name = new_name;
+                new_map[new_name] = std::move(tensor_storage);
+            } else {
+                new_map[name] = std::move(tensor_storage);
+            }
+        }
+        tensor_storage_map.swap(new_map);
     }
-    return true;
+
+    // HuggingFace safetensors LLM naming differs from GGUF naming.
+    // The C++ code uses GGUF-style names internally (e.g., post_attention_norm, post_ffw_norm),
+    // but safetensors files use HuggingFace names (e.g., post_feedforward_layernorm, pre_feedforward_layernorm).
+    // Apply reverse mapping: safetensors HF name -> C++ internal name.
+    static const std::vector<std::pair<std::string, std::string>> hf_llm_name_map = {
+        {"post_feedforward_layernorm.", "post_ffw_norm."},
+        {"pre_feedforward_layernorm.", "post_attention_norm."},
+        // Standard mappings that are already correct in most safetensors files:
+        // {"input_layernorm.", "input_layernorm."}  -- no change needed
+        // {"post_attention_layernorm.", "post_attention_layernorm."}  -- already correct for pre_ffw_norm
+    };
+
+    bool needs_rename = false;
+    for (auto& [name, ts] : tensor_storage_map) {
+        if (!starts_with(name, prefix)) continue;
+        for (const auto& [hf_name, cpp_name] : hf_llm_name_map) {
+            if (name.find(hf_name) != std::string::npos) {
+                needs_rename = true;
+                break;
+            }
+        }
+        if (needs_rename) break;
+    }
+
+    if (needs_rename) {
+        LOG_INFO("Applying HuggingFace LLM name mapping for %s", file_path.c_str());
+        String2TensorStorage new_map;
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            std::string new_name = name;
+            for (const auto& [hf_name, cpp_name] : hf_llm_name_map) {
+                size_t pos = new_name.find(hf_name);
+                if (pos != std::string::npos) {
+                    new_name.replace(pos, hf_name.size(), cpp_name);
+                }
+            }
+            tensor_storage.name = new_name;
+            new_map[new_name] = std::move(tensor_storage);
+        }
+        tensor_storage_map.swap(new_map);
+    }
+
+    return any_loaded;
 }
 
 SDVersion ModelLoader::get_sd_version() {
@@ -1130,9 +1369,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
 
                     t0 = ggml_time_ms();
                     if (tensor_storage.is_f8_e4m3) {
-                        f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
+                        if (tensor_storage.type == GGML_TYPE_BF16) {
+                            f8_e4m3_to_bf16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
+                        } else {
+                            f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
+                        }
                     } else if (tensor_storage.is_f8_e5m2) {
-                        f8_e5m2_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
+                        if (tensor_storage.type == GGML_TYPE_BF16) {
+                            f8_e5m2_to_bf16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
+                        } else {
+                            f8_e5m2_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
+                        }
                     } else if (tensor_storage.is_f64) {
                         f64_to_f32_vec((double*)read_buf, (float*)target_buf, tensor_storage.nelements());
                     } else if (tensor_storage.is_i64) {
diff --git a/src/model_manager.cpp b/src/model_manager.cpp
index 328a478bb..79ee61d2e 100644
--- a/src/model_manager.cpp
+++ b/src/model_manager.cpp
@@ -100,6 +100,28 @@ size_t estimate_tensors_size(const std::map<std::string, ggml_tensor*>& tensors)
     return size;
 }
 
+// Returns the decimal index that follows the first block marker found in `name`, or -1.
+// Markers are tried in the listed order; a marker with no digits after it is skipped.
+// Runs longer than 9 digits yield -1 so std::stoi can never throw std::out_of_range.
+static int extract_block_index(const std::string& name) {
+    static const std::vector<std::string> block_prefixes = {"transformer_blocks.", "single_blocks.", "double_blocks."};
+    for (const auto& prefix : block_prefixes) {
+        const size_t pos = name.find(prefix);
+        if (pos == std::string::npos) continue;
+        const size_t start = pos + prefix.size();
+        size_t end         = start;
+        // Cast first: std::isdigit is undefined for negative char values (non-ASCII bytes).
+        while (end < name.size() && std::isdigit(static_cast<unsigned char>(name[end]))) {
+            end++;
+        }
+        if (end == start) continue;
+        // Up to 9 decimal digits always fit in a 32-bit int; anything longer is rejected.
+        if (end - start > 9) return -1;
+        return std::stoi(name.substr(start, end - start));
+    }
+    return -1;
+}
+
 bool ModelManager::register_param_tensors(const std::string& desc,
                                           std::map<std::string, ggml_tensor*> tensors,
                                           ResidencyMode residency_mode,
@@ -114,6 +136,15 @@ bool ModelManager::register_param_tensors(const std::string& desc,
         *registered_tensor_size += estimate_tensors_size(tensors);
     }
 
+    // Build list of all GPU backends for multi-GPU distribution
+    std::vector<ggml_backend_t> all_gpus;
+    if (multi_gpu_enabled_ && !extra_gpu_backends_.empty()) {
+        all_gpus.push_back(compute_backend);
+        for (auto& b : extra_gpu_backends_) {
+            all_gpus.push_back(b);
+        }
+    }
+
     std::vector<std::unique_ptr<TensorState>> new_states;
     new_states.reserve(tensors.size());
 
@@ -134,7 +165,21 @@ bool ModelManager::register_param_tensors(const std::string& desc,
         state->tensor          = tensor;
         state->desc            = desc;
         state->residency_mode  = residency_mode;
-        state->compute_backend = compute_backend;
+
+        // Multi-GPU: assign compute backend based on block index
+        if (!all_gpus.empty()) {
+            int block_idx = extract_block_index(name);
+            int gpu_idx;
+            if (block_idx >= 0) {
+                gpu_idx = block_idx % (int)all_gpus.size();
+            } else {
+                gpu_idx = 0;
+            }
+            state->compute_backend = all_gpus[gpu_idx];
+        } else {
+            state->compute_backend = compute_backend;
+        }
+
         state->params_backend  = params_backend;
         new_states.push_back(std::move(state));
     }
diff --git a/src/model_manager.h b/src/model_manager.h
index b3da8a36a..97177ad3b 100644
--- a/src/model_manager.h
+++ b/src/model_manager.h
@@ -69,6 +69,8 @@ class ModelManager : public RunnerWeightManager {
     uint64_t current_lora_epoch_ = 0;
     int n_threads_               = 0;
     bool enable_mmap_            = false;
+    bool multi_gpu_enabled_      = false;
+    std::vector<ggml_backend_t> extra_gpu_backends_;
 
     void finish_compute_backend_usage(const std::vector<TensorState*>& states);
     void release_all();
@@ -110,6 +112,12 @@ class ModelManager : public RunnerWeightManager {
         model_loader_.set_n_threads(n_threads);
     }
     void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; }
+    void set_multi_gpu_enabled(bool enabled) { multi_gpu_enabled_ = enabled; }  // stores the flag that register_param_tensors checks
+    // Appends a backend handle to extra_gpu_backends_; a null handle is ignored.
+    void add_extra_gpu_backend(ggml_backend_t backend) {
+        if (backend == nullptr) return;
+        extra_gpu_backends_.push_back(backend);
+    }
     void set_common_ignore_tensors(std::set<std::string> ignore_tensors);
     void set_loras(std::vector<LoraSpec> loras, SDVersion version);
 
diff --git a/src/runtime/denoiser.hpp b/src/runtime/denoiser.hpp
index fed5911bc..520d7a855 100644
--- a/src/runtime/denoiser.hpp
+++ b/src/runtime/denoiser.hpp
@@ -950,6 +950,33 @@ static sd::Tensor<float> sample_euler(denoise_cb_t model,
             return {};
         }
         sd::Tensor<float> denoised = std::move(denoised_opt.pred);
+
+        // Debug: check denoised and x for NaN at each step
+        {
+            int nan_d = 0, nan_x = 0;
+            int n = std::min((int)denoised.numel(), 100);
+            for (int j = 0; j < n; j++) {
+                if (std::isnan(denoised.data()[j]) || std::isinf(denoised.data()[j])) nan_d++;
+                if (std::isnan(x.data()[j]) || std::isinf(x.data()[j])) nan_x++;
+            }
+            if (nan_d > 0 || nan_x > 0 || i == 0) {
+                float d_min = std::numeric_limits<float>::max(), d_max = std::numeric_limits<float>::lowest();
+                float x_min = std::numeric_limits<float>::max(), x_max = std::numeric_limits<float>::lowest();
+                for (int j = 0; j < n; j++) {
+                    if (!std::isnan(denoised.data()[j]) && !std::isinf(denoised.data()[j])) {
+                        d_min = std::min(d_min, denoised.data()[j]);
+                        d_max = std::max(d_max, denoised.data()[j]);
+                    }
+                    if (!std::isnan(x.data()[j]) && !std::isinf(x.data()[j])) {
+                        x_min = std::min(x_min, x.data()[j]);
+                        x_max = std::max(x_max, x.data()[j]);
+                    }
+                }
+                LOG_INFO("euler step %d/%d: sigma=%.4f denoised[nan=%d min=%.4f max=%.4f] x[nan=%d min=%.4f max=%.4f]",
+                         i, steps, sigma, nan_d, d_min, d_max, nan_x, x_min, x_max);
+            }
+        }
+
         sd::Tensor<float> d        = (x - denoised) / sigma;
         x += d * (sigmas[i + 1] - sigma);
     }
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 19f9e85ea..956454b85 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -193,6 +193,7 @@ class StableDiffusionGGML {
     bool enable_mmap                     = false;
     float max_vram                       = 0.f;
     bool stream_layers                   = false;
+    bool multi_gpu                       = false;
     std::string backend_spec;
     std::string params_backend_spec;
 
@@ -327,6 +328,7 @@ class StableDiffusionGGML {
         enable_mmap             = sd_ctx_params->enable_mmap;
         max_vram                = sd_ctx_params->max_vram;
         stream_layers           = sd_ctx_params->stream_layers;
+        multi_gpu               = sd_ctx_params->multi_gpu;
         backend_spec            = SAFE_STR(sd_ctx_params->backend);
         params_backend_spec     = SAFE_STR(sd_ctx_params->params_backend);
         if (stream_layers && max_vram == 0.f) {
@@ -809,6 +811,28 @@ class StableDiffusionGGML {
 
             diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
             diffusion_model->set_stream_layers_enabled(stream_layers);
+            if (multi_gpu) {
+                diffusion_model->set_multi_gpu_enabled(true);
+                // Initialize extra GPU backends (GPU 1, GPU 2, ...)
+                int n_devices = ggml_backend_dev_count();
+                for (int i = 1; i < n_devices; i++) {
+                    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                    if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                        if (backend != nullptr) {
+                            LOG_INFO("Multi-GPU: adding GPU %d (%s)", i, ggml_backend_name(backend));
+                            diffusion_model->add_extra_gpu_backend(backend);
+                            model_manager->add_extra_gpu_backend(backend);
+                        }
+                    }
+                }
+                if (!diffusion_model->has_extra_gpu_backends()) {
+                    LOG_WARN("--multi-gpu specified but no extra GPUs found; disabling");
+                    diffusion_model->set_multi_gpu_enabled(false);
+                } else {
+                    model_manager->set_multi_gpu_enabled(true);
+                }
+            }
             if (!register_runner_params("Diffusion model",
                                         diffusion_model,
                                         SDBackendModule::DIFFUSION,
@@ -870,6 +894,7 @@ class StableDiffusionGGML {
 
             auto create_vae = [&]() -> std::shared_ptr<VAE> {
                 if (sd_version_is_ltxav(version)) {
+                    LOG_INFO("Creating LTXVideoVAE with vae_decode_only=%d", vae_decode_only);
                     return std::make_shared<LTXVideoVAE>(backend_for(SDBackendModule::VAE),
                                                          params_backend_for(SDBackendModule::VAE),
                                                          tensor_storage_map,
@@ -2053,6 +2078,22 @@ class StableDiffusionGGML {
                     return sd::Tensor<float>();
                 }
 
+                // Debug: check for NaN in diffusion model output
+                {
+                    int nan_count = 0;
+                    int inf_count = 0;
+                    int out_n = (int)output_opt.numel();
+                    for (int i = 0; i < std::min(out_n, 1000); i++) {
+                        float v = output_opt.data()[i];
+                        if (std::isnan(v)) nan_count++;
+                        else if (std::isinf(v)) inf_count++;
+                    }
+                    if (nan_count > 0 || inf_count > 0) {
+                        LOG_WARN("diffusion model output step=%d: nan=%d inf=%d in first %d elements",
+                                 step, nan_count, inf_count, std::min(out_n, 1000));
+                    }
+                }
+
                 step_cache.after_condition(&condition, noised_input, output_opt);
                 return output_opt;
             };
@@ -2638,6 +2679,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->offload_params_to_cpu   = false;
     sd_ctx_params->max_vram                = 0.f;
     sd_ctx_params->stream_layers           = false;
+    sd_ctx_params->multi_gpu               = false;
     sd_ctx_params->enable_mmap             = false;
     sd_ctx_params->keep_clip_on_cpu        = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
@@ -2687,6 +2729,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "offload_params_to_cpu: %s\n"
              "max_vram: %.3f\n"
              "stream_layers: %s\n"
+             "multi_gpu: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
              "keep_clip_on_cpu: %s\n"
@@ -2727,6 +2770,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->offload_params_to_cpu),
              sd_ctx_params->max_vram,
              BOOL_STR(sd_ctx_params->stream_layers),
+             BOOL_STR(sd_ctx_params->multi_gpu),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
              BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
@@ -4929,6 +4973,26 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
               (int)video_latent.shape()[2],
               (int)video_latent.shape()[3]);
     // auto z = sd::load_tensor_from_file_as_tensor<float>("ltx_vae_z.bin");
+    // Debug: check latent values before VAE decode
+    {
+        float min_val = std::numeric_limits<float>::max();
+        float max_val = std::numeric_limits<float>::lowest();
+        float sum = 0.f;
+        int n = (int)video_latent.numel();
+        for (int i = 0; i < n; i++) {
+            float v = video_latent.data()[i];
+            if (std::isnan(v) || std::isinf(v)) {
+                LOG_WARN("latent[%d] = %f (nan/inf!)", i, v);
+                if (i < 10) continue;
+                break;
+            }
+            min_val = std::min(min_val, v);
+            max_val = std::max(max_val, v);
+            sum += v;
+        }
+        LOG_INFO("video_latent stats: min=%.4f max=%.4f mean=%.4f n=%d",
+                 min_val, max_val, sum / n, n);
+    }
     int64_t t4            = ggml_time_ms();
     sd::Tensor<float> vid = sd_ctx->sd->decode_first_stage(video_latent, true);
     int64_t t5            = ggml_time_ms();
@@ -5307,6 +5371,27 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
     }
     LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
 
+    // Debug: check final latent values
+    {
+        float min_val = std::numeric_limits<float>::max();
+        float max_val = std::numeric_limits<float>::lowest();
+        float sum = 0.f;
+        int n = (int)final_latent.numel();
+        int nan_count = 0;
+        for (int i = 0; i < n; i++) {
+            float v = final_latent.data()[i];
+            if (std::isnan(v) || std::isinf(v)) {
+                nan_count++;
+                continue;
+            }
+            min_val = std::min(min_val, v);
+            max_val = std::max(max_val, v);
+            sum += v;
+        }
+        LOG_INFO("final_latent stats: min=%.4f max=%.4f mean=%.4f nan/inf=%d n=%d",
+                 min_val, max_val, sum / n, nan_count, n);
+    }
+
     if (latent_upscale_enabled) {
         int64_t upscale_start             = ggml_time_ms();
         sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,