diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 3ae5faba7..bc40f86d1 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -449,6 +449,10 @@ ArgOptions SDContextParams::get_options() { "--stream-layers", "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)", true, &stream_layers}, + {"", + "--multi-gpu", + "distribute model params across multiple GPUs for parallel computation (defaults to false)", + true, &multi_gpu}, {"", "--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae", @@ -733,6 +737,7 @@ std::string SDContextParams::to_string() const { << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" << " stream_layers: " << (stream_layers ? "true" : "false") << ",\n" + << " multi_gpu: " << (multi_gpu ? "true" : "false") << ",\n" << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? 
"true" : "false") << ",\n" @@ -815,6 +820,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f str_to_vae_format(vae_format), max_vram, stream_layers, + multi_gpu, backend.c_str(), params_backend.c_str(), }; diff --git a/examples/common/common.h b/examples/common/common.h index a90a33132..885a8af9a 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -146,6 +146,7 @@ struct SDContextParams { bool offload_params_to_cpu = false; float max_vram = 0.f; bool stream_layers = false; + bool multi_gpu = false; std::string backend; std::string params_backend; bool enable_mmap = false; diff --git a/ggml b/ggml index 0ce7ad348..37050baa1 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421 +Subproject commit 37050baa178eeb5f518871a013f12f35db35d6e0 diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 2175f895a..9560fe675 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -224,6 +224,7 @@ typedef struct { enum sd_vae_format_t vae_format; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB) bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) + bool multi_gpu; // Distribute model params across multiple GPUs for parallel computation const char* backend; const char* params_backend; } sd_ctx_params_t; diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index 70703d244..9f8c61ada 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1699,6 +1699,13 @@ struct GGMLRunner { ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; + // Multi-GPU support: additional GPU backends for parallel parameter distribution + std::vector extra_gpu_backends; + ggml_backend_sched_t multi_gpu_sched = nullptr; + bool multi_gpu_enabled = false; + std::vector 
multi_gpu_params_buffers; + std::vector multi_gpu_params_ctxs; + ggml_context* params_ctx = nullptr; ggml_backend_buffer_t params_buffer = nullptr; @@ -2147,6 +2154,9 @@ struct GGMLRunner { graph_tensor_set.insert(ggml_graph_node(gf, i)); } + int copied_count = 0; + int skipped_no_buf = 0; + int skipped_not_in_graph = 0; for (auto& kv : backend_tensor_data_map) { auto tensor = kv.first; auto data = kv.second; @@ -2155,9 +2165,11 @@ struct GGMLRunner { } const char* name = ggml_get_name(tensor); if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) { + skipped_not_in_graph++; continue; } if (tensor->buffer == nullptr) { + skipped_no_buf++; LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s", get_desc().c_str(), name != nullptr ? name : "", @@ -2182,8 +2194,13 @@ struct GGMLRunner { } ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor)); + copied_count++; } + LOG_INFO("%s copy_data_to_backend_tensor: copied=%d skipped_no_buf=%d skipped_not_in_graph=%d total_in_map=%zu graph_leafs=%d graph_nodes=%d", + get_desc().c_str(), copied_count, skipped_no_buf, skipped_not_in_graph, + backend_tensor_data_map.size(), n_leafs, n_nodes); + if (clear_after_copy) { backend_tensor_data_map.clear(); } @@ -2434,6 +2451,21 @@ struct GGMLRunner { }; GraphWeightDoneGuard graph_weight_done_guard(this, ¶ms_to_prepare); + // Multi-GPU execution path using ggml_backend_sched + if (multi_gpu_enabled && !extra_gpu_backends.empty() && !sd_backend_is_cpu(runtime_backend)) { + auto result = execute_graph_multi_gpu(gf, + n_threads, + free_compute_buffer, + free_compute_params, + preserve_backend_tensor_data_map, + no_return, + cache_keep_names, + params_to_prepare); + // Dismiss the guard since multi-GPU path handles cleanup internally + graph_weight_done_guard.dismiss(); + return result; + } + if (!alloc_compute_buffer(gf)) { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); return std::nullopt; @@ -2538,6 
+2570,181 @@ struct GGMLRunner { return output; } + template + std::optional> execute_graph_multi_gpu( + ggml_cgraph* gf, + int n_threads, + bool free_compute_buffer, + bool free_compute_params, + bool preserve_backend_tensor_data_map, + bool no_return, + const std::unordered_set* cache_keep_names, + const std::vector& params_to_prepare) { + // Collect all GPU backends + std::vector all_backends; + all_backends.push_back(runtime_backend); + for (auto& b : extra_gpu_backends) { + all_backends.push_back(b); + } + // Add CPU backend for fallback + ggml_backend_t cpu_backend = sd_backend_cpu_init(); + all_backends.push_back(cpu_backend); + + int n_backends = (int)all_backends.size(); + + // Get buffer types for each backend + std::vector bufts(n_backends); + for (int i = 0; i < n_backends; i++) { + bufts[i] = ggml_backend_get_default_buffer_type(all_backends[i]); + } + + // Determine graph size from the actual graph + int n_nodes = ggml_graph_n_nodes(gf); + int n_leafs = sd::ggml_graph_cut::leaf_count(gf); + size_t graph_size = (size_t)n_nodes + (size_t)n_leafs + 256; // extra margin + + // Create scheduler only once, reuse across iterations + if (multi_gpu_sched == nullptr) { + multi_gpu_sched = ggml_backend_sched_new(all_backends.data(), + bufts.data(), + n_backends, + graph_size, + false, // sequential - split graph into segments per GPU + true); // op_offload + + if (multi_gpu_sched == nullptr) { + LOG_ERROR("%s multi-GPU: failed to create backend scheduler", get_desc().c_str()); + return std::nullopt; + } + } + + // Reset the scheduler for this iteration + ggml_backend_sched_reset(multi_gpu_sched); + + // After reset, manually assign backends for leaf tensors without buffers + // (graph inputs like latent, timestep, positional encoding, etc.) 
+ { + int n_leafs_dbg = sd::ggml_graph_cut::leaf_count(gf); + int no_buffer_count = 0; + for (int i = 0; i < n_leafs_dbg; i++) { + ggml_tensor* t = sd::ggml_graph_cut::leaf_tensor(gf, i); + if (t == nullptr) continue; + ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer; + if (buf == nullptr) { + no_buffer_count++; + ggml_backend_sched_set_tensor_backend(multi_gpu_sched, t, runtime_backend); + } + } + LOG_INFO("%s multi-GPU: assigned %d leaf tensors without buffer to primary GPU", + get_desc().c_str(), no_buffer_count); + } + + if (!ggml_backend_sched_alloc_graph(multi_gpu_sched, gf)) { + LOG_ERROR("%s multi-GPU: failed to allocate graph", get_desc().c_str()); + return std::nullopt; + } + + // Debug: verify parameter tensors have correct backend assignment + { + int param_on_gpu0 = 0, param_on_gpu1 = 0, param_on_gpu2 = 0, param_on_cpu = 0, param_no_buf = 0; + int leaf_count = sd::ggml_graph_cut::leaf_count(gf); + for (int i = 0; i < leaf_count; i++) { + ggml_tensor* t = sd::ggml_graph_cut::leaf_tensor(gf, i); + if (t == nullptr) continue; + ggml_backend_t t_backend = ggml_backend_sched_get_tensor_backend(multi_gpu_sched, t); + if (t_backend == nullptr) { + param_no_buf++; + continue; + } + if (t_backend == all_backends[0]) param_on_gpu0++; + else if (n_backends > 1 && t_backend == all_backends[1]) param_on_gpu1++; + else if (n_backends > 2 && t_backend == all_backends[2]) param_on_gpu2++; + else if (n_backends > 3 && t_backend == all_backends[3]) param_on_cpu++; + else param_no_buf++; + } + LOG_INFO("%s multi-GPU: leaf backend dist: gpu0=%d gpu1=%d gpu2=%d cpu=%d null=%d", + get_desc().c_str(), param_on_gpu0, param_on_gpu1, param_on_gpu2, param_on_cpu, param_no_buf); + } + + // Debug: check a few leaf parameter tensors for valid data + { + int checked = 0; + int leaf_count = sd::ggml_graph_cut::leaf_count(gf); + for (int i = 0; i < leaf_count && checked < 5; i++) { + ggml_tensor* t = sd::ggml_graph_cut::leaf_tensor(gf, i); + if (t == nullptr) 
continue; + ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer; + if (buf == nullptr || t->data == nullptr) continue; + // Only check weight tensors (skip input tensors like latent/timestep) + if (t->op != GGML_OP_NONE) continue; + // Read first few values + float vals[4] = {0}; + size_t nb = ggml_nbytes(t); + size_t to_read = std::min(nb, sizeof(vals)); + ggml_backend_tensor_get(t, vals, 0, to_read); + bool has_nan = false; + for (int k = 0; k < 4 && k < (int)(to_read/sizeof(float)); k++) { + if (std::isnan(vals[k]) || std::isinf(vals[k])) has_nan = true; + } + LOG_INFO("%s multi-GPU: leaf[%d] '%s' type=%s ne=[%lld,%lld,%lld] vals=[%.4f,%.4f,%.4f,%.4f] nan=%d buf=%p", + get_desc().c_str(), i, t->name, ggml_type_name(t->type), + (long long)t->ne[0], (long long)t->ne[1], (long long)t->ne[2], + vals[0], vals[1], vals[2], vals[3], has_nan, (void*)buf); + checked++; + } + } + + // Copy data to backend tensors AFTER alloc_graph so tensors have buffers + // For multi-GPU path, always preserve the data map since the scheduler is reused + // across iterations and leaf tensors need their data copied each time + copy_data_to_backend_tensor(gf, false); + + // Set n_threads for CPU backend + sd_backend_cpu_set_n_threads(cpu_backend, n_threads); + + LOG_INFO("%s multi-GPU: executing graph with %d splits across %d backends", + get_desc().c_str(), + ggml_backend_sched_get_n_splits(multi_gpu_sched), + n_backends); + + // Execute the graph + ggml_status status = ggml_backend_sched_graph_compute(multi_gpu_sched, gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s multi-GPU: compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); + return std::nullopt; + } + + ggml_backend_sched_synchronize(multi_gpu_sched); + + LOG_INFO("%s multi-GPU: graph compute completed with %d splits", + get_desc().c_str(), + ggml_backend_sched_get_n_splits(multi_gpu_sched)); + + // Read output + if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) { + 
return std::nullopt; + } + auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); + std::optional> output; + if (!no_return) { + output = read_graph_tensor(result, "output"); + if (!output.has_value()) { + return std::nullopt; + } + } else { + output = sd::Tensor(); + } + + if (!free_compute_params) { + for (ggml_tensor* param : params_to_prepare) { + if (param == nullptr) continue; + kept_compute_param_tensor_set.insert(param); + } + } + + return output; + } + template std::optional> compute_graph_cut_segments(ggml_cgraph* gf, const GraphCutPlan& plan, @@ -2645,6 +2852,16 @@ struct GGMLRunner { free_params_ctx(); free_compute_ctx(); free_cache_ctx_and_buffer(); + if (multi_gpu_sched != nullptr) { + ggml_backend_sched_free(multi_gpu_sched); + multi_gpu_sched = nullptr; + } + for (auto& backend : extra_gpu_backends) { + if (backend != nullptr) { + ggml_backend_free(backend); + } + } + extra_gpu_backends.clear(); } virtual GGMLRunnerContext get_context() { @@ -2696,6 +2913,12 @@ struct GGMLRunner { LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str()); return true; } + + // Multi-GPU: distribute params across multiple GPUs + if (multi_gpu_enabled && !extra_gpu_backends.empty() && !sd_backend_is_cpu(params_backend)) { + return alloc_params_buffer_multi_gpu(num_tensors); + } + // Pinned host buffer when CPU-offloaded for DMA-direct H2D. 
ggml_backend_buffer_type_t params_buft = nullptr; if (params_backend != runtime_backend) { @@ -2725,12 +2948,149 @@ struct GGMLRunner { return true; } + // Extract block index from tensor name for round-robin GPU assignment + int extract_block_index(const std::string& name) const { + // Look for patterns like "transformer_blocks.N" or "single_blocks.N" + static const std::vector block_prefixes = { + "transformer_blocks.", + "single_blocks.", + "double_blocks.", + }; + for (const auto& prefix : block_prefixes) { + size_t pos = name.find(prefix); + if (pos != std::string::npos) { + size_t start = pos + prefix.size(); + size_t end = start; + while (end < name.size() && std::isdigit(name[end])) { + end++; + } + if (end > start) { + return std::stoi(name.substr(start, end - start)); + } + } + } + return -1; // No block index found + } + + bool alloc_params_buffer_multi_gpu(size_t num_tensors) { + // Collect all GPU backends (primary + extra) + std::vector all_gpus; + all_gpus.push_back(runtime_backend); + for (auto& b : extra_gpu_backends) { + all_gpus.push_back(b); + } + int n_gpus = (int)all_gpus.size(); + + LOG_INFO("%s multi-GPU: distributing params across %d GPUs", get_desc().c_str(), n_gpus); + + // Group tensors by GPU assignment using round-robin based on block index + std::vector> gpu_tensors(n_gpus); + std::vector gpu_ctxs(n_gpus, nullptr); + + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + int block_idx = extract_block_index(t->name); + int gpu_idx; + if (block_idx >= 0) { + gpu_idx = block_idx % n_gpus; + } else { + // Non-block tensors go to GPU 0 + gpu_idx = 0; + } + gpu_tensors[gpu_idx].push_back(t); + } + + // Allocate buffers on each GPU + for (int g = 0; g < n_gpus; g++) { + if (gpu_tensors[g].empty()) continue; + + // Create a context for this GPU's tensors + ggml_init_params init_params; + init_params.mem_size = gpu_tensors[g].size() * ggml_tensor_overhead(); + 
init_params.mem_buffer = nullptr; + init_params.no_alloc = true; + gpu_ctxs[g] = ggml_init(init_params); + GGML_ASSERT(gpu_ctxs[g] != nullptr); + + // Duplicate tensor metadata into this GPU's context + for (ggml_tensor* src : gpu_tensors[g]) { + ggml_tensor* dst = ggml_dup_tensor(gpu_ctxs[g], src); + ggml_set_name(dst, src->name); + // Swap buffer/data pointers so the original params_ctx tensor + // points to the new GPU buffer after allocation + std::swap(src->buffer, dst->buffer); + std::swap(src->data, dst->data); + std::swap(src->extra, dst->extra); + } + + // Allocate on this GPU + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(all_gpus[g]); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(gpu_ctxs[g], buft); + if (buf == nullptr) { + LOG_ERROR("%s multi-GPU: alloc params buffer failed on GPU %d", get_desc().c_str(), g); + // Cleanup + for (int i = 0; i <= g; i++) { + if (gpu_ctxs[i] != nullptr) ggml_free(gpu_ctxs[i]); + } + return false; + } + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + // Copy buffer/data back to original params_ctx tensors + for (size_t i = 0; i < gpu_tensors[g].size(); i++) { + ggml_tensor* src = gpu_tensors[g][i]; + ggml_tensor* dst = ggml_get_tensor(gpu_ctxs[g], src->name); + src->buffer = dst->buffer; + src->data = dst->data; + src->extra = dst->extra; + } + + size_t buf_size = ggml_backend_buffer_get_size(buf); + LOG_INFO("%s multi-GPU: GPU %d params buffer = %.2f MB (%zu tensors)", + get_desc().c_str(), g, buf_size / (1024.f * 1024.f), gpu_tensors[g].size()); + + // Store the buffer and context for later cleanup + // (We'll use multi_gpu_params_buffers to track these) + multi_gpu_params_buffers.push_back(buf); + multi_gpu_params_ctxs.push_back(gpu_ctxs[g]); + } + + // Load data from model files into the GPU buffers + auto manager = weight_manager.lock(); + if (manager != nullptr) { + std::vector all_tensors; + for (ggml_tensor* t = 
ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + all_tensors.push_back(t); + } + if (!manager->prepare_params(all_tensors)) { + LOG_ERROR("%s multi-GPU: prepare params failed", get_desc().c_str()); + return false; + } + } + + rebuild_params_tensor_set(); + LOG_INFO("%s multi-GPU: params distributed across %d GPUs successfully", + get_desc().c_str(), n_gpus); + return true; + } + protected: void free_params_buffer() { if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; } + for (auto& buf : multi_gpu_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_gpu_params_buffers.clear(); + for (auto& ctx : multi_gpu_params_ctxs) { + if (ctx != nullptr) { + ggml_free(ctx); + } + } + multi_gpu_params_ctxs.clear(); observed_max_effective_budget_ = 0; } @@ -2903,6 +3263,20 @@ struct GGMLRunner { stream_layers_enabled = enabled; } + void set_multi_gpu_enabled(bool enabled) { + multi_gpu_enabled = enabled; + } + + void add_extra_gpu_backend(ggml_backend_t backend) { + if (backend != nullptr) { + extra_gpu_backends.push_back(backend); + } + } + + bool has_extra_gpu_backends() const { + return !extra_gpu_backends.empty(); + } + ggml_backend_t get_runtime_backend() { return runtime_backend; } diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp index 455dc4b2e..f128ca3d3 100644 --- a/src/model/diffusion/ltxv.hpp +++ b/src/model/diffusion/ltxv.hpp @@ -918,6 +918,10 @@ namespace LTXV { } auto regs = ggml_reshape_3d(ctx->ggml_ctx, params["learnable_registers"], hidden_size, num_learnable_registers, 1); + // Cast regs to match hidden_states type for concat compatibility + if (regs->type != hidden_states->type) { + regs = ggml_cast(ctx->ggml_ctx, regs, hidden_states->type); + } auto temp = ggml_new_tensor_3d(ctx->ggml_ctx, regs->type, regs->ne[0], regs->ne[1], hidden_states->ne[2]); regs = ggml_repeat(ctx->ggml_ctx, regs, temp); diff --git 
a/src/model/vae/ltx_vae.hpp b/src/model/vae/ltx_vae.hpp index 59e38c32d..8caf501be 100644 --- a/src/model/vae/ltx_vae.hpp +++ b/src/model/vae/ltx_vae.hpp @@ -1068,7 +1068,9 @@ namespace LTXVAE { timestep_conditioning(timestep_conditioning), patch_size(patch_size), decode_only(decode_only) { + LOG_INFO("VideoVAE: version=%d, decode_only=%d, prefix=%s", version, decode_only, prefix.c_str()); if (!decode_only) { + LOG_INFO("VideoVAE: creating Encoder with version=%d", version); blocks["encoder"] = std::make_shared(version, tensor_storage_map, prefix, diff --git a/src/model_io/safetensors_io.cpp b/src/model_io/safetensors_io.cpp index 39131dbd8..fa73aaafb 100644 --- a/src/model_io/safetensors_io.cpp +++ b/src/model_io/safetensors_io.cpp @@ -72,9 +72,9 @@ static ggml_type safetensors_dtype_to_ggml_type(const std::string& dtype) { } else if (dtype == "F64") { ttype = GGML_TYPE_F32; } else if (dtype == "F8_E4M3") { - ttype = GGML_TYPE_F16; + ttype = GGML_TYPE_BF16; } else if (dtype == "F8_E5M2") { - ttype = GGML_TYPE_F16; + ttype = GGML_TYPE_BF16; } else if (dtype == "I32") { ttype = GGML_TYPE_I32; } else if (dtype == "I64") { diff --git a/src/model_loader.cpp b/src/model_loader.cpp index 8d37d39a2..d2341cdf6 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include "core/util.h" +#include "json.hpp" #include "model_io/gguf_io.h" #include "model_io/safetensors_io.h" #include "model_io/torch_legacy_io.h" @@ -120,10 +122,99 @@ uint16_t f8_e4m3_to_f16(uint8_t f8) { return ggml_fp32_to_fp16(*reinterpret_cast(&result)); } +uint16_t f8_e4m3_to_bf16(uint8_t f8) { + // Convert FP8 E4M3 to BF16 via FP32 intermediate + const uint32_t exponent_bias = 7; + if (f8 == 0xff) { + // Negative NaN in bf16: sign=1, exponent=0xFF, mantissa=0x40 + return 0xFFC0; + } else if (f8 == 0x7f) { + // Positive NaN in bf16: sign=0, exponent=0xFF, mantissa=0x40 + return 0x7FC0; + } + + 
// FP8 E5M2 shares sign/exponent layout with IEEE half precision, so the byte is
// simply the upper half of the equivalent FP16 bit pattern.
uint16_t f8_e5m2_to_f16(uint8_t fp8) {
    return static_cast<uint16_t>(static_cast<uint16_t>(fp8) << 8);
}

// Converts one FP8 E5M2 value (1 sign, 5 exponent bits with bias 15, 2 mantissa
// bits) to its exact BF16 bit pattern.
//
// The FP32 bit pattern is assembled directly and its upper 16 bits are returned;
// every E5M2 value fits in BF16's 7 mantissa bits, so no rounding happens.
// Infinities map to infinities, NaNs to a quiet NaN with the same sign, and
// subnormals are renormalised.
uint16_t f8_e5m2_to_bf16(uint8_t fp8) {
    const uint32_t f32_bias_delta = 0x7f - 15;  // FP32 bias minus E5M2 bias

    uint32_t exponent = (fp8 & 0x7C) >> 2;
    uint32_t mantissa = fp8 & 0x03;
    uint32_t f32_bits = static_cast<uint32_t>(fp8 & 0x80) << 24;  // sign -> bit 31

    if (exponent == 0x1F) {
        // All-ones exponent: infinity when the mantissa is zero, NaN otherwise.
        f32_bits |= (mantissa == 0) ? 0x7f800000u : 0x7fc00000u;
    } else if (exponent != 0) {
        f32_bits |= (exponent + f32_bias_delta) << 23;
        f32_bits |= mantissa << 21;
    } else if (mantissa != 0) {
        // Subnormal: value = mantissa * 2^-16. Shift until the leading one sits in
        // bit 1, where it represents 2^-15 (biased FP32 exponent 112).
        exponent = f32_bias_delta;
        if ((mantissa & 0x02) == 0) {
            mantissa <<= 1;
            exponent -= 1;
        }
        f32_bits |= exponent << 23;
        // The leading one is implicit in FP32; only the bit below it is fraction.
        f32_bits |= (mantissa & 0x01) << 22;
    }
    // exponent == 0 && mantissa == 0: signed zero, nothing more to set.

    return static_cast<uint16_t>(f32_bits >> 16);
}
0; i--) { @@ -138,6 +229,20 @@ void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) { } } +void f8_e4m3_to_bf16_vec(uint8_t* src, uint16_t* dst, int64_t n) { + // support inplace op + for (int64_t i = n - 1; i >= 0; i--) { + dst[i] = f8_e4m3_to_bf16(src[i]); + } +} + +void f8_e5m2_to_bf16_vec(uint8_t* src, uint16_t* dst, int64_t n) { + // support inplace op + for (int64_t i = n - 1; i >= 0; i--) { + dst[i] = f8_e5m2_to_bf16(src[i]); + } +} + void f64_to_f32_vec(double* src, float* dst, int64_t n) { // support inplace op for (int64_t i = 0; i < n; i++) { @@ -406,22 +511,156 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); std::string clip_g_path = path_join(file_path, "text_encoder_2/model.safetensors"); - if (!init_from_safetensors_file(unet_path, "unet.")) { + // Check for standard diffusers directory structure first + if (file_exists(unet_path) || file_exists(vae_path) || file_exists(clip_path) || file_exists(clip_g_path)) { + if (!init_from_safetensors_file(unet_path, "unet.")) { + return false; + } + + if (!init_from_safetensors_file(vae_path, "vae.")) { + LOG_WARN("Couldn't find working VAE in %s", file_path.c_str()); + // return false; + } + if (!init_from_safetensors_file(clip_path, "te.")) { + LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str()); + // return false; + } + if (!init_from_safetensors_file(clip_g_path, "te.1.")) { + LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str()); + } + return true; + } + + // HuggingFace sharded safetensors format: load model-*.safetensors files + // Look for model.safetensors.index.json or model-*.safetensors files + std::string index_path = path_join(file_path, "model.safetensors.index.json"); + std::set shard_files; + + if (file_exists(index_path)) { + // Parse the index JSON to find which shard files exist + std::ifstream ifs(index_path); + if 
(ifs.is_open()) { + try { + nlohmann::json index_json = nlohmann::json::parse(ifs); + if (index_json.contains("weight_map")) { + for (auto& [key, val] : index_json["weight_map"].items()) { + if (val.is_string()) { + shard_files.insert(val.get()); + } + } + } + } catch (...) { + LOG_WARN("Failed to parse model.safetensors.index.json in %s", file_path.c_str()); + } + } + } + + // If no index found, glob for model-*.safetensors files + if (shard_files.empty()) { + // Try single model.safetensors file + std::string single_model_path = path_join(file_path, "model.safetensors"); + if (file_exists(single_model_path)) { + return init_from_safetensors_file(single_model_path, prefix); + } + + // Try to find model-*.safetensors shard files + DIR* dir = opendir(file_path.c_str()); + if (dir) { + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + std::string name = entry->d_name; + if (name.find("model-") == 0 && name.find(".safetensors") != std::string::npos) { + shard_files.insert(name); + } + } + closedir(dir); + } + } + + if (shard_files.empty()) { + LOG_WARN("No safetensors model files found in %s", file_path.c_str()); return false; } - if (!init_from_safetensors_file(vae_path, "vae.")) { - LOG_WARN("Couldn't find working VAE in %s", file_path.c_str()); - // return false; + LOG_INFO("Loading %zu shard files from %s", shard_files.size(), file_path.c_str()); + bool any_loaded = false; + for (const auto& shard_name : shard_files) { + std::string shard_path = path_join(file_path, shard_name); + if (init_from_safetensors_file(shard_path, prefix)) { + any_loaded = true; + } else { + LOG_WARN("Failed to load shard %s", shard_path.c_str()); + } } - if (!init_from_safetensors_file(clip_path, "te.")) { - LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str()); - // return false; + + // HuggingFace gemma models have a "language_model." prefix that needs to be stripped + // e.g. 
"language_model.model.embed_tokens.weight" -> "model.embed_tokens.weight" + // This is needed because the C++ code expects "text_encoders.llm.model.embed_tokens.weight" + // but the safetensors files contain "language_model.model.embed_tokens.weight" + bool has_language_model_prefix = false; + for (auto& [name, ts] : tensor_storage_map) { + if (starts_with(name, prefix + "language_model.")) { + has_language_model_prefix = true; + break; + } } - if (!init_from_safetensors_file(clip_g_path, "te.1.")) { - LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str()); + if (has_language_model_prefix) { + LOG_INFO("Stripping 'language_model.' prefix from tensors in %s", file_path.c_str()); + String2TensorStorage new_map; + for (auto& [name, tensor_storage] : tensor_storage_map) { + if (starts_with(name, prefix + "language_model.")) { + std::string new_name = prefix + name.substr(prefix.size() + strlen("language_model.")); + tensor_storage.name = new_name; + new_map[new_name] = std::move(tensor_storage); + } else { + new_map[name] = std::move(tensor_storage); + } + } + tensor_storage_map.swap(new_map); } - return true; + + // HuggingFace safetensors LLM naming differs from GGUF naming. + // The C++ code uses GGUF-style names internally (e.g., post_attention_norm, post_ffw_norm), + // but safetensors files use HuggingFace names (e.g., post_feedforward_layernorm, pre_feedforward_layernorm). + // Apply reverse mapping: safetensors HF name -> C++ internal name. 
+ static const std::vector> hf_llm_name_map = { + {"post_feedforward_layernorm.", "post_ffw_norm."}, + {"pre_feedforward_layernorm.", "post_attention_norm."}, + // Standard mappings that are already correct in most safetensors files: + // {"input_layernorm.", "input_layernorm."} -- no change needed + // {"post_attention_layernorm.", "post_attention_layernorm."} -- already correct for pre_ffw_norm + }; + + bool needs_rename = false; + for (auto& [name, ts] : tensor_storage_map) { + if (!starts_with(name, prefix)) continue; + for (const auto& [hf_name, cpp_name] : hf_llm_name_map) { + if (name.find(hf_name) != std::string::npos) { + needs_rename = true; + break; + } + } + if (needs_rename) break; + } + + if (needs_rename) { + LOG_INFO("Applying HuggingFace LLM name mapping for %s", file_path.c_str()); + String2TensorStorage new_map; + for (auto& [name, tensor_storage] : tensor_storage_map) { + std::string new_name = name; + for (const auto& [hf_name, cpp_name] : hf_llm_name_map) { + size_t pos = new_name.find(hf_name); + if (pos != std::string::npos) { + new_name.replace(pos, hf_name.size(), cpp_name); + } + } + tensor_storage.name = new_name; + new_map[new_name] = std::move(tensor_storage); + } + tensor_storage_map.swap(new_map); + } + + return any_loaded; } SDVersion ModelLoader::get_sd_version() { @@ -1130,9 +1369,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, t0 = ggml_time_ms(); if (tensor_storage.is_f8_e4m3) { - f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements()); + if (tensor_storage.type == GGML_TYPE_BF16) { + f8_e4m3_to_bf16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements()); + } else { + f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements()); + } } else if (tensor_storage.is_f8_e5m2) { - f8_e5m2_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements()); + if (tensor_storage.type == GGML_TYPE_BF16) { + 
// Returns the block number that follows "transformer_blocks.", "single_blocks."
// or "double_blocks." in a tensor name, or -1 when the name carries none.
// Prefixes are tried in that order; a prefix that is present but not followed by
// digits is skipped. Digit runs longer than 9 characters cannot be a block index
// and would not fit in an int, so they yield -1 instead of throwing.
static int extract_block_index(const std::string& name) {
    static const std::string block_prefixes[] = {
        "transformer_blocks.",
        "single_blocks.",
        "double_blocks.",
    };
    for (const std::string& prefix : block_prefixes) {
        const size_t pos = name.find(prefix);
        if (pos == std::string::npos) {
            continue;
        }
        const size_t start = pos + prefix.size();
        size_t end         = start;
        int value          = 0;
        // Plain range comparison: locale-free and safe for negative char values.
        while (end < name.size() && name[end] >= '0' && name[end] <= '9') {
            if (end - start >= 9) {
                return -1;
            }
            value = value * 10 + (name[end] - '0');
            ++end;
        }
        if (end > start) {
            return value;
        }
    }
    return -1;
}
(!all_gpus.empty()) { + int block_idx = extract_block_index(name); + int gpu_idx; + if (block_idx >= 0) { + gpu_idx = block_idx % (int)all_gpus.size(); + } else { + gpu_idx = 0; + } + state->compute_backend = all_gpus[gpu_idx]; + } else { + state->compute_backend = compute_backend; + } + state->params_backend = params_backend; new_states.push_back(std::move(state)); } diff --git a/src/model_manager.h b/src/model_manager.h index b3da8a36a..97177ad3b 100644 --- a/src/model_manager.h +++ b/src/model_manager.h @@ -69,6 +69,8 @@ class ModelManager : public RunnerWeightManager { uint64_t current_lora_epoch_ = 0; int n_threads_ = 0; bool enable_mmap_ = false; + bool multi_gpu_enabled_ = false; + std::vector extra_gpu_backends_; void finish_compute_backend_usage(const std::vector& states); void release_all(); @@ -110,6 +112,12 @@ class ModelManager : public RunnerWeightManager { model_loader_.set_n_threads(n_threads); } void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; } + void set_multi_gpu_enabled(bool enabled) { multi_gpu_enabled_ = enabled; } + void add_extra_gpu_backend(ggml_backend_t backend) { + if (backend != nullptr) { + extra_gpu_backends_.push_back(backend); + } + } void set_common_ignore_tensors(std::set ignore_tensors); void set_loras(std::vector loras, SDVersion version); diff --git a/src/runtime/denoiser.hpp b/src/runtime/denoiser.hpp index fed5911bc..520d7a855 100644 --- a/src/runtime/denoiser.hpp +++ b/src/runtime/denoiser.hpp @@ -950,6 +950,33 @@ static sd::Tensor sample_euler(denoise_cb_t model, return {}; } sd::Tensor denoised = std::move(denoised_opt.pred); + + // Debug: check denoised and x for NaN at each step + { + int nan_d = 0, nan_x = 0; + int n = std::min((int)denoised.numel(), 100); + for (int j = 0; j < n; j++) { + if (std::isnan(denoised.data()[j]) || std::isinf(denoised.data()[j])) nan_d++; + if (std::isnan(x.data()[j]) || std::isinf(x.data()[j])) nan_x++; + } + if (nan_d > 0 || nan_x > 0 || i == 0) { + float d_min = 
std::numeric_limits::max(), d_max = std::numeric_limits::lowest(); + float x_min = std::numeric_limits::max(), x_max = std::numeric_limits::lowest(); + for (int j = 0; j < n; j++) { + if (!std::isnan(denoised.data()[j]) && !std::isinf(denoised.data()[j])) { + d_min = std::min(d_min, denoised.data()[j]); + d_max = std::max(d_max, denoised.data()[j]); + } + if (!std::isnan(x.data()[j]) && !std::isinf(x.data()[j])) { + x_min = std::min(x_min, x.data()[j]); + x_max = std::max(x_max, x.data()[j]); + } + } + LOG_INFO("euler step %d/%d: sigma=%.4f denoised[nan=%d min=%.4f max=%.4f] x[nan=%d min=%.4f max=%.4f]", + i, steps, sigma, nan_d, d_min, d_max, nan_x, x_min, x_max); + } + } + sd::Tensor d = (x - denoised) / sigma; x += d * (sigmas[i + 1] - sigma); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 19f9e85ea..956454b85 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -193,6 +193,7 @@ class StableDiffusionGGML { bool enable_mmap = false; float max_vram = 0.f; bool stream_layers = false; + bool multi_gpu = false; std::string backend_spec; std::string params_backend_spec; @@ -327,6 +328,7 @@ class StableDiffusionGGML { enable_mmap = sd_ctx_params->enable_mmap; max_vram = sd_ctx_params->max_vram; stream_layers = sd_ctx_params->stream_layers; + multi_gpu = sd_ctx_params->multi_gpu; backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); if (stream_layers && max_vram == 0.f) { @@ -809,6 +811,28 @@ class StableDiffusionGGML { diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); diffusion_model->set_stream_layers_enabled(stream_layers); + if (multi_gpu) { + diffusion_model->set_multi_gpu_enabled(true); + // Initialize extra GPU backends (GPU 1, GPU 2, ...) 
+ int n_devices = ggml_backend_dev_count(); + for (int i = 1; i < n_devices; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend != nullptr) { + LOG_INFO("Multi-GPU: adding GPU %d (%s)", i, ggml_backend_name(backend)); + diffusion_model->add_extra_gpu_backend(backend); + model_manager->add_extra_gpu_backend(backend); + } + } + } + if (!diffusion_model->has_extra_gpu_backends()) { + LOG_WARN("--multi-gpu specified but no extra GPUs found; disabling"); + diffusion_model->set_multi_gpu_enabled(false); + } else { + model_manager->set_multi_gpu_enabled(true); + } + } if (!register_runner_params("Diffusion model", diffusion_model, SDBackendModule::DIFFUSION, @@ -870,6 +894,7 @@ class StableDiffusionGGML { auto create_vae = [&]() -> std::shared_ptr { if (sd_version_is_ltxav(version)) { + LOG_INFO("Creating LTXVideoVAE with vae_decode_only=%d", vae_decode_only); return std::make_shared(backend_for(SDBackendModule::VAE), params_backend_for(SDBackendModule::VAE), tensor_storage_map, @@ -2053,6 +2078,22 @@ class StableDiffusionGGML { return sd::Tensor(); } + // Debug: check for NaN in diffusion model output + { + int nan_count = 0; + int inf_count = 0; + int out_n = (int)output_opt.numel(); + for (int i = 0; i < std::min(out_n, 1000); i++) { + float v = output_opt.data()[i]; + if (std::isnan(v)) nan_count++; + else if (std::isinf(v)) inf_count++; + } + if (nan_count > 0 || inf_count > 0) { + LOG_WARN("diffusion model output step=%d: nan=%d inf=%d in first %d elements", + step, nan_count, inf_count, std::min(out_n, 1000)); + } + } + step_cache.after_condition(&condition, noised_input, output_opt); return output_opt; }; @@ -2638,6 +2679,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->max_vram = 0.f; sd_ctx_params->stream_layers = 
false; + sd_ctx_params->multi_gpu = false; sd_ctx_params->enable_mmap = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; @@ -2687,6 +2729,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "offload_params_to_cpu: %s\n" "max_vram: %.3f\n" "stream_layers: %s\n" + "multi_gpu: %s\n" "backend: %s\n" "params_backend: %s\n" "keep_clip_on_cpu: %s\n" @@ -2727,6 +2770,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->offload_params_to_cpu), sd_ctx_params->max_vram, BOOL_STR(sd_ctx_params->stream_layers), + BOOL_STR(sd_ctx_params->multi_gpu), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), @@ -4929,6 +4973,26 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx, (int)video_latent.shape()[2], (int)video_latent.shape()[3]); // auto z = sd::load_tensor_from_file_as_tensor("ltx_vae_z.bin"); + // Debug: check latent values before VAE decode + { + float min_val = std::numeric_limits::max(); + float max_val = std::numeric_limits::lowest(); + float sum = 0.f; + int n = (int)video_latent.numel(); + for (int i = 0; i < n; i++) { + float v = video_latent.data()[i]; + if (std::isnan(v) || std::isinf(v)) { + LOG_WARN("latent[%d] = %f (nan/inf!)", i, v); + if (i < 10) continue; + break; + } + min_val = std::min(min_val, v); + max_val = std::max(max_val, v); + sum += v; + } + LOG_INFO("video_latent stats: min=%.4f max=%.4f mean=%.4f n=%d", + min_val, max_val, sum / n, n); + } int64_t t4 = ggml_time_ms(); sd::Tensor vid = sd_ctx->sd->decode_first_stage(video_latent, true); int64_t t5 = ggml_time_ms(); @@ -5307,6 +5371,27 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, } LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + // Debug: check final latent values + { + float min_val = std::numeric_limits::max(); + float max_val = std::numeric_limits::lowest(); + float 
sum = 0.f; + int n = (int)final_latent.numel(); + int nan_count = 0; + for (int i = 0; i < n; i++) { + float v = final_latent.data()[i]; + if (std::isnan(v) || std::isinf(v)) { + nan_count++; + continue; + } + min_val = std::min(min_val, v); + max_val = std::max(max_val, v); + sum += v; + } + LOG_INFO("final_latent stats: min=%.4f max=%.4f mean=%.4f nan/inf=%d n=%d", + min_val, max_val, sum / n, nan_count, n); + } + if (latent_upscale_enabled) { int64_t upscale_start = ggml_time_ms(); sd::Tensor upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,