diff --git a/examples/common/common.cpp b/examples/common/common.cpp index e9b8bc85a..f992636d8 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -431,6 +431,18 @@ ArgOptions SDContextParams::get_options() { "--rpc-servers", "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052", &rpc_servers}, + {"", + "--multi-gpu-mode", + "how to split a too-large DiT across GPUs (auto-fit): " + "row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off " + "(default: row)", + &multi_gpu_mode}, + {"", + "--fit-compute-reserve", + "auto-fit: per-component compute-buffer reserve in MiB as a component " + "map, e.g. dit=2048,vae=1024,cond=512 (missing keys keep the built-in " + "defaults)", + &fit_compute_reserve}, {"", "--max-vram", "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value", @@ -447,6 +459,10 @@ ArgOptions SDContextParams::get_options() { "--chroma-t5-mask-pad", "t5 mask pad size of chroma", &chroma_t5_mask_pad}, + {"", + "--fit-target", + "auto-fit: MiB of free memory to leave on each GPU (default: 512)", + &auto_fit_target_mb}, }; options.bool_options = { @@ -518,6 +534,24 @@ ArgOptions SDContextParams::get_options() { "--chroma-enable-t5-mask", "enable t5 mask for chroma", true, &chroma_use_t5_mask}, + {"", + "--auto-fit", + "automatically pick DiT/VAE/Conditioner device placements based on " + "free GPU memory (default ON)", + true, &auto_fit}, + {"", + "--no-auto-fit", + "disable auto-fit and use the explicit --backend / --params-backend flags", + false, &auto_fit}, + {"", + "--no-multi-gpu", + "auto-fit: keep all components on a single GPU when they fit " + "(by default, multi-GPU placements are preferred to balance load)", + false, &auto_multi_gpu}, + {"", + 
"--fit-dry-run", + "auto-fit: print the computed plan and exit without loading models", + true, &auto_fit_dry_run}, }; auto on_type_arg = [&](int argc, const char** argv, int index) { @@ -616,6 +650,15 @@ ArgOptions SDContextParams::get_options() { "but it usually offers faster inference speed and, in some cases, lower memory usage. " "The at_runtime mode, on the other hand, is exactly the opposite.", on_lora_apply_mode_arg}, + {"", + "--list-devices", + "list available ggml backend devices (one per line, " + "namedescription) and exit", + [](int /*argc*/, const char** /*argv*/, int /*index*/) { + sd_list_devices(); + std::exit(0); + return 0; + }}, }; return options; @@ -760,9 +803,12 @@ std::string SDContextParams::to_string() const { << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" + << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" + << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" + << " fit_compute_reserve: \"" << fit_compute_reserve << "\",\n" + << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" + << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? 
"true" : "false") << ",\n" @@ -837,6 +883,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) { sd_ctx_params.stream_layers = stream_layers; sd_ctx_params.backend = effective_backend.c_str(); sd_ctx_params.params_backend = effective_params_backend.c_str(); + sd_ctx_params.auto_fit = auto_fit; + sd_ctx_params.auto_fit_target_mb = auto_fit_target_mb; + sd_ctx_params.auto_fit_dry_run = auto_fit_dry_run; + sd_ctx_params.auto_fit_compute_reserve = fit_compute_reserve.c_str(); + sd_ctx_params.auto_multi_gpu = auto_multi_gpu; + sd_ctx_params.multi_gpu_mode = multi_gpu_mode.c_str(); sd_ctx_params.rpc_servers = rpc_servers.c_str(); return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index 55fa5ac0a..1549ca9c1 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -152,9 +152,6 @@ struct SDContextParams { std::string effective_backend; std::string effective_params_backend; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; @@ -170,6 +167,23 @@ struct SDContextParams { bool qwen_image_zero_cond_t = false; + // Auto-fit defaults — placement is computed automatically based on free + // VRAM. Pass --no-auto-fit to disable and use explicit --backend specs. + bool auto_fit = true; + int auto_fit_target_mb = 512; + bool auto_fit_dry_run = false; + // Per-component compute-buffer reserve in MiB as a component map, + // e.g. "dit=2048,vae=1024,cond=512"; missing keys keep built-in defaults. + std::string fit_compute_reserve; + bool auto_multi_gpu = true; + std::string multi_gpu_mode = "row"; + + // Deprecated aliases for --backend =cpu (kept for + // backwards compatibility with the pre-auto-fit CLI). 
+ bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + prediction_t prediction = PREDICTION_COUNT; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 00f3e4e97..41f561e38 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -220,6 +220,35 @@ typedef struct { bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) const char* backend; const char* params_backend; + + // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory. + // When `auto_fit` is true (default), `backend` / `params_backend` are + // ignored and the placement is computed automatically (the plan is fed + // into the same backend assignment that `backend` / `params_backend` use). + // `auto_fit_target_mb` is the memory to leave free per GPU (default 512). + // `auto_fit_dry_run` prints the plan and aborts init before loading. + // `auto_fit_compute_reserve` tunes the per-component compute-buffer + // reserve in MiB as a component map, e.g. "dit=2048,vae=1024,cond=512" + // (same component-key style as `backend`); missing keys / NULL keep the + // built-in defaults. + bool auto_fit; + int auto_fit_target_mb; + bool auto_fit_dry_run; + const char* auto_fit_compute_reserve; + + // When more than one GPU device is present, prefer placing different + // components on different GPUs to balance load and fit larger total + // working sets. Set false to keep all components on a single GPU when + // they fit. Defaults to true. Each component still lives entirely on + // one device unless multi_gpu_mode splits it (see below). 
+ bool auto_multi_gpu; + + // How to split a single component (currently only the DiT) across GPUs + // when it doesn't fit on one but fits across several: "row" (matmul rows + // split via the backend's stock split buffer type, CUDA/SYCL), + // "layer" (whole blocks per GPU, routed by a scheduler, backend-generic), + // or "off" (never split a single component). NULL / empty => "row". + const char* multi_gpu_mode; const char* rpc_servers; } sd_ctx_params_t; @@ -485,6 +514,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +// List available ggml backend devices to stdout, in `namedescription` +// per-line format. The output is intended to be parsed by tools and used as +// device names in the --backend / --params-backend assignment specs. +SD_API void sd_list_devices(void); + // for C API, caller needs to call free_sd_images to free the memory after use // This helps avoid CRT problems on Windows when memory is allocated in the library but freed in the caller, which may use a different CRT. SD_API void free_sd_images(sd_image_t* result_images, int num_images); diff --git a/src/backend_fit.hpp b/src/backend_fit.hpp new file mode 100644 index 000000000..c8a4ff2a0 --- /dev/null +++ b/src/backend_fit.hpp @@ -0,0 +1,729 @@ +#ifndef __SD_BACKEND_FIT_HPP__ +#define __SD_BACKEND_FIT_HPP__ + +// Auto-fit algorithm for distributing DiT, VAE, and conditioner across the +// available GPU devices and system RAM. +// +// Each component is placed on a single device by default (params plus its +// compute buffer on the same device). When two or more GPUs are present and +// the multi-GPU mode allows it, a component may instead be split across +// GPUs: the DiT by matmul rows (Placement::GPU_TENSOR_SPLIT) or, like the +// Conditioner, by whole blocks (Placement::GPU_LAYER_SPLIT). See +// compute_plan() for the exact options and their scoring. +// +// Placement priority (score weight): DiT -> Conditioner -> VAE.
+// Overflow falls back to CPU (or GPU_OFFLOAD_PARAMS for components that +// support streaming params from RAM at compute time). + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-backend.h" + +#include "model_loader.h" +#include "core/util.h" + +namespace backend_fit { + +constexpr int64_t MiB = 1024 * 1024; +constexpr int DEVICE_ID_CPU = -1; + +// Returns the number of set bits in `value`. compute_plan() uses it to count +// how many GPUs a device bitmask selects. +static inline int bit_count(unsigned int value) { + int count = 0; + while (value != 0) { + count += static_cast(value & 1U); + value >>= 1; + } + return count; +} + +// The enumerator values double as indices into the per-component byte totals +// in estimate_components(), so keep them dense and starting at 0. +enum class ComponentKind { + DIT, + VAE, + CONDITIONER, +}; + +enum class Placement { + CPU, + GPU, + GPU_OFFLOAD_PARAMS, // params in RAM, compute on GPU + GPU_LAYER_SPLIT, // params split across multiple GPUs at block boundaries (sched-based) + GPU_TENSOR_SPLIT, // matmul weights row-split across GPUs (CUDA split-buft, single backend) +}; + +// One model component to place: parameter bytes, the compute-buffer reserve +// budgeted for it, and whether the GPU_OFFLOAD_PARAMS placement is offered. +struct Component { + ComponentKind kind; + std::string name; + int64_t params_bytes = 0; + int64_t compute_bytes = 0; + bool supports_offload = false; +}; + +struct Device { + int id = DEVICE_ID_CPU; + std::string name; + std::string description; + int64_t free_bytes = 0; + int64_t total_bytes = 0; + ggml_backend_dev_t dev = nullptr; // backing ggml device handle (GPU only) +}; + +struct Decision { + ComponentKind kind; + std::string name; + Placement placement = Placement::CPU; + int device_id = DEVICE_ID_CPU; + int64_t on_device_bytes = 0; + int64_t on_host_bytes = 0; + + // Populated when placement is GPU_LAYER_SPLIT or GPU_TENSOR_SPLIT. Contains + // the device IDs that share this component, largest total memory first + // (the first entry is the main device), and each device's share in bytes: + // param share plus compute reserve for a layer split, param share only for + // a row split. The order also defines block-range partitioning: the + // i-th device gets a contiguous range of blocks proportional to share[i].
+ std::vector split_device_ids; + std::vector split_share_bytes; +}; + +// Result of compute_plan(): one Decision per component, the projected peak +// bytes per GPU keyed by Device::id, and the total bytes kept in host RAM. +struct Plan { + std::vector decisions; + std::map device_bytes; + int64_t host_bytes = 0; + bool any_changes = false; +}; + +struct ComputeReserves { + int64_t dit_bytes = int64_t(2048) * MiB; + int64_t vae_bytes = int64_t(1024) * MiB; + int64_t conditioner_bytes = int64_t(512) * MiB; +}; + +enum class MultiGpuMode { + OFF, // never split a single component across GPUs + ROW, // CUDA-only: row-split matmul weights via cuda_split_buffer_type + LAYER, // generic: assign block-indexed tensors to per-block backends + sched +}; + +inline const char* multi_gpu_mode_str(MultiGpuMode m) { + switch (m) { + case MultiGpuMode::OFF: return "off"; + case MultiGpuMode::ROW: return "row"; + case MultiGpuMode::LAYER: return "layer"; + } + return "?"; +} + +// Parses "off" / "row" / "layer"; any other string (including empty) maps to +// ROW, the default mode. +inline MultiGpuMode str_to_multi_gpu_mode(const std::string& s) { + if (s == "off") return MultiGpuMode::OFF; + if (s == "row") return MultiGpuMode::ROW; + if (s == "layer") return MultiGpuMode::LAYER; + return MultiGpuMode::ROW; // default +} + +// --- Classification ------------------------------------------------------- + +// Assigns a tensor to a component by matching patterns in its name. Returns +// false and leaves `out` unmodified when no pattern matches. +inline bool classify_tensor(const std::string& name, ComponentKind& out) { + auto contains = [&](const char* s) { return name.find(s) != std::string::npos; }; + + if (contains("model.diffusion_model.") || contains("unet.")) { + out = ComponentKind::DIT; + return true; + } + + if (contains("first_stage_model.") || + name.rfind("vae.", 0) == 0 || + name.rfind("tae.", 0) == 0) { + out = ComponentKind::VAE; + return true; + } + + if (contains("text_encoders") || + contains("cond_stage_model") || + contains("te.text_model.") || + contains("conditioner") || + name.rfind("text_encoder.", 0) == 0 || + // Connector / text projection layers that run on the conditioner + // backend (e.g. LTX-2's text_embedding_projection: video/audio + // aggregate embeds + projection that map LLM hidden states into + // DiT-input space).
+ name.rfind("text_embedding_projection.", 0) == 0 || + contains(".aggregate_embed.")) { + out = ComponentKind::CONDITIONER; + return true; + } + + return false; +} + +// --- Memory estimation ---------------------------------------------------- + +// Sums the bytes of every classified, non-unused tensor per component (after +// applying the weight-type override / expected type, plus `alignment` per +// tensor) and pairs each total with its compute reserve. Always returns three +// entries in the order DiT, VAE, Conditioner; only the VAE has +// supports_offload == false. +inline std::vector estimate_components(ModelLoader& loader, + ggml_type override_wtype, + int64_t alignment, + const ComputeReserves& reserves) { + auto& storage = loader.get_tensor_storage_map(); + + int64_t bytes[3] = {0, 0, 0}; + + for (auto& [name, ts_const] : storage) { + TensorStorage ts = ts_const; + if (is_unused_tensor(ts.name)) { + continue; + } + + ComponentKind k; + if (!classify_tensor(ts.name, k)) { + continue; + } + + if (override_wtype != GGML_TYPE_COUNT && + loader.tensor_should_be_converted(ts, override_wtype)) { + ts.type = override_wtype; + } else if (ts.expected_type != GGML_TYPE_COUNT && ts.expected_type != ts.type) { + ts.type = ts.expected_type; + } + + bytes[int(k)] += ts.nbytes() + alignment; + } + + std::vector out; + out.reserve(3); + out.push_back({ComponentKind::DIT, "DiT", + bytes[int(ComponentKind::DIT)], reserves.dit_bytes, true}); + out.push_back({ComponentKind::VAE, "VAE", + bytes[int(ComponentKind::VAE)], reserves.vae_bytes, false}); + out.push_back({ComponentKind::CONDITIONER, "Conditioner", + bytes[int(ComponentKind::CONDITIONER)], reserves.conditioner_bytes, true}); + return out; +} + +// --- Device enumeration --------------------------------------------------- + +// Returns one Device per ggml device of type GPU, with ids numbered from 0 in +// enumeration order and the free/total memory reported by ggml. +inline std::vector enumerate_gpu_devices() { + // Make sure the dynamically-loaded backends are registered before we query + // the device list. This runs before SDBackendManager initializes any + // backend, so nothing else has triggered the (file-local) lazy load yet. + // Safe to call once here: the manager's own load-all-once guard short + // circuits afterwards because the device count is already non-zero.
+ ggml_backend_load_all(); + + std::vector out; + int gpu_idx = 0; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + continue; + } + Device d; + d.id = gpu_idx++; + d.dev = dev; + d.name = ggml_backend_dev_name(dev); + d.description = ggml_backend_dev_description(dev); + size_t free_b = 0, total_b = 0; + ggml_backend_dev_memory(dev, &free_b, &total_b); + d.free_bytes = int64_t(free_b); + d.total_bytes = int64_t(total_b); + out.push_back(d); + } + return out; +} + +// --- Core algorithm ------------------------------------------------------- + +// Per-GPU share for a layer-split component: free-VRAM-weighted partition +// of params, plus the full compute reserve on each participating device. +// (Compute reserve is per-device since each shard activates its own kernels.) +// Returns one entry per gpu_idxs slot (param share + compute_bytes). When no +// participating device has any headroom left, every entry is the unsplit +// params_bytes + compute_bytes so the result cannot be mistaken for a +// zero-byte footprint. +inline std::vector layer_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs, + int64_t margin_bytes = 0) { + // Every participating device hosts its param share PLUS a full compute + // reserve (the sched allocates a compute buffer per backend), so weight the + // param shares by what remains AFTER compute + margin. This guarantees + // share_k + compute <= free_k - margin whenever the total fits at all; + // weighting by raw free overcommits the smaller GPU and the planner then + // rejects layer-split as infeasible (observed: 22B DiT fell to CPU).
+ std::vector avail(gpu_idxs.size(), 0); + int64_t total = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + int64_t a = std::max(0, devices[gpu_idxs[k]].free_bytes - compute_bytes - margin_bytes); + avail[k] = a; + total += a; + } + std::vector out(gpu_idxs.size(), 0); + if (total <= 0) { + // No participating GPU has room left after compute + margin. All-zero + // shares would make gpu_peak() see a 0-byte footprint and the planner + // accept a split onto GPUs that cannot even hold the compute buffer, + // so charge the unsplit footprint to every device instead; it exceeds + // each device's budget whenever there are params to place. + out.assign(gpu_idxs.size(), params_bytes + compute_bytes); + return out; + } + for (size_t k = 0; k < gpu_idxs.size(); k++) { + double r = double(avail[k]) / double(total); + out[k] = int64_t(double(params_bytes) * r) + compute_bytes; + } + return out; +} + +// Per-GPU PARAM share for a row (tensor) split. Unlike layer-split, the graph +// runs on a single MAIN backend (the biggest GPU at gpu_idxs[main_pos]), so +// ONLY the main device also hosts the compute buffer. We therefore reserve +// `compute_bytes` of the main device's free VRAM before weighting, so the main +// doesn't get so many matmul rows that its compute buffer no longer fits. The +// caller adds compute_bytes back when computing the main device's peak. Returns +// param bytes per device (no compute folded in) — these become the split ratios. +// When no device has any free VRAM to weight by, every entry is the full +// params_bytes rather than 0. +inline std::vector row_split_shares(int64_t params_bytes, + int64_t compute_bytes, + const std::vector& devices, + const std::vector& gpu_idxs, + size_t main_pos) { + std::vector avail(gpu_idxs.size(), 0); + int64_t total = 0; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + int64_t a = std::max(0, devices[gpu_idxs[k]].free_bytes); + if (k == main_pos) { + a = std::max(0, a - compute_bytes); + } + avail[k] = a; + total += a; + } + std::vector out(gpu_idxs.size(), 0); + if (total <= 0) { + // Nothing to weight by: charge the full params to every device so the + // split is judged against its real size instead of appearing to use + // 0 bytes of VRAM. + out.assign(gpu_idxs.size(), params_bytes); + return out; + } + for (size_t k = 0; k < gpu_idxs.size(); k++) { + out[k] = int64_t(double(params_bytes) * double(avail[k]) / double(total)); + } + return out; +} + +// Peak per device = MAX of any single component's footprint on that device. +// Components free their params between phases (free_params_immediately; the +// split runners load lazily and free after each phase too), so they time-share +// VRAM rather than coexisting — hence MAX, not sum.
+inline int64_t gpu_peak(int gpu_idx, + const std::vector& pl, + const std::vector& dev, + const std::vector& components, + const std::vector& devices = {}) { + int64_t peak = 0; + for (size_t i = 0; i < components.size(); i++) { + int64_t footprint = 0; + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (dev[i] != gpu_idx) continue; + footprint = components[i].params_bytes + components[i].compute_bytes; + } else if (pl[i] == Placement::GPU_TENSOR_SPLIT) { + // Row-split: every GPU in the mask gets a free-VRAM-weighted + // share of params; the compute reserve lands on the BIGGEST + // GPU (which becomes the runner's main backend). + const int mask = dev[i]; + if (!(mask & (1 << gpu_idx))) continue; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + int slot = -1; + int biggest_slot = 0; + int64_t biggest_mem = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) slot = int(k); + if (devices[gpu_idxs[k]].total_bytes > biggest_mem) { + biggest_mem = devices[gpu_idxs[k]].total_bytes; + biggest_slot = int(k); + } + } + if (slot < 0) continue; + // Row-split: graph runs on the main (= biggest) GPU, which reserves + // its compute buffer; param rows are weighted by the remaining free. + auto shares = row_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs, size_t(biggest_slot)); + footprint = shares[slot]; + if (slot == biggest_slot) { + footprint += components[i].compute_bytes; + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // dev[i] holds the bitmask of participating GPU indices into the + // devices[] vector (encoded by the planner). Look up our slot. + const int mask = dev[i]; + std::vector gpu_idxs; + for (size_t k = 0; k < devices.size(); k++) { + if (mask & (1 << k)) gpu_idxs.push_back(k); + } + // Find this gpu's slot in gpu_idxs. 
+ int slot = -1; + for (size_t k = 0; k < gpu_idxs.size(); k++) { + if (int(gpu_idxs[k]) == gpu_idx) { slot = int(k); break; } + } + if (slot < 0) continue; + auto shares = layer_split_shares(components[i].params_bytes, + components[i].compute_bytes, + devices, gpu_idxs); + footprint = shares[slot]; + } + peak = std::max(peak, footprint); + } + return peak; +} + +inline Plan compute_plan(const std::vector& components, + const std::vector& devices, + int64_t margin_bytes, + bool allow_multi_gpu = true, + MultiGpuMode mode = MultiGpuMode::ROW) { + const size_t nC = components.size(); + const size_t nG = devices.size(); + if (!allow_multi_gpu) { + mode = MultiGpuMode::OFF; + } + + std::vector cap(nG, 0); + for (size_t g = 0; g < nG; g++) { + cap[g] = std::max(0, devices[g].free_bytes - margin_bytes); + } + + struct OptionSlot { + Placement placement; + int device_idx; + }; + + // ROW-split is DiT-exclusive. Keeping a single homogeneous row-split + // component (same tensor sizes every phase/generation) lets the driver + // reuse freed split-buffer chunks, which is what avoids the + // cuda_split_buffer fragmentation a ggml patch would otherwise be needed + // for. The DiT is also the per-step bottleneck, where row-split's small + // compute buffer matters most. + auto supports_tensor_split = [](ComponentKind k) { + return k == ComponentKind::DIT; + }; + // LAYER-split (regular per-device buffers routed by a scheduler) is + // general and fragmentation-free, so any block-structured component can + // use it. The Conditioner (e.g. Gemma) splits this way when it is too big + // for one GPU; its (larger) cross-backend compute buffer is acceptable + // because it runs once at encode time and frees before the DiT loop. 
+ auto supports_layer_split = [](ComponentKind k) { + return k == ComponentKind::DIT || k == ComponentKind::CONDITIONER; + }; + + auto build_options = [&](const Component& c) { + std::vector opts; + for (size_t g = 0; g < nG; g++) { + opts.push_back({Placement::GPU, int(g)}); + if (c.supports_offload) { + opts.push_back({Placement::GPU_OFFLOAD_PARAMS, int(g)}); + } + } + if (nG >= 2) { + // ROW-split: DiT only, in row mode. Spans all GPUs (one option). + if (mode == MultiGpuMode::ROW && supports_tensor_split(c.kind)) { + opts.push_back({Placement::GPU_TENSOR_SPLIT, (1 << nG) - 1}); + } + // LAYER-split: the DiT in layer mode, and any OTHER layer-split + // candidate (the Conditioner) regardless of mode — non-DiT + // components never row-split, preserving the single-row invariant. + const bool want_layer = supports_layer_split(c.kind) && + (mode == MultiGpuMode::LAYER || + (mode == MultiGpuMode::ROW && !supports_tensor_split(c.kind))); + if (want_layer) { + const int max_mask = 1 << nG; + for (int mask = 1; mask < max_mask; mask++) { + if (bit_count(static_cast(mask)) < 2) continue; + opts.push_back({Placement::GPU_LAYER_SPLIT, mask}); + } + } + } + opts.push_back({Placement::CPU, -1}); + return opts; + }; + + std::vector> options; + options.reserve(nC); + for (const Component& c : components) { + options.push_back(build_options(c)); + } + + auto priority_weight = [](ComponentKind k) -> int { + switch (k) { + case ComponentKind::DIT: return 300; + case ComponentKind::CONDITIONER: return 120; + case ComponentKind::VAE: return 60; + } + return 1; + }; + + auto score = [&](const std::vector& pl, const std::vector& dev) { + int64_t s = 0; + std::set gpus_used; + for (size_t i = 0; i < nC; i++) { + const int pw = priority_weight(components[i].kind); + if (pl[i] == Placement::GPU) { + s += 10 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + s += 5 * pw; + gpus_used.insert(dev[i]); + } else if (pl[i] == 
Placement::GPU_TENSOR_SPLIT) { + // Row-split: cheaper than layer-split (no sched cross- + // backend doubling) but pays per-matmul cross-device + // reductions. Score it slightly above LAYER_SPLIT so the + // planner prefers it when both fit. + s += 8 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else if (pl[i] == Placement::GPU_LAYER_SPLIT) { + // Better than CPU but worse than fitting on a single GPU + // (cross-GPU traffic between blocks). + s += 7 * pw; + for (size_t g = 0; g < nG; g++) { + if (dev[i] & (1 << g)) gpus_used.insert(int(g)); + } + } else { + s -= 10 * pw; + } + } + if (allow_multi_gpu) { + s += 2 * int64_t(gpus_used.size()); + } + return s; + }; + + std::vector idx(nC, 0); + std::vector best_pl; + std::vector best_dev; + int64_t best_score = std::numeric_limits::min(); + bool found_any = false; + + while (true) { + std::vector pl(nC); + std::vector dev(nC); + for (size_t i = 0; i < nC; i++) { + pl[i] = options[i][idx[i]].placement; + dev[i] = options[i][idx[i]].device_idx; + } + // Constraint: when multi-GPU is disabled, all GPU placements must + // share the same device index. 
+ if (!allow_multi_gpu) { + int common = -1; + bool ok = true; + for (size_t i = 0; i < nC; i++) { + if (pl[i] == Placement::GPU || pl[i] == Placement::GPU_OFFLOAD_PARAMS) { + if (common < 0) common = dev[i]; + else if (dev[i] != common) { ok = false; break; } + } + } + if (ok) { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + } else { + bool feasible = true; + for (size_t g = 0; g < nG; g++) { + if (gpu_peak(int(g), pl, dev, components, devices) > cap[g]) { feasible = false; break; } + } + if (feasible) { + int64_t sc = score(pl, dev); + if (sc > best_score) { + best_score = sc; best_pl = pl; best_dev = dev; found_any = true; + } + } + } + + size_t pos = 0; + while (pos < nC) { + idx[pos]++; + if (idx[pos] < options[pos].size()) break; + idx[pos] = 0; + pos++; + } + if (pos >= nC) break; + } + + Plan plan; + if (!found_any) { + best_pl.assign(nC, Placement::CPU); + best_dev.assign(nC, -1); + } + + for (size_t i = 0; i < nC; i++) { + const Component& c = components[i]; + Decision d; + d.kind = c.kind; + d.name = c.name; + d.placement = best_pl[i]; + if (best_pl[i] == Placement::CPU) { + d.device_id = DEVICE_ID_CPU; + d.on_host_bytes = c.params_bytes + c.compute_bytes; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_TENSOR_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + // Sort participating GPUs by descending TOTAL memory so the + // largest device is the "main" (runs the graph + hosts the compute + // buffer + sub-runners that don't get their own spec). This matches + // the user's preference: always use the bigger GPU as main. 
+ std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + // PARAM shares for the split ratio: the main (order[0]) reserves its + // compute buffer first so it doesn't get over-loaded with rows. + auto shares = row_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs, order[0]); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + // split_share_bytes drives the row ratio in apply_dit -> keep it + // param-only. The main device's peak (params + compute) is folded + // into on_device_bytes for the plan display / feasibility. + d.split_share_bytes.push_back(shares[k]); + int64_t peak = shares[k] + (pos == 0 ? c.compute_bytes : 0); + max_share = std::max(max_share, peak); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else if (best_pl[i] == Placement::GPU_LAYER_SPLIT) { + std::vector gpu_idxs; + for (size_t k = 0; k < nG; k++) { + if (best_dev[i] & (1 << k)) gpu_idxs.push_back(k); + } + auto shares = layer_split_shares(c.params_bytes, c.compute_bytes, + devices, gpu_idxs); + // Sort participating GPUs by descending TOTAL memory so the + // physically bigger GPU is listed first (and becomes the runner's + // main backend). Sub-runners that don't get the layer-split spec + // (e.g. the LTX-2 text projection) follow the main backend. 
+ std::vector order(gpu_idxs.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return devices[gpu_idxs[a]].total_bytes > devices[gpu_idxs[b]].total_bytes; + }); + + int64_t max_share = 0; + for (size_t pos = 0; pos < order.size(); pos++) { + size_t k = order[pos]; + d.split_device_ids.push_back(devices[gpu_idxs[k]].id); + d.split_share_bytes.push_back(shares[k]); + max_share = std::max(max_share, shares[k]); + } + d.device_id = d.split_device_ids.empty() ? DEVICE_ID_CPU : d.split_device_ids[0]; + d.on_device_bytes = max_share; + plan.any_changes = true; + } else { + d.device_id = devices[best_dev[i]].id; + if (best_pl[i] == Placement::GPU) { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + } else { + d.on_device_bytes = c.params_bytes + c.compute_bytes; + d.on_host_bytes = c.params_bytes; + plan.any_changes = true; + } + } + plan.decisions.push_back(d); + plan.host_bytes += d.on_host_bytes; + } + + for (size_t g = 0; g < nG; g++) { + plan.device_bytes[devices[g].id] = gpu_peak(int(g), best_pl, best_dev, components, devices); + } + return plan; +} + +inline const char* placement_str(Placement p) { + switch (p) { + case Placement::CPU: return "CPU"; + case Placement::GPU: return "GPU"; + case Placement::GPU_OFFLOAD_PARAMS: return "GPU(params->RAM)"; + case Placement::GPU_LAYER_SPLIT: return "GPU(layer-split)"; + case Placement::GPU_TENSOR_SPLIT: return "GPU(row-split)"; + } + return "?"; +} + +inline void print_plan(const Plan& plan, + const std::vector& components, + const std::vector& devices, + int64_t margin_bytes) { + LOG_INFO("auto-fit plan (margin=%lld MiB per GPU):", (long long)(margin_bytes / MiB)); + LOG_INFO(" available devices:"); + if (devices.empty()) { + LOG_INFO(" (no GPU devices detected — all components will run on CPU)"); + } + for (const Device& d : devices) { + LOG_INFO(" %-12s %-32s free %6lld / %6lld MiB", + d.name.c_str(), d.description.c_str(), + (long 
long)(d.free_bytes / MiB), + (long long)(d.total_bytes / MiB)); + } + LOG_INFO(" components:"); + for (const Component& c : components) { + LOG_INFO(" %-12s params %6lld MiB, compute reserve %6lld MiB", + c.name.c_str(), + (long long)(c.params_bytes / MiB), + (long long)(c.compute_bytes / MiB)); + } + LOG_INFO(" decisions:"); + for (const Decision& d : plan.decisions) { + if (d.placement == Placement::CPU) { + LOG_INFO(" %-12s -> CPU (RAM %lld MiB)", + d.name.c_str(), (long long)(d.on_host_bytes / MiB)); + } else if (d.placement == Placement::GPU) { + LOG_INFO(" %-12s -> GPU %d (VRAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB)); + } else if (d.placement == Placement::GPU_LAYER_SPLIT || + d.placement == Placement::GPU_TENSOR_SPLIT) { + std::string ids; + const char* tag = d.placement == Placement::GPU_TENSOR_SPLIT ? "row" : "layer"; + for (size_t k = 0; k < d.split_device_ids.size(); k++) { + if (k > 0) ids += "+"; + ids += "GPU" + std::to_string(d.split_device_ids[k]); + ids += "(" + std::to_string(d.split_share_bytes[k] / MiB) + "MiB)"; + } + LOG_INFO(" %-12s -> %s-split %s", + d.name.c_str(), tag, ids.c_str()); + } else { + LOG_INFO(" %-12s -> GPU %d (params RAM) (VRAM %lld MiB, RAM %lld MiB)", + d.name.c_str(), d.device_id, + (long long)(d.on_device_bytes / MiB), + (long long)(d.on_host_bytes / MiB)); + } + } + LOG_INFO(" projected per-device peak:"); + for (const Device& d : devices) { + int64_t peak = 0; + auto it = plan.device_bytes.find(d.id); + if (it != plan.device_bytes.end()) peak = it->second; + LOG_INFO(" %-12s peak %6lld / %6lld MiB free (remaining %lld MiB)", + d.name.c_str(), + (long long)(peak / MiB), + (long long)(d.free_bytes / MiB), + (long long)((d.free_bytes - peak) / MiB)); + } + LOG_INFO(" %-12s host RAM additional %lld MiB", "CPU", + (long long)(plan.host_bytes / MiB)); +} + +inline const Decision* find_decision(const Plan& plan, ComponentKind kind) { + for (const Decision& d : plan.decisions) { + if 
(d.kind == kind) return &d; + } + return nullptr; +} + +} // namespace backend_fit + +#endif // __SD_BACKEND_FIT_HPP__ diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index b5dda4c0e..f193e0704 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -116,6 +116,13 @@ struct Conditioner { virtual void get_param_tensors(std::map& tensors) = 0; virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {} virtual void set_stream_layers_enabled(bool enabled) {} + // Multi-GPU + lazy-load hooks. Default no-op; LLM-backed conditioners + // forward them to their (heavy) LLM sub-runner so it can be split across + // GPUs (layer-split) and/or have its params alloc+load deferred to the + // first compute so it time-shares VRAM with the DiT. + virtual void set_lazy_load(std::function fn) {} + virtual void set_multi_backend_spec(const MultiBackendSpec& spec) {} + virtual void set_weight_manager(std::shared_ptr manager) {} virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} virtual void runner_done() {} @@ -1410,6 +1417,18 @@ struct AnimaConditioner : public Conditioner { llm->set_stream_layers_enabled(enabled); } + void set_lazy_load(std::function fn) override { + llm->set_lazy_load(std::move(fn)); + } + + void set_multi_backend_spec(const MultiBackendSpec& spec) override { + llm->set_multi_backend_spec(spec); + } + + void set_weight_manager(std::shared_ptr manager) override { + llm->set_weight_manager(std::move(manager)); + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } @@ -1555,6 +1574,20 @@ struct LLMEmbedder : public Conditioner { llm->set_stream_layers_enabled(enabled); } + void set_lazy_load(std::function fn) override { + llm->set_lazy_load(std::move(fn)); + } + + void set_multi_backend_spec(const MultiBackendSpec& spec) override { + llm->set_multi_backend_spec(spec); + } 
+ + void set_weight_manager(std::shared_ptr manager) override { + if (llm) { + llm->set_weight_manager(std::move(manager)); + } + } + void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); } @@ -2137,6 +2170,20 @@ struct LTXAVEmbedder : public Conditioner { projector->set_flash_attention_enabled(enabled); } + // Split/lazy apply to the heavy LLM only; the small projector stays on the + // main backend and loads eagerly. + void set_lazy_load(std::function fn) override { + llm->set_lazy_load(std::move(fn)); + } + + void set_multi_backend_spec(const MultiBackendSpec& spec) override { + llm->set_multi_backend_spec(spec); + } + + void set_weight_manager(std::shared_ptr manager) override { + llm->set_weight_manager(std::move(manager)); + } + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { llm->set_max_graph_vram_bytes(max_vram_bytes); projector->set_max_graph_vram_bytes(max_vram_bytes); @@ -2180,6 +2227,7 @@ struct LTXAVEmbedder : public Conditioner { std::vector mask; tokenizer->pad_tokens(tokens, &weights, &mask, kMinLength); + return {tokens, weights, mask}; } @@ -2220,6 +2268,7 @@ struct LTXAVEmbedder : public Conditioner { true, true); GGML_ASSERT(!hidden_states.empty()); + hidden_states = apply_token_weights(std::move(hidden_states), weights); int64_t valid_tokens = 0; diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index a3dda16b2..0d2f9b27b 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1690,6 +1690,39 @@ struct GGMLRunnerContext { } }; +// Multi-GPU split of a single runner across several GPU backends, on stock +// ggml (no ggml patch needed). Two modes: +// LAYER_SPLIT: whole transformer blocks are assigned to per-block backends +// and a ggml_backend_sched routes cross-device ops. Works on +// any multi-GPU set. 
+// ROW_SPLIT: matmul weights are split row-wise via the backend's stock +// split buffer type (CUDA/SYCL `ggml_backend_split_buffer_type`), +// non-matmul weights live on the main GPU; sched still wires the +// extra backends so it can route the cross-device reductions. +// The split params are allocated once and kept resident (the runner is not +// freed+realloc'd between generations), which is what lets us avoid the +// split-buffer fragmentation a ggml patch would otherwise be needed for. +enum class MultiBackendMode { + LAYER_SPLIT, + ROW_SPLIT, +}; + +struct MultiBackendSpec { + MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT; + // Extra GPU backends beyond the runner's main (runtime) backend. The main + // backend is implicit and is NOT listed here. Borrowed handles — owned by + // the SDBackendManager, never freed by the runner. + std::vector additional_backends; + // LAYER_SPLIT: map a param tensor to the backend that should hold it (the + // main backend, or one of additional_backends). nullptr => main. Keyed by + // tensor POINTER, not name: param tensors are unnamed at alloc time. + std::function tensor_backend_fn; + // ROW_SPLIT: per-device weight ratios (length = the backend registry's + // device count) and the main device index that owns the non-split portion. + std::vector tensor_split_ratios; + int main_device = 0; +}; + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1710,6 +1743,34 @@ struct GGMLRunner { bool stream_layers_enabled = false; size_t observed_max_effective_budget_ = 0; + // --- multi-GPU split state (layer-split via sched OR row-split via the + // stock split buffer type). Inactive unless set_multi_backend_spec() + // was called before alloc_params_buffer(). 
--- + ggml_backend_t params_backend = nullptr; + ggml_backend_buffer_t params_buffer = nullptr; + bool multi_backend_mode = false; + MultiBackendMode multi_backend_kind = MultiBackendMode::LAYER_SPLIT; + std::vector additional_backends; // borrowed (manager-owned) + std::function tensor_backend_fn = nullptr; + ggml_backend_sched_t sched = nullptr; // owned + bool sched_reserved = false; + ggml_backend_t cpu_fallback_backend = nullptr; + bool owns_cpu_fallback_backend = false; + // LAYER_SPLIT: one resident params buffer per participating backend. + std::vector multi_params_buffers; // owned + // ROW_SPLIT: resident split + main buffers and the split buft (buft is + // backend-cached, not owned). + std::vector row_split_ratios; + int row_main_device = 0; + ggml_backend_buffer_type_t row_split_buft = nullptr; + ggml_backend_buffer_t row_split_buffer = nullptr; // owned + ggml_backend_buffer_t row_main_buffer = nullptr; // owned + + // Lazy-load: when set, params alloc + tensor-data load is deferred to the + // first compute() (ensure_params_loaded) and freed after each phase, so + // components time-share VRAM instead of all coexisting at init. + std::function lazy_load_fn = nullptr; + std::shared_ptr weight_adapter = nullptr; std::weak_ptr weight_manager; std::unordered_set kept_compute_param_tensor_set; @@ -1877,9 +1938,13 @@ struct GGMLRunner { } auto manager = weight_manager.lock(); if (manager == nullptr) { - if (!params_to_prepare.empty()) { - LOG_ERROR("%s weight manager is not set for graph params", get_desc().c_str()); - return false; + for (ggml_tensor* param : params_to_prepare) { + if (param != nullptr && param->data == nullptr) { + LOG_ERROR("%s param '%s' is not loaded and weight manager is not set", + get_desc().c_str(), + ggml_get_name(param)); + return false; + } } return true; } @@ -1977,7 +2042,167 @@ struct GGMLRunner { return true; } + // Build the multi-backend scheduler (lazily). 
Backends in priority order: + // main runtime backend, then the additional GPU backends, then a CPU + // fallback last (ggml_backend_sched_new requires the last backend be CPU). + bool ensure_sched() { + if (sched != nullptr) { + return true; + } + std::vector backends; + backends.reserve(1 + additional_backends.size() + 1); + backends.push_back(runtime_backend); + for (auto* b : additional_backends) { + backends.push_back(b); + } + if (cpu_fallback_backend == nullptr) { + cpu_fallback_backend = sd_backend_cpu_init(); + owns_cpu_fallback_backend = true; + } + backends.push_back(cpu_fallback_backend); + // Build an explicit per-backend buffer-type array instead of passing + // nullptr. ggml_backend_sched uses these in buffer_supported() to decide + // whether a cross-backend src needs a copy; with nullptr it synthesizes + // them from default backend types, and CUDA devices can spuriously report + // supporting each other's buffers -> a needed copy is skipped and a node + // (e.g. a cont in attention) reads another device's memory -> illegal + // access. For the trailing CPU slot, use device-0's host buffer type + // (pinned host memory) exactly as llama.cpp does (llama-context.cpp). + std::vector bufts; + bufts.reserve(backends.size()); + ggml_backend_dev_t dev0 = ggml_backend_get_device(runtime_backend); + for (auto* b : backends) { + if (b == cpu_fallback_backend && dev0 != nullptr) { + ggml_backend_buffer_type_t host = ggml_backend_dev_host_buffer_type(dev0); + bufts.push_back(host != nullptr ? 
host : ggml_backend_get_default_buffer_type(b)); + } else { + bufts.push_back(ggml_backend_get_default_buffer_type(b)); + } + } + sched = ggml_backend_sched_new(backends.data(), + bufts.data(), + (int)backends.size(), + MAX_GRAPH_SIZE, + /*parallel=*/false, + /*op_offload=*/false); + if (sched == nullptr) { + LOG_ERROR("%s: failed to create backend sched", get_desc().c_str()); + return false; + } + return true; + } + + // Map a weight tensor to the backend it was allocated on in a layer split. + ggml_backend_t backend_of_weight(ggml_tensor* t) const { + if (t == nullptr || t->buffer == nullptr) { + return nullptr; + } + if (ggml_backend_buffer_get_usage(t->buffer) != GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + return nullptr; + } + for (size_t i = 0; i < multi_params_buffers.size(); i++) { + if (multi_params_buffers[i] == t->buffer) { + if (i == 0) { + return runtime_backend; + } + if (i - 1 < additional_backends.size()) { + return additional_backends[i - 1]; + } + } + } + return nullptr; + } + + // Pin compute nodes to their layer's device for a LAYER split. Stock + // ggml_backend_sched anchors weight-bearing ops (matmuls) to the weight's + // device, but weightless ops (norm, residual add, permute, cont) have no + // anchor and are placed by a heuristic that, for the attention `cont`, can + // land on the wrong device and then read it without a cross-device copy -> + // CUDA illegal access. llama.cpp pins each layer-boundary norm to the + // layer's device for exactly this reason (llama-context.cpp). We generalise: + // walk the graph in execution order, track the device of the most recently + // consumed weight (= the current layer's device), and pin every node to it. + // This forces clean per-layer cuts so sched copies only the residual stream + // across the boundary. No-op outside a layer split. 
+    void pin_layer_split_nodes(ggml_cgraph* gf) {
+        if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::LAYER_SPLIT) {
+            return;
+        }
+        if (sched == nullptr || multi_params_buffers.empty() || gf == nullptr) {
+            return;
+        }
+        ggml_backend_t cur = runtime_backend;
+        const int n_nodes = ggml_graph_n_nodes(gf);
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor* node = ggml_graph_node(gf, i);
+            for (int s = 0; s < GGML_MAX_SRC; s++) {
+                ggml_backend_t wb = backend_of_weight(node->src[s]);
+                if (wb != nullptr) {
+                    cur = wb;
+                }
+            }
+            // NEVER pin view ops (view/reshape/permute/transpose): a view
+            // assigned to a different backend than its view_src's data makes
+            // the sched skip the cross-device copy for consumers (the copy
+            // decision trusts the assigned id), and a kernel then dereferences
+            // the other device's pointer. The sched places views correctly on
+            // its own by following view_src.
+            if (node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE ||
+                node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
+                continue;
+            }
+            if (cur != nullptr && ggml_backend_supports_op(cur, node)) {
+                ggml_backend_sched_set_tensor_backend(sched, node, cur);
+            }
+        }
+    }
+
+    // Pin un-allocated graph-input leaves (rope pe tables, timesteps, latents…)
+    // to the MAIN backend before sched alloc. Left to its own heuristics the
+    // sched places them on the CPU/host slot and emits per-split host->device
+    // input copies; those copies were observed landing LATE (first pass reads
+    // zeros / stale pool garbage, second pass reads the first pass's data).
+    // Pinning them to the main backend makes our copy_data_to_backend_tensor
+    // fill a device-resident tensor directly (synchronous H2D) and removes the
+    // cross-backend input copies entirely.
+    void pin_input_leaves(ggml_cgraph* gf) {
+        // ROW_SPLIT only: the whole graph computes on the main backend, so
+        // graph inputs trivially belong there; pinning them avoids per-split
+        // host->device input copies. (Layer-split graphs span devices and the
+        // sched routes their inputs correctly on its own.)
+        if (!multi_backend_mode || multi_backend_kind != MultiBackendMode::ROW_SPLIT ||
+            sched == nullptr || gf == nullptr || runtime_backend == nullptr) {
+            return;
+        }
+        const int n_nodes = ggml_graph_n_nodes(gf);
+        for (int i = 0; i < n_nodes; i++) {
+            ggml_tensor* node = ggml_graph_node(gf, i);
+            // Visit every src slot and skip the empty ones rather than
+            // stopping at the first nullptr: an op built without one of its
+            // optional operands can have an empty slot in front of a
+            // populated one, and a leaf stored after that gap would otherwise
+            // never be pinned. pin_layer_split_nodes scans all slots the same
+            // way.
+            for (int s = 0; s < GGML_MAX_SRC; s++) {
+                ggml_tensor* t = node->src[s];
+                if (t == nullptr) {
+                    continue;
+                }
+                // Resolve views down to the tensor that actually owns storage.
+                while (t->view_src != nullptr) {
+                    t = t->view_src;
+                }
+                // op NONE + no buffer yet = a graph input the sched will
+                // allocate (weights already sit in params buffers).
+                if (t->op == GGML_OP_NONE && t->buffer == nullptr) {
+                    ggml_backend_sched_set_tensor_backend(sched, t, runtime_backend);
+                }
+            }
+        }
+    }
+
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        if (multi_backend_mode) {
+            // Do NOT ggml_backend_sched_reserve(gf) here: reserve runs
+            // split_graph, which REWIRES gf's src pointers to sched-internal
+            // copy tensors. execute_graph then sched_alloc_graph's the SAME gf,
+            // and the second split sees the stale reserve-epoch copies (measure
+            // layout) as valid inputs — silently corrupting every cross-backend
+            // input (garbage rope pe, garbage Gemma stack) or crashing. A graph
+            // must be split at most once; the first sched_alloc_graph in
+            // execute_graph performs the real allocation instead.
+ return ensure_sched(); + } if (compute_allocr != nullptr) { return true; } @@ -2193,12 +2418,14 @@ struct GGMLRunner { plan.valid && max_graph_vram_bytes > 0 && plan.segments.size() > 1 && - !sd_backend_is_cpu(runtime_backend); + !sd_backend_is_cpu(runtime_backend) && + !multi_backend_mode; } bool can_attempt_graph_cut_segmented_compute() const { return max_graph_vram_bytes > 0 && - !sd_backend_is_cpu(runtime_backend); + !sd_backend_is_cpu(runtime_backend) && + !multi_backend_mode; } bool resolve_graph_cut_plan(ggml_cgraph* gf, @@ -2454,7 +2681,15 @@ struct GGMLRunner { }; ComputeBufferGuard compute_buffer_guard(this, free_compute_buffer); - if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + if (multi_backend_mode) { + ggml_backend_sched_reset(sched); + pin_layer_split_nodes(gf); // reset clears pins; re-apply before alloc + pin_input_leaves(gf); + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LOG_ERROR("%s sched alloc compute graph failed", get_desc().c_str()); + return std::nullopt; + } + } else if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); return std::nullopt; } @@ -2463,8 +2698,19 @@ struct GGMLRunner { if (sd_backend_is_cpu(runtime_backend)) { sd_backend_cpu_set_n_threads(runtime_backend, n_threads); } + if (multi_backend_mode && cpu_fallback_backend != nullptr && sd_backend_is_cpu(cpu_fallback_backend)) { + sd_backend_cpu_set_n_threads(cpu_fallback_backend, n_threads); + } - ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); + ggml_status status; + if (multi_backend_mode) { + status = ggml_backend_sched_graph_compute(sched, gf); + if (status == GGML_STATUS_SUCCESS) { + ggml_backend_sched_synchronize(sched); + } + } else { + status = ggml_backend_graph_compute(runtime_backend, gf); + } if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); return std::nullopt; @@ -2623,6 +2869,9 @@ struct 
GGMLRunner { kept_compute_param_tensor_set.clear(); free_compute_backend_param_tensors(tensors_to_release); free_params_backend_param_tensors(tensors_to_release); + if (lazy_load_fn) { + free_params_buffer(); + } } public: @@ -2631,6 +2880,7 @@ struct GGMLRunner { GGMLRunner(ggml_backend_t backend, std::shared_ptr manager = nullptr) : runtime_backend(backend), + params_backend(backend), weight_manager(manager) { GGML_ASSERT(runtime_backend != nullptr); alloc_params_ctx(); @@ -2638,9 +2888,20 @@ struct GGMLRunner { virtual ~GGMLRunner() { free_compute_buffer(); + free_params_buffer(); free_params_ctx(); free_compute_ctx(); free_cache_ctx_and_buffer(); + // Multi-GPU split teardown. additional_backends are owned by the + // SDBackendManager (not freed here); row_split_buft is backend-cached. + if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + } + if (owns_cpu_fallback_backend && cpu_fallback_backend != nullptr) { + ggml_backend_free(cpu_fallback_backend); + cpu_fallback_backend = nullptr; + } } virtual GGMLRunnerContext get_context() { @@ -2670,6 +2931,311 @@ struct GGMLRunner { alloc_compute_ctx(); } + // Row-split eligibility: contiguous, rank-2, both dims >= 256, not a view. + // 1D biases/norms, embeddings, small projections and views fall back to the + // main GPU's regular per-device buft. Excluding views respects the split + // buft's documented contract (GGML_ASSERT(view_src == nullptr)) so we never + // need to patch ggml. + static bool is_row_split_eligible(const ggml_tensor* t) { + if (t->view_src != nullptr) return false; + if (!ggml_is_contiguous(t)) return false; + if (ggml_n_dims(t) != 2) return false; + if (t->ne[0] < 256 || t->ne[1] < 256) return false; + return true; + } + + // ROW_SPLIT: matmul-eligible weights -> row_split_buft (split row-wise + // across GPUs by the CUDA/SYCL backend), everything else -> the main GPU's + // default buft. 
Each is allocated ONCE into a single resident buffer and + // suballocated via ggml_tallocr — no per-tensor churn, no free->realloc. + bool alloc_params_buffer_row_split() { + if (row_split_buft == nullptr) { + LOG_ERROR("%s row-split buft not initialized (backend lacks ggml_backend_split_buffer_type)", + get_desc().c_str()); + return false; + } + ggml_backend_buffer_type_t main_buft = ggml_backend_get_default_buffer_type(runtime_backend); + const size_t main_align = ggml_backend_buft_get_alignment(main_buft); + const size_t split_align = ggml_backend_buft_get_alignment(row_split_buft); + + size_t main_size = 0, split_size = 0; + size_t main_count = 0, split_count = 0; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + if (is_row_split_eligible(t)) { + split_size += GGML_PAD(ggml_backend_buft_get_alloc_size(row_split_buft, t), split_align); + split_count++; + } else { + main_size += GGML_PAD(ggml_backend_buft_get_alloc_size(main_buft, t), main_align); + main_count++; + } + } + + if (main_size > 0) { + row_main_buffer = ggml_backend_buft_alloc_buffer(main_buft, main_size); + if (row_main_buffer == nullptr) { + LOG_ERROR("%s row-split main buffer alloc failed (%.1f MB)", get_desc().c_str(), main_size / (1024.f * 1024.f)); + return false; + } + } + if (split_size > 0) { + row_split_buffer = ggml_backend_buft_alloc_buffer(row_split_buft, split_size); + if (row_split_buffer == nullptr) { + LOG_ERROR("%s row-split params buffer alloc failed (%.1f MB)", get_desc().c_str(), split_size / (1024.f * 1024.f)); + return false; + } + } + + ggml_tallocr main_alloc{}; + ggml_tallocr split_alloc{}; + if (row_main_buffer != nullptr) main_alloc = ggml_tallocr_new(row_main_buffer); + if (row_split_buffer != nullptr) split_alloc = ggml_tallocr_new(row_split_buffer); + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + ggml_status st = 
is_row_split_eligible(t) ? ggml_tallocr_alloc(&split_alloc, t) : ggml_tallocr_alloc(&main_alloc, t); + if (st != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s row-split tallocr_alloc failed", get_desc().c_str()); + return false; + } + } + if (row_main_buffer != nullptr) ggml_backend_buffer_set_usage(row_main_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + if (row_split_buffer != nullptr) ggml_backend_buffer_set_usage(row_split_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + rebuild_params_tensor_set(); + LOG_INFO("%s row-split params: main %.1f MB (%zu tensors), split %.1f MB (%zu tensors)", + get_desc().c_str(), main_size / (1024.f * 1024.f), main_count, split_size / (1024.f * 1024.f), split_count); + return true; + } + + // LAYER_SPLIT: assign each param tensor to a backend (via tensor_backend_fn, + // keyed by tensor pointer), allocate one resident buffer per backend on its + // default buft, and suballocate via ggml_tallocr. + bool alloc_params_buffer_layer_split() { + std::vector backends; + backends.push_back(runtime_backend); + for (auto* b : additional_backends) backends.push_back(b); + + std::vector bufts(backends.size()); + std::vector aligns(backends.size()); + std::vector sizes(backends.size(), 0); + std::vector counts(backends.size(), 0); + for (size_t i = 0; i < backends.size(); i++) { + bufts[i] = ggml_backend_get_default_buffer_type(backends[i]); + aligns[i] = ggml_backend_buft_get_alignment(bufts[i]); + } + + std::map tensor_backend_idx; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + int idx = 0; + if (tensor_backend_fn) { + ggml_backend_t target = tensor_backend_fn(t); + if (target != nullptr) { + for (size_t i = 0; i < backends.size(); i++) { + if (backends[i] == target) { idx = int(i); break; } + } + } + } + tensor_backend_idx[t] = idx; + sizes[idx] += GGML_PAD(ggml_backend_buft_get_alloc_size(bufts[idx], t), aligns[idx]); + counts[idx] += 1; + } + + 
multi_params_buffers.assign(backends.size(), nullptr); + for (size_t i = 0; i < backends.size(); i++) { + if (sizes[i] == 0) continue; + ggml_backend_dev_t dev = ggml_backend_buft_get_device(bufts[i]); + size_t free_pre = 0, total_pre = 0; + if (dev) ggml_backend_dev_memory(dev, &free_pre, &total_pre); + multi_params_buffers[i] = ggml_backend_buft_alloc_buffer(bufts[i], sizes[i]); + if (multi_params_buffers[i] == nullptr) { + LOG_ERROR("%s layer-split alloc on %s failed (%.1f MB)", get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f)); + return false; + } + size_t free_post = 0, total_post = 0; + if (dev) ggml_backend_dev_memory(dev, &free_post, &total_post); + LOG_DEBUG("%s layer-split alloc[%zu] %s req=%.1f MB dev_free %.1f -> %.1f MB is_host=%d", + get_desc().c_str(), i, ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), + free_pre / (1024.f * 1024.f), free_post / (1024.f * 1024.f), + (int)ggml_backend_buffer_is_host(multi_params_buffers[i])); + } + + std::vector tallocs(backends.size()); + for (size_t i = 0; i < backends.size(); i++) { + if (multi_params_buffers[i] != nullptr) tallocs[i] = ggml_tallocr_new(multi_params_buffers[i]); + } + for (auto& kv : tensor_backend_idx) { + if (ggml_tallocr_alloc(&tallocs[kv.second], kv.first) != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s layer-split tallocr_alloc failed", get_desc().c_str()); + return false; + } + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + rebuild_params_tensor_set(); + for (size_t i = 0; i < backends.size(); i++) { + if (counts[i] == 0) continue; + LOG_INFO("%s layer-split params on %s: %.1f MB (%zu tensors)", + get_desc().c_str(), ggml_backend_name(backends[i]), sizes[i] / (1024.f * 1024.f), counts[i]); + } + return true; + } + + // Lazy mode: defer alloc + tensor-data load until the first compute(). 
+    // The caller still runs alloc_params_buffer + get_param_tensors at init,
+    // but for a lazy runner alloc_params_buffer is a no-op and the bulk loader
+    // skips this runner's tensors (they have no buffer yet); ensure_params_loaded()
+    // then allocates and invokes lazy_load_fn() on demand, and the params are
+    // freed after the phase (free_params_immediately) so components time-share VRAM.
+    void set_lazy_load(std::function fn) {
+        lazy_load_fn = std::move(fn);
+    }
+
+    // True once a (non-lazy) buffer exists OR a lazy load has materialized one.
+    bool params_loaded() const {
+        return params_buffer != nullptr || !multi_params_buffers.empty() ||
+               row_split_buffer != nullptr || row_main_buffer != nullptr;
+    }
+
+    // Make sure this runner's params are resident before a compute().
+    // Returns true when they already are, when the runner is not lazy, or
+    // when the on-demand alloc + lazy_load_fn() both succeed; false otherwise.
+    bool ensure_params_loaded() {
+        if (params_loaded()) {
+            return true;
+        }
+        if (!lazy_load_fn) {
+            // Non-lazy runner with no buffer: either it had no tensors, or its
+            // params are mmap-resident (data already set). Nothing to do.
+            return true;
+        }
+        int64_t t0 = ggml_time_ms();
+        if (!do_alloc_params_buffer()) {
+            // A split allocation can fail part-way and leave buffers behind
+            // (multi_params_buffers is sized before the per-backend allocs;
+            // the row-split main buffer is created before the split one).
+            // Release them, otherwise params_loaded() reports true on the
+            // next call and compute() runs on weights that were never loaded.
+            free_params_buffer();
+            return false;
+        }
+        if (!lazy_load_fn()) {
+            LOG_ERROR("%s: lazy params load failed", get_desc().c_str());
+            // Buffers exist but hold no (or partial) tensor data: drop them
+            // for the same reason as above.
+            free_params_buffer();
+            return false;
+        }
+        LOG_INFO("%s: lazy-loaded params in %.2fs", get_desc().c_str(), (ggml_time_ms() - t0) / 1000.f);
+        return true;
+    }
+
+    bool alloc_params_buffer() {
+        // Defer to first compute() for lazy runners (see set_lazy_load).
+        if (lazy_load_fn) {
+            return true;
+        }
+        return do_alloc_params_buffer();
+    }
+
+    bool do_alloc_params_buffer() {
+        if (multi_backend_mode) {
+            // Split allocation bypasses the mmap fast-path: the params must land
+            // in the GPU split buffers, not stay mmap'd.
+ if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + return alloc_params_buffer_row_split(); + } + return alloc_params_buffer_layer_split(); + } + size_t num_tensors = ggml_tensor_num(params_ctx); + if (num_tensors > 0) { + // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated + // (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch. + bool all_have_data = true; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + if (t->data == nullptr) { + all_have_data = false; + break; + } + } + if (all_have_data) { + LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str()); + params_buffer = nullptr; + rebuild_params_tensor_set(); + return true; + } + } else { + LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str()); + return true; + } + // Pinned host buffer when CPU-offloaded for DMA-direct H2D. + ggml_backend_buffer_type_t params_buft = nullptr; + if (params_backend != runtime_backend) { + ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend); + if (runtime_dev != nullptr) { + params_buft = ggml_backend_dev_host_buffer_type(runtime_dev); + } + } + if (params_buft == nullptr) { + params_buft = ggml_backend_get_default_buffer_type(params_backend); + } + params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft); + if (params_buffer == nullptr) { + LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", + get_desc().c_str(), + num_tensors); + return false; + } + rebuild_params_tensor_set(); + ggml_backend_buffer_set_usage(params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer); + LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)", + get_desc().c_str(), + params_buffer_size / (1024.f * 1024.f), + sd_backend_is_cpu(params_backend) ? 
"RAM" : "VRAM", + num_tensors); + return true; + } + + void free_params_buffer() { + if (params_buffer != nullptr) { + ggml_backend_buffer_free(params_buffer); + params_buffer = nullptr; + } + // Multi-GPU split buffers (layer-split: one per backend; row-split: + // split + main). The split buft itself is backend-cached, not freed. + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + ggml_backend_buffer_free(buf); + } + } + multi_params_buffers.clear(); + if (row_split_buffer != nullptr) { + ggml_backend_buffer_free(row_split_buffer); + row_split_buffer = nullptr; + } + if (row_main_buffer != nullptr) { + ggml_backend_buffer_free(row_main_buffer); + row_main_buffer = nullptr; + } + // Release the multi-backend scheduler as well. Its reserved compute + // buffers can be GBs on each device, and free_compute_buffer only + // sched_reset()s them (kept alive across the sampling loop to avoid a + // per-step rebuild). free_params_buffer is the end-of-phase release, so + // here we actually free the sched so the next component can claim that + // VRAM (time-share). It is recreated lazily on the next compute(). 
+ if (sched != nullptr) { + ggml_backend_sched_free(sched); + sched = nullptr; + sched_reserved = false; + } + observed_max_effective_budget_ = 0; + } + + size_t get_params_buffer_size() { + size_t total = 0; + if (params_buffer != nullptr) { + total += ggml_backend_buffer_get_size(params_buffer); + } + for (auto* buf : multi_params_buffers) { + if (buf != nullptr) { + total += ggml_backend_buffer_get_size(buf); + } + } + if (row_split_buffer != nullptr) { + total += ggml_backend_buffer_get_size(row_split_buffer); + } + if (row_main_buffer != nullptr) { + total += ggml_backend_buffer_get_size(row_main_buffer); + } + return total; + } + public: void free_cache_ctx_and_buffer() { free_cache_buffer(); @@ -2681,10 +3247,23 @@ struct GGMLRunner { ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } + if (sched != nullptr) { + // Reset (not free): keeping the sched alive across the sampling + // loop's compute() calls avoids a per-step rebuild. It is freed in + // the destructor. + ggml_backend_sched_reset(sched); + sched_reserved = false; + } } // do copy after alloc graph void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { + // In multi-backend mode, sched needs the tensor flagged as input so it + // gets a concrete backend assignment (tensors with no producers and no + // consumers otherwise stay at backend_id = -1 and never get a buffer). + if (multi_backend_mode) { + ggml_set_input(tensor); + } backend_tensor_data_map[tensor] = data; } @@ -2768,6 +3347,11 @@ struct GGMLRunner { }; RunnerDoneGuard runner_done_guard(this, auto_free); + // Lazy split runners allocate and load params on first use of the phase. 
+ if (!ensure_params_loaded()) { + return std::nullopt; + } + ggml_cgraph* gf = nullptr; if (!prepare_compute_graph(get_graph, &gf)) { return std::nullopt; @@ -2815,6 +3399,10 @@ struct GGMLRunner { weight_adapter = adapter; } + void set_weight_manager(std::shared_ptr manager) { + weight_manager = std::move(manager); + } + void set_max_graph_vram_bytes(size_t max_vram_bytes) { max_graph_vram_bytes = max_vram_bytes; } @@ -2822,6 +3410,53 @@ struct GGMLRunner { void set_stream_layers_enabled(bool enabled) { stream_layers_enabled = enabled; } + + // Configure a multi-GPU split for this runner. Must be called AFTER + // construction + get_param_tensors() and BEFORE alloc_params_buffer(). + // For ROW_SPLIT, resolves the backend's stock split buffer type; if the + // backend has none (non-CUDA/SYCL), it cleanly falls back to single-GPU. + void set_multi_backend_spec(const MultiBackendSpec& spec) { + if (params_buffer != nullptr || !multi_params_buffers.empty() || + row_split_buffer != nullptr || row_main_buffer != nullptr) { + LOG_ERROR("%s set_multi_backend_spec called after params were allocated; ignoring", + get_desc().c_str()); + return; + } + multi_backend_mode = true; + multi_backend_kind = spec.mode; + additional_backends = spec.additional_backends; + tensor_backend_fn = spec.tensor_backend_fn; + row_split_ratios = spec.tensor_split_ratios; + row_main_device = spec.main_device; + if (multi_backend_kind == MultiBackendMode::ROW_SPLIT) { + row_split_buft = sd_backend_split_buffer_type( + runtime_backend, + row_main_device, + row_split_ratios.empty() ? nullptr : row_split_ratios.data()); + if (row_split_buft == nullptr) { + LOG_WARN("%s row-split unavailable on this backend; falling back to single-GPU", + get_desc().c_str()); + multi_backend_mode = false; + additional_backends.clear(); + tensor_backend_fn = nullptr; + return; + } + } + // Streaming (graph-cut param offload) is mutually exclusive with split. 
+ stream_layers_enabled = false; + } + + bool is_multi_backend() const { + return multi_backend_mode; + } + + ggml_backend_t get_runtime_backend() { + return runtime_backend; + } + + ggml_backend_t get_params_backend() { + return params_backend; + } }; class GGMLBlock { diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp index f3e2cceba..73a28df10 100644 --- a/src/core/ggml_extend_backend.cpp +++ b/src/core/ggml_extend_backend.cpp @@ -544,6 +544,10 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) { return init_cached_backend(name); } +ggml_backend_t SDBackendManager::ensure_backend(const std::string& device_name) { + return init_cached_backend(device_name); +} + bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) { return sd_backend_is_cpu(runtime_backend(module)); } @@ -687,3 +691,22 @@ const char* sd_backend_module_name(SDBackendModule module) { } return "unknown"; } + +ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split) { + if (backend == nullptr) { + return nullptr; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev == nullptr) { + return nullptr; + } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (reg == nullptr) { + return nullptr; + } + auto fn = (ggml_backend_split_buffer_type_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); + if (fn == nullptr) { + return nullptr; // backend has no row-split support (non-CUDA/SYCL) + } + return fn(main_device, tensor_split); +} diff --git a/src/core/ggml_extend_backend.h b/src/core/ggml_extend_backend.h index 9aecf97c0..d1a492750 100644 --- a/src/core/ggml_extend_backend.h +++ b/src/core/ggml_extend_backend.h @@ -57,6 +57,12 @@ class SDBackendManager { ggml_backend_t runtime_backend(SDBackendModule module); ggml_backend_t params_backend(SDBackendModule module); + // Return (creating + caching on first use) the 
backend for an explicit + // ggml device name (e.g. "CUDA1"). Used to obtain the additional GPU + // backends a multi-GPU split needs; the manager owns the handle and frees + // it once at teardown, so callers only borrow it. + ggml_backend_t ensure_backend(const std::string& device_name); + bool runtime_backend_is_cpu(SDBackendModule module); bool params_backend_is_cpu(SDBackendModule module); bool params_backend_is_disk(SDBackendModule module) const; @@ -74,5 +80,13 @@ bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); std::string sd_backend_resolve_name(const std::string& name); const char* sd_backend_module_name(SDBackendModule module); void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value); + +// Runtime lookup of a backend's row-split buffer type, published by the CUDA +// and SYCL backends as the "ggml_backend_split_buffer_type" proc. Returns +// nullptr when the backend does not support row-split (the caller then falls +// back to a non-split single-GPU path). `tensor_split` is a per-device weight +// array of length = the backend registry's device count; `main_device` is the +// index of the device that owns the non-split portion. 
+ggml_backend_buffer_type_t sd_backend_split_buffer_type(ggml_backend_t backend, int main_device, const float* tensor_split); bool add_rpc_devices(const std::string& servers); #endif // __SD_CORE_GGML_EXTEND_BACKEND_H__ diff --git a/src/core/util.cpp b/src/core/util.cpp index 7325607e0..b10e53ed7 100644 --- a/src/core/util.cpp +++ b/src/core/util.cpp @@ -25,6 +25,7 @@ #include #endif +#include "ggml-backend.h" #include "ggml.h" #include "stable-diffusion.h" @@ -972,3 +973,12 @@ std::vector> split_quotation_attention( } return result; } + +void sd_list_devices(void) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + const char* name = ggml_backend_dev_name(dev); + const char* desc = ggml_backend_dev_description(dev); + printf("%s\t%s\n", name ? name : "", desc ? desc : ""); + } +} diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp index b89ff32c6..8ef2ffe81 100644 --- a/src/model/diffusion/ltxv.hpp +++ b/src/model/diffusion/ltxv.hpp @@ -1606,8 +1606,13 @@ namespace LTXV { if (config.cross_attention_adaln) { auto prompt_adaln_single = std::dynamic_pointer_cast(blocks["prompt_adaln_single"]); auto audio_prompt_adaln_single = std::dynamic_pointer_cast(blocks["audio_prompt_adaln_single"]); - v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first; - a_prompt_timestep_mod = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first; + // The reference feeds modality.sigma (the RAW per-batch sigma) to + // both prompt adalns. effective_audio_timestep is exactly that: + // audio timesteps are never denoise-masked, so it carries the + // unmasked sigma even in i2v. The VIDEO timestep tensor is the + // denoise-masked per-token one and must NOT be used here. 
+ v_prompt_timestep_mod = prompt_adaln_single->forward(ctx, a_timestep_scaled).first; + a_prompt_timestep_mod = audio_prompt_adaln_single->forward(ctx, a_timestep_scaled).first; } auto av_ca_video_timestep = repeat_scalar_timestep_like(ctx, effective_audio_timestep, timestep); diff --git a/src/model/te/llm.hpp b/src/model/te/llm.hpp index 74dc232e5..685af7502 100644 --- a/src/model/te/llm.hpp +++ b/src/model/te/llm.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_TE_LLM_HPP__ +#ifndef __SD_MODEL_TE_LLM_HPP__ #define __SD_MODEL_TE_LLM_HPP__ #include diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index 23da08222..745442aee 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -1,608 +1,608 @@ -#ifndef __SD_MODEL_TE_T5_HPP__ -#define __SD_MODEL_TE_T5_HPP__ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "core/ggml_extend.hpp" -#include "model_loader.h" -#include "model_manager.h" -#include "tokenizers/t5_unigram_tokenizer.h" - -struct T5Config { - int64_t num_layers = 24; - int64_t model_dim = 4096; - int64_t ff_dim = 10240; - int64_t num_heads = 64; - int64_t vocab_size = 32128; - bool relative_attention = true; - - static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, - const std::string& prefix, - bool is_umt5 = false) { - (void)tensor_storage_map; - (void)prefix; - T5Config config; - if (is_umt5) { - config.vocab_size = 256384; - config.relative_attention = false; - } - return config; - } -}; - -class T5LayerNorm : public UnaryBlock { -protected: - int64_t hidden_size; - float eps; - - void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { - enum ggml_type wtype = GGML_TYPE_F32; - params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); - } - -public: - T5LayerNorm(int64_t hidden_size, - float eps = 1e-06f) - : hidden_size(hidden_size), - eps(eps) {} - - ggml_tensor* forward(GGMLRunnerContext* ctx, 
ggml_tensor* x) override { - ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul(ctx->ggml_ctx, x, w); - return x; - } -}; - -struct T5DenseActDense : public UnaryBlock { -public: - T5DenseActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi = std::dynamic_pointer_cast(blocks["wi"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - x = wi->forward(ctx, x); - x = ggml_relu_inplace(ctx->ggml_ctx, x); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5DenseGatedActDense : public UnaryBlock { -public: - T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - float scale = 1.f / 32.f; - // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
- blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); - auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); - auto hidden_linear = wi_1->forward(ctx, x); - x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5LayerFF : public UnaryBlock { -public: - T5LayerFF(int64_t model_dim, int64_t ff_dim) { - blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto forwarded_states = layer_norm->forward(ctx, x); - forwarded_states = DenseReluDense->forward(ctx, forwarded_states); - x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); - return x; - } -}; - -class T5Attention : public GGMLBlock { -protected: - int64_t model_dim; - int64_t inner_dim; - int64_t num_heads; - bool using_relative_attention_bias; - int64_t relative_attention_num_buckets = 32; - int64_t relative_attention_max_distance = 128; - -public: - T5Attention(int64_t model_dim, - int64_t inner_dim, - int64_t num_heads, - bool using_relative_attention_bias = false) - : model_dim(model_dim), - inner_dim(inner_dim), - num_heads(num_heads), - using_relative_attention_bias(using_relative_attention_bias) { - blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); - blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); - if (using_relative_attention_bias) { - blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); - } - } - - ggml_tensor* compute_bias(GGMLRunnerContext* ctx, - ggml_tensor* relative_position_bucket) { - auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); - - auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) - values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) - return values; - } - - // x: [N, n_token, model_dim] - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto out_proj = std::dynamic_pointer_cast(blocks["o"]); - - int64_t n_head = num_heads; - int64_t d_head = inner_dim / n_head; - - auto q = q_proj->forward(ctx, x); - auto k = k_proj->forward(ctx, x); - auto v = v_proj->forward(ctx, x); - - if (using_relative_attention_bias && relative_position_bucket != nullptr) { - past_bias = compute_bias(ctx, relative_position_bucket); - } - if (past_bias != nullptr) { - if (mask != nullptr) { - mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); - mask = ggml_add(ctx->ggml_ctx, mask, past_bias); - } else { - mask = past_bias; - } - } - - k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); - - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] - - x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] - return {x, past_bias}; - } -}; - -struct T5LayerSelfAttention : public GGMLBlock { -public: - T5LayerSelfAttention(int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool using_relative_attention_bias) { - blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto normed_hidden_state = layer_norm->forward(ctx, x); - auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); - auto output = ret.first; - past_bias = ret.second; - - x = ggml_add_inplace(ctx->ggml_ctx, output, x); - return {x, past_bias}; - } -}; - -struct T5Block : public GGMLBlock { -public: - T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { - blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); - blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); - auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); - - auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - x = 
layer_1->forward(ctx, x); - return {x, past_bias}; - } -}; - -struct T5Stack : public GGMLBlock { - int64_t num_layers; - -public: - T5Stack(int64_t num_layers, - int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool relative_attention = true) - : num_layers(num_layers) { - for (int i = 0; i < num_layers; i++) { - blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); - } - - blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr, - const std::string& graph_cut_prefix = "") { - // x: [N, n_token, model_dim] - for (int i = 0; i < num_layers; i++) { - auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); - - auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - if (!graph_cut_prefix.empty()) { - sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".block." 
+ std::to_string(i), "x"); - } - } - - auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); - - x = final_layer_norm->forward(ctx, x); - return x; - } -}; - -struct T5 : public GGMLBlock { - T5Config config; - -public: - T5() {} - T5(T5Config config) - : config(config) { - blocks["encoder"] = std::shared_ptr(new T5Stack(config.num_layers, - config.model_dim, - config.model_dim, - config.ff_dim, - config.num_heads, - config.relative_attention)); - blocks["shared"] = std::shared_ptr(new Embedding(config.vocab_size, - config.model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // input_ids: [N, n_token] - - auto shared = std::dynamic_pointer_cast(blocks["shared"]); - auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); - - auto x = shared->forward(ctx, input_ids); - sd::ggml_graph_cut::mark_graph_cut(x, "t5.prelude", "x"); - x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket, "t5"); - return x; - } -}; - -struct T5Runner : public GGMLRunner { - T5Config config; - T5 model; - std::vector relative_position_bucket_vec; - - T5Runner(ggml_backend_t backend, - const String2TensorStorage& tensor_storage_map, - const std::string prefix, - bool is_umt5 = false, - std::shared_ptr weight_manager = nullptr) - : GGMLRunner(backend, weight_manager), - config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) { - model = T5(config); - model.init(params_ctx, tensor_storage_map, prefix); - } - - std::string get_desc() override { - return "t5"; - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* relative_position_bucket, - ggml_tensor* attention_mask = nullptr) { - size_t N = 
input_ids->ne[1]; - size_t n_token = input_ids->ne[0]; - - auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] - return hidden_states; - } - - ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, - const sd::Tensor& attention_mask_tensor = {}) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - ggml_tensor* input_ids = make_input(input_ids_tensor); - ggml_tensor* attention_mask = attention_mask_tensor.empty() ? nullptr : make_input(attention_mask_tensor); - - relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); - - // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { - // if (i % 77 == 0) { - // printf("\n"); - // } - // printf("%d ", relative_position_bucket_vec[i]); - // } - - auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, - GGML_TYPE_I32, - input_ids->ne[0], - input_ids->ne[0]); - set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); - - auto runner_ctx = get_context(); - ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); - - ggml_build_forward_expand(gf, hidden_states); - - return gf; - } - - sd::Tensor compute(const int n_threads, - const sd::Tensor& input_ids, - const sd::Tensor& attention_mask, - bool auto_free = true, - bool free_compute_buffer = true, - bool free_compute_params = true) { - auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(input_ids, attention_mask); - }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params), 3); - } - - static std::vector _relative_position_bucket(const std::vector& relative_position, - bool bidirectional = true, - int num_buckets = 32, - int max_distance = 128) { - std::vector relative_buckets(relative_position.size(), 0); - std::vector 
abs_relative_position = relative_position; - - if (bidirectional) { - num_buckets = num_buckets / 2; - for (size_t i = 0; i < relative_position.size(); ++i) { - if (relative_position[i] > 0) { - relative_buckets[i] += num_buckets; - } - abs_relative_position[i] = std::abs(relative_position[i]); - } - } else { - for (size_t i = 0; i < relative_position.size(); ++i) { - abs_relative_position[i] = std::max(-relative_position[i], 0); - } - } - - int max_exact = num_buckets / 2; - std::vector relative_position_if_large(relative_position.size(), 0); - - for (size_t i = 0; i < relative_position.size(); ++i) { - if (abs_relative_position[i] < max_exact) { - relative_buckets[i] += abs_relative_position[i]; - } else { - float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); - float log_base = std::log(static_cast(max_distance) / max_exact); - relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); - relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); - relative_buckets[i] += relative_position_if_large[i]; - } - } - - return relative_buckets; - } - - std::vector compute_relative_position_bucket(int query_length, - int key_length) { - std::vector context_position(query_length); - std::vector memory_position(key_length); - - for (int i = 0; i < query_length; ++i) { - context_position[i] = i; - } - for (int i = 0; i < key_length; ++i) { - memory_position[i] = i; - } - - std::vector> relative_position(query_length, std::vector(key_length, 0)); - for (int i = 0; i < query_length; ++i) { - for (int j = 0; j < key_length; ++j) { - relative_position[i][j] = memory_position[j] - context_position[i]; - } - } - - std::vector relative_position_bucket; - for (int i = 0; i < query_length; ++i) { - std::vector result = _relative_position_bucket(relative_position[i], true); - relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); - } - - 
return relative_position_bucket; - } -}; - -struct T5Embedder { - T5UniGramTokenizer tokenizer; - T5Runner model; - - T5Embedder(ggml_backend_t backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool is_umt5 = false, - std::shared_ptr weight_manager = nullptr) - : model(backend, tensor_storage_map, prefix, is_umt5, weight_manager), tokenizer(is_umt5) { - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - std::tuple, std::vector, std::vector> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { - auto parsed_attention = parse_prompt_attention(text); - - { - std::stringstream ss; - ss << "["; - for (const auto& item : parsed_attention) { - ss << "['" << item.first << "', " << item.second << "], "; - } - ss << "]"; - LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); - } - - std::vector tokens; - std::vector weights; - for (const auto& item : parsed_attention) { - const std::string& curr_text = item.first; - float curr_weight = item.second; - std::vector curr_tokens = tokenizer.encode(curr_text); - tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); - weights.insert(weights.end(), curr_tokens.size(), curr_weight); - } - - std::vector attention_mask; - - tokenizer.pad_tokens(tokens, &weights, &attention_mask, padding ? max_length : 0, padding ? max_length : 100000000, padding); - for (auto& mask_value : attention_mask) { - mask_value = mask_value > 0.0f ? 
0.0f : -HUGE_VALF; - } - - // for (int i = 0; i < tokens.size(); i++) { - // std::cout << tokens[i] << ":" << weights[i] << ", "; - // } - // std::cout << std::endl; - - return {tokens, weights, attention_mask}; - } - - void test() { - ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = nullptr; - params.no_alloc = false; - - ggml_context* ctx = ggml_init(params); - GGML_ASSERT(ctx != nullptr); - - { - std::string text("a lovely cat"); - auto tokens_and_weights = tokenize(text, 512, true); - std::vector& tokens = std::get<0>(tokens_and_weights); - std::vector& weights = std::get<1>(tokens_and_weights); - std::vector& masks = std::get<2>(tokens_and_weights); - for (auto token : tokens) { - printf("%d ", token); - } - printf("\n"); - auto input_ids = sd::Tensor::from_vector(tokens); - auto attention_mask = sd::Tensor::from_vector(masks); - sd::Tensor out; - - int64_t t0 = ggml_time_ms(); - auto out_opt = model.compute(8, input_ids, attention_mask); - int64_t t1 = ggml_time_ms(); - - GGML_ASSERT(!out_opt.empty()); - out = std::move(out_opt); - print_sd_tensor(out); - LOG_DEBUG("t5 test done in %lldms", t1 - t0); - } - } - - static void load_from_file_and_test(const std::string& file_path) { - // cpu f16: pass - // cpu f32: pass - // cuda f16: pass - // cuda f32: pass - // cuda q8_0: pass - // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = sd_backend_cpu_init(); - ggml_type model_data_type = GGML_TYPE_F16; - - auto model_manager = std::make_shared(); - ModelLoader& model_loader = model_manager->loader(); - if (!model_loader.init_from_file_and_convert_name(file_path)) { - LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); - return; - } - - auto& tensor_storage_map = model_loader.get_tensor_storage_map(); - for (auto& [name, tensor_storage] : tensor_storage_map) { - if (ends_with(name, "weight")) { - tensor_storage.expected_type = model_data_type; - } - } - - 
std::shared_ptr t5 = std::make_shared(backend, tensor_storage_map, "", true, model_manager); - - if (!model_manager->register_runner_params("T5 test", - *t5, - "", - ModelManager::ResidencyMode::ParamBackend, - backend, - backend) || - !model_manager->validate_registered_tensors()) { - LOG_ERROR("register t5 tensors with model manager failed"); - return; - } - - LOG_INFO("t5 model loaded"); - t5->test(); - } -}; - -#endif // __SD_MODEL_TE_T5_HPP__ +#ifndef __SD_MODEL_TE_T5_HPP__ +#define __SD_MODEL_TE_T5_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/ggml_extend.hpp" +#include "model_loader.h" +#include "model_manager.h" +#include "tokenizers/t5_unigram_tokenizer.h" + +struct T5Config { + int64_t num_layers = 24; + int64_t model_dim = 4096; + int64_t ff_dim = 10240; + int64_t num_heads = 64; + int64_t vocab_size = 32128; + bool relative_attention = true; + + static T5Config detect_from_weights(const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + bool is_umt5 = false) { + (void)tensor_storage_map; + (void)prefix; + T5Config config; + if (is_umt5) { + config.vocab_size = 256384; + config.relative_attention = false; + } + return config; + } +}; + +class T5LayerNorm : public UnaryBlock { +protected: + int64_t hidden_size; + float eps; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = GGML_TYPE_F32; + params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); + } + +public: + T5LayerNorm(int64_t hidden_size, + float eps = 1e-06f) + : hidden_size(hidden_size), + eps(eps) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["weight"]; + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul(ctx->ggml_ctx, x, w); + return x; + } +}; + +struct T5DenseActDense : public UnaryBlock { +public: + T5DenseActDense(int64_t 
model_dim, int64_t ff_dim) { + blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi = std::dynamic_pointer_cast(blocks["wi"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + x = wi->forward(ctx, x); + x = ggml_relu_inplace(ctx->ggml_ctx, x); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5DenseGatedActDense : public UnaryBlock { +public: + T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + float scale = 1.f / 32.f; + // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). + blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); + auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); + auto hidden_linear = wi_1->forward(ctx, x); + x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5LayerFF : public UnaryBlock { +public: + T5LayerFF(int64_t model_dim, int64_t ff_dim) { + blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); + auto layer_norm = 
std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto forwarded_states = layer_norm->forward(ctx, x); + forwarded_states = DenseReluDense->forward(ctx, forwarded_states); + x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); + return x; + } +}; + +class T5Attention : public GGMLBlock { +protected: + int64_t model_dim; + int64_t inner_dim; + int64_t num_heads; + bool using_relative_attention_bias; + int64_t relative_attention_num_buckets = 32; + int64_t relative_attention_max_distance = 128; + +public: + T5Attention(int64_t model_dim, + int64_t inner_dim, + int64_t num_heads, + bool using_relative_attention_bias = false) + : model_dim(model_dim), + inner_dim(inner_dim), + num_heads(num_heads), + using_relative_attention_bias(using_relative_attention_bias) { + blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); + if (using_relative_attention_bias) { + blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); + } + } + + ggml_tensor* compute_bias(GGMLRunnerContext* ctx, + ggml_tensor* relative_position_bucket) { + auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); + + auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) + values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) + return values; + } + + // x: [N, n_token, model_dim] + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = 
std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto out_proj = std::dynamic_pointer_cast(blocks["o"]); + + int64_t n_head = num_heads; + int64_t d_head = inner_dim / n_head; + + auto q = q_proj->forward(ctx, x); + auto k = k_proj->forward(ctx, x); + auto v = v_proj->forward(ctx, x); + + if (using_relative_attention_bias && relative_position_bucket != nullptr) { + past_bias = compute_bias(ctx, relative_position_bucket); + } + if (past_bias != nullptr) { + if (mask != nullptr) { + mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); + mask = ggml_add(ctx->ggml_ctx, mask, past_bias); + } else { + mask = past_bias; + } + } + + k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); + + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] + + x = out_proj->forward(ctx, x); // [N, n_token, model_dim] + return {x, past_bias}; + } +}; + +struct T5LayerSelfAttention : public GGMLBlock { +public: + T5LayerSelfAttention(int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool using_relative_attention_bias) { + blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto normed_hidden_state = layer_norm->forward(ctx, x); + auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); + auto output = ret.first; + past_bias = ret.second; + + x = ggml_add_inplace(ctx->ggml_ctx, output, 
x); + return {x, past_bias}; + } +}; + +struct T5Block : public GGMLBlock { +public: + T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { + blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); + blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); + auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); + + auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + x = layer_1->forward(ctx, x); + return {x, past_bias}; + } +}; + +struct T5Stack : public GGMLBlock { + int64_t num_layers; + +public: + T5Stack(int64_t num_layers, + int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool relative_attention = true) + : num_layers(num_layers) { + for (int i = 0; i < num_layers; i++) { + blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); + } + + blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr, + const std::string& graph_cut_prefix = "") { + // x: [N, n_token, model_dim] + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["block." 
+ std::to_string(i)]); + + auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + if (!graph_cut_prefix.empty()) { + sd::ggml_graph_cut::mark_graph_cut(x, graph_cut_prefix + ".block." + std::to_string(i), "x"); + } + } + + auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); + + x = final_layer_norm->forward(ctx, x); + return x; + } +}; + +struct T5 : public GGMLBlock { + T5Config config; + +public: + T5() {} + T5(T5Config config) + : config(config) { + blocks["encoder"] = std::shared_ptr(new T5Stack(config.num_layers, + config.model_dim, + config.model_dim, + config.ff_dim, + config.num_heads, + config.relative_attention)); + blocks["shared"] = std::shared_ptr(new Embedding(config.vocab_size, + config.model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // input_ids: [N, n_token] + + auto shared = std::dynamic_pointer_cast(blocks["shared"]); + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + + auto x = shared->forward(ctx, input_ids); + sd::ggml_graph_cut::mark_graph_cut(x, "t5.prelude", "x"); + x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket, "t5"); + return x; + } +}; + +struct T5Runner : public GGMLRunner { + T5Config config; + T5 model; + std::vector relative_position_bucket_vec; + + T5Runner(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map, + const std::string prefix, + bool is_umt5 = false, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), + config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) { + model = T5(config); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "t5"; + } + + void 
get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* relative_position_bucket, + ggml_tensor* attention_mask = nullptr) { + size_t N = input_ids->ne[1]; + size_t n_token = input_ids->ne[0]; + + auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] + return hidden_states; + } + + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, + const sd::Tensor& attention_mask_tensor = {}) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* input_ids = make_input(input_ids_tensor); + ggml_tensor* attention_mask = attention_mask_tensor.empty() ? nullptr : make_input(attention_mask_tensor); + + relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); + + // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { + // if (i % 77 == 0) { + // printf("\n"); + // } + // printf("%d ", relative_position_bucket_vec[i]); + // } + + auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, + GGML_TYPE_I32, + input_ids->ne[0], + input_ids->ne[0]); + set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); + + ggml_build_forward_expand(gf, hidden_states); + + return gf; + } + + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + const sd::Tensor& attention_mask, + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(input_ids, attention_mask); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, 
free_compute_params), 3); + } + + static std::vector _relative_position_bucket(const std::vector& relative_position, + bool bidirectional = true, + int num_buckets = 32, + int max_distance = 128) { + std::vector relative_buckets(relative_position.size(), 0); + std::vector abs_relative_position = relative_position; + + if (bidirectional) { + num_buckets = num_buckets / 2; + for (size_t i = 0; i < relative_position.size(); ++i) { + if (relative_position[i] > 0) { + relative_buckets[i] += num_buckets; + } + abs_relative_position[i] = std::abs(relative_position[i]); + } + } else { + for (size_t i = 0; i < relative_position.size(); ++i) { + abs_relative_position[i] = std::max(-relative_position[i], 0); + } + } + + int max_exact = num_buckets / 2; + std::vector relative_position_if_large(relative_position.size(), 0); + + for (size_t i = 0; i < relative_position.size(); ++i) { + if (abs_relative_position[i] < max_exact) { + relative_buckets[i] += abs_relative_position[i]; + } else { + float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); + float log_base = std::log(static_cast(max_distance) / max_exact); + relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); + relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); + relative_buckets[i] += relative_position_if_large[i]; + } + } + + return relative_buckets; + } + + std::vector compute_relative_position_bucket(int query_length, + int key_length) { + std::vector context_position(query_length); + std::vector memory_position(key_length); + + for (int i = 0; i < query_length; ++i) { + context_position[i] = i; + } + for (int i = 0; i < key_length; ++i) { + memory_position[i] = i; + } + + std::vector> relative_position(query_length, std::vector(key_length, 0)); + for (int i = 0; i < query_length; ++i) { + for (int j = 0; j < key_length; ++j) { + relative_position[i][j] = memory_position[j] - context_position[i]; + } + } 
+ + std::vector relative_position_bucket; + for (int i = 0; i < query_length; ++i) { + std::vector result = _relative_position_bucket(relative_position[i], true); + relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); + } + + return relative_position_bucket; + } +}; + +struct T5Embedder { + T5UniGramTokenizer tokenizer; + T5Runner model; + + T5Embedder(ggml_backend_t backend, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool is_umt5 = false, + std::shared_ptr weight_manager = nullptr) + : model(backend, tensor_storage_map, prefix, is_umt5, weight_manager), tokenizer(is_umt5) { + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + std::tuple, std::vector, std::vector> tokenize(std::string text, + size_t max_length = 0, + bool padding = false) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector tokens; + std::vector weights; + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = tokenizer.encode(curr_text); + tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); + weights.insert(weights.end(), curr_tokens.size(), curr_weight); + } + + std::vector attention_mask; + + tokenizer.pad_tokens(tokens, &weights, &attention_mask, padding ? max_length : 0, padding ? max_length : 100000000, padding); + for (auto& mask_value : attention_mask) { + mask_value = mask_value > 0.0f ? 
0.0f : -HUGE_VALF; + } + + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << tokens[i] << ":" << weights[i] << ", "; + // } + // std::cout << std::endl; + + return {tokens, weights, attention_mask}; + } + + void test() { + ggml_init_params params; + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + params.mem_buffer = nullptr; + params.no_alloc = false; + + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); + + { + std::string text("a lovely cat"); + auto tokens_and_weights = tokenize(text, 512, true); + std::vector& tokens = std::get<0>(tokens_and_weights); + std::vector& weights = std::get<1>(tokens_and_weights); + std::vector& masks = std::get<2>(tokens_and_weights); + for (auto token : tokens) { + printf("%d ", token); + } + printf("\n"); + auto input_ids = sd::Tensor::from_vector(tokens); + auto attention_mask = sd::Tensor::from_vector(masks); + sd::Tensor out; + + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, attention_mask); + int64_t t1 = ggml_time_ms(); + + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); + LOG_DEBUG("t5 test done in %lldms", t1 - t0); + } + } + + static void load_from_file_and_test(const std::string& file_path) { + // cpu f16: pass + // cpu f32: pass + // cuda f16: pass + // cuda f32: pass + // cuda q8_0: pass + // ggml_backend_t backend = ggml_backend_cuda_init(0); + ggml_backend_t backend = sd_backend_cpu_init(); + ggml_type model_data_type = GGML_TYPE_F16; + + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); + if (!model_loader.init_from_file_and_convert_name(file_path)) { + LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); + return; + } + + auto& tensor_storage_map = model_loader.get_tensor_storage_map(); + for (auto& [name, tensor_storage] : tensor_storage_map) { + if (ends_with(name, "weight")) { + tensor_storage.expected_type = model_data_type; + } + } + + 
std::shared_ptr t5 = std::make_shared(backend, tensor_storage_map, "", true, model_manager); + + if (!model_manager->register_runner_params("T5 test", + *t5, + "", + ModelManager::ResidencyMode::ParamBackend, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register t5 tensors with model manager failed"); + return; + } + + LOG_INFO("t5 model loaded"); + t5->test(); + } +}; + +#endif // __SD_MODEL_TE_T5_HPP__ diff --git a/src/model_loader.h b/src/model_loader.h index 4dc700f20..529f3e890 100644 --- a/src/model_loader.h +++ b/src/model_loader.h @@ -27,6 +27,8 @@ struct MmapTensorStore { std::shared_ptr mmbuffer; }; +bool is_unused_tensor(const std::string& name); + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 836b0f85b..13a5e14ed 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -12,6 +12,8 @@ #include "core/rng_mt19937.hpp" #include "core/rng_philox.hpp" #include "core/util.h" + +#include "backend_fit.hpp" #include "model_loader.h" #include "model_manager.h" #include "stable-diffusion.h" @@ -194,6 +196,27 @@ class StableDiffusionGGML { std::string backend_spec; std::string params_backend_spec; + // DiT multi-GPU split decision captured from the auto-fit plan and applied + // to the diffusion runner(s) before param load. OFF when the DiT is not + // split. device_ids[0] is the "main" GPU (largest); share_bytes is the + // per-device VRAM share (same order as device_ids). + backend_fit::MultiGpuMode fit_dit_split_mode = backend_fit::MultiGpuMode::OFF; + std::vector fit_dit_split_device_names; // ggml device names, [0] = main + std::vector fit_dit_split_share_bytes; + // Conditioner (LLM) split decision — always layer-split when it splits + // (only the DiT ever row-splits; see backend_fit::supports_tensor_split). 
+ backend_fit::MultiGpuMode fit_cond_split_mode = backend_fit::MultiGpuMode::OFF; + std::vector fit_cond_split_device_names; + std::vector fit_cond_split_share_bytes; + + // Auto-fit decided the components can't all be resident at once (the + // per-component MAX plan only fits if they time-share), so defer the heavy + // components' param alloc+load to their compute phase and free after. + bool auto_lazy_load = false; + // auto-fit is on: when a VAE decode OOMs we may auto-enable tiling and retry + // (temporal for LTX video, spatial otherwise) instead of failing. + bool auto_fit_enabled = false; + bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; @@ -233,6 +256,12 @@ class StableDiffusionGGML { return params_backend_for(module) != nullptr; } + // Initialize the backend manager from backend_spec / params_backend_spec. + // These hold the user's --backend / --params-backend by default, but when + // auto-fit is enabled they are overwritten with the computed plan before + // this runs. The keep_*_on_cpu shortcuts were replaced by the spec + // mechanism (e.g. "vae=cpu"), so they are always false here. + template bool register_runner_params(const std::string& desc, const std::shared_ptr& model, @@ -265,6 +294,338 @@ class StableDiffusionGGML { return ensure_backend_pair(SDBackendModule::DIFFUSION); } + // Parse a transformer block index out of a weight name, or -1 if none. 
+ static int dit_block_index_of(const std::string& name) { + static const char* kw[] = {"transformer_blocks.", "joint_blocks.", "double_blocks.", + "single_blocks.", "blocks.", "layers."}; + for (const char* k : kw) { + size_t p = name.find(k); + if (p == std::string::npos) { + continue; + } + p += strlen(k); + size_t e = p; + while (e < name.size() && name[e] >= '0' && name[e] <= '9') { + e++; + } + if (e > p) { + return atoi(name.substr(p, e - p).c_str()); + } + } + return -1; + } + + // Build a MultiBackendSpec from the auto-fit DiT split decision and apply it + // to a diffusion runner BEFORE its params are allocated. No-op when the DiT + // is not split. Always returns true (any failure falls back to single-GPU). + bool apply_dit_multi_gpu_split(const std::shared_ptr& runner, + ModelLoader& model_loader) { + if (!runner || fit_dit_split_mode == backend_fit::MultiGpuMode::OFF || + fit_dit_split_device_names.size() < 2) { + return true; + } + const auto& devnames = fit_dit_split_device_names; + const auto& shares = fit_dit_split_share_bytes; + ggml_backend_t main_backend = runner->get_runtime_backend(); + MultiBackendSpec spec; + + if (fit_dit_split_mode == backend_fit::MultiGpuMode::ROW) { + // ROW: one main backend; matmul rows are split across the devices by + // the stock split buft. sched still needs the extra backends so it + // can route the cross-device reductions. 
+ auto reg_prefix_of = [](const std::string& n) -> std::string { + size_t i = 0; + while (i < n.size() && !(n[i] >= '0' && n[i] <= '9')) { + i++; + } + return n.substr(0, i); + }; + std::string reg_name = reg_prefix_of(devnames[0]); + ggml_backend_reg_t reg = ggml_backend_reg_by_name(reg_name.c_str()); + if (reg == nullptr) { + LOG_WARN("row-split: backend registry '%s' not found; using single GPU", reg_name.c_str()); + return true; + } + int dev_count = (int)ggml_backend_reg_dev_count(reg); + if (dev_count <= 0) { + return true; + } + auto reg_index_of = [&](const std::string& n) -> int { + if (n.rfind(reg_name, 0) != 0) { + return -1; + } + try { + return std::stoi(n.substr(reg_name.size())); + } catch (...) { + return -1; + } + }; + int64_t total = 0; + for (auto b : shares) { + total += b; + } + if (total <= 0) { + return true; + } + std::vector ratios(dev_count, 0.f); + for (size_t k = 0; k < devnames.size(); k++) { + int idx = reg_index_of(devnames[k]); + if (idx < 0 || idx >= dev_count) { + continue; + } + ratios[idx] = float(double(shares[k]) / double(total)); + } + // The main device must be the runner's runtime backend, which the + // planner set to devnames[0] (the largest-VRAM GPU, listed first). + // Keeping these aligned ensures the split buft's non-split portion + // and the runner's compute buffer live on the same device. 
+ int main_dev = reg_index_of(devnames[0]); + if (main_dev < 0 || main_dev >= dev_count) { + return true; + } + for (size_t k = 0; k < devnames.size(); k++) { + int idx = reg_index_of(devnames[k]); + if (idx == main_dev || idx < 0) { + continue; + } + ggml_backend_t b = backend_manager.ensure_backend(devnames[k]); + if (b != nullptr) { + spec.additional_backends.push_back(b); + } else { + LOG_WARN("row-split: failed to init backend %s", devnames[k].c_str()); + } + } + spec.mode = MultiBackendMode::ROW_SPLIT; + spec.tensor_split_ratios = ratios; + spec.main_device = main_dev; + LOG_INFO("DiT row-split across %zu devices (main reg-index %d)", devnames.size(), main_dev); + } else { + // LAYER: assign contiguous block ranges to per-device backends. + std::vector all_backends; + all_backends.push_back(main_backend); + for (size_t k = 1; k < devnames.size(); k++) { + ggml_backend_t b = backend_manager.ensure_backend(devnames[k]); + if (b == nullptr) { + LOG_WARN("layer-split: failed to init backend %s; using single GPU", devnames[k].c_str()); + return true; + } + spec.additional_backends.push_back(b); + all_backends.push_back(b); + } + const std::string tensor_prefix = "model.diffusion_model."; + std::map block_bytes; + int64_t non_block_bytes = 0; + int max_block_idx = -1; + for (const auto& kv : model_loader.get_tensor_storage_map()) { + if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + int idx = dit_block_index_of(kv.first); + if (idx >= 0) { + block_bytes[idx] += bytes; + if (idx > max_block_idx) { + max_block_idx = idx; + } + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + LOG_WARN("layer-split: no transformer blocks found; using single GPU"); + return true; + } + const int n_blocks = max_block_idx + 1; + int64_t total_share = 0, total_block = 0; + for (auto s : shares) { + total_share += s; + } + for (const auto& kv : block_bytes) { + total_block += 
kv.second; + } + if (total_share <= 0) { + return true; + } + std::vector budgets(shares.size(), 0); + for (size_t k = 0; k < shares.size(); k++) { + int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share)); + if (k == 0) { + b = std::max(b - non_block_bytes, 0); // backend 0 also holds non-block weights + } + budgets[k] = b; + } + std::vector boundaries(shares.size(), 0); + size_t cur = 0; + int64_t cur_use = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) { + boundaries[cur] = b; + cur++; + cur_use = 0; + } + cur_use += bb; + } + for (size_t k = cur; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) { + boundaries[k] = std::min(min_bound, n_blocks); + } + } + // Map each param tensor pointer to its backend (block range -> device). + auto ptr_backend = std::make_shared>(); + std::map dit_map; + runner->get_param_tensors(dit_map); + for (const auto& kv : dit_map) { + ggml_backend_t target = all_backends[0]; + if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) == 0) { + int idx = dit_block_index_of(kv.first); + if (idx >= 0) { + for (size_t k = 0; k < boundaries.size(); k++) { + if (idx < boundaries[k]) { + target = all_backends[std::min(k, all_backends.size() - 1)]; + break; + } + } + } + } + (*ptr_backend)[kv.second] = target; + } + spec.mode = MultiBackendMode::LAYER_SPLIT; + spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t { + auto it = ptr_backend->find(t); + return it != ptr_backend->end() ? 
it->second : main_backend; + }; + LOG_INFO("DiT layer-split: %d blocks across %zu devices", n_blocks, all_backends.size()); + } + + runner->set_multi_backend_spec(spec); + return true; + } + + // Conditioner (LLM) layer-split: same block-partition approach as the DiT + // layer-split, but applied to the conditioner's LLM sub-runner (tensors + // under "text_encoders.llm."). LAYER only — the conditioner never row-splits + // (only the DiT does, preserving the single-row-component invariant). The + // conditioner's small projector stays on the main backend. + bool apply_cond_multi_gpu_split(const std::shared_ptr& cond, ModelLoader& model_loader) { + if (!cond || fit_cond_split_mode == backend_fit::MultiGpuMode::OFF || + fit_cond_split_device_names.size() < 2) { + return true; + } + ggml_backend_t main_backend = backend_for(SDBackendModule::TE); + if (main_backend == nullptr) { + return true; + } + const auto& devnames = fit_cond_split_device_names; + const auto& shares = fit_cond_split_share_bytes; + std::vector all_backends; + all_backends.push_back(main_backend); + MultiBackendSpec spec; + for (size_t k = 1; k < devnames.size(); k++) { + ggml_backend_t b = backend_manager.ensure_backend(devnames[k]); + if (b == nullptr) { + LOG_WARN("cond layer-split: failed to init backend %s; using single GPU", devnames[k].c_str()); + return true; + } + spec.additional_backends.push_back(b); + all_backends.push_back(b); + } + const std::string tensor_prefix = "text_encoders.llm."; + std::map block_bytes; + int64_t non_block_bytes = 0; + int max_block_idx = -1; + for (const auto& kv : model_loader.get_tensor_storage_map()) { + if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; + } + int64_t bytes = (int64_t)kv.second.nbytes(); + int idx = dit_block_index_of(kv.first); + if (idx >= 0) { + block_bytes[idx] += bytes; + if (idx > max_block_idx) { + max_block_idx = idx; + } + } else { + non_block_bytes += bytes; + } + } + if (max_block_idx < 0) { + 
LOG_WARN("cond layer-split: no transformer blocks under '%s'; using single GPU", tensor_prefix.c_str()); + return true; + } + const int n_blocks = max_block_idx + 1; + int64_t total_share = 0, total_block = 0; + for (auto s : shares) { + total_share += s; + } + for (const auto& kv : block_bytes) { + total_block += kv.second; + } + if (total_share <= 0) { + return true; + } + std::vector budgets(shares.size(), 0); + for (size_t k = 0; k < shares.size(); k++) { + int64_t b = int64_t(double(total_block + non_block_bytes) * double(shares[k]) / double(total_share)); + if (k == 0) { + b = std::max(b - non_block_bytes, 0); + } + budgets[k] = b; + } + std::vector boundaries(shares.size(), 0); + size_t cur = 0; + int64_t cur_use = 0; + for (int b = 0; b < n_blocks; b++) { + int64_t bb = block_bytes[b]; + if (cur + 1 < shares.size() && cur_use + bb > budgets[cur] && cur_use > 0) { + boundaries[cur] = b; + cur++; + cur_use = 0; + } + cur_use += bb; + } + for (size_t k = cur; k < boundaries.size(); k++) { + boundaries[k] = n_blocks; + } + for (size_t k = 0; k < boundaries.size(); k++) { + int min_bound = (k > 0 ? 
boundaries[k - 1] : 0) + 1; + if (boundaries[k] < min_bound) { + boundaries[k] = std::min(min_bound, n_blocks); + } + } + auto ptr_backend = std::make_shared>(); + std::map cond_map; + cond->get_param_tensors(cond_map); + for (const auto& kv : cond_map) { + if (kv.first.compare(0, tensor_prefix.size(), tensor_prefix) != 0) { + continue; // only the LLM tensors are split; projector stays on main + } + ggml_backend_t target = all_backends[0]; + int idx = dit_block_index_of(kv.first); + if (idx >= 0) { + for (size_t k = 0; k < boundaries.size(); k++) { + if (idx < boundaries[k]) { + target = all_backends[std::min(k, all_backends.size() - 1)]; + break; + } + } + } + (*ptr_backend)[kv.second] = target; + } + spec.mode = MultiBackendMode::LAYER_SPLIT; + spec.tensor_backend_fn = [ptr_backend, main_backend](ggml_tensor* t) -> ggml_backend_t { + auto it = ptr_backend->find(t); + return it != ptr_backend->end() ? it->second : main_backend; + }; + cond->set_multi_backend_spec(spec); + LOG_INFO("Conditioner LLM layer-split: %d blocks across %zu devices", n_blocks, all_backends.size()); + return true; + } + std::shared_ptr get_rng(rng_type_t rng_type) { if (rng_type == STD_DEFAULT_RNG) { return std::make_shared(); @@ -347,21 +708,10 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - if (!init_backend()) { - return false; - } - { - std::string error; - if (!max_vram_assignment.canonicalize_backend_keys(&error)) { - LOG_ERROR("%s", error.c_str()); - return false; - } - } - if (stream_layers && !backend_manager.params_backend_is_cpu(SDBackendModule::DIFFUSION)) { - LOG_WARN("--stream-layers has no effect unless diffusion params backend is cpu; ignoring"); - stream_layers = false; - } - + // Backend initialization is deferred until after the model metadata is + // loaded, so auto-fit can size the components and choose device + // placements before the backends are created (see the auto-fit block + // below, which feeds its plan into init_backend()). 
model_manager = std::make_shared(); model_manager->set_n_threads(n_threads); model_manager->set_enable_mmap(enable_mmap); @@ -523,6 +873,185 @@ class StableDiffusionGGML { return oss.str(); }; + auto_fit_enabled = sd_ctx_params->auto_fit; + if (sd_ctx_params->auto_fit) { + if (!backend_spec.empty() || !params_backend_spec.empty()) { + LOG_WARN("auto-fit is enabled; ignoring --backend / --params-backend " + "(pass --no-auto-fit to set device placement manually)"); + } + + backend_fit::ComputeReserves reserves; + // Parse the per-component reserve map ("dit=2048,vae=1024,cond=512"). + // Missing keys keep the built-in defaults. + if (sd_ctx_params->auto_fit_compute_reserve != nullptr) { + std::string spec(sd_ctx_params->auto_fit_compute_reserve); + size_t pos = 0; + while (pos < spec.size()) { + size_t comma = spec.find(',', pos); + std::string entry = spec.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos); + pos = comma == std::string::npos ? spec.size() : comma + 1; + size_t eq = entry.find('='); + if (eq == std::string::npos) { + LOG_WARN("auto-fit: ignoring malformed compute-reserve entry '%s' (expected component=MiB)", entry.c_str()); + continue; + } + std::string key = entry.substr(0, eq); + int64_t mib = std::atoll(entry.c_str() + eq + 1); + if (mib <= 0) { + LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (value must be a positive MiB count)", entry.c_str()); + continue; + } + backend_fit::ComponentKind kind; + if (key == "dit" || key == "diffusion" || key == "model" || key == "unet") { + kind = backend_fit::ComponentKind::DIT; + } else if (key == "vae") { + kind = backend_fit::ComponentKind::VAE; + } else if (key == "cond" || key == "conditioner" || key == "te" || key == "clip") { + kind = backend_fit::ComponentKind::CONDITIONER; + } else { + LOG_WARN("auto-fit: ignoring compute-reserve entry '%s' (unknown component, expected dit/vae/cond)", entry.c_str()); + continue; + } + switch (kind) { + case 
backend_fit::ComponentKind::DIT: + reserves.dit_bytes = mib * backend_fit::MiB; + break; + case backend_fit::ComponentKind::VAE: + reserves.vae_bytes = mib * backend_fit::MiB; + break; + case backend_fit::ComponentKind::CONDITIONER: + reserves.conditioner_bytes = mib * backend_fit::MiB; + break; + } + } + } + auto components = backend_fit::estimate_components( + model_loader, wtype, /*alignment=*/64, reserves); + auto devices = backend_fit::enumerate_gpu_devices(); + int64_t margin_bytes = + int64_t(std::max(0, sd_ctx_params->auto_fit_target_mb)) * backend_fit::MiB; + backend_fit::MultiGpuMode multi_gpu_mode = + backend_fit::str_to_multi_gpu_mode(SAFE_STR(sd_ctx_params->multi_gpu_mode)); + auto plan = backend_fit::compute_plan( + components, devices, margin_bytes, sd_ctx_params->auto_multi_gpu, multi_gpu_mode); + backend_fit::print_plan(plan, components, devices, margin_bytes); + + if (sd_ctx_params->auto_fit_dry_run) { + LOG_INFO("auto-fit: --fit-dry-run set, aborting init before loading models"); + return false; + } + + // Translate the plan into the backend-assignment specs consumed by + // SDBackendManager. Each component lives entirely on one device: + // GPU -> runtime= (params follow runtime) + // GPU_OFFLOAD_PARAMS -> runtime=, params=cpu (params streamed from RAM) + // CPU -> runtime=cpu (params follow runtime) + // Modules the planner doesn't cover (clip_vision, control_net, + // photomaker, upscaler) fall back to the default backend. 
+ std::string runtime_spec; + std::string params_spec; + auto append_assignment = [](std::string& spec, const char* key, const std::string& value) { + if (!spec.empty()) { + spec += ","; + } + spec += key; + spec += "="; + spec += value; + }; + auto dev_name_by_id = [&](int id) -> std::string { + for (const auto& dev : devices) { + if (dev.id == id) { + return dev.name; + } + } + return ""; + }; + auto apply_decision = [&](const backend_fit::Decision* d, const char* module_key) { + if (d == nullptr) { + return; + } + if (d->placement == backend_fit::Placement::CPU) { + append_assignment(runtime_spec, module_key, "cpu"); + return; + } + // Multi-GPU split (DiT only): the runner's main backend is the + // largest participating GPU (split_device_ids[0]); the actual + // per-tensor distribution is applied later via a MultiBackendSpec + // (see prepare_*_split_spec). Record the decision for that step. + if (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT || + d->placement == backend_fit::Placement::GPU_LAYER_SPLIT) { + std::string main_dev = d->split_device_ids.empty() ? "" : dev_name_by_id(d->split_device_ids[0]); + if (main_dev.empty()) { + return; // fall back to default backend + } + append_assignment(runtime_spec, module_key, main_dev); + backend_fit::MultiGpuMode m = (d->placement == backend_fit::Placement::GPU_TENSOR_SPLIT) + ? 
backend_fit::MultiGpuMode::ROW + : backend_fit::MultiGpuMode::LAYER; + std::vector names; + for (int id : d->split_device_ids) { + names.push_back(dev_name_by_id(id)); + } + if (std::string(module_key) == "diffusion") { + fit_dit_split_mode = m; + fit_dit_split_device_names = names; + fit_dit_split_share_bytes = d->split_share_bytes; + } else if (std::string(module_key) == "te") { + fit_cond_split_mode = m; + fit_cond_split_device_names = names; + fit_cond_split_share_bytes = d->split_share_bytes; + } + return; + } + std::string dev_name = dev_name_by_id(d->device_id); + if (dev_name.empty()) { + return; // no matching device; fall back to the default backend + } + append_assignment(runtime_spec, module_key, dev_name); + if (d->placement == backend_fit::Placement::GPU_OFFLOAD_PARAMS) { + append_assignment(params_spec, module_key, "cpu"); + } + }; + apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::DIT), "diffusion"); + apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::CONDITIONER), "te"); + apply_decision(backend_fit::find_decision(plan, backend_fit::ComponentKind::VAE), "vae"); + + backend_spec = runtime_spec; + params_backend_spec = params_spec; + LOG_INFO("auto-fit: backend spec '%s', params backend spec '%s'", + backend_spec.empty() ? "(default)" : backend_spec.c_str(), + params_backend_spec.empty() ? "(none)" : params_backend_spec.c_str()); + + // When a component is split across GPUs the working set is tight: + // the split component (and the others sharing those GPUs) cannot all + // be resident at once. Enable lazy-load so the DiT / conditioner / + // VAE defer their param alloc+load to their compute phase and free + // after, time-sharing VRAM (the per-component MAX plan assumes this). 
+ if (fit_dit_split_mode != backend_fit::MultiGpuMode::OFF || + fit_cond_split_mode != backend_fit::MultiGpuMode::OFF) { + auto_lazy_load = true; + LOG_INFO("auto-fit: enabling lazy-load (components time-share VRAM across phases)"); + } + } + + // Create the backends now that the placement (manual or auto-fit) is + // settled, then canonicalize graph-cut VRAM budget assignments against + // the initialized backend registry. + if (!init_backend()) { + return false; + } + { + std::string error; + if (!max_vram_assignment.canonicalize_backend_keys(&error)) { + LOG_ERROR("%s", error.c_str()); + return false; + } + } + if (stream_layers && !backend_manager.params_backend_is_cpu(SDBackendModule::DIFFUSION)) { + LOG_WARN("--stream-layers has no effect unless diffusion params backend is cpu; ignoring"); + stream_layers = false; + } + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); @@ -810,8 +1339,19 @@ class StableDiffusionGGML { return false; } + // When the DiT is split across GPUs its params live resident in the + // (per-device) split buffers, so it must not be mmap'd and must not + // use the RAM-streaming path (mutually exclusive with split). + const bool dit_split = fit_dit_split_mode != backend_fit::MultiGpuMode::OFF && + fit_dit_split_device_names.size() >= 2; + if (dit_split && stream_layers) { + LOG_WARN("--stream-layers is ignored for the diffusion model when it is " + "split across GPUs (--multi-gpu-mode=%s)", + backend_fit::multi_gpu_mode_str(fit_dit_split_mode)); + } + diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes_for_module(SDBackendModule::DIFFUSION)); - diffusion_model->set_stream_layers_enabled(stream_layers); + diffusion_model->set_stream_layers_enabled(dit_split ? 
false : stream_layers); if (!register_runner_params("Diffusion model", diffusion_model, SDBackendModule::DIFFUSION, @@ -821,7 +1361,7 @@ class StableDiffusionGGML { if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes_for_module(SDBackendModule::DIFFUSION)); - high_noise_diffusion_model->set_stream_layers_enabled(stream_layers); + high_noise_diffusion_model->set_stream_layers_enabled(dit_split ? false : stream_layers); if (!register_runner_params("High noise diffusion model", high_noise_diffusion_model, SDBackendModule::DIFFUSION, @@ -1099,6 +1639,59 @@ class StableDiffusionGGML { ignore_tensors.insert("model.visual.deepstack_merger_list."); } + // --- Multi-GPU split + lazy-load (auto-fit) ------------------------ + // Apply split specs before any params are prepared. Split runners use + // runner-owned buffers, so their weight manager is disabled and their + // tensors are loaded directly by a lazy callback at first compute. + apply_dit_multi_gpu_split(diffusion_model, model_loader); + apply_dit_multi_gpu_split(high_noise_diffusion_model, model_loader); + apply_cond_multi_gpu_split(cond_stage_model, model_loader); + + if (auto_lazy_load) { + const bool lazy_mmap = sd_ctx_params->enable_mmap; + ModelLoader* loader_ptr = &model_loader; + auto make_lazy = [&](auto&& component, + const std::function&)>& collect, + const std::string& only_prefix) { + if (!component) { + return; + } + std::map all; + collect(all); + auto sub = std::make_shared>(); + for (const auto& kv : all) { + if (!only_prefix.empty() && + kv.first.compare(0, only_prefix.size(), only_prefix) != 0) { + continue; + } + (*sub)[kv.first] = kv.second; + } + if (sub->empty()) { + return; + } + component->set_weight_manager(nullptr); + component->set_lazy_load([loader_ptr, sub, lazy_mmap]() -> bool { + auto local = *sub; + return loader_ptr->load_tensors(local, {}, lazy_mmap); + }); + LOG_INFO("auto-fit: deferring %zu split tensors to first compute 
(lazy-load)", sub->size()); + }; + if (fit_dit_split_mode != backend_fit::MultiGpuMode::OFF) { + make_lazy(diffusion_model, + [&](std::map& m) { diffusion_model->get_param_tensors(m); }, + ""); + make_lazy(high_noise_diffusion_model, + [&](std::map& m) { high_noise_diffusion_model->get_param_tensors(m); }, + ""); + } + if (fit_cond_split_mode != backend_fit::MultiGpuMode::OFF) { + make_lazy(cond_stage_model, + [&](std::map& m) { cond_stage_model->get_param_tensors(m); }, + "text_encoders.llm."); + } + } + // ------------------------------------------------------------------ + model_manager->set_common_ignore_tensors(ignore_tensors); if (!model_manager->validate_registered_tensors()) { LOG_ERROR("model metadata validation failed"); @@ -2294,7 +2887,35 @@ class StableDiffusionGGML { } auto latents = first_stage_model->diffusion_to_vae_latents(x); first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); - return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); + auto decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); + // Auto-fit tiling fallback: a full-frame video decode can need ~10 GB of + // compute buffer and OOM (a graceful failure -> empty result, not an + // abort). Under auto-fit, enable tiling and retry once instead of failing. + // Temporal tiling is LTX-only (its 3D VAE supports temporal_tile_frames); + // every other architecture falls back to ordinary spatial tiling. + if (decoded.empty() && auto_fit_enabled) { + bool changed = false; + if (version == VERSION_LTXAV) { + if (!vae_tiling_params.temporal_tiling) { + vae_tiling_params.temporal_tiling = true; + changed = true; + } + } else if (!vae_tiling_params.enabled) { + vae_tiling_params.enabled = true; + // Reasonable default tile if the user didn't set one. 
+ if (vae_tiling_params.tile_size_x <= 0) vae_tiling_params.tile_size_x = 256; + if (vae_tiling_params.tile_size_y <= 0) vae_tiling_params.tile_size_y = 256; + changed = true; + } + if (changed) { + LOG_WARN("auto-fit: VAE decode failed (likely OOM); retrying with %s tiling", + version == VERSION_LTXAV ? "temporal" : "spatial"); + first_stage_model->free_compute_buffer(); + first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); + decoded = first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); + } + } + return decoded; } sd::Tensor normalize_ltx_video_latents(const sd::Tensor& x) { @@ -2641,6 +3262,12 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO; sd_ctx_params->backend = nullptr; sd_ctx_params->params_backend = nullptr; + sd_ctx_params->auto_fit = true; + sd_ctx_params->auto_fit_target_mb = 512; + sd_ctx_params->auto_fit_dry_run = false; + sd_ctx_params->auto_fit_compute_reserve = nullptr; + sd_ctx_params->auto_multi_gpu = true; + sd_ctx_params->multi_gpu_mode = "row"; sd_ctx_params->rpc_servers = nullptr; } @@ -2677,6 +3304,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "stream_layers: %s\n" "backend: %s\n" "params_backend: %s\n" + "auto_fit: %s\n" + "auto_fit_target_mb: %d\n" + "auto_fit_dry_run: %s\n" + "auto_fit_compute_reserve: %s\n" + "auto_multi_gpu: %s\n" + "multi_gpu_mode: %s\n" + "rpc_servers: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2711,6 +3345,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->stream_layers), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), + BOOL_STR(sd_ctx_params->auto_fit), + sd_ctx_params->auto_fit_target_mb, + BOOL_STR(sd_ctx_params->auto_fit_dry_run), + SAFE_STR(sd_ctx_params->auto_fit_compute_reserve), + BOOL_STR(sd_ctx_params->auto_multi_gpu), + 
SAFE_STR(sd_ctx_params->multi_gpu_mode), + SAFE_STR(sd_ctx_params->rpc_servers), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x),