Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/backend.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,21 +124,21 @@ Runtime and parameter assignments also share the same backend cache. If `--backe

## Compatibility flags

The older CPU placement flags are still supported:
The example CLI/server still accepts these older CPU placement flags as compatibility aliases:

- `--clip-on-cpu`
- `--vae-on-cpu`
- `--control-net-cpu`
- `--offload-to-cpu`

`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`.
`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` are deprecated. The example argument layer prepends `te=cpu`, `vae=cpu`, and `controlnet=cpu` to `--backend` before creating the context.

`--offload-to-cpu` prepends a CPU default to the parameter assignment before parsing:
`--offload-to-cpu` prepends a CPU default to the parameter assignment in the caller before creating the context:

```shell
--params-backend '*=cpu'
```

Because this default is inserted first, later explicit `--params-backend` entries can still override it; for example, `--offload-to-cpu --params-backend te=disk` keeps non-TE parameters on CPU and reloads TE parameters from disk.

Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
Library callers should set `backend` and `params_backend` directly. The old CPU/offload fields are no longer part of the C API. Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
2 changes: 1 addition & 1 deletion docs/performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Use CPU params to reduce VRAM usage:
--backend cuda0 --params-backend cpu
```

This keeps model weights in system RAM and moves them to the runtime backend when needed. `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend`, so explicit module assignments can still override it:
This keeps model weights in system RAM and moves them to the runtime backend when needed. In the example CLI/server, `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend` before creating the context, so explicit module assignments can still override it:

```shell
--offload-to-cpu --params-backend te=disk
Expand Down
6 changes: 3 additions & 3 deletions examples/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ Context Options:
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--control-net-cpu deprecated; use --backend controlnet=cpu
--clip-on-cpu deprecated; use --backend te=cpu
--vae-on-cpu deprecated; use --backend vae=cpu
--fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
Expand Down
5 changes: 2 additions & 3 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,12 +782,11 @@ int main(int argc, const char* argv[]) {
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct,
ctx_params.n_threads,
gen_params.upscale_tile_size,
ctx_params.backend.c_str(),
ctx_params.params_backend.c_str()));
sd_ctx_params.backend,
sd_ctx_params.params_backend));

if (upscaler_ctx == nullptr) {
LOG_ERROR("new_upscaler_ctx failed");
Expand Down
124 changes: 72 additions & 52 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ static sd_vae_format_t str_to_vae_format(const std::string& value) {
return SD_VAE_FORMAT_COUNT;
}

// Puts `assignment` at the front of the comma-separated backend spec `spec`.
// An empty spec becomes exactly `assignment`; otherwise the result is
// "<assignment>,<previous spec>".
static void prepend_backend_assignment(std::string& spec, const char* assignment) {
    if (spec.empty()) {
        spec = assignment;
        return;
    }

    // Build the new value separately, then swap it in.
    std::string combined(assignment);
    combined += ",";
    combined += spec;
    spec.swap(combined);
}

#if defined(_WIN32)
static std::string utf16_to_utf8(const std::wstring& wstr) {
if (wstr.empty())
Expand Down Expand Up @@ -463,15 +467,15 @@ ArgOptions SDContextParams::get_options() {
true, &enable_mmap},
{"",
"--control-net-cpu",
"keep controlnet in cpu (for low vram)",
"deprecated; use --backend controlnet=cpu",
true, &control_net_cpu},
{"",
"--clip-on-cpu",
"keep clip in cpu (for low vram)",
"deprecated; use --backend te=cpu",
true, &clip_on_cpu},
{"",
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
"deprecated; use --backend vae=cpu",
true, &vae_on_cpu},
{"",
"--fa",
Expand Down Expand Up @@ -688,6 +692,25 @@ bool SDContextParams::resolve_and_validate(SDMode mode) {
return true;
}

// Recomputes effective_backend / effective_params_backend from the explicit
// `backend` / `params_backend` specs plus the legacy CPU placement flags.
// Each enabled flag is prepended to the matching spec, so the flag-derived
// entry ends up before the explicitly supplied entries.
void SDContextParams::prepare_backend_assignments() {
    // Both effective specs are reset from the explicit ones on every call,
    // so calling this repeatedly does not accumulate prefixes.
    effective_params_backend = params_backend;
    if (offload_params_to_cpu) {
        prepend_backend_assignment(effective_params_backend, "*=cpu");
    }

    effective_backend = backend;

    struct LegacyPlacement {
        bool enabled;
        const char* assignment;
    };
    // Order matters: each enabled entry is prepended in turn, so a later
    // entry in this table lands in front of the earlier ones.
    const LegacyPlacement legacy_placements[] = {
        {clip_on_cpu, "te=cpu"},
        {vae_on_cpu, "vae=cpu"},
        {control_net_cpu, "controlnet=cpu"},
    };
    for (const LegacyPlacement& placement : legacy_placements) {
        if (placement.enabled) {
            prepend_backend_assignment(effective_backend, placement.assignment);
        }
    }
}

std::string SDContextParams::to_string() const {
std::ostringstream emb_ss;
emb_ss << "{\n";
Expand Down Expand Up @@ -758,6 +781,7 @@ std::string SDContextParams::to_string() const {
}

sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
prepare_backend_assignments();
embedding_vec.clear();
embedding_vec.reserve(embedding_map.size());
for (const auto& kv : embedding_map) {
Expand All @@ -767,55 +791,51 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
embedding_vec.emplace_back(item);
}

sd_ctx_params_t sd_ctx_params = {
model_path.c_str(),
clip_l_path.c_str(),
clip_g_path.c_str(),
clip_vision_path.c_str(),
t5xxl_path.c_str(),
llm_path.c_str(),
llm_vision_path.c_str(),
diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(),
uncond_diffusion_model_path.c_str(),
embeddings_connectors_path.c_str(),
vae_path.c_str(),
audio_vae_path.c_str(),
taesd_path.c_str(),
control_net_path.c_str(),
embedding_vec.data(),
static_cast<uint32_t>(embedding_vec.size()),
photo_maker_path.c_str(),
tensor_type_rules.c_str(),
n_threads,
wtype,
rng_type,
sampler_rng_type,
prediction,
lora_apply_mode,
offload_params_to_cpu,
enable_mmap,
clip_on_cpu,
control_net_cpu,
vae_on_cpu,
flash_attn,
diffusion_flash_attn,
taesd_preview,
diffusion_conv_direct,
vae_conv_direct,
circular || circular_x,
circular || circular_y,
force_sdxl_vae_conv_scale,
chroma_use_dit_mask,
chroma_use_t5_mask,
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
str_to_vae_format(vae_format),
max_vram,
stream_layers,
backend.c_str(),
params_backend.c_str(),
};
sd_ctx_params_t sd_ctx_params;
sd_ctx_params_init(&sd_ctx_params);
sd_ctx_params.model_path = model_path.c_str();
sd_ctx_params.clip_l_path = clip_l_path.c_str();
sd_ctx_params.clip_g_path = clip_g_path.c_str();
sd_ctx_params.clip_vision_path = clip_vision_path.c_str();
sd_ctx_params.t5xxl_path = t5xxl_path.c_str();
sd_ctx_params.llm_path = llm_path.c_str();
sd_ctx_params.llm_vision_path = llm_vision_path.c_str();
sd_ctx_params.diffusion_model_path = diffusion_model_path.c_str();
sd_ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path.c_str();
sd_ctx_params.uncond_diffusion_model_path = uncond_diffusion_model_path.c_str();
sd_ctx_params.embeddings_connectors_path = embeddings_connectors_path.c_str();
sd_ctx_params.vae_path = vae_path.c_str();
sd_ctx_params.audio_vae_path = audio_vae_path.c_str();
sd_ctx_params.taesd_path = taesd_path.c_str();
sd_ctx_params.control_net_path = control_net_path.c_str();
sd_ctx_params.embeddings = embedding_vec.data();
sd_ctx_params.embedding_count = static_cast<uint32_t>(embedding_vec.size());
sd_ctx_params.photo_maker_path = photo_maker_path.c_str();
sd_ctx_params.tensor_type_rules = tensor_type_rules.c_str();
sd_ctx_params.n_threads = n_threads;
sd_ctx_params.wtype = wtype;
sd_ctx_params.rng_type = rng_type;
sd_ctx_params.sampler_rng_type = sampler_rng_type;
sd_ctx_params.prediction = prediction;
sd_ctx_params.lora_apply_mode = lora_apply_mode;
sd_ctx_params.enable_mmap = enable_mmap;
sd_ctx_params.flash_attn = flash_attn;
sd_ctx_params.diffusion_flash_attn = diffusion_flash_attn;
sd_ctx_params.tae_preview_only = taesd_preview;
sd_ctx_params.diffusion_conv_direct = diffusion_conv_direct;
sd_ctx_params.vae_conv_direct = vae_conv_direct;
sd_ctx_params.circular_x = circular || circular_x;
sd_ctx_params.circular_y = circular || circular_y;
sd_ctx_params.force_sdxl_vae_conv_scale = force_sdxl_vae_conv_scale;
sd_ctx_params.chroma_use_dit_mask = chroma_use_dit_mask;
sd_ctx_params.chroma_use_t5_mask = chroma_use_t5_mask;
sd_ctx_params.chroma_t5_mask_pad = chroma_t5_mask_pad;
sd_ctx_params.qwen_image_zero_cond_t = qwen_image_zero_cond_t;
sd_ctx_params.vae_format = str_to_vae_format(vae_format);
sd_ctx_params.max_vram = max_vram;
sd_ctx_params.stream_layers = stream_layers;
sd_ctx_params.backend = effective_backend.c_str();
sd_ctx_params.params_backend = effective_params_backend.c_str();
return sd_ctx_params;
}

Expand Down
3 changes: 3 additions & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ struct SDContextParams {
bool stream_layers = false;
std::string backend;
std::string params_backend;
std::string effective_backend;
std::string effective_params_backend;
bool enable_mmap = false;
bool control_net_cpu = false;
bool clip_on_cpu = false;
Expand Down Expand Up @@ -175,6 +177,7 @@ struct SDContextParams {
float flow_shift = INFINITY;
ArgOptions get_options();
void build_embedding_map();
void prepare_backend_assignments();
bool resolve(SDMode mode);
bool validate(SDMode mode);
bool resolve_and_validate(SDMode mode);
Expand Down
6 changes: 3 additions & 3 deletions examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ Context Options:
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--control-net-cpu deprecated; use --backend controlnet=cpu
--clip-on-cpu deprecated; use --backend te=cpu
--vae-on-cpu deprecated; use --backend vae=cpu
--fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
Expand Down
5 changes: 0 additions & 5 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,7 @@ typedef struct {
enum rng_type_t sampler_rng_type;
enum prediction_t prediction;
enum lora_apply_mode_t lora_apply_mode;
bool offload_params_to_cpu;
bool enable_mmap;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool flash_attn;
bool diffusion_flash_attn;
bool tae_preview_only;
Expand Down Expand Up @@ -458,7 +454,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
typedef struct upscaler_ctx_t upscaler_ctx_t;

SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu,
bool direct,
int n_threads,
int tile_size,
Expand Down
15 changes: 0 additions & 15 deletions src/core/ggml_extend_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -545,9 +545,6 @@ bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule modu

bool SDBackendManager::init(const char* backend_spec,
const char* params_backend_spec,
bool keep_clip_on_cpu,
bool keep_vae_on_cpu,
bool keep_control_net_on_cpu,
std::string* error) {
reset();

Expand All @@ -558,18 +555,6 @@ bool SDBackendManager::init(const char* backend_spec,
return false;
}

if (runtime_assignment_.empty()) {
if (keep_clip_on_cpu) {
runtime_assignment_.set_module(SDBackendModule::TE, "cpu");
}
if (keep_vae_on_cpu) {
runtime_assignment_.set_module(SDBackendModule::VAE, "cpu");
}
if (keep_control_net_on_cpu) {
runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu");
}
}

return validate(error);
}

Expand Down
3 changes: 0 additions & 3 deletions src/core/ggml_extend_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ class SDBackendManager {

bool init(const char* backend_spec,
const char* params_backend_spec,
bool keep_clip_on_cpu,
bool keep_vae_on_cpu,
bool keep_control_net_on_cpu,
std::string* error);
void reset();

Expand Down
Loading
Loading