diff --git a/docs/backend.md b/docs/backend.md
index 248133bc1..2e3122369 100644
--- a/docs/backend.md
+++ b/docs/backend.md
@@ -124,16 +124,16 @@ Runtime and parameter assignments also share the same backend cache. If `--backe
 
 ## Compatibility flags
 
-The older CPU placement flags are still supported:
+The example CLI/server still accepts these older CPU placement flags as compatibility aliases:
 
 - `--clip-on-cpu`
 - `--vae-on-cpu`
 - `--control-net-cpu`
 - `--offload-to-cpu`
 
-`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`.
+`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` are deprecated. The example argument layer translates them to `te=cpu`, `vae=cpu`, and `controlnet=cpu` respectively and prepends those entries to `--backend` before creating the context.
 
-`--offload-to-cpu` prepends a CPU default to the parameter assignment before parsing:
+`--offload-to-cpu` is also handled by the example argument layer, which prepends a CPU default to the parameter assignment before creating the context:
 
 ```shell
 --params-backend '*=cpu'
@@ -141,4 +141,4 @@ The older CPU placement flags are still supported:
 
 Because this default is inserted first, later explicit `--params-backend` entries can still override it, for example `--offload-to-cpu --params-backend te=disk` keeps non-TE parameters on CPU and reloads TE parameters from disk.
 
-Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
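+Library callers should set `backend` and `params_backend` directly. The old CPU/offload fields (`offload_params_to_cpu`, `keep_clip_on_cpu`, `keep_control_net_on_cpu`, and `keep_vae_on_cpu`) are no longer part of the C API, and `new_upscaler_ctx` no longer takes an `offload_params_to_cpu` argument. Explicit `--backend` and `--params-backend` assignments are preferred for new commands.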
diff --git a/docs/performance.md b/docs/performance.md
index 2f526057f..ed86a4f7c 100644
--- a/docs/performance.md
+++ b/docs/performance.md
@@ -31,7 +31,7 @@ Use CPU params to reduce VRAM usage:
 --backend cuda0 --params-backend cpu
 ```
 
-This keeps model weights in system RAM and moves them to the runtime backend when needed. `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend`, so explicit module assignments can still override it:
+This keeps model weights in system RAM and moves them to the runtime backend when needed. In the example CLI/server, `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend` before creating the context, so explicit module assignments can still override it:
 
 ```shell
 --offload-to-cpu --params-backend te=disk
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 1b7c2731c..3df91eebf 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -63,9 +63,9 @@ Context Options:
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                            when needed
   --mmap                                   whether to memory-map model
-  --control-net-cpu                        keep controlnet in cpu (for low vram)
-  --clip-on-cpu                            keep clip in cpu (for low vram)
-  --vae-on-cpu                             keep vae in cpu (for low vram)
+  --control-net-cpu                        deprecated; use --backend controlnet=cpu
+  --clip-on-cpu                            deprecated; use --backend te=cpu
+  --vae-on-cpu                             deprecated; use --backend vae=cpu
   --fa                                     use flash attention
   --diffusion-fa                           use flash attention in the diffusion model only
   --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index e2854158d..bb5d6862c 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -782,12 +782,11 @@ int main(int argc, const char* argv[]) {
     int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
     if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
         UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
-                                                     ctx_params.offload_params_to_cpu,
                                                      ctx_params.diffusion_conv_direct,
                                                      ctx_params.n_threads,
                                                      gen_params.upscale_tile_size,
-                                                     ctx_params.backend.c_str(),
-                                                     ctx_params.params_backend.c_str()));
+                                                     sd_ctx_params.backend,
+                                                     sd_ctx_params.params_backend));
 
         if (upscaler_ctx == nullptr) {
             LOG_ERROR("new_upscaler_ctx failed");
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index cb19331ea..f0742f62f 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -51,6 +51,10 @@ static sd_vae_format_t str_to_vae_format(const std::string& value) {
     return SD_VAE_FORMAT_COUNT;
 }
 
+static void prepend_backend_assignment(std::string& spec, const char* assignment) {  // puts `assignment` ahead of any entries already in `spec`
+    spec.insert(0, spec.empty() ? std::string(assignment) : std::string(assignment) + ",");
+}
+
 #if defined(_WIN32)
 static std::string utf16_to_utf8(const std::wstring& wstr) {
     if (wstr.empty())
@@ -463,15 +467,15 @@ ArgOptions SDContextParams::get_options() {
          true, &enable_mmap},
         {"",
          "--control-net-cpu",
-         "keep controlnet in cpu (for low vram)",
+         "deprecated; use --backend controlnet=cpu",
          true, &control_net_cpu},
         {"",
          "--clip-on-cpu",
-         "keep clip in cpu (for low vram)",
+         "deprecated; use --backend te=cpu",
          true, &clip_on_cpu},
         {"",
          "--vae-on-cpu",
-         "keep vae in cpu (for low vram)",
+         "deprecated; use --backend vae=cpu",
          true, &vae_on_cpu},
         {"",
          "--fa",
@@ -688,6 +692,25 @@ bool SDContextParams::resolve_and_validate(SDMode mode) {
     return true;
 }
 
+void SDContextParams::prepare_backend_assignments() {
+    // Rebuild both effective specs from the raw --backend / --params-backend values plus the compatibility flags.
+    effective_params_backend = params_backend;
+    if (offload_params_to_cpu) {
+        prepend_backend_assignment(effective_params_backend, "*=cpu");
+    }
+
+    // Deprecated placement flags: each one that is set is put in front of the runtime spec.
+    effective_backend = backend;
+    const auto apply_legacy_flag = [this](bool enabled, const char* assignment) {
+        if (enabled) {
+            prepend_backend_assignment(effective_backend, assignment);
+        }
+    };
+    apply_legacy_flag(clip_on_cpu, "te=cpu");
+    apply_legacy_flag(vae_on_cpu, "vae=cpu");
+    apply_legacy_flag(control_net_cpu, "controlnet=cpu");
+}
+
 std::string SDContextParams::to_string() const {
     std::ostringstream emb_ss;
     emb_ss << "{\n";
@@ -758,6 +781,7 @@ std::string SDContextParams::to_string() const {
 }
 
 sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
+    prepare_backend_assignments();
     embedding_vec.clear();
     embedding_vec.reserve(embedding_map.size());
     for (const auto& kv : embedding_map) {
@@ -767,55 +791,51 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
         embedding_vec.emplace_back(item);
     }
 
-    sd_ctx_params_t sd_ctx_params = {
-        model_path.c_str(),
-        clip_l_path.c_str(),
-        clip_g_path.c_str(),
-        clip_vision_path.c_str(),
-        t5xxl_path.c_str(),
-        llm_path.c_str(),
-        llm_vision_path.c_str(),
-        diffusion_model_path.c_str(),
-        high_noise_diffusion_model_path.c_str(),
-        uncond_diffusion_model_path.c_str(),
-        embeddings_connectors_path.c_str(),
-        vae_path.c_str(),
-        audio_vae_path.c_str(),
-        taesd_path.c_str(),
-        control_net_path.c_str(),
-        embedding_vec.data(),
-        static_cast<uint32_t>(embedding_vec.size()),
-        photo_maker_path.c_str(),
-        tensor_type_rules.c_str(),
-        n_threads,
-        wtype,
-        rng_type,
-        sampler_rng_type,
-        prediction,
-        lora_apply_mode,
-        offload_params_to_cpu,
-        enable_mmap,
-        clip_on_cpu,
-        control_net_cpu,
-        vae_on_cpu,
-        flash_attn,
-        diffusion_flash_attn,
-        taesd_preview,
-        diffusion_conv_direct,
-        vae_conv_direct,
-        circular || circular_x,
-        circular || circular_y,
-        force_sdxl_vae_conv_scale,
-        chroma_use_dit_mask,
-        chroma_use_t5_mask,
-        chroma_t5_mask_pad,
-        qwen_image_zero_cond_t,
-        str_to_vae_format(vae_format),
-        max_vram,
-        stream_layers,
-        backend.c_str(),
-        params_backend.c_str(),
-    };
+    sd_ctx_params_t sd_ctx_params;
+    sd_ctx_params_init(&sd_ctx_params);
+    sd_ctx_params.model_path                      = model_path.c_str();
+    sd_ctx_params.clip_l_path                     = clip_l_path.c_str();
+    sd_ctx_params.clip_g_path                     = clip_g_path.c_str();
+    sd_ctx_params.clip_vision_path                = clip_vision_path.c_str();
+    sd_ctx_params.t5xxl_path                      = t5xxl_path.c_str();
+    sd_ctx_params.llm_path                        = llm_path.c_str();
+    sd_ctx_params.llm_vision_path                 = llm_vision_path.c_str();
+    sd_ctx_params.diffusion_model_path            = diffusion_model_path.c_str();
+    sd_ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path.c_str();
+    sd_ctx_params.uncond_diffusion_model_path     = uncond_diffusion_model_path.c_str();
+    sd_ctx_params.embeddings_connectors_path      = embeddings_connectors_path.c_str();
+    sd_ctx_params.vae_path                        = vae_path.c_str();
+    sd_ctx_params.audio_vae_path                  = audio_vae_path.c_str();
+    sd_ctx_params.taesd_path                      = taesd_path.c_str();
+    sd_ctx_params.control_net_path                = control_net_path.c_str();
+    sd_ctx_params.embeddings                      = embedding_vec.data();
+    sd_ctx_params.embedding_count                 = static_cast<uint32_t>(embedding_vec.size());
+    sd_ctx_params.photo_maker_path                = photo_maker_path.c_str();
+    sd_ctx_params.tensor_type_rules               = tensor_type_rules.c_str();
+    sd_ctx_params.n_threads                       = n_threads;
+    sd_ctx_params.wtype                           = wtype;
+    sd_ctx_params.rng_type                        = rng_type;
+    sd_ctx_params.sampler_rng_type                = sampler_rng_type;
+    sd_ctx_params.prediction                      = prediction;
+    sd_ctx_params.lora_apply_mode                 = lora_apply_mode;
+    sd_ctx_params.enable_mmap                     = enable_mmap;
+    sd_ctx_params.flash_attn                      = flash_attn;
+    sd_ctx_params.diffusion_flash_attn            = diffusion_flash_attn;
+    sd_ctx_params.tae_preview_only                = taesd_preview;
+    sd_ctx_params.diffusion_conv_direct           = diffusion_conv_direct;
+    sd_ctx_params.vae_conv_direct                 = vae_conv_direct;
+    sd_ctx_params.circular_x                      = circular || circular_x;
+    sd_ctx_params.circular_y                      = circular || circular_y;
+    sd_ctx_params.force_sdxl_vae_conv_scale       = force_sdxl_vae_conv_scale;
+    sd_ctx_params.chroma_use_dit_mask             = chroma_use_dit_mask;
+    sd_ctx_params.chroma_use_t5_mask              = chroma_use_t5_mask;
+    sd_ctx_params.chroma_t5_mask_pad              = chroma_t5_mask_pad;
+    sd_ctx_params.qwen_image_zero_cond_t          = qwen_image_zero_cond_t;
+    sd_ctx_params.vae_format                      = str_to_vae_format(vae_format);
+    sd_ctx_params.max_vram                        = max_vram;
+    sd_ctx_params.stream_layers                   = stream_layers;
+    sd_ctx_params.backend                         = effective_backend.c_str();
+    sd_ctx_params.params_backend                  = effective_params_backend.c_str();
     return sd_ctx_params;
 }
 
diff --git a/examples/common/common.h b/examples/common/common.h
index 8f97ac95b..2ae54c2c7 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -148,6 +148,8 @@ struct SDContextParams {
     bool stream_layers          = false;
     std::string backend;
     std::string params_backend;
+    std::string effective_backend;
+    std::string effective_params_backend;
     bool enable_mmap           = false;
     bool control_net_cpu       = false;
     bool clip_on_cpu           = false;
@@ -175,6 +177,7 @@ struct SDContextParams {
     float flow_shift = INFINITY;
     ArgOptions get_options();
     void build_embedding_map();
+    void prepare_backend_assignments();
     bool resolve(SDMode mode);
     bool validate(SDMode mode);
     bool resolve_and_validate(SDMode mode);
diff --git a/examples/server/README.md b/examples/server/README.md
index 16fb393c6..63e38977a 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -165,9 +165,9 @@ Context Options:
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                            when needed
   --mmap                                   whether to memory-map model
-  --control-net-cpu                        keep controlnet in cpu (for low vram)
-  --clip-on-cpu                            keep clip in cpu (for low vram)
-  --vae-on-cpu                             keep vae in cpu (for low vram)
+  --control-net-cpu                        deprecated; use --backend controlnet=cpu
+  --clip-on-cpu                            deprecated; use --backend te=cpu
+  --vae-on-cpu                             deprecated; use --backend vae=cpu
   --fa                                     use flash attention
   --diffusion-fa                           use flash attention in the diffusion model only
   --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 02e5b6175..ffefdaadf 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -202,11 +202,7 @@ typedef struct {
     enum rng_type_t sampler_rng_type;
     enum prediction_t prediction;
     enum lora_apply_mode_t lora_apply_mode;
-    bool offload_params_to_cpu;
     bool enable_mmap;
-    bool keep_clip_on_cpu;
-    bool keep_control_net_on_cpu;
-    bool keep_vae_on_cpu;
     bool flash_attn;
     bool diffusion_flash_attn;
     bool tae_preview_only;
@@ -458,7 +454,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
-                                        bool offload_params_to_cpu,
                                         bool direct,
                                         int n_threads,
                                         int tile_size,
diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp
index 500e04e27..834a047e7 100644
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@@ -545,9 +545,6 @@ bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule modu
 
 bool SDBackendManager::init(const char* backend_spec,
                             const char* params_backend_spec,
-                            bool keep_clip_on_cpu,
-                            bool keep_vae_on_cpu,
-                            bool keep_control_net_on_cpu,
                             std::string* error) {
     reset();
 
@@ -558,18 +555,6 @@ bool SDBackendManager::init(const char* backend_spec,
         return false;
     }
 
-    if (runtime_assignment_.empty()) {
-        if (keep_clip_on_cpu) {
-            runtime_assignment_.set_module(SDBackendModule::TE, "cpu");
-        }
-        if (keep_vae_on_cpu) {
-            runtime_assignment_.set_module(SDBackendModule::VAE, "cpu");
-        }
-        if (keep_control_net_on_cpu) {
-            runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu");
-        }
-    }
-
     return validate(error);
 }
 
diff --git a/src/core/ggml_extend_backend.h b/src/core/ggml_extend_backend.h
index a604984f3..58d41ac44 100644
--- a/src/core/ggml_extend_backend.h
+++ b/src/core/ggml_extend_backend.h
@@ -51,9 +51,6 @@ class SDBackendManager {
 
     bool init(const char* backend_spec,
               const char* params_backend_spec,
-              bool keep_clip_on_cpu,
-              bool keep_vae_on_cpu,
-              bool keep_control_net_on_cpu,
               std::string* error);
     void reset();
 
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index cf44014bf..a5fb0e54d 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -187,7 +187,6 @@ class StableDiffusionGGML {
 
     std::string taesd_path;
     sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
-    bool offload_params_to_cpu           = false;
     bool enable_mmap                     = false;
     float max_vram                       = 0.f;
     bool stream_layers                   = false;
@@ -250,13 +249,10 @@ class StableDiffusionGGML {
                                                      params_mem_size);
     }
 
-    bool init_backend(const sd_ctx_params_t* sd_ctx_params) {
+    bool init_backend() {
         std::string error;
-        if (!backend_manager.init(sd_ctx_params->backend,
+        if (!backend_manager.init(backend_spec.c_str(),
                                   params_backend_spec.c_str(),
-                                  sd_ctx_params->keep_clip_on_cpu,
-                                  sd_ctx_params->keep_vae_on_cpu,
-                                  sd_ctx_params->keep_control_net_on_cpu,
                                   &error)) {
             LOG_ERROR("backend config failed: %s", error.c_str());
             return false;
@@ -316,16 +312,12 @@ class StableDiffusionGGML {
     }
 
     bool init(const sd_ctx_params_t* sd_ctx_params) {
-        n_threads             = sd_ctx_params->n_threads;
-        offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
-        enable_mmap           = sd_ctx_params->enable_mmap;
-        max_vram              = sd_ctx_params->max_vram;
-        stream_layers         = sd_ctx_params->stream_layers;
-        backend_spec          = SAFE_STR(sd_ctx_params->backend);
-        params_backend_spec   = SAFE_STR(sd_ctx_params->params_backend);
-        if (offload_params_to_cpu) {
-            params_backend_spec = params_backend_spec.empty() ? "*=cpu" : "*=cpu," + params_backend_spec;
-        }
+        n_threads           = sd_ctx_params->n_threads;
+        enable_mmap         = sd_ctx_params->enable_mmap;
+        max_vram            = sd_ctx_params->max_vram;
+        stream_layers       = sd_ctx_params->stream_layers;
+        backend_spec        = SAFE_STR(sd_ctx_params->backend);
+        params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
         if (stream_layers && max_vram == 0.f) {
             LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring");
             stream_layers = false;
@@ -344,7 +336,7 @@ class StableDiffusionGGML {
 
         ggml_log_set(ggml_log_callback_default, nullptr);
 
-        if (!init_backend(sd_ctx_params)) {
+        if (!init_backend()) {
             return false;
         }
         if (stream_layers && !backend_manager.params_backend_is_cpu(SDBackendModule::DIFFUSION)) {
@@ -534,8 +526,8 @@ class StableDiffusionGGML {
                 }
             }
             // Avoid full-model LoRA merge buffers on constrained setups.
-            const bool streaming_constrained = stream_layers ||
-                                               sd_ctx_params->offload_params_to_cpu;
+            const bool params_offloaded      = params_backend_for(SDBackendModule::DIFFUSION) != backend_for(SDBackendModule::DIFFUSION);
+            const bool streaming_constrained = stream_layers || params_offloaded;
             if (have_quantized_weight || streaming_constrained) {
                 apply_lora_immediately = false;
             } else {
@@ -2615,29 +2607,25 @@ void sd_hires_params_init(sd_hires_params_t* hires_params) {
 }
 
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
-    *sd_ctx_params                         = {};
-    sd_ctx_params->n_threads               = sd_get_num_physical_cores();
-    sd_ctx_params->wtype                   = SD_TYPE_COUNT;
-    sd_ctx_params->rng_type                = CUDA_RNG;
-    sd_ctx_params->sampler_rng_type        = RNG_TYPE_COUNT;
-    sd_ctx_params->prediction              = PREDICTION_COUNT;
-    sd_ctx_params->lora_apply_mode         = LORA_APPLY_AUTO;
-    sd_ctx_params->offload_params_to_cpu   = false;
-    sd_ctx_params->max_vram                = 0.f;
-    sd_ctx_params->stream_layers           = false;
-    sd_ctx_params->enable_mmap             = false;
-    sd_ctx_params->keep_clip_on_cpu        = false;
-    sd_ctx_params->keep_control_net_on_cpu = false;
-    sd_ctx_params->keep_vae_on_cpu         = false;
-    sd_ctx_params->diffusion_flash_attn    = false;
-    sd_ctx_params->circular_x              = false;
-    sd_ctx_params->circular_y              = false;
-    sd_ctx_params->chroma_use_dit_mask     = true;
-    sd_ctx_params->chroma_use_t5_mask      = false;
-    sd_ctx_params->chroma_t5_mask_pad      = 1;
-    sd_ctx_params->vae_format              = SD_VAE_FORMAT_AUTO;
-    sd_ctx_params->backend                 = nullptr;
-    sd_ctx_params->params_backend          = nullptr;
+    *sd_ctx_params                      = {};  // zero every field first; the false / 0.f / nullptr assignments below restate that state explicitly
+    sd_ctx_params->n_threads            = sd_get_num_physical_cores();
+    sd_ctx_params->wtype                = SD_TYPE_COUNT;
+    sd_ctx_params->rng_type             = CUDA_RNG;
+    sd_ctx_params->sampler_rng_type     = RNG_TYPE_COUNT;
+    sd_ctx_params->prediction           = PREDICTION_COUNT;
+    sd_ctx_params->lora_apply_mode      = LORA_APPLY_AUTO;
+    sd_ctx_params->max_vram             = 0.f;
+    sd_ctx_params->stream_layers        = false;
+    sd_ctx_params->enable_mmap          = false;
+    sd_ctx_params->diffusion_flash_attn = false;
+    sd_ctx_params->circular_x           = false;
+    sd_ctx_params->circular_y           = false;
+    sd_ctx_params->chroma_use_dit_mask  = true;
+    sd_ctx_params->chroma_use_t5_mask   = false;
+    sd_ctx_params->chroma_t5_mask_pad   = 1;
+    sd_ctx_params->vae_format           = SD_VAE_FORMAT_AUTO;
+    sd_ctx_params->backend              = nullptr;  // both specs start unset; StableDiffusionGGML::init reads them through SAFE_STR
+    sd_ctx_params->params_backend       = nullptr;
 }
 
 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@@ -2669,14 +2657,10 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "rng_type: %s\n"
              "sampler_rng_type: %s\n"
              "prediction: %s\n"
-             "offload_params_to_cpu: %s\n"
              "max_vram: %.3f\n"
              "stream_layers: %s\n"
              "backend: %s\n"
              "params_backend: %s\n"
-             "keep_clip_on_cpu: %s\n"
-             "keep_control_net_on_cpu: %s\n"
-             "keep_vae_on_cpu: %s\n"
              "flash_attn: %s\n"
              "diffusion_flash_attn: %s\n"
              "circular_x: %s\n"
@@ -2707,14 +2691,10 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              sd_rng_type_name(sd_ctx_params->rng_type),
              sd_rng_type_name(sd_ctx_params->sampler_rng_type),
              sd_prediction_name(sd_ctx_params->prediction),
-             BOOL_STR(sd_ctx_params->offload_params_to_cpu),
              sd_ctx_params->max_vram,
              BOOL_STR(sd_ctx_params->stream_layers),
              SAFE_STR(sd_ctx_params->backend),
              SAFE_STR(sd_ctx_params->params_backend),
-             BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
-             BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
-             BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
              BOOL_STR(sd_ctx_params->flash_attn),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
              BOOL_STR(sd_ctx_params->circular_x),
@@ -4436,7 +4416,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
             const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
             hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
             if (!hires_upscaler->load_from_file(request.hires.model_path,
-                                                sd_ctx->sd->offload_params_to_cpu,
                                                 sd_ctx->sd->n_threads)) {
                 LOG_ERROR("load hires model upscaler failed");
                 return nullptr;
diff --git a/src/upscaler.cpp b/src/upscaler.cpp
index be1bb2f50..d02366ecb 100644
--- a/src/upscaler.cpp
+++ b/src/upscaler.cpp
@@ -39,20 +39,12 @@ void UpscalerGGML::set_stream_layers_enabled(bool enabled) {
 }
 
 bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
-                                  bool offload_params_to_cpu,
                                   int n_threads) {
     ggml_log_set(ggml_log_callback_default, nullptr);
 
-    std::string effective_params_backend_spec = params_backend_spec;
-    if (offload_params_to_cpu) {
-        effective_params_backend_spec = effective_params_backend_spec.empty() ? "*=cpu" : "*=cpu," + effective_params_backend_spec;
-    }
     std::string error;
     if (!backend_manager.init(backend_spec.c_str(),
-                              effective_params_backend_spec.c_str(),
-                              false,
-                              false,
-                              false,
+                              params_backend_spec.c_str(),
                               &error)) {
         LOG_ERROR("upscaler backend config failed: %s", error.c_str());
         return false;
@@ -181,7 +173,6 @@ struct upscaler_ctx_t {
 };
 
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
-                                 bool offload_params_to_cpu,
                                  bool direct,
                                  int n_threads,
                                  int tile_size,
@@ -198,7 +189,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return nullptr;
     }
 
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, n_threads)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = nullptr;
         free(upscaler_ctx);
diff --git a/src/upscaler.h b/src/upscaler.h
index 349e35318..38150f59f 100644
--- a/src/upscaler.h
+++ b/src/upscaler.h
@@ -32,7 +32,6 @@ struct UpscalerGGML {
     ~UpscalerGGML();
 
     bool load_from_file(const std::string& esrgan_path,
-                        bool offload_params_to_cpu,
                         int n_threads);
     void set_max_graph_vram_bytes(size_t max_vram_bytes);
     void set_stream_layers_enabled(bool enabled);