diff --git a/docs/pulid.md b/docs/pulid.md
index 4a72ea617..b7eec37e1 100644
--- a/docs/pulid.md
+++ b/docs/pulid.md
@@ -52,14 +52,15 @@ to a `.pulidembd` binary file (about 131 KB). Run it once per source
 person; the same file is reused for any number of generations.
 
 A reference Python script is provided alongside this docs file at
-[`scripts/pulid_extract_id.py`](../scripts/pulid_extract_id.py). It
+[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
 requires:
-- A working CUDA / CPU PyTorch + diffusers stack
-- `insightface`, `facexlib`, `eva-clip`, `torchvision`
+- A working CUDA / CPU PyTorch stack
+- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
+  `huggingface_hub`, `gguf`
 - The PuLID weights file (same one stable-diffusion.cpp will load below)
-- The ToTheBeginning/PuLID repo's `pulid/pipeline_flux.py` (and its
-  dependencies under `pulid/` and `flux/`) -- recommended to vendor
-  rather than pip-install due to upstream packaging quirks
+- The ToTheBeginning/PuLID repo's `pulid/` package (including
+  `pulid/pipeline_flux.py`) and its `eva_clip/` package, both on
+  `PYTHONPATH`; `flux/` is not needed for embedding extraction
 
 Run it as:
 
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index ba51d8cce..dd5d35055 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -417,7 +417,7 @@ ArgOptions SDContextParams::get_options() {
          &photo_maker_path},
         {"",
          "--pulid-weights",
-         "path to PuLID flux weights (e.g. pulid_flux_v0.9.1.safetensors). Identity is injected during the denoise loop when paired with --pulid-id-embedding.",
+         "path to PuLID Flux weights",
          &pulid_weights_path},
         {"",
          "--upscale-model",
@@ -894,7 +894,7 @@ ArgOptions SDGenerationParams::get_options() {
          &pm_id_embed_path},
         {"",
          "--pulid-id-embedding",
-         "path to a .pulidembd binary produced by pulid_extract_id.py. Carries a (32, 2048) identity embedding extracted from a source portrait. Pair with --pulid-weights on the context.",
+         "path to PuLID id embedding",
          &pulid_id_embedding_path},
         {"",
          "--hires-upscaler",
@@ -1048,7 +1048,7 @@ ArgOptions SDGenerationParams::get_options() {
          &pm_style_strength},
         {"",
          "--pulid-id-weight",
-         "strength of PuLID identity injection (default: 1.0). 0.7-1.2 are typical; lower lets the prompt override the face more, higher tightens identity match.",
+         "strength of PuLID identity injection",
          &pulid_id_weight},
         {"",
          "--control-strength",
diff --git a/examples/common/common.h b/examples/common/common.h
index 0c0febc38..fcf9840db 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -133,10 +133,6 @@ struct SDContextParams {
     std::string control_net_path;
     std::string embedding_dir;
     std::string photo_maker_path;
-    // PuLID-Flux identity-preservation context path: the safetensors blob
-    // carrying the PerceiverAttentionCA cross-attention weights. Loaded
-    // once with the model. Per-generation pulid_id_embedding_path lives in
-    // SDGenerationParams below.
     std::string pulid_weights_path;
     sd_type_t wtype = SD_TYPE_COUNT;
     std::string tensor_type_rules;
@@ -239,9 +235,6 @@ struct SDGenerationParams {
     std::string pm_id_embed_path;
     float pm_style_strength = 20.f;
 
-    // PuLID-Flux: per-generation identity embedding (binary file produced by
-    // runtime-scripts/pulid_extract_id.py). Format documented in
-    // include/stable-diffusion.h sd_pulid_params_t.
     std::string pulid_id_embedding_path;
     float pulid_id_weight = 1.0f;
 
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index c813b0f02..2f2851c2e 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -195,15 +195,6 @@ typedef struct {
     const sd_embedding_t* embeddings;
     uint32_t embedding_count;
     const char* photo_maker_path;
-    /**
-     * Path to pulid_flux_v0.9.1.safetensors (the PuLID identity-injection
-     * cross-attention weights). When set together with sd_img_gen_params_t.
-     * pulid_params.id_embedding_path, the Flux diffusion model performs PuLID
-     * cross-attention injection during the denoise loop. Loaded once with
-     * the model; the embedding is per-generation. Currently only meaningful
-     * for Flux (depth=19 double, 38 single blocks); silently ignored for
-     * other model versions.
-     */
     const char* pulid_weights_path;
     const char* tensor_type_rules;
     int n_threads;
@@ -282,23 +273,9 @@ typedef struct {
     float style_strength;
 } sd_pm_params_t;  // photo maker
 
-/**
- * PuLID-Flux identity preservation params.
- *
- * Unlike PhotoMaker (which extracts the ID embedding inside the inference
- * process from a directory of images), PuLID's ID extraction is a heavy
- * Python-only stack (insightface ArcFace + EVA-CLIP-L + IDFormer). To stay
- * cross-vendor in C++/Vulkan, sd.cpp consumes a precomputed binary file
- * produced by an external tool (runtime-scripts/pulid_extract_id.py in the
- * Cloudhands client tree).
- *
- * Format: a gguf container with a single tensor "pulid_id" of shape
- * [token_dim, num_tokens] (ggml order; typically [2048, 32]) in F16/F32/BF16.
- * Loaded with the standard gguf reader; see docs/pulid.md.
- */
 typedef struct {
-    const char* id_embedding_path;  // path to .pulidembd file produced by pulid_extract_id.py
-    float id_weight;                // strength of the ID injection; typical 0.7-1.2, default 1.0
+    const char* id_embedding_path;
+    float id_weight;
 } sd_pulid_params_t;
 
 enum sd_cache_mode_t {
diff --git a/scripts/pulid_extract_id.py b/script/pulid_extract_id.py
similarity index 67%
rename from scripts/pulid_extract_id.py
rename to script/pulid_extract_id.py
index f887260f0..aca52e24c 100644
--- a/scripts/pulid_extract_id.py
+++ b/script/pulid_extract_id.py
@@ -2,26 +2,18 @@
 Precompute a PuLID-Flux identity embedding from a single source portrait.
 
 Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
-`--pulid-id-embedding` flag consumes. See docs/pulid.md for the format and
-overall PuLID-Flux flow.
-
-This script intentionally lives outside the C++ build: identity extraction
-needs insightface + EVA-CLIP-L + IDFormer, which are PyTorch-only stacks
-that would be impractical to reimplement in ggml just to run once per
-source person. The C++ side downstream of this file is cross-vendor and
-backend-agnostic.
+`--pulid-id-embedding` flag consumes.
 
 Dependencies (recommended: vendor rather than pip-install due to upstream
 packaging quirks):
   - torch + safetensors
-  - The ToTheBeginning/PuLID repository's `pulid/pipeline_flux.py` and
-    its sibling packages (`flux/`, `eva_clip/`, `models/`). Put them on
-    PYTHONPATH or sys.path before running this script.
-  - insightface, facexlib (PuLID pipeline pulls these in)
+  - The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
+    Put them on PYTHONPATH or sys.path before running this script.
+  - insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf
   - numpy, Pillow
 
 Usage:
-  python pulid_extract_id.py \\
+  python script/pulid_extract_id.py \\
     --portrait /path/to/source-photo.jpg \\
     --pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
     --out /path/to/source.pulidembd
@@ -35,21 +27,7 @@
 import argparse
 import os
 import sys
-
-
-def _make_minimal_flux_skeleton(device):
-    """PuLIDPipeline expects a `dit` (Flux transformer) to attach its
-    PerceiverAttentionCA modules to during construction. We never run a
-    forward pass on it -- the encoders alone (which is what we actually
-    need) live on the pipeline object, not the dit. So we instantiate a
-    real Flux skeleton with default params and never load its weights."""
-    import torch
-    from flux.model import Flux
-    from flux.util import configs
-
-    with torch.device("cpu"):
-        model = Flux(configs["flux-dev"].params).to(torch.bfloat16)
-    return model
+from types import SimpleNamespace
 
 
 def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
@@ -65,18 +43,17 @@ def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
 
     print(f"device={device}", flush=True)
 
-    print("constructing minimal Flux skeleton (no weights loaded)", flush=True)
-    dit = _make_minimal_flux_skeleton(device)
-
-    print("instantiating PuLIDPipeline", flush=True)
-    pulid = PuLIDPipeline(dit=dit, device=device,
+    # PuLIDPipeline only attaches pulid_ca attributes to `dit` during
+    # construction; get_id_embedding() never runs Flux, so a dummy object is
+    # enough and avoids importing/building a Flux skeleton.
+    print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
+    dit = SimpleNamespace()
+    pulid = PuLIDPipeline(dit=dit,
+                          device=device,
                           weight_dtype=torch.bfloat16,
                           onnx_provider=onnx_provider)
 
     print(f"loading PuLID weights from {pulid_weights}", flush=True)
-    # PuLIDPipeline.load_pretrain expects a "version" string used to construct
-    # the default filename when pretrain_path is None. We pass the file
-    # directly so the version string is informational only.
     pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")
 
     print(f"extracting ID embedding from {portrait_path}", flush=True)
@@ -100,10 +77,6 @@ def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
 
     os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
 
-    # The embedding ships as a standard gguf container holding a single tensor
-    # named "pulid_id". numpy is row-major (num_tokens, token_dim); gguf stores
-    # dims reversed, so stable-diffusion.cpp reads it back as
-    # ne[0]=token_dim, ne[1]=num_tokens (see load_pulid_id_embedding).
     writer = gguf.GGUFWriter(out_path, arch="pulid")
     writer.add_uint32("pulid.version", 1)
 
diff --git a/src/extensions/pulid_extension.cpp b/src/extensions/pulid_extension.cpp
index 51c736b49..d529e5710 100644
--- a/src/extensions/pulid_extension.cpp
+++ b/src/extensions/pulid_extension.cpp
@@ -7,24 +7,15 @@
 #include "core/util.h"
 #include "gguf.h"
 
-// Load the precomputed PuLID identity embedding produced by
-// scripts/pulid_extract_id.py into a sd::Tensor<float> (always materialized as
-// fp32 for the diffusion path). Returns an empty tensor on any failure (the
-// caller treats empty as "PuLID off").
-//
-// The file is a standard gguf container holding a single tensor named
-// "pulid_id" with shape [token_dim, num_tokens] (ggml order; typically
-// [2048, 32]) in f16 / bf16 / f32. Using gguf rather than a bespoke header
-// means the shape + dtype are self-describing and we reuse ggml's reader.
 static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
     sd::Tensor<float> empty;
     if (path == nullptr || strlen(path) == 0) {
         return empty;
     }
 
-    struct ggml_context* ctx_data   = nullptr;
-    struct gguf_init_params gp       = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data};
-    struct gguf_context* gguf_ctx    = gguf_init_from_file(path, gp);
+    struct ggml_context* ctx_data = nullptr;
+    struct gguf_init_params gp    = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data};
+    struct gguf_context* gguf_ctx = gguf_init_from_file(path, gp);
     if (gguf_ctx == nullptr || ctx_data == nullptr) {
         LOG_WARN("PuLID id-embedding: cannot read gguf '%s'", path);
         if (gguf_ctx != nullptr)
@@ -83,20 +74,9 @@ static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
     return out;
 }
 
-// PuLID-Flux identity injection as a generation extension.
-//
-// Unlike PhotoMaker, PuLID does NOT modify the conditioning -- it injects an
-// identity embedding via cross-attention *inside* the Flux denoise forward (the
-// pulid_ca.* blocks). Those cross-attention weights are part of the Flux
-// diffusion model and are loaded into the model tensor map before the model is
-// constructed (see SDImpl ctor, gated on sd_ctx_params.pulid_weights_path), so
-// this extension does not own a separate model. Its job is purely runtime:
-//   - prepare_condition: load the per-generation id-embedding file.
-//   - before_diffusion:  hand that embedding (+ weight) to FluxDiffusionExtra,
-//                        which flux.hpp reads to drive the pulid_ca injection.
 struct PuLIDExtension : public GenerationExtension {
     bool enabled = false;
-    sd::Tensor<float> id_embedding;  // per-generation; empty when PuLID is off for this request
+    sd::Tensor<float> id_embedding;
     float id_weight = 1.0f;
 
     const char* name() const override {
diff --git a/src/model/adapter/pulid.hpp b/src/model/adapter/pulid.hpp
index 74796a231..442c5b8b2 100644
--- a/src/model/adapter/pulid.hpp
+++ b/src/model/adapter/pulid.hpp
@@ -4,125 +4,71 @@
 #include "core/ggml_extend.hpp"
 #include "model/common/block.hpp"
 
-/**
- * PuLID-Flux identity injection for stable-diffusion.cpp.
- *
- * Mirrors the PerceiverAttentionCA module from
- * https://github.com/ToTheBeginning/PuLID/blob/main/pulid/encoders_transformer.py
- *
- * Each instance is a cross-attention layer where:
- *   Q comes from image tokens             (dim = 3072 = Flux hidden_size)
- *   K, V come from a precomputed ID embedding (kv_dim = 2048, num_tokens = 32)
- *
- * 14 instances are inserted into the Flux denoise loop at fixed intervals:
- *   - Every 2nd of the 19 double_blocks  (10 hook points)
- *   - Every 4th of the 38 single_blocks  (10 hook points... but the v0.9.1
- *     reference uses 4 single hooks, for 14 total)
- *
- * Weight key prefix in pulid_flux_v0.9.1.safetensors:
- *   pulid_ca.<i>.norm1.{weight,bias}
- *   pulid_ca.<i>.norm2.{weight,bias}
- *   pulid_ca.<i>.to_q.weight
- *   pulid_ca.<i>.to_kv.weight
- *   pulid_ca.<i>.to_out.weight
- *
- * Pure-ggml implementation: all ops have Vulkan / CUDA / Metal kernels in
- * the upstream ggml backends, so this works cross-vendor by construction.
- */
 class PuLIDPerceiverAttentionCA : public GGMLBlock {
 public:
-    static constexpr int64_t DEFAULT_DIM     = 3072;  // Flux hidden size
+    static constexpr int64_t DEFAULT_DIM      = 3072;  // Flux hidden size
     static constexpr int64_t DEFAULT_DIM_HEAD = 128;
-    static constexpr int64_t DEFAULT_HEADS   = 16;
-    static constexpr int64_t DEFAULT_KV_DIM  = 2048;  // PuLID ID-embedding dim
+    static constexpr int64_t DEFAULT_HEADS    = 16;
+    static constexpr int64_t DEFAULT_KV_DIM   = 2048;  // PuLID ID-embedding dim
 
 protected:
     int64_t dim;
     int64_t dim_head;
     int64_t heads;
     int64_t kv_dim;
-    int64_t inner_dim;  // dim_head * heads = 2048
+    int64_t inner_dim;
 
 public:
-    PuLIDPerceiverAttentionCA(int64_t dim       = DEFAULT_DIM,
-                              int64_t dim_head  = DEFAULT_DIM_HEAD,
-                              int64_t heads     = DEFAULT_HEADS,
-                              int64_t kv_dim    = DEFAULT_KV_DIM)
+    PuLIDPerceiverAttentionCA(int64_t dim      = DEFAULT_DIM,
+                              int64_t dim_head = DEFAULT_DIM_HEAD,
+                              int64_t heads    = DEFAULT_HEADS,
+                              int64_t kv_dim   = DEFAULT_KV_DIM)
         : dim(dim),
           dim_head(dim_head),
           heads(heads),
           kv_dim(kv_dim),
           inner_dim(dim_head * heads) {
-        // Note the PyTorch reference's surprising signature:
-        // norm1 operates on x (the id_embedding side, kv_dim wide)
-        // norm2 operates on latents (the image tokens, dim wide)
-        // to_q  consumes latents (dim -> inner_dim)
-        // to_kv consumes x       (kv_dim -> 2*inner_dim)
-        // to_out projects        (inner_dim -> dim)
         blocks["norm1"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
         blocks["norm2"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
-        blocks["to_q"]   = std::shared_ptr<GGMLBlock>(new Linear(dim,    inner_dim,     /*bias=*/false));
+        blocks["to_q"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
         blocks["to_kv"]  = std::shared_ptr<GGMLBlock>(new Linear(kv_dim, inner_dim * 2, /*bias=*/false));
-        blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim,        /*bias=*/false));
+        blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
     }
 
-    /**
-     * Compute: residual_to_image = PerceiverAttentionCA(id_embedding, image_tokens)
-     *
-     * Inputs:
-     *   id_embedding  [N, n_id_tokens=32, kv_dim=2048]
-     *   image_tokens  [N, n_img_tokens,  dim=3072]
-     *
-     * Returns:
-     *   [N, n_img_tokens, dim=3072]  -- to be added to image_tokens by the caller,
-     *                                  scaled by id_weight.
-     */
     ggml_tensor* forward(GGMLRunnerContext* ctx,
-                         ggml_tensor*       id_embedding,
-                         ggml_tensor*       image_tokens) {
+                         ggml_tensor* id_embedding,
+                         ggml_tensor* image_tokens) {
         auto norm1  = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
         auto norm2  = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
         auto to_q   = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
         auto to_kv  = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
         auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
 
-        // Normalize each input on its own dim. The PyTorch reference normalizes
-        // x (id_embedding) and `latents` (image_tokens) separately, then uses
-        // latents for Q and x for K/V -- mind the unusual cross-attention shape.
-        ggml_tensor* x_normed   = norm1->forward(ctx, id_embedding);    // [N, 32, 2048]
-        ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);    // [N, T_img, 3072]
+        ggml_tensor* x_normed   = norm1->forward(ctx, id_embedding);
+        ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);
 
-        // Projections. to_q : 3072 -> 2048 ; to_kv : 2048 -> 4096 (k concat v).
-        ggml_tensor* q  = to_q->forward(ctx, lat_normed);   // [N, T_img, 2048]
-        ggml_tensor* kv = to_kv->forward(ctx, x_normed);    // [N, 32,    4096]
+        ggml_tensor* q  = to_q->forward(ctx, lat_normed);  // [N, T_img, 2048]
+        ggml_tensor* kv = to_kv->forward(ctx, x_normed);   // [N, 32, 4096]
 
-        // Split KV into K (first inner_dim of last axis) and V (second
-        // inner_dim). ggml_view_3d gives strided views without copying;
-        // ggml_cont materializes them so ggml_ext_attention_ext sees
-        // contiguous tensors.
         ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
-                                       inner_dim, kv->ne[1], kv->ne[2],
-                                       kv->nb[1], kv->nb[2],
-                                       /*offset=*/0);                              // [N, 32, 2048]
+                                      inner_dim, kv->ne[1], kv->ne[2],
+                                      kv->nb[1], kv->nb[2],
+                                      /*offset=*/0);
         ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
-                                       inner_dim, kv->ne[1], kv->ne[2],
-                                       kv->nb[1], kv->nb[2],
-                                       /*offset=*/inner_dim * ggml_element_size(kv)); // [N, 32, 2048]
-        k = ggml_cont(ctx->ggml_ctx, k);
-        v = ggml_cont(ctx->ggml_ctx, v);
+                                      inner_dim, kv->ne[1], kv->ne[2],
+                                      kv->nb[1], kv->nb[2],
+                                      /*offset=*/inner_dim * ggml_element_size(kv));
+        k              = ggml_cont(ctx->ggml_ctx, k);
+        v              = ggml_cont(ctx->ggml_ctx, v);
 
-        // Standard multi-head attention. ggml_ext_attention_ext expects
-        // [N, n_token, embed_dim] and reshapes into heads internally.
-        // n_head = heads (=16), per-head dim = inner_dim / heads (=128).
         ggml_tensor* attn_out = ggml_ext_attention_ext(
             ctx->ggml_ctx, ctx->backend,
             q, k, v,
             heads,
             /*mask=*/nullptr,
-            /*diag_mask_inf=*/false);  // [N, T_img, inner_dim=2048]
+            /*diag_mask_inf=*/false);
 
-        // Project back to image-token width (3072).
-        ggml_tensor* out = to_out->forward(ctx, attn_out);  // [N, T_img, 3072]
+        ggml_tensor* out = to_out->forward(ctx, attn_out);
         return out;
     }
 };
diff --git a/src/model/diffusion/flux.hpp b/src/model/diffusion/flux.hpp
index 9986bb8d7..b5e6c63bf 100644
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@@ -50,12 +50,9 @@ namespace Flux {
         float ref_index_scale     = 1.f;
         ChromaRadianceConfig chroma_radiance_params;
 
-        // PuLID-Flux identity injection. Turned on by the runner when a
-        // --pulid-weights path is provided. The intervals are fixed by the
-        // PuLID v0.9.1 architecture (every 2nd double, every 4th single).
-        bool pulid_enabled         = false;
-        int  pulid_double_interval = 2;
-        int  pulid_single_interval = 4;
+        bool pulid_enabled        = false;
+        int pulid_double_interval = 2;
+        int pulid_single_interval = 4;
 
         static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
                                               const std::string& prefix,
@@ -146,10 +143,6 @@ namespace Flux {
                 if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
                     head_dim = tensor_storage.ne[0];
                 }
-                // PuLID weights live alongside the diffusion model under the same
-                // prefix (pulid_ca.<i>.<sub>) when the pulid loader merges them in
-                // (see stable-diffusion.cpp). Spotting any pulid_ca.* key flips the
-                // flag so the Flux ctor builds the pulid_ca.<i> child blocks.
                 if (name.find("pulid_ca.") != std::string::npos) {
                     config.pulid_enabled = true;
                 }
@@ -973,26 +966,17 @@ namespace Flux {
                 blocks["single_stream_modulation"]     = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
             }
 
-            // PuLID-Flux identity-injection cross-attention modules. Only constructed
-            // when config.pulid_enabled is set (turned on by the runner after seeing a
-            // --pulid-weights path during model load). Counts come straight from PuLID
-            // v0.9.1's pipeline_flux.py: every `pulid_double_interval` double block
-            // (=2) and every `pulid_single_interval` single block (=4). For a stock
-            // Flux Dev (depth=19, depth_single_blocks=38), this means 10 + 10 = 20
-            // hook points... but the reference uses ceil-rounding so the actual count
-            // is `ceil(depth/2) + ceil(depth_single_blocks/4)` = 10 + 10 = 20. PuLID
-            // v0.9.1 trained weights have 20 entries.
             if (config.pulid_enabled) {
-                int num_double_ca = (config.depth                 + config.pulid_double_interval - 1) / config.pulid_double_interval;
-                int num_single_ca = (config.depth_single_blocks   + config.pulid_single_interval - 1) / config.pulid_single_interval;
+                int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
+                int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
                 int num_ca        = num_double_ca + num_single_ca;
                 for (int i = 0; i < num_ca; i++) {
                     blocks["pulid_ca." + std::to_string(i)] =
                         std::shared_ptr<GGMLBlock>(new PuLIDPerceiverAttentionCA(
-                            /*dim=*/    config.hidden_size,
+                            /*dim=*/config.hidden_size,
                             /*dim_head=*/PuLIDPerceiverAttentionCA::DEFAULT_DIM_HEAD,
-                            /*heads=*/   PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
-                            /*kv_dim=*/  PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
+                            /*heads=*/PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
+                            /*kv_dim=*/PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
                 }
             }
         }
@@ -1007,7 +991,7 @@ namespace Flux {
                                   ggml_tensor* mod_index_arange = nullptr,
                                   std::vector<int> skip_layers  = {},
                                   ggml_tensor* pulid_id         = nullptr,
-                                  float        pulid_id_weight  = 1.0f) {
+                                  float pulid_id_weight         = 1.0f) {
             auto img_in      = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
             auto txt_in      = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
             auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
@@ -1084,22 +1068,12 @@ namespace Flux {
             sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
             sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
 
-            // PuLID identity injection: mirrors ToTheBeginning/PuLID
-            // pulid/encoders_transformer.py + flux/model.py. The CA layers
-            // run *between* transformer blocks, with their output added to
-            // img (scaled by id_weight) at every `pulid_double_interval`-th
-            // double_block and every `pulid_single_interval`-th single_block.
-            //
-            // skip_layers + PuLID is NOT a supported combination -- skipping
-            // a block at a PuLID-aligned index would either misalign the
-            // ca_idx assignment (silent quality regression) or require us
-            // to invent a non-reference index policy. Refuse early instead.
             const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
             if (pulid_active && !skip_layers.empty()) {
                 LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
             }
             const bool pulid_run = pulid_active && skip_layers.empty();
-            int        ca_idx    = 0;
+            int ca_idx           = 0;
 
             for (int i = 0; i < config.depth; i++) {
                 if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
@@ -1117,15 +1091,15 @@ namespace Flux {
                 if (pulid_run && (i % config.pulid_double_interval == 0)) {
                     auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
                         blocks["pulid_ca." + std::to_string(ca_idx)]);
-                    ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img);   // [N, n_img_token, hidden_size]
-                    img = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
+                    ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img);  // [N, n_img_token, hidden_size]
+                    img                 = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
                     sd::ggml_graph_cut::mark_graph_cut(img, "flux.pulid_ca." + std::to_string(ca_idx), "img");
                     ca_idx++;
                 }
             }
 
-            auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
-            const int64_t n_txt_tok = txt->ne[1];                     // for splitting back into img portion below
+            auto txt_img            = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
+            const int64_t n_txt_tok = txt->ne[1];
             for (int i = 0; i < config.depth_single_blocks; i++) {
                 if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
                     continue;
@@ -1138,24 +1112,22 @@ namespace Flux {
                 if (pulid_run && (i % config.pulid_single_interval == 0)) {
                     auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
                         blocks["pulid_ca." + std::to_string(ca_idx)]);
-                    // Split txt_img into [txt | img], inject ID into the img portion
-                    // only, then concatenate back. Matches the PyTorch reference.
                     ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
-                                                          txt_img->ne[0], n_txt_tok, txt_img->ne[2],
-                                                          txt_img->nb[1], txt_img->nb[2],
-                                                          0);
+                                                         txt_img->ne[0], n_txt_tok, txt_img->ne[2],
+                                                         txt_img->nb[1], txt_img->nb[2],
+                                                         0);
                     ggml_tensor* img_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
-                                                          txt_img->ne[0],
-                                                          txt_img->ne[1] - n_txt_tok,
-                                                          txt_img->ne[2],
-                                                          txt_img->nb[1],
-                                                          txt_img->nb[2],
-                                                          n_txt_tok * txt_img->nb[1]);
-                    txt_part = ggml_cont(ctx->ggml_ctx, txt_part);
-                    img_part = ggml_cont(ctx->ggml_ctx, img_part);
-                    ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img_part);
-                    img_part = ggml_add(ctx->ggml_ctx, img_part, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
-                    txt_img = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1);
+                                                         txt_img->ne[0],
+                                                         txt_img->ne[1] - n_txt_tok,
+                                                         txt_img->ne[2],
+                                                         txt_img->nb[1],
+                                                         txt_img->nb[2],
+                                                         n_txt_tok * txt_img->nb[1]);
+                    txt_part              = ggml_cont(ctx->ggml_ctx, txt_part);
+                    img_part              = ggml_cont(ctx->ggml_ctx, img_part);
+                    ggml_tensor* ca_out   = pulid_ca->forward(ctx, pulid_id, img_part);
+                    img_part              = ggml_add(ctx->ggml_ctx, img_part, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
+                    txt_img               = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1);
                     sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.pulid_ca." + std::to_string(ca_idx), "txt_img");
                     ca_idx++;
                 }
@@ -1567,12 +1539,9 @@ namespace Flux {
                 set_backend_tensor_data(dct, dct_vec.data());
             }
 
-            // Materialize the PuLID id embedding into the compute graph when
-            // pulid_id_tensor is non-empty. forward() accepts nullptr for the
-            // no-injection case.
             ggml_tensor* pulid_id = pulid_id_tensor.empty()
-                                      ? nullptr
-                                      : make_input(pulid_id_tensor);
+                                        ? nullptr
+                                        : make_input(pulid_id_tensor);
 
             auto runner_ctx = get_context();
 
diff --git a/src/model/diffusion/model.hpp b/src/model/diffusion/model.hpp
index 76bc0c2af..67f0fee02 100644
--- a/src/model/diffusion/model.hpp
+++ b/src/model/diffusion/model.hpp
@@ -22,9 +22,6 @@ struct SkipLayerDiffusionExtra {
 struct FluxDiffusionExtra {
     const sd::Tensor<float>* guidance   = nullptr;
     const std::vector<int>* skip_layers = nullptr;
-    // PuLID-Flux: precomputed (N=1, num_tokens=32, kv_dim=2048) identity embedding
-    // produced by runtime-scripts/pulid_extract_id.py. nullptr when PuLID is
-    // disabled. id_weight is per-job (typical 0.7-1.2; default 1.0).
     const sd::Tensor<float>* pulid_id   = nullptr;
     float pulid_id_weight               = 1.0f;
 };
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 0544cfb93..1cc7edfce 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -430,14 +430,6 @@ class StableDiffusionGGML {
 
         if (strlen(SAFE_STR(sd_ctx_params->pulid_weights_path)) > 0) {
             LOG_INFO("loading PuLID weights from '%s'", sd_ctx_params->pulid_weights_path);
-            // PuLID's cross-attention (pulid_ca.*) weights are part of the Flux
-            // diffusion model -- its blocks are constructed inside FluxModel when
-            // the tensor map contains pulid_ca.* keys. So they must be merged into
-            // the model loader here, BEFORE the diffusion model is built; that is
-            // why this stays in the ctor rather than in the pulid generation
-            // extension (whose init runs after model construction). The runtime
-            // side -- per-generation id-embedding + per-step injection -- lives in
-            // src/extensions/pulid_extension.cpp.
             if (!model_loader.init_from_file(sd_ctx_params->pulid_weights_path,
                                              "model.diffusion_model.")) {
                 LOG_WARN("loading PuLID weights from '%s' failed", sd_ctx_params->pulid_weights_path);