leejet · leejet · Jun 15, 2026 · Jun 15, 2026
diff --git a/docs/pulid.md b/docs/pulid.md
@@ -52,14 +52,15 @@ to a `.pulidembd` binary file (about 131 KB). Run it once per source
 person; the same file is reused for any number of generations.
 
 A reference Python script is provided alongside this docs file at
-[`scripts/pulid_extract_id.py`](../scripts/pulid_extract_id.py). It
+[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
 requires:
-- A working CUDA / CPU PyTorch + diffusers stack
-- `insightface`, `facexlib`, `eva-clip`, `torchvision`
+- A working CUDA / CPU PyTorch stack
+- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
+  `huggingface_hub`, `gguf`
 - The PuLID weights file (same one stable-diffusion.cpp will load below)
-- The ToTheBeginning/PuLID repo's `pulid/pipeline_flux.py` (and its
-  dependencies under `pulid/` and `flux/`) -- recommended to vendor
-  rather than pip-install due to upstream packaging quirks
+- The ToTheBeginning/PuLID repo's `pulid/` package (including
+  `pulid/pipeline_flux.py`) and its `eva_clip/` package, both on
+  `PYTHONPATH`; the `flux/` package is not needed for embedding extraction
 
 Run it as:
 

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -417,7 +417,7 @@ ArgOptions SDContextParams::get_options() {
          &photo_maker_path},
         {"",
          "--pulid-weights",
-         "path to PuLID flux weights (e.g. pulid_flux_v0.9.1.safetensors). Identity is injected during the denoise loop when paired with --pulid-id-embedding.",
+         "path to PuLID Flux weights",
          &pulid_weights_path},
         {"",
          "--upscale-model",
@@ -894,7 +894,7 @@ ArgOptions SDGenerationParams::get_options() {
          &pm_id_embed_path},
         {"",
          "--pulid-id-embedding",
-         "path to a .pulidembd binary produced by pulid_extract_id.py. Carries a (32, 2048) identity embedding extracted from a source portrait. Pair with --pulid-weights on the context.",
+         "path to PuLID id embedding",
          &pulid_id_embedding_path},
         {"",
          "--hires-upscaler",
@@ -1048,7 +1048,7 @@ ArgOptions SDGenerationParams::get_options() {
          &pm_style_strength},
         {"",
          "--pulid-id-weight",
-         "strength of PuLID identity injection (default: 1.0). 0.7-1.2 are typical; lower lets the prompt override the face more, higher tightens identity match.",
+         "strength of PuLID identity injection",
          &pulid_id_weight},
         {"",
          "--control-strength",

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -133,10 +133,6 @@ struct SDContextParams {
     std::string control_net_path;
     std::string embedding_dir;
     std::string photo_maker_path;
-    // PuLID-Flux identity-preservation context path: the safetensors blob
-    // carrying the PerceiverAttentionCA cross-attention weights. Loaded
-    // once with the model. Per-generation pulid_id_embedding_path lives in
-    // SDGenerationParams below.
     std::string pulid_weights_path;
     sd_type_t wtype = SD_TYPE_COUNT;
     std::string tensor_type_rules;
@@ -239,9 +235,6 @@ struct SDGenerationParams {
     std::string pm_id_embed_path;
     float pm_style_strength = 20.f;
 
-    // PuLID-Flux: per-generation identity embedding (binary file produced by
-    // runtime-scripts/pulid_extract_id.py). Format documented in
-    // include/stable-diffusion.h sd_pulid_params_t.
     std::string pulid_id_embedding_path;
     float pulid_id_weight = 1.0f;
 

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -195,15 +195,6 @@ typedef struct {
     const sd_embedding_t* embeddings;
     uint32_t embedding_count;
     const char* photo_maker_path;
-    /**
-     * Path to pulid_flux_v0.9.1.safetensors (the PuLID identity-injection
-     * cross-attention weights). When set together with sd_img_gen_params_t.
-     * pulid_params.id_embedding_path, the Flux diffusion model performs PuLID
-     * cross-attention injection during the denoise loop. Loaded once with
-     * the model; the embedding is per-generation. Currently only meaningful
-     * for Flux (depth=19 double, 38 single blocks); silently ignored for
-     * other model versions.
-     */
     const char* pulid_weights_path;
     const char* tensor_type_rules;
     int n_threads;
@@ -282,23 +273,9 @@ typedef struct {
     float style_strength;
 } sd_pm_params_t;  // photo maker
 
-/**
- * PuLID-Flux identity preservation params.
- *
- * Unlike PhotoMaker (which extracts the ID embedding inside the inference
- * process from a directory of images), PuLID's ID extraction is a heavy
- * Python-only stack (insightface ArcFace + EVA-CLIP-L + IDFormer). To stay
- * cross-vendor in C++/Vulkan, sd.cpp consumes a precomputed binary file
- * produced by an external tool (runtime-scripts/pulid_extract_id.py in the
- * Cloudhands client tree).
- *
- * Format: a gguf container with a single tensor "pulid_id" of shape
- * [token_dim, num_tokens] (ggml order; typically [2048, 32]) in F16/F32/BF16.
- * Loaded with the standard gguf reader; see docs/pulid.md.
- */
 typedef struct {
-    const char* id_embedding_path;  // path to .pulidembd file produced by pulid_extract_id.py
-    float id_weight;                // strength of the ID injection; typical 0.7-1.2, default 1.0
+    const char* id_embedding_path;
+    float id_weight;
 } sd_pulid_params_t;
 
 enum sd_cache_mode_t {

diff --git a/scripts/pulid_extract_id.py → script/pulid_extract_id.py b/scripts/pulid_extract_id.py → script/pulid_extract_id.py
@@ -2,26 +2,18 @@
 Precompute a PuLID-Flux identity embedding from a single source portrait.
 
 Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
-`--pulid-id-embedding` flag consumes. See docs/pulid.md for the format and
-overall PuLID-Flux flow.
-
-This script intentionally lives outside the C++ build: identity extraction
-needs insightface + EVA-CLIP-L + IDFormer, which are PyTorch-only stacks
-that would be impractical to reimplement in ggml just to run once per
-source person. The C++ side downstream of this file is cross-vendor and
-backend-agnostic.
+`--pulid-id-embedding` flag consumes.
 
 Dependencies (recommended: vendor rather than pip-install due to upstream
 packaging quirks):
   - torch + safetensors
-  - The ToTheBeginning/PuLID repository's `pulid/pipeline_flux.py` and
-    its sibling packages (`flux/`, `eva_clip/`, `models/`). Put them on
-    PYTHONPATH or sys.path before running this script.
-  - insightface, facexlib (PuLID pipeline pulls these in)
+  - The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
+    Put them on PYTHONPATH or sys.path before running this script.
+  - insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf
   - numpy, Pillow
 
 Usage:
-  python pulid_extract_id.py \\
+  python script/pulid_extract_id.py \\
     --portrait /path/to/source-photo.jpg \\
     --pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
     --out /path/to/source.pulidembd
@@ -35,21 +27,7 @@
 import argparse
 import os
 import sys
-
-
-def _make_minimal_flux_skeleton(device):
-    """PuLIDPipeline expects a `dit` (Flux transformer) to attach its
-    PerceiverAttentionCA modules to during construction. We never run a
-    forward pass on it -- the encoders alone (which is what we actually
-    need) live on the pipeline object, not the dit. So we instantiate a
-    real Flux skeleton with default params and never load its weights."""
-    import torch
-    from flux.model import Flux
-    from flux.util import configs
-
-    with torch.device("cpu"):
-        model = Flux(configs["flux-dev"].params).to(torch.bfloat16)
-    return model
+from types import SimpleNamespace
 
 
 def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
@@ -65,18 +43,17 @@ def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
 
     print(f"device={device}", flush=True)
 
-    print("constructing minimal Flux skeleton (no weights loaded)", flush=True)
-    dit = _make_minimal_flux_skeleton(device)
-
-    print("instantiating PuLIDPipeline", flush=True)
-    pulid = PuLIDPipeline(dit=dit, device=device,
+    # PuLIDPipeline only attaches pulid_ca attributes to `dit` during
+    # construction; get_id_embedding() never runs Flux, so a dummy object is
+    # enough and avoids importing/building a Flux skeleton.
+    print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
+    dit = SimpleNamespace()
+    pulid = PuLIDPipeline(dit=dit,
+                          device=device,
                           weight_dtype=torch.bfloat16,
                           onnx_provider=onnx_provider)
 
     print(f"loading PuLID weights from {pulid_weights}", flush=True)
-    # PuLIDPipeline.load_pretrain expects a "version" string used to construct
-    # the default filename when pretrain_path is None. We pass the file
-    # directly so the version string is informational only.
     pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")
 
     print(f"extracting ID embedding from {portrait_path}", flush=True)
@@ -100,10 +77,6 @@ def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
 
     os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
 
-    # The embedding ships as a standard gguf container holding a single tensor
-    # named "pulid_id". numpy is row-major (num_tokens, token_dim); gguf stores
-    # dims reversed, so stable-diffusion.cpp reads it back as
-    # ne[0]=token_dim, ne[1]=num_tokens (see load_pulid_id_embedding).
     writer = gguf.GGUFWriter(out_path, arch="pulid")
     writer.add_uint32("pulid.version", 1)
 

diff --git a/src/extensions/pulid_extension.cpp b/src/extensions/pulid_extension.cpp
@@ -7,24 +7,15 @@
 #include "core/util.h"
 #include "gguf.h"
 
-// Load the precomputed PuLID identity embedding produced by
-// scripts/pulid_extract_id.py into a sd::Tensor<float> (always materialized as
-// fp32 for the diffusion path). Returns an empty tensor on any failure (the
-// caller treats empty as "PuLID off").
-//
-// The file is a standard gguf container holding a single tensor named
-// "pulid_id" with shape [token_dim, num_tokens] (ggml order; typically
-// [2048, 32]) in f16 / bf16 / f32. Using gguf rather than a bespoke header
-// means the shape + dtype are self-describing and we reuse ggml's reader.
 static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
     sd::Tensor<float> empty;
     if (path == nullptr || strlen(path) == 0) {
         return empty;
     }
 
-    struct ggml_context* ctx_data   = nullptr;
-    struct gguf_init_params gp       = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data};
-    struct gguf_context* gguf_ctx    = gguf_init_from_file(path, gp);
+    struct ggml_context* ctx_data = nullptr;
+    struct gguf_init_params gp    = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data};
+    struct gguf_context* gguf_ctx = gguf_init_from_file(path, gp);
     if (gguf_ctx == nullptr || ctx_data == nullptr) {
         LOG_WARN("PuLID id-embedding: cannot read gguf '%s'", path);
         if (gguf_ctx != nullptr)
@@ -83,20 +74,9 @@ static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
     return out;
 }
 
-// PuLID-Flux identity injection as a generation extension.
-//
-// Unlike PhotoMaker, PuLID does NOT modify the conditioning -- it injects an
-// identity embedding via cross-attention *inside* the Flux denoise forward (the
-// pulid_ca.* blocks). Those cross-attention weights are part of the Flux
-// diffusion model and are loaded into the model tensor map before the model is
-// constructed (see SDImpl ctor, gated on sd_ctx_params.pulid_weights_path), so
-// this extension does not own a separate model. Its job is purely runtime:
-//   - prepare_condition: load the per-generation id-embedding file.
-//   - before_diffusion:  hand that embedding (+ weight) to FluxDiffusionExtra,
-//                        which flux.hpp reads to drive the pulid_ca injection.
 struct PuLIDExtension : public GenerationExtension {
     bool enabled = false;
-    sd::Tensor<float> id_embedding;  // per-generation; empty when PuLID is off for this request
+    sd::Tensor<float> id_embedding;
     float id_weight = 1.0f;
 
     const char* name() const override {