diff --git a/docs/pulid.md b/docs/pulid.md index 4a72ea617..b7eec37e1 100644 --- a/docs/pulid.md +++ b/docs/pulid.md @@ -52,14 +52,15 @@ to a `.pulidembd` binary file (about 131 KB). Run it once per source person; the same file is reused for any number of generations. A reference Python script is provided alongside this docs file at -[`scripts/pulid_extract_id.py`](../scripts/pulid_extract_id.py). It +[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It requires: -- A working CUDA / CPU PyTorch + diffusers stack -- `insightface`, `facexlib`, `eva-clip`, `torchvision` +- A working CUDA / CPU PyTorch stack +- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`, + `huggingface_hub`, `gguf` - The PuLID weights file (same one stable-diffusion.cpp will load below) -- The ToTheBeginning/PuLID repo's `pulid/pipeline_flux.py` (and its - dependencies under `pulid/` and `flux/`) -- recommended to vendor - rather than pip-install due to upstream packaging quirks +- The ToTheBeginning/PuLID repo's `pulid/` package (including + `pulid/pipeline_flux.py`) and `eva_clip/` package on `PYTHONPATH`; `flux/` + is not needed for embedding extraction Run it as: diff --git a/examples/common/common.cpp b/examples/common/common.cpp index ba51d8cce..dd5d35055 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -417,7 +417,7 @@ ArgOptions SDContextParams::get_options() { &photo_maker_path}, {"", "--pulid-weights", - "path to PuLID flux weights (e.g. pulid_flux_v0.9.1.safetensors). Identity is injected during the denoise loop when paired with --pulid-id-embedding.", + "path to PuLID Flux weights", &pulid_weights_path}, {"", "--upscale-model", @@ -894,7 +894,7 @@ ArgOptions SDGenerationParams::get_options() { &pm_id_embed_path}, {"", "--pulid-id-embedding", - "path to a .pulidembd binary produced by pulid_extract_id.py. Carries a (32, 2048) identity embedding extracted from a source portrait. 
Pair with --pulid-weights on the context.", + "path to PuLID id embedding", &pulid_id_embedding_path}, {"", "--hires-upscaler", @@ -1048,7 +1048,7 @@ ArgOptions SDGenerationParams::get_options() { &pm_style_strength}, {"", "--pulid-id-weight", - "strength of PuLID identity injection (default: 1.0). 0.7-1.2 are typical; lower lets the prompt override the face more, higher tightens identity match.", + "strength of PuLID identity injection", &pulid_id_weight}, {"", "--control-strength", diff --git a/examples/common/common.h b/examples/common/common.h index 0c0febc38..fcf9840db 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -133,10 +133,6 @@ struct SDContextParams { std::string control_net_path; std::string embedding_dir; std::string photo_maker_path; - // PuLID-Flux identity-preservation context path: the safetensors blob - // carrying the PerceiverAttentionCA cross-attention weights. Loaded - // once with the model. Per-generation pulid_id_embedding_path lives in - // SDGenerationParams below. std::string pulid_weights_path; sd_type_t wtype = SD_TYPE_COUNT; std::string tensor_type_rules; @@ -239,9 +235,6 @@ struct SDGenerationParams { std::string pm_id_embed_path; float pm_style_strength = 20.f; - // PuLID-Flux: per-generation identity embedding (binary file produced by - // runtime-scripts/pulid_extract_id.py). Format documented in - // include/stable-diffusion.h sd_pulid_params_t. std::string pulid_id_embedding_path; float pulid_id_weight = 1.0f; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index c813b0f02..2f2851c2e 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -195,15 +195,6 @@ typedef struct { const sd_embedding_t* embeddings; uint32_t embedding_count; const char* photo_maker_path; - /** - * Path to pulid_flux_v0.9.1.safetensors (the PuLID identity-injection - * cross-attention weights). When set together with sd_img_gen_params_t. 
- * pulid_params.id_embedding_path, the Flux diffusion model performs PuLID - * cross-attention injection during the denoise loop. Loaded once with - * the model; the embedding is per-generation. Currently only meaningful - * for Flux (depth=19 double, 38 single blocks); silently ignored for - * other model versions. - */ const char* pulid_weights_path; const char* tensor_type_rules; int n_threads; @@ -282,23 +273,9 @@ typedef struct { float style_strength; } sd_pm_params_t; // photo maker -/** - * PuLID-Flux identity preservation params. - * - * Unlike PhotoMaker (which extracts the ID embedding inside the inference - * process from a directory of images), PuLID's ID extraction is a heavy - * Python-only stack (insightface ArcFace + EVA-CLIP-L + IDFormer). To stay - * cross-vendor in C++/Vulkan, sd.cpp consumes a precomputed binary file - * produced by an external tool (runtime-scripts/pulid_extract_id.py in the - * Cloudhands client tree). - * - * Format: a gguf container with a single tensor "pulid_id" of shape - * [token_dim, num_tokens] (ggml order; typically [2048, 32]) in F16/F32/BF16. - * Loaded with the standard gguf reader; see docs/pulid.md. - */ typedef struct { - const char* id_embedding_path; // path to .pulidembd file produced by pulid_extract_id.py - float id_weight; // strength of the ID injection; typical 0.7-1.2, default 1.0 + const char* id_embedding_path; + float id_weight; } sd_pulid_params_t; enum sd_cache_mode_t { diff --git a/scripts/pulid_extract_id.py b/script/pulid_extract_id.py similarity index 67% rename from scripts/pulid_extract_id.py rename to script/pulid_extract_id.py index f887260f0..aca52e24c 100644 --- a/scripts/pulid_extract_id.py +++ b/script/pulid_extract_id.py @@ -2,26 +2,18 @@ Precompute a PuLID-Flux identity embedding from a single source portrait. Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's -`--pulid-id-embedding` flag consumes. 
See docs/pulid.md for the format and -overall PuLID-Flux flow. - -This script intentionally lives outside the C++ build: identity extraction -needs insightface + EVA-CLIP-L + IDFormer, which are PyTorch-only stacks -that would be impractical to reimplement in ggml just to run once per -source person. The C++ side downstream of this file is cross-vendor and -backend-agnostic. +`--pulid-id-embedding` flag consumes. Dependencies (recommended: vendor rather than pip-install due to upstream packaging quirks): - torch + safetensors - - The ToTheBeginning/PuLID repository's `pulid/pipeline_flux.py` and - its sibling packages (`flux/`, `eva_clip/`, `models/`). Put them on - PYTHONPATH or sys.path before running this script. - - insightface, facexlib (PuLID pipeline pulls these in) + - The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`. + Put them on PYTHONPATH or sys.path before running this script. + - insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf - numpy, Pillow Usage: - python pulid_extract_id.py \\ + python script/pulid_extract_id.py \\ --portrait /path/to/source-photo.jpg \\ --pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\ --out /path/to/source.pulidembd @@ -35,21 +27,7 @@ import argparse import os import sys - - -def _make_minimal_flux_skeleton(device): - """PuLIDPipeline expects a `dit` (Flux transformer) to attach its - PerceiverAttentionCA modules to during construction. We never run a - forward pass on it -- the encoders alone (which is what we actually - need) live on the pipeline object, not the dit. 
So we instantiate a - real Flux skeleton with default params and never load its weights.""" - import torch - from flux.model import Flux - from flux.util import configs - - with torch.device("cpu"): - model = Flux(configs["flux-dev"].params).to(torch.bfloat16) - return model +from types import SimpleNamespace def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor": @@ -65,18 +43,17 @@ def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor": print(f"device={device}", flush=True) - print("constructing minimal Flux skeleton (no weights loaded)", flush=True) - dit = _make_minimal_flux_skeleton(device) - - print("instantiating PuLIDPipeline", flush=True) - pulid = PuLIDPipeline(dit=dit, device=device, + # PuLIDPipeline only attaches pulid_ca attributes to `dit` during + # construction; get_id_embedding() never runs Flux, so a dummy object is + # enough and avoids importing/building a Flux skeleton. + print("instantiating PuLIDPipeline with a dummy Flux object", flush=True) + dit = SimpleNamespace() + pulid = PuLIDPipeline(dit=dit, + device=device, weight_dtype=torch.bfloat16, onnx_provider=onnx_provider) print(f"loading PuLID weights from {pulid_weights}", flush=True) - # PuLIDPipeline.load_pretrain expects a "version" string used to construct - # the default filename when pretrain_path is None. We pass the file - # directly so the version string is informational only. pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1") print(f"extracting ID embedding from {portrait_path}", flush=True) @@ -100,10 +77,6 @@ def write_embd(tensor, out_path: str, dtype_choice: str) -> None: os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True) - # The embedding ships as a standard gguf container holding a single tensor - # named "pulid_id". numpy is row-major (num_tokens, token_dim); gguf stores - # dims reversed, so stable-diffusion.cpp reads it back as - # ne[0]=token_dim, ne[1]=num_tokens (see load_pulid_id_embedding). 
writer = gguf.GGUFWriter(out_path, arch="pulid") writer.add_uint32("pulid.version", 1) diff --git a/src/extensions/pulid_extension.cpp b/src/extensions/pulid_extension.cpp index 51c736b49..d529e5710 100644 --- a/src/extensions/pulid_extension.cpp +++ b/src/extensions/pulid_extension.cpp @@ -7,24 +7,15 @@ #include "core/util.h" #include "gguf.h" -// Load the precomputed PuLID identity embedding produced by -// scripts/pulid_extract_id.py into a sd::Tensor (always materialized as -// fp32 for the diffusion path). Returns an empty tensor on any failure (the -// caller treats empty as "PuLID off"). -// -// The file is a standard gguf container holding a single tensor named -// "pulid_id" with shape [token_dim, num_tokens] (ggml order; typically -// [2048, 32]) in f16 / bf16 / f32. Using gguf rather than a bespoke header -// means the shape + dtype are self-describing and we reuse ggml's reader. static sd::Tensor load_pulid_id_embedding(const char* path) { sd::Tensor empty; if (path == nullptr || strlen(path) == 0) { return empty; } - struct ggml_context* ctx_data = nullptr; - struct gguf_init_params gp = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data}; - struct gguf_context* gguf_ctx = gguf_init_from_file(path, gp); + struct ggml_context* ctx_data = nullptr; + struct gguf_init_params gp = {/*.no_alloc =*/false, /*.ctx =*/&ctx_data}; + struct gguf_context* gguf_ctx = gguf_init_from_file(path, gp); if (gguf_ctx == nullptr || ctx_data == nullptr) { LOG_WARN("PuLID id-embedding: cannot read gguf '%s'", path); if (gguf_ctx != nullptr) @@ -83,20 +74,9 @@ static sd::Tensor load_pulid_id_embedding(const char* path) { return out; } -// PuLID-Flux identity injection as a generation extension. -// -// Unlike PhotoMaker, PuLID does NOT modify the conditioning -- it injects an -// identity embedding via cross-attention *inside* the Flux denoise forward (the -// pulid_ca.* blocks). 
Those cross-attention weights are part of the Flux -// diffusion model and are loaded into the model tensor map before the model is -// constructed (see SDImpl ctor, gated on sd_ctx_params.pulid_weights_path), so -// this extension does not own a separate model. Its job is purely runtime: -// - prepare_condition: load the per-generation id-embedding file. -// - before_diffusion: hand that embedding (+ weight) to FluxDiffusionExtra, -// which flux.hpp reads to drive the pulid_ca injection. struct PuLIDExtension : public GenerationExtension { bool enabled = false; - sd::Tensor id_embedding; // per-generation; empty when PuLID is off for this request + sd::Tensor id_embedding; float id_weight = 1.0f; const char* name() const override { diff --git a/src/model/adapter/pulid.hpp b/src/model/adapter/pulid.hpp index 74796a231..442c5b8b2 100644 --- a/src/model/adapter/pulid.hpp +++ b/src/model/adapter/pulid.hpp @@ -4,125 +4,71 @@ #include "core/ggml_extend.hpp" #include "model/common/block.hpp" -/** - * PuLID-Flux identity injection for stable-diffusion.cpp. - * - * Mirrors the PerceiverAttentionCA module from - * https://github.com/ToTheBeginning/PuLID/blob/main/pulid/encoders_transformer.py - * - * Each instance is a cross-attention layer where: - * Q comes from image tokens (dim = 3072 = Flux hidden_size) - * K, V come from a precomputed ID embedding (kv_dim = 2048, num_tokens = 32) - * - * 14 instances are inserted into the Flux denoise loop at fixed intervals: - * - Every 2nd of the 19 double_blocks (10 hook points) - * - Every 4th of the 38 single_blocks (10 hook points... 
but the v0.9.1 - * reference uses 4 single hooks, for 14 total) - * - * Weight key prefix in pulid_flux_v0.9.1.safetensors: - * pulid_ca..norm1.{weight,bias} - * pulid_ca..norm2.{weight,bias} - * pulid_ca..to_q.weight - * pulid_ca..to_kv.weight - * pulid_ca..to_out.weight - * - * Pure-ggml implementation: all ops have Vulkan / CUDA / Metal kernels in - * the upstream ggml backends, so this works cross-vendor by construction. - */ class PuLIDPerceiverAttentionCA : public GGMLBlock { public: - static constexpr int64_t DEFAULT_DIM = 3072; // Flux hidden size + static constexpr int64_t DEFAULT_DIM = 3072; // Flux hidden size static constexpr int64_t DEFAULT_DIM_HEAD = 128; - static constexpr int64_t DEFAULT_HEADS = 16; - static constexpr int64_t DEFAULT_KV_DIM = 2048; // PuLID ID-embedding dim + static constexpr int64_t DEFAULT_HEADS = 16; + static constexpr int64_t DEFAULT_KV_DIM = 2048; // PuLID ID-embedding dim protected: int64_t dim; int64_t dim_head; int64_t heads; int64_t kv_dim; - int64_t inner_dim; // dim_head * heads = 2048 + int64_t inner_dim; public: - PuLIDPerceiverAttentionCA(int64_t dim = DEFAULT_DIM, - int64_t dim_head = DEFAULT_DIM_HEAD, - int64_t heads = DEFAULT_HEADS, - int64_t kv_dim = DEFAULT_KV_DIM) + PuLIDPerceiverAttentionCA(int64_t dim = DEFAULT_DIM, + int64_t dim_head = DEFAULT_DIM_HEAD, + int64_t heads = DEFAULT_HEADS, + int64_t kv_dim = DEFAULT_KV_DIM) : dim(dim), dim_head(dim_head), heads(heads), kv_dim(kv_dim), inner_dim(dim_head * heads) { - // Note the PyTorch reference's surprising signature: - // norm1 operates on x (the id_embedding side, kv_dim wide) - // norm2 operates on latents (the image tokens, dim wide) - // to_q consumes latents (dim -> inner_dim) - // to_kv consumes x (kv_dim -> 2*inner_dim) - // to_out projects (inner_dim -> dim) blocks["norm1"] = std::shared_ptr(new LayerNorm(kv_dim)); blocks["norm2"] = std::shared_ptr(new LayerNorm(dim)); - blocks["to_q"] = std::shared_ptr(new Linear(dim, inner_dim, /*bias=*/false)); + 
blocks["to_q"] = std::shared_ptr(new Linear(dim, inner_dim, /*bias=*/false)); blocks["to_kv"] = std::shared_ptr(new Linear(kv_dim, inner_dim * 2, /*bias=*/false)); - blocks["to_out"] = std::shared_ptr(new Linear(inner_dim, dim, /*bias=*/false)); + blocks["to_out"] = std::shared_ptr(new Linear(inner_dim, dim, /*bias=*/false)); } - /** - * Compute: residual_to_image = PerceiverAttentionCA(id_embedding, image_tokens) - * - * Inputs: - * id_embedding [N, n_id_tokens=32, kv_dim=2048] - * image_tokens [N, n_img_tokens, dim=3072] - * - * Returns: - * [N, n_img_tokens, dim=3072] -- to be added to image_tokens by the caller, - * scaled by id_weight. - */ ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* id_embedding, - ggml_tensor* image_tokens) { + ggml_tensor* id_embedding, + ggml_tensor* image_tokens) { auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); auto to_q = std::dynamic_pointer_cast(blocks["to_q"]); auto to_kv = std::dynamic_pointer_cast(blocks["to_kv"]); auto to_out = std::dynamic_pointer_cast(blocks["to_out"]); - // Normalize each input on its own dim. The PyTorch reference normalizes - // x (id_embedding) and `latents` (image_tokens) separately, then uses - // latents for Q and x for K/V -- mind the unusual cross-attention shape. - ggml_tensor* x_normed = norm1->forward(ctx, id_embedding); // [N, 32, 2048] - ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens); // [N, T_img, 3072] + ggml_tensor* x_normed = norm1->forward(ctx, id_embedding); + ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens); - // Projections. to_q : 3072 -> 2048 ; to_kv : 2048 -> 4096 (k concat v). 
- ggml_tensor* q = to_q->forward(ctx, lat_normed); // [N, T_img, 2048] - ggml_tensor* kv = to_kv->forward(ctx, x_normed); // [N, 32, 4096] + ggml_tensor* q = to_q->forward(ctx, lat_normed); // [N, T_img, 2048] + ggml_tensor* kv = to_kv->forward(ctx, x_normed); // [N, 32, 4096] - // Split KV into K (first inner_dim of last axis) and V (second - // inner_dim). ggml_view_3d gives strided views without copying; - // ggml_cont materializes them so ggml_ext_attention_ext sees - // contiguous tensors. ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv, - inner_dim, kv->ne[1], kv->ne[2], - kv->nb[1], kv->nb[2], - /*offset=*/0); // [N, 32, 2048] + inner_dim, kv->ne[1], kv->ne[2], + kv->nb[1], kv->nb[2], + /*offset=*/0); ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv, - inner_dim, kv->ne[1], kv->ne[2], - kv->nb[1], kv->nb[2], - /*offset=*/inner_dim * ggml_element_size(kv)); // [N, 32, 2048] - k = ggml_cont(ctx->ggml_ctx, k); - v = ggml_cont(ctx->ggml_ctx, v); + inner_dim, kv->ne[1], kv->ne[2], + kv->nb[1], kv->nb[2], + /*offset=*/inner_dim * ggml_element_size(kv)); + k = ggml_cont(ctx->ggml_ctx, k); + v = ggml_cont(ctx->ggml_ctx, v); - // Standard multi-head attention. ggml_ext_attention_ext expects - // [N, n_token, embed_dim] and reshapes into heads internally. - // n_head = heads (=16), per-head dim = inner_dim / heads (=128). ggml_tensor* attn_out = ggml_ext_attention_ext( ctx->ggml_ctx, ctx->backend, q, k, v, heads, /*mask=*/nullptr, - /*diag_mask_inf=*/false); // [N, T_img, inner_dim=2048] + /*diag_mask_inf=*/false); - // Project back to image-token width (3072). 
- ggml_tensor* out = to_out->forward(ctx, attn_out); // [N, T_img, 3072] + ggml_tensor* out = to_out->forward(ctx, attn_out); return out; } }; diff --git a/src/model/diffusion/flux.hpp b/src/model/diffusion/flux.hpp index 9986bb8d7..b5e6c63bf 100644 --- a/src/model/diffusion/flux.hpp +++ b/src/model/diffusion/flux.hpp @@ -50,12 +50,9 @@ namespace Flux { float ref_index_scale = 1.f; ChromaRadianceConfig chroma_radiance_params; - // PuLID-Flux identity injection. Turned on by the runner when a - // --pulid-weights path is provided. The intervals are fixed by the - // PuLID v0.9.1 architecture (every 2nd double, every 4th single). - bool pulid_enabled = false; - int pulid_double_interval = 2; - int pulid_single_interval = 4; + bool pulid_enabled = false; + int pulid_double_interval = 2; + int pulid_single_interval = 4; static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix, @@ -146,10 +143,6 @@ namespace Flux { if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) { head_dim = tensor_storage.ne[0]; } - // PuLID weights live alongside the diffusion model under the same - // prefix (pulid_ca..) when the pulid loader merges them in - // (see stable-diffusion.cpp). Spotting any pulid_ca.* key flips the - // flag so the Flux ctor builds the pulid_ca. child blocks. if (name.find("pulid_ca.") != std::string::npos) { config.pulid_enabled = true; } @@ -973,26 +966,17 @@ namespace Flux { blocks["single_stream_modulation"] = std::make_shared(config.hidden_size, false, !config.disable_bias); } - // PuLID-Flux identity-injection cross-attention modules. Only constructed - // when config.pulid_enabled is set (turned on by the runner after seeing a - // --pulid-weights path during model load). Counts come straight from PuLID - // v0.9.1's pipeline_flux.py: every `pulid_double_interval` double block - // (=2) and every `pulid_single_interval` single block (=4). 
For a stock - // Flux Dev (depth=19, depth_single_blocks=38), this means 10 + 10 = 20 - // hook points... but the reference uses ceil-rounding so the actual count - // is `ceil(depth/2) + ceil(depth_single_blocks/4)` = 10 + 10 = 20. PuLID - // v0.9.1 trained weights have 20 entries. if (config.pulid_enabled) { - int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval; - int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval; + int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval; + int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval; int num_ca = num_double_ca + num_single_ca; for (int i = 0; i < num_ca; i++) { blocks["pulid_ca." + std::to_string(i)] = std::shared_ptr(new PuLIDPerceiverAttentionCA( - /*dim=*/ config.hidden_size, + /*dim=*/config.hidden_size, /*dim_head=*/PuLIDPerceiverAttentionCA::DEFAULT_DIM_HEAD, - /*heads=*/ PuLIDPerceiverAttentionCA::DEFAULT_HEADS, - /*kv_dim=*/ PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM)); + /*heads=*/PuLIDPerceiverAttentionCA::DEFAULT_HEADS, + /*kv_dim=*/PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM)); } } } @@ -1007,7 +991,7 @@ namespace Flux { ggml_tensor* mod_index_arange = nullptr, std::vector skip_layers = {}, ggml_tensor* pulid_id = nullptr, - float pulid_id_weight = 1.0f) { + float pulid_id_weight = 1.0f) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); @@ -1084,22 +1068,12 @@ namespace Flux { sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt"); sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec"); - // PuLID identity injection: mirrors ToTheBeginning/PuLID - // pulid/encoders_transformer.py + flux/model.py. 
The CA layers - // run *between* transformer blocks, with their output added to - // img (scaled by id_weight) at every `pulid_double_interval`-th - // double_block and every `pulid_single_interval`-th single_block. - // - // skip_layers + PuLID is NOT a supported combination -- skipping - // a block at a PuLID-aligned index would either misalign the - // ca_idx assignment (silent quality regression) or require us - // to invent a non-reference index policy. Refuse early instead. const bool pulid_active = config.pulid_enabled && pulid_id != nullptr; if (pulid_active && !skip_layers.empty()) { LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation."); } const bool pulid_run = pulid_active && skip_layers.empty(); - int ca_idx = 0; + int ca_idx = 0; for (int i = 0; i < config.depth; i++) { if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) { @@ -1117,15 +1091,15 @@ namespace Flux { if (pulid_run && (i % config.pulid_double_interval == 0)) { auto pulid_ca = std::dynamic_pointer_cast( blocks["pulid_ca." + std::to_string(ca_idx)]); - ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img); // [N, n_img_token, hidden_size] - img = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight)); + ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img); // [N, n_img_token, hidden_size] + img = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight)); sd::ggml_graph_cut::mark_graph_cut(img, "flux.pulid_ca." 
+ std::to_string(ca_idx), "img"); ca_idx++; } } - auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size] - const int64_t n_txt_tok = txt->ne[1]; // for splitting back into img portion below + auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size] + const int64_t n_txt_tok = txt->ne[1]; for (int i = 0; i < config.depth_single_blocks; i++) { if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) { continue; @@ -1138,24 +1112,22 @@ namespace Flux { if (pulid_run && (i % config.pulid_single_interval == 0)) { auto pulid_ca = std::dynamic_pointer_cast( blocks["pulid_ca." + std::to_string(ca_idx)]); - // Split txt_img into [txt | img], inject ID into the img portion - // only, then concatenate back. Matches the PyTorch reference. ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img, - txt_img->ne[0], n_txt_tok, txt_img->ne[2], - txt_img->nb[1], txt_img->nb[2], - 0); + txt_img->ne[0], n_txt_tok, txt_img->ne[2], + txt_img->nb[1], txt_img->nb[2], + 0); ggml_tensor* img_part = ggml_view_3d(ctx->ggml_ctx, txt_img, - txt_img->ne[0], - txt_img->ne[1] - n_txt_tok, - txt_img->ne[2], - txt_img->nb[1], - txt_img->nb[2], - n_txt_tok * txt_img->nb[1]); - txt_part = ggml_cont(ctx->ggml_ctx, txt_part); - img_part = ggml_cont(ctx->ggml_ctx, img_part); - ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img_part); - img_part = ggml_add(ctx->ggml_ctx, img_part, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight)); - txt_img = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1); + txt_img->ne[0], + txt_img->ne[1] - n_txt_tok, + txt_img->ne[2], + txt_img->nb[1], + txt_img->nb[2], + n_txt_tok * txt_img->nb[1]); + txt_part = ggml_cont(ctx->ggml_ctx, txt_part); + img_part = ggml_cont(ctx->ggml_ctx, img_part); + ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img_part); + img_part = ggml_add(ctx->ggml_ctx, img_part, 
ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight)); + txt_img = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1); sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.pulid_ca." + std::to_string(ca_idx), "txt_img"); ca_idx++; } @@ -1567,12 +1539,9 @@ namespace Flux { set_backend_tensor_data(dct, dct_vec.data()); } - // Materialize the PuLID id embedding into the compute graph when - // pulid_id_tensor is non-empty. forward() accepts nullptr for the - // no-injection case. ggml_tensor* pulid_id = pulid_id_tensor.empty() - ? nullptr - : make_input(pulid_id_tensor); + ? nullptr + : make_input(pulid_id_tensor); auto runner_ctx = get_context(); diff --git a/src/model/diffusion/model.hpp b/src/model/diffusion/model.hpp index 76bc0c2af..67f0fee02 100644 --- a/src/model/diffusion/model.hpp +++ b/src/model/diffusion/model.hpp @@ -22,9 +22,6 @@ struct SkipLayerDiffusionExtra { struct FluxDiffusionExtra { const sd::Tensor* guidance = nullptr; const std::vector* skip_layers = nullptr; - // PuLID-Flux: precomputed (N=1, num_tokens=32, kv_dim=2048) identity embedding - // produced by runtime-scripts/pulid_extract_id.py. nullptr when PuLID is - // disabled. id_weight is per-job (typical 0.7-1.2; default 1.0). const sd::Tensor* pulid_id = nullptr; float pulid_id_weight = 1.0f; }; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 0544cfb93..1cc7edfce 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -430,14 +430,6 @@ class StableDiffusionGGML { if (strlen(SAFE_STR(sd_ctx_params->pulid_weights_path)) > 0) { LOG_INFO("loading PuLID weights from '%s'", sd_ctx_params->pulid_weights_path); - // PuLID's cross-attention (pulid_ca.*) weights are part of the Flux - // diffusion model -- its blocks are constructed inside FluxModel when - // the tensor map contains pulid_ca.* keys. 
So they must be merged into - // the model loader here, BEFORE the diffusion model is built; that is - // why this stays in the ctor rather than in the pulid generation - // extension (whose init runs after model construction). The runtime - // side -- per-generation id-embedding + per-step injection -- lives in - // src/extensions/pulid_extension.cpp. if (!model_loader.init_from_file(sd_ctx_params->pulid_weights_path, "model.diffusion_model.")) { LOG_WARN("loading PuLID weights from '%s' failed", sd_ctx_params->pulid_weights_path);