Skip to content

Commit 1426ede

Browse files
lstein and claude committed
feat: add Qwen Image 2512 txt2img support
Shares the QwenImageEdit base type and infrastructure with the edit model. Key changes: - Text encoder: auto-selects prompt template based on reference images — edit template (drop_idx=64) when images present, generate template (drop_idx=34) when absent - Denoise: detects zero_cond_t to determine whether to concatenate reference latents; txt2img models pass only noisy patches with a single-entry img_shapes - Model config: accept QwenImagePipeline in addition to QwenImageEditPlusPipeline - LoRA: handle "transformer." key prefix from some training frameworks, add to config detection - Starter models: Qwen-Image-2512 full + 4 GGUF variants + Lightning V2.0 LoRAs (4-step, 8-step), all added to the Qwen Image Edit bundle Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 01d6f09 commit 1426ede

4 files changed

Lines changed: 143 additions & 35 deletions

File tree

invokeai/app/invocations/qwen_image_denoise.py

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -353,29 +353,44 @@ def _run_diffusion(self, context: InvocationContext):
353353
# Pack latents into 2x2 patches: (B, C, H, W) -> (B, H/2*W/2, C*4)
354354
latents = self._pack_latents(latents, 1, out_channels, latent_height, latent_width)
355355

356-
# Pack reference image latents and concatenate along the sequence dimension.
357-
# The edit transformer always expects [noisy_patches ; ref_patches] in its sequence.
358-
if ref_latents is not None:
359-
_, ref_ch, rh, rw = ref_latents.shape
360-
if rh != latent_height or rw != latent_width:
361-
ref_latents = torch.nn.functional.interpolate(
362-
ref_latents, size=(latent_height, latent_width), mode="bilinear"
356+
# Determine whether the model uses reference latent conditioning (zero_cond_t).
357+
# Edit models (zero_cond_t=True) expect [noisy_patches ; ref_patches] in the sequence.
358+
# Txt2img models (zero_cond_t=False) only take noisy patches.
359+
has_zero_cond_t = getattr(transformer_info.model, "zero_cond_t", False) or getattr(
360+
transformer_info.model.config, "zero_cond_t", False
361+
)
362+
use_ref_latents = has_zero_cond_t
363+
364+
ref_latents_packed = None
365+
if use_ref_latents:
366+
if ref_latents is not None:
367+
_, ref_ch, rh, rw = ref_latents.shape
368+
if rh != latent_height or rw != latent_width:
369+
ref_latents = torch.nn.functional.interpolate(
370+
ref_latents, size=(latent_height, latent_width), mode="bilinear"
371+
)
372+
else:
373+
# No reference image provided — use zeros so the model still gets the
374+
# expected sequence layout.
375+
ref_latents = torch.zeros(
376+
1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
363377
)
378+
ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
379+
380+
# img_shapes tells the transformer the spatial layout of patches.
381+
if use_ref_latents:
382+
img_shapes = [
383+
[
384+
(1, latent_height // 2, latent_width // 2),
385+
(1, latent_height // 2, latent_width // 2),
386+
]
387+
]
364388
else:
365-
# No reference image provided — use zeros so the model still gets the
366-
# expected sequence layout.
367-
ref_latents = torch.zeros(
368-
1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
369-
)
370-
ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
371-
372-
# img_shapes tells the transformer the spatial layout of noisy and reference patches.
373-
img_shapes = [
374-
[
375-
(1, latent_height // 2, latent_width // 2),
376-
(1, latent_height // 2, latent_width // 2),
389+
img_shapes = [
390+
[
391+
(1, latent_height // 2, latent_width // 2),
392+
]
377393
]
378-
]
379394

380395
# Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
381396
inpaint_mask = self._prep_inpaint_mask(context, noise) # noise has the right 4D shape
@@ -428,8 +443,12 @@ def _run_diffusion(self, context: InvocationContext):
428443
# The pipeline passes timestep / 1000 to the transformer
429444
timestep = t.expand(latents.shape[0]).to(inference_dtype)
430445

431-
# Concatenate noisy and reference patches along the sequence dim
432-
model_input = torch.cat([latents, ref_latents_packed], dim=1)
446+
# For edit models: concatenate noisy and reference patches along the sequence dim
447+
# For txt2img models: just use noisy patches
448+
if ref_latents_packed is not None:
449+
model_input = torch.cat([latents, ref_latents_packed], dim=1)
450+
else:
451+
model_input = latents
433452

434453
noise_pred_cond = transformer(
435454
hidden_states=model_input,

invokeai/app/invocations/qwen_image_text_encoder.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,26 +20,44 @@
2020
QwenImageConditioningInfo,
2121
)
2222

23-
# The Qwen Image Edit pipeline uses a specific system prompt and drops the first
24-
# N tokens (the system prompt prefix) from the embeddings. These constants are
25-
# taken directly from the diffusers QwenImagePipeline.
26-
_SYSTEM_PROMPT = (
23+
# Prompt templates and drop indices for the two Qwen Image model modes.
24+
# These are taken directly from the diffusers pipelines.
25+
26+
# Image editing mode (QwenImageEditPlusPipeline)
26
27+
_EDIT_SYSTEM_PROMPT = (
2728
"Describe the key features of the input image (color, shape, size, texture, objects, background), "
2829
"then explain how the user's text instruction should alter or modify the image. "
2930
"Generate a new image that meets the user's requirements while maintaining consistency "
3031
"with the original input where appropriate."
3132
)
33+
_EDIT_DROP_IDX = 64
34+
35+
# Text-to-image mode (QwenImagePipeline)
36+
_GENERATE_SYSTEM_PROMPT = (
37+
"Describe the image by detailing the color, shape, size, texture, quantity, "
38+
"text, spatial relationships of the objects and background:"
39+
)
40+
_GENERATE_DROP_IDX = 34
41+
3242
_IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
33-
_DROP_IDX = 64
3443

3544

3645
def _build_prompt(user_prompt: str, num_images: int) -> str:
37-
"""Build the full prompt with one vision placeholder per reference image."""
38-
image_tokens = _IMAGE_PLACEHOLDER * max(num_images, 1)
39-
return (
40-
f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n"
41-
f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n"
42-
"<|im_start|>assistant\n"
46+
"""Build the full prompt with the appropriate template based on whether reference images are provided."""
47+
if num_images > 0:
48+
# Edit mode: include vision placeholders for reference images
49+
image_tokens = _IMAGE_PLACEHOLDER * num_images
50+
return (
51+
f"<|im_start|>system\n{_EDIT_SYSTEM_PROMPT}<|im_end|>\n"
52+
f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n"
53+
"<|im_start|>assistant\n"
54+
)
55+
else:
56+
# Generate mode: text-only prompt
57+
return (
58+
f"<|im_start|>system\n{_GENERATE_SYSTEM_PROMPT}<|im_end|>\n"
59+
f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
60+
"<|im_start|>assistant\n"
4361
)
4462

4563

@@ -188,15 +206,18 @@ def _encode(
188206
hidden_states = outputs.hidden_states[-1]
189207

190208
# Extract valid (non-padding) tokens using the attention mask,
191-
# then drop the first _DROP_IDX tokens (system prompt prefix).
209+
# then drop the system prompt prefix tokens.
210+
# The drop index differs between edit mode (64) and generate mode (34).
211+
drop_idx = _EDIT_DROP_IDX if images else _GENERATE_DROP_IDX
212+
192213
attn_mask = model_inputs.attention_mask
193214
bool_mask = attn_mask.bool()
194215
valid_lengths = bool_mask.sum(dim=1)
195216
selected = hidden_states[bool_mask]
196217
split_hidden = torch.split(selected, valid_lengths.tolist(), dim=0)
197218

198219
# Drop system prefix tokens and build padded output
199-
trimmed = [h[_DROP_IDX:] for h in split_hidden]
220+
trimmed = [h[drop_idx:] for h in split_hidden]
200221
attn_mask_list = [torch.ones(h.size(0), dtype=torch.long, device=device) for h in trimmed]
201222
max_seq_len = max(h.size(0) for h in trimmed)
202223

invokeai/backend/model_manager/configs/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,6 +1219,7 @@ def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -
12191219
common_config_paths(mod.path),
12201220
{
12211221
"QwenImagePlusPipeline",
1222+
"QwenImagePipeline",
12221223
},
12231224
)
12241225

invokeai/backend/model_manager/starter_models.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,69 @@ class StarterModelBundle(BaseModel):
711711
"Settings: Steps=8, CFG=1, Shift Override=3.",
712712
type=ModelType.LoRA,
713713
)
714+
715+
# Qwen Image (txt2img)
716+
qwen_image = StarterModel(
717+
name="Qwen Image 2512",
718+
base=BaseModelType.QwenImage,
719+
source="Qwen/Qwen-Image-2512",
720+
description="Qwen Image 2512 full diffusers model. High-quality text-to-image generation. (~40GB)",
721+
type=ModelType.Main,
722+
)
723+
724+
qwen_image_gguf_q4_k_m = StarterModel(
725+
name="Qwen Image 2512 (Q4_K_M)",
726+
base=BaseModelType.QwenImage,
727+
source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q4_K_M.gguf",
728+
description="Qwen Image 2512 - Q4_K_M quantized transformer. Good quality/size balance. (~13GB)",
729+
type=ModelType.Main,
730+
format=ModelFormat.GGUFQuantized,
731+
)
732+
733+
qwen_image_gguf_q2_k = StarterModel(
734+
name="Qwen Image 2512 (Q2_K)",
735+
base=BaseModelType.QwenImage,
736+
source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q2_K.gguf",
737+
description="Qwen Image 2512 - Q2_K heavily quantized transformer. Smallest size, lower quality. (~7.5GB)",
738+
type=ModelType.Main,
739+
format=ModelFormat.GGUFQuantized,
740+
)
741+
742+
qwen_image_gguf_q6_k = StarterModel(
743+
name="Qwen Image 2512 (Q6_K)",
744+
base=BaseModelType.QwenImage,
745+
source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q6_K.gguf",
746+
description="Qwen Image 2512 - Q6_K quantized transformer. Near-lossless quality. (~17GB)",
747+
type=ModelType.Main,
748+
format=ModelFormat.GGUFQuantized,
749+
)
750+
751+
qwen_image_gguf_q8_0 = StarterModel(
752+
name="Qwen Image 2512 (Q8_0)",
753+
base=BaseModelType.QwenImage,
754+
source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q8_0.gguf",
755+
description="Qwen Image 2512 - Q8_0 quantized transformer. Highest quality quantization. (~22GB)",
756+
type=ModelType.Main,
757+
format=ModelFormat.GGUFQuantized,
758+
)
759+
760+
qwen_image_lightning_4step = StarterModel(
761+
name="Qwen Image Lightning (4-step, V2.0, bf16)",
762+
base=BaseModelType.QwenImage,
763+
source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V2.0-bf16.safetensors",
764+
description="Lightning distillation LoRA for Qwen Image — enables generation in just 4 steps. "
765+
"Settings: Steps=4, CFG=1, Shift Override=3.",
766+
type=ModelType.LoRA,
767+
)
768+
769+
qwen_image_lightning_8step = StarterModel(
770+
name="Qwen Image Lightning (8-step, V2.0, bf16)",
771+
base=BaseModelType.QwenImage,
772+
source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V2.0-bf16.safetensors",
773+
description="Lightning distillation LoRA for Qwen Image — enables generation in 8 steps with better quality. "
774+
"Settings: Steps=8, CFG=1, Shift Override=3.",
775+
type=ModelType.LoRA,
776+
)
714777
# endregion
715778

716779
# region SigLIP
@@ -1102,6 +1165,10 @@ class StarterModelBundle(BaseModel):
11021165
qwen_image_gguf_q8_0,
11031166
qwen_image_lightning_4step,
11041167
qwen_image_lightning_8step,
1168+
qwen_image,
1169+
qwen_image_gguf_q4_k_m,
11051172
]
11061173

11071174
STARTER_BUNDLES: dict[str, StarterModelBundle] = {

0 commit comments

Comments
 (0)