
Commit 09f01d3

update zoe qwen-image (#1012)
1 parent 6db002f commit 09f01d3

5 files changed

Lines changed: 137 additions & 25 deletions

File tree

configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json
configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json
lightx2v/models/schedulers/qwen_image/scheduler.py
two new run scripts (below)

configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json (new file)

Lines changed: 11 additions & 0 deletions

{
    "infer_steps": 4,
    "max_custom_size": 4096,
    "prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
    "prompt_template_encode_start_idx": 34,
    "attn_type": "sage_attn2",
    "enable_cfg": false,
    "dit_original_ckpt": "/data/nvme1/yongyang/ccc/models/distill_zoe_diff_qwen_image_data_680w_neo_prompt_res2k_3kiter_multi_large_char_200iter_step4.safetensors",
    "sample_shift": 5.0,
    "zoe_style_noise": true
}
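The last two keys are what this commit is about: "zoe_style_noise" switches latent initialization to Zoe's packed sampling order, and "sample_shift" selects a fixed linear time shift (see the scheduler diff below). As a rough sketch, not part of the commit, the 4-step schedule this config yields can be computed directly; the 1000 train timesteps are an assumption matching the usual flow-matching default:

import numpy as np

# Values from this config; formula from time_shift_linear in the scheduler diff below.
infer_steps, sample_shift = 4, 5.0
sigmas = np.linspace(1.0, 1 / infer_steps, infer_steps)         # [1.0, 0.75, 0.5, 0.25]
shifted = sample_shift / (sample_shift + (1.0 / sigmas - 1.0))  # [1.0, 0.9375, 0.8333, 0.625]
print(shifted * 1000)  # timesteps, assuming num_train_timesteps = 1000: [1000, 937.5, 833.3, 625]

With shift 5.0 the four steps stay concentrated near the high-noise end of the trajectory, which is typical for few-step distilled models.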
configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json (new file)

Lines changed: 13 additions & 0 deletions

{
    "infer_steps": 4,
    "max_custom_size": 4096,
    "prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
    "prompt_template_encode_start_idx": 34,
    "attn_type": "sage_attn2",
    "enable_cfg": false,
    "dit_quantized": true,
    "dit_quantized_ckpt": "/data/nvme1/yongyang/ccc/models/distill_zoe_diff_qwen_image_data_680w_neo_prompt_res2k_3kiter_multi_large_char_200iter_step4_fp8_mix.safetensors",
    "dit_quant_scheme": "fp8-sgl",
    "sample_shift": 5.0,
    "zoe_style_noise": true
}
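The FP8 variant is otherwise identical to the BF16 config; only the checkpoint-related keys differ. A quick hedged check, assuming the two files live at the repo-relative paths the run scripts below pass via --config_json:

import json

with open("configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json") as f:
    base = json.load(f)
with open("configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json") as f:
    fp8 = json.load(f)

# Symmetric difference of the key sets: only the checkpoint keys differ.
print(sorted(base.keys() ^ fp8.keys()))
# ['dit_original_ckpt', 'dit_quant_scheme', 'dit_quantized', 'dit_quantized_ckpt']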

lightx2v/models/schedulers/qwen_image/scheduler.py

Lines changed: 71 additions & 25 deletions
@@ -37,6 +37,11 @@ def calculate_shift(
     return mu
 
 
+def time_shift_linear(mu: float, t: torch.Tensor) -> torch.Tensor:
+    """Linear time shift: mu / (mu + (1/t - 1)), matching zoe-diffusion's implementation."""
+    return mu / (mu + (1.0 / t - 1.0))
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
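A side note on the new helper, not part of the diff: mu / (mu + (1/t - 1)) rearranges to mu*t / (1 + (mu - 1)*t), which, as far as the algebra goes, is the static shift formula that flow-matching schedulers apply when dynamic shifting is off, with shift = mu. The config's "sample_shift": 5.0 therefore plays the role of the familiar shift parameter, applied with a fixed value instead of a resolution-dependent one. A minimal check:

import torch

def time_shift_linear(mu: float, t: torch.Tensor) -> torch.Tensor:
    return mu / (mu + (1.0 / t - 1.0))

t = torch.linspace(0.05, 1.0, 20)
static_shift = 5.0 * t / (1 + (5.0 - 1.0) * t)  # shift-style form with shift = 5.0
assert torch.allclose(time_shift_linear(5.0, t), static_shift)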
@@ -428,7 +433,7 @@ def __init__(self, config):
         with open(os.path.join(config["model_path"], "scheduler", "scheduler_config.json"), "r") as f:
             self.scheduler_config = json.load(f)
         self.dtype = torch.bfloat16
-        self.sample_guide_scale = self.config["sample_guide_scale"]
+        self.sample_guide_scale = self.config.get("sample_guide_scale", None)
         self.zero_cond_t = config.get("zero_cond_t", False)
         if self.config["seq_parallel"]:
             self.seq_p_group = self.config.get("device_mesh").get_group(mesh_dim="seq_p")
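This makes "sample_guide_scale" optional: the new distill configs set "enable_cfg": false and omit the key entirely, which the old indexed lookup could not tolerate.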
@@ -480,43 +485,84 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
 
         return latent_image_ids.to(device=device, dtype=dtype)
 
+    def _prepare_latents_lightx2v(self, shape, height, width, num_channels_latents):
+        """Original LightX2V latent generation: noise in [B, T, C, H, W] then pack."""
+        latents = randn_tensor(shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype)
+        if self.is_layered:
+            latents = self._pack_latents(latents, 1, num_channels_latents, height, width, self.layers + 1)
+        else:
+            latents = self._pack_latents(latents, 1, num_channels_latents, height, width)
+        return latents
+
+    def _prepare_latents_zoe(self, shape, height, width, num_channels_latents):
+        """Zoe-aligned latent generation: noise in packed format [B, C*4, T, H//2, W//2].
+        Ensures the same random sampling order as Zoe for bit-exact alignment.
+        """
+        b, t = shape[0], shape[1]
+        zoe_shape = (b, num_channels_latents * 4, t, height // 2, width // 2)
+        latents = randn_tensor(zoe_shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype)
+        # Convert to LightX2V sequence format: [B, (H//2)*(W//2), C*4]
+        latents = latents.squeeze(2)  # [B, C*4, H//2, W//2]
+        latents = latents.permute(0, 2, 3, 1)  # [B, H//2, W//2, C*4]
+        latents = latents.reshape(b, (height // 2) * (width // 2), num_channels_latents * 4)
+        return latents
+
     def prepare_latents(self, input_info):
         self.input_info = input_info
         shape = input_info.target_shape
+        # shape: [B, T, C, H, W]
         width, height = shape[-1], shape[-2]
-        latents = randn_tensor(shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype)
-        if self.is_layered:
-            latents = self._pack_latents(latents, 1, self.config.get("num_channels_latents", 16), height, width, self.layers + 1)
+        num_channels_latents = self.config.get("num_channels_latents", 16)
+
+        if self.config.get("zoe_style_noise", False) and not self.is_layered:
+            latents = self._prepare_latents_zoe(shape, height, width, num_channels_latents)
         else:
-            latents = self._pack_latents(latents, 1, self.config.get("num_channels_latents", 16), height, width)
+            latents = self._prepare_latents_lightx2v(shape, height, width, num_channels_latents)
+
         latent_image_ids = self._prepare_latent_image_ids(1, height // 2, width // 2, AI_DEVICE, self.dtype)
         self.latents = latents
         self.latent_image_ids = latent_image_ids
         self.noise_pred = None
 
     def set_timesteps(self):
-        sigmas = np.linspace(1.0, 1 / self.config["infer_steps"], self.config["infer_steps"])
-        image_seq_len = self.latents.shape[1]
-        if self.is_layered:
-            base_seqlen = 256 * 256 / 16 / 16
-            image_seq_len = self.latents.shape[1] // 5
-            mu = (image_seq_len / base_seqlen) ** 0.5
+        num_inference_steps = self.config["infer_steps"]
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+
+        sample_shift = self.config.get("sample_shift", None)
+        if sample_shift is not None:
+            # Zoe-style: linear time shift with a fixed mu, resolution-independent.
+            # Formula: t_shifted = mu / (mu + (1/t - 1))
+            sigmas_tensor = torch.from_numpy(sigmas).float().to(AI_DEVICE)
+            sigmas_shifted = time_shift_linear(mu=sample_shift, t=sigmas_tensor)
+            sigmas_shifted = torch.cat([sigmas_shifted, torch.zeros(1, device=AI_DEVICE)])
+            self.scheduler.sigmas = sigmas_shifted.to(dtype=torch.float32, device=AI_DEVICE)
+            self.scheduler.timesteps = sigmas_shifted[:-1] * self.scheduler_config["num_train_timesteps"]
+            self.scheduler.timesteps = self.scheduler.timesteps.to(AI_DEVICE)
+            self.scheduler._step_index = None
+            self.scheduler._begin_index = None
+            timesteps = self.scheduler.timesteps
         else:
-            mu = calculate_shift(
-                image_seq_len,
-                self.scheduler_config.get("base_image_seq_len", 256),
-                self.scheduler_config.get("max_image_seq_len", 4096),
-                self.scheduler_config.get("base_shift", 0.5),
-                self.scheduler_config.get("max_shift", 1.15),
+            # Original: resolution-adaptive exponential shift via diffusers.
+            image_seq_len = self.latents.shape[1]
+            if self.is_layered:
+                base_seqlen = 256 * 256 / 16 / 16
+                image_seq_len = self.latents.shape[1] // 5
+                mu = (image_seq_len / base_seqlen) ** 0.5
+            else:
+                mu = calculate_shift(
+                    image_seq_len,
+                    self.scheduler_config.get("base_image_seq_len", 256),
+                    self.scheduler_config.get("max_image_seq_len", 4096),
+                    self.scheduler_config.get("base_shift", 0.5),
+                    self.scheduler_config.get("max_shift", 1.15),
+                )
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler,
+                num_inference_steps,
+                AI_DEVICE,
+                sigmas=sigmas,
+                mu=mu,
             )
-        num_inference_steps = self.config["infer_steps"]
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            AI_DEVICE,
-            sigmas=sigmas,
-            mu=mu,
-        )
 
         self.timesteps = timesteps
         self.infer_steps = num_inference_steps
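Why a separate Zoe path at all: with a fixed seed, the noise values are drawn sequentially and laid out in the requested shape, so sampling directly in Zoe's packed shape [B, C*4, T, H//2, W//2] reproduces Zoe's exact noise tensor, whereas sampling in [B, T, C, H, W] and packing afterwards scatters the same draws to different positions. A standalone sketch of the shape conversion (hypothetical sizes; torch.randn standing in for diffusers' randn_tensor):

import torch

b, t, c = 1, 1, 16                     # C*4 = 64 packed channels
h, w = 192, 344                        # hypothetical latent height/width

zoe_shape = (b, c * 4, t, h // 2, w // 2)
latents = torch.randn(zoe_shape)       # same draw order as zoe-diffusion

latents = latents.squeeze(2)           # [B, C*4, H//2, W//2]
latents = latents.permute(0, 2, 3, 1)  # [B, H//2, W//2, C*4]
latents = latents.reshape(b, (h // 2) * (w // 2), c * 4)

print(latents.shape)                   # torch.Size([1, 16512, 64])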
New run script (BF16 distill config)

Lines changed: 21 additions & 0 deletions

#!/bin/bash

# set paths first
lightx2v_path=/data/nvme1/yongyang/ccc/LightX2V
model_path=/data/nvme1/models/Qwen/Qwen-Image-2512

export CUDA_VISIBLE_DEVICES=0

# set environment variables
source ${lightx2v_path}/scripts/base/base.sh

python -m lightx2v.infer \
    --model_cls qwen_image \
    --task t2i \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json \
    --prompt '2K超高清画质,16:9宽屏比例,电影级渲染。一个精致的咖啡店门口场景,温馨的街道氛围。门口摆放着一个复古风格的木质黑板,黑板上用粉笔字体写着"日日新咖啡,2美元一杯",笔触温馨可爱。旁边有一个闪烁的霓虹灯招牌,红色霓虹灯管拼出"商汤科技"字样,现代科技感。旁边立着一幅精美的海报,海报上是一位优雅的中国美女模特,海报下方用时尚字体写着"SenseNova newbee"。整体氛围是东西方文化交融的现代咖啡馆,暖色调灯光,傍晚时分,细节精致,高质量渲染' \
    --negative_prompt " " \
    --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i_2512_distill_zoe.png \
    --seed 42 \
    --target_shape 1536 2752
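For reference, the packed sequence length the scheduler would see at this resolution, assuming an 8x VAE downsample ahead of the 2x2 packing shown in the scheduler diff (both factors are assumptions here, not stated by the commit):

height, width = 1536, 2752              # --target_shape, roughly the 16:9 frame the prompt asks for
lat_h, lat_w = height // 8, width // 8  # assumed 8x VAE downsample -> 192 x 344
seq_len = (lat_h // 2) * (lat_w // 2)   # 2x2 packing -> 96 * 172
print(seq_len)                          # 16512 tokens

With sample_shift set, this length no longer affects the timestep schedule; it only matters for the resolution-adaptive fallback branch.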
New run script (FP8 distill config)

Lines changed: 21 additions & 0 deletions

#!/bin/bash

# set paths first
lightx2v_path=/data/nvme1/yongyang/ccc/LightX2V
model_path=/data/nvme1/models/Qwen/Qwen-Image-2512

export CUDA_VISIBLE_DEVICES=0

# set environment variables
source ${lightx2v_path}/scripts/base/base.sh

python -m lightx2v.infer \
    --model_cls qwen_image \
    --task t2i \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json \
    --prompt '2K超高清画质,16:9宽屏比例,电影级渲染。一个精致的咖啡店门口场景,温馨的街道氛围。门口摆放着一个复古风格的木质黑板,黑板上用粉笔字体写着"日日新咖啡,2美元一杯",笔触温馨可爱。旁边有一个闪烁的霓虹灯招牌,红色霓虹灯管拼出"商汤科技"字样,现代科技感。旁边立着一幅精美的海报,海报上是一位优雅的中国美女模特,海报下方用时尚字体写着"SenseNova newbee"。整体氛围是东西方文化交融的现代咖啡馆,暖色调灯光,傍晚时分,细节精致,高质量渲染' \
    --negative_prompt " " \
    --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i_2512_distill_zoe_fp8.png \
    --seed 42 \
    --target_shape 1536 2752
