Merge branch 'comfyanonymous:master' into master

kinqsradio · web-flow · commit 5dc405cdd2eb · 2024-08-29T13:22:32.000+10:00
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <div align="center">
 
 # ComfyUI
-**The most powerful and modular stable diffusion GUI and backend.**
+**The most powerful and modular diffusion model GUI and backend.**
 
 
 [![Website][website-shield]][website-url]
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
@@ -34,7 +34,7 @@
 import comfy.ldm.cascade.controlnet
 import comfy.cldm.mmdit
 import comfy.ldm.hydit.controlnet
-import comfy.ldm.flux.controlnet_xlabs
+import comfy.ldm.flux.controlnet
 
 
 def broadcast_image_to(tensor, target_batch_size, batched_number):
@@ -433,12 +433,25 @@ def load_controlnet_hunyuandit(controlnet_data):
 
 def load_controlnet_flux_xlabs(sd):
     model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd)
-    control_model = comfy.ldm.flux.controlnet_xlabs.ControlNetFlux(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
     control_model = controlnet_load_state_dict(control_model, sd)
     extra_conds = ['y', 'guidance']
     control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
     return control
 
+def load_controlnet_flux_instantx(sd):
+    """Build a ``ControlNet`` around a latent-input ``ControlNetFlux`` model.
+
+    ``sd`` is converted with ``convert_diffusers_mmdit`` and the model
+    configuration is derived from the converted keys. Every entry of ``sd``
+    is then copied into the converted dict (replacing any key present in
+    both) before the weights are loaded into the model.
+    """
+    merged_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
+    # The config is detected from the converted keys only, before merging.
+    (model_config, operations, load_device, unet_dtype,
+     manual_cast_dtype, offload_device) = controlnet_config(merged_sd)
+    merged_sd.update(sd)
+
+    net = comfy.ldm.flux.controlnet.ControlNetFlux(
+        latent_input=True,
+        operations=operations,
+        device=offload_device,
+        dtype=unet_dtype,
+        **model_config.unet_config,
+    )
+    net = controlnet_load_state_dict(net, merged_sd)
+
+    return ControlNet(
+        net,
+        compression_ratio=1,
+        latent_format=comfy.latent_formats.Flux(),
+        load_device=load_device,
+        manual_cast_dtype=manual_cast_dtype,
+        extra_conds=['y', 'guidance'],
+    )
 
 def load_controlnet(ckpt_path, model=None):
     controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
@@ -504,8 +517,10 @@ def load_controlnet(ckpt_path, model=None):
     elif "controlnet_blocks.0.weight" in controlnet_data: #SD3 diffusers format
         if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
             return load_controlnet_flux_xlabs(controlnet_data)
-        else:
+        elif "pos_embed_input.proj.weight" in controlnet_data:
             return load_controlnet_mmdit(controlnet_data)
+        elif "controlnet_x_embedder.weight" in controlnet_data:
+            return load_controlnet_flux_instantx(controlnet_data)
 
     pth_key = 'control_model.zero_convs.0.0.weight'
     pth = False
diff --git a/comfy/ldm/common_dit.py b/comfy/ldm/common_dit.py
@@ -1,8 +1,21 @@
 import torch
+import comfy.ops
 
 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
     """Pad the last two dimensions of ``img`` up to multiples of ``patch_size``.
     ``patch_size[0]`` applies to ``img.shape[-2]`` and ``patch_size[1]`` to
     ``img.shape[-1]``; padding is added only at the end of each dimension.
     Returns the result of ``torch.nn.functional.pad`` with ``padding_mode``.
     """
     # Only "circular" is swapped for "reflect" while tracing/scripting. The
     # parentheses matter: without them `and` binds tighter than `or`, so any
     # other padding_mode was also overwritten whenever is_scripting() was true.
     if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
         padding_mode = "reflect"
     # Amount needed to reach the next multiple; 0 when already aligned.
     pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
     pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
     return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)
+
+# torch.nn.functional.rms_norm may be missing from the installed PyTorch;
+# in that case rms_norm() below falls back to a manual implementation.
+rms_norm_torch = getattr(torch.nn.functional, "rms_norm", None)
+
+def rms_norm(x, weight=None, eps=1e-6):
+    """Apply RMS normalization over the last dimension of ``x``.
+
+    Args:
+        x: Input tensor.
+        weight: Optional scale. When given it is cast to ``x``'s dtype and
+            device with ``comfy.ops.cast_to`` and applied to the normalized
+            values. When ``None`` the input is only normalized.
+        eps: Value added to the mean of squares before the reciprocal
+            square root.
+
+    Returns:
+        The normalized (and, if ``weight`` is given, scaled) tensor.
+    """
+    # Callers such as a norm layer without a learnable scale pass None here,
+    # so the weight must not be dereferenced unconditionally.
+    scale = None
+    if weight is not None:
+        scale = comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
+
+    if rms_norm_torch is not None:
+        normalized_shape = (x.shape[-1],) if weight is None else weight.shape
+        return rms_norm_torch(x, normalized_shape, weight=scale, eps=eps)
+
+    normed = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+    return normed if scale is None else normed * scale
diff --git a/comfy/ldm/flux/controlnet.py b/comfy/ldm/flux/controlnet.py
@@ -1,6 +1,7 @@
 #Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
 
 import torch
+import math
 from torch import Tensor, nn
 from einops import rearrange, repeat
 
@@ -13,34 +14,38 @@
 
 
 class ControlNetFlux(Flux):
-    def __init__(self, image_model=None, dtype=None, device=None, operations=None, **kwargs):
+    def __init__(self, latent_input=False, image_model=None, dtype=None, device=None, operations=None, **kwargs):
         super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)
 
+        self.main_model_double = 19
+        self.main_model_single = 38
         # add ControlNet blocks
         self.controlnet_blocks = nn.ModuleList([])
         for _ in range(self.params.depth):
             controlnet_block = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
             # controlnet_block = zero_module(controlnet_block)
             self.controlnet_blocks.append(controlnet_block)
-        self.pos_embed_input = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
         self.gradient_checkpointing = False
-        self.input_hint_block = nn.Sequential(
-            operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
-        )
+        self.latent_input = latent_input
+        self.pos_embed_input = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
+        if not self.latent_input:
+            self.input_hint_block = nn.Sequential(
+                operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
+            )
 
     def forward_orig(
         self,
@@ -58,8 +63,10 @@ def forward_orig(
 
         # running on sequences img
         img = self.img_in(img)
-        controlnet_cond = self.input_hint_block(controlnet_cond)
-        controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
+        if not self.latent_input:
+            controlnet_cond = self.input_hint_block(controlnet_cond)
+            controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
+
         controlnet_cond = self.pos_embed_input(controlnet_cond)
         img = img + controlnet_cond
         vec = self.time_in(timestep_embedding(timesteps, 256))
@@ -82,13 +89,25 @@ def forward_orig(
             block_res_sample = controlnet_block(block_res_sample)
             controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,)
 
-        return {"input": (controlnet_block_res_samples * 10)[:19]}
+
+        repeat = math.ceil(self.main_model_double / len(controlnet_block_res_samples))
+        if self.latent_input:
+            out_input = ()
+            for x in controlnet_block_res_samples:
+                    out_input += (x,) * repeat
+        else:
+            out_input = (controlnet_block_res_samples * repeat)
+        return {"input": out_input[:self.main_model_double]}
 
     def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
-        hint = hint * 2.0 - 1.0
+        patch_size = 2
+        if self.latent_input:
+            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
+            hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+        else:
+            hint = hint * 2.0 - 1.0
 
         bs, c, h, w = x.shape
-        patch_size = 2
         x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
 
         img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
@@ -6,6 +6,7 @@
 
 from .math import attention, rope
 import comfy.ops
+import comfy.ldm.common_dit
 
 
 class EmbedND(nn.Module):
@@ -63,8 +64,7 @@ def __init__(self, dim: int, dtype=None, device=None, operations=None):
         self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
 
     def forward(self, x: Tensor):
-        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-        return (x * rrms) * comfy.ops.cast_to(self.scale, dtype=x.dtype, device=x.device)
+        return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
 
 
 class QKNorm(torch.nn.Module):
diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py
@@ -355,29 +355,9 @@ def __init__(
         else:
             self.register_parameter("weight", None)
 
-    def _norm(self, x):
-        """
-        Apply the RMSNorm normalization to the input tensor.
-        Args:
-            x (torch.Tensor): The input tensor.
-        Returns:
-            torch.Tensor: The normalized tensor.
-        """
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
     def forward(self, x):
-        """
-        Forward pass through the RMSNorm layer.
-        Args:
-            x (torch.Tensor): The input tensor.
-        Returns:
-            torch.Tensor: The output tensor after applying RMSNorm.
-        """
-        x = self._norm(x)
-        if self.learnable_scale:
-            return x * self.weight.to(device=x.device, dtype=x.dtype)
-        else:
-            return x
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
+
 
 
 class SwiGLUFeedForward(nn.Module):
diff --git a/comfy/utils.py b/comfy/utils.py
@@ -528,6 +528,8 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
         ("guidance_in.out_layer.weight", "time_text_embed.guidance_embedder.linear_2.weight"),
         ("final_layer.adaLN_modulation.1.bias", "norm_out.linear.bias", swap_scale_shift),
         ("final_layer.adaLN_modulation.1.weight", "norm_out.linear.weight", swap_scale_shift),
+        ("pos_embed_input.bias", "controlnet_x_embedder.bias"),
+        ("pos_embed_input.weight", "controlnet_x_embedder.weight"),
     }
 
     for k in MAP_BASIC:
diff --git a/script_examples/websockets_api_example.py b/script_examples/websockets_api_example.py
@@ -41,15 +41,14 @@ def get_images(ws, prompt):
             continue #previews are binary data
 
     history = get_history(prompt_id)[prompt_id]
-    for o in history['outputs']:
-        for node_id in history['outputs']:
-            node_output = history['outputs'][node_id]
-            if 'images' in node_output:
-                images_output = []
-                for image in node_output['images']:
-                    image_data = get_image(image['filename'], image['subfolder'], image['type'])
-                    images_output.append(image_data)
-            output_images[node_id] = images_output
+    for node_id in history['outputs']:
+        node_output = history['outputs'][node_id]
+        images_output = []
+        if 'images' in node_output:
+            for image in node_output['images']:
+                image_data = get_image(image['filename'], image['subfolder'], image['type'])
+                images_output.append(image_data)
+        output_images[node_id] = images_output
 
     return output_images
 
diff --git a/tests/inference/test_execution.py b/tests/inference/test_execution.py
@@ -95,17 +95,16 @@ def run(self, graph):
                     pass # Probably want to store this off for testing
 
         history = self.get_history(prompt_id)[prompt_id]
-        for o in history['outputs']:
-            for node_id in history['outputs']:
-                node_output = history['outputs'][node_id]
-                result.outputs[node_id] = node_output
-                if 'images' in node_output:
-                    images_output = []
-                    for image in node_output['images']:
-                        image_data = self.get_image(image['filename'], image['subfolder'], image['type'])
-                        image_obj = Image.open(BytesIO(image_data))
-                        images_output.append(image_obj)
-                    node_output['image_objects'] = images_output
+        for node_id in history['outputs']:
+            node_output = history['outputs'][node_id]
+            result.outputs[node_id] = node_output
+            images_output = []
+            if 'images' in node_output:
+                for image in node_output['images']:
+                    image_data = self.get_image(image['filename'], image['subfolder'], image['type'])
+                    image_obj = Image.open(BytesIO(image_data))
+                    images_output.append(image_obj)
+                node_output['image_objects'] = images_output
 
         return result
 
diff --git a/tests/inference/test_inference.py b/tests/inference/test_inference.py
@@ -109,15 +109,14 @@ def get_images(self, graph, save=True):
                 continue #previews are binary data
 
         history = self.get_history(prompt_id)[prompt_id]
-        for o in history['outputs']:
-            for node_id in history['outputs']:
-                node_output = history['outputs'][node_id]
-                if 'images' in node_output:
-                    images_output = []
-                    for image in node_output['images']:
-                        image_data = self.get_image(image['filename'], image['subfolder'], image['type'])
-                        images_output.append(image_data)
-                output_images[node_id] = images_output
+        for node_id in history['outputs']:
+            node_output = history['outputs'][node_id]
+            images_output = []
+            if 'images' in node_output:
+                for image in node_output['images']:
+                    image_data = self.get_image(image['filename'], image['subfolder'], image['type'])
+                    images_output.append(image_data)
+            output_images[node_id] = images_output
 
         return output_images
 

Original file line number	Diff line number	Diff line change
`@@ -528,6 +528,8 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):`
`528`	`528`	`("guidance_in.out_layer.weight", "time_text_embed.guidance_embedder.linear_2.weight"),`
`529`	`529`	`("final_layer.adaLN_modulation.1.bias", "norm_out.linear.bias", swap_scale_shift),`
`530`	`530`	`("final_layer.adaLN_modulation.1.weight", "norm_out.linear.weight", swap_scale_shift),`
	`531`	`+ ("pos_embed_input.bias", "controlnet_x_embedder.bias"),`
	`532`	`+ ("pos_embed_input.weight", "controlnet_x_embedder.weight"),`
`531`	`533`	`}`
`532`	`534`
`533`	`535`	`for k in MAP_BASIC:`