InterSyncAnalytics
diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 2 additions & 0 deletions
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
Lines changed: 10 additions & 11 deletions
diff --git a/comfy/float.py b/comfy/float.py
Lines changed: 2 additions & 0 deletions
diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py
Lines changed: 8 additions & 10 deletions
diff --git a/comfy/lora.py b/comfy/lora.py
Lines changed: 1 addition & 1 deletion
diff --git a/comfy/model_base.py b/comfy/model_base.py
Lines changed: 6 additions & 1 deletion
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
Lines changed: 9 additions & 3 deletions
diff --git a/comfy/model_management.py b/comfy/model_management.py
Lines changed: 16 additions & 19 deletions
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
Lines changed: 42 additions & 10 deletions
@@ -12,7 +12,7 @@ on:
         description: 'extra dependencies'
         required: false
         type: string
-        default: "\"numpy<2\""
+        default: ""
       cu:
         description: 'cuda version'
         required: true
 
@@ -127,6 +127,8 @@ To run it on services like paperspace, kaggle or colab you can use my [Jupyter N
 
 ## Manual Install (Windows, Linux)
 
+Note that some dependencies do not yet support python 3.13 so using 3.12 is recommended.
+
 Git clone this repo.
 
 Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
 
@@ -60,7 +60,7 @@ class StrengthType(Enum):
     LINEAR_UP = 2
 
 class ControlBase:
-    def __init__(self, device=None):
+    def __init__(self):
         self.cond_hint_original = None
         self.cond_hint = None
         self.strength = 1.0
@@ -72,10 +72,6 @@ def __init__(self, device=None):
         self.compression_ratio = 8
         self.upscale_algorithm = 'nearest-exact'
         self.extra_args = {}
-
-        if device is None:
-            device = comfy.model_management.get_torch_device()
-        self.device = device
         self.previous_controlnet = None
         self.extra_conds = []
         self.strength_type = StrengthType.CONSTANT
@@ -185,8 +181,8 @@ def set_extra_arg(self, argument, value=None):
 
 
 class ControlNet(ControlBase):
-    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False):
-        super().__init__(device)
+    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False):
+        super().__init__()
         self.control_model = control_model
         self.load_device = load_device
         if control_model is not None:
@@ -242,7 +238,7 @@ def get_control(self, x_noisy, t, cond, batched_number):
                     to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
                 self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)
 
-            self.cond_hint = self.cond_hint.to(device=self.device, dtype=dtype)
+            self.cond_hint = self.cond_hint.to(device=x_noisy.device, dtype=dtype)
         if x_noisy.shape[0] != self.cond_hint.shape[0]:
             self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
 
@@ -341,8 +337,8 @@ def forward(self, input):
 
 
 class ControlLora(ControlNet):
-    def __init__(self, control_weights, global_average_pooling=False, device=None, model_options={}): #TODO? model_options
-        ControlBase.__init__(self, device)
+    def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options
+        ControlBase.__init__(self)
         self.control_weights = control_weights
         self.global_average_pooling = global_average_pooling
         self.extra_conds += ["y"]
@@ -662,12 +658,15 @@ def load_controlnet(ckpt_path, model=None, model_options={}):
 
 class T2IAdapter(ControlBase):
     def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
-        super().__init__(device)
+        super().__init__()
         self.t2i_model = t2i_model
         self.channels_in = channels_in
         self.control_input = None
         self.compression_ratio = compression_ratio
         self.upscale_algorithm = upscale_algorithm
+        if device is None:
+            device = comfy.model_management.get_torch_device()
+        self.device = device
 
     def scale_image_to(self, width, height):
         unshuffle_amount = self.t2i_model.unshuffle_amount
 
@@ -41,6 +41,8 @@ def manual_stochastic_round_to_float8(x, dtype, generator=None):
         (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
     )
 
+    inf = torch.finfo(dtype)
+    torch.clamp(sign, min=inf.min, max=inf.max, out=sign)
     return sign
 
 
 
@@ -5,7 +5,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from .. import attention
+from ..attention import optimized_attention
 from einops import rearrange, repeat
 from .util import timestep_embedding
 import comfy.ops
@@ -266,8 +266,6 @@ def split_qkv(qkv, head_dim):
     qkv = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, -1, head_dim).movedim(2, 0)
     return qkv[0], qkv[1], qkv[2]
 
-def optimized_attention(qkv, num_heads):
-    return attention.optimized_attention(qkv[0], qkv[1], qkv[2], num_heads)
 
 class SelfAttention(nn.Module):
     ATTENTION_MODES = ("xformers", "torch", "torch-hb", "math", "debug")
@@ -326,9 +324,9 @@ def post_attention(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        qkv = self.pre_attention(x)
+        q, k, v = self.pre_attention(x)
         x = optimized_attention(
-            qkv, num_heads=self.num_heads
+            q, k, v, heads=self.num_heads
         )
         x = self.post_attention(x)
         return x
@@ -531,8 +529,8 @@ def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
         assert not self.pre_only
         qkv, intermediates = self.pre_attention(x, c)
         attn = optimized_attention(
-            qkv,
-            num_heads=self.attn.num_heads,
+            qkv[0], qkv[1], qkv[2],
+            heads=self.attn.num_heads,
         )
         return self.post_attention(attn, *intermediates)
 
@@ -557,8 +555,8 @@ def _block_mixing(context, x, context_block, x_block, c):
     qkv = tuple(o)
 
     attn = optimized_attention(
-        qkv,
-        num_heads=x_block.attn.num_heads,
+        qkv[0], qkv[1], qkv[2],
+        heads=x_block.attn.num_heads,
     )
     context_attn, x_attn = (
         attn[:, : context_qkv[0].shape[1]],
@@ -642,7 +640,7 @@ def __init__(self, dim, heads=8, dim_head=64, dtype=None, device=None, operation
     def forward(self, x):
         qkv = self.qkv(x)
         q, k, v = split_qkv(qkv, self.dim_head)
-        x = optimized_attention((q.reshape(q.shape[0], q.shape[1], -1), k, v), self.heads)
+        x = optimized_attention(q.reshape(q.shape[0], q.shape[1], -1), k, v, heads=self.heads)
         return self.proj(x)
 
 class ContextProcessorBlock(nn.Module):
 
@@ -415,7 +415,7 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32):
             weight *= strength_model
 
         if isinstance(v, list):
-            v = (calculate_weight(v[1:], comfy.model_management.cast_to_device(v[0], weight.device, intermediate_dtype, copy=True), key, intermediate_dtype=intermediate_dtype), )
+            v = (calculate_weight(v[1:], v[0][1](comfy.model_management.cast_to_device(v[0][0], weight.device, intermediate_dtype, copy=True), inplace=True), key, intermediate_dtype=intermediate_dtype), )
 
         if len(v) == 1:
             patch_type = "diff"
 
@@ -96,7 +96,8 @@ def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_mod
 
         if not unet_config.get("disable_unet_model_creation", False):
             if model_config.custom_operations is None:
-                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=model_config.optimizations.get("fp8", False))
+                fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
+                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
             else:
                 operations = model_config.custom_operations
             self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
@@ -244,6 +245,10 @@ def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_
             extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))
 
         unet_state_dict = self.diffusion_model.state_dict()
+
+        if self.model_config.scaled_fp8 is not None:
+            unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8)
+
         unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)
 
         if self.model_type == ModelType.V_PREDICTION:
 
@@ -286,9 +286,15 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
         return None
     model_config = model_config_from_unet_config(unet_config, state_dict)
     if model_config is None and use_base_if_no_match:
-        return comfy.supported_models_base.BASE(unet_config)
-    else:
-        return model_config
+        model_config = comfy.supported_models_base.BASE(unet_config)
+
+    scaled_fp8_weight = state_dict.get("{}scaled_fp8".format(unet_key_prefix), None)
+    if scaled_fp8_weight is not None:
+        model_config.scaled_fp8 = scaled_fp8_weight.dtype
+        if model_config.scaled_fp8 == torch.float32:
+            model_config.scaled_fp8 = torch.float8_e4m3fn
+
+    return model_config
 
 def unet_prefix_from_state_dict(state_dict):
     candidates = ["model.diffusion_model.", #ldm/sgm models
 
@@ -647,6 +647,9 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
         pass
 
     if fp8_dtype is not None:
+        if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive
+            return fp8_dtype
+
         free_model_memory = maximum_vram_for_weights(device)
         if model_params * 2 > free_model_memory:
             return fp8_dtype
@@ -840,27 +843,21 @@ def force_channels_last():
     #TODO
     return False
 
-def cast_to_device(tensor, device, dtype, copy=False):
-    device_supports_cast = False
-    if tensor.dtype == torch.float32 or tensor.dtype == torch.float16:
-        device_supports_cast = True
-    elif tensor.dtype == torch.bfloat16:
-        if hasattr(device, 'type') and device.type.startswith("cuda"):
-            device_supports_cast = True
-        elif is_intel_xpu():
-            device_supports_cast = True
+def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
+    if device is None or weight.device == device:
+        if not copy:
+            if dtype is None or weight.dtype == dtype:
+                return weight
+        return weight.to(dtype=dtype, copy=copy)
 
-    non_blocking = device_should_use_non_blocking(device)
+    r = torch.empty_like(weight, dtype=dtype, device=device)
+    r.copy_(weight, non_blocking=non_blocking)
+    return r
+
+def cast_to_device(tensor, device, dtype, copy=False):
+    non_blocking = device_supports_non_blocking(device)
+    return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy)
 
-    if device_supports_cast:
-        if copy:
-            if tensor.device == device:
-                return tensor.to(dtype, copy=copy, non_blocking=non_blocking)
-            return tensor.to(device, copy=copy, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
-        else:
-            return tensor.to(device, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
-    else:
-        return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking)
 
 def xformers_enabled():
     global directml_enabled
 
@@ -94,6 +94,31 @@ def __call__(self, weight):
             return comfy.float.stochastic_rounding(comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype), weight.dtype, seed=string_to_seed(self.key))
 
         return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype)
+
+def get_key_weight(model, key):
+    set_func = None
+    convert_func = None
+    op_keys = key.rsplit('.', 1)
+    if len(op_keys) < 2:
+        weight = comfy.utils.get_attr(model, key)
+    else:
+        op = comfy.utils.get_attr(model, op_keys[0])
+        try:
+            set_func = getattr(op, "set_{}".format(op_keys[1]))
+        except AttributeError:
+            pass
+
+        try:
+            convert_func = getattr(op, "convert_{}".format(op_keys[1]))
+        except AttributeError:
+            pass
+
+        weight = getattr(op, op_keys[1])
+        if convert_func is not None:
+            weight = comfy.utils.get_attr(model, key)
+
+    return weight, set_func, convert_func
+
 class ModelPatcher:
     def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
         self.size = size
@@ -294,14 +319,16 @@ def get_key_patches(self, filter_prefix=None):
                 if not k.startswith(filter_prefix):
                     continue
             bk = self.backup.get(k, None)
+            weight, set_func, convert_func = get_key_weight(self.model, k)
             if bk is not None:
                 weight = bk.weight
-            else:
-                weight = model_sd[k]
+            if convert_func is None:
+                convert_func = lambda a, **kwargs: a
+
             if k in self.patches:
-                p[k] = [weight] + self.patches[k]
+                p[k] = [(weight, convert_func)] + self.patches[k]
             else:
-                p[k] = (weight,)
+                p[k] = [(weight, convert_func)]
         return p
 
     def model_state_dict(self, filter_prefix=None):
@@ -317,8 +344,7 @@ def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
         if key not in self.patches:
             return
 
-        weight = comfy.utils.get_attr(self.model, key)
-
+        weight, set_func, convert_func = get_key_weight(self.model, key)
         inplace_update = self.weight_inplace_update or inplace_update
 
         if key not in self.backup:
@@ -328,12 +354,18 @@ def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
             temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
         else:
             temp_weight = weight.to(torch.float32, copy=True)
+        if convert_func is not None:
+            temp_weight = convert_func(temp_weight, inplace=True)
+
         out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
-        out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
-        if inplace_update:
-            comfy.utils.copy_to_param(self.model, key, out_weight)
+        if set_func is None:
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            if inplace_update:
+                comfy.utils.copy_to_param(self.model, key, out_weight)
+            else:
+                comfy.utils.set_attr_param(self.model, key, out_weight)
         else:
-            comfy.utils.set_attr_param(self.model, key, out_weight)
+            set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))
 
     def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
         mem_counter = 0
@@ -41,6 +41,8 @@ def manual_stochastic_round_to_float8(x, dtype, generator=None):
         (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
     )
 
+    inf = torch.finfo(dtype)
+    torch.clamp(sign, min=inf.min, max=inf.max, out=sign)
     return sign
 
 