diff --git a/docs/Model Support.md b/docs/Model Support.md index ec5e9de1f..10160dc7e 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -20,6 +20,7 @@ [Anima](#anima) | DiT | 2026 | Circlestone Labs | 2B | WTF | Modern, very small, decent for anime | [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast | [HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality | +[Lens](#lens) | MMDiT | 2026 | Microsoft | 3.8B | Minimal | Modern, lightweight | Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md): @@ -618,6 +619,23 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended - **Dev Lora:** - A dev lora can be downloaded here [Kijai/hidream-O1-image_comfy](). It allows use of the base model with the distilled behavior from the Dev model. 8 steps will generate a coherent image of lower quality, 16 steps seems closer to original quality. Use CFG Scale 1. +# Lens + +- Microsoft's [Lens]() is supported in SwarmUI! +- It is a 3.8B model, with a base model and an official turbo distill designed to run fast. + - The raw base model (FP8) can be downloaded here: [Comfy-Org/Lens]() + - The Turbo model (FP8) can be downloaded here: [Comfy-Org/Lens - Turbo]() + - Or the larger BF16 versions: [Comfy-Org/Lens - base bf16]() and [Comfy-Org/Lens - turbo bf16]() + - Save in `diffusion_models` +- Uses the Flux.2 VAE, will be downloaded and handled automatically +- Uses the GPT-OSS 20B text encoder, will be downloaded and handled automatically +- **Parameters:** + - **Sampler:** Default is fine. + - **Scheduler:** Default is fine. + - **CFG Scale:** For Turbo, use `1`. For Base, use normal CFG ranges (around `5`). + - **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal. + - **Resolution:** Side length `1440` is the official default, but `1024` is a reasonable option. 
It retains coherence down to about 512 and up to about 2048. + # Video Models - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md). diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs index 02176700b..abf94e193 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs @@ -1080,6 +1080,7 @@ void copyParam(T2IRegisteredParam param) copyParam(T2IParamTypes.QwenModel); copyParam(T2IParamTypes.MistralModel); copyParam(T2IParamTypes.GemmaModel); + copyParam(T2IParamTypes.GptOssModel); } WorkflowGenerator wg = new() { UserInput = input, ModelFolderFormat = ModelFolderFormat, Features = [.. SupportedFeatures] }; JObject workflow = wg.Generate(); diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs index cadb9a665..b4472813b 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs @@ -959,7 +959,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent } } // TODO: Registry of model default preferences instead of this - else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1()) + else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens()) { defscheduler ??= "simple"; } diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs index 48d60e7fa..32e7c4bb8 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs +++ 
b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs @@ -91,6 +91,9 @@ public bool IsKontext() /// Returns true if the current model is HiDream-O1 Image. public bool IsHiDreamO1() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamO1); + /// Returns true if the current model is Lens. + public bool IsLens() => IsModelCompatClass(T2IModelClassSorter.CompatLens); + /// Returns true if the current model supports Flux Guidance. public bool HasFluxGuidance() { @@ -269,7 +272,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n ["width"] = width }, id)); } - else if (IsAnyFlux2() || IsErnie()) + else if (IsAnyFlux2() || IsErnie() || IsLens()) { return resultImage(CreateNode("EmptyFlux2LatentImage", new JObject() { @@ -598,6 +601,11 @@ public string GetMinistral3_3bModel() return RequireClipModel("ministral-3-3b.safetensors", "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors", "49a750a128863854eac7d85e1a277a7b44bf6ec3646405b84686dfeeca3708ca", T2IParamTypes.MistralModel); } + public string GetGptOss_20bModel() + { + return RequireClipModel("gpt_oss_20b_nvfp4.safetensors", "https://huggingface.co/Comfy-Org/Lens/resolve/main/text_encoders/gpt_oss_20b_nvfp4.safetensors", "103d7759c720627e5ffdcb0d885595695085dad4201fa6a522a84d4b86335ca0", T2IParamTypes.GptOssModel); + } + public string GetClipLModel() { if (g.UserInput.TryGet(T2IParamTypes.ClipLModel, out T2IModel model)) @@ -899,7 +907,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC) { dtype = "default"; } - else if (IsZImage() || IsZetaChroma() || IsAnima()) // Model is small and dense, so trust user preferred download format + else if (IsZImage() || IsZetaChroma() || IsAnima() || IsLens()) // Model is small and dense, so trust user preferred download format { dtype = "default"; } @@ -1057,6 +1065,29 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC) 
helpers.LoadClip("flux2", helpers.GetMinistral3_3bModel()); helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae"); } + else if (IsLens()) + { + helpers.LoadClip("lens", helpers.GetGptOss_20bModel()); + helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae"); + // TODO: SamplingFlux is a dirty node, is this really needed? Or can we do a generic shift? + string lensSamplingNode = CreateNode("ModelSamplingFlux", new JObject() + { + ["model"] = LoadingModel, + ["width"] = UserInput.GetImageWidth(), + ["height"] = UserInput.GetImageHeight(), + ["max_shift"] = UserInput.Get(T2IParamTypes.SigmaShift, 1.15, sectionId: sectionId), + ["base_shift"] = 0.5 + }); + LoadingModel = [lensSamplingNode, 0]; + // TODO: Should this CFGNorm be configurable? + string lensCfgNormNode = CreateNode("CFGNorm", new JObject() + { + ["model"] = LoadingModel, + ["strength"] = 1.0, + ["pre_cfg"] = true + }); + LoadingModel = [lensCfgNormNode, 0]; + } else if (IsFlux() && (LoadingClip is null || LoadingVAE is null || UserInput.Get(T2IParamTypes.T5XXLModel) is not null || UserInput.Get(T2IParamTypes.ClipLModel) is not null)) { helpers.LoadClip2("flux", helpers.GetT5XXLModel(), helpers.GetClipLModel()); diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index 2df28ecad..520737e25 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -71,6 +71,7 @@ public static T2IModelCompatClass CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }), CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }), CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }), + CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }), // Audio 
models CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }), // Obscure old random ones @@ -158,6 +159,7 @@ bool isFluxLora(JObject h) bool isFlux2KleinLora(JObject h) => hasLoraKey(h, "double_blocks.4.img_attn.proj") && hasLoraKey(h, "double_blocks.4.txt_mlp.2") && hasLoraKey(h, "single_blocks.18.linear1") && hasLoraKey(h, "single_blocks.19.linear2"); bool isFlux2Klein9BLora(JObject h) => hasLoraKey(h, "single_blocks.23.linear1"); bool isFlux2DevLora(JObject h) => hasLoraKey(h, "single_blocks.47.linear2"); + bool isLens(JObject h) => h.ContainsKey("transformer_blocks.0.attn.norm_added_q.weight") && h.ContainsKey("transformer_blocks.0.img_mlp.w1.weight"); bool isSD35Lora(JObject h) => h.ContainsKey("transformer.transformer_blocks.0.attn.to_k.lora_A.weight") && h.ContainsKey("transformer.transformer_blocks.37.attn.to_out.0.lora_B.weight"); bool isMochi(JObject h) => hasKey(h, "blocks.0.attn.k_norm_x.weight"); bool isMochiVae(JObject h) => h.ContainsKey("encoder.layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("blocks.2.blocks.3.stack.5.weight") || h.ContainsKey("decoder.blocks.2.blocks.3.stack.5.weight"); @@ -479,6 +481,10 @@ JToken GetEmbeddingKey(JObject h) { return isFlux2KleinLora(h) && isFlux2Klein9BLora(h) && !isFlux2DevLora(h); }}); + Register(new() { ID = "lens", CompatClass = CompatLens, Name = "Lens", StandardWidth = 1440, StandardHeight = 1440, IsThisModelOfClass = (m, h) => + { + return isLens(h); + }}); // ====================== Wan Video ====================== Register(new() { ID = "wan-2_1-text2video/vae", CompatClass = CompatWan21, Name = "Wan 2.1 VAE", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) => { return false; }}); Register(new() { ID = "wan-2_1-text2video-1_3b", CompatClass = CompatWan21_1_3b, Name = "Wan 2.1 Text2Video 1.3B", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass 
= (m, h) => diff --git a/src/Text2Image/T2IParamTypes.cs b/src/Text2Image/T2IParamTypes.cs index d8890c4dc..45498cbb2 100644 --- a/src/Text2Image/T2IParamTypes.cs +++ b/src/Text2Image/T2IParamTypes.cs @@ -330,7 +330,7 @@ public static string ApplyStringEdit(string prior, string update) FreeUBlock1, FreeUBlock2, FreeUSkip1, FreeUSkip2, GlobalRegionFactor, EndStepsEarly, SamplerSigmaMin, SamplerSigmaMax, SamplerRho, VideoAugmentationLevel, VideoCFG, VideoMinCFG, Video2VideoCreativity, VideoSwapPercent, VideoExtendSwapPercent, IP2PCFG2, RegionalObjectCleanupFactor, SigmaShift, SegmentThresholdMax, SegmentCFGScale, FluxGuidanceScale, Text2AudioDuration; public static T2IRegisteredParam InitImage, MaskImage, VideoEndFrame; public static T2IRegisteredParam VideoAudioInput, VideoAudioReference; - public static T2IRegisteredParam Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, VideoExtendModel, VideoExtendSwapModel; + public static T2IRegisteredParam Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, GptOssModel, VideoExtendModel, VideoExtendSwapModel; public static T2IRegisteredParam> Loras, LoraWeights, LoraTencWeights, LoraSectionConfinement; public static T2IRegisteredParam> PromptImages; public static T2IRegisteredParam OutputIntermediateImages, DoNotSave, DoNotSaveIntermediates, ControlNetPreviewOnly, RevisionZeroPrompt, RemoveBackground, NoSeedIncrement, NoPreviews, VideoBoomerang, ModelSpecificEnhancements, UseInpaintingEncode, MaskCompositeUnthresholded, SaveSegmentMask, InitImageRecompositeMask, UseReferenceOnly, RefinerDoTiling, AutomaticVAE, ZeroNegative, FluxDisableGuidance, SmartImagePromptResizing, NoLoadModels, 
NoInternalSpecialHandling, ForwardRawBackendData, ForwardSwarmData, @@ -715,6 +715,9 @@ static List listVaes(Session s) GemmaModel = Register(new("Gemma Model", "Which Gemma LLM to use as a text encoder, for models that use Gemma (such as Lumina2, LTX2).", "", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7 )); + GptOssModel = Register(new("GPT-OSS Model", "Which GPT-OSS LLM to use as a text encoder, for Lens-style 'diffusion_models' folder models.", + "", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7 + )); TorchCompile = Register(new("Torch Compile", "Torch.Compile is a way to dynamically accelerate AI models.\nIt wastes a bit of time (around a minute) on the first call compiling a graph of the generation, and then all subsequent generations run faster thanks to the compiled graph.\nTorch.Compile depends on Triton, which is difficult to install on Windows, easier on Linux.", "Disabled", IgnoreIf: "Disabled", GetValues: _ => ["Disabled", "inductor", "cudagraphs"], OrderPriority: 40, Group: GroupAdvancedModelAddons ));