diff --git a/docs.json b/docs.json index 8b5c470fb..d6817e2a2 100644 --- a/docs.json +++ b/docs.json @@ -329,7 +329,8 @@ "pages": [ "tutorials/video/kandinsky/kandinsky-5" ] - } + }, + "tutorials/video/bernini-r" ] }, { @@ -367,6 +368,7 @@ "tutorials/utility/video-segment-sam3", "tutorials/utility/remove-background-birefnet", "tutorials/utility/moge", + "tutorials/utility/depth-anything-3", { "group": "Face Detection", "pages": [ @@ -2854,7 +2856,8 @@ "pages": [ "zh/tutorials/video/kandinsky/kandinsky-5" ] - } + }, + "zh/tutorials/video/bernini-r" ] }, { @@ -2892,6 +2895,7 @@ "zh/tutorials/utility/video-segment-sam3", "zh/tutorials/utility/remove-background-birefnet", "zh/tutorials/utility/moge", + "zh/tutorials/utility/depth-anything-3", { "group": "人脸检测", "pages": [ @@ -5379,7 +5383,8 @@ "pages": [ "ja/tutorials/video/kandinsky/kandinsky-5" ] - } + }, + "ja/tutorials/video/bernini-r" ] }, { @@ -5417,6 +5422,7 @@ "ja/tutorials/utility/video-segment-sam3", "ja/tutorials/utility/remove-background-birefnet", "ja/tutorials/utility/moge", + "ja/tutorials/utility/depth-anything-3", { "group": "顔検出", "pages": [ @@ -7982,7 +7988,8 @@ "pages": [ "ko/tutorials/video/kandinsky/kandinsky-5" ] - } + }, + "ko/tutorials/video/bernini-r" ] }, { @@ -8020,6 +8027,7 @@ "ko/tutorials/utility/video-segment-sam3", "ko/tutorials/utility/remove-background-birefnet", "ko/tutorials/utility/moge", + "ko/tutorials/utility/depth-anything-3", { "group": "얼굴 감지", "pages": [ @@ -10390,4 +10398,4 @@ "destination": "/zh/:slug*" } ] -} +} \ No newline at end of file diff --git a/ja/tutorials/utility/depth-anything-3.mdx b/ja/tutorials/utility/depth-anything-3.mdx new file mode 100644 index 000000000..fd43a0f54 --- /dev/null +++ b/ja/tutorials/utility/depth-anything-3.mdx @@ -0,0 +1,124 @@ +--- +title: "ComfyUI Depth Anything 3 公式サンプル" +description: "ComfyUI で Depth Anything 3 を使用した画像と動画の単眼・多視点深度推定を学びましょう。" +sidebarTitle: "Depth Anything 3" +translationSourceHash: d646a0e3 +translationFrom: 
tutorials/utility/depth-anything-3.mdx +--- + +import UpdateReminder from '/snippets/ja/tutorials/update-reminder.mdx' + +# ComfyUI Depth Anything 3 概要 + +[Depth Anything 3 (DA3)](https://github.com/ByteDance-Seed/Depth-Anything-3) は、ByteDance Seed が開発したビジョントランスフォーマーで、カメラポーズの有無にかかわらず、任意のビジュアル入力から空間的に一貫したジオメトリを復元します。単一の DINO エンコーダと統一された深度-レイ表現により、同一モデルファミリーで単眼深度、多視点深度、カメラポーズ推定、3D 再構築をカバーします。 + +主な機能: + +- **統一された単眼・多視点深度**:単一または複数の画像から深度を推定 +- **カメラポーズ推定**:順序なし画像セットからカメラ位置を復元 +- **3D 再構築**:多視点入力をサポート +- **動画深度推定**:動画入力のフレームごとの深度シーケンスを生成 +- **複数のモデルバリアント**:Small、Base、Mono/Metric Large + + + + +ComfyUI は Depth Anything 3 ノードをネイティブサポートしています。始める前に [ComfyUI](https://github.com/Comfy-Org/ComfyUI) を最新バージョンに更新してください。 + + +## モデルインストール + +Depth Anything 3 チェックポイントをダウンロードし、対応する ComfyUI フォルダに保存します: + +- **Small** ([depth_anything_3_small.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_small.safetensors)) — 軽量で高速な推論 +- **Base** ([depth_anything_3_base.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_base.safetensors)) — バランスの取れた性能 +- **Mono-Large** ([depth_anything_3_mono_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors)) — 単眼深度に最適(空検出対応) +- **Metric-Large** ([depth_anything_3_metric_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_metric_large.safetensors)) — メートル単位の物理深度(空検出対応) + +``` +ComfyUI/ +├── models/ +│ ├── geometry_estimation/ +│ │ ├── depth_anything_3_small.safetensors +│ │ ├── depth_anything_3_base.safetensors +│ │ ├── depth_anything_3_mono_large.safetensors +│ │ └── depth_anything_3_metric_large.safetensors +``` + +## サンプルワークフロー + +--- + +## 1. 
画像深度推定 + +**機能説明:** 1 枚の画像をアップロードし、**Image Depth Estimation (Depth Anything 3)** を実行して深度マップを生成します。**Depth Preview** に元画像と深度出力のサイドバイサイド比較が表示されます。 + + + + JSON をダウンロード または テンプレートライブラリで "Depth Anything 3" を検索 + + + このワークフローのサンプル入力画像を取得 + + + +
+ 画像深度推定出力 + 画像深度推定比較 +
+ +### 実行手順 + +1. **LoadImage** — 入力画像を読み込む +2. **LoadDA3Model** — Depth Anything 3 バリアントを選択 +3. **実行** — Queue をクリックするか `Cmd+Enter` を押す +4. ワークフローが深度マップと並列比較を出力 + + + このワークフローはモジュール処理にサブグラフノードを使用しています。サブグラフのカスタマイズと拡張についてはサブグラフのドキュメントをご覧ください。 + + +--- + +## 2. 動画深度推定 + +**機能説明:** 動画をアップロードし、**Video Depth Estimation (Depth Anything 3)** を実行してフレームごとの深度シーケンスを生成します。サブグラフ内では **GetVideoComponents** が入力動画をフレームに分割し、**LoadDA3Model** がモデルを読み込み、**SetVideoComponents** が深度フレームを動画に再構成します。 + + + + JSON をダウンロード または テンプレートライブラリで "Depth Anything 3" を検索 + + + Comfy Cloud で開く + + + +![動画深度推定プレビュー](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/utility_depth_anything3_video_depth_estimation-1.webp) + +### 実行手順 + +1. **LoadVideo** — 入力動画を読み込む +2. **モデルを選択** — **Small**、**Base**、**Mono-Large**、**Metric-Large** から選択 +3. **実行** — Queue をクリックするか `Cmd+Enter` を押す +4. ワークフローがフレームごとの深度マップ動画を出力 + + + このワークフローはモジュール処理にサブグラフノードを使用しています。サブグラフのカスタマイズと拡張についてはサブグラフのドキュメントをご覧ください。 + + +## モデルバリアント + +| バリアント | head_type | 空検出 | 信頼度 | カメラデコーダ | 最適な用途 | +|-----------|-----------|:-------:|:------:|:--------------:|------------| +| **Small** | dualdpt | ❌ | ✅ | ✅ | 高速推論、モバイル/エッジ | +| **Base** | dualdpt | ❌ | ✅ | ✅ | バランスの取れた性能 | +| **Mono-Large** | dpt | ✅ | ❌ | ❌ | 空検出対応の単眼深度 | +| **Metric-Large** | dpt | ✅ | ❌ | ❌ | メートル単位の物理深度 | + +- **Small** と **Base** は `dualdpt` ヘッドタイプを使用し、信頼度推定とカメラデコーダをサポート(多視点アプリケーション向け)。 +- **Mono-Large** と **Metric-Large** は `dpt` ヘッドタイプを使用し、空検出に対応。Metric-Large はメートル単位の生深度を出力。 + +## コミュニティリソース + +- [Depth Anything 3 GitHub (ByteDance-Seed)](https://github.com/ByteDance-Seed/Depth-Anything-3) — 研究論文とコード +- [Comfy-Org/Depth-Anything-3](https://huggingface.co/Comfy-Org/Depth-Anything-3) — 公式 ComfyUI モデル重み diff --git a/ja/tutorials/video/bernini-r.mdx b/ja/tutorials/video/bernini-r.mdx new file mode 100644 index 000000000..059f6a9af --- /dev/null +++ b/ja/tutorials/video/bernini-r.mdx @@ -0,0 +1,138 @@ +--- +title: "ComfyUI Bernini-R 公式サンプル" 
+description: "ComfyUI で Bernini-R を使用した画像・動画編集(再照明、スタイル転送、被写体挿入など)を学びましょう。" +sidebarTitle: "Bernini-R" +translationSourceHash: 6d8b19fc +translationFrom: tutorials/video/bernini-r.mdx +--- + +import UpdateReminder from '/snippets/ja/tutorials/update-reminder.mdx' + +# ComfyUI Bernini-R 概要 + +[Bernini-R](https://github.com/bytedance/Bernini) は、ByteDance の **レンダラーのみ** の Wan 2.2 モデルで、コンテキスト内における画像・動画コンディショニング用に設計されています。条件ストリーム(ソース動画、参照画像、参照動画)を使用して生成をガイドするため、LoRA 訓練やファインチューニングは不要です。 + +主な機能: + +- **複数タスクを1つに統合**:画像/動画生成、編集、再照明、スタイル転送、被写体挿入 +- **コンテキスト内条件制御**:参照画像/動画を視覚的プロンプトとしてトークン注入 +- **軽量設計**:レンダラーのみ — 拡散ベースの text-to-video バックボーンは不要 +- **柔軟な入力対応**:単一または複数の参照画像、動画 to 動画、参照誘導編集 + +Bernini-R がサポートする6つのタスクタイプ: + +| タスク | 入力 | 説明 | +|-------|------|------| +| **t2v** | テキストプロンプト | テキストから動画生成 | +| **v2v** | ソース動画 | 動画 to 動画スタイル転送 | +| **rv2v** | ソース動画 + 参照画像 | 参照誘導動画編集(再照明、被写体挿入) | +| **r2v** | 参照画像 | 参照 to 動画生成 | +| **ads2v** | ソース動画 + 参照動画 | 画像/動画コンテンツをソース動画に挿入 | +| **img** | ソース画像 | 画像編集 | + + + + +ComfyUI は Bernini-R ノードをネイティブサポートしています。始める前に [ComfyUI](https://github.com/Comfy-Org/ComfyUI) を最新バージョンに更新してください。 + + +## モデルインストール + +必要なモデルウェイトをダウンロードし、対応する ComfyUI フォルダに保存します: + +**text_encoders:** +- [umt5_xxl_fp8_e4m3fn_scaled.safetensors](https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true) + +**vae:** +- [Wan2_1_VAE_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan2_1_VAE_bf16.safetensors?download=true) + +**loras:** +- [lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors?download=true) + +**diffusion_models:** +- [wan2.2_bernini_r_fp16.safetensors](https://huggingface.co/Comfy-Org/Bernini-R/resolve/main/wan2.2_bernini_r_fp16.safetensors) + +``` +ComfyUI/ +├── models/ +│ ├── text_encoders/ +│ │ └── 
umt5_xxl_fp8_e4m3fn_scaled.safetensors +│ ├── vae/ +│ │ └── Wan2_1_VAE_bf16.safetensors +│ ├── loras/ +│ │ └── lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors +│ ├── diffusion_models/ +│ │ └── wan2.2_bernini_r_fp16.safetensors +``` + +## サンプルワークフロー + +--- + +## 1. 画像編集 + +**機能説明:** 照明が一致した編集画像を生成し、前後の比較を並べて表示します。ポートレートやプロダクトの再照明、写真コレクションの一貫した照明、E コマースカタログ撮影に最適です。 + + + + JSON をダウンロード または テンプレートライブラリで "Bernini-R" を検索 + + + Comfy Cloud で開く + + + +
+ Bernini-R 画像編集出力 + Bernini-R 画像編集比較 +
+ +### 実行手順 + +1. **タスクタイプを選択** — タスクを選択(Image Editing、Subject to Image など) +2. **入力を接続** — ソース画像とオプションの参照画像を読み込む +3. **プロンプトを作成** — 必要な編集内容を記述 +4. **実行** — Queue をクリックするか `Cmd+Enter` を押す + +**参照画像入力:** 1枚以上の参照画像が必要な場合に使用(被写体、服、シーン、小道具)。プロンプト内で `image0`、`image1` などを使って各画像を参照します。**Image Editing** タスクでは不要です。代わりに `source_image` を使用します。 + + + このワークフローはモジュール処理にサブグラフノードを使用しています。サブグラフのカスタマイズと拡張についてはサブグラフのドキュメントをご覧ください。 + + +--- + +## 2. 動画編集 + +**機能説明:** Bernini-R で一貫した再照明の編集動画を生成します。ソース動画、オプションの参照画像や参照動画を接続し、タスクタイプを選択し、プロンプトを作成して実行します。 + + + + JSON をダウンロード または テンプレートライブラリで "Bernini-R" を検索 + + + Comfy Cloud で開く + + + +![Bernini-R 動画編集プレビュー](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/video_bernini_r_video_editing-1.webp) + +### 実行手順 + +1. **ソース動画を読み込む** — 入力動画を接続 +2. **(オプション)参照を読み込む** — 参照画像または参照動画 +3. **タスクタイプを選択** — v2v、rv2v、r2v、または ads2v +4. **プロンプトを作成** — 必要な編集内容を記述 +5. **実行** — Queue をクリックするか `Cmd+Enter` を押す + +**参照画像入力:** 1枚以上の参照画像が必要な場合に使用(rv2v、r2v、複数衣装)。バッチ処理された各画像が独自のコンテキスト内トークンになります。参照物が異なる役割を持つ場合は、プロンプト内で `image0`、`image1` などを使用します。 + + + このワークフローはモジュール処理にサブグラフノードを使用しています。サブグラフのカスタマイズと拡張についてはサブグラフのドキュメントをご覧ください。 + + +## コミュニティリソース + +- [Bernini GitHub (bytedance/Bernini)](https://github.com/bytedance/Bernini) — 研究論文とタスクドキュメント +- [Comfy-Org/Bernini-R](https://huggingface.co/Comfy-Org/Bernini-R) — 公式 ComfyUI モデル重み +- [Bernini: Latent Semantic Planning for Video Diffusion](https://arxiv.org/abs/2605.22344) — 研究論文 diff --git a/ko/tutorials/utility/depth-anything-3.mdx b/ko/tutorials/utility/depth-anything-3.mdx new file mode 100644 index 000000000..a99708ad3 --- /dev/null +++ b/ko/tutorials/utility/depth-anything-3.mdx @@ -0,0 +1,124 @@ +--- +title: "ComfyUI Depth Anything 3 공식 예제" +description: "ComfyUI에서 Depth Anything 3를 사용하여 이미지 및 비디오의 단일/다중 뷰 깊이 추정 방법을 알아보세요." 
+sidebarTitle: "Depth Anything 3" +translationSourceHash: d646a0e3 +translationFrom: tutorials/utility/depth-anything-3.mdx +--- + +import UpdateReminder from '/snippets/ko/tutorials/update-reminder.mdx' + +# ComfyUI Depth Anything 3 소개 + +[Depth Anything 3 (DA3)](https://github.com/ByteDance-Seed/Depth-Anything-3)은 ByteDance Seed의 비전 트랜스포머로, 카메라 포즈 유무와 관계없이 임의의 시각적 입력으로부터 공간적으로 일관된 기하학 정보를 복원합니다. 단일 DINO 인코더와 통합된 깊이-레이 표현을 통해 동일한 모델 패밀리로 단일 뷰 깊이, 다중 뷰 깊이, 카메라 포즈 추정 및 3D 재구성을 모두 처리합니다. + +주요 기능: + +- **통합 단일/다중 뷰 깊이**: 단일 또는 여러 이미지에서 깊이 추정 +- **카메라 포즈 추정**: 순서 없는 이미지 세트에서 카메라 위치 복원 +- **3D 재구성**: 다중 뷰 입력 지원 +- **비디오 깊이 추정**: 비디오 입력에 대한 프레임별 깊이 시퀀스 생성 +- **여러 모델 변형**: Small, Base, Mono/Metric Large + + + + +ComfyUI는 이제 Depth Anything 3 노드를 기본 지원합니다. 시작하기 전에 [ComfyUI](https://github.com/Comfy-Org/ComfyUI)를 최신 버전으로 업데이트하세요. + + +## 모델 설치 + +Depth Anything 3 체크포인트를 다운로드하여 해당 ComfyUI 폴더에 저장합니다: + +- **Small** ([depth_anything_3_small.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_small.safetensors)) — 가볍고 빠른 추론 +- **Base** ([depth_anything_3_base.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_base.safetensors)) — 균형 잡힌 성능 +- **Mono-Large** ([depth_anything_3_mono_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors)) — 단일 뷰 깊이에 최적 (하늘 감지 포함) +- **Metric-Large** ([depth_anything_3_metric_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_metric_large.safetensors)) — 미터 단위의 물리적 깊이 (하늘 감지 포함) + +``` +ComfyUI/ +├── models/ +│ ├── geometry_estimation/ +│ │ ├── depth_anything_3_small.safetensors +│ │ ├── depth_anything_3_base.safetensors +│ │ ├── depth_anything_3_mono_large.safetensors +│ │ └── depth_anything_3_metric_large.safetensors +``` + +## 예제 워크플로우 + +--- + +## 1. 
이미지 깊이 추정 + +**기능 설명:** 이미지 1장을 업로드하고 **Image Depth Estimation (Depth Anything 3)**을 실행하여 깊이 맵을 생성합니다. **Depth Preview**에 원본 이미지와 깊이 출력의 나란히 비교 결과가 표시됩니다. + + + + JSON 다운로드 또는 템플릿 라이브러리에서 "Depth Anything 3" 검색 + + + 이 워크플로우의 예제 입력 이미지 가져오기 + + + +
+ 이미지 깊이 추정 출력 + 이미지 깊이 추정 비교 +
+ +### 실행 단계 + +1. **LoadImage** — 입력 이미지 로드 +2. **LoadDA3Model** — Depth Anything 3 변형 선택 +3. **실행** — Queue 클릭 또는 `Cmd+Enter` +4. 워크플로우가 깊이 맵과 나란히 비교 결과 출력 + + + 이 워크플로우는 모듈식 처리를 위해 서브그래프 노드를 사용합니다. 서브그래프 사용자 지정 및 확장에 대한 자세한 내용은 서브그래프 문서를 확인하세요. + + +--- + +## 2. 비디오 깊이 추정 + +**기능 설명:** 비디오를 업로드하고 **Video Depth Estimation (Depth Anything 3)**을 실행하여 프레임별 깊이 시퀀스를 생성합니다. 서브그래프 내에서 **GetVideoComponents**가 입력 비디오를 프레임으로 분할하고, **LoadDA3Model**이 모델을 로드하며, **SetVideoComponents**가 깊이 프레임을 비디오로 재구성합니다. + + + + JSON 다운로드 또는 템플릿 라이브러리에서 "Depth Anything 3" 검색 + + + Comfy Cloud에서 열기 + + + +![비디오 깊이 추정 미리보기](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/utility_depth_anything3_video_depth_estimation-1.webp) + +### 실행 단계 + +1. **LoadVideo** — 입력 비디오 로드 +2. **모델 선택** — **Small**, **Base**, **Mono-Large** 또는 **Metric-Large** 중 선택 +3. **실행** — Queue 클릭 또는 `Cmd+Enter` +4. 워크플로우가 프레임별 깊이 맵 비디오 출력 + + + 이 워크플로우는 모듈식 처리를 위해 서브그래프 노드를 사용합니다. 서브그래프 사용자 지정 및 확장에 대한 자세한 내용은 서브그래프 문서를 확인하세요. 
+ + +## 모델 변형 + +| 변형 | head_type | 하늘 감지 | 신뢰도 | 카메라 디코더 | 최적 용도 | +|------|-----------|:---------:|:------:|:-------------:|-----------| +| **Small** | dualdpt | ❌ | ✅ | ✅ | 빠른 추론, 모바일/엣지 | +| **Base** | dualdpt | ❌ | ✅ | ✅ | 균형 잡힌 성능 | +| **Mono-Large** | dpt | ✅ | ❌ | ❌ | 하늘 감지 지원 단일 뷰 깊이 | +| **Metric-Large** | dpt | ✅ | ❌ | ❌ | 미터 단위 물리적 깊이 | + +- **Small**과 **Base**는 `dualdpt` 헤드 유형을 사용하며, 신뢰도 추정 및 카메라 디코더를 지원합니다(다중 뷰 애플리케이션용). +- **Mono-Large**와 **Metric-Large**는 `dpt` 헤드 유형을 사용하며, 하늘 감지를 지원합니다. Metric-Large는 미터 단위의 원시 깊이를 출력합니다. + +## 커뮤니티 리소스 + +- [Depth Anything 3 GitHub (ByteDance-Seed)](https://github.com/ByteDance-Seed/Depth-Anything-3) — 연구 논문 및 코드 +- [Comfy-Org/Depth-Anything-3](https://huggingface.co/Comfy-Org/Depth-Anything-3) — 공식 ComfyUI 모델 가중치 diff --git a/ko/tutorials/video/bernini-r.mdx b/ko/tutorials/video/bernini-r.mdx new file mode 100644 index 000000000..daafbc9bf --- /dev/null +++ b/ko/tutorials/video/bernini-r.mdx @@ -0,0 +1,138 @@ +--- +title: "ComfyUI Bernini-R 공식 예제" +description: "ComfyUI에서 Bernini-R을 사용한 이미지 및 비디오 편집(재조명, 스타일 변환, 피사체 삽입 등)을 알아보세요." +sidebarTitle: "Bernini-R" +translationSourceHash: 6d8b19fc +translationFrom: tutorials/video/bernini-r.mdx +--- + +import UpdateReminder from '/snippets/ko/tutorials/update-reminder.mdx' + +# ComfyUI Bernini-R 소개 + +[Bernini-R](https://github.com/bytedance/Bernini)은 ByteDance의 **렌더러 전용** Wan 2.2 모델로, 컨텍스트 내 이미지 및 비디오 조건 제어를 위해 설계되었습니다. 조건 스트림(소스 비디오, 참조 이미지, 참조 비디오)을 사용하여 생성을 안내하므로 LoRA 학습이나 파인튜닝이 필요하지 않습니다. 
+ +주요 기능: + +- **하나로 통합된 여러 작업 유형**: 이미지/비디오 생성, 편집, 재조명, 스타일 변환, 피사체 삽입 +- **컨텍스트 내 조건 제어**: 참조 이미지/비디오를 시각적 프롬프트로 토큰 주입 +- **경량**: 렌더러 전용 — 확산 기반 text-to-video 백본 불필요 +- **유연한 입력 지원**: 단일 또는 다중 참조 이미지, 비디오 to 비디오, 참조 유도 편집 + +Bernini-R이 지원하는 6가지 작업 유형: + +| 작업 | 입력 | 설명 | +|------|------|------| +| **t2v** | 텍스트 프롬프트 | 텍스트로 비디오 생성 | +| **v2v** | 소스 비디오 | 비디오 to 비디오 스타일 변환 | +| **rv2v** | 소스 비디오 + 참조 이미지 | 참조 유도 비디오 편집(재조명, 피사체 삽입) | +| **r2v** | 참조 이미지 | 참조 to 비디오 생성 | +| **ads2v** | 소스 비디오 + 참조 비디오 | 이미지/비디오 콘텐츠를 소스 비디오에 삽입 | +| **img** | 소스 이미지 | 이미지 편집 | + + + + +ComfyUI는 이제 Bernini-R 노드를 기본 지원합니다. 시작하기 전에 [ComfyUI](https://github.com/Comfy-Org/ComfyUI)를 최신 버전으로 업데이트하세요. + + +## 모델 설치 + +필요한 모델 가중치를 다운로드하여 해당 ComfyUI 폴더에 저장합니다: + +**text_encoders:** +- [umt5_xxl_fp8_e4m3fn_scaled.safetensors](https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true) + +**vae:** +- [Wan2_1_VAE_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan2_1_VAE_bf16.safetensors?download=true) + +**loras:** +- [lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors?download=true) + +**diffusion_models:** +- [wan2.2_bernini_r_fp16.safetensors](https://huggingface.co/Comfy-Org/Bernini-R/resolve/main/wan2.2_bernini_r_fp16.safetensors) + +``` +ComfyUI/ +├── models/ +│ ├── text_encoders/ +│ │ └── umt5_xxl_fp8_e4m3fn_scaled.safetensors +│ ├── vae/ +│ │ └── Wan2_1_VAE_bf16.safetensors +│ ├── loras/ +│ │ └── lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors +│ ├── diffusion_models/ +│ │ └── wan2.2_bernini_r_fp16.safetensors +``` + +## 예제 워크플로우 + +--- + +## 1. 이미지 편집 + +**기능 설명:** 조명이 일치하는 편집된 이미지를 생성하고, 전후 비교를 나란히 표시합니다. 인물 및 제품 재조명, 사진 컬렉션의 일관된 조명, 전자상거래 카탈로그 촬영에 적합합니다. 
+ + + + JSON 다운로드 또는 템플릿 라이브러리에서 "Bernini-R" 검색 + + + Comfy Cloud에서 열기 + + + +
+ Bernini-R 이미지 편집 출력 + Bernini-R 이미지 편집 비교 +
+ +### 실행 단계 + +1. **작업 유형 선택** — 작업 선택 (Image Editing, Subject to Image 등) +2. **입력 연결** — 소스 이미지 및 선택적 참조 이미지 로드 +3. **프롬프트 작성** — 원하는 편집 내용 설명 +4. **실행** — Queue 클릭 또는 `Cmd+Enter` + +**참조 이미지 입력:** 하나 이상의 참조 이미지가 필요할 때 사용 (피사체, 의상, 장면, 소품). 프롬프트에서 `image0`, `image1` 등을 사용하여 각 이미지를 참조합니다. **Image Editing** 작업에는 필요하지 않습니다. 대신 `source_image`를 사용합니다. + + + 이 워크플로우는 모듈식 처리를 위해 서브그래프 노드를 사용합니다. 서브그래프 사용자 지정 및 확장에 대한 자세한 내용은 서브그래프 문서를 확인하세요. + + +--- + +## 2. 비디오 편집 + +**기능 설명:** Bernini-R로 일관된 재조명이 적용된 편집 비디오를 생성합니다. 소스 비디오, 선택적 참조 이미지 또는 참조 비디오를 연결하고, 작업 유형을 선택한 후 프롬프트를 작성하여 실행합니다. + + + + JSON 다운로드 또는 템플릿 라이브러리에서 "Bernini-R" 검색 + + + Comfy Cloud에서 열기 + + + +![Bernini-R 비디오 편집 미리보기](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/video_bernini_r_video_editing-1.webp) + +### 실행 단계 + +1. **소스 비디오 로드** — 입력 비디오 연결 +2. **(선택) 참조 로드** — 참조 이미지 또는 참조 비디오 +3. **작업 유형 선택** — v2v, rv2v, r2v 또는 ads2v +4. **프롬프트 작성** — 원하는 편집 내용 설명 +5. **실행** — Queue 클릭 또는 `Cmd+Enter` + +**참조 이미지 입력:** 하나 이상의 참조 이미지가 필요할 때 사용 (rv2v, r2v, 여러 의상). 배치 처리된 각 이미지는 고유한 컨텍스트 내 토큰이 됩니다. 참조물이 다른 역할을 하는 경우 프롬프트에서 `image0`, `image1` 등을 사용합니다. + + + 이 워크플로우는 모듈식 처리를 위해 서브그래프 노드를 사용합니다. 서브그래프 사용자 지정 및 확장에 대한 자세한 내용은 서브그래프 문서를 확인하세요. + + +## 커뮤니티 리소스 + +- [Bernini GitHub (bytedance/Bernini)](https://github.com/bytedance/Bernini) — 연구 논문 및 작업 문서 +- [Comfy-Org/Bernini-R](https://huggingface.co/Comfy-Org/Bernini-R) — 공식 ComfyUI 모델 가중치 +- [Bernini: Latent Semantic Planning for Video Diffusion](https://arxiv.org/abs/2605.22344) — 연구 논문 diff --git a/tutorials/utility/depth-anything-3.mdx b/tutorials/utility/depth-anything-3.mdx new file mode 100644 index 000000000..7ba26dc3c --- /dev/null +++ b/tutorials/utility/depth-anything-3.mdx @@ -0,0 +1,124 @@ +--- +title: "ComfyUI Depth Anything 3 Examples" +description: "Learn how to use Depth Anything 3 in ComfyUI for monocular and multi-view depth estimation from images and videos, powered by ByteDance Seed's vision transformer." 
+sidebarTitle: "Depth Anything 3" +--- + +import UpdateReminder from '/snippets/tutorials/update-reminder.mdx' + +# ComfyUI Depth Anything 3 Introduction + +[Depth Anything 3 (DA3)](https://github.com/ByteDance-Seed/Depth-Anything-3), from ByteDance Seed, is a vision transformer that recovers spatially consistent geometry from arbitrary visual inputs, with or without known camera poses. A single plain DINO encoder and unified depth-ray representation let one model family cover monocular depth, multi-view depth, camera pose estimation, and 3D reconstruction. + +Key capabilities: + +- **Unified monocular & multi-view depth**: estimate depth from a single image or multiple views +- **Camera pose estimation**: recover camera positions from unordered image sets +- **3D reconstruction** from multi-view inputs +- **Video depth estimation**: per-frame depth sequences for video inputs +- **Multiple model variants**: Small, Base, Mono/Metric Large + + + + +ComfyUI now natively supports Depth Anything 3 nodes. Make sure you have updated to the latest version of [ComfyUI](https://github.com/Comfy-Org/ComfyUI) before starting. 
+ + +## Model Installation + +Download the Depth Anything 3 checkpoint(s) and save them to the corresponding ComfyUI folder: + +- **Small** ([depth_anything_3_small.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_small.safetensors)) — Lightweight, fast inference +- **Base** ([depth_anything_3_base.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_base.safetensors)) — Balanced performance +- **Mono-Large** ([depth_anything_3_mono_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors)) — Best for monocular depth, includes sky detection +- **Metric-Large** ([depth_anything_3_metric_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_metric_large.safetensors)) — Metric scale depth in metres, includes sky detection + +``` +ComfyUI/ +├── models/ +│ ├── geometry_estimation/ +│ │ ├── depth_anything_3_small.safetensors +│ │ ├── depth_anything_3_base.safetensors +│ │ ├── depth_anything_3_mono_large.safetensors +│ │ └── depth_anything_3_metric_large.safetensors +``` + +## Example Workflows + +--- + +## 1. Image Depth Estimation + +**What it does:** Upload one image and run **Image Depth Estimation (Depth Anything 3)** to produce a depth map. The result is shown in **Depth Preview**, with a side-by-side comparison view of the original image and depth output. + + + + Download JSON or search "Depth Anything 3" in Template Library + + + Get the example input image for this workflow + + + +
+ Image Depth Estimation output + Image Depth Estimation comparison +
+ +### Steps to Run + +1. **LoadImage** — load your input image +2. **LoadDA3Model** — select a Depth Anything 3 variant +3. **Run** — click Queue or use `Cmd+Enter` +4. The workflow outputs a depth map and side-by-side comparison + + + This workflow uses Subgraph nodes for modular processing. Check out the Subgraph documentation to learn how to customize and extend the workflow. + + +--- + +## 2. Video Depth Estimation + +**What it does:** Upload a video and run **Video Depth Estimation (Depth Anything 3)** to produce a per-frame depth sequence. Inside the subgraph, **GetVideoComponents** splits the input video into frames, **LoadDA3Model** loads the model, and **SetVideoComponents** reassembles the depth frames back into a video output. + + + + Download JSON or search "Depth Anything 3" in Template Library + + + Open in Comfy Cloud + + + +![Video Depth Estimation preview](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/utility_depth_anything3_video_depth_estimation-1.webp) + +### Steps to Run + +1. **LoadVideo** — load your input video +2. **Select Model** — choose between **Small**, **Base**, **Mono-Large**, or **Metric-Large** +3. **Run** — click Queue or use `Cmd+Enter` +4. The workflow outputs a video with per-frame depth maps + + + This workflow uses Subgraph nodes for modular processing. Check out the Subgraph documentation to learn how to customize and extend the workflow. 
+ + +--- + +## Model Variants + +| Variant | head_type | has_sky | has_confidence | camera_decoder | Best for | +|---------|-----------|:-------:|:--------------:|:--------------:|----------| +| **Small** | dualdpt | ❌ | ✅ | ✅ | Fast inference, mobile/edge | +| **Base** | dualdpt | ❌ | ✅ | ✅ | Balanced performance | +| **Mono-Large** | dpt | ✅ | ❌ | ❌ | Monocular depth with sky detection | +| **Metric-Large** | dpt | ✅ | ❌ | ❌ | Physical metric depth in metres | + +- **Small** and **Base** use the `dualdpt` head type with confidence estimation and camera decoder support for multi-view applications. +- **Mono-Large** and **Metric-Large** use the `dpt` head type with sky detection. Metric-Large outputs raw depth in metres. + +## Community Resources + +- [Depth Anything 3 GitHub (ByteDance-Seed)](https://github.com/ByteDance-Seed/Depth-Anything-3) — Research paper and code +- [Comfy-Org/Depth-Anything-3](https://huggingface.co/Comfy-Org/Depth-Anything-3) — Official ComfyUI model weights diff --git a/tutorials/video/bernini-r.mdx b/tutorials/video/bernini-r.mdx new file mode 100644 index 000000000..d4bbd8573 --- /dev/null +++ b/tutorials/video/bernini-r.mdx @@ -0,0 +1,137 @@ +--- +title: "ComfyUI Bernini-R Examples" +description: "Learn how to use Bernini-R in ComfyUI for image and video editing with in-context conditioning — relighting, restyling, subject insertion, and more." +sidebarTitle: "Bernini-R" +--- + +import UpdateReminder from '/snippets/tutorials/update-reminder.mdx' + +# ComfyUI Bernini-R Introduction + +[Bernini-R](https://github.com/bytedance/Bernini) is ByteDance's **renderer-only** Wan 2.2 model for in-context image and video conditioning. It uses a set of conditioning streams (source video, reference images, reference video) to guide generation — no LoRA training or fine-tuning required. 
+ +Key capabilities: + +- **Multiple task types in one model**: image/video generation, editing, relighting, restyling, subject insertion +- **In-context conditioning**: reference images/videos act as visual prompts, injected as tokens +- **Lightweight**: renderer-only model — no diffusion-based text-to-video backbone +- **Flexible input support**: single or multi-image references, video-to-video, reference-guided editing + +Bernini-R supports these task types: + +| Task | Inputs | Description | +|------|--------|-------------| +| **t2v** | Text prompt | Text-to-video generation | +| **v2v** | Source video | Video-to-video restyling | +| **rv2v** | Source video + ref image(s) | Reference-guided video editing (relighting, subject insertion) | +| **r2v** | Reference image(s) only | Reference-to-video generation | +| **ads2v** | Source video + ref video | Insert image/video content into source video | + + + + +ComfyUI now natively supports Bernini-R nodes. Make sure you have updated to the latest version of [ComfyUI](https://github.com/Comfy-Org/ComfyUI) before starting. 
+ + +## Model Installation + +Download the required model weights and save them to the corresponding ComfyUI folders: + +**text_encoders:** +- [umt5_xxl_fp8_e4m3fn_scaled.safetensors](https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true) + +**vae:** +- [Wan2_1_VAE_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan2_1_VAE_bf16.safetensors?download=true) + +**loras:** +- [lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors?download=true) + +**diffusion_models:** +- [wan2.2_bernini_r_fp16.safetensors](https://huggingface.co/Comfy-Org/Bernini-R/resolve/main/wan2.2_bernini_r_fp16.safetensors) + +``` +ComfyUI/ +├── models/ +│ ├── text_encoders/ +│ │ └── umt5_xxl_fp8_e4m3fn_scaled.safetensors +│ ├── vae/ +│ │ └── Wan2_1_VAE_bf16.safetensors +│ ├── loras/ +│ │ └── lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors +│ ├── diffusion_models/ +│ │ └── wan2.2_bernini_r_fp16.safetensors +``` + +## Example Workflows + +--- + +## 1. Image Editing + +**What it does:** Generate an edited image with matched lighting and view a side-by-side before/after comparison. Ideal for portrait and product relighting, consistent lighting across photo sets, and e-commerce catalog photography. + + + + Download JSON or search "Bernini-R" in Template Library + + + Open in Comfy Cloud + + + +
+ Bernini-R Image Editing output + Bernini-R Image Editing comparison +
+ +### Steps to Run + +1. **Select Task Type** — choose your task (Image Editing, Subject to Image, etc.) +2. **Connect Inputs** — load source image and optional reference images +3. **Write Prompt** — describe the desired edit +4. **Run** — click Queue or use `Cmd+Enter` + +**Reference Image input:** Use for **Subject to Image** when you need one or more reference images (subject, outfit, scene, props). In the prompt, use `image0`, `image1`, … to reference each image. Not needed for **Image Editing** — that task uses `source_image` instead. + + + This workflow uses Subgraph nodes for modular processing. Check out the Subgraph documentation to learn how to customize and extend the workflow. + + +--- + +## 2. Video Editing + +**What it does:** Generate an edited video with consistent relighting using Bernini-R. Connect a source video, optional reference image(s) or reference video, pick the task type, write a prompt, and run. + + + + Download JSON or search "Bernini-R" in Template Library + + + Open in Comfy Cloud + + + +![Bernini-R Video Editing preview](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/video_bernini_r_video_editing-1.webp) + +### Steps to Run + +1. **Load Source Video** — connect your input video +2. **(Optional) Load References** — reference image(s) or reference video +3. **Select Task Type** — v2v, rv2v, r2v, or ads2v +4. **Write Prompt** — describe the desired edit +5. **Run** — click Queue or use `Cmd+Enter` + +**Reference Image input:** Use when a task needs one or more reference images (rv2v, r2v, multi-piece outfits). Each batched image becomes its own in-context token. Mention `image0`, `image1`, … in the prompt if references play different roles. + + + This workflow uses Subgraph nodes for modular processing. Check out the Subgraph documentation to learn how to customize and extend the workflow. 
+ + +--- + +## Community Resources + +- [Bernini GitHub (bytedance/Bernini)](https://github.com/bytedance/Bernini) — Research paper and task documentation +- [Comfy-Org/Bernini-R](https://huggingface.co/Comfy-Org/Bernini-R) — Official ComfyUI model weights +- [Bernini: Latent Semantic Planning for Video Diffusion](https://arxiv.org/abs/2605.22344) — Research paper diff --git a/zh/tutorials/utility/depth-anything-3.mdx b/zh/tutorials/utility/depth-anything-3.mdx new file mode 100644 index 000000000..8bbbcb0ec --- /dev/null +++ b/zh/tutorials/utility/depth-anything-3.mdx @@ -0,0 +1,124 @@ +--- +title: "ComfyUI Depth Anything 3 官方示例" +description: "了解如何在 ComfyUI 中使用 Depth Anything 3 进行单目和多视角深度估计,支持图像和视频输入。" +sidebarTitle: "Depth Anything 3" +translationSourceHash: d646a0e3 +translationFrom: tutorials/utility/depth-anything-3.mdx +--- + +import UpdateReminder from '/snippets/zh/tutorials/update-reminder.mdx' + +# ComfyUI Depth Anything 3 简介 + +[Depth Anything 3 (DA3)](https://github.com/ByteDance-Seed/Depth-Anything-3) 来自字节跳动豆包团队,是一个视觉 Transformer,能够从任意视觉输入中恢复空间一致的几何信息,无论是否具有已知的相机位姿。单个 DINO 编码器和统一的深度-射线表示使得同一模型家族能够覆盖单目深度、多视角深度、相机位姿估计和 3D 重建。 + +主要能力: + +- **统一单目与多视角深度**:从单张图像或多张图像估计深度 +- **相机位姿估计**:从无序图像集合中恢复相机位置 +- **3D 重建**:支持多视角输入 +- **视频深度估计**:为视频输入生成逐帧深度序列 +- **多种模型变体**:Small、Base、Mono/Metric Large + + + + +ComfyUI 现已原生支持 Depth Anything 3 节点。开始前请确保已更新到最新版本的 [ComfyUI](https://github.com/Comfy-Org/ComfyUI)。 + + +## 模型下载 + +下载 Depth Anything 3 的模型文件并将其保存到对应的 ComfyUI 文件夹: + +- **Small** ([depth_anything_3_small.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_small.safetensors)) — 轻量快速推理 +- **Base** ([depth_anything_3_base.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_base.safetensors)) — 平衡性能 +- **Mono-Large** 
([depth_anything_3_mono_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors)) — 最佳单目深度,含天空检测 +- **Metric-Large** ([depth_anything_3_metric_large.safetensors](https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_metric_large.safetensors)) — 物理度量深度(米级),含天空检测 + +``` +ComfyUI/ +├── models/ +│ ├── geometry_estimation/ +│ │ ├── depth_anything_3_small.safetensors +│ │ ├── depth_anything_3_base.safetensors +│ │ ├── depth_anything_3_mono_large.safetensors +│ │ └── depth_anything_3_metric_large.safetensors +``` + +## 示例工作流 + +--- + +## 1. 图像深度估计 + +**功能说明:** 上传一张图像,使用 **Image Depth Estimation (Depth Anything 3)** 生成深度图。结果在 **Depth Preview** 中显示,提供原始图像与深度输出的并排对比视图。 + + + + 下载 JSON 或在模板库中搜索 "Depth Anything 3" + + + 获取此工作流的示例输入图片 + + + +
+ 图像深度估计输出 + 图像深度估计对比 +
+ +### 运行步骤 + +1. **LoadImage** — 加载输入图像 +2. **LoadDA3Model** — 选择 Depth Anything 3 变体 +3. **运行** — 点击 Queue 或使用 `Cmd+Enter` +4. 工作流输出深度图和并排比较结果 + + + 此工作流使用子图节点进行模块化处理。查看子图文档了解如何自定义和扩展工作流。 + + +--- + +## 2. 视频深度估计 + +**功能说明:** 上传一个视频,运行 **Video Depth Estimation (Depth Anything 3)** 生成逐帧深度序列。在子图内部,**GetVideoComponents** 将输入视频拆分为帧,**LoadDA3Model** 加载模型,**SetVideoComponents** 将深度帧重新组合为视频输出。 + + + + 下载 JSON 或在模板库中搜索 "Depth Anything 3" + + + 在 Comfy Cloud 中打开 + + + +![视频深度估计预览](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/utility_depth_anything3_video_depth_estimation-1.webp) + +### 运行步骤 + +1. **LoadVideo** — 加载输入视频 +2. **选择模型** — 在 **Small**、**Base**、**Mono-Large** 或 **Metric-Large** 中选择 +3. **运行** — 点击 Queue 或使用 `Cmd+Enter` +4. 工作流输出逐帧深度图视频 + + + 此工作流使用子图节点进行模块化处理。查看子图文档了解如何自定义和扩展工作流。 + + +## 模型变体 + +| 变体 | head_type | 天空检测 | 置信度 | 相机解码 | 最佳用途 | +|------|-----------|:-------:|:------:|:--------:|----------| +| **Small** | dualdpt | ❌ | ✅ | ✅ | 快速推理、移动/边缘设备 | +| **Base** | dualdpt | ❌ | ✅ | ✅ | 均衡性能 | +| **Mono-Large** | dpt | ✅ | ❌ | ❌ | 带天空检测的单目深度 | +| **Metric-Large** | dpt | ✅ | ❌ | ❌ | 物理度量深度(米级输出) | + +- **Small** 和 **Base** 使用 `dualdpt` 头类型,支持置信度估计和相机解码器,适用于多视角应用。 +- **Mono-Large** 和 **Metric-Large** 使用 `dpt` 头类型,支持天空检测。Metric-Large 输出原始米级深度。 + +## 社区资源 + +- [Depth Anything 3 GitHub (ByteDance-Seed)](https://github.com/ByteDance-Seed/Depth-Anything-3) — 研究论文和代码 +- [Comfy-Org/Depth-Anything-3](https://huggingface.co/Comfy-Org/Depth-Anything-3) — 官方 ComfyUI 模型权重 diff --git a/zh/tutorials/video/bernini-r.mdx b/zh/tutorials/video/bernini-r.mdx new file mode 100644 index 000000000..de1f2f745 --- /dev/null +++ b/zh/tutorials/video/bernini-r.mdx @@ -0,0 +1,138 @@ +--- +title: "ComfyUI Bernini-R 官方示例" +description: "了解如何在 ComfyUI 中使用 Bernini-R 进行图像和视频编辑——重光照、风格转换、主体插入等。" +sidebarTitle: "Bernini-R" +translationSourceHash: 6d8b19fc +translationFrom: tutorials/video/bernini-r.mdx +--- + +import UpdateReminder from 
'/snippets/zh/tutorials/update-reminder.mdx' + +# ComfyUI Bernini-R 简介 + +[Bernini-R](https://github.com/bytedance/Bernini) 是字节跳动的 **仅渲染器** Wan 2.2 模型,专为上下文内图像和视频条件控制设计。它使用一组条件流(源视频、参考图像、参考视频)来引导生成——无需 LoRA 训练或微调。 + +主要能力: + +- **多种任务类型合一**:图像/视频生成、编辑、重光照、风格转换、主体插入 +- **上下文内条件控制**:参考图像/视频作为视觉提示,以令牌方式注入 +- **轻量级**:仅渲染器模型——无需基于扩散的文本到视频骨干 +- **灵活输入支持**:单图或多图参考、视频到视频、参考引导编辑 + +Bernini-R 支持的六种任务类型: + +| 任务 | 输入 | 说明 | +|------|------|------| +| **t2v** | 文本提示 | 文生视频 | +| **v2v** | 源视频 | 视频到视频风格转换 | +| **rv2v** | 源视频 + 参考图像 | 参考引导视频编辑(重光照、主体插入) | +| **r2v** | 参考图像 | 参考到视频生成 | +| **ads2v** | 源视频 + 参考视频 | 将图像/视频内容插入源视频 | +| **img** | 源图像 | 图像编辑 | + + + + +ComfyUI 现已原生支持 Bernini-R 节点。开始前请确保已更新到最新版本的 [ComfyUI](https://github.com/Comfy-Org/ComfyUI)。 + + +## 模型下载 + +下载所需的模型权重并将其保存到对应的 ComfyUI 文件夹: + +**text_encoders:** +- [umt5_xxl_fp8_e4m3fn_scaled.safetensors](https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true) + +**vae:** +- [Wan2_1_VAE_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan2_1_VAE_bf16.safetensors?download=true) + +**loras:** +- [lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors](https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors?download=true) + +**diffusion_models:** +- [wan2.2_bernini_r_fp16.safetensors](https://huggingface.co/Comfy-Org/Bernini-R/resolve/main/wan2.2_bernini_r_fp16.safetensors) + +``` +ComfyUI/ +├── models/ +│ ├── text_encoders/ +│ │ └── umt5_xxl_fp8_e4m3fn_scaled.safetensors +│ ├── vae/ +│ │ └── Wan2_1_VAE_bf16.safetensors +│ ├── loras/ +│ │ └── lightx2v_T2V_14B_cfg_step_distill_v2_lora_rank64_bf16.safetensors +│ ├── diffusion_models/ +│ │ └── wan2.2_bernini_r_fp16.safetensors +``` + +## 示例工作流 + +--- + +## 1. 
图像编辑 + +**功能说明:** 生成具有匹配光照的编辑图像,并查看编辑前后的并排对比。适用于人像和产品重光照、统一照片集的光照、电商目录摄影。 + + + + 下载 JSON 或在模板库中搜索 "Bernini-R" + + + 在 Comfy Cloud 中打开 + + + +
+ Bernini-R 图像编辑输出 + Bernini-R 图像编辑对比 +
+ +### 运行步骤 + +1. **选择任务类型** — 选择你的任务(Image Editing、Subject to Image 等) +2. **连接输入** — 加载源图像和可选参考图像 +3. **编写提示词** — 描述所需的编辑 +4. **运行** — 点击 Queue 或使用 `Cmd+Enter` + +**参考图像输入:** 当需要一张或多张参考图像时使用(主体、服装、场景、道具)。在提示词中使用 `image0`、`image1` 等来引用每张图像。**图像编辑(Image Editing)** 任务不需要此项——该任务使用 `source_image` 代替。 + + + 此工作流使用子图节点进行模块化处理。查看子图文档了解如何自定义和扩展工作流。 + + +--- + +## 2. 视频编辑 + +**功能说明:** 使用 Bernini-R 生成具有一致重光照的编辑视频。连接源视频、可选的参考图像或参考视频,选择任务类型,编写提示词,然后运行。 + + + + 下载 JSON 或在模板库中搜索 "Bernini-R" + + + 在 Comfy Cloud 中打开 + + + +![Bernini-R 视频编辑预览](https://raw.githubusercontent.com/Comfy-Org/workflow_templates/main/templates/video_bernini_r_video_editing-1.webp) + +### 运行步骤 + +1. **加载源视频** — 连接输入视频 +2. **(可选)加载参考** — 参考图像或参考视频 +3. **选择任务类型** — v2v、rv2v、r2v 或 ads2v +4. **编写提示词** — 描述所需的编辑 +5. **运行** — 点击 Queue 或使用 `Cmd+Enter` + +**参考图像输入:** 当需要一张或多张参考图像时使用(rv2v、r2v、多件服装)。批次中的每张图像都会成为一个独立的上下文内令牌。如果各参考图像承担不同的角色,请在提示词中使用 `image0`、`image1` 等分别指代它们。 + + + 此工作流使用子图节点进行模块化处理。查看子图文档了解如何自定义和扩展工作流。 + + +## 社区资源 + +- [Bernini GitHub (bytedance/Bernini)](https://github.com/bytedance/Bernini) — 研究论文和任务文档 +- [Comfy-Org/Bernini-R](https://huggingface.co/Comfy-Org/Bernini-R) — 官方 ComfyUI 模型权重 +- [Bernini: Latent Semantic Planning for Video Diffusion](https://arxiv.org/abs/2605.22344) — 研究论文