CompVis
diff --git a/‎LICENSE.txt‎
Lines changed: 1 addition & 1 deletion b/‎LICENSE.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 56 additions & 10 deletions b/‎README.md‎
Lines changed: 56 additions & 10 deletions
diff --git a/‎configs/ViT-L-14_stats.th‎
6.91 KB b/‎configs/ViT-L-14_stats.th‎
6.91 KB
diff --git a/‎configs/inference.yaml‎
Lines changed: 38 additions & 0 deletions b/‎configs/inference.yaml‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎configs/v2-1-stable-unclip-l-inference.yaml‎
Lines changed: 83 additions & 0 deletions b/‎configs/v2-1-stable-unclip-l-inference.yaml‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎image_samples/Cubism/02316.png‎
333 KB b/‎image_samples/Cubism/02316.png‎
333 KB
diff --git a/‎image_samples/Cubism/09728.png‎
1.42 MB b/‎image_samples/Cubism/09728.png‎
1.42 MB
diff --git a/‎image_samples/Cyberpunk/02316.png‎
395 KB b/‎image_samples/Cyberpunk/02316.png‎
395 KB
diff --git a/‎image_samples/Cyberpunk/09728.png‎
392 KB b/‎image_samples/Cyberpunk/09728.png‎
392 KB
diff --git a/‎image_samples/Drip_Painting/02316.png‎
458 KB b/‎image_samples/Drip_Painting/02316.png‎
458 KB
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2025 Johannes Schusterbauer
+Copyright (c) 2025 CompVis - Computer Vision and Learning LMU Munich
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -18,26 +18,72 @@
 </p>
 
 
-<p align="center">
-   <a href="https://compvis.github.io/SCFlow/"><img src="docs/static/figures/badge-website.svg" alt="Website"></a>
-   <a href="https://arxiv.org/abs/2508.03402"><img src="https://img.shields.io/badge/arXiv-PDF-b31b1b" alt="Paper"></a>
-</p>
+<a href="https://compvis.github.io/SCFlow/"><img src="docs/static/figures/badge-website.svg" alt="Website"></a>
+<a href="https://arxiv.org/abs/2508.03402"><img src="https://img.shields.io/badge/arXiv-PDF-b31b1b" alt="Paper"></a>
+<a href="https://huggingface.co/CompVis/SCFlow"><img src="https://img.shields.io/badge/HuggingFace-Weights-orange" alt="Weights"></a>
+
+This repository contains the official implementation of the paper "SCFlow: Implicitly Learning Style and Content Disentanglement with Flow Models".
+We propose a flow-matching framework that learns an invertible mapping between style-content mixtures and their separate representations, avoiding explicit disentanglement objectives. Together with the method, we curated a synthetic dataset of 510k samples, consisting of 10k content instances and 51 distinct styles.
+
 
 <p align="center">
-   <img src="docs/static/images/teaser.jpg" alt="Cover" width="75%">
+   <img src="docs/static/images/teaser.jpg" alt="Cover" width="80%">
 </p>
 
 
-<!-- 
+
+## 🛠️ Setup
+Create the environment with conda:
+```bash
+conda create -n scflow python=3.10
+conda activate scflow
+pip install -r requirements.txt
+```
+The environment was tested on `Ubuntu 22.04.5 LTS` with `CUDA 12.1`. You can *optionally* install jupyter-notebook to run the notebook provided in [`notebooks`](https://github.com/CompVis/SCFlow/tree/main/notebooks).
+
+Download the model checkpoints:
+```bash
+mkdir ckpts
+cd ckpts
+
+# model checkpoint
+wget -O scflow_last.ckpt "https://huggingface.co/CompVis/SCFlow/resolve/main/scflow_last.ckpt?download=true"
+
+# unclip checkpoint for visualization
+wget -O sd21-unclip-l.ckpt "https://huggingface.co/CompVis/SCFlow/resolve/main/sd21-unclip-l.ckpt?download=true"
+```
+## 🔥 Usage
+Inference forward (merge content and style)
+```bash
+bash scripts/inference_forward.sh
+```
+Inference reverse (disentangle content and style from a given reference)
+```bash
+bash scripts/inference_reverse.sh
+```
+
+Training (coming soon)
+```bash
+bash ...
+```
+
+## 🗂️ Dataset
+Coming soon
 
 ## 🎓 Citation
 
+
 If you use this codebase or otherwise find our work valuable, please cite our paper:
 ```bibtex
-TBD
-``` -->
+@article{ma2025scflow,
+  title={SCFlow: Implicitly Learning Style and Content Disentanglement with Flow Models},
+  author={Ma, Pingchuan and Yang, Xiaopei and Li, Yusong and Gui, Ming and Krause, Felix and Schusterbauer, Johannes and Ommer, Bj{\"o}rn},
+  journal={arXiv preprint arXiv:2508.03402},
+  year={2025}
+}
+```
 
 ## 🔥 Updates and Backlogs
 - [x] **[06.08.2025]** [ArXiv](https://arxiv.org/abs/2508.03402) paper available.
-- [ ] Release Inference code and ckpt
-- [ ] Host the dataset and training code
+- [x] **[12.08.2025]** Release Inference code and ckpt
+- [ ] Host the dataset and training code
@@ -0,0 +1,38 @@
+model:  # FlowMatching wrapper (fm.target) around a PriorTransformer network (net_cfg.target)
+  scale_factor: 0.6304
+  fm:
+    target: scflow.cfm.FlowMatching
+    params:
+      sigma_min: 1.0e-08
+      net_cfg:
+        target: scflow.models.kakaomodels.prior.PriorTransformer
+        params:
+          xf_width: 2048
+          xf_layers: 12
+          xf_heads: 32
+          xf_final_ln: true
+          clip_dim: 1536  # same value as adm_in_channels in v2-1-stable-unclip-l-inference.yaml
+
+train:  # NOTE(review): training hyperparameters inside an inference config; confirm they are still read
+  lr: 1.0e-05
+  weight_decay: 0.0
+  lr_scheduler_patience: 20
+  cal_metrics: true
+  ema_rate: 0.999
+  ema_update_every: 1
+  ema_update_after_step: 1000
+  use_ema_for_sampling: true
+  checkpoint_callback_params:
+    every_n_train_steps: 800000
+    save_top_k: -1
+    verbose: false
+    save_last: false
+    auto_insert_metric_name: false
+  trainer_params:
+    max_epochs: 40
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
+    log_every_n_steps: 50
+    limit_val_batches: 64
+    val_check_interval: 20000
+    precision: 16
@@ -0,0 +1,83 @@
+model:  # image-embedding-conditioned latent diffusion model (see target below)
+  base_learning_rate: 1.0e-04
+  target: scflow.ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+  params:
+    embedding_dropout: 0.25
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 96
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn-adm
+    scale_factor: 0.18215
+    monitor: val/loss_simple_ema
+    use_ema: false
+
+    embedder_config:
+      target: scflow.ldm.modules.encoders.modules.ClipImageEmbedder
+      params:
+        model: "ViT-L/14"
+
+    noise_aug_config:
+      target: scflow.ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+      params:
+        clip_stats_path: "configs/ViT-L-14_stats.th"  # relative path; presumably resolved from the repo root, confirm
+        timestep_dim: 768
+        noise_schedule_config:
+          timesteps: 1000
+          beta_schedule: squaredcos_cap_v2
+
+    unet_config:
+      target: scflow.ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        num_classes: "sequential"
+        adm_in_channels: 1536  # same value as clip_dim in configs/inference.yaml
+        use_checkpoint: true
+        image_size: 32  # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_spatial_transformer: true
+        use_linear_in_transformer: true
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: false
+
+    first_stage_config:
+      target: scflow.ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: "vanilla-xformers"  # NOTE(review): presumably requires xformers to be installed; confirm
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+            - 1
+            - 2
+            - 4
+            - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: scflow.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: "penultimate"