Commit 714556d

Add Stable Diffusion UNet model (ckpt loadable) and a toy model trained on CelebA,
plus extra experiments that you can play with.
1 parent 8debabf commit 714556d
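"Ckpt loadable" refers to copying pretrained weights from the diffusers pipeline into the re-implemented UNet via load_pipe_into_our_UNet, which the toy script imports from StableDiff_UNet_model.py. The exact helper signature is not shown in this diff, so the sketch below is a rough, assumed usage (pipeline loading mirrors StableDiffusion_exps.py):

import torch
from diffusers import StableDiffusionPipeline
from StableDiff_UNet_model import UNet_SD, load_pipe_into_our_UNet

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", use_auth_token=True).to("cuda")
myunet = UNet_SD()                      # default config mirrors the SD v1 UNet
load_pipe_into_our_UNet(myunet, pipe)   # assumed call form: (our_unet, diffusers_pipe)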

3 files changed

Lines changed: 267 additions & 31 deletions


StableDiff_UNet_model.py

Lines changed: 22 additions & 15 deletions
@@ -12,30 +12,38 @@


 class UNet_SD(nn.Module):
-    def __init__(self, cat_unet=True):
+
+    def __init__(self, in_channels=4,
+                 base_channels=320,
+                 time_emb_dim=1280,
+                 context_dim=768,
+                 multipliers=(1, 2, 4, 4),
+                 attn_levels=(0, 1, 2),
+                 nResAttn_block=2,
+                 cat_unet=True):
         super().__init__()
-        self.in_channels = 4
-        self.out_channels = 4
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        base_channels = 320
-        time_proj_dim = 320
-        time_emb_dim = 1280
-        context_dim = 768
-        nlevel = 4
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        base_channels = base_channels
+        time_emb_dim = time_emb_dim
+        context_dim = context_dim
+        multipliers = multipliers
+        nlevel = len(multipliers)
         self.base_channels = base_channels
-        attn_levels = [0, 1, 2]
-        level_channels = [base_channels * mult for mult in [1, 2, 4, 4]]
+        # attn_levels = [0, 1, 2]
+        level_channels = [base_channels * mult for mult in multipliers]
         # Transform time into embedding
         self.time_embedding = nn.Sequential(OrderedDict({
-            "linear_1": nn.Linear(time_proj_dim, time_emb_dim, bias=True),
+            "linear_1": nn.Linear(base_channels, time_emb_dim, bias=True),
             "act": nn.SiLU(),
             "linear_2": nn.Linear(time_emb_dim, time_emb_dim, bias=True),
             })
         )  # 2 layer MLP
         self.conv_in = nn.Conv2d(self.in_channels, base_channels, 3, stride=1, padding=1)

         # Tensor Downsample blocks
-        nResAttn_block = 2
+        nResAttn_block = nResAttn_block
         self.down_blocks = TimeModulatedSequential()  # nn.ModuleList()
         self.down_blocks_channels = [base_channels]
         cur_chan = base_channels
@@ -81,14 +89,13 @@ def __init__(self, cat_unet=True):
             nn.Conv2d(base_channels, self.out_channels, 3, padding=1),
         )
         self.to(self.device)
-
     def time_proj(self, time_steps, max_period: int = 10000):
         if time_steps.ndim == 0:
             time_steps = time_steps.unsqueeze(0)
         half = self.base_channels // 2
         frequencies = torch.exp(- math.log(max_period)
-            * torch.arange(start=0, end=half, dtype=torch.float32) / half
-            ).to(device=time_steps.device)
+                                * torch.arange(start=0, end=half, dtype=torch.float32) / half
+                                ).to(device=time_steps.device)
         angles = time_steps[:, None].float() * frequencies[None, :]
         return torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

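With the constructor made configurable, the defaults reproduce the Stable Diffusion v1 UNet layout (4 latent channels, 320 base channels, 768-dim text context), while smaller configs can be built for toy experiments as in the CelebA script below. A hedged usage sketch: the forward keywords (time_steps, cond, output_dict) are taken from the toy script, and the 77-token context length here is just an illustrative stand-in for CLIP text embeddings:

import torch
from StableDiff_UNet_model import UNet_SD

unet = UNet_SD()  # defaults: in_channels=4, base_channels=320, context_dim=768
latents = torch.randn(1, 4, 64, 64).to(unet.device)
t = torch.rand(1).to(unet.device)
text_ctx = torch.randn(1, 77, 768).to(unet.device)  # stand-in for CLIP text embeddings
out = unet(latents, time_steps=t, cond=text_ctx, output_dict=False)
print(out.shape)  # expected to match the latent shape, (1, 4, 64, 64)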

StableDiff_toy_celebA.py

Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
#%%
import torch
import functools
from tqdm import tqdm, trange
import torch.multiprocessing
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
torch.multiprocessing.set_sharing_strategy('file_system')
#%%
from torch.utils.data import DataLoader, TensorDataset
from torchvision.datasets import CelebA
from torchvision.transforms import ToTensor, CenterCrop, Resize, Compose, Normalize


tfm = Compose([
    Resize(32),
    CenterCrop(32),
    ToTensor(),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
dataset_rsz = CelebA("/home/binxuwang/Datasets", target_type=["attr"],
                     transform=tfm, download=False)  # ,"identity"
#%%
dataloader = DataLoader(dataset_rsz, batch_size=64, num_workers=8, shuffle=False)
x_col = []
y_col = []
for xs, ys in tqdm(dataloader):
    x_col.append(xs)
    y_col.append(ys)
x_col = torch.concat(x_col, dim=0)
y_col = torch.concat(y_col, dim=0)
print(x_col.shape)
print(y_col.shape)

nantoken = 40
maxlen = (y_col.sum(dim=1)).max()
yseq_data = torch.ones(y_col.size(0), maxlen, dtype=int).fill_(nantoken)

saved_dataset = TensorDataset(x_col, yseq_data)
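The attribute labels are stored as fixed-length index sequences padded with nantoken = 40, matching the nn.Embedding(40 + 1, 256, padding_idx=40) conditioning table defined further down. The committed code only allocates the padded buffer; one plausible way to populate it (my assumption, not part of this commit) would be:

# Hypothetical encoding (not in this commit): write the indices of the active
# CelebA attributes into each row, leaving nantoken (40) as padding.
for i, attr in enumerate(y_col):
    idx = torch.nonzero(attr).flatten()
    yseq_data[i, :len(idx)] = idx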
#%%
import math
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiplicativeLR, LambdaLR
device = 'cuda'

def marginal_prob_std(t, sigma):
    t = torch.tensor(t, device=device)
    return torch.sqrt((sigma ** (2 * t) - 1.) / 2. / math.log(sigma))


def diffusion_coeff(t, sigma):
    return torch.tensor(sigma ** t, device=device)


sigma = 25.0  # @param {'type':'number'}
marginal_prob_std_fn = functools.partial(marginal_prob_std, sigma=sigma)
diffusion_coeff_fn = functools.partial(diffusion_coeff, sigma=sigma)
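These two helpers implement the variance-exploding SDE dx = sigma^t dw from the score-SDE tutorial this recipe follows: the perturbation kernel at time t is Gaussian with standard deviation std(t) = sqrt((sigma^(2t) - 1) / (2 ln sigma)). A throwaway sanity check of the closed form against numerical integration of the variance (not part of the commit; assumes a CUDA device, as the rest of the script does):

ts = torch.linspace(1e-3, 1.0, 5)
for t in ts:
    # Var[x_t] = integral_0^t sigma^(2s) ds, computed numerically on a fine grid
    s = torch.linspace(0, float(t), 10001)
    var_num = torch.trapz(sigma ** (2 * s), s)
    print(float(t), float(var_num.sqrt()), float(marginal_prob_std(t, sigma).cpu()))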
#%
#@title Training Loss function
def loss_fn_cond(model, x, y, marginal_prob_std, eps=1e-5):
    """The loss function for training score-based generative models.

    Args:
        model: A PyTorch model instance that represents a
            time-dependent score-based model.
        x: A mini-batch of training data.
        marginal_prob_std: A function that gives the standard deviation of
            the perturbation kernel.
        eps: A tolerance value for numerical stability.
    """
    random_t = torch.rand(x.shape[0], device=x.device) * (1. - eps) + eps
    z = torch.randn_like(x)
    std = marginal_prob_std(random_t)
    perturbed_x = x + z * std[:, None, None, None]
    score = model(perturbed_x, random_t, cond=y, output_dict=False)
    loss = torch.mean(torch.sum((score * std[:, None, None, None] + z)**2, dim=(1,2,3)))
    return loss
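This is the standard denoising score-matching objective: the model is asked to predict -z/std(t) from the perturbed input, with the conditioning embedding passed through the cond keyword (the y argument, which the docstring omits). A minimal smoke test with a hypothetical dummy score function, assuming a CUDA device as elsewhere in the script:

# Dummy "model" that returns a zero score and ignores its inputs.
dummy_model = lambda x, t, cond=None, output_dict=False: torch.zeros_like(x)
x_dummy = torch.randn(8, 3, 32, 32, device=device)
y_dummy = torch.randn(8, 20, 256, device=device)
# With a zero score the loss reduces to E[sum z^2] = 3 * 32 * 32, i.e. about 3072.
print(loss_fn_cond(dummy_model, x_dummy, y_dummy, marginal_prob_std_fn))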

#%
def train_score_model(score_model, cond_embed, dataset, lr, n_epochs, batch_size, ckpt_name,
                      marginal_prob_std_fn=marginal_prob_std_fn,
                      lr_scheduler_fn=lambda epoch: max(0.2, 0.98 ** epoch),
                      device="cuda",
                      callback=None):  # resume=False,
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    optimizer = Adam([*score_model.parameters(), *cond_embed.parameters()], lr=lr)
    scheduler = LambdaLR(optimizer, lr_lambda=lr_scheduler_fn)
    tqdm_epoch = trange(n_epochs)
    for epoch in tqdm_epoch:
        score_model.train()
        avg_loss = 0.
        num_items = 0
        batch_tqdm = tqdm(data_loader)
        for x, y in batch_tqdm:
            x = x.to(device)
            y_emb = cond_embed(y.to(device))
            loss = loss_fn_cond(score_model, x, y_emb, marginal_prob_std_fn)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() * x.shape[0]
            num_items += x.shape[0]
            batch_tqdm.set_description("Epoch %d, loss %.4f" % (epoch, avg_loss / num_items))
        scheduler.step()
        lr_current = scheduler.get_last_lr()[0]
        print('{} Average Loss: {:5f} lr {:.1e}'.format(epoch, avg_loss / num_items, lr_current))
        # Print the averaged training loss so far.
        tqdm_epoch.set_description('Average Loss: {:5f}'.format(avg_loss / num_items))
        # Update the checkpoint after each epoch of training.
        torch.save(score_model.state_dict(), f'/home/binxuwang/DL_Projects/SDfromScratch/ckpt_{ckpt_name}.pth')
        torch.save(cond_embed.state_dict(),
                   f'/home/binxuwang/DL_Projects/SDfromScratch/ckpt_{ckpt_name}_cond_embed.pth')
        if callback is not None:
            score_model.eval()
            callback(score_model, epoch, ckpt_name)
#%%
def Euler_Maruyama_sampler(score_model,
                           marginal_prob_std,
                           diffusion_coeff,
                           batch_size=64,
                           x_shape=(1, 28, 28),
                           num_steps=500,
                           device='cuda',
                           eps=1e-3,
                           y=None):
    """Generate samples from score-based models with the Euler-Maruyama solver.

    Args:
        score_model: A PyTorch model that represents the time-dependent score-based model.
        marginal_prob_std: A function that gives the standard deviation of
            the perturbation kernel.
        diffusion_coeff: A function that gives the diffusion coefficient of the SDE.
        batch_size: The number of samples to generate by calling this function once.
        num_steps: The number of sampling steps.
            Equivalent to the number of discretized time steps.
        device: 'cuda' for running on GPUs, and 'cpu' for running on CPUs.
        eps: The smallest time step for numerical stability.

    Returns:
        Samples.
    """
    t = torch.ones(batch_size, device=device)
    init_x = torch.randn(batch_size, *x_shape, device=device) \
        * marginal_prob_std(t)[:, None, None, None]
    time_steps = torch.linspace(1., eps, num_steps, device=device)
    step_size = time_steps[0] - time_steps[1]
    x = init_x
    with torch.no_grad():
        for time_step in tqdm(time_steps):
            batch_time_step = torch.ones(batch_size, device=device) * time_step
            g = diffusion_coeff(batch_time_step)
            mean_x = x + (g ** 2)[:, None, None, None] * score_model(x, batch_time_step, cond=y, output_dict=False) * step_size
            x = mean_x + torch.sqrt(step_size) * g[:, None, None, None] * torch.randn_like(x)
    # Do not include any noise in the last sampling step.
    return mean_x
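The loop above is the Euler-Maruyama discretization of the reverse-time VE SDE, stepping t from 1 down to eps with

    x_{k+1} = x_k + g(t_k)^2 * score(x_k, t_k, y) * dt + g(t_k) * sqrt(dt) * z_k,   z_k ~ N(0, I),

and the function returns mean_x, i.e. the final update without the noise term.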
#%%
import matplotlib.pyplot as plt
from torchvision.utils import make_grid
def save_sample_callback(score_model, epocs, ckpt_name):
    sample_batch_size = 64
    num_steps = 250
    y_samp = yseq_data[:sample_batch_size, :]
    y_emb = cond_embed(y_samp.cuda())
    sampler = Euler_Maruyama_sampler
    samples = sampler(score_model,
                      marginal_prob_std_fn,
                      diffusion_coeff_fn,
                      sample_batch_size,
                      x_shape=(3, 32, 32),
                      num_steps=num_steps,
                      device=device,
                      y=y_emb, )
    denormalize = Normalize([-0.485/0.229, -0.456/0.224, -0.406/0.225],
                            [1/0.229, 1/0.224, 1/0.225])
    samples = denormalize(samples).clamp(0.0, 1.0)
    sample_grid = make_grid(samples, nrow=int(math.sqrt(sample_batch_size)))

    plt.figure(figsize=(8, 8))
    plt.axis('off')
    plt.imshow(sample_grid.permute(1, 2, 0).cpu(), vmin=0., vmax=1.)
    plt.tight_layout()
    plt.savefig(f"/home/binxuwang/DL_Projects/SDfromScratch/samples_{ckpt_name}_{epocs}.png")
    plt.show()
#%%
from StableDiff_UNet_model import UNet_SD, load_pipe_into_our_UNet
#%% UNet without latent space no VAE
unet_face = UNet_SD(in_channels=3,
                    base_channels=128,
                    time_emb_dim=256,
                    context_dim=256,
                    multipliers=(1, 1, 2),
                    attn_levels=(1, 2, ),
                    nResAttn_block=1,
                    )
cond_embed = nn.Embedding(40 + 1, 256, padding_idx=40).cuda()
#%%
torch.save(unet_face.state_dict(), "/home/binxuwang/DL_Projects/SDfromScratch/SD_unet_face.pt",)
#%%
unet_face(torch.randn(1, 3, 64, 64).cuda(), time_steps=torch.rand(1).cuda(),
          cond=torch.randn(1, 20, 256).cuda(),
          output_dict=False)
#%%
#%%
train_score_model(unet_face, cond_embed, saved_dataset,
                  lr=1.5e-4, n_epochs=100, batch_size=256,
                  ckpt_name="unet_SD_face", device=device,
                  callback=save_sample_callback)

#%%


save_sample_callback(unet_face, 0, "unet_SD_face")
#%%
torch.save(cond_embed.state_dict(), f'/home/binxuwang/DL_Projects/SDfromScratch/ckpt_{"unet_SD_face"}_cond_embed.pth')
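After (or during) training, the saved weights can be restored the usual way; a brief sketch using the paths written by train_score_model above:

# Reload the toy checkpoints saved during training (paths as in train_score_model).
unet_face.load_state_dict(torch.load('/home/binxuwang/DL_Projects/SDfromScratch/ckpt_unet_SD_face.pth'))
cond_embed.load_state_dict(torch.load('/home/binxuwang/DL_Projects/SDfromScratch/ckpt_unet_SD_face_cond_embed.pth'))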

StableDiffusion_exps.py

Lines changed: 30 additions & 16 deletions
@@ -14,11 +14,25 @@ def plt_show_image(image):
     plt.show()


+def recursive_print(module, prefix="", depth=0, deepest=3):
+    """Simulating print(module) for torch.nn.Modules
+    but with depth control. Print to the `deepest` level. `deepest=0` means no print
+    """
+    if depth >= deepest:
+        return
+    for name, child in module.named_children():
+        if len([*child.named_children()]) == 0:
+            print(f"{prefix}({name}): {child}")
+        else:
+            print(f"{prefix}({name}): {type(child).__name__}")
+            recursive_print(child, prefix + " ", depth + 1, deepest)
+
+#%%
+
 pipe = StableDiffusionPipeline.from_pretrained(
     "CompVis/stable-diffusion-v1-4",
     use_auth_token=True
 ).to("cuda")
-#%%
 def dummy_checker(images, **kwargs): return images, False
 pipe.safety_checker = dummy_checker
 #%% Text to
@@ -40,36 +54,36 @@ def dummy_checker(images, **kwargs): return images, False


 #%% Saving images during diffusion process using callback
+
+latents_reservoir = []
 @torch.no_grad()
 def plot_show_callback(i, t, latents):
+    latents_reservoir.append(latents.detach().cpu())
     latents = 1 / 0.18215 * latents
     image = pipe.vae.decode(latents).sample
     image = (image / 2 + 0.5).clamp(0, 1)
     image = image.cpu().permute(0, 2, 3, 1).float().numpy()
     plt_show_image(image[0])
     plt.imsave(f"/home/binxuwang/DL_Projects/SDfromScratch/diffproc/sample_{i:02d}.png", image[0])

+latents_reservoir = []
+@torch.no_grad()
+def save_latents(i, t, latents):
+    latents_reservoir.append(latents.detach().cpu())
 #%%
 # prompt = "A ballerina dancing on a high ground in the starry night"
-prompt = "A cute cat running on the grass in the style of Monet"
+# prompt = "A cute cat running on the grass in the style of Monet"
+prompt = "A ballerina chasing her cat running on the grass in the style of Monet"
+prompt = "A kitty cat dressed like Lincoln, old timey style"
 with autocast("cuda"):
-    image = pipe(prompt, callback=plot_show_callback)["sample"][0]
+    image = pipe(prompt, callback=None)["sample"][0]  # plot_show_callback

-image.save("cat_Monet.png")
+image.save("cat_Lincoln.png")
 plt_show_image(image)
+#%%
+len(latents_reservoir)
+plt_show_image(latents_reservoir[-10][0, [0, 1, 2,], :].permute(1, 2, 0).cpu().numpy() / 1.6 + 0.4)
 #%% Visualize architecture
-def recursive_print(module, prefix="", depth=0, deepest=3):
-    """Simulating print(module) for torch.nn.Modules
-    but with depth control. Print to the `deepest` level. `deepest=0` means no print
-    """
-    if depth >= deepest:
-        return
-    for name, child in module.named_children():
-        if len([*child.named_children()]) == 0:
-            print(f"{prefix}({name}): {child}")
-        else:
-            print(f"{prefix}({name}): {type(child).__name__}")
-            recursive_print(child, prefix + " ", depth + 1, deepest)


 #%% Full unets
 recursive_print(pipe.unet, deepest=3)
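Since plot_show_callback and save_latents stash every intermediate latent in latents_reservoir, any step of the diffusion trajectory can be decoded through the VAE afterwards, using the same 1/0.18215 scaling shown in plot_show_callback. A small sketch, assuming latents_reservoir was populated by a run with callback=save_latents:

@torch.no_grad()
def decode_latent(latent_step):
    # Decode one stored latent (1, 4, 64, 64 for the default 512x512 setup) back to an image.
    latents = (1 / 0.18215) * latent_step.to("cuda")
    image = pipe.vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    return image.cpu().permute(0, 2, 3, 1).float().numpy()[0]

plt_show_image(decode_latent(latents_reservoir[len(latents_reservoir) // 2]))  # mid-trajectory sample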
