How to generate images using Latent Diffusion Models
An introductory tutorial on using Latent Diffusion Models to generate images from text, i.e., images generated with artificial intelligence.
- Clone repos and install requirements
!git clone https://github.com/CompVis/latent-diffusion.git
!git clone https://github.com/CompVis/taming-transformers
!pip install -e ./taming-transformers
!pip install ipywidgets "omegaconf>=2.0.0" "pytorch-lightning>=1.0.8" torch-fidelity einops
import sys
sys.path.append(".")
sys.path.append('./taming-transformers')
from taming.models import vqgan  # quick check that the taming package imports correctly
- Define the Task (currently only superresolution is available, other tasks are coming soon)
%cd latent-diffusion
import ipywidgets as widgets
from IPython.display import display
mode = widgets.Select(options=['superresolution'],
value='superresolution', description='Task:')
display(mode)
- Download the model checkpoint (takes ~3 min) and load the model
from notebook_helpers import get_model
model = get_model(mode.value)
- Optional step: Upload your own conditioning image for superresolution (height and width must each be 128, 192, or 256 pixels)
from notebook_helpers import get_custom_cond
get_custom_cond(mode.value)
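If your own image does not already have one of the supported sizes, a minimal PIL sketch like the one below can resize it before uploading. The filename and target size are placeholders, not part of notebook_helpers.
from PIL import Image

# Placeholder filename; replace with your own image. Width and height must be 128, 192, or 256.
img = Image.open("my_photo.jpg").convert("RGB")
img = img.resize((256, 256), Image.LANCZOS)
img.save("my_photo_256.png")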
- Select conditioning from available examples or the uploaded custom conditioning
from notebook_helpers import get_cond_options, get_cond
dir, options = get_cond_options(mode.value)
cond_choice = widgets.RadioButtons(
    options=options,
    description='Select conditioning:',
    disabled=False
)
display(cond_choice)
- Run Model
from notebook_helpers import run
import os
custom_steps = 100
cond_choice_path = os.path.join(dir, cond_choice.value)
logs = run(model["model"], cond_choice_path, mode.value, custom_steps)
- Display Sample
import torch
import numpy as np
import IPython.display as d
from PIL import Image
sample = logs["sample"]
sample = sample.detach().cpu()
sample = torch.clamp(sample, -1., 1.)
sample = (sample + 1.) / 2. * 255
sample = sample.numpy().astype(np.uint8)
sample = np.transpose(sample, (0, 2, 3, 1))
print(sample.shape)
a = Image.fromarray(sample[0])
display(a)
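To keep the super-resolved result, the PIL image can simply be written to disk; the filename below is arbitrary.
# Save the sample shown above (filename is arbitrary).
a.save("superresolution_sample.png")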
- Generate class-conditional samples on ImageNet
Next, let's sample class-conditional ImageNet images with the cin256-v2 model. If you are starting from a fresh runtime, set up the repositories again:
!git clone https://github.com/CompVis/latent-diffusion.git
!git clone https://github.com/CompVis/taming-transformers
!pip install -e ./taming-transformers
!pip install "omegaconf>=2.0.0" "pytorch-lightning>=1.0.8" torch-fidelity einops
import sys
sys.path.append(".")
sys.path.append('./taming-transformers')
from taming.models import vqgan
Let's also check what type of GPU we've got.
!nvidia-smi
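Optionally, you can run the same check from Python to make sure PyTorch actually sees the GPU before loading the model.
import torch

# Confirm that CUDA is visible to PyTorch (optional sanity check).
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))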
Now, download the checkpoint (~1.7 GB). This will usually take 1-2 minutes.
%cd latent-diffusion/
!mkdir -p models/ldm/cin256-v2/
!wget -O models/ldm/cin256-v2/model.ckpt https://ommer-lab.com/files/latent-diffusion/nitro/cin/model.ckpt
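A quick, optional sanity check that the checkpoint landed where the config expects it:
import os

# The path matches the wget destination above.
ckpt_path = "models/ldm/cin256-v2/model.ckpt"
if os.path.exists(ckpt_path):
    print(f"Checkpoint found, {os.path.getsize(ckpt_path) / 1e9:.2f} GB")
else:
    print("Checkpoint missing - re-run the wget cell above")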
Now we can load the model:
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config
def load_model_from_config(config, ckpt):
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt)  # optionally pass map_location="cpu" to load on the CPU first
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    model.cuda()
    model.eval()
    return model

def get_model():
    config = OmegaConf.load("configs/latent-diffusion/cin256-v2.yaml")
    model = load_model_from_config(config, "models/ldm/cin256-v2/model.ckpt")
    return model
!pip install clip
!pip install kornia
from ldm.models.diffusion.ddim import DDIMSampler
model = get_model()
sampler = DDIMSampler(model)
Now let's sample. Quality, sampling speed, and diversity are best controlled via the scale, ddim_steps, and ddim_eta variables. As a rule of thumb, higher values of scale produce better samples at the cost of reduced output diversity. Increasing ddim_steps generally also gives higher-quality samples, but returns diminish for values > 250. Fast sampling (i.e., low values of ddim_steps) while retaining good quality can be achieved with ddim_eta = 0.0.
import numpy as np
from PIL import Image
from einops import rearrange
from torchvision.utils import make_grid
classes = [25, 187, 448, 992] # define classes to be sampled here
n_samples_per_class = 6
ddim_steps = 20
ddim_eta = 0.0
scale = 3.0 # for unconditional guidance
all_samples = list()
with torch.no_grad():
    with model.ema_scope():
        uc = model.get_learned_conditioning(
            {model.cond_stage_key: torch.tensor(n_samples_per_class * [1000]).to(model.device)}
        )

        for class_label in classes:
            print(f"rendering {n_samples_per_class} examples of class '{class_label}' in {ddim_steps} steps and using s={scale:.2f}.")
            xc = torch.tensor(n_samples_per_class * [class_label])
            c = model.get_learned_conditioning({model.cond_stage_key: xc.to(model.device)})

            samples_ddim, _ = sampler.sample(S=ddim_steps,
                                             conditioning=c,
                                             batch_size=n_samples_per_class,
                                             shape=[3, 64, 64],
                                             verbose=False,
                                             unconditional_guidance_scale=scale,
                                             unconditional_conditioning=uc,
                                             eta=ddim_eta)

            x_samples_ddim = model.decode_first_stage(samples_ddim)
            x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
            all_samples.append(x_samples_ddim)

# display as grid
grid = torch.stack(all_samples, 0)
grid = rearrange(grid, 'n b c h w -> (n b) c h w')
grid = make_grid(grid, nrow=n_samples_per_class)

# to image
grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
Image.fromarray(grid.astype(np.uint8))
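Following the rule of thumb above, you can re-run the sampling cell with different settings to trade speed for quality and diversity; the values below are only illustrative.
# Illustrative alternatives (re-run the sampling cell after changing these):
ddim_steps = 250   # more steps -> higher quality, slower (diminishing returns beyond ~250)
ddim_eta = 0.0     # deterministic DDIM sampling, good quality even at low step counts
scale = 5.0        # stronger guidance -> better samples, less diversity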
Let's repeat the sampling for a different set of ImageNet classes:
import numpy as np
from PIL import Image
from einops import rearrange
from torchvision.utils import make_grid
classes = [1, 3, 5, 7, 9, 33]  # define classes to be sampled here
n_samples_per_class = 6
ddim_steps = 20
ddim_eta = 0.0
scale = 3.0 # for unconditional guidance
all_samples = list()
with torch.no_grad():
    with model.ema_scope():
        uc = model.get_learned_conditioning(
            {model.cond_stage_key: torch.tensor(n_samples_per_class * [1000]).to(model.device)}
        )

        for class_label in classes:
            print(f"rendering {n_samples_per_class} examples of class '{class_label}' in {ddim_steps} steps and using s={scale:.2f}.")
            xc = torch.tensor(n_samples_per_class * [class_label])
            c = model.get_learned_conditioning({model.cond_stage_key: xc.to(model.device)})

            samples_ddim, _ = sampler.sample(S=ddim_steps,
                                             conditioning=c,
                                             batch_size=n_samples_per_class,
                                             shape=[3, 64, 64],
                                             verbose=False,
                                             unconditional_guidance_scale=scale,
                                             unconditional_conditioning=uc,
                                             eta=ddim_eta)

            x_samples_ddim = model.decode_first_stage(samples_ddim)
            x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
            all_samples.append(x_samples_ddim)

# display as grid
grid = torch.stack(all_samples, 0)
grid = rearrange(grid, 'n b c h w -> (n b) c h w')
grid = make_grid(grid, nrow=n_samples_per_class)

# to image
grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
Image.fromarray(grid.astype(np.uint8))
- Text-to-image with Stable Diffusion and diffusers
Finally, let's generate images from text prompts with Stable Diffusion via the Hugging Face diffusers library.
%pip install --quiet --upgrade diffusers transformers scipy mediapy
Pick a noise scheduler for sampling; the PNDM scheduler is used here, with DDIM and LMS alternatives left commented out:
from diffusers import PNDMScheduler, DDIMScheduler, LMSDiscreteScheduler
scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
# scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
# scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
Log in to Hugging Face so the pipeline can download the Stable Diffusion weights (you must have accepted the model license on the CompVis/stable-diffusion-v1-4 model page):
!huggingface-cli login
import mediapy as media
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda"
remove_safety = False
pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16, revision="fp16", use_auth_token=True)
if remove_safety:
    pipe.safety_checker = lambda images, clip_input: (images, False)
pipe = pipe.to(device)
prompt = "a photo of an astronaut riding a horse on mars"
num_images = 1
prompts = [ prompt ] * num_images
with autocast("cuda"):
images = pipe(prompts, guidance_scale=7.5, num_inference_steps=50)["sample"]
media.show_images(images)
images[0].save("output.jpg")
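For reproducible results, the pipeline call also accepts a torch.Generator; this is a small sketch and the seed value is arbitrary.
# Fix the random seed so repeated runs produce the same image (seed is arbitrary).
generator = torch.Generator(device="cuda").manual_seed(42)
with autocast("cuda"):
    images = pipe(prompts, guidance_scale=7.5, num_inference_steps=50, generator=generator)["sample"]
media.show_images(images)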
prompt = "a photo of an astronaut riding a horse on mars"
num_images = 3
prompts = [ prompt ] * num_images
with autocast("cuda"):
images = pipe(prompts, guidance_scale=7.5, num_inference_steps=50)["sample"]
media.show_images(images)
prompt = "Ella went to the supermarket to buy the ingredients to make a cake"
num_images = 1
prompts = [ prompt ] * num_images
with autocast("cuda"):
images = pipe(prompts, guidance_scale=7.5, num_inference_steps=50)["sample"]
media.show_images(images)
prompt = "Ella went to the supermarket to buy the ingredients to make a cake"
num_images = 3
prompts = [ prompt ] * num_images
with autocast("cuda"):
images = pipe(prompts, guidance_scale=7.5, num_inference_steps=50)["sample"]
media.show_images(images)
prompt = "Today is her birthday and her friends come to her house and help her to prepare the cake "
num_images = 3
prompts = [ prompt ] * num_images
with autocast("cuda"):
images = pipe(prompts, guidance_scale=7.5, num_inference_steps=50)["sample"]
media.show_images(images)