inference_layout.py

import argparse
import ast
from PIL import Image
from omegaconf import OmegaConf
from ldm.models.diffusion.plms import PLMSSampler
import os 
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
import torch 
from ldm.util import instantiate_from_config
import numpy as np
import random
from functools import partial
import torchvision.transforms.functional as TF
from diffusers import UNet2DConditionModel
from utils.attentionmap import AttentionStore, AttentionControl, register_attention_control
from get_layout import get_layout

device = "cuda"

def set_alpha_scale(model, alpha_scale):
    from ldm.modules.attention import GatedCrossAttentionDense, GatedSelfAttentionDense
    for module in model.modules():
        if type(module) == GatedCrossAttentionDense or type(module) == GatedSelfAttentionDense:
            module.scale = alpha_scale


def batch_to_device(batch, device):
    for k in batch:
        if isinstance(batch[k], torch.Tensor):
            batch[k] = batch[k].to(device)
    return batch

def alpha_generator(length, type=None):
    """
    length is total timestpes needed for sampling. 
    type should be a list containing three values which sum should be 1
    
    It means the percentage of three stages: 
    alpha=1 stage 
    linear deacy stage 
    alpha=0 stage. 
    
    For example if length=100, type=[0.8,0.1,0.1]
    then the first 800 stpes, alpha will be 1, and then linearly decay to 0 in the next 100 steps,
    and the last 100 stpes are 0.    
    """
    if type == None:
        type = [1,0,0]

    assert len(type)==3 
    assert type[0] + type[1] + type[2] == 1
    
    stage0_length = int(type[0]*length)
    stage1_length = int(type[1]*length)
    stage2_length = length - stage0_length - stage1_length
    
    if stage1_length != 0: 
        decay_alphas = np.arange(start=0, stop=1, step=1/stage1_length)[::-1]
        decay_alphas = list(decay_alphas)
    else:
        decay_alphas = []
        
    
    alphas = [1]*stage0_length + decay_alphas + [0]*stage2_length
    
    assert len(alphas) == length
    
    return alphas


def load_ckpt(ckpt_path):
    
    saved_ckpt = torch.load(ckpt_path)
    config = saved_ckpt["config_dict"]["_content"]

    model = instantiate_from_config(config['model']).to(device).eval()
    autoencoder = instantiate_from_config(config['autoencoder']).to(device).eval()
    text_encoder = instantiate_from_config(config['text_encoder']).to(device).eval()
    diffusion = instantiate_from_config(config['diffusion']).to(device)

    # donot need to load official_ckpt for self.model here, since we will load from our ckpt
    model.load_state_dict( saved_ckpt['model'] )
    autoencoder.load_state_dict( saved_ckpt["autoencoder"]  )
    text_encoder.load_state_dict( saved_ckpt["text_encoder"]  )
    diffusion.load_state_dict( saved_ckpt["diffusion"]  )

    return model, autoencoder, text_encoder, diffusion, config


def project(x, projection_matrix):
    """
    x (Batch*768) should be the penultimate feature of CLIP (before projection)
    projection_matrix (768*768) is the CLIP projection matrix, which should be weight.data of Linear layer 
    defined in CLIP (out_dim, in_dim), thus we need to apply transpose below.  
    this function will return the CLIP feature (without normalziation)
    """
    return x@torch.transpose(projection_matrix, 0, 1)


def get_clip_feature(model, processor, input, is_image=False):
    which_layer_text = 'before'
    which_layer_image = 'after_reproject'

    if is_image:
        if input == None:
            return None
        image = Image.open(input).convert("RGB")
        inputs = processor(images=[image],  return_tensors="pt", padding=True)
        inputs['pixel_values'] = inputs['pixel_values'].cuda() # we use our own preprocessing without center_crop 
        inputs['input_ids'] = torch.tensor([[0,1,2,3]]).cuda()  # placeholder
        outputs = model(**inputs)
        feature = outputs.image_embeds 
        if which_layer_image == 'after_reproject':
            feature = project( feature, torch.load('projection_matrix').cuda().T ).squeeze(0)
            feature = ( feature / feature.norm() )  * 28.7 
            feature = feature.unsqueeze(0)
    else:
        if input == None:
            return None
        inputs = processor(text=input,  return_tensors="pt", padding=True)
        inputs['input_ids'] = inputs['input_ids'].cuda()
        inputs['pixel_values'] = torch.ones(1,3,224,224).cuda() # placeholder 
        inputs['attention_mask'] = inputs['attention_mask'].cuda()
        outputs = model(**inputs)
        if which_layer_text == 'before':
            feature = outputs.text_model_output.pooler_output
    return feature


def complete_mask(has_mask, max_objs):
    mask = torch.ones(1,max_objs)
    if has_mask == None:
        return mask 

    if type(has_mask) == int or type(has_mask) == float:
        return mask * has_mask
    else:
        for idx, value in enumerate(has_mask):
            mask[0,idx] = value
        return mask


@torch.no_grad()
def prepare_batch(meta, batch=1, max_objs=30):
    phrases, images = meta.get("phrases"), meta.get("images")
    images = [None]*len(phrases) if images==None else images 
    phrases = [None]*len(images) if phrases==None else phrases 

    version = "openai/clip-vit-large-patch14"
    model = CLIPModel.from_pretrained(version).to(device)
    processor = CLIPProcessor.from_pretrained(version)

    boxes = torch.zeros(max_objs, 4)
    masks = torch.zeros(max_objs)
    text_masks = torch.zeros(max_objs)
    image_masks = torch.zeros(max_objs)
    text_embeddings = torch.zeros(max_objs, 768)
    image_embeddings = torch.zeros(max_objs, 768)
    
    text_features = []
    image_features = []
    for phrase, image in zip(phrases,images):
        text_features.append(  get_clip_feature(model, processor, phrase, is_image=False) )
        image_features.append( get_clip_feature(model, processor, image,  is_image=True) )
        # text_features.append(text_encoder.encode(phrase) )

    for idx, (box, text_feature, image_feature) in enumerate(zip( meta['boundingbox'], text_features, image_features)):
        boxes[idx] = torch.tensor(box)
        masks[idx] = 1
        if text_feature is not None:
            text_embeddings[idx] = text_feature
            text_masks[idx] = 1 
        if image_feature is not None:
            image_embeddings[idx] = image_feature
            image_masks[idx] = 1 

    out = {
        "boxes" : boxes.unsqueeze(0).repeat(batch,1,1),  #(batch, max_objs, 4)
        "masks" : masks.unsqueeze(0).repeat(batch,1),  
        "text_masks" : text_masks.unsqueeze(0).repeat(batch,1)*complete_mask( meta.get("text_mask"), max_objs ),
        "image_masks" : image_masks.unsqueeze(0).repeat(batch,1)*complete_mask( meta.get("image_mask"), max_objs ),
        "text_embeddings"  : text_embeddings.unsqueeze(0).repeat(batch,1,1),  #(batch, max_objs, 768)
        "image_embeddings" : image_embeddings.unsqueeze(0).repeat(batch,1,1)
    }

    return batch_to_device(out, device) 


def crop_and_resize(image):
    crop_size = min(image.size)
    image = TF.center_crop(image, crop_size)
    image = image.resize( (512, 512) )
    return image

def colorEncode(labelmap, colors):
    labelmap = labelmap.astype('int')
    labelmap_rgb = np.zeros((labelmap.shape[0], labelmap.shape[1], 3),
                            dtype=np.uint8)

    for label in np.unique(labelmap):
        if label < 0:
            continue
        labelmap_rgb += (labelmap == label)[:, :, np.newaxis] * \
            np.tile(colors[label],
                    (labelmap.shape[0], labelmap.shape[1], 1))

    return labelmap_rgb


def run(meta, config, starting_noise=None):

    # pretrained l2i model gligen
    layout_unet, autoencoder, text_encoder, diffusion, config = load_ckpt(meta["ckpt"])
    grounding_tokenizer_input = instantiate_from_config(config['grounding_tokenizer_input'])
    layout_unet.grounding_tokenizer_input = grounding_tokenizer_input
    grounding_downsampler_input = None
    if "grounding_downsampler_input" in config:
        grounding_downsampler_input = instantiate_from_config(config['grounding_downsampler_input'])

    config.update( vars(args) )
    config = OmegaConf.create(config)

    batch = prepare_batch(meta, config.batch_size)
    context = text_encoder.encode(  [meta["prompt"]]*config.batch_size  )
    uc = text_encoder.encode( config.batch_size*[""] )
    if args.negative_prompt is not None:
        uc = text_encoder.encode( config.batch_size*[args.negative_prompt] )
           
    # pretrained t2i model sd unet
    text_unet = UNet2DConditionModel.from_pretrained(
        config.pretrained_text2img_model,
        subfolder="unet",
        revision=config.revision,
    ).to('cuda')
    controller = AttentionStore()
    register_attention_control(text_unet, controller)

    alpha_generator_func = partial(alpha_generator, type=meta.get("alpha_type"))
    
    sampler = PLMSSampler(diffusion = diffusion,
                                layout_unet = layout_unet, 
                                text_unet = text_unet, 
                                controller = controller, 
                                scale_factor = config.scale_factor,
                                scale_range = config.scale_range,
                                t0 = config.t0,
                                alpha_generator_func = alpha_generator_func, 
                                set_alpha_scale = set_alpha_scale)
    steps = 50 

    # inpainting related
    inpainting_mask = z0 = None  # used for replacing known region in diffusion process
    inpainting_extra_input = None # used as model input 
                  
    grounding_input = grounding_tokenizer_input.prepare(batch)
    grounding_extra_input = None
    if grounding_downsampler_input != None:
        grounding_extra_input = grounding_downsampler_input.prepare(batch)

    input = dict(
                x = starting_noise,
                timesteps = None, 
                context = context, 
                grounding_input = grounding_input,
                inpainting_extra_input = inpainting_extra_input,
                grounding_extra_input = grounding_extra_input,
                boundingbox = meta['boundingbox'],
                prompt = meta['prompt'],
                token_location = meta['token_location'],
            )

    # start sampling
    shape = (config.batch_size, layout_unet.in_channels, layout_unet.image_size, layout_unet.image_size)

    samples_fake = sampler.sample(S=steps, shape=shape, input=input, uc=uc, guidance_scale=config.guidance_scale, mask=inpainting_mask, x0=z0)
    samples_fake = autoencoder.decode(samples_fake)

    # save 
    output_folder = os.path.join( args.folder,  meta["save_folder_name"])
    os.makedirs( output_folder, exist_ok=True)

    start = len( os.listdir(output_folder) )
    image_ids = list(range(start,start+config.batch_size))
    print(image_ids)
    for image_id, sample in zip(image_ids, samples_fake):
        img_name = str(int(image_id))+'.png'
        sample = torch.clamp(sample, min=-1, max=1) * 0.5 + 0.5
        sample = sample.detach().cpu().numpy().transpose(1,2,0) * 255 
        sample = Image.fromarray(sample.astype(np.uint8))
        sample.save(  os.path.join(output_folder, img_name)   )


if __name__ == "__main__":
    
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str,  default="generation_samples", help="root folder for output")
    parser.add_argument("--pretrained_text2img_model", type=str, default='Lykon/DreamShaper', required=False, help="Path to pretrained model or model identifier from huggingface.co/models.")
    parser.add_argument("--scale_factor", type=int, default=1000000, help="")
    parser.add_argument("--scale_range", type = tuple, default=(1.0, 0.5), help="")
    parser.add_argument("--seed", type=int, default=152644, help="random seed")
    parser.add_argument("--t0", type=int, default=None, help="when fuse text and layout")
    parser.add_argument("--style", type = str, default=None, help="style of t2i model")
    parser.add_argument("--revision", type=str, default=None, required=False, help="Revision of pretrained model identifier from huggingface.co/models.")
    parser.add_argument("--batch_size", type=int, default=1, help="")
    parser.add_argument("--no_plms", action='store_true', help="use DDIM instead. WARNING: I did not test the code yet")
    parser.add_argument("--guidance_scale", type=float,  default=7.5, help="")
    parser.add_argument("--negative_prompt", type=str,  default='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality', help="")
    parser.add_argument('--user_prompt', type=str,help='input user prompt')
    parser.add_argument('--api_key',default=None,type=str,help='the api key of GPT-4')
    parser.add_argument('--no_gpt',action='store_true', help="Don't use gpt to obtain the layout")
    parser.add_argument("--object", type = str, default=None, help="the set of objects mentioned in the prompt")
    parser.add_argument("--boundingbox", type = str, default=None, help="the set of bounding boxes for each object")
    parser.add_argument("--token_location", type = str, default=None, help="the set of locations where each object appears in the prompt")
    args = parser.parse_args()

    def seed_everything(seed):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
    seed_everything(args.seed)

    if args.style is not None:
        args.t0 = 0
        if args.style == 'coloring-pages':
            args.pretrained_text2img_model = 'stablediffusionapi/coloring-pages'
        elif args.style == 'cuteyukimix':
            args.pretrained_text2img_model = 'stablediffusionapi/cuteyukimix'

    if args.no_gpt:
        phrases, boundingbox, token_location = ast.literal_eval(args.object), ast.literal_eval(args.boundingbox), ast.literal_eval(args.token_location)
    else:
        phrases, boundingbox, token_location = get_layout(args.user_prompt, args.api_key)
        
    meta_list = [ 
        dict(
            ckpt = "you should download gligen/gligen-generation-text-box/diffusion_pytorch_model.bin and fill in the path here",
            prompt = args.user_prompt,
            phrases = phrases,
            boundingbox = boundingbox,
            token_location = token_location,
            alpha_type = [1.0, 0.0, 0.0],
            save_folder_name="generation_realcompo_v1_sd_gligen_" + args.user_prompt.replace(" ", "_")
        ),     
    ]


    starting_noise = torch.randn(args.batch_size, 4, 64, 64).to(device)
    starting_noise = None
    for meta in meta_list:
        run(meta, args, starting_noise)