"""
The original experimental code for this project can be found at:
https://gist.github.com/a-r-r-o-w/d070cce059ab4ceab3a9f289ff83c69c
This script first generates description prompts with a local large language model, then generates images
from those prompts with the black-forest-labs/FLUX.1-dev model, and finally turns the images into videos
with CogVideoX. The entire process uses open-source models, without the need for any API keys.
You can use the generate.sh file in the same folder to automate running this code for batch generation
of videos and images:
    bash generate.sh
"""
import argparse
import gc
import json
import os
import pathlib
import random
from transformers import AutoTokenizer
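# Verbose TorchDynamo logging (recompiles, graph breaks); mainly useful for debugging when --compile is passed.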
os.environ["TORCH_LOGS"] = "+dynamo,recompiles,graph_breaks"
os.environ["TORCHDYNAMO_VERBOSE"] = "1"
import numpy as np
import torch
import transformers
from diffusers import CogVideoXImageToVideoPipeline, CogVideoXDPMScheduler, DiffusionPipeline
from diffusers.utils.logging import get_logger
from diffusers.utils import export_to_video
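
# Trade a little float32 matmul precision for speed (enables TF32 matmuls on supported GPUs).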
torch.set_float32_matmul_precision("high")
logger = get_logger(__name__)
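
# System prompt that instructs the local LLM to return exactly one video-generation prompt per request.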
SYSTEM_PROMPT = """
You are part of a team of people that create videos using generative models. You use a video-generation model that can generate a video about anything you describe.
For example, if you respond with "A beautiful morning in the woods with the sun peeking through the trees", the video generation model will create a video exactly as described. Your task is to summarize the descriptions of videos provided by users, and create detailed prompts to feed into the generative model.
There are a few rules to follow:
- You will only ever output a single video description per request.
- If the user asks to summarize the prompt in [X] words, make sure not to exceed the limit.
Your responses should just be the video generation prompt. Here are examples:
- “A lone figure stands on a city rooftop at night, gazing up at the full moon. The moon glows brightly, casting a gentle light over the quiet cityscape. Below, the windows of countless homes shine with warm lights, creating a contrast between the bustling life below and the peaceful solitude above. The scene captures the essence of the Mid-Autumn Festival, where despite the distance, the figure feels connected to loved ones through the shared beauty of the moonlit sky.”
- "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting."
- "A street artist, clad in a worn-out denim jacket and a colorful banana, stands before a vast concrete wall in the heart, holding a can of spray paint, spray-painting a colorful bird on a mottled wall"
""".strip()
USER_PROMPT = """
Could you generate a prompt for a video generation model?
Please limit the prompt to [{0}] words.
""".strip()


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num_videos",
        type=int,
        default=5,
        help="Number of unique videos you would like to generate.",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="THUDM/CogVideoX-5b-I2V",
        help="Path or Hugging Face ID of the image-to-video CogVideoX model (e.g. THUDM/CogVideoX-5b-I2V).",
    )
    parser.add_argument(
        "--caption_generator_model_id",
        type=str,
        default="THUDM/glm-4-9b-chat",
        help="Caption generation model (default: THUDM/glm-4-9b-chat).",
    )
    parser.add_argument(
        "--caption_generator_cache_dir",
        type=str,
        default=None,
        help="Cache directory for the caption generation model.",
    )
    parser.add_argument(
        "--image_generator_model_id",
        type=str,
        default="black-forest-labs/FLUX.1-dev",
        help="Image generation model.",
    )
    parser.add_argument(
        "--image_generator_cache_dir",
        type=str,
        default=None,
        help="Cache directory for the image generation model.",
    )
    parser.add_argument(
        "--image_generator_num_inference_steps",
        type=int,
        default=50,
        help="Number of inference steps to use for image generation.",
    )
    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=7,
        help="Guidance scale to be used for video generation.",
    )
    parser.add_argument(
        "--use_dynamic_cfg",
        action="store_true",
        help="Whether or not to use cosine dynamic guidance for generation [Recommended].",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="outputs/",
        help="Location where generated images and videos should be stored.",
    )
    parser.add_argument(
        "--compile",
        action="store_true",
        help="Whether or not to compile the transformer of the image and video generators.",
    )
    parser.add_argument(
        "--enable_vae_tiling",
        action="store_true",
        help="Whether or not to use VAE tiling when encoding/decoding.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Seed for reproducibility.",
    )
    return parser.parse_args()
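

# Free cached GPU memory and reset memory statistics between pipeline stages.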
def reset_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()
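

# Inference only: run the caption -> image -> video pipeline without tracking gradients.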
@torch.no_grad()
def main(args: argparse.Namespace) -> None:
    output_dir = pathlib.Path(args.output_dir)
    os.makedirs(output_dir.as_posix(), exist_ok=True)
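
    # Seed Python, NumPy and PyTorch RNGs so captions, images and videos are reproducible.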
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    reset_memory()
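
    # Stage 1: generate one video description per requested video with a local LLM.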
    tokenizer = AutoTokenizer.from_pretrained(args.caption_generator_model_id, trust_remote_code=True)
    caption_generator = transformers.pipeline(
        "text-generation",
        model=args.caption_generator_model_id,
        device_map="auto",
        model_kwargs={
            "local_files_only": True,
            "cache_dir": args.caption_generator_cache_dir,
            "torch_dtype": torch.bfloat16,
        },
        trust_remote_code=True,
        tokenizer=tokenizer,
    )
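
    # Collect one caption per video: ask the LLM for a prompt with a randomly chosen target word count.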
    captions = []
    for i in range(args.num_videos):
        num_words = random.choice([50, 75, 100])
        user_prompt = USER_PROMPT.format(num_words)

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]

        outputs = caption_generator(messages, max_new_tokens=226)
        caption = outputs[0]["generated_text"][-1]["content"]
        if caption.startswith("\"") and caption.endswith("\""):
            caption = caption[1:-1]
        captions.append(caption)
        logger.info(f"Generated caption: {caption}")

    with open(output_dir / "captions.json", "w") as file:
        json.dump(captions, file)
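
    # Free the LLM before loading the FLUX image generator to keep GPU memory in check.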
    del caption_generator
    reset_memory()
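
    # Stage 2: generate a first-frame image for each caption with the FLUX text-to-image model.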
    image_generator = DiffusionPipeline.from_pretrained(
        args.image_generator_model_id,
        cache_dir=args.image_generator_cache_dir,
        torch_dtype=torch.bfloat16,
    )
    image_generator.to("cuda")

    if args.compile:
        image_generator.transformer = torch.compile(image_generator.transformer, mode="max-autotune", fullgraph=True)

    if args.enable_vae_tiling:
        image_generator.vae.enable_tiling()
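
    # Each caption is rendered as a 720x480 image that will seed the image-to-video stage.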
    images = []
    for index, caption in enumerate(captions):
        image = image_generator(
            prompt=caption,
            height=480,
            width=720,
            num_inference_steps=args.image_generator_num_inference_steps,
            guidance_scale=3.5,
        ).images[0]

        filename = caption[:25].replace(".", "_").replace("'", "_").replace('"', "_").replace(",", "_")
        image.save(output_dir / f"{index}_{filename}.png")
        images.append(image)

    del image_generator
    reset_memory()
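
    # Stage 3: animate each image into a 49-frame video with the CogVideoX image-to-video pipeline.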
    video_generator = CogVideoXImageToVideoPipeline.from_pretrained(
        args.model_path, torch_dtype=torch.bfloat16
    ).to("cuda")
    video_generator.scheduler = CogVideoXDPMScheduler.from_config(
        video_generator.scheduler.config, timestep_spacing="trailing"
    )

    if args.compile:
        video_generator.transformer = torch.compile(video_generator.transformer, mode="max-autotune", fullgraph=True)

    if args.enable_vae_tiling:
        video_generator.vae.enable_tiling()
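
    # Use a dedicated, seeded generator so video sampling is reproducible across runs.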
    generator = torch.Generator().manual_seed(args.seed)
    for index, (caption, image) in enumerate(zip(captions, images)):
        video = video_generator(
            image=image,
            prompt=caption,
            height=480,
            width=720,
            num_frames=49,
            num_inference_steps=50,
            guidance_scale=args.guidance_scale,
            use_dynamic_cfg=args.use_dynamic_cfg,
            generator=generator,
        ).frames[0]

        filename = caption[:25].replace(".", "_").replace("'", "_").replace('"', "_").replace(",", "_")
        export_to_video(video, output_dir / f"{index}_{filename}.mp4", fps=8)


if __name__ == "__main__":
    args = get_args()
    main(args)