v0.3.4

Released by github-actions · 19 Dec 02:09 · commit e91b702

Run HunyuanVideo🚀 with Parallel Inference

NOTE: To run HunyuanVideo, you need to install diffusers from its latest master branch.
GPUs with 80GB of memory are recommended; with less memory you may hit OOM errors,
and performance can degrade due to frequent memory re-allocation.
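
For reference, installing diffusers from the master branch can be done directly with pip (a minimal sketch, assuming git is available):

pip install git+https://github.com/huggingface/diffusers.git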

import torch
import torch.distributed as dist
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# Work around a cuDNN attention bug that otherwise raises:
# RuntimeError: Expected mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good() to be true, but got false.
torch.backends.cuda.enable_cudnn_sdp(False)

# One process per GPU, launched with torchrun
dist.init_process_group()

model_id = "tencent/HunyuanVideo"
# Load the transformer in bfloat16; the revision pins the Hub PR that
# provides the diffusers-format weights
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    revision="refs/pr/18",
)
# The rest of the pipeline runs in float16; each rank moves its copy to its own GPU
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    torch_dtype=torch.float16,
    revision="refs/pr/18",
).to(f"cuda:{dist.get_rank()}")

# Enable VAE tiling to make decoding runnable on GPUs with 48GB memory
pipe.vae.enable_tiling(
    tile_sample_min_height=128,
    tile_sample_stride_height=96,
    tile_sample_min_width=128,
    tile_sample_stride_width=96,
    tile_sample_min_num_frames=32,
    tile_sample_stride_num_frames=24,
)
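# Note: smaller tile sizes and strides lower peak memory during VAE decode at
# the cost of more tile passes; the values above are a compromise for 48GB GPUs.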

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from para_attn.parallel_vae.diffusers_adapters import parallelize_vae

# Build a device mesh over all launched ranks, run the transformer with
# context parallelism, and parallelize VAE decoding over the flattened mesh
mesh = init_context_parallel_mesh(
    pipe.device.type,
)
parallelize_pipe(
    pipe,
    mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())

# Workaround for OOM caused by inductor's lowering of attn_bias in
# _scaled_dot_product_efficient_attention: force dispatch to custom ops
# import para_attn
# para_attn.config.attention.force_dispatch_to_custom_ops = True

# Optional: overlap communication with computation and compile the transformer
# torch._inductor.config.reorder_for_compute_comm_overlap = True
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

output = pipe(
    prompt="A cat walks on the grass, realistic",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    # Only rank 0 saves the video, so only it needs decoded PIL frames
    output_type="pil" if dist.get_rank() == 0 else "pt",
).frames[0]

if dist.get_rank() == 0:
    print("Saving video to hunyuan_video.mp4")
    export_to_video(output, "hunyuan_video.mp4", fps=15)

dist.destroy_process_group()

Save the above code to run_hunyuan_video.py and run it with torchrun:

torchrun --nproc_per_node=2 run_hunyuan_video.py
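
With more GPUs available, increase --nproc_per_node accordingly; init_context_parallel_mesh builds the mesh over all launched ranks (a usage sketch, assuming the default mesh configuration):

torchrun --nproc_per_node=8 run_hunyuan_video.py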