Releases: chengzeyi/ParaAttention

Nightly Release 20250113

13 Jan 03:07
Pre-release

TODO: Add nightly release notes

v0.3.10

09 Jan 08:52
fix cache accuracy

v0.3.9 Fastest FLUX.1-dev Inference
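The same context-parallel pattern shown in the HunyuanVideo and CogVideoX examples below also applies to FLUX.1-dev. A minimal sketch, with the model ID, prompt, and sampling settings chosen only for illustration:

import torch
import torch.distributed as dist
from diffusers import FluxPipeline

dist.init_process_group()

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to(f"cuda:{dist.get_rank()}")

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe

# Shard the attention computation across all participating ranks.
parallelize_pipe(
    pipe,
    mesh=init_context_parallel_mesh(pipe.device.type),
)

image = pipe(
    "A cat holding a sign that says hello world",
    num_inference_steps=28,
    # Only rank 0 needs a decoded PIL image.
    output_type="pil" if dist.get_rank() == 0 else "pt",
).images[0]

if dist.get_rank() == 0:
    image.save("flux.png")

dist.destroy_process_group()

As with the other examples, save the script (e.g. as run_flux.py) and launch it with torchrun --nproc_per_node=2 run_flux.py.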

v0.3.8

03 Jan 09:02
Dev first block cache (#12)

* implement first block cache
* add doc
* make flux work
* refactor
* Update fastest_hunyuan_video.md
* miscellaneous fixes
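
The idea behind the first block cache is to reuse cached outputs of the later transformer blocks whenever the first block's output changes little between denoising steps. A minimal usage sketch, assuming the apply_cache_on_pipe adapter under para_attn.first_block_cache.diffusers_adapters; treat the entry point and the FLUX settings as illustrative rather than a guaranteed signature:

import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Assumed adapter entry point for enabling the first block cache on a pipeline.
from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe

apply_cache_on_pipe(pipe)

image = pipe(
    "A cat holding a sign that says hello world",
    num_inference_steps=28,
).images[0]
image.save("flux_fbc.png")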

v0.3.7

25 Dec 16:10
make hunyuan_video more robust

v0.3.6

25 Dec 15:01
Update README.md

v0.3.5

19 Dec 05:40
remove unnecessary assert

v0.3.4

19 Dec 02:09

Run HunyuanVideo🚀 with Parallel Inference

NOTE: To run HunyuanVideo, you need to install diffusers from its latest master branch.
It is suggested to run HunyuanVideo on GPUs with 80GB of memory; otherwise you might hit OOM errors, and performance might suffer due to frequent memory re-allocation.
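
For example, one common way to install diffusers from source:

pip install git+https://github.com/huggingface/diffusers.git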

import torch
import torch.distributed as dist
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# Workaround for "RuntimeError: Expected mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good() to be true, but got false."
# raised by the cuDNN SDPA backend: disable it and fall back to the other SDPA implementations.
torch.backends.cuda.enable_cudnn_sdp(False)

dist.init_process_group()

model_id = "tencent/HunyuanVideo"
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    revision="refs/pr/18",
)
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    torch_dtype=torch.float16,
    revision="refs/pr/18",
).to(f"cuda:{dist.get_rank()}")

pipe.vae.enable_tiling(
    # Make it runnable on GPUs with 48GB memory
    tile_sample_min_height=128,
    tile_sample_stride_height=96,
    tile_sample_min_width=128,
    tile_sample_stride_width=96,
    tile_sample_min_num_frames=32,
    tile_sample_stride_num_frames=24,
)

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from para_attn.parallel_vae.diffusers_adapters import parallelize_vae

# Build the context-parallel device mesh, then parallelize the transformer
# and the VAE across all participating ranks.
mesh = init_context_parallel_mesh(
    pipe.device.type,
)
parallelize_pipe(
    pipe,
    mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())

# Fix OOM because of awful inductor lowering of attn_bias of _scaled_dot_product_efficient_attention
# import para_attn
# para_attn.config.attention.force_dispatch_to_custom_ops = True

# torch._inductor.config.reorder_for_compute_comm_overlap = True
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

output = pipe(
    prompt="A cat walks on the grass, realistic",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    output_type="pil" if dist.get_rank() == 0 else "pt",
).frames[0]

if dist.get_rank() == 0:
    print("Saving video to hunyuan_video.mp4")
    export_to_video(output, "hunyuan_video.mp4", fps=15)

dist.destroy_process_group()

Save the above code to run_hunyuan_video.py and run it with torchrun:

torchrun --nproc_per_node=2 run_hunyuan_video.py

v0.3.3

26 Nov 09:12
set output_type to latent for all ranks except rank 0

v0.3.2

19 Nov 15:42

🚀Support Multi-GPU Parallel Inference Speedup for CogVideoX

Everything works out of the box!

import torch
import torch.distributed as dist
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

dist.init_process_group()

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16,
).to(f"cuda:{dist.get_rank()}")

# pipe.enable_model_cpu_offload()
# pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe

parallelize_pipe(
    pipe,
    mesh=init_context_parallel_mesh(
        pipe.device.type,
        # Use at most 2 ranks along the batch dimension and at most 2 along
        # the ring (sequence) dimension of the parallel attention.
        max_batch_dim_size=2,
        max_ring_dim_size=2,
    ),
)

torch._inductor.config.reorder_for_compute_comm_overlap = True
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
video = pipe(
    prompt=prompt,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    # generator=torch.Generator(device=pipe.device).manual_seed(42),
).frames[0]

if dist.get_rank() == 0:
    print("Saving video to cogvideox.mp4")
    export_to_video(video, "cogvideox.mp4", fps=8)

dist.destroy_process_group()
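
As with the HunyuanVideo example above, save the code to a file (e.g. run_cogvideox.py) and launch it with torchrun, adjusting --nproc_per_node to the number of available GPUs:

torchrun --nproc_per_node=2 run_cogvideox.py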