Releases · chengzeyi/ParaAttention
Nightly Release 20250113
TODO: Add nightly release notes
v0.3.10
v0.3.9 Fastest FLUX.1-dev Inference
v0.3.8
Dev first block cache (#12)
* Implement first block cache
* Add documentation
* Make FLUX work
* Refactor
* Update fastest_hunyuan_video.md
* Assorted fixes
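First block cache speeds up inference by reusing cached transformer output on denoising steps where the first block's residual barely changes from the previous step. A hedged sketch of how it might be enabled on a FLUX pipeline; the module path `para_attn.first_block_cache.diffusers_adapters`, the `apply_cache_on_pipe` helper, and the `residual_diff_threshold` knob are assumptions based on this release, so check the repo docs for the exact API:

```python
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Assumed adapter API for this release: cache per-step transformer output and
# reuse it whenever the first block's residual barely changes between steps.
from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe

apply_cache_on_pipe(pipe, residual_diff_threshold=0.08)  # threshold value is illustrative

image = pipe(
    "A cat walks on the grass, realistic",
    num_inference_steps=28,
).images[0]
image.save("flux.png")
```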
v0.3.7
make hunyuan_video more robust
v0.3.6
Update README.md
v0.3.5
remove unnecessary assert
v0.3.4
Run HunyuanVideo🚀 with Parallel Inference
NOTE: To run HunyuanVideo, you need to install diffusers from its latest master branch. It is suggested to run HunyuanVideo on GPUs with 80GB of memory; otherwise you might hit OOM errors, and performance might degrade due to frequent memory re-allocation.
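Installing diffusers from the master branch uses pip's standard git support (the URL below is the official diffusers repository):

```bash
pip install git+https://github.com/huggingface/diffusers.git
```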
```python
import torch
import torch.distributed as dist
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# Work around a cuDNN SDP failure:
# RuntimeError: Expected mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good() to be true, but got false.
torch.backends.cuda.enable_cudnn_sdp(False)

dist.init_process_group()

model_id = "tencent/HunyuanVideo"
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    revision="refs/pr/18",
)
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    torch_dtype=torch.float16,
    revision="refs/pr/18",
).to(f"cuda:{dist.get_rank()}")

pipe.vae.enable_tiling(
    # Make it runnable on GPUs with 48GB memory
    tile_sample_min_height=128,
    tile_sample_stride_height=96,
    tile_sample_min_width=128,
    tile_sample_stride_width=96,
    tile_sample_min_num_frames=32,
    tile_sample_stride_num_frames=24,
)

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from para_attn.parallel_vae.diffusers_adapters import parallelize_vae

mesh = init_context_parallel_mesh(
    pipe.device.type,
)
parallelize_pipe(
    pipe,
    mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())

# Fix OOM because of awful inductor lowering of attn_bias of _scaled_dot_product_efficient_attention
# import para_attn
# para_attn.config.attention.force_dispatch_to_custom_ops = True

# torch._inductor.config.reorder_for_compute_comm_overlap = True
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

output = pipe(
    prompt="A cat walks on the grass, realistic",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    output_type="pil" if dist.get_rank() == 0 else "pt",
).frames[0]

if dist.get_rank() == 0:
    print("Saving video to hunyuan_video.mp4")
    export_to_video(output, "hunyuan_video.mp4", fps=15)

dist.destroy_process_group()
```
Save the above code to run_hunyuan_video.py and run it with torchrun:

```bash
torchrun --nproc_per_node=2 run_hunyuan_video.py
```
v0.3.3
Set `output_type` to `latent` for all ranks except rank 0
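In practice, only rank 0 needs decoded frames; the other ranks can return raw latents and skip the expensive VAE decode entirely. A minimal sketch of the pattern, assuming a `pipe` parallelized as in the examples in this changelog (the prompt is illustrative):

```python
import torch.distributed as dist

# Rank 0 gets decoded PIL frames; all other ranks return raw latents,
# skipping VAE decoding and post-processing on those ranks.
output_type = "pil" if dist.get_rank() == 0 else "latent"

result = pipe(
    prompt="A cat walks on the grass, realistic",
    output_type=output_type,
)
if dist.get_rank() == 0:
    frames = result.frames[0]
```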
v0.3.2
🚀Support Multi-GPU Parallel Inference Speedup for CogVideoX
Everything works out of the box!
```python
import torch
import torch.distributed as dist
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

dist.init_process_group()

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16,
).to(f"cuda:{dist.get_rank()}")

# pipe.enable_model_cpu_offload()
# pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe

parallelize_pipe(
    pipe,
    mesh=init_context_parallel_mesh(
        pipe.device.type,
        max_batch_dim_size=2,
        max_ring_dim_size=2,
    ),
)

torch._inductor.config.reorder_for_compute_comm_overlap = True
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."

video = pipe(
    prompt=prompt,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    # generator=torch.Generator(device=pipe.device).manual_seed(42),
).frames[0]

if dist.get_rank() == 0:
    print("Saving video to cogvideox.mp4")
    export_to_video(video, "cogvideox.mp4", fps=8)

dist.destroy_process_group()
```
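As with the HunyuanVideo example above, save this script to a file (say, run_cogvideox.py; the filename is arbitrary) and launch it with torchrun, one process per GPU:

```bash
torchrun --nproc_per_node=2 run_cogvideox.py
```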