code release

VITA-Group · Apr 2, 2024 · e91e884 · e91e884
1 parent a78ce9a
commit e91e884
Show file tree

Hide file tree

Showing 20 changed files with 1,726 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,8 @@
 
 The official implementation of the paper "Comp4D: LLM-Guided Compositional 4D Scene Generation".
 
+Dejia Xu*, Hanwen Liang*, Neel P. Bhatt, Hezhen Hu, Hanxue Liang, Konstantinos N. Plataniotis, and Zhangyang Wang
+
 [[Project Page]](https://vita-group.github.io/Comp4D/) | [[Video (narrated)]](https://www.youtube.com/watch?v=9q8SV1Xf_Xw) | [[Video (results)]](https://www.youtube.com/watch?v=gXVoPTGb734) | [[Paper]](https://github.com/VITA-Group/Comp4D/blob/main/assets/Comp4D.pdf) | [[Arxiv]](https://arxiv.org/abs/2403.16993)
 
 ## News
@@ -73,18 +75,16 @@ We release a set of pre-generated static assets in `data/` directory. During tra
 ```
 
 # cd /path_to_4dfy/
-# seed=0
-# gpu=0
 
 ## Stage 1
-# python launch.py --config configs/fourdfy_stage_1_low_vram.yaml --train --gpu $gpu exp_root_dir=output/ seed=$seed system.prompt_processor.prompt="a flower"
+# python launch.py --config configs/fourdfy_stage_1_low_vram.yaml --train --gpu 0 exp_root_dir=output/ seed=0 system.prompt_processor.prompt="a flower"
 
 ## Stage 2
 # ckpt=output/fourdfy_stage_1_low_vram/a_flower@timestamp/ckpts/last.ckpt
-# python launch.py --config configs/fourdfy_stage_2_low_vram.yaml --train --gpu $gpu exp_root_dir=output/ seed=$seed system.prompt_processor.prompt="a flower" system.weights=$ckpt
+# python launch.py --config configs/fourdfy_stage_2_low_vram.yaml --train --gpu 0 exp_root_dir=output/ seed=0 system.prompt_processor.prompt="a flower" system.weights=$ckpt
 
 ## Post-Process. Convert to mesh file.
-# python launch.py --config output/fourdfy_stage_2_low_vram/a_flower@timestamp/configs/parsed.yaml --export --gpu $gpu \
+# python launch.py --config output/fourdfy_stage_2_low_vram/a_flower@timestamp/configs/parsed.yaml --export --gpu 0 \
 #   resume=output/fourdfy_stage_2_low_vram/a_flower@timestamp/ckpts/last.ckpt system.exporter_type=mesh-exporter \
 #   system.exporter.context_type=cuda system.exporter.fmt=obj
 ## saved to output/fourdfy_stage_2_low_vram/a_flower@timestamp/save/iterations-export/

diff --git a/videocrafter/License b/videocrafter/License
diff --git a/videocrafter/cog.yaml b/videocrafter/cog.yaml
@@ -0,0 +1,25 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:  # build-environment section; field meanings are defined by the Cog YAML reference linked above
+  gpu: true  # NOTE(review): presumably requests a GPU-enabled build — confirm against the Cog reference
+  system_packages:  # NOTE(review): presumably OS-level libraries needed by the Python packages below — confirm
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+  python_version: "3.11"  # quoted so the version is read as a string rather than a float
+  python_packages:  # every entry below is pinned to one exact version with ==
+    - "torch==2.0.1"
+    - "opencv-python==4.8.1.78"
+    - "torchvision==0.15.2"
+    - "pytorch_lightning==2.1.0"
+    - "einops==0.7.0"
+    - "imageio==2.31.6"
+    - "omegaconf==2.3.0"
+    - "transformers==4.35.0"
+    - "moviepy==1.0.3"
+    - "av==10.0.0"
+    - "decord==0.6.0"
+    - "kornia==0.7.0"
+    - "open-clip-torch==2.12.0"
+    - "xformers==0.0.21"
+predict: "predict.py:Predictor"  # NOTE(review): looks like <file>:<class>; predict.py is not visible here — confirm it exists
diff --git a/videocrafter/configs/inference_i2v_512_v1.0.yaml b/videocrafter/configs/inference_i2v_512_v1.0.yaml
@@ -0,0 +1,83 @@
+model:  # model section of inference_i2v_512_v1.0.yaml: a top-level target plus nested unet / first-stage / cond-stage sub-configs
+  target: lvdm.models.ddpm3d.LatentVisualDiffusion  # NOTE(review): presumably a dotted import path instantiated with `params` — confirm against the loader
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size:  # NOTE(review): presumably latent height, width rather than pixels — confirm against the model code
+    - 40
+    - 64
+    channels: 4
+    scale_by_std: false
+    scale_factor: 0.18215
+    use_ema: false
+    uncond_type: empty_seq
+    use_scale: true
+    scale_b: 0.7
+    finegrained: true
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4  # same value as `channels` above and `z_channels` below
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 4
+        - 2
+        - 1
+        num_res_blocks: 2
+        channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: true
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: false
+        use_causal_attention: false
+        use_image_attention: true  # NOTE(review): presumably paired with cond_img_config at the end of this file — confirm
+        temporal_length: 16  # NOTE(review): presumably the number of frames per clip — confirm
+        addition_attention: true
+        fps_cond: true
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []  # explicit empty list, not null
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: penultimate
+    cond_img_config:  # NOTE(review): presumably the image-condition encoder config — confirm against LatentVisualDiffusion
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
+      params:
+        freeze: true
diff --git a/videocrafter/configs/inference_t2v_1024_v1.0.yaml b/videocrafter/configs/inference_t2v_1024_v1.0.yaml
@@ -0,0 +1,77 @@
+model:  # model section of inference_t2v_1024_v1.0.yaml: a top-level target plus nested unet / first-stage / cond-stage sub-configs
+  target: lvdm.models.ddpm3d.LatentDiffusion  # NOTE(review): presumably a dotted import path instantiated with `params` — confirm against the loader
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size:  # NOTE(review): presumably latent height, width rather than pixels — confirm against the model code
+    - 72
+    - 128
+    channels: 4
+    scale_by_std: false
+    scale_factor: 0.18215
+    use_ema: false
+    uncond_type: empty_seq
+    use_scale: true
+    fix_scale_bug: true  # NOTE(review): semantics are not visible in this config — check the model code before changing
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4  # same value as `channels` above and `z_channels` below
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 4
+        - 2
+        - 1
+        num_res_blocks: 2
+        channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: false
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: true
+        use_causal_attention: false
+        temporal_length: 16  # NOTE(review): presumably the number of frames per clip — confirm
+        addition_attention: true
+        fps_cond: true
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []  # explicit empty list, not null
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: penultimate
diff --git a/videocrafter/configs/inference_t2v_512_v1.0.yaml b/videocrafter/configs/inference_t2v_512_v1.0.yaml
@@ -0,0 +1,74 @@
+model:  # model section of inference_t2v_512_v1.0.yaml: a top-level target plus nested unet / first-stage / cond-stage sub-configs
+  target: lvdm.models.ddpm3d.LatentDiffusion  # NOTE(review): presumably a dotted import path instantiated with `params` — confirm against the loader
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size:  # NOTE(review): presumably latent height, width rather than pixels — confirm against the model code
+    - 40
+    - 64
+    channels: 4
+    scale_by_std: false
+    scale_factor: 0.18215
+    use_ema: false
+    uncond_type: empty_seq  # NOTE(review): no use_scale / scale_b keys are set in this config — confirm the defaults are intended
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4  # same value as `channels` above and `z_channels` below
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 4
+        - 2
+        - 1
+        num_res_blocks: 2
+        channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: false
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: true
+        use_causal_attention: false
+        temporal_length: 16  # NOTE(review): presumably the number of frames per clip — confirm
+        addition_attention: true  # NOTE(review): no fps_cond key follows in this config — confirm the default is intended
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []  # explicit empty list, not null
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: penultimate
diff --git a/videocrafter/configs/inference_t2v_512_v2.0.yaml b/videocrafter/configs/inference_t2v_512_v2.0.yaml
@@ -0,0 +1,77 @@
+model:  # model section of inference_t2v_512_v2.0.yaml: a top-level target plus nested unet / first-stage / cond-stage sub-configs
+  target: lvdm.models.ddpm3d.LatentDiffusion  # NOTE(review): presumably a dotted import path instantiated with `params` — confirm against the loader
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size:  # NOTE(review): presumably latent height, width rather than pixels — confirm against the model code
+    - 40
+    - 64
+    channels: 4
+    scale_by_std: false
+    scale_factor: 0.18215
+    use_ema: false
+    uncond_type: empty_seq
+    use_scale: true
+    scale_b: 0.7  # NOTE(review): presumably only meaningful together with use_scale above — confirm in the model code
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4  # same value as `channels` above and `z_channels` below
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 4
+        - 2
+        - 1
+        num_res_blocks: 2
+        channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: true
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: false
+        use_causal_attention: false
+        temporal_length: 16  # NOTE(review): presumably the number of frames per clip — confirm
+        addition_attention: true
+        fps_cond: true
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []  # explicit empty list, not null
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: penultimate
diff --git a/videocrafter/prompts/i2v_prompts/horse.png b/videocrafter/prompts/i2v_prompts/horse.png
diff --git a/videocrafter/prompts/i2v_prompts/seashore.png b/videocrafter/prompts/i2v_prompts/seashore.png
diff --git a/videocrafter/prompts/i2v_prompts/test_prompts.txt b/videocrafter/prompts/i2v_prompts/test_prompts.txt
@@ -0,0 +1,2 @@
+horses are walking on the grassland
+a boy and a girl are talking on the seashore
diff --git a/videocrafter/prompts/test_prompts.txt b/videocrafter/prompts/test_prompts.txt
@@ -0,0 +1,3 @@
+a bee is flying over a flower from left to right, photorealistic, high definition
+a cat is on the left of a dog, photorealistic, high definition
+a panda is dancing, photorealistic, high definition
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		horses are walking on the grassland
		a boy and a girl are talking on the seashore