
Commit 1b16921
remove streamlit demo since outdated
zRzRzRzRzRzRzR committed Sep 16, 2024
1 parent 0a558e0 commit 1b16921
Showing 9 changed files with 29 additions and 297 deletions.
4 changes: 2 additions & 2 deletions inference/cli_demo_quantization.py
@@ -84,8 +84,8 @@ def generate_video(
     # Using with compile will run faster. First time infer will cost ~30min to compile.
     # pipe.transformer.to(memory_format=torch.channels_last)

-    # for FP8 should remove pipe.enable_sequential_cpu_offload()
-    pipe.enable_sequential_cpu_offload()
+    # for FP8 should remove pipe.enable_model_cpu_offload()
+    pipe.enable_model_cpu_offload()

     # This is not for FP8 and INT8 and should remove this line
     # pipe.enable_sequential_cpu_offload()
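Note: a minimal sketch of the trade-off behind this change, assuming a CogVideoXPipeline and an illustrative `use_fp8` flag (the real script decides this from its quantization setting). `enable_model_cpu_offload()` shuttles whole sub-models to the GPU only while they run; `enable_sequential_cpu_offload()` offloads layer by layer for an even smaller footprint but much slower inference, and FP8 runs should skip offloading so the quantized weights stay on the GPU.

```python
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

use_fp8 = False  # illustrative flag; the real script derives this from its quantization option
if use_fp8:
    # FP8 weights are expected to stay on the GPU, so no CPU offload.
    pipe.to("cuda")
else:
    # Move each whole sub-model (text encoder, transformer, VAE) to the GPU
    # only for the duration of its forward pass.
    pipe.enable_model_cpu_offload()
    # Lower-VRAM alternative, much slower:
    # pipe.enable_sequential_cpu_offload()
```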
12 changes: 6 additions & 6 deletions inference/gradio_composite_demo/app.py
@@ -55,7 +55,7 @@
 pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
     "THUDM/CogVideoX-5b",
     transformer=CogVideoXTransformer3DModel.from_pretrained(
-        "THUDM/CogVideoX-5b-I2V", subfolder="transformers", torch_dtype=torch.bfloat16
+        "THUDM/CogVideoX-5b-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
     ),
     vae=pipe.vae,
     scheduler=pipe.scheduler,
@@ -65,10 +65,10 @@
 ).to(device)


-pipe.transformer.to(memory_format=torch.channels_last)
-pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
-pipe_image.transformer.to(memory_format=torch.channels_last)
-pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)
+# pipe.transformer.to(memory_format=torch.channels_last)
+# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
+# pipe_image.transformer.to(memory_format=torch.channels_last)
+# pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)

 os.makedirs("./output", exist_ok=True)
 os.makedirs("./gradio_tmp", exist_ok=True)
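Note: the torch.compile path is now opt-in. If the one-off compile cost (roughly 30 minutes on first inference, per the comment in cli_demo_quantization.py) is acceptable, the commented-out lines can be restored verbatim; a sketch, assuming `pipe` and `pipe_image` are constructed as above:

```python
import torch

# Optional speed-up: channels-last memory layout plus torch.compile.
# The first call triggers compilation; later calls reuse the compiled graph.
pipe.transformer.to(memory_format=torch.channels_last)
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
pipe_image.transformer.to(memory_format=torch.channels_last)
pipe_image.transformer = torch.compile(pipe_image.transformer, mode="max-autotune", fullgraph=True)
```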
@@ -241,7 +241,7 @@ def infer(
                 generator=torch.Generator(device="cpu").manual_seed(seed),
             ).frames
     elif image_input is not None:
-        image_input = Image.fromarray(image_input)  # Change to PIL
+        image_input = Image.fromarray(image_input).resize(size=(720, 480))  # Convert to PIL
         image = load_image(image_input)
         video_pt = pipe_image(
             image=image,
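Note: a hedged sketch of the new preprocessing step. Gradio's image component hands over a NumPy array, and the I2V pipeline expects 720x480 frames; the helper name below is illustrative, not from the demo:

```python
import numpy as np
from PIL import Image

def to_pipeline_image(image_input: np.ndarray) -> Image.Image:
    # Convert the Gradio NumPy array to PIL and force the 720x480
    # resolution CogVideoX-5b-I2V expects. A plain resize stretches
    # inputs whose aspect ratio is not 3:2; crop or pad first to avoid that.
    return Image.fromarray(image_input).resize((720, 480))
```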
66 changes: 10 additions & 56 deletions inference/gradio_web_demo.py
@@ -2,6 +2,10 @@
 This is the main file for the gradio web demo. It uses the CogVideoX-2B model to generate videos.
 Set the environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.
 This demo only supports the text-to-video generation model.
+If you wish to use the image-to-video or video-to-video generation models,
+please use the gradio_composite_demo to implement the full GUI functionality.
+
 Usage:
     OpenAI_API_KEY=your_openai_api_key OpenAI_BASE_URL=https://api.openai.com/v1 python inference/gradio_web_demo.py
 """
@@ -18,12 +22,8 @@
 from openai import OpenAI
 import moviepy.editor as mp

-dtype = torch.bfloat16
-device = "cuda"  # Need to use cuda
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16).to("cuda")

-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=dtype).to(device)
-pipe.enable_model_cpu_offload()
+pipe.enable_sequential_cpu_offload()
 pipe.vae.enable_slicing()
 pipe.vae.enable_tiling()
@@ -47,6 +47,7 @@
 def convert_prompt(prompt: str, retry_times: int = 3) -> str:
     if not os.environ.get("OPENAI_API_KEY"):
         return prompt
+
     client = OpenAI()
     text = prompt.strip()
@@ -83,7 +84,7 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
             "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
         },
     ],
-    model="glm-4-0520",
+    model="glm-4-plus",
     temperature=0.01,
     top_p=0.7,
     stream=False,
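Note: the prompt enhancer simply calls an OpenAI-compatible endpoint, so the model id must be one the configured service offers (here glm-4-plus). A condensed, illustrative sketch; `enhance_prompt` and its system prompt are not the demo's exact code:

```python
import os
from openai import OpenAI

def enhance_prompt(text: str) -> str:
    # Mirror convert_prompt's fallback: no API key, no enhancement.
    if not os.environ.get("OPENAI_API_KEY"):
        return text
    client = OpenAI()  # reads OPENAI_API_KEY / OPENAI_BASE_URL from the environment
    response = client.chat.completions.create(
        model="glm-4-plus",  # must be served by the configured endpoint
        temperature=0.01,
        top_p=0.7,
        messages=[
            {"role": "system", "content": "Rewrite the user's input as a rich English video caption."},
            {"role": "user", "content": text.strip()},
        ],
    )
    return response.choices[0].message.content
```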
@@ -145,19 +146,9 @@ def delete_old_files():
 with gr.Blocks() as demo:
     gr.Markdown("""
        <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
-           CogVideoX-2B Huggingface Space🤗
-       </div>
-       <div style="text-align: center;">
-           <a href="https://huggingface.co/THUDM/CogVideoX-2B">🤗 2B Model Hub</a> |
-           <a href="https://github.com/THUDM/CogVideo">🌐 Github</a> |
-           <a href="https://arxiv.org/pdf/2408.06072">📜 arxiv </a>
-       </div>
-       <div style="text-align: center; font-size: 15px; font-weight: bold; color: red; margin-bottom: 20px;">
-           ⚠️ This demo is for academic research and experiential use only.
-           Users should strictly adhere to local laws and ethics.
-       </div>
-       """)
+           CogVideoX Gradio Simple Space🤗
+       """)

     with gr.Row():
         with gr.Column():
             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
@@ -173,7 +164,6 @@ def delete_old_files():
                 "**Optional Parameters** (default values are recommended)<br>"
                 "Increasing the number of inference steps will produce more detailed videos, but it will slow down the process.<br>"
                 "50 steps are recommended for most cases.<br>"
-                "For the 5B model, 50 steps will take approximately 350 seconds."
             )
             with gr.Row():
                 num_inference_steps = gr.Number(label="Inference Steps", value=50)
@@ -186,42 +176,6 @@ def delete_old_files():
             download_video_button = gr.File(label="📥 Download Video", visible=False)
             download_gif_button = gr.File(label="📥 Download GIF", visible=False)

-    gr.Markdown("""
-    <table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
-        <div style="text-align: center; font-size: 24px; font-weight: bold; margin-bottom: 20px;">
-            Demo Videos with 50 Inference Steps and 6.0 Guidance Scale.
-        </div>
-        <tr>
-            <td style="width: 25%; vertical-align: top; font-size: 0.8em;">
-                <p>A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.</p>
-            </td>
-            <td style="width: 25%; vertical-align: top;">
-                <video src="https://github.com/user-attachments/assets/ea3af39a-3160-4999-90ec-2f7863c5b0e9" width="100%" controls autoplay></video>
-            </td>
-            <td style="width: 25%; vertical-align: top; font-size: 0.8em;">
-                <p>The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.</p>
-            </td>
-            <td style="width: 25%; vertical-align: top;">
-                <video src="https://github.com/user-attachments/assets/9de41efd-d4d1-4095-aeda-246dd834e91d" width="100%" controls autoplay></video>
-            </td>
-        </tr>
-        <tr>
-            <td style="width: 25%; vertical-align: top; font-size: 0.8em;">
-                <p>A street artist, clad in a worn-out denim jacket and a colorful bandana, stands before a vast concrete wall in the heart, holding a can of spray paint, spray-painting a colorful bird on a mottled wall.</p>
-            </td>
-            <td style="width: 25%; vertical-align: top;">
-                <video src="https://github.com/user-attachments/assets/941d6661-6a8d-4a1b-b912-59606f0b2841" width="100%" controls autoplay></video>
-            </td>
-            <td style="width: 25%; vertical-align: top; font-size: 0.8em;">
-                <p>In the haunting backdrop of a war-torn city, where ruins and crumbled walls tell a story of devastation, a poignant close-up frames a young girl. Her face is smudged with ash, a silent testament to the chaos around her. Her eyes glistening with a mix of sorrow and resilience, capturing the raw emotion of a world that has lost its innocence to the ravages of conflict.</p>
-            </td>
-            <td style="width: 25%; vertical-align: top;">
-                <video src="https://github.com/user-attachments/assets/938529c4-91ae-4f60-b96b-3c3947fa63cb" width="100%" controls autoplay></video>
-            </td>
-        </tr>
-    </table>
-    """)
-
 def generate(prompt, num_inference_steps, guidance_scale, model_choice, progress=gr.Progress(track_tqdm=True)):
     tensor = infer(prompt, num_inference_steps, guidance_scale, progress=progress)
     video_path = save_video(tensor)
219 changes: 0 additions & 219 deletions inference/streamlit_web_demo.py

This file was deleted.

1 change: 0 additions & 1 deletion requirements.txt
@@ -7,7 +7,6 @@ torchvision==0.19.0
 sentencepiece==0.2.0
 SwissArmyTransformer>=0.4.12
 gradio>=4.44.0
-streamlit>=1.38.0
 imageio>=2.35.1
 imageio-ffmpeg>=0.5.1
 openai>=1.45.0