index.html

<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners, these should be filled in appropriatly as they are your "business card" -->
  <!-- Replace the content tag with appropriate information -->
  <meta name="description" content="VideoMV">
  <meta property="og:title" content="VideoMV"/>
  <meta property="og:description" content="VideoMV: Consistent Multi-View Generation Based on Large Video Generative Model"/>
  <meta property="og:url" content="URL OF THE WEBSITE"/>
  <!-- Path to banner image, should be in the path listed below. Optimal dimenssions are 1200X630-->
  <meta property="og:image" content="static/image/video_t1.png" />
  <meta property="og:image:width" content="2412"/>
  <meta property="og:image:height" content="1394"/>


  <meta name="twitter:title" content="VideoMV">
  <meta name="twitter:description" content="VideoMV: Consistent Multi-View Generation Based on Large Video Generative Model">
  <!-- Path to banner image, should be in the path listed below. Optimal dimenssions are 1200X600-->
  <meta name="twitter:image" content="static/images/video_t1.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for your paper to be indexed by-->
  <meta name="keywords" content="Image-to-Video">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <title>VideoMV</title>
  <link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
  rel="stylesheet">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>

  <nav class="navbar" role="navigation" aria-label="main navigation">
    <div class="navbar-brand">
      <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
      </a>
    </div>
    <div class="navbar-menu">
      <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
        <a class="navbar-item" href=https://github.com/aigc3d>
          <span class="icon">
            <i class="fas fa-home"></i>
          </span>
        </a>

        <div class="navbar-item has-dropdown is-hoverable">
          <a class="navbar-link">
            More Research
          </a>
          <div class="navbar-dropdown">
            <a class="navbar-item" href="https://aigc3d.github.io/richdreamer">
              Richdreamer
            </a>
            <a class="navbar-item" href="https://github.com/yushuang-wu/IPoD">
              IPoD
            </a>
            <a class="navbar-item" href="https://aigc3d.github.io/gobjaverse/">
              G-Objaverse
            </a>
            <a class="navbar-item" href="https://aigc3d.github.io/motionshop/">
              MotionShop
            </a>
          </div>
        </div>
      </div>

    </div>
  </nav>


  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">VideoMV: Consistent Multi-View Generation Based on Large Video Generative Model</h1>
            <div class="is-size-5 publication-authors">
              <!-- Paper authors -->
              <span class="author-block">
                <a href="https://scholar.google.com/citations?view_op=list_works&hl=en&user=UDnHe2IAAAAJ" target="_blank">Qi Zuo<sup>1*</sup></a>,</span>
                <span class="author-block">
                  <a href="https://scholar.google.com.hk/citations?user=aJPO514AAAAJ&hl=zh-CN&oi=ao" target="_blank">Xiaodong Gu<sup>1*</sup></a>,</span>
                  <span class="author-block">
                    <a href="https://lingtengqiu.github.io/" target="_blank">Lingteng Qiu<sup>2,1</sup></a>,</span>
                    <span class="author-block">
                      <a href="dy283090@alibaba-inc.com" target="_blank">Yuan Dong<sup>1</sup></a>,</span>
                        <span class="author-block">
                          <a href="bushe.zzy@alibaba-inc.com" target="_blank">Zhengyi Zhao<sup>1</sup></a>,</span>
                            <span class="author-block">
                              <a href="https://weihao-yuan.com/" target="_blank">Weihao Yuan<sup>1</sup></a>,</span>
                              <span class="author-block">
                                <a href="https://prstrive.github.io/" target="_blank">Rui Peng<sup>4</sup></a>,</span>
                                  <span class="author-block">
                                    <a href=" https://sites.google.com/site/zhusiyucs/home/" target="_blank">Siyu Zhu<sup>3</sup></a>,</span>
                                      <span class="author-block">
                                        <a href="https://scholar.google.com/citations?user=GHOQKCwAAAAJ&hl=zh-CN&oi=ao" target="_blank">Zilong Dong<sup>1</sup></a>,</span>
                                        <span class="author-block">
                                          <a href="https://research.cs.washington.edu/istc/lfb/" target="_blank">Liefeng Bo<sup>1</sup></a>,</span>
                                            <span class="author-block">
                                              <a href="https://www.cs.utexas.edu/~huangqx/" target="_blank">Qixing Huang<sup>5</sup></a></span>
                  </span>

                  </div>
                  <div class="is-size-5 publication-authors">
                    <span class="author-block"><sup>1</sup>Alibaba Group </span>
                    <span class="author-block"><sup>2</sup>SSE, CUHKSZ</span>
                    <span class="author-block"><sup>3</sup>Fudan, University</span>
                    <span class="author-block"><sup>4</sup>Peking, University</span>
                    <span class="author-block"><sup>5</sup>The University of Texas at Austin</span>
                    <span class="author-block"><sup>*</sup>Equal Contribution</span>
                    <!-- <div class="is-size-5 publication-authors">
                      <span class="author-block">Alibaba Group</span>
                      <div class="is-size-5 publication-authors">
                        <span class="author-block">Alibaba Group</span>
                        <div class="is-size-5 publication-authors">
                          <span class="author-block">Alibaba Group</span>
                          <div class="is-size-5 publication-authors">
                            <span class="author-block">Alibaba Group</span> -->
                  </div>

                  <div class="column has-text-centered">
                    <div class="publication-links">
                         <!-- Arxiv PDF link -->
                      <span class="link-block">
                        <a href="https://arxiv.org/abs/2403.12010" target="_blank"
                        class="external-link button is-normal is-rounded is-dark">
                        <span class="icon">
                          <i class="fas fa-file-pdf"></i>
                        </span>
                        <span>Paper</span>
                      </a>
                    </span>

                    <!-- Video link -->
                    <span class="link-block">
                      <a href="https://www.youtube.com/watch?v=zxjX5p0p0Ks" target="_blank"
                      class="external-link button is-normal is-rounded is-dark">
                      <span class="icon">
                        <i class="fab fa-youtube"></i>
                      </span>
                      <span>video</span>
                    </a>
                  </span>

                  <!-- Github link -->
                  <span class="link-block">
                    <a href="https://github.com/alibaba/VideoMV" target="_blank"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>

                <!-- ArXiv abstract Link -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2403.12010" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Teaser video-->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <video poster="" id="tree" autoplay controls muted loop height="100%">
        <!-- Your video here -->
        <source src="static/videos/project_head_video.mp4"
        type="video/mp4">
      </video>
      </h2>
    </div>
  </div>
</section>
<!-- End teaser video -->

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Generating multi-view images based on text or single-image prompts is a critical capability for the creation of 3D content. Two fundamental questions on this topic are what data we use for training and how to ensure multi-view consistency. This paper introduces a novel framework that makes fundamental contributions to both questions. Unlike leveraging images from 2D diffusion models for training, we propose a dense consistent multi-view generation model that is fine-tuned from off-the-shelf video generative models. Images from video generative models are more suitable for multi-view generation because the underlying network architecture that generates them employs a temporal module to enforce frame consistency. Moreover, the video data sets used to train these models are abundant and diverse, leading to a reduced train-finetuning domain gap. To enhance multi-view consistency, we introduce a 3D-Aware Denoising Sampling, which first employs a feed-forward reconstruction module to get an explicit global 3D model, and then adopts a sampling strategy that effectively involves images rendered from the global 3D model into the denoising sampling loop to improve the multi-view consistency of the final images. As a by-product, this module also provides a fast way to create 3D assets represented by 3D Gaussians within a few seconds. Our approach can generate 24 dense views and converges much faster in training than state-of-the-art approaches (4 GPU hours versus many thousands GPU hours) with comparable visual quality and consistency.
By further fine-tuning, our approach outperforms existing state-of-the-art methods in both quantitative metrics and visual effects. We implement our method based on open-source video generative models, and we will release code and models for further research.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Youtube video -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <!-- Paper video. -->
      <h2 class="title is-3">Video</h2>
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          
          <div class="publication-video">
            <!-- Youtube embed code here -->
            <iframe src="https://www.youtube.com/watch?v=zxjX5p0p0Ks" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End youtube video -->


<!-- Image carousel -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <h2 class="title is-3">Method</h2>
      <div class="item">
        <!-- Your image here -->
        <img src="static/images/f.png" alt="MY ALT TEXT"/>
        <h2 class="content has-text-justified">
          The overall framework. In the first stage, we take a pre-trained video generation model and fine-tune it by incorporating camera poses to generate multi-view images. Then we train a feed-forward reconstruction module to get an explicit global 3D model given noise-corrupted images. Finally, we adopt a 3D-aware denoise sampling strategy that effectively inserts the images rendered from the global 3D model into the denoising loop to further improve consistency.
       </h2>
     </div>
    </div>
  </div>
</div>
</div>
</section>
<!-- End image carousel -->


<!-- Video carousel -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3">Text-based multi-view generation + GaussianSplatting Reconstruction</h2>
      <!-- <h2 class="title is-4">Specific IDs</h2> -->
      <div id="results-carousel11" class="carousel results-carousel">
        <div class="item item-video1">
          <video poster="" id="video1" autoplay controls muted loop height="100%">
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo1.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video2">
          <video poster="" id="video2" autoplay controls muted loop height="100%">
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo2.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo3.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo4.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo5.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo6.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo7.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo8.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo9.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo10.mp4"
            type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End video carousel -->

<!-- Video carousel -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <!-- <h2 class="title is-3">Text-based multi-view generation</h2> -->
      <!-- <h2 class="title is-4">Specific IDs</h2> -->
      <div id="results-carousel11" class="carousel results-carousel">
        <div class="item item-video1">
          <video poster="" id="video1" autoplay controls muted loop height="100%">
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo11.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video2">
          <video poster="" id="video2" autoplay controls muted loop height="100%">
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo12.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo13.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo14.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo15.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo16.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo17.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo18.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo19.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/t2v_demo20.mp4"
            type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End video carousel -->


<!-- Video carousel -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3">Image-based multi-view generation + GaussianSplatting Reconstruction</h2>
      <!-- <h2 class="title is-4">Specific IDs</h2> -->
      <div id="results-carousel11" class="carousel results-carousel">
        <div class="item item-video1">
          <video poster="" id="video1" autoplay controls muted loop height="100%">
            <!-- Your video file here -->
            <source src="static/videos/i2v_demo1.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video2">
          <video poster="" id="video2" autoplay controls muted loop height="100%">
            <!-- Your video file here -->
            <source src="static/videos/i2v_demo2.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/i2v_demo3.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/i2v_demo4.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/i2v_demo5.mp4"
            type="video/mp4">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">\
            <!-- Your video file here -->
            <source src="static/videos/i2v_demo6.mp4"
            type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End video carousel -->

<!--BibTex citation -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@misc{zuo2024videomv,
      title={VideoMV: Consistent Multi-View Generation Based on Large Video Generative Model}, 
      author={Qi Zuo and Xiaodong Gu and Lingteng Qiu and Yuan Dong and Zhengyi Zhao and Weihao Yuan and Rui Peng and Siyu Zhu and Zilong Dong and Liefeng Bo and Qixing Huang},
      year={2024},
      eprint={2403.12010},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}</code></pre>
  </div>
</section>


  <footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">

          <p>
            This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
          </p>

        </div>
      </div>
    </div>
  </div>
</footer>

<script>
bulmaCarousel.attach('#results-carousel11', {
    slidesToScroll: 1,
    slidesToShow: 2,
    infinite: true,
    autoplay: false,
});
bulmaCarousel.attach('#results-carousel22', {
    slidesToScroll: 1,
    slidesToShow: 2,
    infinite: true,
    autoplay: false,
});
bulmaCarousel.attach('#results-carousel33', {
    slidesToScroll: 1,
    slidesToShow: 2,
    infinite: true,
    autoplay: false,
});
</script>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>