diff --git a/README.md b/README.md index 4c60c1f..cc22851 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Konstantinos N. Plataniotis, and Zhangyang Wang ## News +- 2024.4.1: Released code! - 2024.3.25: Released on arxiv! ## Overview @@ -23,8 +24,6 @@ We release our pre-generated static assets in `data/` directory. During training ## Custom Prompts -## Code is coming soon! - ## Training ## Testing @@ -32,4 +31,3 @@ We release our pre-generated static assets in `data/` directory. During training ## Citation If you find this repository/work helpful in your research, please consider citing the paper and starring the repo ⭐. - diff --git a/VideoCrafter/License b/VideoCrafter/License new file mode 100644 index 0000000..e2741c4 --- /dev/null +++ b/VideoCrafter/License @@ -0,0 +1,470 @@ +This license applies to the source codes that are open sourced in connection with the VideoCrafter1. + +Copyright (C) 2023 THL A29 Limited, a Tencent company. + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. 
If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. 
You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +10. This code is provided for research purposes only and is +not to be used for any commercial purposes. By using this code, +you agree that it will be used solely for academic research, scholarly work, +and non-commercial activities. Any use of this code for commercial purposes, +including but not limited to, selling, distributing, or incorporating it into +commercial products or services, is strictly prohibited. Violation of this +clause may result in legal actions and penalties. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. 
+ +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +Other dependencies and licenses (if such optional components are used): + + +Components under BSD 3-Clause License: +------------------------------------------------ +1. numpy +Copyright (c) 2005-2022, NumPy Developers. +All rights reserved. + +2. pytorch +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +3. 
torchvision +Copyright (c) Soumith Chintala 2016, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Component under Apache v2 License: +----------------------------------------------------- +1. timm +Copyright 2019 Ross Wightman + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. 
+ +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a 
NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/VideoCrafter/README.md b/VideoCrafter/README.md new file mode 100644 index 0000000..d3fc39c --- /dev/null +++ b/VideoCrafter/README.md @@ -0,0 +1,199 @@ + +## ___***VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models***___ + + + +[![Discord](https://dcbadge.vercel.app/api/server/rrayYqZ4tf?style=flat)](https://discord.gg/rrayYqZ4tf) + +[![GitHub](https://img.shields.io/github/stars/VideoCrafter/VideoCrafter?style=social)](https://github.com/VideoCrafter/VideoCrafter) + +### 🔥🔥 Our dedicated high-resolution I2V model is released at: :point_right:[DynamiCrafter](https://github.com/Doubiiu/DynamiCrafter)!!! 
+ +[![](https://img.youtube.com/vi/0NfmIsNAg-g/0.jpg)](https://www.youtube.com/watch?v=0NfmIsNAg-g) + +### 🔥 VideoCrafter2: Large improvements over VideoCrafter1 with limited data. Better Motion, Better Concept Combination!!! + +Please join us and create your own film on [Discord/Floor33](https://discord.gg/rrayYqZ4tf). + +##### 🎥 Exquisite film, produced by VideoCrafter2, directed by Human + [![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/TUsFkW0tK-s/0.jpg)](https://www.youtube.com/watch?v=TUsFkW0tK-s) + +## 🔆 Introduction + +🤗🤗🤗 VideoCrafter is an open-source video generation and editing toolbox for crafting video content. +It currently includes the Text2Video and Image2Video models: + +### 1. Generic Text-to-video Generation +Click the GIF to access the high-resolution video. + + + + + + + + + + +
"Tom Cruise's face reflects focus, his eyes filled with purpose and drive.""A child excitedly swings on a rusty swing set, laughter filling the air.""A young woman with glasses is jogging in the park wearing a pink headband."
+ + + + + + + + + + +
"With the style of van gogh, A young couple dances under the moonlight by the lake.""A rabbit, low-poly game art style""Impressionist style, a yellow rubber duck floating on the wave on the sunset"
+ +### 2. Generic Image-to-video Generation + + + + + + + + + + + + + + + + + +
"a black swan swims on the pond""a girl is riding a horse fast on grassland""a boy sits on a chair facing the sea""two galleons moving in the wind at sunset"
+ +:boom: **You are highly recommended to try our dedicated I2V model [DynamiCrafter](https://github.com/Doubiiu/DynamiCrafter): Higher resolution, Better Dynamics, More Coherence!!!** + +--- + +## 📝 Changelog +- __[2024.02.05]__: 🔥🔥 Release new I2V model with the resolution of 640x1024 of VideoCrafter1/DynamiCrafter. + +- __[2024.01.26]__: Release the 512x320 checkpoint of VideoCrafter2. + +- __[2024.01.18]__: Release the [VideoCrafter2](https://ailab-cvc.github.io/videocrafter2/) and [Tech Report](https://arxiv.org/abs/2401.09047)! + +- __[2023.10.30]__: Release [VideoCrafter1](https://arxiv.org/abs/2310.19512) Technical Report! + +- __[2023.10.13]__: Release the VideoCrafter1, High Quality Video Generation! + +- __[2023.08.14]__: Release a new version of VideoCrafter on [Discord/Floor33](https://discord.gg/uHaQuThT). Please join us to create your own film! + +- __[2023.04.18]__: Release a VideoControl model with most of the watermarks removed! + +- __[2023.04.05]__: Release pretrained Text-to-Video models, VideoLora models, and inference code. +
+ + +## ⏳ Models + +|T2V-Models|Resolution|Checkpoints| +|:---------|:---------|:--------| +|VideoCrafter2|320x512|[Hugging Face](https://huggingface.co/VideoCrafter/VideoCrafter2/blob/main/model.ckpt) +|VideoCrafter1|576x1024|[Hugging Face](https://huggingface.co/VideoCrafter/Text2Video-1024/blob/main/model.ckpt) +|VideoCrafter1|320x512|[Hugging Face](https://huggingface.co/VideoCrafter/Text2Video-512/blob/main/model.ckpt) + +|I2V-Models|Resolution|Checkpoints| +|:---------|:---------|:--------| +|VideoCrafter1|640x1024|[Hugging Face](https://huggingface.co/Doubiiu/DynamiCrafter_1024/blob/main/model.ckpt) +|VideoCrafter1|320x512|[Hugging Face](https://huggingface.co/VideoCrafter/Image2Video-512/blob/main/model.ckpt) + + + +## ⚙️ Setup + +### 1. Install Environment via Anaconda (Recommended) +```bash +conda create -n videocrafter python=3.8.5 +conda activate videocrafter +pip install -r requirements.txt +``` + + +## 💫 Inference +### 1. Text-to-Video + +1) Download pretrained T2V models via [Hugging Face](https://huggingface.co/VideoCrafter/VideoCrafter2/blob/main/model.ckpt), and put the `model.ckpt` in `checkpoints/base_512_v2/model.ckpt`. +2) Input the following commands in terminal. +```bash + sh scripts/run_text2video.sh +``` + +### 2. Image-to-Video + +1) Download pretrained I2V models via [Hugging Face](https://huggingface.co/VideoCrafter/Image2Video-512-v1.0/blob/main/model.ckpt), and put the `model.ckpt` in `checkpoints/i2v_512_v1/model.ckpt`. +2) Input the following commands in terminal. +```bash + sh scripts/run_image2video.sh +``` + +### 3. Local Gradio demo + +1. Download the pretrained T2V and I2V models and put them in the corresponding directory according to the previous guidelines. +2. Input the following commands in terminal. 
+```bash + python gradio_app.py +``` + +--- +## 📋 Technical Report +😉 VideoCrafter2 Tech report: [VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models](https://arxiv.org/abs/2401.09047) + +😉 VideoCrafter1 Tech report: [VideoCrafter1: Open Diffusion Models for High-Quality Video Generation](https://arxiv.org/abs/2310.19512) +
+ +## 😉 Citation +The technical report is currently unavailable as it is still in preparation. You can cite the paper of our image-to-video model and related base model. +``` +@misc{chen2024videocrafter2, + title={VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models}, + author={Haoxin Chen and Yong Zhang and Xiaodong Cun and Menghan Xia and Xintao Wang and Chao Weng and Ying Shan}, + year={2024}, + eprint={2401.09047}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@misc{chen2023videocrafter1, + title={VideoCrafter1: Open Diffusion Models for High-Quality Video Generation}, + author={Haoxin Chen and Menghan Xia and Yingqing He and Yong Zhang and Xiaodong Cun and Shaoshu Yang and Jinbo Xing and Yaofang Liu and Qifeng Chen and Xintao Wang and Chao Weng and Ying Shan}, + year={2023}, + eprint={2310.19512}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@article{xing2023dynamicrafter, + title={DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors}, + author={Jinbo Xing and Menghan Xia and Yong Zhang and Haoxin Chen and Xintao Wang and Tien-Tsin Wong and Ying Shan}, + year={2023}, + eprint={2310.12190}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@article{he2022lvdm, + title={Latent Video Diffusion Models for High-Fidelity Long Video Generation}, + author={Yingqing He and Tianyu Yang and Yong Zhang and Ying Shan and Qifeng Chen}, + year={2022}, + eprint={2211.13221}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + + +## 🤗 Acknowledgements +Our codebase builds on [Stable Diffusion](https://github.com/Stability-AI/stablediffusion). +Thanks to the authors for sharing their awesome codebases! + + +## 📢 Disclaimer +We develop this repository for RESEARCH purposes, so it can only be used for personal/research/non-commercial purposes. 
+**** diff --git a/VideoCrafter/cog.yaml b/VideoCrafter/cog.yaml new file mode 100644 index 0000000..b0108f3 --- /dev/null +++ b/VideoCrafter/cog.yaml @@ -0,0 +1,25 @@ +# Configuration for Cog ⚙️ +# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md + +build: + gpu: true + system_packages: + - "libgl1-mesa-glx" + - "libglib2.0-0" + python_version: "3.11" + python_packages: + - "torch==2.0.1" + - "opencv-python==4.8.1.78" + - "torchvision==0.15.2" + - "pytorch_lightning==2.1.0" + - "einops==0.7.0" + - "imageio==2.31.6" + - "omegaconf==2.3.0" + - "transformers==4.35.0" + - "moviepy==1.0.3" + - "av==10.0.0" + - "decord==0.6.0" + - "kornia==0.7.0" + - "open-clip-torch==2.12.0" + - "xformers==0.0.21" +predict: "predict.py:Predictor" diff --git a/VideoCrafter/configs/inference_i2v_512_v1.0.yaml b/VideoCrafter/configs/inference_i2v_512_v1.0.yaml new file mode 100644 index 0000000..7d49007 --- /dev/null +++ b/VideoCrafter/configs/inference_i2v_512_v1.0.yaml @@ -0,0 +1,83 @@ +model: + target: lvdm.models.ddpm3d.LatentVisualDiffusion + params: + linear_start: 0.00085 + linear_end: 0.012 + num_timesteps_cond: 1 + timesteps: 1000 + first_stage_key: video + cond_stage_key: caption + cond_stage_trainable: false + conditioning_key: crossattn + image_size: + - 40 + - 64 + channels: 4 + scale_by_std: false + scale_factor: 0.18215 + use_ema: false + uncond_type: empty_seq + use_scale: true + scale_b: 0.7 + finegrained: true + unet_config: + target: lvdm.modules.networks.openaimodel3d.UNetModel + params: + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: + - 4 + - 2 + - 1 + num_res_blocks: 2 + channel_mult: + - 1 + - 2 + - 4 + - 4 + num_head_channels: 64 + transformer_depth: 1 + context_dim: 1024 + use_linear: true + use_checkpoint: true + temporal_conv: true + temporal_attention: true + temporal_selfatt_only: true + use_relative_position: false + use_causal_attention: false + use_image_attention: true + temporal_length: 16 + 
addition_attention: true + fps_cond: true + first_stage_config: + target: lvdm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 512 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + cond_stage_config: + target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder + params: + freeze: true + layer: penultimate + cond_img_config: + target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2 + params: + freeze: true \ No newline at end of file diff --git a/VideoCrafter/configs/inference_t2v_1024_v1.0.yaml b/VideoCrafter/configs/inference_t2v_1024_v1.0.yaml new file mode 100644 index 0000000..4cb9af1 --- /dev/null +++ b/VideoCrafter/configs/inference_t2v_1024_v1.0.yaml @@ -0,0 +1,77 @@ +model: + target: lvdm.models.ddpm3d.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.012 + num_timesteps_cond: 1 + timesteps: 1000 + first_stage_key: video + cond_stage_key: caption + cond_stage_trainable: false + conditioning_key: crossattn + image_size: + - 72 + - 128 + channels: 4 + scale_by_std: false + scale_factor: 0.18215 + use_ema: false + uncond_type: empty_seq + use_scale: true + fix_scale_bug: true + unet_config: + target: lvdm.modules.networks.openaimodel3d.UNetModel + params: + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: + - 4 + - 2 + - 1 + num_res_blocks: 2 + channel_mult: + - 1 + - 2 + - 4 + - 4 + num_head_channels: 64 + transformer_depth: 1 + context_dim: 1024 + use_linear: true + use_checkpoint: true + temporal_conv: false + temporal_attention: true + temporal_selfatt_only: true + use_relative_position: true + use_causal_attention: false + temporal_length: 16 + addition_attention: true + fps_cond: true + first_stage_config: + target: lvdm.models.autoencoder.AutoencoderKL + params: + embed_dim: 
4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 512 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + cond_stage_config: + target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder + params: + freeze: true + layer: penultimate diff --git a/VideoCrafter/configs/inference_t2v_512_v1.0.yaml b/VideoCrafter/configs/inference_t2v_512_v1.0.yaml new file mode 100644 index 0000000..849623e --- /dev/null +++ b/VideoCrafter/configs/inference_t2v_512_v1.0.yaml @@ -0,0 +1,74 @@ +model: + target: lvdm.models.ddpm3d.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.012 + num_timesteps_cond: 1 + timesteps: 1000 + first_stage_key: video + cond_stage_key: caption + cond_stage_trainable: false + conditioning_key: crossattn + image_size: + - 40 + - 64 + channels: 4 + scale_by_std: false + scale_factor: 0.18215 + use_ema: false + uncond_type: empty_seq + unet_config: + target: lvdm.modules.networks.openaimodel3d.UNetModel + params: + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: + - 4 + - 2 + - 1 + num_res_blocks: 2 + channel_mult: + - 1 + - 2 + - 4 + - 4 + num_head_channels: 64 + transformer_depth: 1 + context_dim: 1024 + use_linear: true + use_checkpoint: true + temporal_conv: false + temporal_attention: true + temporal_selfatt_only: true + use_relative_position: true + use_causal_attention: false + temporal_length: 16 + addition_attention: true + first_stage_config: + target: lvdm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 512 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + cond_stage_config: + target: 
lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder + params: + freeze: true + layer: penultimate diff --git a/VideoCrafter/configs/inference_t2v_512_v2.0.yaml b/VideoCrafter/configs/inference_t2v_512_v2.0.yaml new file mode 100644 index 0000000..4a2e6c4 --- /dev/null +++ b/VideoCrafter/configs/inference_t2v_512_v2.0.yaml @@ -0,0 +1,77 @@ +model: + target: lvdm.models.ddpm3d.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.012 + num_timesteps_cond: 1 + timesteps: 1000 + first_stage_key: video + cond_stage_key: caption + cond_stage_trainable: false + conditioning_key: crossattn + image_size: + - 40 + - 64 + channels: 4 + scale_by_std: false + scale_factor: 0.18215 + use_ema: false + uncond_type: empty_seq + use_scale: true + scale_b: 0.7 + unet_config: + target: lvdm.modules.networks.openaimodel3d.UNetModel + params: + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: + - 4 + - 2 + - 1 + num_res_blocks: 2 + channel_mult: + - 1 + - 2 + - 4 + - 4 + num_head_channels: 64 + transformer_depth: 1 + context_dim: 1024 + use_linear: true + use_checkpoint: true + temporal_conv: true + temporal_attention: true + temporal_selfatt_only: true + use_relative_position: false + use_causal_attention: false + temporal_length: 16 + addition_attention: true + fps_cond: true + first_stage_config: + target: lvdm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 512 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + cond_stage_config: + target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder + params: + freeze: true + layer: penultimate diff --git a/VideoCrafter/gradio_app.py b/VideoCrafter/gradio_app.py new file mode 100644 index 0000000..e8bfc8e --- /dev/null +++ b/VideoCrafter/gradio_app.py @@ -0,0 +1,58 @@ +import os 
+import sys +import gradio as gr +from scripts.gradio.t2v_test import Text2Video +sys.path.insert(1, os.path.join(sys.path[0], 'lvdm')) + +t2v_examples = [ + ['an elephant is walking under the sea, 4K, high definition',50, 12,1, 16], + ['an astronaut riding a horse in outer space',25,12,1,16], + ['a monkey is playing a piano',25,12,1,16], + ['A fire is burning on a candle',25,12,1,16], + ['a horse is drinking in the river',25,12,1,16], + ['Robot dancing in times square',25,12,1,16], +] + + +def videocrafter_demo(result_dir='./tmp/'): + text2video = Text2Video(result_dir) + with gr.Blocks(analytics_enabled=False) as videocrafter_iface: + gr.Markdown("

VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models

\ + Github
") + + #######t2v####### + with gr.Tab(label="Text2Video"): + with gr.Column(): + with gr.Row().style(equal_height=False): + with gr.Column(): + input_text = gr.Text(label='Prompts') + with gr.Row(): + steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id=f"steps", label="Sampling steps", value=50) + eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="eta") + with gr.Row(): + cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=12.0, elem_id="cfg_scale") + fps = gr.Slider(minimum=4, maximum=32, step=1, label='fps', value=16, elem_id="fps") + send_btn = gr.Button("Send") + with gr.Tab(label='result'): + with gr.Row(): + output_video_1 = gr.Video().style(width=512) + gr.Examples(examples=t2v_examples, + inputs=[input_text,steps,cfg_scale,eta], + outputs=[output_video_1], + fn=text2video.get_prompt, + cache_examples=False) + #cache_examples=os.getenv('SYSTEM') == 'spaces') + send_btn.click( + fn=text2video.get_prompt, + inputs=[input_text,steps,cfg_scale,eta,fps], + outputs=[output_video_1], + ) + + return videocrafter_iface + +if __name__ == "__main__": + result_dir = os.path.join('./', 'results') + videocrafter_iface = videocrafter_demo(result_dir) + videocrafter_iface.queue(concurrency_count=1, max_size=10) + videocrafter_iface.launch() + # videocrafter_iface.launch(server_name='0.0.0.0', server_port=80) \ No newline at end of file diff --git a/VideoCrafter/predict.py b/VideoCrafter/predict.py new file mode 100644 index 0000000..9c1d207 --- /dev/null +++ b/VideoCrafter/predict.py @@ -0,0 +1,155 @@ +# Prediction interface for Cog ⚙️ +# https://github.com/replicate/cog/blob/main/docs/python.md + + +import os +import sys +import argparse +import random +from omegaconf import OmegaConf +from einops import rearrange, repeat +import torch +import torchvision +from pytorch_lightning import seed_everything +from cog import BasePredictor, Input, Path + +sys.path.insert(0, "scripts/evaluation") +from funcs 
import ( + batch_ddim_sampling, + load_model_checkpoint, + load_image_batch, + get_filelist, +) +from utils.utils import instantiate_from_config + + +class Predictor(BasePredictor): + def setup(self) -> None: + """Load the model into memory to make running multiple predictions efficient""" + + ckpt_path_base = "checkpoints/base_1024_v1/model.ckpt" + config_base = "configs/inference_t2v_1024_v1.0.yaml" + ckpt_path_i2v = "checkpoints/i2v_512_v1/model.ckpt" + config_i2v = "configs/inference_i2v_512_v1.0.yaml" + + config_base = OmegaConf.load(config_base) + model_config_base = config_base.pop("model", OmegaConf.create()) + self.model_base = instantiate_from_config(model_config_base) + self.model_base = self.model_base.cuda() + self.model_base = load_model_checkpoint(self.model_base, ckpt_path_base) + self.model_base.eval() + + config_i2v = OmegaConf.load(config_i2v) + model_config_i2v = config_i2v.pop("model", OmegaConf.create()) + self.model_i2v = instantiate_from_config(model_config_i2v) + self.model_i2v = self.model_i2v.cuda() + self.model_i2v = load_model_checkpoint(self.model_i2v, ckpt_path_i2v) + self.model_i2v.eval() + + def predict( + self, + task: str = Input( + description="Choose the task.", + choices=["text2video", "image2video"], + default="text2video", + ), + prompt: str = Input( + description="Prompt for video generation.", + default="A tiger walks in the forest, photorealistic, 4k, high definition.", + ), + image: Path = Input( + description="Input image for image2video task.", default=None + ), + ddim_steps: int = Input(description="Number of denoising steps.", default=50), + unconditional_guidance_scale: float = Input( + description="Classifier-free guidance scale.", default=12.0 + ), + seed: int = Input( + description="Random seed. 
Leave blank to randomize the seed", default=None + ), + save_fps: int = Input( + description="Frame per second for the generated video.", default=10 + ), + ) -> Path: + + width = 1024 if task == "text2video" else 512 + height = 576 if task == "text2video" else 320 + model = self.model_base if task == "text2video" else self.model_i2v + + if task == "image2video": + assert image is not None, "Please provide image for image2video generation." + + if seed is None: + seed = int.from_bytes(os.urandom(2), "big") + print(f"Using seed: {seed}") + seed_everything(seed) + + args = argparse.Namespace( + mode="base" if task == "text2video" else "i2v", + savefps=save_fps, + n_samples=1, + ddim_steps=ddim_steps, + ddim_eta=1.0, + bs=1, + height=height, + width=width, + frames=-1, + fps=28 if task == "text2video" else 8, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_guidance_scale_temporal=None, + ) + + ## latent noise shape + h, w = args.height // 8, args.width // 8 + frames = model.temporal_length if args.frames < 0 else args.frames + channels = model.channels + + batch_size = 1 + noise_shape = [batch_size, channels, frames, h, w] + fps = torch.tensor([args.fps] * batch_size).to(model.device).long() + prompts = [prompt] + text_emb = model.get_learned_conditioning(prompts) + + if args.mode == "base": + cond = {"c_crossattn": [text_emb], "fps": fps} + elif args.mode == "i2v": + cond_images = load_image_batch([str(image)], (args.height, args.width)) + cond_images = cond_images.to(model.device) + img_emb = model.get_image_embeds(cond_images) + imtext_cond = torch.cat([text_emb, img_emb], dim=1) + cond = {"c_crossattn": [imtext_cond], "fps": fps} + else: + raise NotImplementedError + + ## inference + batch_samples = batch_ddim_sampling( + model, + cond, + noise_shape, + args.n_samples, + args.ddim_steps, + args.ddim_eta, + args.unconditional_guidance_scale, + ) + + out_path = "/tmp/output.mp4" + vid_tensor = batch_samples[0] + video = 
vid_tensor.detach().cpu() + video = torch.clamp(video.float(), -1.0, 1.0) + video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w + + frame_grids = [ + torchvision.utils.make_grid(framesheet, nrow=int(args.n_samples)) + for framesheet in video + ] # [3, 1*h, n*w] + grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w] + grid = (grid + 1.0) / 2.0 + grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) + torchvision.io.write_video( + out_path, + grid, + fps=args.savefps, + video_codec="h264", + options={"crf": "10"}, + ) + return Path(out_path) diff --git a/VideoCrafter/prompts/i2v_prompts/horse.png b/VideoCrafter/prompts/i2v_prompts/horse.png new file mode 100644 index 0000000..b0e8f56 Binary files /dev/null and b/VideoCrafter/prompts/i2v_prompts/horse.png differ diff --git a/VideoCrafter/prompts/i2v_prompts/seashore.png b/VideoCrafter/prompts/i2v_prompts/seashore.png new file mode 100644 index 0000000..f6c94a4 Binary files /dev/null and b/VideoCrafter/prompts/i2v_prompts/seashore.png differ diff --git a/VideoCrafter/prompts/i2v_prompts/test_prompts.txt b/VideoCrafter/prompts/i2v_prompts/test_prompts.txt new file mode 100644 index 0000000..4111c78 --- /dev/null +++ b/VideoCrafter/prompts/i2v_prompts/test_prompts.txt @@ -0,0 +1,2 @@ +horses are walking on the grassland +a boy and a girl are talking on the seashore diff --git a/VideoCrafter/prompts/test_prompts.txt b/VideoCrafter/prompts/test_prompts.txt new file mode 100644 index 0000000..7878436 --- /dev/null +++ b/VideoCrafter/prompts/test_prompts.txt @@ -0,0 +1,3 @@ +a bee is flying over a flower from left to right, photorealistic, high definition +a cat is on the left of a dog, photorealistic, high definition +a panda is dancing, photorealistic, high definition \ No newline at end of file diff --git a/VideoCrafter/requirements.txt b/VideoCrafter/requirements.txt new file mode 100644 index 0000000..e70353c --- /dev/null +++ b/VideoCrafter/requirements.txt @@ -0,0 +1,24 @@ +decord==0.6.0 
+einops==0.3.0 +imageio==2.9.0 +# numpy==1.24.2 +omegaconf==2.1.1 +opencv_python +# pandas==2.0.0 +Pillow==9.5.0 +pytorch_lightning==1.8.3 +PyYAML==6.0 +setuptools==65.6.3 +# torch==2.0.0 +# torchvision +tqdm==4.65.0 +# transformers==4.25.1 +moviepy +av +# xformers +gradio +timm +scikit-learn +open_clip_torch==2.22.0 +kornia +mmengine diff --git a/VideoCrafter/scripts/evaluation/ddp_wrapper.py b/VideoCrafter/scripts/evaluation/ddp_wrapper.py new file mode 100644 index 0000000..01853c1 --- /dev/null +++ b/VideoCrafter/scripts/evaluation/ddp_wrapper.py @@ -0,0 +1,46 @@ +import datetime +import argparse, importlib +from pytorch_lightning import seed_everything + +import torch +import torch.distributed as dist + +def setup_dist(local_rank): + if dist.is_initialized(): + return + torch.cuda.set_device(local_rank) + torch.distributed.init_process_group('nccl', init_method='env://') + + +def get_dist_info(): + if dist.is_available(): + initialized = dist.is_initialized() + else: + initialized = False + if initialized: + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + return rank, world_size + + +if __name__ == '__main__': + now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + parser = argparse.ArgumentParser() + parser.add_argument("--module", type=str, help="module name", default="inference") + parser.add_argument("--local_rank", type=int, nargs="?", help="for ddp", default=0) + args, unknown = parser.parse_known_args() + inference_api = importlib.import_module(args.module, package=None) + + inference_parser = inference_api.get_parser() + inference_args, unknown = inference_parser.parse_known_args() + + seed_everything(inference_args.seed) + setup_dist(args.local_rank) + torch.backends.cudnn.benchmark = True + rank, gpu_num = get_dist_info() + + print("@CoLVDM Inference [rank%d]: %s"%(rank, now)) + inference_api.run_inference(inference_args, gpu_num, rank) \ No newline at end of file diff --git 
a/VideoCrafter/scripts/evaluation/funcs.py b/VideoCrafter/scripts/evaluation/funcs.py new file mode 100644 index 0000000..f80c0e3 --- /dev/null +++ b/VideoCrafter/scripts/evaluation/funcs.py @@ -0,0 +1,195 @@ +import os, sys, glob +import numpy as np +from collections import OrderedDict +from decord import VideoReader, cpu +import cv2 + +import torch +import torchvision +sys.path.insert(1, os.path.join(sys.path[0], '..', '..')) +from lvdm.models.samplers.ddim import DDIMSampler + + +def batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1.0,\ + cfg_scale=1.0, temporal_cfg_scale=None, **kwargs): + ddim_sampler = DDIMSampler(model) + uncond_type = model.uncond_type + batch_size = noise_shape[0] + + ## construct unconditional guidance + if cfg_scale != 1.0: + if uncond_type == "empty_seq": + prompts = batch_size * [""] + #prompts = N * T * [""] ## if is_imgbatch=True + uc_emb = model.get_learned_conditioning(prompts) + elif uncond_type == "zero_embed": + c_emb = cond["c_crossattn"][0] if isinstance(cond, dict) else cond + uc_emb = torch.zeros_like(c_emb) + + ## process image embedding token + if hasattr(model, 'embedder'): + uc_img = torch.zeros(noise_shape[0],3,224,224).to(model.device) + ## img: b c h w >> b l c + uc_img = model.get_image_embeds(uc_img) + uc_emb = torch.cat([uc_emb, uc_img], dim=1) + + if isinstance(cond, dict): + uc = {key:cond[key] for key in cond.keys()} + uc.update({'c_crossattn': [uc_emb]}) + else: + uc = uc_emb + else: + uc = None + + x_T = None + batch_variants = [] + #batch_variants1, batch_variants2 = [], [] + for _ in range(n_samples): + if ddim_sampler is not None: + kwargs.update({"clean_cond": True}) + print(f'##### valid infos temporal_cfg_scale {temporal_cfg_scale} kwargs {kwargs.keys()}') + samples, _ = ddim_sampler.sample(S=ddim_steps, + conditioning=cond, + batch_size=noise_shape[0], + shape=noise_shape[1:], + verbose=False, + unconditional_guidance_scale=cfg_scale, + unconditional_conditioning=uc, + 
eta=ddim_eta, + temporal_length=noise_shape[2], + conditional_guidance_scale_temporal=temporal_cfg_scale, + x_T=x_T, + **kwargs + ) + ## reconstruct from latent to pixel space + batch_images = model.decode_first_stage_2DAE(samples) + batch_variants.append(batch_images) + ## batch, , c, t, h, w + batch_variants = torch.stack(batch_variants, dim=1) + return batch_variants + + +def get_filelist(data_dir, ext='*'): + file_list = glob.glob(os.path.join(data_dir, '*.%s'%ext)) + file_list.sort() + return file_list + +def get_dirlist(path): + list = [] + if (os.path.exists(path)): + files = os.listdir(path) + for file in files: + m = os.path.join(path,file) + if (os.path.isdir(m)): + list.append(m) + list.sort() + return list + + +def load_model_checkpoint(model, ckpt): + def load_checkpoint(model, ckpt, full_strict): + state_dict = torch.load(ckpt, map_location="cpu") + try: + ## deepspeed + new_pl_sd = OrderedDict() + for key in state_dict['module'].keys(): + new_pl_sd[key[16:]]=state_dict['module'][key] + model.load_state_dict(new_pl_sd, strict=full_strict) + except: + if "state_dict" in list(state_dict.keys()): + state_dict = state_dict["state_dict"] + model.load_state_dict(state_dict, strict=full_strict) + return model + load_checkpoint(model, ckpt, full_strict=True) + print('>>> model checkpoint loaded.') + return model + + +def load_prompts(prompt_file): + f = open(prompt_file, 'r') + prompt_list = [] + for idx, line in enumerate(f.readlines()): + l = line.strip() + if len(l) != 0: + prompt_list.append(l) + f.close() + return prompt_list + + +def load_video_batch(filepath_list, frame_stride, video_size=(256,256), video_frames=16): + ''' + Notice about some special cases: + 1. video_frames=-1 means to take all the frames (with fs=1) + 2. when the total video frames is less than required, padding strategy will be used (repreated last frame) + ''' + fps_list = [] + batch_tensor = [] + assert frame_stride > 0, "valid frame stride should be a positive interge!" 
+ for filepath in filepath_list: + padding_num = 0 + vidreader = VideoReader(filepath, ctx=cpu(0), width=video_size[1], height=video_size[0]) + fps = vidreader.get_avg_fps() + total_frames = len(vidreader) + max_valid_frames = (total_frames-1) // frame_stride + 1 + if video_frames < 0: + ## all frames are collected: fs=1 is a must + required_frames = total_frames + frame_stride = 1 + else: + required_frames = video_frames + query_frames = min(required_frames, max_valid_frames) + frame_indices = [frame_stride*i for i in range(query_frames)] + + ## [t,h,w,c] -> [c,t,h,w] + frames = vidreader.get_batch(frame_indices) + frame_tensor = torch.tensor(frames.asnumpy()).permute(3, 0, 1, 2).float() + frame_tensor = (frame_tensor / 255. - 0.5) * 2 + if max_valid_frames < required_frames: + padding_num = required_frames - max_valid_frames + frame_tensor = torch.cat([frame_tensor, *([frame_tensor[:,-1:,:,:]]*padding_num)], dim=1) + print(f'{os.path.split(filepath)[1]} is not long enough: {padding_num} frames padded.') + batch_tensor.append(frame_tensor) + sample_fps = int(fps/frame_stride) + fps_list.append(sample_fps) + + return torch.stack(batch_tensor, dim=0) + +from PIL import Image +def load_image_batch(filepath_list, image_size=(256,256)): + batch_tensor = [] + for filepath in filepath_list: + _, filename = os.path.split(filepath) + _, ext = os.path.splitext(filename) + if ext == '.mp4': + vidreader = VideoReader(filepath, ctx=cpu(0), width=image_size[1], height=image_size[0]) + frame = vidreader.get_batch([0]) + img_tensor = torch.tensor(frame.asnumpy()).squeeze(0).permute(2, 0, 1).float() + elif ext == '.png' or ext == '.jpg': + img = Image.open(filepath).convert("RGB") + rgb_img = np.array(img, np.float32) + #bgr_img = cv2.imread(filepath, cv2.IMREAD_COLOR) + #bgr_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB) + rgb_img = cv2.resize(rgb_img, (image_size[1],image_size[0]), interpolation=cv2.INTER_LINEAR) + img_tensor = torch.from_numpy(rgb_img).permute(2, 0, 1).float() 
+ else: + print(f'ERROR: <{ext}> image loading only support format: [mp4], [png], [jpg]') + raise NotImplementedError + img_tensor = (img_tensor / 255. - 0.5) * 2 + batch_tensor.append(img_tensor) + return torch.stack(batch_tensor, dim=0) + + +def save_videos(batch_tensors, savedir, filenames, fps=10): + # b,samples,c,t,h,w + n_samples = batch_tensors.shape[1] + for idx, vid_tensor in enumerate(batch_tensors): + video = vid_tensor.detach().cpu() + video = torch.clamp(video.float(), -1., 1.) + video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w + frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(n_samples)) for framesheet in video] #[3, 1*h, n*w] + grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w] + grid = (grid + 1.0) / 2.0 + grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) + savepath = os.path.join(savedir, f"{filenames[idx]}.mp4") + torchvision.io.write_video(savepath, grid, fps=fps, video_codec='h264', options={'crf': '10'}) + diff --git a/VideoCrafter/scripts/evaluation/inference.py b/VideoCrafter/scripts/evaluation/inference.py new file mode 100644 index 0000000..5ddcc68 --- /dev/null +++ b/VideoCrafter/scripts/evaluation/inference.py @@ -0,0 +1,137 @@ +import argparse, os, sys, glob, yaml, math, random +import datetime, time +import numpy as np +from omegaconf import OmegaConf +from collections import OrderedDict +from tqdm import trange, tqdm +from einops import repeat +from einops import rearrange, repeat +from functools import partial +import torch +from pytorch_lightning import seed_everything + +from funcs import load_model_checkpoint, load_prompts, load_image_batch, get_filelist, save_videos +from funcs import batch_ddim_sampling +from utils.utils import instantiate_from_config + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=20230211, help="seed for seed_everything") + parser.add_argument("--mode", default="base", type=str, help="which kind of 
inference mode: {'base', 'i2v'}") + parser.add_argument("--ckpt_path", type=str, default=None, help="checkpoint path") + parser.add_argument("--config", type=str, help="config (yaml) path") + parser.add_argument("--prompt_file", type=str, default=None, help="a text file containing many prompts") + parser.add_argument("--savedir", type=str, default=None, help="results saving path") + parser.add_argument("--savefps", type=str, default=10, help="video fps to generate") + parser.add_argument("--n_samples", type=int, default=1, help="num of samples per prompt",) + parser.add_argument("--ddim_steps", type=int, default=50, help="steps of ddim if positive, otherwise use DDPM",) + parser.add_argument("--ddim_eta", type=float, default=1.0, help="eta for ddim sampling (0.0 yields deterministic sampling)",) + parser.add_argument("--bs", type=int, default=1, help="batch size for inference") + parser.add_argument("--height", type=int, default=512, help="image height, in pixel space") + parser.add_argument("--width", type=int, default=512, help="image width, in pixel space") + parser.add_argument("--frames", type=int, default=-1, help="frames num to inference") + parser.add_argument("--fps", type=int, default=24) + parser.add_argument("--unconditional_guidance_scale", type=float, default=1.0, help="prompt classifier-free guidance") + parser.add_argument("--unconditional_guidance_scale_temporal", type=float, default=None, help="temporal consistency guidance") + ## for conditional i2v only + parser.add_argument("--cond_input", type=str, default=None, help="data dir of conditional input") + return parser + + +def run_inference(args, gpu_num, gpu_no, **kwargs): + ## step 1: model config + ## ----------------------------------------------------------------- + config = OmegaConf.load(args.config) + #data_config = config.pop("data", OmegaConf.create()) + model_config = config.pop("model", OmegaConf.create()) + model = instantiate_from_config(model_config) + model = model.cuda(gpu_no) + 
assert os.path.exists(args.ckpt_path), f"Error: checkpoint [{args.ckpt_path}] Not Found!" + model = load_model_checkpoint(model, args.ckpt_path) + model.eval() + + ## sample shape + assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!" + ## latent noise shape + h, w = args.height // 8, args.width // 8 + frames = model.temporal_length if args.frames < 0 else args.frames + channels = model.channels + + ## saving folders + os.makedirs(args.savedir, exist_ok=True) + + ## step 2: load data + ## ----------------------------------------------------------------- + assert os.path.exists(args.prompt_file), "Error: prompt file NOT Found!" + prompt_list = load_prompts(args.prompt_file) + num_samples = len(prompt_list) + filename_list = [f"{id+1:04d}" for id in range(num_samples)] + + samples_split = num_samples // gpu_num + residual_tail = num_samples % gpu_num + print(f'[rank:{gpu_no}] {samples_split}/{num_samples} samples loaded.') + indices = list(range(samples_split*gpu_no, samples_split*(gpu_no+1))) + if gpu_no == 0 and residual_tail != 0: + indices = indices + list(range(num_samples-residual_tail, num_samples)) + prompt_list_rank = [prompt_list[i] for i in indices] + + ## conditional input + if args.mode == "i2v": + ## each video or frames dir per prompt + cond_inputs = get_filelist(args.cond_input, ext='[mpj][pn][4gj]') # '[mpj][pn][4gj]' + assert len(cond_inputs) == num_samples, f"Error: conditional input ({len(cond_inputs)}) NOT match prompt ({num_samples})!" 
+ filename_list = [f"{os.path.split(cond_inputs[id])[-1][:-4]}" for id in range(num_samples)] + cond_inputs_rank = [cond_inputs[i] for i in indices] + + filename_list_rank = [filename_list[i] for i in indices] + + ## step 3: run over samples + ## ----------------------------------------------------------------- + start = time.time() + n_rounds = len(prompt_list_rank) // args.bs + n_rounds = n_rounds+1 if len(prompt_list_rank) % args.bs != 0 else n_rounds + for idx in range(0, n_rounds): + print(f'[rank:{gpu_no}] batch-{idx+1} ({args.bs})x{args.n_samples} ...') + idx_s = idx*args.bs + idx_e = min(idx_s+args.bs, len(prompt_list_rank)) + batch_size = idx_e - idx_s + filenames = filename_list_rank[idx_s:idx_e] + noise_shape = [batch_size, channels, frames, h, w] + fps = torch.tensor([args.fps]*batch_size).to(model.device).long() + + prompts = prompt_list_rank[idx_s:idx_e] + if isinstance(prompts, str): + prompts = [prompts] + #prompts = batch_size * [""] + text_emb = model.get_learned_conditioning(prompts) + + if args.mode == 'base': + cond = {"c_crossattn": [text_emb], "fps": fps} + elif args.mode == 'i2v': + #cond_images = torch.zeros(noise_shape[0],3,224,224).to(model.device) + cond_images = load_image_batch(cond_inputs_rank[idx_s:idx_e], (args.height, args.width)) + cond_images = cond_images.to(model.device) + img_emb = model.get_image_embeds(cond_images) + imtext_cond = torch.cat([text_emb, img_emb], dim=1) + cond = {"c_crossattn": [imtext_cond], "fps": fps} + else: + raise NotImplementedError + + ## inference + batch_samples = batch_ddim_sampling(model, cond, noise_shape, args.n_samples, \ + args.ddim_steps, args.ddim_eta, args.unconditional_guidance_scale, **kwargs) + ## b,samples,c,t,h,w + save_videos(batch_samples, args.savedir, filenames, fps=args.savefps) + + print(f"Saved in {args.savedir}. 
def seed_everything(seed):
    """Seed the PyTorch CPU and CUDA random number generators with `seed`."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True


def load_prompts(prompt_file):
    """Read prompts from a text file, one prompt per line.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. The file is closed even if reading fails.
    """
    with open(prompt_file, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [prompt for prompt in stripped_lines if prompt]


def load_model_checkpoint(model, ckpt):
    """Load the weights stored at `ckpt` into `model` (strict) and return it.

    Three checkpoint layouts are accepted:
      * a dict with a ``"module"`` entry (marked "deepspeed" in the original
        code): the first 16 characters of every key are dropped
        (presumably the ``_forward_module.`` wrapper prefix - confirm);
      * a dict with a ``"state_dict"`` entry;
      * a plain state dict.
    """
    state_dict = torch.load(ckpt, map_location="cpu")
    if "module" in state_dict:
        # Explicit layout check instead of a bare `except:` so that real
        # loading errors (and KeyboardInterrupt) are no longer swallowed.
        state_dict = OrderedDict(
            (key[16:], value) for key, value in state_dict["module"].items()
        )
    elif "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict, strict=True)
    print('>>> model checkpoint loaded.')
    return model


def count_params(model, verbose=False):
    """Return the total number of elements over `model.parameters()`.

    When `verbose` is true the count is also printed in millions.
    """
    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params


def check_istarget(name, para_list):
    """Return True if any entry of `para_list` is a substring of `name`.

    name: full name of source para
    para_list: partial name of target para
    """
    return any(para in name for para in para_list)


def instantiate_from_config(config):
    """Build the object described by `config`.

    `config["target"]` is a dotted import path and `config["params"]`
    (optional) the keyword arguments passed to it. The marker strings
    ``'__is_first_stage__'`` and ``'__is_unconditional__'`` yield None.

    Raises:
        KeyError: if `config` has no ``target`` and is not a marker string.
    """
    if "target" not in config:
        if config in ('__is_first_stage__', '__is_unconditional__'):
            return None
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


def get_obj_from_str(string, reload=False):
    """Resolve a dotted path ``"pkg.module.Name"`` to the object it names.

    With `reload` set, the module is re-imported before the lookup.
    """
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def get_parser():
    """Build the command-line parser for the SDS / inference script."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--seed", type=int, default=20230211, help="seed for seed_everything")
    parser.add_argument("--mode", default="base", type=str, help="which kind of inference mode: {'base', 'i2v'}")
    parser.add_argument("--ckpt_path", type=str, default=None, help="checkpoint path")
    parser.add_argument("--config", type=str, help="config (yaml) path")
    parser.add_argument("--prompt_file", type=str, default=None, help="a text file containing many prompts")
    parser.add_argument("--savedir", type=str, default=None, help="results saving path")
    # Was type=str with an int default, so a value given on the command line
    # arrived as a string while the default stayed an int.
    parser.add_argument("--savefps", type=int, default=16, help="video fps to generate")
    parser.add_argument("--n_samples", type=int, default=1, help="num of samples per prompt")
    # parser.add_argument("--ddim_steps", type=int, default=50, help="steps of ddim if positive, otherwise use DDPM",)
    # parser.add_argument("--ddim_eta", type=float, default=1.0, help="eta for ddim sampling (0.0 yields deterministic sampling)",)
    parser.add_argument("--bs", type=int, default=1, help="batch size for inference")
    parser.add_argument("--height", type=int, default=512, help="image height, in pixel space")
    parser.add_argument("--width", type=int, default=512, help="image width, in pixel space")
    parser.add_argument("--frames", type=int, default=-1, help="frames num to inference")
    parser.add_argument("--fps", type=int, default=24)
    parser.add_argument("--lr", type=float, default=0.05)
    parser.add_argument("--cfg", type=float, default=1.0, help="prompt classifier-free guidance")
    parser.add_argument("--cfg_temporal", type=float, default=0.0, help="temporal classifier-free guidance scale (0 disables it)")
    # parser.add_argument("--unconditional_guidance_scale_temporal", type=float, default=None, help="temporal consistency guidance")
    ## for conditional i2v only
    parser.add_argument("--cond_input", type=str, default=None, help="data dir of conditional input")

    parser.add_argument("--fp16", action="store_true", help="use float16 for training")
    parser.add_argument("--vram_O", action="store_true", help="optimization for low VRAM usage")
    parser.add_argument("--use_rgb", action="store_true", help="use rgb")

    return parser
def train_step(self, rgbs, cond, un_cond, cfg=10.0, cfg_temporal=0.0, as_latent=False):
    """Compute the score-distillation (SDS) loss for one optimisation step.

    Args:
        rgbs: tensor being optimised; its first dimension is the batch.
            Used directly as the latent when `as_latent` is true, otherwise
            passed through `self.model.encode_first_stage` first. The
            per-sample weight is broadcast with ``view(-1, 1, 1, 1, 1)``,
            so the latent must be 5-D.
        cond: conditioning passed to `self.model.apply_model`.
        un_cond: unconditional / negative conditioning.
        cfg: guidance scale applied to ``(pred_cond - pred_uncond)``.
        cfg_temporal: scale applied to ``(pred_cond - pred_static)``, where
            `pred_static` is predicted with ``no_temporal_attn=True``.
            A falsy value skips that extra model call.
        as_latent: whether `rgbs` is already a latent.

    Returns:
        Scalar loss ``0.5 * sum((latent - target) ** 2) / batch``, whose
        gradient w.r.t. the latent is ``weight * (pred - noise) / batch``.
    """
    batch_size = rgbs.shape[0]
    t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)
    rgbs = rgbs.to(self.weights_dtype)
    rgbs_latent = rgbs if as_latent else self.model.encode_first_stage(rgbs)

    # The diffusion model only supplies a fixed target; no gradient is
    # propagated through it.
    with torch.no_grad():
        noise = torch.randn_like(rgbs_latent).to(self.weights_dtype)
        rgb_noisy = self.model.q_sample(x_start=rgbs_latent, t=t, noise=noise).to(self.weights_dtype)
        t_model = t.to(self.weights_dtype)
        noise_pred_text = self.model.apply_model(rgb_noisy, t_model, cond)
        noise_pred_uncond = self.model.apply_model(rgb_noisy, t_model, un_cond)

        # perform guidance (high scale from paper!)
        noise_pred_cond = noise_pred_text + cfg * (noise_pred_text - noise_pred_uncond)
        if cfg_temporal:
            noise_pred_static = self.model.apply_model(
                rgb_noisy, t_model, cond, no_temporal_attn=True
            ).to(self.weights_dtype)
            noise_pred_cond = noise_pred_cond + cfg_temporal * (noise_pred_text - noise_pred_static)

        weight = (1 - self.alphas[t]).view(-1, 1, 1, 1, 1)
        grad = weight * (noise_pred_cond - noise)

    target = (rgbs_latent - grad).detach()
    # Cast BOTH operands to float32: previously only the input was cast, so
    # with float16 weights the two mse_loss operands had different dtypes.
    loss_sds = 0.5 * F.mse_loss(rgbs_latent.float(), target.float(), reduction='sum') / rgbs_latent.shape[0]
    return loss_sds
imageio.mimwrite(filename, grid, format='mp4', fps=8) + + + parser = get_parser() + opt = parser.parse_args() + seed_everything(opt.seed) + device = torch.device("cuda") + + weights_dtype = torch.float16 if opt.fp16 else torch.float32 + + ## step 1: model config + ## ----------------------------------------------------------------- + config = OmegaConf.load(opt.config) + model_config = config.pop("model", OmegaConf.create()) + # model = instantiate_from_config(model_config) + vc2 = VideoCrafter2(model_config, ckpt_path=opt.ckpt_path, weights_dtype=weights_dtype, device=device) + ## saving folders + os.makedirs(opt.savedir, exist_ok=True) + + ## step 2: load data + ## ----------------------------------------------------------------- + assert os.path.exists(opt.prompt_file), "Error: prompt file NOT Found!" + prompt_list = load_prompts(opt.prompt_file) + num_samples = len(prompt_list) + filename_list = [f"{id+1:04d}" for id in range(num_samples)] + + ## step 3: run over samples + ## ----------------------------------------------------------------- + for prompts, filename in zip(prompt_list, filename_list): + if isinstance(prompts, str): + prompts = [prompts] + with torch.no_grad(): + text_emb = vc2.model.get_learned_conditioning(prompts) + neg_prompt_emb = vc2.model.get_learned_conditioning(["text, watermark, copyright, blurry, nsfw"]) + + ## sample shape + assert (opt.height % 16 == 0) and (opt.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!" 
+ frames = vc2.model.temporal_length if opt.frames < 0 else opt.frames + batch_size = 1 + fps = torch.tensor([opt.fps]*batch_size).to(vc2.model.device).long() + + if opt.use_rgb: + # rgbs = torch.ones(batch_size, 3, frames, opt.height, opt.width).to(weights_dtype).to(device) + rgbs = torch.randn(batch_size, 3, 1, opt.height, opt.width).repeat(1, 1, frames, 1, 1).clamp(0, 1).to(weights_dtype).to(device) # works better + rgbs.requires_grad = True + + optimizer = torch.optim.Adam([rgbs], lr=opt.lr) + cond = {"c_crossattn": [text_emb], "fps": fps} + un_cond = {"c_crossattn": [neg_prompt_emb], "fps": fps} + for step in tqdm.tqdm(range(1001)): + optimizer.zero_grad() + loss_sds = vc2.train_step(rgbs, cond, un_cond, cfg=opt.cfg, cfg_temporal=opt.cfg_temporal, as_latent=False) + loss_sds.backward() + optimizer.step() + + if step % 100 == 0: + tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}") + video_path = os.path.join(opt.savedir, f"{filename}_sds_{step}.gif") + out = rgbs.detach().float().clamp(0, 1) + save_results(out.data.cpu(), video_path, fps=opt.savefps) + + else: + ## latent noise shape + h, w = opt.height // 8, opt.width // 8 + latent_channels = vc2.model.channels + rgbs_latent = torch.randn(batch_size, latent_channels, 1, h, w).repeat(1, 1, frames, 1, 1).to(device) + rgbs_latent.requires_grad = True + + optimizer = torch.optim.Adam([rgbs_latent], lr=opt.lr) + cond = {"c_crossattn": [text_emb], "fps": fps} + un_cond = {"c_crossattn": [neg_prompt_emb], "fps": fps} + for step in tqdm.tqdm(range(1001)): + optimizer.zero_grad() + loss_sds = vc2.train_step(rgbs_latent, cond, un_cond, cfg=opt.cfg, cfg_temporal=opt.cfg_temporal, as_latent=True) + loss_sds.backward() + optimizer.step() + if step % 100 == 0: + tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}") + # print(f"step: {step}, loss_sds: {loss_sds.item()}") + with torch.no_grad(): + if step % 100 == 0: + video_path = os.path.join(opt.savedir, f"{filename}_sds_{step}.gif") + # out = 
model.decode_first_stage_2DAE(rgbs_latent.detach()) + out = vc2.decode_latent(rgbs_latent.detach()) + out = out.float() + out = (out / 2 + 0.5).clamp(0, 1) + save_results(out.data.cpu(), video_path, fps=opt.savefps) + diff --git a/VideoCrafter/scripts/gradio/i2v_test.py b/VideoCrafter/scripts/gradio/i2v_test.py new file mode 100644 index 0000000..7059273 --- /dev/null +++ b/VideoCrafter/scripts/gradio/i2v_test.py @@ -0,0 +1,83 @@ +import os +import time +from omegaconf import OmegaConf +import torch +from scripts.evaluation.funcs import load_model_checkpoint, load_image_batch, save_videos, batch_ddim_sampling +from utils.utils import instantiate_from_config +from huggingface_hub import hf_hub_download + +class Image2Video(): + def __init__(self,result_dir='./tmp/',gpu_num=1) -> None: + self.download_model() + self.result_dir = result_dir + if not os.path.exists(self.result_dir): + os.mkdir(self.result_dir) + ckpt_path='checkpoints/i2v_512_v1/model.ckpt' + config_file='configs/inference_i2v_512_v1.0.yaml' + config = OmegaConf.load(config_file) + model_config = config.pop("model", OmegaConf.create()) + model_config['params']['unet_config']['params']['use_checkpoint']=False + model_list = [] + for gpu_id in range(gpu_num): + model = instantiate_from_config(model_config) + # model = model.cuda(gpu_id) + assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!" 
def get_image(self, image, prompt, steps=50, cfg_scale=12.0, eta=1.0, fps=16):
    """Generate a video from a conditioning image and a text prompt.

    Args:
        image: array accepted by `torch.from_numpy`; it is permuted with
            ``(2, 0, 1)``, so it must be 3-D (presumably H x W x C with
            values in 0..255 - verify against the caller).
        prompt: text prompt; also used (sanitised, first 30 characters) as
            the output file name.
        steps: DDIM steps, capped at 60.
        cfg_scale: guidance scale forwarded to `batch_ddim_sampling`.
        eta: DDIM eta forwarded to `batch_ddim_sampling`.
        fps: value stored under ``"fps"`` in the conditioning dict.

    Returns:
        ``<result_dir>/<sanitised prompt>.mp4``.
    """
    torch.cuda.empty_cache()
    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    start = time.time()
    gpu_id = 0
    steps = min(steps, 60)
    model = self.model_list[gpu_id].cuda()
    try:
        batch_size = 1
        channels = model.model.diffusion_model.in_channels
        frames = model.temporal_length
        # Hard-coded 320x512 frame size, divided by 8 for the latent grid.
        h, w = 320 // 8, 512 // 8
        noise_shape = [batch_size, channels, frames, h, w]

        # text cond
        text_emb = model.get_learned_conditioning([prompt])

        # img cond: move channels first and rescale 0..255 to -1..1
        img_tensor = torch.from_numpy(image).permute(2, 0, 1).float()
        img_tensor = (img_tensor / 255. - 0.5) * 2
        cond_images = img_tensor.unsqueeze(0).to(model.device)
        img_emb = model.get_image_embeds(cond_images)
        imtext_cond = torch.cat([text_emb, img_emb], dim=1)
        cond = {"c_crossattn": [imtext_cond], "fps": fps}

        ## inference
        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
        ## b,samples,c,t,h,w
        # str.replace is a no-op when the substring is absent, so the
        # original membership checks were redundant.
        prompt_str = prompt.replace("/", "_slash_").replace(" ", "_")[:30]

        save_videos(batch_samples, self.result_dir, filenames=[prompt_str], fps=self.save_fps)
        print(f"Saved in {prompt_str}. Time used: {(time.time() - start):.2f} seconds")
    finally:
        # Always offload, even when sampling or saving raised; previously a
        # failure left the model on the GPU.
        model.cpu()
    return os.path.join(self.result_dir, f"{prompt_str}.mp4")
def get_prompt(self, prompt, steps=50, cfg_scale=12.0, eta=1.0, fps=16):
    """Generate a video from a text prompt.

    Args:
        prompt: text prompt; also used (sanitised, first 30 characters) as
            the output file name.
        steps: DDIM steps, capped at 60.
        cfg_scale: guidance scale forwarded to `batch_ddim_sampling`.
        eta: DDIM eta forwarded to `batch_ddim_sampling`.
        fps: value stored under ``"fps"`` in the conditioning dict.

    Returns:
        ``<result_dir>/<sanitised prompt>.mp4``.
    """
    torch.cuda.empty_cache()
    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    start = time.time()
    gpu_id = 0
    steps = min(steps, 60)
    model = self.model_list[gpu_id].cuda()
    try:
        batch_size = 1
        channels = model.model.diffusion_model.in_channels
        frames = model.temporal_length
        # Hard-coded 320x512 frame size, divided by 8 for the latent grid.
        h, w = 320 // 8, 512 // 8
        noise_shape = [batch_size, channels, frames, h, w]

        # text cond
        text_emb = model.get_learned_conditioning([prompt])
        cond = {"c_crossattn": [text_emb], "fps": fps}

        ## inference
        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
        ## b,samples,c,t,h,w
        # str.replace is a no-op when the substring is absent, so the
        # original membership checks were redundant.
        prompt_str = prompt.replace("/", "_slash_").replace(" ", "_")[:30]

        save_videos(batch_samples, self.result_dir, filenames=[prompt_str], fps=self.save_fps)
        print(f"Saved in {prompt_str}. Time used: {(time.time() - start):.2f} seconds")
    finally:
        # Always offload, even when sampling or saving raised; previously a
        # failure left the model on the GPU.
        model.cpu()
    return os.path.join(self.result_dir, f"{prompt_str}.mp4")
+ --seed 123 \ + --mode 'base' \ + --ckpt_path $ckpt \ + --config $config \ + --savedir $res_dir/$name \ + --n_samples 1 \ + --bs 1 --height 320 --width 512 \ + --prompt_file $prompt_file \ + --cfg 10.0 --lr 0.1 --cfg_temporal 20.0 \ diff --git a/VideoCrafter/scripts/run_text2video.sh b/VideoCrafter/scripts/run_text2video.sh new file mode 100644 index 0000000..68205fb --- /dev/null +++ b/VideoCrafter/scripts/run_text2video.sh @@ -0,0 +1,21 @@ +name="base_512_v2" + +ckpt='checkpoints/base_512_v2/model.ckpt' +config='configs/inference_t2v_512_v2.0.yaml' + +prompt_file="prompts/test_prompts.txt" +res_dir="results" +export CUDA_VISIBLE_DEVICES=1 +python scripts/evaluation/inference.py \ +--seed 123 \ +--mode 'base' \ +--ckpt_path $ckpt \ +--config $config \ +--savedir $res_dir/$name \ +--n_samples 1 \ +--bs 1 --height 320 --width 512 \ +--unconditional_guidance_scale 12.0 \ +--ddim_steps 50 \ +--ddim_eta 1.0 \ +--prompt_file $prompt_file \ +--fps 28 diff --git a/arguments/__init__.py b/arguments/__init__.py new file mode 100644 index 0000000..c7dca57 --- /dev/null +++ b/arguments/__init__.py @@ -0,0 +1,185 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. 
class GroupParams:
    """Plain attribute container returned by `ParamGroup.extract`."""
    pass


class ParamGroup:
    """Expose a subclass's instance attributes as argparse options.

    A subclass assigns its defaults as attributes and then calls
    ``super().__init__(parser, name)``. Every attribute becomes a
    ``--<name>`` option; an attribute whose name starts with ``_`` loses the
    underscore and additionally gets a one-letter short flag (its first
    letter).
    """

    def __init__(self, parser: ArgumentParser, name : str, fill_none = False):
        """Register one option per attribute in a new argument group.

        Args:
            parser: parser that receives the argument group.
            name: title of the argument group.
            fill_none: when true every option defaults to None instead of
                the attribute value (the value still determines the type).
        """
        group = parser.add_argument_group(name)
        for key, value in vars(self).items():
            flags = []
            if key.startswith("_"):
                key = key[1:]
                flags.append("-" + key[0:1])
            flags.insert(0, "--" + key)

            value_type = type(value)
            default = None if fill_none else value
            if value_type is bool:
                group.add_argument(*flags, default=default, action="store_true")
            else:
                # A None default used to produce type=NoneType, which rejects
                # every command-line value; fall back to str for it.
                # NOTE(review): list/dict defaults still get type=list/dict,
                # which argparse applies to the raw string - confirm these
                # are only ever set from config files.
                arg_type = str if value is None else value_type
                group.add_argument(*flags, default=default, type=arg_type)

    def extract(self, args):
        """Copy the entries of `args` that belong to this group.

        An entry belongs to the group if its name, or its name prefixed with
        ``_``, is an attribute of this instance.
        """
        group = GroupParams()
        own = vars(self)
        for arg_name, arg_value in vars(args).items():
            if arg_name in own or ("_" + arg_name) in own:
                setattr(group, arg_name, arg_value)
        return group
+ self.defor_depth = 1 + self.posebase_pe = 10 + self.scale_rotation_pe = 2 + self.opacity_pe = 2 + self.timenet_width = 64 + self.timenet_output = 32 + self.bounds = 1.6 + self.plane_tv_weight = 0.0001 + self.time_smoothness_weight = 0.01 + self.l1_time_planes = 0.0001 + self.grid_merge = 'mul' + self.kplanes_config = { + 'grid_dimensions': 2, + 'input_coordinate_dim': 4, + 'output_coordinate_dim': 32, + 'resolution': [64, 64, 64, 25] + } + self.multires = [1, 2, 4, 8] + self.no_grid=False + self.no_ds=False + self.no_dr=False + self.no_do=True + self.no_dc=True + + + super().__init__(parser, "ModelHiddenParams") + +class OptimizationParams(ParamGroup): + def __init__(self, parser): + self.dataloader=False + self.prompt='' + self.func_name='' + self.scales=[] + self.obj_prompt='' + self.video_sds_type='' + self.pre_scale=False + self.iterations = 30_000 + self.coarse_iterations = 3000 + self.static_iterations = 700 + self.position_lr_init = 0.00016 + self.position_lr_final = 0.0000016 + self.position_lr_delay_mult = 0.01 + self.position_lr_max_steps = 20_000 + self.deformation_lr_init = 0.00016 + self.deformation_lr_final = 0.000016 + self.deformation_lr_delay_mult = 0.01 + self.grid_lr_init = 0.0016 + self.grid_lr_final = 0.00016 + + self.feature_lr = 0.0025 + self.opacity_lr = 0.05 + self.scaling_lr = 0.005 + self.rotation_lr = 0.001 + self.percent_dense = 0.01 + self.lambda_dssim = 0 + self.lambda_pts = 0 + self.lambda_zero123 = 0.5 + self.lambda_lpips = 0 + self.fine_rand_rate=1 + self.weight_constraint_init= 1 + self.weight_constraint_after = 0.2 + self.weight_decay_iteration = 5000 + self.opacity_reset_interval = 3000 + self.densification_interval = 100 + self.densify_from_iter = 500 + self.densify_until_iter = 15_000 + self.densify_grad_threshold_coarse = 0.0002 + self.densify_grad_threshold_fine_init = 0.0002 + self.densify_grad_threshold_after = 0.0002 + self.pruning_from_iter = 500 + self.pruning_interval = 100 + self.pruning_interval_fine = 100 + 
def get_combined_args(parser : ArgumentParser):
    """Merge command-line arguments with a saved ``cfg_args`` file.

    ``sys.argv[1:]`` is parsed with `parser`. If ``<model_path>/cfg_args``
    can be read, its content is evaluated as a ``Namespace(...)`` expression
    and used as the base configuration. Every command-line value that is
    not None overrides the corresponding saved value.

    Returns:
        A Namespace holding the merged settings.
    """
    args_cmdline = parser.parse_args(sys.argv[1:])
    cfgfile_string = "Namespace()"

    try:
        cfgfilepath = os.path.join(args_cmdline.model_path, "cfg_args")
        print("Looking for config file in", cfgfilepath)
        with open(cfgfilepath) as cfg_file:
            print("Config file found: {}".format(cfgfilepath))
            cfgfile_string = cfg_file.read()
    except (TypeError, FileNotFoundError):
        # TypeError: model_path is None. FileNotFoundError: the directory
        # has no cfg_args file (previously an uncaught crash). Both fall
        # back to an empty saved configuration.
        print("Config file not found in {}".format(args_cmdline.model_path))

    # SECURITY NOTE(review): eval() executes whatever cfg_args contains;
    # only point model_path at directories you trust.
    args_cfgfile = eval(cfgfile_string)

    merged_dict = vars(args_cfgfile).copy()
    merged_dict.update(
        (key, value) for key, value in vars(args_cmdline).items() if value is not None
    )
    return Namespace(**merged_dict)
grid_lr_init = 0.016, + grid_lr_final = 0.016, + densification_interval = 100, + opacity_reset_interval = 300, + lambda_lpips = 2, + lambda_dssim = 2, + lambda_pts = 0, + lambda_zero123 = 0.5, # not used + fine_rand_rate = 1, +) + +ModelParams = dict( + frame_num = 16, + name="rose", + rife=False, + cloud_path = [ + './data/8w/a_flower.ply', + './data/8w/a_butterfly.ply', + ] +) + +ModelHiddenParams = dict( + no_grid = True, + grid_merge = 'cat', # not used + # grid_merge = 'mul', + multires = [1, 2, 4, 8 ], # not used + defor_depth = 5, + net_width = 128, + plane_tv_weight = 0, + time_smoothness_weight = 0, + l1_time_planes = 0, + weight_decay_iteration=0, + bounds=2, + no_ds=True, + no_dr=True, + no_do=True, + no_dc=True, + kplanes_config = { + 'grid_dimensions': 2, + 'input_coordinate_dim': 4, + 'output_coordinate_dim': 32, + 'resolution': [64, 64, 64, 24] #8 is frame numbers/2 + } +) \ No newline at end of file diff --git a/arguments/comp_butterfly_flower_zs.py b/arguments/comp_butterfly_flower_zs.py new file mode 100644 index 0000000..e47fcf5 --- /dev/null +++ b/arguments/comp_butterfly_flower_zs.py @@ -0,0 +1,69 @@ +OptimizationParams = dict( + prompt='a butterfly flies towards the flower', + # first one is static + obj_prompt = [ + 'a flower', + 'a butterfly flying', + ], + scales = [1.0, 1,1], + func_name = 'traj_funcs.butterfly_flower.generate_coordinates', + video_sds_type = 'zeroscope', + cfg_temporal = 100, + cfg = 100, + static_iterations = 0, + coarse_iterations = 0, + iterations = 4500, + position_lr_max_steps = 20000, + position_lr_delay_mult = 1, + pruning_interval = 100, + pruning_interval_fine = 100000, + percent_dense = 0.01, + densify_grad_threshold_fine_init = 0.5, + densify_grad_threshold_coarse = 0.5, + densify_grad_threshold_after = 0.1, + deformation_lr_delay_mult = 1, + deformation_lr_init = 0.0002, + deformation_lr_final = 0.0002, + grid_lr_init = 0.016, + grid_lr_final = 0.016, + densification_interval = 100, + opacity_reset_interval = 
300, + lambda_lpips = 2, + lambda_dssim = 2, + lambda_pts = 0, + lambda_zero123 = 0.5, # not used + fine_rand_rate = 1, +) + +ModelParams = dict( + frame_num = 16, + name="rose", + rife=False, + cloud_path = [ + './data/8w/a_flower.ply', + './data/8w/a_butterfly.ply', + ] +) + +ModelHiddenParams = dict( + no_grid = True, + grid_merge = 'cat', # not used + multires = [1, 2, 4, 8 ], # not used + defor_depth = 5, + net_width = 128, + plane_tv_weight = 0, + time_smoothness_weight = 0, + l1_time_planes = 0, + weight_decay_iteration=0, + bounds=2, + no_ds=True, + no_dr=True, + no_do=True, + no_dc=True, + kplanes_config = { + 'grid_dimensions': 2, + 'input_coordinate_dim': 4, + 'output_coordinate_dim': 32, + 'resolution': [64, 64, 64, 24] #8 is frame numbers/2 + } +) diff --git a/arguments/comp_fish_rock_vc.py b/arguments/comp_fish_rock_vc.py new file mode 100644 index 0000000..bdd82f8 --- /dev/null +++ b/arguments/comp_fish_rock_vc.py @@ -0,0 +1,70 @@ +OptimizationParams = dict( + prompt='a fish swimming around a rock', + # first one is static + obj_prompt = [ + 'a flower', + 'a butterfly flying', + ], + scales = [1.0, 1,1], + func_name = 'traj_funcs.fish_rock.generate_coordinates', + video_sds_type = 'videocrafter2', + cfg = 20, + cfg_temporal = 10, + static_iterations = 0, + coarse_iterations = 0, + iterations = 4000, + position_lr_max_steps = 20000, + position_lr_delay_mult = 1, #1, + pruning_interval = 100, + pruning_interval_fine = 100000, + percent_dense = 0.01, + densify_grad_threshold_fine_init = 0.5, + densify_grad_threshold_coarse = 0.5, + densify_grad_threshold_after = 0.1, + deformation_lr_delay_mult = 1, + deformation_lr_init = 0.0002, + deformation_lr_final = 0.0002, + grid_lr_init = 0.016, + grid_lr_final = 0.016, + densification_interval = 100, + opacity_reset_interval = 300, + lambda_lpips = 2, + lambda_dssim = 2, + lambda_pts = 0, + lambda_zero123 = 0.5, # not used + fine_rand_rate = 1, +) + +ModelParams = dict( + frame_num = 16, + name="rose", + 
rife=False, + cloud_path = [ + './data/8w/a_rock.ply', + './data/8w/a_yellow_fish.ply', + ] +) + +ModelHiddenParams = dict( + no_grid = True, + grid_merge = 'cat', # not used + # grid_merge = 'mul', + multires = [1, 2, 4, 8 ], # not used + defor_depth = 5, + net_width = 128, + plane_tv_weight = 0, + time_smoothness_weight = 0, + l1_time_planes = 0, + weight_decay_iteration=0, + bounds=2, + no_ds=True, + no_dr=True, + no_do=True, + no_dc=True, + kplanes_config = { + 'grid_dimensions': 2, + 'input_coordinate_dim': 4, + 'output_coordinate_dim': 32, + 'resolution': [64, 64, 64, 24] #8 is frame numbers/2 + } +) \ No newline at end of file diff --git a/arguments/comp_fish_rock_zs.py b/arguments/comp_fish_rock_zs.py new file mode 100644 index 0000000..e879fad --- /dev/null +++ b/arguments/comp_fish_rock_zs.py @@ -0,0 +1,70 @@ +OptimizationParams = dict( + prompt='a fish swimming around a rock', + # first one is static + obj_prompt = [ + 'a static rock', + 'a fish swimming', + ], + scales = [1.0, 0.5], + func_name = 'traj_funcs.fish_rock.generate_coordinates', + video_sds_type = 'zeroscope', + cfg_temporal = 100, + cfg = 100, + static_iterations = 0, + coarse_iterations = 0, + iterations = 4000, + position_lr_max_steps = 20000, + position_lr_delay_mult = 1, #1, + pruning_interval = 100, + pruning_interval_fine = 100000, + percent_dense = 0.01, + densify_grad_threshold_fine_init = 0.5, + densify_grad_threshold_coarse = 0.5, + densify_grad_threshold_after = 0.1, + deformation_lr_delay_mult = 1, + deformation_lr_init = 0.0002, + deformation_lr_final = 0.0002, + grid_lr_init = 0.016, + grid_lr_final = 0.016, + densification_interval = 100, + opacity_reset_interval = 300, + lambda_lpips = 2, + lambda_dssim = 2, + lambda_pts = 0, + lambda_zero123 = 0.5, # not used + fine_rand_rate = 1, +) + +ModelParams = dict( + frame_num = 16, + name="rose", + rife=False, + cloud_path = [ + './data/8w/a_rock.ply', + './data/8w/a_yellow_fish.ply', + ] +) + +ModelHiddenParams = dict( + no_grid = 
True, + grid_merge = 'cat', # not used + # grid_merge = 'mul', + multires = [1, 2, 4, 8 ], # not used + defor_depth = 5, + net_width = 128, + plane_tv_weight = 0, + time_smoothness_weight = 0, + l1_time_planes = 0, + weight_decay_iteration=0, + bounds=2, + no_ds=True, + no_dr=True, + no_do=True, + no_dc=True, + kplanes_config = { + 'grid_dimensions': 2, + 'input_coordinate_dim': 4, + 'output_coordinate_dim': 32, + 'resolution': [64, 64, 64, 24] #8 is frame numbers/2 + } +) diff --git a/data/8w/a_butterfly.ply b/data/8w/a_butterfly.ply new file mode 100644 index 0000000..3ab4031 Binary files /dev/null and b/data/8w/a_butterfly.ply differ diff --git a/data/8w/a_flower.ply b/data/8w/a_flower.ply new file mode 100644 index 0000000..92def12 Binary files /dev/null and b/data/8w/a_flower.ply differ diff --git a/data/8w/a_rock.ply b/data/8w/a_rock.ply new file mode 100644 index 0000000..0dc61d2 Binary files /dev/null and b/data/8w/a_rock.ply differ diff --git a/data/8w/a_yellow_fish.ply b/data/8w/a_yellow_fish.ply new file mode 100644 index 0000000..b1c0a4c Binary files /dev/null and b/data/8w/a_yellow_fish.ply differ diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..f20799b --- /dev/null +++ b/environment.yml @@ -0,0 +1,78 @@ +name: 4DGen +channels: + - defaults +dependencies: + - anaconda/noarch::mesa-libegl-cos6-x86_64==11.0.7=4 + - conda-forge/linux-64::_libgcc_mutex==0.1=conda_forge + - defaults/linux-64::blas==1.0=mkl + - anaconda/linux-64::ca-certificates==2023.01.10=h06a4308_0 + - conda-forge/linux-64::ld_impl_linux-64==2.40=h41732ed_0 + - conda-forge/linux-64::libstdcxx-ng==13.2.0=h7e041cc_2 + - pytorch/noarch::pytorch-mutex==1.0=cuda + - anaconda/linux-64::certifi==2022.12.7=py37h06a4308_0 + - anaconda/linux-64::openssl==1.1.1s=h7f8727e_0 + - conda-forge/linux-64::_openmp_mutex==4.5=2_kmp_llvm + - conda-forge/linux-64::libgcc-ng==13.2.0=h807b86a_2 + - conda-forge/linux-64::bzip2==1.0.8=h7f98852_4 + - 
conda-forge/linux-64::cudatoolkit==11.6.2=hfc3e2af_12 + - conda-forge/linux-64::gmp==6.2.1=h58526e2_0 + - conda-forge/linux-64::icu==73.2=h59595ed_0 + - conda-forge/linux-64::jpeg==9e=h0b41bf4_3 + - conda-forge/linux-64::lame==3.100=h166bdaf_1003 + - conda-forge/linux-64::lerc==4.0.0=h27087fc_0 + - conda-forge/linux-64::libdeflate==1.14=h166bdaf_0 + - conda-forge/linux-64::libffi==3.3=h58526e2_2 + - conda-forge/linux-64::libiconv==1.17=h166bdaf_0 + - conda-forge/linux-64::libwebp-base==1.3.2=hd590300_0 + - conda-forge/linux-64::libzlib==1.2.13=hd590300_5 + - conda-forge/linux-64::ncurses==6.4=hcb278e6_0 + - conda-forge/linux-64::nettle==3.6=he412f7d_0 + - conda-forge/linux-64::pthread-stubs==0.4=h36c2ea0_1001 + - conda-forge/linux-64::xorg-libxau==1.0.11=hd590300_0 + - conda-forge/linux-64::xorg-libxdmcp==1.1.3=h7f98852_0 + - conda-forge/linux-64::xz==5.2.6=h166bdaf_0 + - conda-forge/linux-64::gnutls==3.6.13=h85f3911_1 + - conda-forge/linux-64::libpng==1.6.39=h753d276_0 + - conda-forge/linux-64::libsqlite==3.43.0=h2797004_0 + - conda-forge/linux-64::libxcb==1.13=h7f98852_1004 + - conda-forge/linux-64::libxml2==2.11.5=h232c23b_1 + - conda-forge/linux-64::readline==8.2=h8228510_1 + - conda-forge/linux-64::tk==8.6.13=h2797004_0 + - conda-forge/linux-64::zlib==1.2.13=hd590300_5 + - conda-forge/linux-64::zstd==1.5.5=hfc55251_0 + - conda-forge/linux-64::freetype==2.12.1=h267a509_2 + - conda-forge/linux-64::libhwloc==2.9.3=default_h554bfaf_1009 + - conda-forge/linux-64::libtiff==4.4.0=h82bc61c_5 + - conda-forge/linux-64::llvm-openmp==16.0.6=h4dfa4b3_0 + - conda-forge/linux-64::openh264==2.1.1=h780b84a_0 + - conda-forge/linux-64::sqlite==3.43.0=h2c6b66d_0 + - pytorch/linux-64::ffmpeg==4.3=hf484d3e_0 + - conda-forge/linux-64::lcms2==2.14=h6ed2654_0 + - conda-forge/linux-64::openjpeg==2.5.0=h7d73246_1 + - defaults/linux-64::python==3.7.13=haa1d7c7_1 + - conda-forge/linux-64::tbb==2021.10.0=h00ab1b0_1 + - conda-forge/noarch::charset-normalizer==3.2.0=pyhd8ed1ab_0 + - 
conda-forge/noarch::colorama==0.4.6=pyhd8ed1ab_0 + - conda-forge/noarch::idna==3.4=pyhd8ed1ab_0 + - conda-forge/linux-64::mkl==2021.4.0=h8d4b97c_729 + - conda-forge/linux-64::python_abi==3.7=2_cp37m + - conda-forge/noarch::setuptools==68.2.2=pyhd8ed1ab_0 + - conda-forge/noarch::six==1.16.0=pyh6c4a22f_0 + - conda-forge/noarch::typing_extensions==4.7.1=pyha770c72_0 + - conda-forge/noarch::wheel==0.41.2=pyhd8ed1ab_0 + - conda-forge/linux-64::brotli-python==1.0.9=py37hd23a5d3_7 + - conda-forge/linux-64::mkl-service==2.4.0=py37h402132d_0 + - conda-forge/linux-64::pillow==9.2.0=py37h850a105_2 + - conda-forge/noarch::pip==22.3.1=pyhd8ed1ab_0 + - conda-forge/linux-64::pysocks==1.7.1=py37h89c1867_5 + - pytorch/linux-64::pytorch==1.12.1=py3.7_cuda11.6_cudnn8.3.2_0 + - conda-forge/noarch::tqdm==4.66.1=pyhd8ed1ab_0 + - defaults/linux-64::numpy-base==1.21.5=py37ha15fc14_3 + - conda-forge/noarch::urllib3==2.0.5=pyhd8ed1ab_0 + - conda-forge/noarch::requests==2.31.0=pyhd8ed1ab_0 + - conda-forge/linux-64::mkl_fft==1.3.1=py37h3e078e5_1 + - conda-forge/linux-64::mkl_random==1.2.2=py37h219a48f_0 + - defaults/linux-64::numpy==1.21.5=py37h6c91a56_3 + - conda-forge/noarch::plyfile==0.8.1=pyhd8ed1ab_0 + - pytorch/linux-64::torchaudio==0.12.1=py37_cu116 + - pytorch/linux-64::torchvision==0.13.1=py37_cu116 diff --git a/gaussian_renderer/__init__.py b/gaussian_renderer/__init__.py new file mode 100644 index 0000000..215ce9f --- /dev/null +++ b/gaussian_renderer/__init__.py @@ -0,0 +1,240 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. 
#
# For inquiries contact george.drettakis@inria.fr
#

import torch
import math
from diff_gaussian_rasterization import GaussianRasterizationSettings, GaussianRasterizer
from scene.gaussian_model import GaussianModel
from utils.sh_utils import eval_sh


def _expand_time(time, num_points, device):
    """Validate a scalar timestamp in [0, 1] and tile it with ``repeat(num_points, 1)``.

    ``time`` may be a single-element tensor or a plain number. Anything else
    (for example a multi-element tensor) raises instead of being silently
    swallowed, which the previous bare ``except:`` allowed.
    """
    if torch.is_tensor(time):
        # .item() raises for tensors holding more than one element.
        value = time.item()
        time_tensor = time
    else:
        value = time
        time_tensor = torch.tensor([time])
    assert value >= 0 and value <= 1
    return time_tensor.to(device).repeat(num_points, 1)


def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, time=torch.tensor([[0]]), scaling_modifier = 1.0, override_color = None, stage=None, render_flow=False, return_pts=False, offset=None, scales_preset=None, pre_scale=False):
    """
    Render the scene.

    Background tensor (bg_color) must be on GPU!

    Args:
        viewpoint_camera: camera providing FoVx/FoVy, image size, the view and
            projection transforms and the camera center.
        pc: Gaussian model whose raw parameters (``_opacity``, ``_features_dc``,
            ``_scaling``, ``_rotation``, ``_deformation_table``) are rendered.
        pipe: pipeline flags (``debug``, ``compute_cov3D_python``,
            ``convert_SHs_python``).
        bg_color: background color handed to the rasterizer.
        time: scalar timestamp in [0, 1], as a one-element tensor or a number.
        scaling_modifier: must be 1 (asserted).
        override_color: if given, used as precomputed per-Gaussian colors.
        stage: ``"static"`` skips the deformation network; any other value
            runs ``pc._deformation``. ``None`` raises ``NotImplementedError``.
        render_flow: when true and ``stage == 'coarse'``, additionally
            rasterizes the position deltas as colors.
        return_pts: when true, the final means and raw opacities are returned.
        offset, scales_preset, pre_scale: when ``pre_scale`` is true the means
            are multiplied by ``scales_preset`` and passed through the callable
            ``offset`` before deformation.

    Returns:
        dict with ``render``, ``viewspace_points``, ``visibility_filter``,
        ``radii``, ``alpha`` and ``depth``; plus ``dx``/``ds``/``dr``/``do``/``dc``
        deltas when the deformation network ran, ``rendered_flow`` for the
        flow pass, and ``means3D``/``means2D``/``opacity_final`` when
        ``return_pts`` is set.
    """
    assert scaling_modifier == 1
    if stage is None:
        raise NotImplementedError

    # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
    screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
    try:
        screenspace_points.retain_grad()
    except Exception:
        # Deliberate best effort: rendering still works without retained grads.
        pass

    # Set up rasterization configuration
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)

    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform.cuda(),
        projmatrix=viewpoint_camera.full_proj_transform.cuda(),
        sh_degree=pc.active_sh_degree,
        campos=viewpoint_camera.camera_center.cuda(),
        prefiltered=False,
        debug=pipe.debug
    )

    rasterizer = GaussianRasterizer(raster_settings=raster_settings)

    means3D = pc.get_xyz

    if pre_scale:
        means3D = means3D * scales_preset
        means3D = offset(means3D)

    time = _expand_time(time, means3D.shape[0], means3D.device)

    means2D = screenspace_points
    opacity = pc._opacity
    color = pc._features_dc
    color = color[:, 0, :]  # [N, 1, 3] -> [N, 3]

    # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
    # scaling / rotation by the rasterizer.
    scales = None
    rotations = None
    cov3D_precomp = None

    dx = None
    if pipe.compute_cov3D_python:
        # NOTE(review): scales/rotations stay None on this path, yet they are
        # indexed below -- this branch looks unsupported; confirm before use.
        cov3D_precomp = pc.get_covariance(scaling_modifier)
    else:
        scales = pc._scaling
        if scales.shape[-1] == 1:
            # Isotropic scale stored as one channel: expand to three axes.
            scales = scales.repeat(1, 3)
        rotations = pc._rotation
    deformation_point = pc._deformation_table

    if stage == "static":
        means3D_deform, scales_deform, rotations_deform, opacity_deform, color_deform = means3D, scales, rotations, opacity, color
    else:
        # Inputs are detached, so gradients only reach the deformation network.
        means3D_deform, scales_deform, rotations_deform, opacity_deform, color_deform = pc._deformation(
            means3D[deformation_point].detach(),
            scales[deformation_point].detach(),
            rotations[deformation_point].detach(),
            opacity[deformation_point].detach(),
            color[deformation_point].detach(),
            time[deformation_point].detach(),
        )
        dx = (means3D_deform - means3D[deformation_point].detach())
        ds = (scales_deform - scales[deformation_point].detach())
        dr = (rotations_deform - rotations[deformation_point].detach())
        do = (opacity_deform - opacity[deformation_point].detach())
        dc = (color_deform - color[deformation_point].detach())

    # Scatter deformed values into full-size tensors; Gaussians outside the
    # deformation table keep their original parameters.
    means3D_final = torch.zeros_like(means3D)
    rotations_final = torch.zeros_like(rotations)
    scales_final = torch.zeros_like(scales)
    opacity_final = torch.zeros_like(opacity)
    color_final = torch.zeros_like(color)
    means3D_final[deformation_point] = means3D_deform
    rotations_final[deformation_point] = rotations_deform
    scales_final[deformation_point] = scales_deform
    opacity_final[deformation_point] = opacity_deform
    color_final[deformation_point] = color_deform

    means3D_final[~deformation_point] = means3D[~deformation_point]
    rotations_final[~deformation_point] = rotations[~deformation_point]
    scales_final[~deformation_point] = scales[~deformation_point]
    opacity_final[~deformation_point] = opacity[~deformation_point]
    color_final[~deformation_point] = color[~deformation_point]
    color_final = torch.unsqueeze(color_final, 1)  # [N, 3] -> [N, 1, 3]

    scales_final = pc.scaling_activation(scales_final)
    rotations_final = pc.rotation_activation(rotations_final)
    # NOTE(review): the undeformed opacity is activated here, not opacity_final;
    # kept as in the original -- confirm this is intended.
    opacity = pc.opacity_activation(opacity)
    # color is used without activation

    # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
    # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
    shs = None
    colors_precomp = None

    if override_color is None:
        if pipe.convert_SHs_python:
            shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
            dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.cuda().repeat(pc.get_features.shape[0], 1))
            dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
            sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
            colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
        else:
            # Use a dedicated name: the original reused `dc` here and thereby
            # overwrote the colour delta that is reported as res['dc'].
            features_dc = color_final
            features_rest = pc.get_features_rest
            shs = torch.cat((features_dc, features_rest), dim=1)
    else:
        colors_precomp = override_color

    # Rasterize visible Gaussians to image, obtain their radii (on screen).
    rendered_image, radii, depth, alpha = rasterizer(
        means3D = means3D_final,
        means2D = means2D,
        shs = shs,
        colors_precomp = colors_precomp,
        opacities = opacity,
        scales = scales_final,
        rotations = rotations_final,
        cov3D_precomp = cov3D_precomp)

    # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
    # They will be excluded from value updates used in the splitting criteria.
    res = {
        "render": rendered_image,
        "viewspace_points": screenspace_points,
        "visibility_filter" : radii > 0,
        "radii": radii,
        "alpha": alpha,
        "depth":depth,
    }
    if dx is not None:
        res['dx'] = dx
        res['ds'] = ds
        res['dr'] = dr
        res['do'] = do
        res['dc'] = dc

    if render_flow and stage == 'coarse':
        flow_screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
        try:
            flow_screenspace_points.retain_grad()
        except Exception:
            pass
        # Second pass: position deltas are rasterized in place of colors.
        rendered_flow, _, _, _ = rasterizer(
            means3D = means3D_final,
            means2D = flow_screenspace_points,
            shs = None,
            colors_precomp = dx,
            opacities = opacity,
            scales = scales_final,
            rotations = rotations_final,
            cov3D_precomp = cov3D_precomp
        )
        res['rendered_flow'] = rendered_flow
    if return_pts:
        res['means3D'] = means3D_final
        res['means2D'] = means2D
        res['opacity_final'] = opacity_final
    return res

# ---------------------------------------------------------------------------
# diff --git a/gaussian_renderer/comp_renderer.py b/gaussian_renderer/comp_renderer.py
# new file mode 100644
# index 0000000..e2b57e3
# --- /dev/null
# +++ b/gaussian_renderer/comp_renderer.py
# @@ -0,0 +1,244 @@
# ---------------------------------------------------------------------------
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

import torch
import math
from diff_gaussian_rasterization import GaussianRasterizationSettings, GaussianRasterizer
from scene.gaussian_model import GaussianModel
from utils.sh_utils import eval_sh


def _expand_time(time, num_points, device):
    """Validate a scalar timestamp in [0, 1] and tile it with ``repeat(num_points, 1)``.

    ``time`` may be a single-element tensor or a plain number; other inputs
    raise instead of being hidden by a bare ``except:``.
    """
    if torch.is_tensor(time):
        # .item() raises for tensors holding more than one element.
        value = time.item()
        time_tensor = time
    else:
        value = time
        time_tensor = torch.tensor([time])
    assert value >= 0 and value <= 1
    return time_tensor.to(device).repeat(num_points, 1)


def _prepare_single_gs_full(pc, time, stage='fine', xyz_offset=None, scales_preset=None, pre_scale=True):
    """Compute the rasterizer inputs of one Gaussian model at ``time``.

    Returns the eight values of ``prepare_single_gs`` followed by the raw
    (pre-activation) per-Gaussian opacity after deformation, which the
    compositional ``render`` needs for ``return_pts``.
    """
    # Zero tensor used to make pytorch return gradients of the 2D (screen-space) means.
    screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
    try:
        screenspace_points.retain_grad()
    except Exception:
        # Deliberate best effort: rendering still works without retained grads.
        pass

    means3D = pc.get_xyz
    if pre_scale:
        # Place the object in the scene before deformation.
        means3D = means3D * scales_preset
        means3D = xyz_offset(means3D)

    time = _expand_time(time, means3D.shape[0], means3D.device)

    means2D = screenspace_points
    opacity = pc._opacity
    color = pc._features_dc
    color = color[:, 0, :]  # [N, 1, 3] -> [N, 3]

    scales = pc._scaling
    if scales.shape[-1] == 1:
        # Isotropic scale stored as one channel: expand to three axes.
        scales = scales.repeat(1, 3)
    rotations = pc._rotation
    deformation_point = pc._deformation_table

    if stage == "static":
        means3D_deform, scales_deform, rotations_deform, opacity_deform, color_deform = means3D, scales, rotations, opacity, color
    else:
        # Inputs are detached, so gradients only reach the deformation network.
        means3D_deform, scales_deform, rotations_deform, opacity_deform, color_deform = pc._deformation(
            means3D[deformation_point].detach(),
            scales[deformation_point].detach(),
            rotations[deformation_point].detach(),
            opacity[deformation_point].detach(),
            color[deformation_point].detach(),
            time[deformation_point].detach(),
        )

    # Scatter deformed values into full-size tensors; Gaussians outside the
    # deformation table keep their original parameters.
    means3D_final = torch.zeros_like(means3D)
    rotations_final = torch.zeros_like(rotations)
    scales_final = torch.zeros_like(scales)
    opacity_final = torch.zeros_like(opacity)
    color_final = torch.zeros_like(color)
    means3D_final[deformation_point] = means3D_deform
    rotations_final[deformation_point] = rotations_deform
    scales_final[deformation_point] = scales_deform
    opacity_final[deformation_point] = opacity_deform
    color_final[deformation_point] = color_deform

    means3D_final[~deformation_point] = means3D[~deformation_point]
    rotations_final[~deformation_point] = rotations[~deformation_point]
    scales_final[~deformation_point] = scales[~deformation_point]
    opacity_final[~deformation_point] = opacity[~deformation_point]
    color_final[~deformation_point] = color[~deformation_point]
    color_final = torch.unsqueeze(color_final, 1)  # [N, 3] -> [N, 1, 3]

    scales_final = pc.scaling_activation(scales_final)
    rotations_final = pc.rotation_activation(rotations_final)
    # NOTE(review): the undeformed opacity is activated here, not opacity_final;
    # kept as in the original -- confirm this is intended.
    opacity = pc.opacity_activation(opacity)
    # color is used without activation

    if not pre_scale:
        # Place the object in the scene after deformation instead.
        means3D_final = means3D_final * scales_preset
        means3D_final = xyz_offset(means3D_final)
    # NOTE(review): nesting reconstructed from whitespace-collapsed source; dx is
    # computed unconditionally because render() concatenates it for every object.
    dx = (means3D_final - means3D.detach())

    rest = pc.get_features_rest
    shs = torch.cat((color_final, rest), dim=1)
    return means3D_final, means2D, shs, opacity, scales_final, rotations_final, screenspace_points, dx, opacity_final


def prepare_single_gs(pc, time, stage='fine', xyz_offset=None, scales_preset=None, pre_scale=True):
    """Compute the rasterizer inputs of one Gaussian model at ``time``.

    Args:
        pc: Gaussian model exposing the raw parameters and activations used below.
        time: scalar timestamp in [0, 1], as a one-element tensor or a number.
        stage: ``"static"`` skips the deformation network; any other value runs it.
        xyz_offset: callable applied to the (scaled) means to position the object.
        scales_preset: factor the means are multiplied by.
        pre_scale: apply scale/offset before (True) or after (False) deformation.

    Returns:
        ``(means3D_final, means2D, shs, opacity, scales_final, rotations_final,
        screenspace_points, dx)``.
    """
    return _prepare_single_gs_full(
        pc, time, stage=stage, xyz_offset=xyz_offset,
        scales_preset=scales_preset, pre_scale=pre_scale,
    )[:8]


def move(x, axis, time):
    """Overwrite ``x[axis]`` in place with ``0.5 + 0.2 * time`` and return ``x``."""
    x[axis:axis+1] = 0.5 + 0.2 * time.to(x.device)
    return x


def placeholder(idx, time):
    """Return a zero 3-vector on the GPU; ``idx`` and ``time`` are ignored."""
    return torch.tensor([0, 0, 0], dtype=torch.float32, device="cuda", requires_grad=False)


def placeholder2(idx, time):
    """Return a 4x4 identity matrix on the GPU; ``idx`` and ``time`` are ignored."""
    return torch.eye(4, dtype=torch.float32, device="cuda", requires_grad=False)


def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, time=torch.tensor([[0]]), scaling_modifier = 1.0, override_color = None, stage=None, render_flow=False, return_pts=False, offset=[], scales_list=[], pre_scale=False):
    """
    Render the scene.

    Background tensor (bg_color) must be on GPU!

    Here ``pc`` is iterated as a sequence of Gaussian models; object ``i`` is
    positioned with ``offset[i]`` (a callable) and ``scales_list[i]``, and all
    objects are concatenated into a single rasterization call.

    Args:
        viewpoint_camera: camera providing FoVx/FoVy, image size, the view and
            projection transforms and the camera center.
        pc: iterable of Gaussian models.
        pipe: pipeline flags; only ``debug`` is read here.
        bg_color: background color handed to the rasterizer.
        time: scalar timestamp in [0, 1], as a one-element tensor or a number.
        scaling_modifier: must be 1 (asserted).
        override_color, render_flow: accepted for signature parity; unused here.
        stage: must not be ``None`` (raises ``NotImplementedError``).
        return_pts: when true, the final means and raw opacities are returned.
        offset, scales_list: per-object placement; the list defaults are never
            mutated and are kept to preserve the signature.
        pre_scale: forwarded to the per-object preparation.

    Returns:
        dict with ``render``, ``viewspace_points``, ``visibility_filter``,
        ``radii``, ``alpha``, ``depth`` and ``dx``; plus ``means3D``/``means2D``/
        ``opacity_final`` when ``return_pts`` is set.
    """
    assert scaling_modifier == 1
    if stage is None:
        raise NotImplementedError

    # Set up rasterization configuration
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)

    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform.cuda(),
        projmatrix=viewpoint_camera.full_proj_transform.cuda(),
        sh_degree=0,
        campos=viewpoint_camera.camera_center.cuda(),
        prefiltered=False,
        debug=pipe.debug
    )

    rasterizer = GaussianRasterizer(raster_settings=raster_settings)

    # One list per value returned by the per-object preparation.
    # NOTE(review): `stage` is not forwarded, so every object uses the helper's
    # default 'fine' stage, exactly as in the original -- confirm intended.
    columns = [[] for _ in range(9)]
    for i, gaussians in enumerate(pc):
        parts = _prepare_single_gs_full(
            gaussians, time, xyz_offset=offset[i],
            scales_preset=scales_list[i], pre_scale=pre_scale,
        )
        for column, part in zip(columns, parts):
            column.append(part)

    (means3D_final, means2D, shs, opacity, scales_final, rotations_final,
     screenspace_points, dx, opacity_final) = [torch.cat(column, dim=0) for column in columns]

    rendered_image, radii, depth, alpha = rasterizer(
        means3D = means3D_final,
        means2D = means2D,
        shs = shs,
        colors_precomp = None,
        opacities = opacity,
        scales = scales_final,
        rotations = rotations_final,
        cov3D_precomp = None
    )

    # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
    # They will be excluded from value updates used in the splitting criteria.
    res = {
        "render": rendered_image,
        "viewspace_points": screenspace_points,
        "visibility_filter" : radii > 0,
        "radii": radii,
        "alpha": alpha,
        "depth":depth,
    }
    if dx is not None:
        res['dx'] = dx

    if return_pts:
        res['means3D'] = means3D_final
        res['means2D'] = means2D
        # Previously a NameError: opacity_final was never defined in this scope.
        res['opacity_final'] = opacity_final
    return res

# ---------------------------------------------------------------------------
# diff --git a/gaussian_renderer/network_gui.py b/gaussian_renderer/network_gui.py
# new file mode 100644
# index 0000000..df2f9da
# --- /dev/null
# +++ b/gaussian_renderer/network_gui.py
# @@ -0,0 +1,86 @@
# ---------------------------------------------------------------------------
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
+# +# For inquiries contact george.drettakis@inria.fr +# + +import torch +import traceback +import socket +import json +from scene.cameras import MiniCam + +host = "127.0.0.1" +port = 6009 + +conn = None +addr = None + +listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + +def init(wish_host, wish_port): + global host, port, listener + host = wish_host + port = wish_port + listener.bind((host, port)) + listener.listen() + listener.settimeout(0) + +def try_connect(): + global conn, addr, listener + try: + conn, addr = listener.accept() + print(f"\nConnected by {addr}") + conn.settimeout(None) + except Exception as inst: + pass + +def read(): + global conn + messageLength = conn.recv(4) + messageLength = int.from_bytes(messageLength, 'little') + message = conn.recv(messageLength) + return json.loads(message.decode("utf-8")) + +def send(message_bytes, verify): + global conn + if message_bytes != None: + conn.sendall(message_bytes) + conn.sendall(len(verify).to_bytes(4, 'little')) + conn.sendall(bytes(verify, 'ascii')) + +def receive(): + message = read() + + width = message["resolution_x"] + height = message["resolution_y"] + + if width != 0 and height != 0: + try: + do_training = bool(message["train"]) + fovy = message["fov_y"] + fovx = message["fov_x"] + znear = message["z_near"] + zfar = message["z_far"] + do_shs_python = bool(message["shs_python"]) + do_rot_scale_python = bool(message["rot_scale_python"]) + keep_alive = bool(message["keep_alive"]) + scaling_modifier = message["scaling_modifier"] + world_view_transform = torch.reshape(torch.tensor(message["view_matrix"]), (4, 4)).cuda() + world_view_transform[:,1] = -world_view_transform[:,1] + world_view_transform[:,2] = -world_view_transform[:,2] + full_proj_transform = torch.reshape(torch.tensor(message["view_projection_matrix"]), (4, 4)).cuda() + full_proj_transform[:,1] = -full_proj_transform[:,1] + custom_cam = MiniCam(width, height, fovy, fovx, znear, zfar, world_view_transform, 
full_proj_transform) + except Exception as e: + print("") + traceback.print_exc() + raise e + return custom_cam, do_training, do_shs_python, do_rot_scale_python, keep_alive, scaling_modifier + else: + return None, None, None, None, None, None \ No newline at end of file diff --git a/guidance/animatediff_utils.py b/guidance/animatediff_utils.py new file mode 100644 index 0000000..9922db1 --- /dev/null +++ b/guidance/animatediff_utils.py @@ -0,0 +1,336 @@ +from transformers import CLIPTextModel, CLIPTokenizer, logging +from diffusers import ( + AutoencoderKL, + UNet2DConditionModel, + DDIMScheduler, + StableDiffusionPipeline, +) +import torchvision.transforms.functional as TF + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys +sys.path.append('./') + + +import os +from omegaconf import OmegaConf +from einops import rearrange +import sys +import argparse +import numpy as np +import torch +from torch import nn +import torch.nn.functional as F +from torchvision.utils import save_image +from torchvision import io +from tqdm import tqdm +from datetime import datetime +import random +import imageio +from pathlib import Path +import shutil +import logging +from diffusers.utils.import_utils import is_xformers_available +# from diffusers import StableDiffusionPipeline +from transformers import CLIPTextModel, CLIPTokenizer +from transformers import logging as transformers_logging +transformers_logging.set_verbosity_error() # disable warning +from animatediff.pipelines.pipeline_old import AnimationPipeline +# from animatediff.pipelines.pipeline_animation import AnimationPipeline +from diffusers import AutoencoderKL, UNet2DConditionModel +from diffusers import DDIMScheduler +from animatediff.models.unet import UNet3DConditionModel +from animatediff.utils.util import load_weights +from animatediff.utils.util import save_videos_grid + +class AnimateDiff(nn.Module): + def __init__(self, 
device='cuda',use_textual_inversion=False): + inference_config=OmegaConf.load("animatediff/configs/inference/inference-v2.yaml") + pretrained_model_path="animatediff/animatediff_models/StableDiffusion/stable-diffusion-v1-5" + # pretrained_model_path="runwayml/stable-diffusion-v1-5" + self.pretrained_model_path = pretrained_model_path + tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder") + if use_textual_inversion: + inversion_path = None # TODO: CHANGE this! + text_encoder = CLIPTextModel.from_pretrained(inversion_path, subfolder="checkpoint-500") + else: + text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder") + vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae") + unet = UNet3DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet", unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs)) + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + unet.requires_grad_(False) + if is_xformers_available(): unet.enable_xformers_memory_efficient_attention() + else: assert False + self.device = device = torch.device(device) + # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + pipeline = AnimationPipeline( + vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, + scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs)), + ).to(device) + motion_module_path = "./animatediff/animatediff_models/Motion_Module/mm_sd_v15_v2.ckpt" + dreambooth_model_path = "./animatediff/animatediff_models/DreamBooth_LoRA/rcnzCartoon3d_v20.safetensors" + + self.pipeline = load_weights( + pipeline, + motion_module_path = motion_module_path, + dreambooth_model_path = dreambooth_model_path, + ).to(device) + # unet = unet.to(device) + # vae = vae.to(device) + # text_encoder = 
text_encoder.to(device) + self.scheduler = self.pipeline.scheduler + self.alphas = self.scheduler.alphas_cumprod.to(self.device) + # self.scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler", torch_dtype= torch.float32) + self.rgb_to_latent = torch.from_numpy(np.array([[ 1.69810224, -0.28270747, -2.55163474, -0.78083445], + [-0.02986101, 4.91430525, 2.23158593, 3.02981481], + [-0.05746497, -3.04784101, 0.0448761 , -3.22913725]])).float().cuda(non_blocking=True) # 3 x 4 + self.latent_to_rgb = torch.from_numpy(np.array([ + [ 0.298, 0.207, 0.208], # L1 + [ 0.187, 0.286, 0.173], # L2 + [-0.158, 0.189, 0.264], # L3 + [-0.184, -0.271, -0.473], # L4 + ])).float().cuda(non_blocking=True) # 4 x 3 + + def load_text_encoder(self, use_textual_inversion=False): + if use_textual_inversion: + inversion_path="." + text_encoder = CLIPTextModel.from_pretrained(inversion_path, subfolder="checkpoint-500") + else: + text_encoder = CLIPTextModel.from_pretrained(self.pretrained_model_path, subfolder="text_encoder") + return text_encoder + + @torch.no_grad() + def prepare_text_emb(self, prompt=None, neg_prompt=None): + #example + if prompt is None: + prompt = "a panda dancing" + # prompt = "a dancing" + if neg_prompt is None: + neg_prompt = "color distortion,color shift,green light,semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, immutable, unchanging, stable, fixed, permant, unvarying, stationary, constant, steady, motionless, inactive, still, rooted, set" + # neg_prompt = "color distortion,color shift,green light,semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, missing arms, missing legs, extra arms, extra legs" + text_embeddings = 
self.pipeline._encode_prompt( + [prompt], self.device, num_videos_per_prompt=1, do_classifier_free_guidance=True, negative_prompt=[neg_prompt], + ) + return text_embeddings + + @torch.no_grad() + def prepare_text_emb_inversion(self, prompt=None, neg_prompt=None, inversion_prompt=None): + #example + if inversion_prompt is None: + inversion_prompt = 'a dancing' + if prompt is None: + prompt = "a panda dancing" + if neg_prompt is None: + neg_prompt = "color distortion,color shift,green light,semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, missing arms, missing legs, extra arms, extra legs" + text_embeddings = self.pipeline._encode_prompt( + [prompt], self.device, num_videos_per_prompt=1, do_classifier_free_guidance=True, negative_prompt=[neg_prompt], + ) + # self.pipeline.text_encoder = self.load_text_encoder(use_textual_inversion=True) + text_embeddings_inversion = self.pipeline._encode_prompt( + [inversion_prompt], self.device, num_videos_per_prompt=1, do_classifier_free_guidance=True, negative_prompt=[neg_prompt], + ) + return text_embeddings, text_embeddings_inversion + + def get_cfg(self, noisy_latents, text_embeddings, guidance_scale, t): + latent_model_input = torch.cat([noisy_latents] * 2) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # print('latent_model_input', latent_model_input.shape) + noise_pred = self.pipeline.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + return noise_pred + + def train_step(self, pred_rgb, text_embeddings, guidance_scale=30, as_latent=False): + # shape = [1, 4, 8, 64, 64] #b,c,t,h,w b=1, c=4 beacause vae encode + # latents_vsd = 
torch.randn(shape).to(device) #input + if not as_latent: + print('diff input rgb', pred_rgb.shape) + frame_size=pred_rgb.shape[0] + # latents = (pred_rgb.permute(0, 2, 3, 1) @ self.rgb_to_latent).permute(3, 0, 1, 2) + + # latents = F.interpolate(latents, (64, 64), mode='bilinear', align_corners=False).unsqueeze(0) + # print('latents', latents.shape) + latents = self.pipeline.vae.encode(pred_rgb * 2 - 1).latent_dist.sample() # [8, 4, 64, 64] + print('latents shape',latents.shape) + # randn_noise=torch.rand_like(latents[0]).to(latents.device) + # for i in range(1,frame_size): + # i=torch.tensor(i,device=self.device).long() + # latents[i]=self.scheduler.add_noise(latents[i], randn_noise, i*100) + latents = latents.unsqueeze(0).permute(0, 2, 1, 3, 4) * 0.18215 #[1, 4, 8, 32, 32]) + + #image+guassian+guassian... + + else: + latents = pred_rgb + # latents = rearrange(latents, "b c) f h w -> (b f) c h w") + + print('latents', latents.shape, latents.requires_grad) + with torch.no_grad(): + noise = torch.randn_like(latents) + # t=torch.tensor(100).to(device) # + t = torch.randint( + 50, 950, (latents.shape[0],), device=self.device + ).long() + # print('time shape', t.shape) + noisy_latents = self.scheduler.add_noise(latents, noise, t) + noise_pred = self.get_cfg(noisy_latents, text_embeddings, guidance_scale, t) + noise_diff = noise_pred - noise + # noise_pred=self.pipeline(noisy_lantents=noisy_latents, + # t=t, + # prompt=prompt, + # negative_prompt= n_prompt, + # ) + # print('noise pred shape:',noise_pred.shape) #([1, 4, 8, 64, 64]) + w = (1 - self.alphas[t]).view(noise.shape[0], 1, 1, 1, 1) + grad = w * (noise_diff) + grad = torch.nan_to_num(grad) + + # if not as_latent: + # # grad: [1, 4, 16, 64, 64] + # print(grad.shape) + # # norm = torch.norm(grad, dim=(1)) + # norm = torch.norm(grad, dim=(1, 2)) + # print(norm) + # thres = torch.ones_like(norm).detach() * 1 + # # grad = torch.minimum(norm, thres) * F.normalize(grad, dim=(1)) + # grad = torch.minimum(norm, thres) * 
F.normalize(grad, dim=(1, 2)) + + target = (latents - grad).detach() + loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') + return loss + + def train_step_inversion(self, pred_rgb, text_embeddings, text_embeddings_inversion, guidance_scale=30, as_latent=False): + # shape = [1, 4, 8, 64, 64] #b,c,t,h,w b=1, c=4 beacause vae encode + # latents_vsd = torch.randn(shape).to(device) #input + if not as_latent: + print('diff input rgb', pred_rgb.shape) + # latents = (pred_rgb.permute(0, 2, 3, 1) @ self.rgb_to_latent).permute(3, 0, 1, 2) + + # latents = F.interpolate(latents, (64, 64), mode='bilinear', align_corners=False).unsqueeze(0) + # print('latents', latents.shape) + latents = self.pipeline.vae.encode(pred_rgb * 2 - 1).latent_dist.sample() # [8, 4, 64, 64] + latents = latents.unsqueeze(0).permute(0, 2, 1, 3, 4) * 0.18215 + else: + latents = pred_rgb + # latents = rearrange(latents, "b c) f h w -> (b f) c h w") + + print('latents', latents.shape, latents.requires_grad) + with torch.no_grad(): + noise = torch.randn_like(latents) + # t=torch.tensor(100).to(device) # + t = torch.randint( + 50, 950, (latents.shape[0],), device=self.device + ).long() + # print('time shape', t.shape) + noisy_latents = self.scheduler.add_noise(latents, noise, t) + noise_pred_original = self.get_cfg(noisy_latents, text_embeddings, guidance_scale, t) + noise_pred_inversion = self.get_cfg(noisy_latents, text_embeddings_inversion, guidance_scale, t) + noise_diff = noise_pred_inversion - noise_pred_original + print('noise pred shape:',noise_diff.shape) #([1, 4, 8, 64, 64]) + w = (1 - self.alphas[t]).view(noise.shape[0], 1, 1, 1, 1) + grad = w * (noise_diff) + grad = torch.nan_to_num(grad) + + target = (latents - grad).detach() + loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') + return loss + + @torch.no_grad() + def sample(self, text_embeddings, guidance_scale=7.5): + # latents = self.pipeline.vae.encode(pred_rgb).latent_dist.mode() + latents = torch.randn([1, 4, 
16, 64, 64], device=self.device) * self.scheduler.init_noise_sigma + # noise = torch.randn_like(latents) + # t=torch.tensor(100).to(device) # + # t = torch.randint( + # 0, self.diffusion_model.num_timesteps, (pred_rgb.shape[0],), device=self.device + # ).long() + # print('time shape', t.shape) + from tqdm import tqdm + extra_step_kwargs = {} + # if accepts_eta: + extra_step_kwargs["eta"] = 0.0 + for i, t in enumerate(tqdm(self.scheduler.timesteps)): + latent_model_input = torch.cat([latents] * 2) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + noise_pred = self.pipeline.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, t, latents, eta=0.0).prev_sample + latents = 1 / 0.18215 * latents + print('output', latents.shape) + latents = rearrange(latents, "b c f h w -> (b f) c h w") + imgs = self.pipeline.vae.decode(latents).sample + imgs = (imgs / 2 + 0.5).clamp(0, 1) + return imgs + + @torch.no_grad() + def decode_latent(self, x): + latents = 1 / 0.18215 * x + latents = rearrange(latents, "b c f h w -> (b f) c h w") + return (self.pipeline.vae.decode(latents).sample / 2 + 0.5).clamp(0, 1) + +if __name__ == '__main__': + torch.manual_seed(16931037867122267877) + anim = AnimateDiff() + text_emb = anim.prepare_text_emb() + t2i = False + if t2i: + anim.scheduler.set_timesteps(50) + # pred_rgb = torch.randn((8, 3, 256, 256)).cuda() + # pred_rgb = torch.randn((1, 4, 8, 64, 64)) + res = anim.sample(text_emb) + print(res.shape) + res = res.permute(0, 2, 3, 1) + res = res.detach().cpu().numpy() + res = (res * 255).astype(np.uint8) + print(res.shape) + imageio.mimwrite('a.mp4', res, fps=16, quality=8, macro_block_size=1) + sds = True + if sds: + prefix = 'inversion_sds_latent_0.01' + # prefix = 'sds_rgb' + from PIL import Image + 
from torchvision.transforms import ToTensor + rgb0 = Image.open('data/panda_static/1.png').resize((256, 256)) + rgb0 = ToTensor()(rgb0).cuda().unsqueeze(0) + # print('rgb0', rgb0.shape) + # anim.scheduler.set_timesteps() + rgb_tensor = torch.randn((1, 4, 16, 32, 32)).cuda() * anim.scheduler.init_noise_sigma + # rgb_tensor = torch.randn((15, 3, 256, 256)).clamp(0, 1).cuda() + # rgb_tensor = torch.cat([rgb0.clone()] * 15).cuda() + # rgb_tensor[0] = rgb0 + rgb_tensor.requires_grad = True + # optim = torch.optim.AdamW([rgb_tensor], lr=0.05) + optim = torch.optim.Adam([rgb_tensor], lr=0.01) + from tqdm import tqdm + for i in tqdm(range(2000)): + # rgb_tensor[0] = rgb_tensor[0] * 0 + rgb0 + # loss = anim.train_step(torch.cat([rgb0, rgb_tensor], dim=0), text_emb, as_latent=False) + loss = anim.train_step(rgb_tensor, text_emb, as_latent=True) + # loss = anim.train_step(rgb_tensor, text_emb, as_latent=True) + loss.backward() + print('grad', rgb_tensor.grad.shape) + optim.step() + optim.zero_grad() + if i % 100 == 0: + res = anim.decode_latent(rgb_tensor).permute(0, 2, 3, 1) + # res = torch.cat([rgb0, rgb_tensor], dim=0).permute(0, 2, 3, 1) + res = res.detach().cpu().numpy() + res = (res * 255).astype(np.uint8) + print(res.shape) + imageio.mimwrite(f'{prefix}_{i}.mp4', res, fps=16, quality=8, macro_block_size=1) + res = anim.decode_latent(rgb_tensor).permute(0, 2, 3, 1) + res = rgb_tensor.permute(0, 2, 3, 1) + res = res.detach().cpu().numpy() + res = (res * 255).astype(np.uint8) + print(res.shape) + imageio.mimwrite(f'{prefix}.mp4', res, fps=16, quality=8, macro_block_size=1) + + # anim.train_step(pred_rgb, text_emb) diff --git a/guidance/attn_utils/__init__.py b/guidance/attn_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/guidance/attn_utils/attention_processor.py b/guidance/attn_utils/attention_processor.py new file mode 100644 index 0000000..5d3a73c --- /dev/null +++ b/guidance/attn_utils/attention_processor.py @@ -0,0 +1,1668 @@ +# Copyright 
2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from diffusers.utils import deprecate, logging +from diffusers.utils.import_utils import is_xformers_available + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +if is_xformers_available(): + import xformers + import xformers.ops +else: + xformers = None + + +class Attention(nn.Module): + r""" + A cross attention layer. + + Parameters: + query_dim (`int`): The number of channels in the query. + cross_attention_dim (`int`, *optional*): + The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. + heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention. + dim_head (`int`, *optional*, defaults to 64): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + bias (`bool`, *optional*, defaults to False): + Set to `True` for the query, key, and value linear layers to contain a bias parameter. 
+ """ + + def __init__( + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias=False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + added_kv_proj_dim: Optional[int] = None, + norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + eps: float = 1e-5, + rescale_output_factor: float = 1.0, + residual_connection: bool = False, + _from_deprecated_attn_block=False, + processor: Optional["AttnProcessor"] = None, + ): + super().__init__() + # print("You are using Attention") + inner_dim = dim_head * heads + cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + self.upcast_attention = upcast_attention + self.upcast_softmax = upcast_softmax + self.rescale_output_factor = rescale_output_factor + self.residual_connection = residual_connection + + # we make use of this private variable to know whether this class is loaded + # with an deprecated state dict so that we can convert it on the fly + self._from_deprecated_attn_block = _from_deprecated_attn_block + + self.scale_qk = scale_qk + self.scale = dim_head**-0.5 if self.scale_qk else 1.0 + + self.heads = heads + # for slice_size > 0 the attention score computation + # is split across the batch axis to save memory + # You can set slice_size with `set_attention_slice` + self.sliceable_head_dim = heads + + self.added_kv_proj_dim = added_kv_proj_dim + self.only_cross_attention = only_cross_attention + + if self.added_kv_proj_dim is None and self.only_cross_attention: + raise ValueError( + "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." 
+ ) + + if norm_num_groups is not None: + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True) + else: + self.group_norm = None + + if spatial_norm_dim is not None: + self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim) + else: + self.spatial_norm = None + + if cross_attention_norm is None: + self.norm_cross = None + elif cross_attention_norm == "layer_norm": + self.norm_cross = nn.LayerNorm(cross_attention_dim) + elif cross_attention_norm == "group_norm": + if self.added_kv_proj_dim is not None: + # The given `encoder_hidden_states` are initially of shape + # (batch_size, seq_len, added_kv_proj_dim) before being projected + # to (batch_size, seq_len, cross_attention_dim). The norm is applied + # before the projection, so we need to use `added_kv_proj_dim` as + # the number of channels for the group norm. + norm_cross_num_channels = added_kv_proj_dim + else: + norm_cross_num_channels = cross_attention_dim + + self.norm_cross = nn.GroupNorm( + num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True + ) + else: + raise ValueError( + f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" + ) + + self.to_q = nn.Linear(query_dim, inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias) + self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias) + else: + self.to_k = None + self.to_v = None + + if self.added_kv_proj_dim is not None: + self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim) + self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim) + + self.to_out = nn.ModuleList([]) + self.to_out.append(nn.Linear(inner_dim, query_dim, bias=out_bias)) + self.to_out.append(nn.Dropout(dropout)) + + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 + if processor is None: + processor = ( + AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() + ) + self.set_processor(processor) + + def set_use_memory_efficient_attention_xformers( + self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None + ): + is_lora = hasattr(self, "processor") and isinstance( + self.processor, + (LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor), + ) + is_custom_diffusion = hasattr(self, "processor") and isinstance( + self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor) + ) + is_added_kv_processor = hasattr(self, "processor") and isinstance( + self.processor, + ( + AttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + SlicedAttnAddedKVProcessor, + XFormersAttnAddedKVProcessor, + LoRAAttnAddedKVProcessor, + ), + ) + + if use_memory_efficient_attention_xformers: + if is_added_kv_processor and (is_lora or 
is_custom_diffusion): + raise NotImplementedError( + f"Memory efficient attention is currently not supported for LoRA or custom diffuson for attention processor type {self.processor}" + ) + if not is_xformers_available(): + raise ModuleNotFoundError( + ( + "Refer to https://github.com/facebookresearch/xformers for more information on how to install" + " xformers" + ), + name="xformers", + ) + elif not torch.cuda.is_available(): + raise ValueError( + "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is" + " only available for GPU " + ) + else: + try: + # Make sure we can run the memory efficient attention + _ = xformers.ops.memory_efficient_attention( + torch.randn((1, 2, 40), device="cuda"), + torch.randn((1, 2, 40), device="cuda"), + torch.randn((1, 2, 40), device="cuda"), + ) + except Exception as e: + raise e + + if is_lora: + # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers + # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0? 
+ processor = LoRAXFormersAttnProcessor( + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + rank=self.processor.rank, + attention_op=attention_op, + ) + processor.load_state_dict(self.processor.state_dict()) + processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionXFormersAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + attention_op=attention_op, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) + elif is_added_kv_processor: + # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP + # which uses this type of cross attention ONLY because the attention mask of format + # [0, ..., -10.000, ..., 0, ...,] is not supported + # throw warning + logger.info( + "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation." 
+ ) + processor = XFormersAttnAddedKVProcessor(attention_op=attention_op) + else: + processor = XFormersAttnProcessor(attention_op=attention_op) + else: + if is_lora: + attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + processor = attn_processor_class( + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + rank=self.processor.rank, + ) + processor.load_state_dict(self.processor.state_dict()) + processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) + else: + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. 
TODO remove scale_qk check when we move to torch 2.1 + processor = ( + AttnProcessor2_0() + if hasattr(F, "scaled_dot_product_attention") and self.scale_qk + else AttnProcessor() + ) + + self.set_processor(processor) + + def set_attention_slice(self, slice_size): + if slice_size is not None and slice_size > self.sliceable_head_dim: + raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.") + + if slice_size is not None and self.added_kv_proj_dim is not None: + processor = SlicedAttnAddedKVProcessor(slice_size) + elif slice_size is not None: + processor = SlicedAttnProcessor(slice_size) + elif self.added_kv_proj_dim is not None: + processor = AttnAddedKVProcessor() + else: + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 + processor = ( + AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() + ) + + self.set_processor(processor) + + def set_processor(self, processor: "AttnProcessor"): + # if current processor is in `self._modules` and if passed `processor` is not, we need to + # pop `processor` from `self._modules` + if ( + hasattr(self, "processor") + and isinstance(self.processor, torch.nn.Module) + and not isinstance(processor, torch.nn.Module) + ): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") + self._modules.pop("processor") + # print(f"in Attention set_processor the self.processor is {type(processor)} ") + self.processor = processor + + def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): + # The `Attention` class can call different attention processors / attention functions + # here we simply pass along all 
tensors to the selected processor class + # For standard processors that are defined here, `**cross_attention_kwargs` is empty + return self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + def batch_to_head_dim(self, tensor): + head_size = self.heads + batch_size, seq_len, dim = tensor.shape + tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim) + tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size) + return tensor + + def head_to_batch_dim(self, tensor, out_dim=3): + head_size = self.heads + batch_size, seq_len, dim = tensor.shape + tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size) + tensor = tensor.permute(0, 2, 1, 3) + + if out_dim == 3: + tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size) + + return tensor + + def get_attention_scores(self, query, key, attention_mask=None): + dtype = query.dtype + if self.upcast_attention: + query = query.float() + key = key.float() + + if attention_mask is None: + baddbmm_input = torch.empty( + query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device + ) + beta = 0 + else: + baddbmm_input = attention_mask + beta = 1 + + attention_scores = torch.baddbmm( + baddbmm_input, + query, + key.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) + del baddbmm_input + + if self.upcast_softmax: + attention_scores = attention_scores.float() + + attention_probs = attention_scores.softmax(dim=-1) + del attention_scores + + attention_probs = attention_probs.to(dtype) + print(f'in Attention attention_probs: {attention_probs.shape}') + return attention_probs + + def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3): + if batch_size is None: + deprecate( + "batch_size=None", + "0.0.15", + ( + "Not passing the `batch_size` parameter to `prepare_attention_mask` can 
lead to incorrect" + " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to" + " `prepare_attention_mask` when preparing the attention_mask." + ), + ) + batch_size = 1 + + head_size = self.heads + if attention_mask is None: + return attention_mask + + current_length: int = attention_mask.shape[-1] + if current_length != target_length: + if attention_mask.device.type == "mps": + # HACK: MPS: Does not support padding by greater than dimension of input tensor. + # Instead, we can manually construct the padding tensor. + padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) + padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) + attention_mask = torch.cat([attention_mask, padding], dim=2) + else: + # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: + # we want to instead pad by (0, remaining_length), where remaining_length is: + # remaining_length: int = target_length - current_length + # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding + attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) + + if out_dim == 3: + if attention_mask.shape[0] < batch_size * head_size: + attention_mask = attention_mask.repeat_interleave(head_size, dim=0) + elif out_dim == 4: + attention_mask = attention_mask.unsqueeze(1) + attention_mask = attention_mask.repeat_interleave(head_size, dim=1) + + return attention_mask + + def norm_encoder_hidden_states(self, encoder_hidden_states): + assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states" + + if isinstance(self.norm_cross, nn.LayerNorm): + encoder_hidden_states = self.norm_cross(encoder_hidden_states) + elif isinstance(self.norm_cross, nn.GroupNorm): + # Group norm norms along the channels dimension and expects + # input to be in the shape of (N, C, *). 
In this case, we want + # to norm along the hidden dimension, so we need to move + # (batch_size, sequence_length, hidden_size) -> + # (batch_size, hidden_size, sequence_length) + encoder_hidden_states = encoder_hidden_states.transpose(1, 2) + encoder_hidden_states = self.norm_cross(encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states.transpose(1, 2) + else: + assert False + + return encoder_hidden_states + + +class AttnProcessor: + r""" + Default processor for performing attention-related computations. + """ + + def __call__( + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + print(f"in AttnProcessor attention_probs: {attention_probs.shape}") + hidden_states = torch.bmm(attention_probs, value) + hidden_states = 
attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class LoRALinearLayer(nn.Module): + def __init__(self, in_features, out_features, rank=4, network_alpha=None): + super().__init__() + + if rank > min(in_features, out_features): + raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}") + + self.down = nn.Linear(in_features, rank, bias=False) + self.up = nn.Linear(rank, out_features, bias=False) + # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. + # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + self.network_alpha = network_alpha + self.rank = rank + + nn.init.normal_(self.down.weight, std=1 / rank) + nn.init.zeros_(self.up.weight) + + def forward(self, hidden_states): + orig_dtype = hidden_states.dtype + dtype = self.down.weight.dtype + + down_hidden_states = self.down(hidden_states.to(dtype)) + up_hidden_states = self.up(down_hidden_states) + + if self.network_alpha is not None: + up_hidden_states *= self.network_alpha / self.rank + + return up_hidden_states.to(orig_dtype) + + +class LoRAAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. 
+ network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + """ + + def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + def __call__( + self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) + query = attn.head_to_batch_dim(query) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + scale * 
class CustomDiffusionAttnProcessor(nn.Module):
    r"""
    Processor for implementing attention for the Custom Diffusion method.

    Args:
        train_kv (`bool`, defaults to `True`):
            Whether to newly train the key and value matrices corresponding to the text features.
        train_q_out (`bool`, defaults to `True`):
            Whether to newly train the query and output projections corresponding to the latent image features.
        hidden_size (`int`, *optional*, defaults to `None`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`, *optional*, defaults to `None`):
            The number of channels in the `encoder_hidden_states`. Falls back to `hidden_size` when not given.
        out_bias (`bool`, defaults to `True`):
            Whether the newly trained output projection has a bias term (only used when `train_q_out` is set).
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability applied after the newly trained output projection.
    """

    def __init__(
        self,
        train_kv=True,
        train_q_out=True,
        hidden_size=None,
        cross_attention_dim=None,
        out_bias=True,
        dropout=0.0,
    ):
        super().__init__()
        self.train_kv = train_kv
        self.train_q_out = train_q_out

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim

        # `_custom_diffusion` id for easy serialization and loading.
        if self.train_kv:
            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        if self.train_q_out:
            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
            self.to_out_custom_diffusion = nn.ModuleList([])
            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
            self.to_out_custom_diffusion.append(nn.Dropout(dropout))

    def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None):
        """
        Run attention, using the processor's own projections wherever they were created.

        Args:
            attn: The attention module supplying the remaining projections and the head reshaping helpers.
            hidden_states: 3-D query-side input, unpacked as `(batch, sequence, channels)`.
            encoder_hidden_states: Optional key/value-side input. When `None` the layer attends to
                `hidden_states` itself.
            attention_mask: Optional mask, forwarded through `attn.prepare_attention_mask`.

        Returns:
            The attended hidden states after the output projection and dropout.
        """
        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        if self.train_q_out:
            query = self.to_q_custom_diffusion(hidden_states)
        else:
            query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            crossattn = False
            encoder_hidden_states = hidden_states
        else:
            crossattn = True
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        if self.train_kv:
            key = self.to_k_custom_diffusion(encoder_hidden_states)
            value = self.to_v_custom_diffusion(encoder_hidden_states)
        else:
            key = attn.to_k(encoder_hidden_states)
            value = attn.to_v(encoder_hidden_states)

        if crossattn:
            # Values are unchanged, but the first token along dim 1 is routed through `.detach()`,
            # so no gradient flows back through that token's key/value.
            detach = torch.ones_like(key)
            detach[:, :1, :] = 0.0
            key = detach * key + (1 - detach) * key.detach()
            value = detach * value + (1 - detach) * value.detach()

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        if self.train_q_out:
            # linear proj
            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
            # dropout
            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
        else:
            # linear proj
            hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)

        return hidden_states
attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + print(f'in AttnAddedKVProcessor attention_probs: {attention_probs.shape}') + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class AttnAddedKVProcessor2_0: + r""" + Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra + learnable key and value matrices for the text encoder. + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." 
+ ) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + residual = hidden_states + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + query = attn.head_to_batch_dim(query, out_dim=4) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key, out_dim=4) + value = attn.head_to_batch_dim(value, out_dim=4) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=2) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1]) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = 
class LoRAAttnAddedKVProcessor(nn.Module):
    r"""
    Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
    encoder.

    Args:
        hidden_size (`int`, *optional*):
            The hidden size of the attention layer.
        cross_attention_dim (`int`, *optional*, defaults to `None`):
            The number of channels in the `encoder_hidden_states`. Falls back to `hidden_size` when not given.
        rank (`int`, defaults to 4):
            The dimension of the LoRA update matrices.
        network_alpha (`int`, *optional*):
            Forwarded unchanged to every `LoRALinearLayer` created here.

    """

    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
        super().__init__()

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.rank = rank

        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
        self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
        self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)

    def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
        """
        Run added-KV attention with a LoRA correction on every projection.

        Args:
            attn: The attention module supplying the base projections and the head reshaping helpers.
            hidden_states: Input whose trailing dimensions (after the first two) are flattened into tokens.
                The result is reshaped back to this tensor's shape before the residual is added.
            encoder_hidden_states: Optional conditioning input. When `None` the flattened `hidden_states`
                are used instead.
            attention_mask: Optional mask, forwarded through `attn.prepare_attention_mask`.
            scale (`float`, defaults to 1.0): Multiplier applied to every LoRA branch; `0.0` disables them.

        Returns:
            A tensor with the same shape as the input `hidden_states` (attention output plus residual).
        """
        residual = hidden_states
        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
        batch_size, sequence_length, _ = hidden_states.shape

        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
        query = attn.head_to_batch_dim(query)

        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
            encoder_hidden_states
        )
        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
            encoder_hidden_states
        )
        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)

        if not attn.only_cross_attention:
            # The added (encoder) keys/values are placed in front of the self-attention ones along the token axis.
            key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
            value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
            key = attn.head_to_batch_dim(key)
            value = attn.head_to_batch_dim(value)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
        else:
            key = encoder_hidden_states_key_proj
            value = encoder_hidden_states_value_proj

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
        hidden_states = hidden_states + residual

        return hidden_states
+ + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + """ + + def __init__(self, attention_op: Optional[Callable] = None): + self.attention_op = attention_op + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + residual = hidden_states + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, 
op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class XFormersAttnProcessor: + r""" + Processor for implementing memory efficient attention using xFormers. + + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + """ + + def __init__(self, attention_op: Optional[Callable] = None): + self.attention_op = attention_op + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, key_tokens, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size) + if attention_mask is not None: + # expand our mask's singleton query_tokens dimension: + # [batch*heads, 1, key_tokens] -> + # [batch*heads, query_tokens, key_tokens] + # so that it can be added as a bias onto the attention scores that xformers computes: + # 
import math


class AttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).

    The attention is evaluated explicitly (matmul + softmax) instead of through the fused
    `F.scaled_dot_product_attention` call, so that the attention probabilities can be returned to the caller
    together with the hidden states.
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: "Attention",
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """
        Run attention and return both the output and the attention probabilities.

        Args:
            attn: The attention module supplying the projections, norms and configuration flags.
            hidden_states: 3-D `(batch, tokens, channels)` input, or a 4-D input that is flattened to
                `(batch, height * width, channel)` and restored on the way out.
            encoder_hidden_states: Optional key/value-side input. When `None` the layer attends to
                `hidden_states` itself.
            attention_mask: Optional boolean (True = keep) or additive mask; it is passed through
                `attn.prepare_attention_mask` and viewed as `(batch, heads, -1, key_tokens)`.
            temb: Only forwarded to `attn.spatial_norm` when that module is set.

        Returns:
            `tuple`: `(hidden_states, attn_weight)` where `attn_weight` holds the softmax attention
            probabilities of shape `(batch, heads, query_tokens, key_tokens)`.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        # Split heads using the projected width; the input channel count only matches it when
        # `to_q` preserves the feature dimension.
        inner_dim = query.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # Explicit form of scaled dot-product attention with no dropout and no causal masking.
        # The scale is 1 / sqrt(head_dim); `attn.scale` is not consulted here.
        scale_factor = 1 / math.sqrt(query.size(-1))
        attn_weight = query @ key.transpose(-2, -1) * scale_factor
        if attention_mask is not None:
            # Build the bias in the mask's own 4-D shape and add it out of place: an in-place update of
            # a 2-D (query_tokens, key_tokens) buffer cannot absorb the (batch, heads, ...) dimensions.
            if attention_mask.dtype == torch.bool:
                attn_bias = torch.zeros_like(attention_mask, dtype=query.dtype)
                attn_bias.masked_fill_(attention_mask.logical_not(), float("-inf"))
            else:
                attn_bias = attention_mask.to(dtype=query.dtype)
            attn_weight = attn_weight + attn_bias
        attn_weight = torch.softmax(attn_weight, dim=-1)
        # (batch, num_heads, seq_len, head_dim)
        hidden_states = attn_weight @ value

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states, attn_weight
+ attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + + """ + + def __init__( + self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None, network_alpha=None + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + self.attention_op = attention_op + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + def __call__( + self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) + query = 
class LoRAAttnProcessor2_0(nn.Module):
    r"""
    Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product
    attention.

    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`, *optional*):
            The number of channels in the `encoder_hidden_states`. Falls back to `hidden_size` when not given.
        rank (`int`, defaults to 4):
            The dimension of the LoRA update matrices.
        network_alpha (`int`, *optional*):
            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
    """

    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "LoRAAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.rank = rank

        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)

    def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
        """
        Run scaled dot-product attention with a LoRA correction on the q/k/v/out projections.

        Args:
            attn: The attention module supplying the base projections, norms and configuration flags.
            hidden_states: 3-D `(batch, tokens, channels)` input, or a 4-D input that is flattened to
                `(batch, height * width, channel)` and restored on the way out.
            encoder_hidden_states: Optional key/value-side input. When `None` the layer attends to
                `hidden_states` itself.
            attention_mask: Optional mask; when given it is passed through `attn.prepare_attention_mask`
                and viewed as `(batch, heads, -1, key_tokens)`.
            scale (`float`, defaults to 1.0): Multiplier applied to every LoRA branch; `0.0` disables them.

        Returns:
            The attended hidden states, in the same layout as the input.
        """
        residual = hidden_states

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        inner_dim = hidden_states.shape[-1]

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)

        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
+ dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use + as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator. + """ + + def __init__( + self, + train_kv=True, + train_q_out=False, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + attention_op: Optional[Callable] = None, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.attention_op = attention_op + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = 
class SlicedAttnProcessor:
    r"""
    Processor for implementing sliced attention.

    Args:
        slice_size (`int`):
            How many rows of the leading dimension of the query returned by `attn.head_to_batch_dim`
            (presumably `batch * heads`) are attended to per step. The leading dimension does not have to be
            a multiple of `slice_size`; the final slice is simply shorter.
    """

    def __init__(self, slice_size):
        self.slice_size = slice_size

    def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None):
        """
        Run attention one slice of the leading (head-batched) dimension at a time.

        Args:
            attn: The attention module supplying the projections, norms and head reshaping helpers.
            hidden_states: 3-D `(batch, tokens, channels)` input, or a 4-D input that is flattened to
                `(batch, height * width, channel)` and restored on the way out.
            encoder_hidden_states: Optional key/value-side input. When `None` the layer attends to
                `hidden_states` itself.
            attention_mask: Optional mask, forwarded through `attn.prepare_attention_mask` and sliced
                alongside the query.

        Returns:
            The attended hidden states, in the same layout as the input.
        """
        residual = hidden_states

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        dim = query.shape[-1]
        query = attn.head_to_batch_dim(query)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        batch_size_attention, query_tokens, _ = query.shape
        hidden_states = torch.zeros(
            (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
        )

        # Step by `slice_size` up to the end of the leading dimension so a trailing partial slice is
        # still processed; slicing past the end is clamped by the tensor indexing itself.
        for start_idx in range(0, batch_size_attention, self.slice_size):
            end_idx = start_idx + self.slice_size

            query_slice = query[start_idx:end_idx]
            key_slice = key[start_idx:end_idx]
            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None

            attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)

            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])

            hidden_states[start_idx:end_idx] = attn_slice

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + batch_size_attention, query_tokens, _ = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +AttentionProcessor = Union[ + AttnProcessor, + AttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + LoRAAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + LoRAAttnAddedKVProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, +] + + +class SpatialNorm(nn.Module): + """ + Spatially conditioned 
class GaussianSmoothing(nn.Module):
    """
    Apply gaussian smoothing on a
    1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.
    Arguments:
        channels (int, sequence): Number of channels of the input tensors. Output will
            have this number of channels as well.
        kernel_size (int, sequence): Size of the gaussian kernel.
        sigma (float, sequence): Standard deviation of the gaussian kernel.
        dim (int, optional): The number of dimensions of the data.
            Default value is 2 (spatial).
    Raises:
        RuntimeError: If `dim` is not 1, 2 or 3.
    """
    def __init__(self, channels, kernel_size, sigma, dim=2):
        super(GaussianSmoothing, self).__init__()
        if isinstance(kernel_size, numbers.Number):
            kernel_size = [kernel_size] * dim
        if isinstance(sigma, numbers.Number):
            sigma = [sigma] * dim

        # The gaussian kernel is the product of the
        # gaussian function of each dimension.
        kernel = 1
        # `indexing="ij"` is the layout the legacy default produced; passing it
        # explicitly avoids the deprecation warning for the implicit default.
        meshgrids = torch.meshgrid(
            [
                torch.arange(size, dtype=torch.float32)
                for size in kernel_size
            ],
            indexing="ij",
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            # NOTE(review): the exponent is ((x - mean) / (2 * std)) ** 2, not the
            # textbook ((x - mean) / std) ** 2 / 2, so the effective width is
            # sqrt(2) * std. Kept as-is because downstream thresholds may be
            # tuned against this kernel -- confirm before changing.
            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * \
                      torch.exp(-((mgrid - mean) / (2 * std)) ** 2)

        # Make sure sum of values in gaussian kernel equals 1.
        kernel = kernel / torch.sum(kernel)

        # Reshape to depthwise convolutional weight: one (1, *kernel) filter per channel.
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer('weight', kernel)
        self.groups = channels

        conv_by_dim = {1: F.conv1d, 2: F.conv2d, 3: F.conv3d}
        if dim not in conv_by_dim:
            raise RuntimeError(
                'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(dim)
            )
        self.conv = conv_by_dim[dim]

    def forward(self, input):
        """
        Apply gaussian filter to input.
        Arguments:
            input (torch.Tensor): Input to apply gaussian filter on.
        Returns:
            filtered (torch.Tensor): Filtered output. No padding is applied here, so
                each filtered dimension shrinks by `kernel_size - 1`.
        """
        # The buffer is cast to the input dtype so the convolution's dtypes match.
        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups)
def loss_one_att_outside(attn_map, bboxes):
    """
    Penalise attention that queries inside a box place on keys outside that box.

    For every box, the attention rows of the query positions inside the box are
    averaged, reshaped to the square spatial grid, and the mass falling outside
    the box is summed and averaged over the leading dimension of `attn_map`.

    Arguments:
        attn_map (torch.Tensor): Tensor of shape `(b, i, j)`. The grid side is
            `int(sqrt(i))`, and `j` must equal `i` for the reshape to the grid to
            succeed.
        bboxes (sequence): One entry per object; each entry is a sequence of
            boxes `(x_min, y_min, x_max, y_max)` that are scaled by the grid size
            (normalised coordinates are assumed -- confirm against the caller).
    Returns:
        The summed per-box loss divided by the number of objects. `0.0` when
        `bboxes` is empty.
    """
    object_number = len(bboxes)
    if object_number == 0:
        # Nothing to constrain; avoid dividing by zero below.
        return 0.0

    b, num_query, _ = attn_map.shape
    H = W = int(math.sqrt(num_query))

    loss = 0
    for obj_boxes in bboxes:
        for obj_box in obj_boxes:
            x_min, y_min = int(obj_box[0] * W), int(obj_box[1] * H)
            x_max, y_max = int(obj_box[2] * W), int(obj_box[3] * H)

            # Build the mask on the same device as the attention map so that CPU
            # tensors keep working on machines where CUDA happens to be available.
            mask = torch.zeros(size=(H, W), device=attn_map.device)
            mask[y_min: y_max, x_min: x_max] = 1.
            mask_out = 1. - mask

            # Row-major flat indices of the query positions inside the box.
            index_in_key = mask.flatten().nonzero(as_tuple=False).squeeze(1)
            if index_in_key.numel() == 0:
                # A box that rounds to zero area selects no queries; skipping it
                # avoids a 0 / 0 that would turn the whole loss into NaN.
                continue

            # Mean attention row over the in-box queries (gathered directly
            # instead of copying them into a full-size zero tensor first).
            att_box = attn_map[:, index_in_key, :].sum(dim=1) / index_in_key.numel()
            att_box = att_box.reshape(-1, H, W)
            activation_value = (att_box * mask_out).reshape(b, -1).sum(dim=-1)
            loss += torch.mean(activation_value)

    return loss / object_number
GaussianSmoothing(channels=1, kernel_size=kernel_size, sigma=sigma, dim=2).cuda() + input = F.pad(att_map_obj.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode='reflect') + att_map_obj = smoothing(input).squeeze(0).squeeze(0) + other_att_map_obj = att_map_obj.clone() + att_copy = att_map_obj.clone() + + for obj_box in bboxes[obj_idx]: + x_min, y_min, x_max, y_max = int(obj_box[0] * W), \ + int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H) + + + if att_map_obj[y_min: y_max, x_min: x_max].numel() == 0: + max_inside=1. + + else: + max_inside = att_map_obj[y_min: y_max, x_min: x_max].max() + if max_inside < 0.1: + total_loss += 6*(1. - max_inside) + elif max_inside < 0.2: + total_loss += 1. - max_inside + elif t < 15: + total_loss += 1. - max_inside + if max_inside < min_all_inside: + min_all_inside = max_inside + + # find max outside the box, find in the other boxes + + att_copy[y_min: y_max, x_min: x_max] = 0. + other_att_map_obj[y_min: y_max, x_min: x_max] = 0. + + for obj_outside in range(obj_number): + if obj_outside != obj_idx: + for obj_out_box in bboxes[obj_outside]: + x_min_out, y_min_out, x_max_out, y_max_out = int(obj_out_box[0] * W), \ + int(obj_out_box[1] * H), int(obj_out_box[2] * W), int(obj_out_box[3] * H) + + + if other_att_map_obj[y_min_out: y_max_out, x_min_out: x_max_out].numel() == 0: + max_outside_one= 0 + else: + max_outside_one = other_att_map_obj[y_min_out: y_max_out, x_min_out: x_max_out].max() + + att_copy[y_min_out: y_max_out, x_min_out: x_max_out] = 0. + + if max_outside_one > 0.15: + total_loss += 4 * max_outside_one + elif max_outside_one > 0.1: + total_loss += max_outside_one + elif t<15: + total_loss += max_outside_one + if max_outside_one > max_outside: + max_outside = max_outside_one + + max_background = att_copy.max() + total_loss += len(bboxes[obj_idx]) * max_background /2. 
class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block whose forward pass can also return its attention weights.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        final_dropout: bool = False,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention

        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if self.use_ada_layer_norm:
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        elif self.use_ada_layer_norm_zero:
            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
            self.norm2 = (
                AdaLayerNorm(dim, num_embeds_ada_norm)
                if self.use_ada_layer_norm
                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
            )
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        class_labels: Optional[torch.LongTensor] = None,
        return_attn: bool = False,
    ):
        """
        Run self-attention, optional cross-attention and the feed-forward layer, each with a residual connection.

        Returns:
            A tuple `(hidden_states, attn_weight, cross_attn_weight)`. The two weight entries are the second
            values returned by `attn1` / `attn2` when `return_attn` is True, and `None` otherwise
            (`cross_attn_weight` is also `None` when the block has no second attention layer).
        """
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 1. Self-Attention
        if self.use_ada_layer_norm:
            norm_hidden_states = self.norm1(hidden_states, timestep)
        elif self.use_ada_layer_norm_zero:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
            )
        else:
            norm_hidden_states = self.norm1(hidden_states)

        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        attn_weight, cross_attn_weight = None, None

        # The attention layers always return (output, weights); the weights are only
        # forwarded to the caller when explicitly requested.
        attn_output, self_weights = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )
        if return_attn:
            attn_weight = self_weights

        if self.use_ada_layer_norm_zero:
            attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = attn_output + hidden_states

        # 2. Cross-Attention
        if self.attn2 is not None:
            norm_hidden_states = (
                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
            )
            attn_output, cross_weights = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )
            if return_attn:
                cross_attn_weight = cross_weights
            hidden_states = attn_output + hidden_states

        # 3. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self.use_ada_layer_norm_zero:
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        ff_output = self.ff(norm_hidden_states)

        if self.use_ada_layer_norm_zero:
            ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = ff_output + hidden_states
        return hidden_states, attn_weight, cross_attn_weight
class GELU(nn.Module):
    r"""
    Linear projection followed by a GELU activation. Pass `approximate="tanh"` to use the tanh approximation.
    """

    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out)
        self.approximate = approximate

    def gelu(self, gate):
        """Apply GELU to `gate`, taking a float32 detour on `mps` devices."""
        if gate.device.type == "mps":
            # mps: gelu is not implemented for float16
            upcast = gate.to(dtype=torch.float32)
            activated = F.gelu(upcast, approximate=self.approximate)
            return activated.to(dtype=gate.dtype)
        return F.gelu(gate, approximate=self.approximate)

    def forward(self, hidden_states):
        """Project `hidden_states` and apply the activation."""
        return self.gelu(self.proj(hidden_states))
class AdaLayerNorm(nn.Module):
    """
    Norm layer modified to incorporate timestep embeddings.

    The timestep is looked up in a learned embedding table, passed through SiLU and a linear layer, and split
    into a scale and a shift that modulate a parameter-free LayerNorm of the input.
    """

    def __init__(self, embedding_dim, num_embeddings):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim)
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)

    def forward(self, x, timestep):
        """
        Return `norm(x) * (1 + scale) + shift`.

        `timestep` may be a scalar index (one modulation shared by the whole batch) or a 1-D tensor with one
        index per sample; in the latter case `x` is expected to be `(batch, tokens, embedding_dim)`.
        """
        emb = self.linear(self.silu(self.emb(timestep)))
        # Split along the feature axis. For a scalar timestep `emb` is 1-D and this is the same as splitting
        # dim 0; for a batched timestep it keeps the batch intact instead of cutting it in half.
        scale, shift = torch.chunk(emb, 2, dim=-1)
        if emb.dim() == 2:
            # (batch, dim) -> (batch, 1, dim) so the modulation broadcasts over the token axis.
            scale, shift = scale.unsqueeze(1), shift.unsqueeze(1)
        x = self.norm(x) * (1 + scale) + shift
        return x
class AdaGroupNorm(nn.Module):
    """
    GroupNorm whose output is scaled and shifted by a linear projection of an embedding.
    """

    def __init__(
        self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
    ):
        super().__init__()
        self.num_groups = num_groups
        self.eps = eps

        # Optional activation applied to the embedding before the projection.
        self.act = None if act_fn is None else get_activation(act_fn)

        # A single projection produces both modulation terms (scale and shift).
        self.linear = nn.Linear(embedding_dim, out_dim * 2)

    def forward(self, x, emb):
        """Group-normalise `x`, then apply `x * (1 + scale) + shift` derived from `emb`."""
        modulation = self.act(emb) if self.act else emb
        # Two trailing singleton axes let the per-channel terms broadcast over the spatial dims.
        modulation = self.linear(modulation)[:, :, None, None]
        scale, shift = modulation.chunk(2, dim=1)

        normed = F.group_norm(x, self.num_groups, eps=self.eps)
        return normed * (1 + scale) + shift
Takes either discrete (classes of vector embeddings) or continuous (actual + embeddings) inputs. + + When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard + transformer action. Finally, reshape to image. + + When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional + embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict + classes of unnoised image. + + Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised + image do not contain a prediction for the masked pixel as the unnoised image cannot be masked. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input and output. + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. 
+ The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more than steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + """ + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + norm_elementwise_affine: bool = True, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + # 1. 
Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` + # Define whether input is continuous or discrete depending on configuration + self.is_input_continuous = (in_channels is not None) and (patch_size is None) + self.is_input_vectorized = num_vector_embeds is not None + self.is_input_patches = in_channels is not None and patch_size is not None + + if norm_type == "layer_norm" and num_embeds_ada_norm is not None: + deprecation_message = ( + f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or" + " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config." + " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect" + " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it" + " would be very nice if you could open a Pull request for the `transformer/config.json` file" + ) + deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False) + norm_type = "ada_norm" + + if self.is_input_continuous and self.is_input_vectorized: + raise ValueError( + f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make" + " sure that either `in_channels` or `num_vector_embeds` is None." + ) + elif self.is_input_vectorized and self.is_input_patches: + raise ValueError( + f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" + " sure that either `num_vector_embeds` or `num_patches` is None." + ) + elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: + raise ValueError( + f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" + f" {patch_size}. 
Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." + ) + + # 2. Define input layers + if self.is_input_continuous: + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) + if use_linear_projection: + self.proj_in = nn.Linear(in_channels, inner_dim) + else: + self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + elif self.is_input_vectorized: + assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" + assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" + + self.height = sample_size + self.width = sample_size + self.num_vector_embeds = num_vector_embeds + self.num_latent_pixels = self.height * self.width + + self.latent_image_embedding = ImagePositionalEmbeddings( + num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width + ) + elif self.is_input_patches: + assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size" + + self.height = sample_size + self.width = sample_size + + self.patch_size = patch_size + self.pos_embed = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + ) + + # 3. Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=norm_elementwise_affine, + ) + for d in range(num_layers) + ] + ) + + # 4. 
def forward(
    self,
    hidden_states: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    timestep: Optional[torch.LongTensor] = None,
    class_labels: Optional[torch.LongTensor] = None,
    cross_attention_kwargs: Dict[str, Any] = None,
    attention_mask: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.Tensor] = None,
    return_dict: bool = True,
    return_attn: bool = False,
):
    """
    Run the transformer blocks over `hidden_states`, optionally collecting attention weights.

    Args:
        hidden_states: Input tensor. In continuous mode it is unpacked as
            `(batch, channel, height, width)`; in vectorized / patched mode it is handed to
            `self.latent_image_embedding` / `self.pos_embed` respectively.
        encoder_hidden_states: Forwarded unchanged to every transformer block.
        timestep: Forwarded to every block; also fed to `norm1.emb` of the first block in patched mode.
        class_labels: Forwarded to every block; also fed to `norm1.emb` of the first block in patched mode.
        cross_attention_kwargs: Forwarded unchanged to every transformer block.
        attention_mask: Optional mask or bias. A 2-D tensor is treated as a mask
            (1 = keep, 0 = discard) and converted to an additive bias (keep = +0, discard = -10000.0)
            with a singleton dimension inserted at position 1. Any other rank is passed through as-is.
        encoder_attention_mask: Converted the same way as `attention_mask`.
        return_dict: When False, only `(output,)` is returned.
        return_attn: When truthy, each block is asked for its attention weights and they are collected.

    Returns:
        `(output,)` if `return_dict` is False. Otherwise the 3-tuple
        `(Transformer2DModelOutput(sample=output), attn_weight_list, cross_attn_weight_list)`, where the
        two lists hold one entry per transformer block when `return_attn` is truthy and are empty otherwise.
    """

    def as_additive_bias(mask):
        # Only 2-D inputs are masks; other ranks are assumed to be biases already
        # (e.g. converted by an outer model) and are left untouched.
        if mask is None or mask.ndim != 2:
            return mask
        bias = (1 - mask.to(hidden_states.dtype)) * -10000.0
        # [batch, key_tokens] -> [batch, 1, key_tokens] so it broadcasts over query tokens.
        return bias.unsqueeze(1)

    attention_mask = as_additive_bias(attention_mask)
    encoder_attention_mask = as_additive_bias(encoder_attention_mask)

    # 1. Input
    continuous = self.is_input_continuous
    if continuous:
        batch, _, height, width = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)

        # Conv projection runs on the image layout, linear projection on the token layout.
        project_first = not self.use_linear_projection
        if project_first:
            hidden_states = self.proj_in(hidden_states)
        inner_dim = hidden_states.shape[1]
        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
        if not project_first:
            hidden_states = self.proj_in(hidden_states)
    elif self.is_input_vectorized:
        hidden_states = self.latent_image_embedding(hidden_states)
    elif self.is_input_patches:
        hidden_states = self.pos_embed(hidden_states)

    # 2. Blocks
    attn_weight_list = []
    cross_attn_weight_list = []
    for block in self.transformer_blocks:
        hidden_states, self_weights, cross_weights = block(
            hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            timestep=timestep,
            cross_attention_kwargs=cross_attention_kwargs,
            class_labels=class_labels,
            return_attn=return_attn if return_attn else False,
        )
        if return_attn:
            attn_weight_list.append(self_weights)
            cross_attn_weight_list.append(cross_weights)

    # 3. Output
    if continuous:
        if project_first:
            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
            hidden_states = self.proj_out(hidden_states)
        else:
            hidden_states = self.proj_out(hidden_states)
            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()

        output = hidden_states + residual
    elif self.is_input_vectorized:
        logits = self.out(self.norm_out(hidden_states))
        # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
        logits = logits.permute(0, 2, 1)

        # log(p(x_0))
        output = F.log_softmax(logits.double(), dim=1).float()
    elif self.is_input_patches:
        conditioning = self.transformer_blocks[0].norm1.emb(
            timestep, class_labels, hidden_dtype=hidden_states.dtype
        )
        shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
        hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
        hidden_states = self.proj_out_2(hidden_states)

        # unpatchify: the token count is taken to be a perfect square (height == width)
        grid = int(hidden_states.shape[1] ** 0.5)
        patch = self.patch_size
        hidden_states = hidden_states.reshape(shape=(-1, grid, grid, patch, patch, self.out_channels))
        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
        output = hidden_states.reshape(shape=(-1, self.out_channels, grid * patch, grid * patch))

    if not return_dict:
        return (output,)

    return Transformer2DModelOutput(sample=output), attn_weight_list, cross_attn_weight_list
@dataclass
class TransformerTemporalModelOutput(BaseOutput):
    """
    Output container returned by `TransformerTemporalModel.forward`.

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`)
            Hidden states conditioned on `encoder_hidden_states` input.
    """

    sample: torch.FloatTensor


class TransformerTemporalModel(ModelMixin, ConfigMixin):
    """
    Transformer model for video-like data that attends along the frame axis.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output. Required in practice: it sizes the group norm and
            both linear projections.
        out_channels (`int`, *optional*): Accepted for config compatibility; not referenced by this constructor.
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        norm_num_groups (`int`, *optional*, defaults to 32): Number of groups of the input `GroupNorm`.
        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the TransformerBlocks' attention should contain a bias parameter.
        sample_size (`int`, *optional*): Accepted for config compatibility; not referenced by this constructor.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`): Forwarded to every transformer block.
        double_self_attention (`bool`, *optional*):
            Configure if each TransformerBlock should contain two self-attention layers
    """

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        sample_size: Optional[int] = None,
        activation_fn: str = "geglu",
        norm_elementwise_affine: bool = True,
        double_self_attention: bool = True,
    ):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        self.in_channels = in_channels

        # 1. Input layers
        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
        self.proj_in = nn.Linear(in_channels, inner_dim)

        # 2. Transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    double_self_attention=double_self_attention,
                    norm_elementwise_affine=norm_elementwise_affine,
                )
                for _ in range(num_layers)
            ]
        )

        # 3. Output layer
        self.proj_out = nn.Linear(inner_dim, in_channels)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        timestep=None,
        class_labels=None,
        num_frames=1,
        cross_attention_kwargs=None,
        return_dict: bool = True,
    ):
        """
        Apply temporal attention across the frames of every spatial location.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size * num_frames, channel, height, width)`):
                Input hidden states; the leading dimension is split into `(batch size, num_frames)`.
            encoder_hidden_states (*optional*): Forwarded unchanged to every transformer block.
            timestep (*optional*): Forwarded unchanged to every transformer block.
            class_labels (*optional*): Forwarded unchanged to every transformer block.
            num_frames (`int`, defaults to 1): Number of frames per video in the leading dimension.
            cross_attention_kwargs (*optional*): Forwarded unchanged to every transformer block.
            return_dict (`bool`, *optional*, defaults to `True`):
                When False, only `(output,)` is returned.

        Returns:
            `(output,)` if `return_dict` is False. Otherwise the 3-tuple
            `(TransformerTemporalModelOutput(sample=output), attn_weight_list, cross_attn_weight_list)` with one
            list entry per transformer block. `output` has the same shape as the input `hidden_states`.
        """
        # 1. Input
        batch_frames, channel, height, width = hidden_states.shape
        batch_size = batch_frames // num_frames

        residual = hidden_states

        # (b*f, c, h, w) -> (b, f, c, h, w) -> (b, c, f, h, w) for the group norm
        hidden_states = hidden_states.reshape(batch_size, num_frames, channel, height, width)
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)

        hidden_states = self.norm(hidden_states)
        # (b, c, f, h, w) -> (b, h, w, f, c) -> (b*h*w, f, c): frames become the sequence axis
        hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel)

        hidden_states = self.proj_in(hidden_states)

        # 2. Blocks
        attn_weight_list = []
        cross_attn_weight_list = []
        for block in self.transformer_blocks:
            hidden_states, attn_weight, cross_attn_weight = block(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                timestep=timestep,
                cross_attention_kwargs=cross_attention_kwargs,
                class_labels=class_labels,
            )
            attn_weight_list.append(attn_weight)
            cross_attn_weight_list.append(cross_attn_weight)

        # 3. Output
        hidden_states = self.proj_out(hidden_states)
        # proj_out yields (b*h*w, f, c). Unfold it in exactly that order so this is the inverse of the
        # input flattening; unfolding as (..., c, f) would mix frame and channel entries.
        hidden_states = (
            hidden_states.reshape(batch_size, height, width, num_frames, channel)
            .permute(0, 3, 4, 1, 2)  # -> (b, f, c, h, w)
            .contiguous()
        )
        hidden_states = hidden_states.reshape(batch_frames, channel, height, width)

        output = hidden_states + residual

        if not return_dict:
            return (output,)

        return TransformerTemporalModelOutput(sample=output), attn_weight_list, cross_attn_weight_list
def get_down_block(
    down_block_type,
    num_layers,
    in_channels,
    out_channels,
    temb_channels,
    add_downsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    resnet_groups=None,
    cross_attention_dim=None,
    downsample_padding=None,
    dual_cross_attention=False,
    use_linear_projection=True,
    only_cross_attention=False,
    upcast_attention=False,
    resnet_time_scale_shift="default",
):
    """
    Build the 3D down block selected by `down_block_type`.

    `"DownBlock3D"` receives only the resnet/downsample settings; `"CrossAttnDownBlock3D"` additionally
    receives the attention settings.

    Raises:
        ValueError: If `down_block_type` is not one of the two supported names, or if
            `"CrossAttnDownBlock3D"` is requested while `cross_attention_dim` is None.
    """
    # Arguments both block classes are constructed with.
    common = {
        "num_layers": num_layers,
        "in_channels": in_channels,
        "out_channels": out_channels,
        "temb_channels": temb_channels,
        "add_downsample": add_downsample,
        "resnet_eps": resnet_eps,
        "resnet_act_fn": resnet_act_fn,
        "resnet_groups": resnet_groups,
        "downsample_padding": downsample_padding,
        "resnet_time_scale_shift": resnet_time_scale_shift,
    }

    if down_block_type == "DownBlock3D":
        return DownBlock3D(**common)

    if down_block_type != "CrossAttnDownBlock3D":
        raise ValueError(f"{down_block_type} does not exist.")

    if cross_attention_dim is None:
        raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")

    return CrossAttnDownBlock3D(
        cross_attention_dim=cross_attention_dim,
        attn_num_head_channels=attn_num_head_channels,
        dual_cross_attention=dual_cross_attention,
        use_linear_projection=use_linear_projection,
        only_cross_attention=only_cross_attention,
        upcast_attention=upcast_attention,
        **common,
    )


def get_up_block(
    up_block_type,
    num_layers,
    in_channels,
    out_channels,
    prev_output_channel,
    temb_channels,
    add_upsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    resnet_groups=None,
    cross_attention_dim=None,
    dual_cross_attention=False,
    use_linear_projection=True,
    only_cross_attention=False,
    upcast_attention=False,
    resnet_time_scale_shift="default",
):
    """
    Build the 3D up block selected by `up_block_type`.

    `"UpBlock3D"` receives only the resnet/upsample settings; `"CrossAttnUpBlock3D"` additionally receives
    the attention settings.

    Raises:
        ValueError: If `up_block_type` is not one of the two supported names, or if
            `"CrossAttnUpBlock3D"` is requested while `cross_attention_dim` is None.
    """
    # Arguments both block classes are constructed with.
    common = {
        "num_layers": num_layers,
        "in_channels": in_channels,
        "out_channels": out_channels,
        "prev_output_channel": prev_output_channel,
        "temb_channels": temb_channels,
        "add_upsample": add_upsample,
        "resnet_eps": resnet_eps,
        "resnet_act_fn": resnet_act_fn,
        "resnet_groups": resnet_groups,
        "resnet_time_scale_shift": resnet_time_scale_shift,
    }

    if up_block_type == "UpBlock3D":
        return UpBlock3D(**common)

    if up_block_type != "CrossAttnUpBlock3D":
        raise ValueError(f"{up_block_type} does not exist.")

    if cross_attention_dim is None:
        raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")

    return CrossAttnUpBlock3D(
        cross_attention_dim=cross_attention_dim,
        attn_num_head_channels=attn_num_head_channels,
        dual_cross_attention=dual_cross_attention,
        use_linear_projection=use_linear_projection,
        only_cross_attention=only_cross_attention,
        upcast_attention=upcast_attention,
        **common,
    )
class UNetMidBlock3DCrossAttn(nn.Module):
    """
    Middle block: one leading resnet + temporal conv, then `num_layers` repetitions of
    (spatial transformer, temporal transformer, resnet, temporal conv), all at `in_channels` width.
    """

    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        output_scale_factor=1.0,
        cross_attention_dim=1280,
        dual_cross_attention=False,
        use_linear_projection=True,
        upcast_attention=False,
    ):
        super().__init__()

        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels
        if resnet_groups is None:
            resnet_groups = min(in_channels // 4, 32)

        def make_resnet():
            return ResnetBlock2D(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )

        def make_temp_conv():
            return TemporalConvLayer(
                in_channels,
                in_channels,
                dropout=0.1,
            )

        # there is always at least one resnet (and its temporal conv)
        resnets = [make_resnet()]
        temp_convs = [make_temp_conv()]
        attentions = []
        temp_attentions = []

        # Sub-modules are created in the same order as before so seeded initialisation is unchanged.
        for _ in range(num_layers):
            attentions.append(
                Transformer2DModel(
                    in_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=in_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    upcast_attention=upcast_attention,
                )
            )
            temp_attentions.append(
                TransformerTemporalModel(
                    in_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=in_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            )
            resnets.append(make_resnet())
            temp_convs.append(make_temp_conv())

        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)
        self.attentions = nn.ModuleList(attentions)
        self.temp_attentions = nn.ModuleList(temp_attentions)

    def forward(
        self,
        hidden_states,
        temb=None,
        encoder_hidden_states=None,
        attention_mask=None,
        num_frames=1,
        cross_attention_kwargs=None,
        return_attn=False,
    ):
        """
        Run the block and gather attention weights.

        `return_attn` is forwarded only to the last spatial transformer; earlier ones are called with
        `return_attn=False`. `attention_mask` is accepted but not used here.

        Returns:
            `(hidden_states, t2d_attn_weight_lists, t2d_cross_attn_weight_lists, temp_attn_weight_lists,
            temp_cross_attn_weight_lists)`. A list gains one entry per layer whose attention module
            returned a tuple.
        """
        hidden_states = self.resnets[0](hidden_states, temb)
        hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames)

        t2d_attn_weight_lists = []
        t2d_cross_attn_weight_lists = []
        temp_attn_weight_lists = []
        temp_cross_attn_weight_lists = []

        last = len(self.attentions) - 1
        layers = zip(self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:])
        for idx, (spatial_attn, temporal_attn, resnet, temp_conv) in enumerate(layers):
            spatial_out = spatial_attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                return_attn=return_attn if idx == last else False,
            )
            if isinstance(spatial_out, tuple):
                spatial_out, self_weights, cross_weights = spatial_out
                t2d_attn_weight_lists.append(self_weights)
                t2d_cross_attn_weight_lists.append(cross_weights)
            hidden_states = spatial_out.sample

            temporal_out = temporal_attn(
                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
            )
            # The temporal model may return a bare output object or an (output, weights, weights) tuple.
            if isinstance(temporal_out, tuple):
                temporal_out, self_weights, cross_weights = temporal_out
                temp_attn_weight_lists.append(self_weights)
                temp_cross_attn_weight_lists.append(cross_weights)
            hidden_states = temporal_out.sample

            hidden_states = resnet(hidden_states, temb)
            hidden_states = temp_conv(hidden_states, num_frames=num_frames)

        return (
            hidden_states,
            t2d_attn_weight_lists,
            t2d_cross_attn_weight_lists,
            temp_attn_weight_lists,
            temp_cross_attn_weight_lists,
        )
class CrossAttnDownBlock3D(nn.Module):
    """
    Down block of `num_layers` x (resnet, temporal conv, spatial transformer, temporal transformer),
    optionally followed by a single `Downsample2D`.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        downsample_padding=1,
        add_downsample=True,
        dual_cross_attention=False,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
    ):
        super().__init__()
        resnets, attentions, temp_attentions, temp_convs = [], [], [], []

        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels

        # Sub-modules are created in the same order as before so seeded initialisation is unchanged.
        for layer_idx in range(num_layers):
            # Only the first resnet sees the block's input width.
            resnet_in_channels = in_channels if layer_idx == 0 else out_channels
            resnets.append(
                ResnetBlock2D(
                    in_channels=resnet_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            temp_convs.append(
                TemporalConvLayer(
                    out_channels,
                    out_channels,
                    dropout=0.1,
                )
            )
            attentions.append(
                Transformer2DModel(
                    out_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                )
            )
            temp_attentions.append(
                TransformerTemporalModel(
                    out_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            )

        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)
        self.attentions = nn.ModuleList(attentions)
        self.temp_attentions = nn.ModuleList(temp_attentions)

        self.downsamplers = None
        if add_downsample:
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                    )
                ]
            )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        temb=None,
        encoder_hidden_states=None,
        attention_mask=None,
        num_frames=1,
        cross_attention_kwargs=None,
        return_attn=False,
    ):
        """
        Run the block, recording the hidden state after every layer (and after downsampling).

        `return_attn` is forwarded only to the last spatial transformer; earlier ones are called with
        `return_attn=False`. `attention_mask` is accepted but not used here.

        Returns:
            `(hidden_states, output_states, t2d_attn_weight_lists, t2d_cross_attn_weight_lists,
            temp_attn_weight_lists, temp_cross_attn_weight_lists)`, where `output_states` is a tuple of the
            recorded hidden states and each list gains one entry per layer whose attention module
            returned a tuple.
        """
        output_states = ()

        t2d_attn_weight_lists = []
        t2d_cross_attn_weight_lists = []
        temp_attn_weight_lists = []
        temp_cross_attn_weight_lists = []

        last = len(self.attentions) - 1
        layers = zip(self.resnets, self.temp_convs, self.attentions, self.temp_attentions)
        for idx, (resnet, temp_conv, spatial_attn, temporal_attn) in enumerate(layers):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = temp_conv(hidden_states, num_frames=num_frames)

            spatial_out = spatial_attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                return_attn=return_attn if idx == last else False,
            )
            if isinstance(spatial_out, tuple):
                spatial_out, self_weights, cross_weights = spatial_out
                t2d_attn_weight_lists.append(self_weights)
                t2d_cross_attn_weight_lists.append(cross_weights)
            hidden_states = spatial_out.sample

            temporal_out = temporal_attn(
                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
            )
            # The temporal model may return a bare output object or an (output, weights, weights) tuple.
            if isinstance(temporal_out, tuple):
                temporal_out, self_weights, cross_weights = temporal_out
                temp_attn_weight_lists.append(self_weights)
                temp_cross_attn_weight_lists.append(cross_weights)
            hidden_states = temporal_out.sample

            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return (
            hidden_states,
            output_states,
            t2d_attn_weight_lists,
            t2d_cross_attn_weight_lists,
            temp_attn_weight_lists,
            temp_cross_attn_weight_lists,
        )


class DownBlock3D(nn.Module):
    """Down block of `num_layers` x (resnet, temporal conv), optionally followed by a single `Downsample2D`."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_downsample=True,
        downsample_padding=1,
    ):
        super().__init__()
        resnets, temp_convs = [], []

        for layer_idx in range(num_layers):
            # Only the first resnet sees the block's input width.
            resnet_in_channels = in_channels if layer_idx == 0 else out_channels
            resnets.append(
                ResnetBlock2D(
                    in_channels=resnet_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            temp_convs.append(
                TemporalConvLayer(
                    out_channels,
                    out_channels,
                    dropout=0.1,
                )
            )

        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)

        self.downsamplers = None
        if add_downsample:
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                    )
                ]
            )

        self.gradient_checkpointing = False

    def forward(self, hidden_states, temb=None, num_frames=1):
        """
        Run every (resnet, temporal conv) pair, then the optional downsampler.

        Returns:
            `(hidden_states, output_states)` where `output_states` is a tuple holding the hidden state after
            each layer, plus the downsampled state when downsamplers are present.
        """
        output_states = ()

        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
            hidden_states = temp_conv(resnet(hidden_states, temb), num_frames=num_frames)
            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states
+ prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + add_upsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): + super().__init__() + resnets = [] + temp_convs = [] + attentions = [] + temp_attentions = [] + + self.has_cross_attention = True + self.attn_num_head_channels = attn_num_head_channels + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temp_convs.append( + TemporalConvLayer( + out_channels, + out_channels, + dropout=0.1, + ) + ) + attentions.append( + Transformer2DModel( + out_channels // attn_num_head_channels, + attn_num_head_channels, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + ) + temp_attentions.append( + TransformerTemporalModel( + out_channels // attn_num_head_channels, + attn_num_head_channels, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.resnets = nn.ModuleList(resnets) + self.temp_convs = 
def forward(
    self,
    hidden_states,
    res_hidden_states_tuple,
    temb=None,
    encoder_hidden_states=None,
    upsample_size=None,
    attention_mask=None,
    num_frames=1,
    cross_attention_kwargs=None,
    return_attn=False,
):
    """Run every layer of the up block and optionally collect attention maps.

    Each layer concatenates the most recent skip connection onto
    ``hidden_states`` along dim 1 and then applies, in order, the resnet,
    the temporal convolution, the spatial transformer and the temporal
    transformer.  ``return_attn`` is forwarded only to the *last* spatial
    transformer; every earlier layer is called with ``return_attn=False``.

    Args:
        hidden_states: Input features of the block.
        res_hidden_states_tuple: Skip connections; consumed from the end.
        temb: Time embedding handed to each resnet.
        encoder_hidden_states: Conditioning passed to the spatial transformers.
        upsample_size: Forwarded to the upsamplers, if the block has any.
        attention_mask: Accepted but unused (see TODO below).
        num_frames: Forwarded to the temporal conv and temporal transformer.
        cross_attention_kwargs: Forwarded to both transformers.
        return_attn: Request attention maps from the last spatial transformer.

    Returns:
        A 5-tuple ``(hidden_states, spatial_self, spatial_cross,
        temporal_self, temporal_cross)``.  The two temporal lists are always
        empty because the temporal transformer's maps are discarded.
    """
    # TODO(Patrick, William) - attention mask is not used
    spatial_self_maps = []
    spatial_cross_maps = []
    temporal_self_maps = []   # never filled: temporal maps are dropped below
    temporal_cross_maps = []  # never filled: temporal maps are dropped below

    layers = zip(self.resnets, self.temp_convs, self.attentions, self.temp_attentions)
    for idx, (resnet, temp_conv, attn, temp_attn) in enumerate(layers):
        # Pop the newest skip connection and fuse it channel-wise.
        skip = res_hidden_states_tuple[-1]
        res_hidden_states_tuple = res_hidden_states_tuple[:-1]
        hidden_states = torch.cat([hidden_states, skip], dim=1)

        hidden_states = resnet(hidden_states, temb)
        hidden_states = temp_conv(hidden_states, num_frames=num_frames)

        # Only the final layer is allowed to hand back attention maps.
        is_last_layer = idx == len(self.attentions) - 1
        attn_out = attn(
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            cross_attention_kwargs=cross_attention_kwargs,
            return_attn=return_attn if is_last_layer else False,
        )
        if isinstance(attn_out, tuple):
            attn_out, self_maps, cross_maps = attn_out
            spatial_self_maps.append(self_maps)
            spatial_cross_maps.append(cross_maps)
        hidden_states = attn_out.sample

        temporal_out = temp_attn(
            hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
        )
        if isinstance(temporal_out, tuple):
            # Temporal attention maps are intentionally thrown away.
            temporal_out, _, _ = temporal_out
        hidden_states = temporal_out.sample

    if self.upsamplers is not None:
        for upsampler in self.upsamplers:
            hidden_states = upsampler(hidden_states, upsample_size)

    return hidden_states, spatial_self_maps, spatial_cross_maps, temporal_self_maps, temporal_cross_maps
def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, num_frames=1):
    """Apply each (resnet, temporal conv) pair, then the optional upsamplers.

    Skip connections are taken from the end of ``res_hidden_states_tuple``,
    one per layer, and concatenated onto ``hidden_states`` along dim 1 before
    the resnet runs.

    Args:
        hidden_states: Input features of the block.
        res_hidden_states_tuple: Skip connections; the last entry is used first.
        temb: Time embedding handed to each resnet.
        upsample_size: Forwarded to every upsampler.
        num_frames: Forwarded to every temporal convolution.

    Returns:
        The transformed ``hidden_states``.
    """
    for depth, (resnet, temp_conv) in enumerate(zip(self.resnets, self.temp_convs)):
        # Layer `depth` consumes the (depth + 1)-th skip counted from the end.
        skip = res_hidden_states_tuple[-(depth + 1)]
        fused = torch.cat([hidden_states, skip], dim=1)
        hidden_states = temp_conv(resnet(fused, temb), num_frames=num_frames)

    if self.upsamplers is None:
        return hidden_states

    for upsampler in self.upsamplers:
        hidden_states = upsampler(hidden_states, upsample_size)
    return hidden_states
@register_to_config
def __init__(
    self,
    sample_size: Optional[int] = None,
    in_channels: int = 4,
    out_channels: int = 4,
    down_block_types: Tuple[str] = (
        "CrossAttnDownBlock3D",
        "CrossAttnDownBlock3D",
        "CrossAttnDownBlock3D",
        "DownBlock3D",
    ),
    up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
    block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
    layers_per_block: int = 2,
    downsample_padding: int = 1,
    mid_block_scale_factor: float = 1,
    act_fn: str = "silu",
    norm_num_groups: Optional[int] = 32,
    norm_eps: float = 1e-5,
    cross_attention_dim: int = 1024,
    attention_head_dim: Union[int, Tuple[int]] = 64,
):
    """Build the UNet layers using the block factories imported by this module.

    Construction order: input conv, timestep projection/embedding, an input
    temporal transformer, the down blocks, the mid block, the up blocks and
    finally the output norm/activation/conv.

    Args:
        sample_size: Stored on the instance as ``self.sample_size``.
        in_channels: Input channels of ``conv_in``.
        out_channels: Output channels of ``conv_out``.
        down_block_types: One block type name per down stage.
        up_block_types: One block type name per up stage.
        block_out_channels: Output width of each down stage (reversed for up).
        layers_per_block: Layers per down block; up blocks get one more.
        downsample_padding: Forwarded to ``get_down_block``.
        mid_block_scale_factor: ``output_scale_factor`` of the mid block.
        act_fn: Activation name for the time embedding and resnets.
        norm_num_groups: Group count for the norms; ``None`` disables the
            output norm and activation.
        norm_eps: Epsilon for the norms.
        cross_attention_dim: Forwarded to every block factory and the mid block.
        attention_head_dim: Single value or one value per down stage.

    Raises:
        ValueError: If the lengths of ``down_block_types``, ``up_block_types``,
            ``block_out_channels`` or a tuple ``attention_head_dim`` disagree.
    """
    # NOTE(review): the parent constructor is called with no arguments; every
    # layer attribute assigned below is then (re)assigned by this method.
    super().__init__()

    self.sample_size = sample_size

    # Check inputs
    if len(down_block_types) != len(up_block_types):
        raise ValueError(
            f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
        )

    if len(block_out_channels) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
        )

    # input
    conv_in_kernel = 3
    conv_out_kernel = 3
    # "same" padding for an odd kernel size
    conv_in_padding = (conv_in_kernel - 1) // 2
    self.conv_in = nn.Conv2d(
        in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
    )

    # time
    time_embed_dim = block_out_channels[0] * 4
    self.time_proj = Timesteps(block_out_channels[0], True, 0)
    timestep_input_dim = block_out_channels[0]

    self.time_embedding = TimestepEmbedding(
        timestep_input_dim,
        time_embed_dim,
        act_fn=act_fn,
    )

    # NOTE(review): `attention_head_dim` is passed here *before* it is
    # broadcast to a tuple below, so a tuple argument reaches this call
    # unchanged — confirm TransformerTemporalModel accepts that.
    self.transformer_in = TransformerTemporalModel(
        num_attention_heads=8,
        attention_head_dim=attention_head_dim,
        in_channels=block_out_channels[0],
        num_layers=1,
    )

    # class embedding
    self.down_blocks = nn.ModuleList([])
    self.up_blocks = nn.ModuleList([])

    # Broadcast a scalar head dim to one entry per stage.
    if isinstance(attention_head_dim, int):
        attention_head_dim = (attention_head_dim,) * len(down_block_types)

    # down
    output_channel = block_out_channels[0]
    for i, down_block_type in enumerate(down_block_types):
        input_channel = output_channel
        output_channel = block_out_channels[i]
        is_final_block = i == len(block_out_channels) - 1

        down_block = get_down_block(
            down_block_type,
            num_layers=layers_per_block,
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=time_embed_dim,
            add_downsample=not is_final_block,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=attention_head_dim[i],
            downsample_padding=downsample_padding,
            dual_cross_attention=False,
        )
        self.down_blocks.append(down_block)

    # mid
    self.mid_block = UNetMidBlock3DCrossAttn(
        in_channels=block_out_channels[-1],
        temb_channels=time_embed_dim,
        resnet_eps=norm_eps,
        resnet_act_fn=act_fn,
        output_scale_factor=mid_block_scale_factor,
        cross_attention_dim=cross_attention_dim,
        attn_num_head_channels=attention_head_dim[-1],
        resnet_groups=norm_num_groups,
        dual_cross_attention=False,
    )

    # count how many layers upsample the images
    self.num_upsamplers = 0

    # up
    reversed_block_out_channels = list(reversed(block_out_channels))
    reversed_attention_head_dim = list(reversed(attention_head_dim))

    output_channel = reversed_block_out_channels[0]
    for i, up_block_type in enumerate(up_block_types):
        is_final_block = i == len(block_out_channels) - 1

        prev_output_channel = output_channel
        output_channel = reversed_block_out_channels[i]
        # Width of the next stage; clamped to the last entry for the final block.
        input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

        # add upsample block for all BUT final layer
        if not is_final_block:
            add_upsample = True
            self.num_upsamplers += 1
        else:
            add_upsample = False

        up_block = get_up_block(
            up_block_type,
            num_layers=layers_per_block + 1,
            in_channels=input_channel,
            out_channels=output_channel,
            prev_output_channel=prev_output_channel,
            temb_channels=time_embed_dim,
            add_upsample=add_upsample,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=reversed_attention_head_dim[i],
            dual_cross_attention=False,
        )
        self.up_blocks.append(up_block)
        # Redundant: the same assignment is made at the top of the next iteration.
        prev_output_channel = output_channel

    # out
    if norm_num_groups is not None:
        self.conv_norm_out = nn.GroupNorm(
            num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
        )
        self.conv_act = nn.SiLU()
    else:
        self.conv_norm_out = None
        self.conv_act = None

    conv_out_padding = (conv_out_kernel - 1) // 2
    self.conv_out = nn.Conv2d(
        block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
    )
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple]: + r""" + Args: + sample (`torch.FloatTensor`): (batch, num_frames, channel, height, width) noisy inputs tensor + timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps + encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet3DConditionOutput`] instead of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Returns: + [`~models.unet_2d_condition.UNet3DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. 
+ default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + num_frames = sample.shape[2] + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + emb = emb.repeat_interleave(repeats=num_frames, dim=0) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) + + # 2. 
pre-process + sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:]) + sample = self.conv_in(sample) + + sample = self.transformer_in( + sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs + ).sample + + t2d_attn_weight_listss, t2d_cross_attn_weight_listss, temp_attn_weight_listss, temp_cross_attn_weight_listss = [], [], [], [] + + # 3. down + down_block_res_samples = (sample,) + + for i, downsample_block in enumerate(self.down_blocks): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_attn=True, + ) + if isinstance(sample, tuple): + sample, res_samples, t2d_attn_weight_lists, t2d_cross_attn_weight_lists, temp_attn_weight_lists, temp_cross_attn_weight_lists = sample + ### + t2d_attn_weight_listss.extend(t2d_attn_weight_lists) + t2d_cross_attn_weight_listss.extend(t2d_cross_attn_weight_lists) + temp_attn_weight_listss.extend(temp_attn_weight_lists) + temp_cross_attn_weight_listss.extend(temp_cross_attn_weight_lists) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) + + down_block_res_samples += res_samples + + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. 
mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_attn=True, + ) + if isinstance(sample, tuple): + sample, mid_t2d_attn_weight_lists, mid_t2d_cross_attn_weight_lists, mid_temp_attn_weight_lists, mid_temp_cross_attn_weight_lists = sample + ### + t2d_attn_weight_listss.extend(mid_t2d_attn_weight_lists) + t2d_cross_attn_weight_listss.extend(mid_t2d_cross_attn_weight_lists) + temp_attn_weight_listss.extend(mid_temp_attn_weight_lists) + temp_cross_attn_weight_listss.extend(mid_temp_cross_attn_weight_lists) + + if mid_block_additional_residual is not None: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + upsample_size=upsample_size, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_attn=True, + ) + + if isinstance(sample, tuple): + sample, up_t2d_attn_weight_lists, up_t2d_cross_attn_weight_lists, up_temp_attn_weight_lists, up_temp_cross_attn_weight_lists = sample + ### + t2d_attn_weight_listss.extend(up_t2d_attn_weight_lists) + t2d_cross_attn_weight_listss.extend(up_t2d_cross_attn_weight_lists) + 
def spherical_dist_loss(x, y):
    """Return ``2 * arcsin(||x_hat - y_hat|| / 2) ** 2`` along the last dim.

    Both inputs are L2-normalised along ``dim=-1`` first, so only their
    directions matter.  The result has the inputs' shape minus the last dim.
    """
    x = F.normalize(x, dim=-1)
    y = F.normalize(y, dim=-1)
    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)


class CLIP(nn.Module):
    """CLIP wrapper exposing text/image embeddings and embedding-space losses."""

    def __init__(self, device, clip_name='openai/clip-vit-base-patch32'):
        """Load the CLIP model, tokenizer and preprocessing for ``clip_name``.

        Args:
            device: Device the CLIP model and tokenised text are moved to.
            clip_name: Hugging Face model identifier.
        """
        super().__init__()

        self.device = device

        self.feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_name)
        # Honour the requested device instead of unconditionally calling .cuda().
        self.clip_model = CLIPModel.from_pretrained(clip_name).to(self.device)
        self.tokenizer = CLIPTokenizer.from_pretrained(clip_name)

        self.normalize = transforms.Normalize(
            mean=self.feature_extractor.image_mean, std=self.feature_extractor.image_std
        )
        self.resize = transforms.Resize(224)

    def _embed_image(self, img):
        """Resize, normalise and encode ``img``; return unit-norm features."""
        img = self.normalize(self.resize(img))
        image_z = self.clip_model.get_image_features(img)
        return image_z / image_z.norm(dim=-1, keepdim=True)  # normalize features

    def get_text_embeds(self, prompt, neg_prompt=None, dir=None):
        """Return unit-norm CLIP text features for ``prompt``.

        ``neg_prompt`` and ``dir`` are accepted for interface compatibility
        and are not used.
        """
        clip_text_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        ).input_ids.to(self.device)
        text_z = self.clip_model.get_text_features(clip_text_input)
        return text_z / text_z.norm(dim=-1, keepdim=True)

    def set_epoch(self, epoch):
        """No-op; kept so callers can invoke it unconditionally."""
        pass

    def get_img_embeds(self, img):
        """Return unit-norm CLIP image features for a 4-D image batch."""
        assert len(img.shape) == 4
        return self._embed_image(img)

    def train_step(self, text_z, pred_rgb, image_ref_clip, **kwargs):
        """Return the spherical loss between ``pred_rgb`` and ``image_ref_clip``.

        ``text_z`` and ``**kwargs`` are accepted but not used.
        """
        return spherical_dist_loss(self._embed_image(pred_rgb), image_ref_clip)

    def text_loss(self, text_z, pred_rgb):
        """Return the spherical loss between ``pred_rgb`` and text features."""
        return spherical_dist_loss(self._embed_image(pred_rgb), text_z)

    def img_loss(self, img_ref_z, pred_rgb):
        """Return the spherical loss between ``pred_rgb`` and reference features."""
        return spherical_dist_loss(self._embed_image(pred_rgb), img_ref_z)

    def img_img_loss(self, gt_rgb, pred_rgb):
        """Return the spherical loss between two image batches.

        Both ``gt_rgb`` and ``pred_rgb`` are embedded with the same
        resize/normalise/encode pipeline.  The previous implementation
        referenced an undefined ``img_ref_z`` and raised ``NameError`` on
        every call without ever using ``gt_rgb``.
        """
        return spherical_dist_loss(self._embed_image(pred_rgb), self._embed_image(gt_rgb))


if __name__ == '__main__':
    clip = CLIP('cuda')
    im = torch.randn((1, 3, 512, 512)).cuda()
    res = clip.get_img_embeds(im)
    print(res.shape)
+ "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[-1. 0. -1. ]\n", + " [-0.6 0.7608 -0.6 ]\n", + " [-0.2 1.4432 -0.2 ]\n", + " [ 0.2 2.0472 0.2 ]\n", + " [ 0.6 2.5728 0.6 ]\n", + " [ 1. 3.02 1. ]\n", + " [ 1.4 3.3888 1.4 ]\n", + " [ 1.8 3.6792 1.8 ]\n", + " [ 2.2 3.8912 2.2 ]\n", + " [ 2.6 4.0248 2.6 ]\n", + " [ 3. 4.08 3. ]]\n", + "(11, 3)\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAGlCAYAAADQ/XDvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAACyM0lEQVR4nOydd3xbZ73/PxqWty3LK44TbyeOkzgeWU466UybNumAtvTSAfdSoC0t/cGFMkqBy4XSCxTKpbTA7aAESpu0paFtuhK628S2vEfseA9ZsuShLZ3z/P4Iz+mRrK2j5Zz365UX1JZ1jqSj53Oe7/h8JYQQAhERERERkRCRxvoEREREREQSG1FIRERERETCQhQSEREREZGwEIVERERERCQsRCEREREREQkLUUhERERERMJCFBIRERERkbAQhUREREREJCxEIRERERERCQtRSLzwxBNPQCKRYGRkJO7O47zzzsN5550X9XOJ1XGDQaPR4Nprr0Vubi4kEgkeeughwZ57ZGQEEokE//M//+P3sffffz8kEolgx04EJBIJ7r///lifhkgMOGOE5Morr0RaWhqWlpa8PubGG2+EQqHA3NxcFM8svujp6cH9998fcwENla997Ws4cuQI7r33XvzpT3/CpZde6vPxJpMJP/rRj1BXV4e0tDRkZ2fj7LPPxlNPPYV4dg96+eWXA1q06Y2Iv39lZWURP+dwMJvNuP/++3Hs2LFYn4qIJ8gZwl//+lcCgDz55JMef28ymUh6ejq54oorCCGEOJ1OYrFYCMuy0TzNZTz++OMEABkeHuZ+ZrPZiM1mi8jxnn32WQKAHD16dNnvInlcoSgsLCQ33nhjQI+dmZkhGzduJFKplHz2s58ljz76KPnVr35FzjnnHAKAXHfddcTpdHKPHx4eJgDIgw8+6Pe5HQ4HsVgsIb8Of9x+++0kkK/v0NAQ+dOf/uTyLzk5mZx99tkuP3v++efDPieLxUIcDkfYz+MJrVZLAJDvf//7EXl+kfCQx1DDosqVV16JzMxMHDhwADfddNOy37/44oswmUy48cYbAQAymQwymSzapxkQCoXijDpuMMzOzkKpVAb02Jtvvhm9vb14/vnnceWVV3I//+pXv4pvfOMb+J//+R80NDTgm9/8ZtDnIZfLIZfH/utVUVGBiooKl5996UtfQkVFBf7t3/7N6985nU6wLBvUZ56SkhLyecYKk8mE9PT0WJ9G4hNrJYsmN998M5HL5USj0Sz73d69e0lmZiYxm82EEM87gePHj5OLL76Y5ObmkpSUFFJWVkZuvfVW7vdHjx71eDdP72Qff/xx7mft7e3k5ptvJuXl5SQ5OZkUFhaSW2+9leh0Ope/9XQe5557Ljn33HO5/y4tLSUAPP6j5zIyMkK+/OUvk3Xr1pGUlBSiUqnItdde6/K89FjensP9uIQQotFoyOc//3lSUFBAkpOTSV1dHXniiSc8vv4HH3yQPProo6SiooIoFAqydetW8vHHHy/7LDwxNDRErr32WpKTk0NSU1PJjh07yOHDh/2euzc++OADAoB8/vOf9/
h7h8NBqqurSU5ODndN8F/HL37xC1JSUkJSUlLIOeecQzo7O13+/vvf/77H4//pT38ijY2NJCUlheTk5JDrrruOjI2NLXvchx9+SPbs2UOUSiVJS0sjmzdvJg899BAh5PR1HMxrdSc9PZ3cfPPN3H/zX9cvf/lLUlFRQaRSKWlrayM2m41873vfI42NjSQrK4ukpaWRs846i7z11lvLnhcedgwTExPk1ltvJQUFBUShUJDa2lryxz/+cdnfWiwW8v3vf59UV1eT5ORksmrVKnLVVVeRwcFB7vzc//GP9eabb5KzzjqLpKWlkezsbHLllVeSnp4el2PQz6S7u5vccMMNRKlUkvr6evJ///d/BABpbW1ddl4//vGPiVQqJRMTEwG/v2cisb9liiI33ngjnnzySfztb3/DHXfcwf1cr9fjyJEjuOGGG5Camurxb2dnZ3HxxRcjPz8f3/rWt6BUKjEyMoJDhw6FdC6vv/46Tp06hVtvvRWrVq1Cd3c3HnvsMXR3d+PDDz8MKlH70EMPwWg0uvzsl7/8JdRqNXJzcwEAx48fx/vvv4/rr78ea9aswcjICB555BGcd9556OnpQVpaGs455xx89atfxa9//Wt8+9vfxoYNGwCA+193LBYLzjvvPAwODuKOO+5AeXk5nn32Wdxyyy2Yn5/HXXfd5fL4AwcOYGlpCbfddhskEgl+9rOf4eqrr8apU6eQlJTk9fVpNBrs2rULZrMZX/3qV5Gbm4snn3wSV155JZ577jlcddVVOOecc/CnP/0Jn/vc53DRRRd53HXyeemllwDA6+Pkcjk++9nP4gc/+AHee+89XHjhhdzvnnrqKSwtLeH222+H1WrFr371K3zqU59CZ2cnCgsLvR7zxz/+Mb73ve/hM5/5DP793/8dWq0WDz/8MM455xy0tbVxO6nXX38de/fuRVFREe666y6sWrUKvb29OHz4MO666y7cdtttmJqawuuvv44//elPPl9nMDz++OOwWq344he/iOTkZKhUKiwuLuIPf/gDbrjhBvzHf/wHlpaW8Mc//hGXXHIJPv74Y9TX13t9Po1Gg507d0IikeCOO+5Afn4+XnnlFXzhC1/A4uIi7r77bgAAwzDYu3cv3nzzTVx//fW46667sLS0hNdffx1dXV248MIL8cgjj+DLX/4yrrrqKlx99dUAgLq6OgDAG2+8gT179qCiogL3338/LBYLHn74YezevRutra3L8j+f/vSnUV1djf/+7/8GIQTXXnstbr/9dvz5z39GQ0ODy2P//Oc/47zzzkNxcbFg7/OKJNZKFk2cTicpKioizc3NLj//3e9+RwCQI0eOcD9z3wk8//zzBAA5fvy41+cPZkdC73L5/OUvfyEAyNtvv+31PAjxvDPg87e//Y0AID/84Q99Ho/elT/11FPcz3zlSNyP+9BDDxEA5Omnn+Z+ZrfbSXNzM8nIyCCLi4surz83N5fo9XrusS+++CIBQF566SWvr4UQQu6++24CgLzzzjvcz5aWlkh5eTkpKysjDMNwPwdAbr/9dp/PRwgh+/fvJwCIwWDw+phDhw4RAOTXv/61y+tITU11uUP96KOPCADyta99jfuZ+45kZGSEyGQy8uMf/9jlGJ2dnUQul3M/dzqdpLy8nJSWli47N36+LtAciSe87UiysrLI7Oysy2OdTueyvJjBYCCFhYXLdnNw2yV84QtfIEVFRct22ddffz3Jzs7mrkm6I/jFL36x7Fzpa/aVI6mvrycFBQVkbm6O+1l7ezuRSqXkpptu4n5GP5Mbbrhh2XPccMMNZPXq1S7XUmtr67LvrYhnzpiqLeB03uP666/HBx984FKVdODAARQWFuKCCy7w+rf0bvHw4cNwOBxhnwt/52O1WqHT6bBz504AQGtra8jP29PTg89//vPYt28fvvvd73o8nsPhwNzcHKqqqqBUKkM+3ssvv4xVq1bhhhtu4H6WlJSEr371qzAajfjnP//p8vjrrrsOOT
k53H+fffbZAIBTp075Pc727dtx1llncT/LyMjAF7/4RYyMjKCnpyfoc6fVe5mZmV4fQ3+3uLjo8vP9+/e73KFu374dO3bswMsvv+z1uQ4dOgSWZfGZz3wGOp2O+7dq1SpUV1fj6NGjAIC2tjYMDw/j7rvvXpbriXQ58TXXXIP8/HyXn8lkMi5PwrIs9Ho9nE4ntm7d6vO6IYTg4MGDuOKKK0AIcXnNl1xyCRYWFri/P3jwIPLy8nDnnXcuex5/r3l6ehpqtRq33HILVCoV9/O6ujpcdNFFHj+TL33pS8t+dtNNN2Fqaor7HIDTu5HU1FRcc801Ps9B5Awq/6XQZPqBAwcAABMTE3jnnXdw/fXX+0yun3vuubjmmmvwgx/8AHl5edi3bx8ef/xx2Gy2kM5Dr9fjrrvuQmFhIVJTU5Gfn4/y8nIAwMLCQkjPubi4iKuvvhrFxcV46qmnXL6EFosF9913H9auXYvk5GTk5eUhPz8f8/PzIR9vdHQU1dXVkEpdLyMaChsdHXX5eUlJict/U1ExGAx+j7N+/fplP/d2nECgIuGrHNyb2FRXVy977Lp163yWTJ88eRKEEFRXVyM/P9/lX29vL2ZnZwEAQ0NDAIBNmzYF9XqEgF5/7jz55JOoq6tDSkoKcnNzkZ+fj3/84x8+rxutVov5+Xk89thjy17vrbfeCgAur3n9+vUhFSfQz97b9aHT6WAymfy+zosuughFRUX485//DOC0aP7lL3/Bvn37fN5siJzmjMqRAEBTUxNqamrwl7/8Bd/+9rfxl7/8BYQQTmC8IZFI8Nxzz+HDDz/ESy+9hCNHjuDzn/88fv7zn+PDDz9ERkaG17snhmGW/ewzn/kM3n//fXzjG99AfX09MjIywLIsLr30UrAsG9Jru+WWWzA1NYWPP/4YWVlZLr+788478fjjj+Puu+9Gc3MzsrOzIZFIcP3114d8vGDxJtQkBv0aGzZswAsvvICOjg6cc845Hh/T0dEBAKitrQ37eCzLQiKR4JVXXvH4PmRkZIR9jHDxlB98+umnccstt2D//v34xje+gYKCAshkMvzkJz/hRM8T9Jr6t3/7N9x8880eH0NzHNHG0+uUyWT47Gc/i9///vf47W9/i/feew9TU1M+K9tEPuGMExLg9K7ke9/7Hjo6OnDgwAFUV1dj27ZtAf3tzp07sXPnTvz4xz/GgQMHcOONN+Kvf/0r/v3f/527w56fn3f5G/c7ZoPBgDfffBM/+MEPcN9993E/P3nyZMiv6ac//SleeOEFHDp0CDU1Nct+/9xzz+Hmm2/Gz3/+c+5nVqt12bkGEz4pLS1FR0cHWJZ12ZX09fVxvxeC0tJS9Pf3L/t5OMfZu3cvfvKTn+Cpp57yKCQMw+DAgQPIycnB7t27XX7n6XMaGBjw2dRXWVkJQgjKy8uxbt06n48DwCWZvRGtrvnnnnsOFRUVOHTokMsxv//97/v8u/z8fGRmZoJhGJ+vAzj9mj/66CM4HA6vRRfeXi/97L1dH3l5eQGX99500034+c9/jpdeegmvvPIK8vPzcckllwT0t2c6Z1xoC/gkvHXfffdBrVb73Y0Apxd/9ztnWrFCw1ulpaWQyWR4++23XR7329/+1uW/6R2p+/OFaufxxhtv4Lvf/S6+853vYP/+/R4fI5PJlh3v4YcfXrZbol86d4HxxGWXXYaZmRk888wz3M+cTicefvhhZGRk4Nxzzw3uhfg4zscff4wPPviA+5nJZMJjjz2GsrKykHYMu3btwoUXXojHH38chw8fXvb773znOxgYGMB//ud/LruDfeGFFzA5Ocn998cff4yPPvoIe/bs8Xq8q6++GjKZDD/4wQ+WfQ6EEM5NobGxEeXl5XjooYeWfQb8vwvmcwoHT9fqRx995PJZePu7a665BgcPHkRXV9ey32u1Wu7/X3PNNdDpdPjNb36z7HH0uGlpaQCWv96ioiLU19fjySefdPldV1cXXnvtNVx22WW+Xy
CPuro61NXV4Q9/+AMOHjyI66+/Pi56gRKBM/JdKi8vx65du/Diiy8CQEBC8uSTT+K3v/0trrrqKlRWVmJpaQm///3vkZWVxV2s2dnZ+PSnP42HH34YEokElZWVOHz4MBcLpmRlZeGcc87Bz372MzgcDhQXF+O1117D8PBwSK/nhhtuQH5+Pqqrq/H000+7/O6iiy5CYWEh9u7diz/96U/Izs5GbW0tPvjgA7zxxhtceTClvr4eMpkMDzzwABYWFpCcnIxPfepTKCgoWHbcL37xi3j00Udxyy23oKWlBWVlZXjuuefw3nvv4aGHHhIstvytb30Lf/nLX7Bnzx589atfhUqlwpNPPonh4WEcPHhwWY4mUJ566ilccMEF2LdvHz772c/i7LPPhs1mw6FDh3Ds2DFcd911+MY3vrHs76qqqnDWWWfhy1/+Mmw2Gx566CHk5ubiP//zP70eq7KyEv/1X/+Fe++9FyMjI9i/fz8yMzMxPDyM559/Hl/84hfx9a9/HVKpFI888giuuOIK1NfX49Zbb0VRURH6+vrQ3d2NI0eOADgdogVON09ecsklXCGJ0OzduxeHDh3CVVddhcsvvxzDw8P43e9+h9ra2mUl5+789Kc/xdGjR7Fjxw78x3/8B2pra6HX69Ha2oo33ngDer0ewOmdwFNPPYV77rkHH3/8Mc4++2yYTCa88cYb+MpXvoJ9+/YhNTUVtbW1eOaZZ7Bu3TqoVCps2rQJmzZtwoMPPog9e/agubkZX/jCF7jy3+zs7KC9v2666SZ8/etfBwAxrBUMsSgViwf+93//lwAg27dv9/h797Lb1tZWcsMNN5CSkhKSnJxMCgoKyN69e8mJEydc/k6r1ZJrrrmGpKWlkZycHHLbbbeRrq6uZWWEExMT5KqrriJKpZJkZ2eTT3/602RqampZiWMg5b/w0owIXhmvwWAgt956K8nLyyMZGRnkkksuIX19faS0tNSlFJQQQn7/+9+TiooKIpPJAmpIpM+rUCjI5s2bl5VL+rIWcX+93qANiUqlkqSkpJDt27e7NCTyny+Q8l/K0tISuf/++8nGjRtJamoqyczMJLt37yZPPPHEMnsc/uv4+c9/TtauXcvZjbS3t7s81ltD4sGDB8lZZ51F0tPTSXp6OqmpqSG333476e/vd3ncu+++Sy666CKSmZlJ0tPTSV1dHXn44Ye53zudTnLnnXeS/Px8IpFIBGtIdIdlWfLf//3fpLS0lCQnJ5OGhgZy+PBhcvPNN5PS0lKXx3r6LDUaDbn99tvJ2rVrSVJSElm1ahW54IILyGOPPebyOLPZTL7zne+Q8vJy7nHXXnstGRoa4h7z/vvvk6amJqJQKJYd64033iC7d+8mqampJCsri1xxxRVeGxK1Wq3X92Z6eprIZDKybt06r48RWY6EkDh2phMRSVC+973v4Sc/+QmcTmesTyUqMAwDuVyOH/3oRy5l54mGTqdDUVER7rvvPnzve9+L9ekkDGdkjkREJNJMT08jLy8v1qcRNaanpwEg4V/zE088AYZh8LnPfS7Wp5JQnJE5EhGRSHHq1Ck8//zzePbZZ7F3795Yn05UeO6557i+pfPPPz/WpxMSb731Fnp6evDjH/8Y+/fvj3tb/XhDDG2JiAjIE088ga9+9as477zz8Pvf/96n99ZKoaKiAhKJBN/97ne5ZsNE47zzzsP777+P3bt34+mnnxa9tYJEFBIRERERkbAQcyQiIiIiImEhComIiIiISFiIQiIiIiIiEhaikIiIiIiIhIUoJCIiIiIiYSEKiYiIiIhIWIhCIiIiIiISFqKQiIiIiIiEhSgkIiIiIiJhIQqJiIiIiEhYiEIiIiIiIhIWopCIiIiIiISFKCQiIiIiImEhComIiIiISFiIQiIiIiIiEhaikIiIiIiIhIUoJCIiIiIiYSEKiYiIiIhIWIhCIiIiIiISFqKQiIiIiIiEhSgkIiIiIiJhIQqJiIiIiEhYiEIiIiIiIh
IWopCIiIiIiISFKCQiIiIiImEhComIiIiISFiIQiIiIiIiEhaikIiIiIiIhIUoJCIiIiIiYSEKiYiIiIhIWIhCIiIiIiISFvJYn4DImQUhBAzDwGazQSaTcf+kUvGeRkQkURGFRCRqEELgcDjgdDphs9m4n0ulUsjlcsjlclFYREQSEAkhhMT6JERWPgzDwOFwgGVZSCQS2O12SKVSEEJACAHLsiCEQCKRQCKRiMIiIpJAiEIiElEIIXA6nXA6nQAAiUTC7UwkEonHx1NRodDHKRQKJCUlQS6Xe/xbERGR2CCGtkQiBsuy3C4EALfboCJBdyB8JBIJZDIZ999UWD788EOsX78eSqUSUqkUMpnMZdciCouISOwQhUREcOji73A4XMJV7o8JZPGnwkL/VyaTcc9tt9shkUg4YUlKSuIeIwqLiEj0EIVERFBo2KqrqwsFBQXIy8sTZFGnz+Ftx+IuLO45FlFYREQihygkIoJBF3SGYbC0tIScnBzBFnB+SMz951RY6O9ZloXdbofNZhOFRUQkCohCIhI2tDfE6XSCZVlIpVKvC38koeIgCouISHQRhUQkLGgoi2EYAOBEhJb2CkUowuRJWOg/m80Gu93OnbMoLCIioSMKiUjI0Dt9/i6Ej7eFP1aLND/pL5PJlgkLf8dCy4xpD4soLCIi3hGFRCRoaCiLVmV5Wmj97SCCXZgjESrzJSxWq5V7jCgsIiK+EYVEJChYloXT6VwWynLH18LPMAy0Wi0yMjKQlpYW0fMNhkCFxb2HRRQWkTMdUUhEAiKQ3hA+3oTEaDRCrVZz5brJycnIycnh/iUnJwf1fJHEm7CwLMsJi1QqXZZjEYVF5ExDFBIRv3iyOfG3ULov/IQQTE5Oore3FyUlJSgpKQHLslhcXITBYMD4+Dh6enqQlpbGiYpSqYRCoYjoawsGX8Jis9lgtVohlUrhdDqhUCiQkpIiCovIGYEoJCI+4feG0HxBIPCFxOl0oru7G3Nzc6ivr0deXh5n2pibm4vc3FwAgMPhwPz8PAwGA4aHh2EymZCeno6cnBwXIYsX3AWVCktPTw/y8vJQVFTkkmOhIbFAhFhEJJEQhUTEI956QwKFCsni4iLUajVSUlKwa9cupKSkeA1RJSUlIT8/H/n5+QAAu93OCYvNZkNvby8mJia4HUt2djbk8vi5hKlA8O1c+O8j/3fuPmGisIgkMvHzLRSJG7z1hgSLwWDAqVOnUFFRgYqKiqCfQ6FQoKCgAAUFBTAajSgsLIRcLofBYEB/fz9sNhuysrI4YcnKynKxT4kVVCipQNBdHN2xOJ1Ozv2YCgvfJ0y0zBdJNEQhEXHBX29IIDgcDhgMBjgcDjQ1NUGlUglybnK5HKtWrcKqVasAABaLBQaDAQaDAVNTU3A6ncjOzoZSqYRKpUJmZmZcLcqisIisVEQhEQEQWG9IIBgMBrS3twMA1qxZI5iIeDqX1NRUpKamYvXq1SCEwGw2c8IyMTEBlmWRnZ3N7VgyMzPjKoTkT1gAcXqkSGIgComIIKEsQgiGh4cxNDSE6upqGI1GwRc8fw2O6enpSE9Px5o1a0AIgclk4oRlZGQEEokESqWSE5b09PSICUsoz+tNWPjOxuL0SJF4RBSSMxz+CNxQdyE2mw0dHR0wm83Yvn07srOz0dPTI7jXVrCPz8jIQEZGBtauXQuWZWE0GmEwGDA3N4ehoSHIZDKXHpbU1FRBhEWo1+1JWGgVHd2xuAuLOD1SJBaIQnKG4t4bEqqIzM3NoaOjAzk5Odi1axeSkpIABGaREuyCG84CLZVKkZWVhaysLJSWlrr0sGg0GgwMDEChUHD9K1RY4olAZrHwhUWcHikSLUQhOQNxH4EbSmiEZVkMDQ1hZGQENTU1WLNmjcuCJZFIuOcXAqEXQ6lUCqVSCaVSifLycjAMg4WFBS5x39/fH3DXfawIdMiXOD1SJNKIQnIGwV9owgllWa1WtLe3w+FwYOfOncjMzFz2GK
EXq0hbpMhkMqhUKq44wOl0Yn5+HvPz8x677nNycrjdl7fzjTbi9EiRWCEKyRkCNR7s6+vDunXrQl5AZmdn0dnZicLCQtTU1HhtCBR6RxJt5HI58vLykJeXB2B5131XVxcyMjJc7FzoexFtTzBviNMjRaKFKCRnAPwE7djYGKqrq4NeLFiWRX9/PyYmJrBx40asXr3a5+MTbUfiD09d97Qi7OTJk7BarcjMzEROTo5L2DBe8DU9sqenBwqFAiUlJaKwiISEKCQrGHebE/dFJFDMZjPUajUAYNeuXUhPT/f7N4m+I/GHQqFAYWEhCgsLAZwO91FhsVgs6Ovrw/T0tIudSzyV6fKFhWVZbvdCDSjF6ZEiwSAKyQrFU28IJZgFfnp6Gt3d3SguLsb69euDMm0UkljvSPyRkpKCoqIiFBUVwWw2o7CwEDKZbFnXPb85Ml6EhT8WgO8R5mt6JC01Fp2NRQBRSFYk/npDAlmQGYZBb28vNBoNNm/ezN15B8pK35H4g+5YPHXdj42NgRDi0hyZkZERswWZOhnwEadHigSDKCQrCH5viK8RuP4WeDp8Si6XY9euXSH1U5xpOxJfeOq6p82RNHkvkUhcEveR7Lp3h+5I/L0GcXqkiDdEIVkheOoN8fQFlkqlXhdk/vCp0tJSVFVVhRx+CWThD2QBc398ouBvemRmZiYyMzO5AV9LS0swGAzQarUYHByEXC532bEI1XXviWA/B/oa/E2PFIXlzEEUkgQn2BG4UqnU446EP3yqoaGBK3sNFaF3ECt58ZFKpcjOzkZ2djbKysrAsiwWFhYwPz+/rOue/ktJSRHs+KEIiTuBTo8UxxKvTEQhSWDcE+qBDEjytMAvLCygvb0dqamp2L17tyAd3JEIRSXKjiTc85RKpZxguHfdT05Ooq+vDykpKS7CEs5IYlq1JSTu1yIVFoZhwDCM1+S9OOQrMRGFJEHhj8AN5q6OvyMhhGB0dBQnT54MefiUN8QdiXB467o3GAwYHR1Fd3c3N5KY5lh8dd27I8SOxB/enI3F6ZErA1FIEgyhRuDa7XZ0dXVhcXERW7duRU5OjqDneSbvSIDICp+nrnuauD916hRMJpPXrntPRENI3PE3i8WbsMRLybSIK6KQJBBCzA2RSqVYXFxER0cHsrKysGvXrrDCIt4QdyTRIykpiRtJDJy29TcYDJifn1/WdU+bI909uWL9/gYjLOL0yPhDFJIEQYgRuFSIBgYGsG7dOpSWlkZsATmTdySxPs/k5ORlI4lpKKy3txd2u92lOTISOZJw8ScsBoMBhBCsWrVKHPIVB4hCEucINQKXDp9yOBxYt24dysrKhD9ZHuKOJH6gI4mLiopACHGZdT8xMQGHw4GRkRGYzWaoVCpkZGTE3YLsLiyLi4tgGAa5ubk+7Vzi7XWsVEQhiWNYloXT6QwrlAUAOp0OHR0dyM3NRWZmZlQGNvkSklASqInWkBivwieRSJCWloa0tDQUFxeDEIIPP/wQWVlZWFxcjLuue2/QnTnfcVmcHhk7RCGJQ4LtDfEGy7IYHBzE6OgoNmzYgOLiYnz88cdRsS5JtIX/TIVeW/n5+cjNzfXYdU+HgFFhSUtLi/mCzLKsS2War1ksnoRFnB4pLKKQxBnuI3BDFRGLxYL29nY4nU6X4VPRWuAjEdpKFGFKlPOk8JPtgXbduzdHRntBpjsSb/gSFnF6pPCIQhJH8HtD+PHgYOEPn9qwYYPLF8pbZ7vQJNLCf6bjq2rLveueYRhu1v309DQ3kpi/YxGy694b/oTEnUCFRbTMDw1RSOKAcHtDKHT41OTkJDZu3IiioqJlj4mXHYmYI4kfPLn/ekMmk3GCAZx2iaYVYRMTE+jt7UVqaqpgXfdCnLMn+MIiTo8MH1FIYowQvSEAYDKZ0N7eDgBobm72OnxK3JGIuBNOH4lMJkNubi5yc3MBCN91741gdyS+4HuEAaKwhIIoJDGEZVloNBoYDIaw7EmmpqbQ09
MT0PCpeNiRUG8vAFCpVNwi46v7WiJJnPkmiSagQjYkunfd2+12TliGhoZgNpuXNUf6+ty9IaSQuONLWMTpkZ4RhSQG8HtDTCYTdDodKisrg34e/vCpuro6rrPZF7HckRBCMD4+jv7+fpSVlSElJQXz8/MYGhqCxWJBZmYmJyyeRtMm2gKdKESys12hUHjsujcYDOjv74fNZkNWVhaXY3HvuvdGJIXEHb6weJoeyReWM3V6pCgkUcY9lCWXy0Na2JeWltDe3h708KlY7UioTb1er0dTUxMyMzPBMIzLzHO9Xu8ymlapVHLCkmgk0gISTYsUT133VFh6enrgdDqRlZXF7ViysrI8CkY0hcQdb5b5Z/L0SFFIooinEbjB7hAIIZiYmEBfX19Iw6disSNZWlqCWq1GcnIydu3aheTkZK62n5KSkoLVq1dzo2lNJhMMBgP0ej2Gh4dBCIFCocDU1BQ36ElEGGLptUW77unn7t51z7Lssln3NMwZL13rorCIQhIV3HtD+BdQMAu70+lEV1cX9Hp9yMOnor0jmZqaQnd3N8rKylBVVRXQF0cikSAjIwMZGRlYu3YtWJZFX18fTCaTS8kptVZXKpURqQwKlUQLwcWDaSPgueue3lDQ5D0AKJVK2O12WK3WuDl3Pr6EZWxsDEtLS6iqqlpR0yNFIYkw7iNw3RsMAxUSoYZPRWtHQsWzt7cX9fX1yM/PD/m5pFIp1/S2YcMGl8qg4eFhzjadhsGUSmVAcXaRT8wQ43EBc7+hIIRwzZFzc3M4efIkTp06xX3m8dJ17w7/O0/L/On3cKVMjxSFJELwG5589Yb4W9j5w6cqKytRXl4e1sUVjeons9mMjo4OEEK85m9C6SOheKoMovmVvr4+zt2WCktmZmbUwyCJsgDQ3VMinK9EIkFWVhaysrIwNjaGzZs3AwAMBgNmZ2c9dt3HWwiUYZhlZpK+pkcmirCIQhIBgukNkclkXhf2SAyfkkqly/ITQqLRaNDZ2YmCggKYzWZBv8jeQkYKhYJL4PLj7Hq9HmNjYwDgkriP9F1rIoW2EklI+LAsC7lcjoyMDCiVSm4ksaeue76wCDFGOhyokPDxZpnPF5YbbrgBN998M2644YZYnLZfRCERmGBH4HrbkRgMBrS3tws+fCpSOxKWZTEwMICJiQls2rQJWVlZmJ6eFuz5A13oPMXZl5aWoNfrOa+opKQk5OTkcMIS68UlliSqkHjqbHfvunc6ndys+/HxcfT09CAtLc2lOTLauTVPQuKOJ2GZmZkJqd8mWsTvmSUYodqcuAsJIQSnTp3CqVOnsG7dOpSUlAj6JZdKpYLfMVutVqjVajAMw3XVm81mQS1SgNDu9PnhEOoV5b640M5rmriP5y+s0CSikNCwsb9wpVwud+m6dzgcHnNrVFSE6rr3BcMwQYuXRCKB2WxGWlpahM4qfM6cb0wECcfmhC7sdI56R0cHLBYLtm/fjuzsbMHPVeiqLZ1Oh/b2dhQUFKC2tpa726KvX6hErlALnUwm46q9KisrXeadDw4OBtQYGc3zjTSJKiQAgv5ckpKSkJ+fzxV+2O12biSxp677SBRtsCwb9HPS6jXq4B2PiEISJp56Q4KBfhm0Wi26urqQm5uLhoaGiN0VC1W1RQjB0NAQhoeHsWHDBqxZs8bl95EQkkjkHtznnfMbIycnJ8EwjEuMPZAhT2KOJLLQ6zfcAgqFQoHCwkKuKZbfdU+LNtybI8MVFhryDhaTyeTVPy8eEIUkRPi9IeGMwKW0t7dzw6ci+aUWYkG22+1ob2+HxWJxmXXifhzA96Iajwuuv8ZIqVTqkl+Jt6qgYDmThcQdftc9bSakwkLdFtybI4M9h0ByJJ6gu6V4RRSSEBBqBC4dPgUAjY2NXCw3koS7IzEYDFCr1cjJyfG5c/InJMGKSCzchD01RrpXBaWkpLgIC42xJ8rCHM4EzljB78mKFBKJZFnXvdls5oRlfHwcLMty/StKpZLruv
dFKEJit9vhcDiQkZERzkuKKKKQBIFQI3CB02WyXV1dKCwsxPz8fNTubEOt2iKEYGRkBIODgwEVAQSyIwnlHGIJHTlLy03dGyO7urqQmZkJp9OJxcVFZGRkxH1jZLw2I/oinJk9oSKRSJCeno709HSsWbPGZSTx/Pw8hoeHIZFIXAZ8paenLzvHUHIkRqMRAEQhWQm4J9RDFRFq9zE1NcUNn5qamoqaRXooVVsOhwNdXV1YWFjAtm3boFQq/f6N0EISj4ude2MkjbH39fVxoksbI1UqVUB3rNEmUYUk1ucskSwfSUyFZW5uDkNDQy7lyDQMGkqOxGg0cmXt8YooJAEQbG+IN/jDp3bt2sVdGNGyLQGC35EsLi6ira0N6enpQfWzrMQdiT9ojH1wcBCbNm2CQqHgEvexaIwMhEQVkngxbKRIpVJkZWXBLk3BhDMTsoy1KE5jIbEZodFoMDAwAIVCwVWKpaSkBByFoKW/8faa+YhC4gOhRuAC4MwL16xZs2z4VDSFJNAdCd9luKKiIujBW2fCjsQXUqmUa4ykoZB4bIwUhUQYCCF4rU+H59UzWLCcdo5QpiXh6vpVuLChDCzLYmFhAWq1GjqdDqOjowF33RuNRo9hsnhCFBIvCDUClxoXzs7OYsuWLR6HT0V7R+JvcXc6nejp6YFOpwu5COBM3JH4Il4bI0UhEYaeGSOeaZmCBECZ6vROY9Zox19PTGGNMhUbVmVwHfd1dXWQy+Vcfs1T1z2/cCPapb8//elPce+99+Kuu+7CQw89FNDfiELiATqrOdxdCJ3DoVAosHv3bqSkpHh8XLR3JL6OZTQaoVarkZSUhF27dnk9Z3+cyTuSQF6zr8bIkydPwmq1uvQwhNoYGci5JtJ7C8SnkHw8Og+znUFl3id5jFVZyRjSmXF8dB4bVmVw3ztqH8/Pr7l33Xd1dSE1NRUHDhxARkYG534daY4fP45HH30UdXV1Qf2dKCQ8+CNww+kN4YeFysrKUFlZ6fPCj5ccCQ2/lZSUoLq6Ouwvq6/dT7QsUhIF98ZI/oCnyclJl1JTlUolWKgjHhLXweLJZyvWzJsdkEuXv48yqQTz5tOhLn50wx1PXfdjY2NYXFzE3//+dxgMBuzcuROf+tSncP755+Pss88O+SbPG0ajETfeeCN+//vf47/+67+C+ltRSP4F7Q05ceIEiouLUVhYGNIXzOFwoLu7GwaDIeCwkC8HYKHxlCNhWRa9vb2YmZnxGn4LBSF7PxJtsQv3fN17GEwmE5e4F7Ix8kzYkVgdDGRSCZJkkROfirw0fDg8D4YlkP1LUBiWwMkSlP9rl+JLSNxRKBSoqqrCk08+if/7v//D3/72N9x22204evQo/v3f/x0vvfRS0LsGf9x+++24/PLLceGFF4pCEizuvSG00TCULxdNpqWlpXEjZQMhljsSs9kMtVoNAGhubha0xFDoJsKVvCPxBb8xkpaaLi4uQq/X+22M9MdKFpKe6SU8fXwSnVNLkEslOKdKhc9tX4O8DOEdf3dXqPDOoB7DcxbkpJ1eVg1mJ0pUqWguP50boc2Iwb7fJpMJeXl5uOmmm3DTTTdF5Hvw17/+Fa2trTh+/HhIf39GC4n7CFyJRAK5XM7dOQTzPOEMn4pV1dbs7Cw6OztRVFSEmpoawcMFgTQsCvFc8UakBY/fGAnAa2MkFZbs7GyvTXArVUgGtSZ873A/9CYH0pPlsDlY/L1Dg36NCb+4phZpCmEbRfMyFLjr/HL8vUODzqklSCTAOVUq7NtSyAlXKM2IwOmbPX4zotCf1/j4OO666y68/vrrIYfLzlgh4feG8L3/g13U7XY7Ojs7sbS0FPLwqWjvSBiGQX9/P8bGxrBp0yYUFRVF7FhCWsmfqTsSf3hrjDQYDOjt7YXD4eA8otwbI1eqkBxSz0BvcmBVVjL3+tIZGQZmTXh7cA6X1goTvuWzNicVt59bBpPNebqB0E2s4tWwsaWlBbOzs2hsbOR+xj
AM3n77bfzmN7+BzWbzK4BnnJD46w2RyWQB70j0ej3a29uhVCqxe/fukGcZRFNIaAhPq9Wiubk5orYLYo4kNribD1oslmWNkbQaLBHFOZACga6pJSTLXb/bSTIpCICBWRMurY3c+aUne15WQzVsNBqNETVsvOCCC9DZ2enys1tvvRU1NTX45je/GdA5n1FCEkhviFQq9SskfAt1IYZPRUtI5ubmuHzIjh07Ij7ER2ghScRFL9bwJ0Z6aoycn58HAPT09CTMxMhAdiTZqXJMzltdfkavn0wvC32kCVVITCZTxKIGAJCZmYlNmza5/Cw9PR25ubnLfu6NM0ZIAu0N8VdBZbVa0dHRAavVih07diArKyvscwtEvMKBP3WxqqoK/f39UTETPFMX/3h+ze6NkVNTUxgfH4dCoUiYiZGBCMnFG/LRO2PEktWJjGQZCHA6X6KQ4dzqyLtseyKcHEk8+2wBZ4CQBNsbIpPJ4HA4PP5Oq9Wis7MTeXl5aGxsFOwLFskdCZ26aDabsWPHDqSkpKC/vz8qTV3ijiT+kUgkSEpKQlVVFQD4bIxUqVTIysqKeQ9HINfupbX56NcY8Wb/HDRLdgBARrIMXzyrBBV5sVmUw9mRRNv599ixY0E9fkULSSg2J552ByzL4uTJkxgbG4vI8KlICcn8/DzUajWys7PR3NyMpKQkrkItGovymVz+myg5HffmPl+NkZ2dnRFrjAyGQO7sk2RS/L8LKnDF5kJ0TC5BIZdgR1kOVmXFLmwXTrI9ni3kgRUsJKGOwHUPbdHhU06nM2LJaalUyi3wQsAvR66qqkJZWRn3+umFHI2czJmcbE8U/FVtBdIYSXMr/hoj3+zX4a8npjCit2CNMgXXNRXhkg35QX+2LMsGlN+TSCRYX5iB9YXxsQiHMx1RFJIo494bEqzNCb9qS6PRuPRZRCqvIOSOxOl0orOzE/Pz8x7LkSNhpugNX0LCsixsNltQHdmJsiNJlPMEgiv/DbQxki8sdMF/tnUav3jrFNf53T29hPv/YYRmyY6bd6wJ+pxjHV4LhVByJFS843leO7DChIT2hvBHcQZ7t0N3Bz09PZiamsKmTZuwatWqSJyuyzGFEJKlpSW0tbUhNTUVu3fv9jg7hL4nsdyRmM1mtLW1YWlpCWlpaZx5oa/ErrgjiQzh9JEE2hiZmpmNR9+ZA8sSl4opk53BEx+MY39dIbJTA68gjEfTxkBgGCakvGqky3+FYEUICd/mJFzHXjomlWVZl+FTkUQIIZmYmEBvby/Ky8tRWVnpt6s8VjsSrVaLjo4OrFq1Cps3b+ZKUWli19dEwUS6008U4ROyIdFbY+RHg7NYsDgglxI4HAQSqRRSqRQpcinMdgbd00bsqgi8kTeRhSTQwXB8xBxJFBBqbghw2v12YGAAMpkMO3bsiNrFGo6QMAyDnp4eaLVaNDQ0cF/iSB0vGPhCwi9Brq2tRVFREex2u4vjKW2c0+v1XOMcFZVEXDgSgUi6/9LGyPXIgPzDdsglgFQCsISFw8GAYQECYGl+DmZzMlJTUwM6l0QWklBCW2azWQxtRRKhRuDyh09VVFRgamoqqhdqqO6/JpMJbW1tkMvlQc0OifaOxOFwoKOjA0ajkeu98fR6U1NTUVxcjOLiYrAsy+1WpqensbCwAIlEgoGBgbjtb6Ak0s4pGhYp6wrTUaZKxZDOjHSFDEkSGVhCYLMxKEiXIZcs4qOPTvey8PMr3hojE9H6HggtR2K1WsEwjBjaigRCjsB1Hz5lsVgwMTEh8Bn7JpQdwszMDLq6urBmzRqsW7cuKOGL5o7EYrHggw8+QFpaGpqbmwPe2kulUmRnZyM7Oxvl5eXQ6XTo6+sDISSgMJhIYERDSKQSCb67pxr3HOw5PZtDIgEIQXaqHP+1bwMa1mZzEyP1er1LYyQVFv6NQ6iNfbEmlB2JyWQCADG0JTRChbIIIRgfH0d/f7/L8Cm73R7RLnNPBLOwsyyL/v5+TE5OYv
PmzSgsLAz6eNHakdjtdgwODqK8vBxVVVVhLVgymQxSqRTr168H4DsMplKpBB/6EyyJImrRMm3cWJSJA7c24NUeLcYNFhRlp+Cy2nzkZ57edfAnRgK+GyO9NQzHO6H0kZhMJkil0pDnzUSLhBKSUHtD3PE1fCrSdiWeCFRILBYL1Go1CCFhFQJEekdCxc5sNmPt2rWorq4O+zndP2tfYbD+/n6kpqYGVA12phNN99/cdAVu3FYc0GN9NUZaLBb09PRgeno6po2RwRLqjiQRXltCfLv4vSHhjMAFTnd7t7e3Iz093ePwKZqviOYXLJCFnV/tFG5PSyR3JDabDWq1Gg6HA0qlUrDYrq9zdg+DOZ1OGAyGgKvBhEbMkQgPvzFyfn7e5XN2b4yMhx2pJ0IREqPRKAqJENARuEKEskZGRjA4OLis25sP/aCjGYf1JSQsy2JwcBCjo6PYuHEjVq9eHdHjhcP8/Dza2tqgUqnQ1NSE9vb2mCyqcrk84GqweF10ooXe7ASBJGEEBTj9XU5PT0d2dnbQjZGxJJQ1JRGaEYE4FhL3EbihNBdS+MOntm3bxjVQeYLGMEO1MwgFbwu7zWZDe3s7bDaboPYskfDAovmm6upqlJaWcp9XPJg2xiIMFu+LcufkIn7y2hB6ppcAAOtOmPGtiyvRuDY7xmfmH/fOdm+NkXq9PuiJkZEk1ByJuCMJEUIIFhcXsbS0hNzc3LBEJNjhU/wdSbTwJCT0vFUqlaBOw96OFyoMw6C7uxs6nQ5NTU1cshSIT8feeAuDxYJxgwVf/EsnLI5ProEBjRFf+ksnDtzagKr8+L4D9tdH4q0xUq/X+50YGSnojbG4I4kSdBdiMBhw6tQp7N69O6Tn4Q+fWr9+PdauXRvQxULH7kYz4c4/HiEEw8PDGBoaCuq8g0GoBd5sNkOtVkMqlXrsY4mXHYkvhA6DxZtweuKvLVOwOlhIQQDepeVkCA4cn8R9l62L3ckFQLANie4TI81mM5e4d58YqVKpAm6MDAb6/Q41RxLvxI2QuPeGJCUlhbyYhzt8Kpqjb/nHoyE4o9GI7du3Izs7MmEGIV4fTf5TQ0tPX+xEtJEXKgwWz7uYzqklsIRALv3k85FIJHCyBJ1TSzE+O98QQsIybZRIJEhPT0d6errHiZEnT550aYxUqVQh2Zq4Q79voexI4r2HBIgTIfHUGyKXy0MSErrA5efnhxwSCmZuuxDQL8X777+PrKws7Nq1K6LJwXBMG92tToqLvZdz+hOSYBbbWCzMoYTBEoFVmcno9PDZSCUSFMZwXkcg0OtWKOcJ94mRwTRGBgPDMCGF6BPBHgWIAyHxNgJXJpMFNaODP3zK3wLnj2gKCSEEU1NTAIA1a9b4NVwUAqlUGtLdvcPh4IoWAtnpBbIjCaZaKNZho0DCYNS232azxe3s86vrV+G1Ph1YAtDlmP3Xe3tNfeRmgwuB0ELijrfGSP7NQygTI0O1cTIajeKOxBf+RuDSxTyQhcZsNqO9vR0sywpS3RStHInT6URXVxf0ej0AoKSkJCp33qGEnKhFfTBWJ/6OE+wsjHjDUxhsbm4OWq0WJ06cCNgiP9rsLM/BXeeV4Tf/HIGTPf35SCUSfPGsEpxXrfLz17GFPyIiGnhrjNTr9ZicnAx4YmQ4Y3bdZwrFIzG5sgOxOQnUV4d6Tgk5fCpUE8VgoB5fycnJaG5uxj//+c+o5WWCzZFMT0+jq6sLZWVlQVmdJGKOJFRoGCwzMxMjIyPYuXMnN00wHqvBbm1ei8s2FuDAsQ6kp6dj79YKrM6O/36aSO9I/OFrYuSpU6cgl8tdEve0OCNUIbFYLFi7dq3QL0NwYiIk9AvkK2ZI33RvHwDDMOjr68P09LTgw6ciHdqanJxET0+Py8IcrWFTQOALPMuyGBgYwMTEBLZs2cLdlQl9nECfK5FIhKbIwqxknF+ShPz87IQQESC+LOQ9TYxcWFiAwWBY1hgpl8tDOm+j0RiVmUjhEr
O9tr84PX3TnU7nsjCK0WhEe3s7V3Yq9BsdqdAWwzDo7e2FRqNBfX09t8jQY8bTjoRvddLc3BxSwu9M2pFQvJ1jvHqDJVJHOxBfQuKOVCrldiOAa2PkzMwMbDYbjh8/ziXuA2mMFKu2wkQikXis3KJ38yUlJaiuro7IRRWJ0BYdL0vFz93NM5pC4m/3Q61OcnJy0NTUFNaidqbuSHwRT02RiSYkiTSvnd8YmZGRgenpaRQXFwfVGCkKiQDwQ0x0jrpWq112Nx/J4woBzeMUFxdj/fr1Hr8I0d6ReFrgvVmdhHMcb+9jqH5piUIwry+WYbBEE5JQbEbiAYZhkJSU5LEx0r3qLycnB2lpaVAqlREt/33kkUfwyCOPYGRkBACwceNG3HfffdizZ0/QzxUzIQnk4qUL+uLiItrb25GcnIzdu3dHPJ4s1KLOzzH4y+PEekfCH9nrbnUSDoncRxIrohkGSzQhSaQdCR/3oiH3xkiWZWE0GqHX6zE7O4sHH3wQH3/8McxmM44fP47du3cLmgcGTrcb/PSnP0V1dTUIIXjyySexb98+tLW1YePGjUE9V9zvSKanpzE9PY3y8vKo9FjQ44a7I7FarVCr1WAYJqAcQzRtWdyPRa1OJBJJUCN7/RGJZHu8L3yhvt5xgwWH1DOYWrCiOj8dV9WvQm66IuJhsHh/P92J5xyJL/ztpKRSqUtj5GOPPYY333wTt912G1544QU8+OCDqK2txZe+9CXcfvvtgpzTFVdc4fLfP/7xj/HII4/gww8/XDlC4nA4YLVaYbFYBL1DDgSpVBpUM6Q7Op0O7e3tKCwsxIYNGwIq+wu1STAU+As8PVdfVidCHEfEO0cH5nDPwZ7T9h8AXoUWf3h/HP/3b3WoLXLtlvcXBpNIJFzMPVBvMFFIIk+w5b+ZmZnYt28fvvKVr+Bvf/sbiouLcfTo0YhNSmQYBs8++yxMJhOam5uD/vu4DG3R4VMAUFlZGVURAU7vSGw2W9B/RwjB4OAgRkZGgu6uj3aOhGEYDA0NBWR1Eipn4o6EEug5WhwM7n2xDwx7WkT4P//OS/049B9NPp8r3DBYooWKWJZNiM/fHZojCQbap5Keno7c3Fxce+21gp9XZ2cnmpubYbVakZGRgeeffx61tbVBP09c7Ujch08ZDIaYnEcoVVs2mw0dHR2wWCzYuXNn0L5L0QxtsSwLvV6P+fn5kEwtAyUSO5KVtsN5/5QBJvvyz50lwKDWjOE5CyryAitvDyUMlijCTEnUHUkoFvJmsxmEkIh6uK1fvx5qtRoLCwt47rnncPPNN+Of//xn0GISN0Jit9vR0dEBk8nEDZ8yGo1Rn58OBJ8jMRgMUKvVyMnJQUNDQ0jJz2jtSJaWljA+Pg4AAVudhMqZ2JAY7Ou1Onx/5hZH6Nd/IGEwelORlpaWEJMiE1VIQp3XDiCi5b8KhQJVVVUAgKamJhw/fhy/+tWv8Oijjwb1PHER2pqbm0NHRweUSqWL8220XXgpgS7q/B3UunXrwvLKioaQUKsTpVIJiUQSUREBxB1JIGwtyYZUcnoH4k52ihzrCoQr/fQUBmtvb8fc3BzGx8dj1hQZDIksJKFMR5TJZFE1/2RZNqSwfkyvFH5OwdMQp1gJSSDHpU64i4uLfsf3BkIkhYRfhlxXVwe73Q6NRhORY/E5E3cklEDPtzArGTfvWIPHP5yABAABOGG554JyJMkis2jSMJhMJkNNTQ3S09MTYlJkIgtJqNMRI/V67733XuzZswclJSVYWlrCgQMHcOzYMRw5ciTo54qZkNhsNnz88cew2+1ecwpyuTwkdQwXf0KysLAAtVqNjIwM7Nq1S5A7+0gJCZ37brfbuTLkiYmJqITRxB1JYHztU+UoUaXi6Y8nMbNoQ1V+Gr6wqwTnr8uN+LFp8lroarBInm8iCkkoOZJIW8jPzs7ipptuwvT0NLKzs1FXV4cjR47goosuCvq5YiYkSUlJUKlUKC8v97qFjmVoy9
Nx+Z3fFRUVqKioEOwuLRK2LHyrE/6Qr2iVGgt5BxsPd8OBEMr7KpFIcG1DEa5tiP4sEG/J9nj1BktUIQlnRxIp/vjHPwr2XDETEplMhurqar+PiVVoy31RpxYtOp0OjY2NyM0V9m5RyB0JX/CqqqpQVlbmslhEy2nY13FmZ2fR19fHTZ/Lzc0NqEZ+Je5IYkkgVVvx5g2WqEIS7HmbzWakpaUlxE1UTHMk/kIfwU5JFAp3ATMajVCr1UhKShK085uPUIs73+rEm+DFckdCCMHQ0BCGh4dRVVUFh8OB2dlZnDx50uUuNycnZ5mlRCKRKOcbSvlvLMNgLMvGZRGAP0LZkSTKdEQgjsp/PRHq3PZw4e8Opqam0N3djdLSUlRVVUV0xKfD4QjrOSwWC9ra2vxancRqR+J0OrkChR07diAlJQWEEJSVlbnc5Q4MDMButyM7Oxu5ubnIzc3lXou4IxEWIfpIohkGS9TQVig5kkiHtoQkroUk1lVb3d3dmJmZCWmoU7CEG9qiVierVq3Chg0b/Pr6RHtHYjab0drayk2ETEpKchFO/l0uIQQWiwVzc3PQ6/U4deoUVxKu0+lQUFAQdJdwtEg0oRO6ITHSYbBE7GwnhIQsJOKOJAACCW3FQkjsdjuA09VZzc3NUZlQFqqQEEJw6tQpnDp1Chs2bMCaNWv8/k20dyRU5FavXs3Z6PtzBU5LS0NaWhrWrl0LhmE425yxsTEMDAwgKysLubm5cVWammjQzyCS753QYbBQFuRYwx8pHgzijkQgYpEjmZ2dRUdHB4DTnZ7RagYKRUicTic6OjqwuLiI7du3Izs7O+BjRevO2Wazoa2tLSw/L5lMxuV6GhoaAAB6vR5zc3MYHR2FVCrlEvYqlSrijZaBkAjCFg0hcSfcMFgi7kiokIg7khhBcyTR8ANiWRYnT57E2NgYNm7ciI6OjqiGKYIVEqPRiNbWVqSmpgbdyxKNLnqGYTA2NgabzYYdO3aE3bAJfLKDTU1NxerVq7F69WqwLIvFxUWuO7unpweZmZmcsGRlZSVkTD0a0Os7Vu9PKGGwRMyRMAwDiUQS9HkbjUbk5eVF6KyEJeahLV9QBY/0dtZqtaK9vZ2bT56RkYHOzs6ohtWCWdxnZmbQ2dmJ0tJSVFdXBy2ykbZ3t1qtaG1tBcMw3KS3SCGVSqFUKqFUKlFZWQm73c7tVjo7O0EI4UIn/KS9LwghaB1fxImxeWQky3FxTR7yMwPbmSZSjiQWOxJfBBIGA4CUlJSYNkUGS6jrl9lsFnckQkDf/FBK5wJlbm4O7e3tyMvLw9atW7njRKJB0BeBuP+6W50UFhaGdKxI5kgMBgPa2tpQUFCA/Px8nDx5UrDnDkQAFQqFyzhTGjqZmZnBwMAA0tLSOFGhFiF8rA4Gdz/Xg/dPGSCVnj7ez988hR9cvg5XbA78/Y6XxdkX8SYk7ngKg3V2dmJhYQEffPBBQniDAaGPB47kmF2hic93/l9IpVJIJBI4nU7B497uSeri4uKY+nz525F4sjoJ51iRuHMeHx9HX18f55s2NzcXU4sUiUTiMnXO6XRyd7i9vb1wOBwuu5XU1FQ89u4YPhw2nPa9+peTopMhuO/wAOrXZGFtTmQGC8WCeBcSPjQMplAoUFFRAaVSmRDeYEDoN8JiH4lASCSSiCzo1LLebDZ7nccRT0IyPz8PtVoNpVLpYnUSKkLvSFiWRW9vLzQajcs0y0B2EMHkv8JdGORyOQoKClBQUABCCMxmM+bm5jA3N4ehoSEoFAo808KAIYCMfyhy2kzxcNcsvnx2qc9jiKGtyEJzJO5hMLPZzAlLPHmDAaELiRjaCpBALmChF3SDwYD29nZkZ2dz/QyeiObEQm/HI4RgYmICfX19Hq1OwjkWff5wn89ms7nMpufbnMSzaaNEIkF6ejrS09NRUlIChmFgMBhgPNZzejdCCPCv90YiAaQA5s3hNYzGG9G8voXCW7KdlovHmzcYELqQiO
W/AiJUCTAhBKOjozh58iSqq6tRWlrqcxGN9Y4kEKuTcI4FhF/EsLCwwJlCbtq0adlzCS0kkbxzlslkyMvLw6aiTHRNLwHkk2OxhIAhgIosQqPRQKVS+WyITJQ7fHojkSjnCwTW2R5P3mCBnrM7dMxuJKcjCkncC4kQNikOhwNdXV1YWFjA1q1bkZOT4/dvojn6FnBN7gdqdRIq/PnnoUKtY3ztlOJ5R+KNL59Tituf6TqtI/86lFQqRXGWAmeXZ2J0dBTd3d3IyspyKTFOpMWYkmhjdoHQFuVYh8HC2ZGIoa0AiEZoa3FxEWq1GmlpaUH1W8SiaovfBR6I1Uk4xwJCC23wK8fq6+u5L6cnEmlHQjmrUoWHrt2Ih94axojeDKlEgotq8vCfF1UiL+P0tWOz2bgS44mJCQDgFqFouCAIxZkiJO4EEwZzNw8NhXByJGJoSyBCDW0RQjA5OYne3l6Ul5ejsrIyqC9NLOxZGIZBW1tbwFYnoRLqjsRut6O9vR02my2gyrFE3JEAwPnrcnFetQqLVieS5VKkJLkuAsnJySgqKkJRUREIIVhcXIRer8fU1BQWFxcBAIODg1yJcbw20CWakFDPqkh6gzkcDszPzwsaBgtFSFiWFXckQhLKgk4NF3U6HRoaGkLqDo1maMvpdKK/vx+EEEHG9vqDfhGC2ZEsLS2hra0NGRkZ2LlzZ0DJykTckfCPlZ3q3xhSIpG4LESLi4toaWmBw+FAd3c3GIZZVmIcLySikADBW40EQ1JSktcwGLXjCTYMFqphIwAxRxIIgVzEweZIhJodEq3QltFoRFtbGxdy81SKLDQ0wRroIq/RaNDR0YGysjJUVVUFVa6biDuScJDL5ZBIJNiwYQOXMJ2bm+NmrqSkpHCeYEKETcIh0YSEfh+jec5ChMEYhgnarZoKiRjaEohgdiTT09Po6upCSUkJqqurwwopRCO0xbc6KSsrw1tvvRU1L6FAypsJIRgcHMTIyEhInfS+hCSUaqFEWPT4r1cikSAjIwMZGRkoLS2F0+nE/Pw85ubmMDAwAJvNBqVSyQlLenp6VF9jogpJvHiDBRoGYxgm6Btas9mMpKSkqJnGhktCCIm/HAnLsujr68PU1FRY1iF8pFJpxJyHqUHk+Pg4d75UtKKV4Pe3W6DOwktLS9i5c2dIW+wzcUfiC7lcjry8POTl5XmduUJDYDk5ORGfuZJoQhJrk0l3Ag2Dmc3moEOaRqMx6jcW4ZAQoS2bzeb19xaLBWq1GoQQ7Nq1S7CqGZlMxs0lERK73Q61Wg273Y6dO3dyybRwKqlCwdeOxGQyoa2tjRtCFao9TSLnSMIhkPP0NHNlYWEBc3NzGB4eXlZiHIleh0Sbf05ddOP1OvAWBtPr9RgZGcHs7GzA1WBUSBKFhNiReAsxzc7OorOzE6tWrUJNTY2g8eZIhLZ8WZ3QL0isdyRarRbt7e1Ys2YN1q1bF9ZCw68OE+rLn8g7El/IZDJukQFOOyjTEuPx8XFIJBLu97m5uYJ4zyXijiRRhI8fBtPr9Vi1ahUUCkXA1WDUHiVRPp+YC0kgUxLdQ0wsy2JwcBCjo6PYuHEjVq9eLfh5CV21RQ0NfTXwRbNSzH1HQgjByMgIBgcHBXtP/QnJSs+RhENKSsqymSt6vR4TExPo7e0VZOZKoglJIs4iAU7vpBQKRcDVYA6HAyaTKaI9ST/5yU9w6NAh9PX1cTONHnjgAaxfvz6k54u5kPjDfWdAXXBpL0Ok6qyFqtpiGAa9vb2YnZ31a3USTX8vvoAzDIOuri4YDIagJi0GcgxAWH+slboj8QV/5kpFRQU3c0Wv16OzsxMsy7rsVgJN7IpCEh089ZF4C4P19fXhmmuugVKphEKhwMsvv4xzzz1X8DDXP//5T9x+++3Ytm0bnE4nvv3tb+Piiy9GT09PSMeKeyHhl//Ozc2ho6MDKpVKEBdcXwgR2uJbnbgbGnoimkJCj0XPUSaTobm5WdAqEX9CEo
ooJIKQRHpxdp+5YjQaMTc3x81cSU1N5SrBlEql15CvKCTRwV8fiXs12MmTJ/Gtb30LH3zwAb761a9ibGwMu3fvxjPPPIOCggJBzunVV191+e8nnngCBQUFaGlpwTnnnBP088VcSAINbQ0NDeHUqVPcrItIfwHC3ZHMzc1BrVYHZXUS7R0JHRRUWFgYETuWSOxIRFyRSCTIzMxEZmbmspkrfX19LjNXqIULvyE1kd7TRBWSYDvb8/LyUFtbC4Zh8Nxzz2FoaAhvvvmmoMat7iwsLAAAl6MLlpgLiT8IIbDZbJiYmBA07OKPUPMVhBAMDw9jaGgoaKuTaAkJIQQOhwODg4OoqalBSUlJRI4jhJAQQvDukAGv9sxifMaOnSYdbmzOCqjrPBbEescUyMwVultxOp2ikESBUCYk8u1RKisrUVlZGYlTA3D6fb377ruxe/dubNq0KaTniGshmZ+f52LAu3btinhdPZ9QQltOp5MbBRqK6EWjm55lWfT09MBms6GioiJiIgKELyR0zO1fW6YBQuBkCbr1s3h1YBF/uLEOBQHOUT9T8TRzhTZEDg0NwWw2Qy6XY2RkBLm5uXFfJZRoOyjg9DkTQkKySIlW+e/tt9+Orq4uvPvuuyE/R8yFxNOFQQjB2NgYBgYGUFpailOnTkV9HnOwOxJqdZKSkhKUyzCfSJf/2mw2tLW1gWVZZGVlRdypNlwh6ZpawjMt05AAUCTJIHM6IZECUws2PPbuGL67p1rAsxWOeF3sZDIZcnNzuRDJ8PAwNBoNFhcXMTo6ypUg0x1LNG/cAiERdyT0+xyskJjNZsHyIb644447cPjwYbz99tthGcXGXEjccTqdXAVRU1MTMjIycOrUqbCHMAVLMLsDvtVJdXV1yAtJJHckCwsLaG1tRW5uLjZu3IjW1taohWFCPc7Rk3MgABT/mnsrASD913v7ep8uboUkUUhKSkJqairq6urAsiwWFha4ctR4nLmSiEJCb0aDPW+j0YiKiopInBKA09/JO++8E88//zyOHTuG8vLysJ4vroSEOszSuubk5GRuYXU6nVEXEoZhfFa2eLI6CYdI5UgmJyfR09Pj0sMSjXyMP3NIu90Op9PptZqNJafFwxNMnFZvxTpHEgz8a5v2MeTk5KCystLnzJXc3NyYeEAlqpDQ71swRHoWye23344DBw7gxRdfRGZmJmZmZgAA2dnZITlUx1xI6IVMFzt3h1m6GEV7Noi/uebU6sRms7lYnYR7TCEXd5Zl0d/fj6mpqWV2+tHqovcmJLOzs2hvbwfDMMjMzORCLvw7313lOXj640k4WYIk2SdhMkKAsytDqy4R+QRfN0nuM1eWlpYwNzeHqakp9Pf3Iy0tzaXEOBoLfCJ1tlNCHWplNBojOovkkUceAQCcd955Lj9//PHHccsttwT9fDEXEtoMp9FoPE7ck0gkMRkyRT98TxUXdFZ5dna2oP0sQgqJ+xAq93yIVCqNyt2zu5Dwq9pqa2uRmZnJJYDb29sBgBOVuqIcXLA+F2/0z4FxMCAsICGAMi0Jt50VuSKBcIl1CChQAu0jkUgkyMrKQlZWFud6S0uMe3p6XGauRHJKZCjVT7Em1JA8tUiJFEJ/92MuJGNjYzAajdi1a5fXLVWoUxLDgS8k/KRjIFYnoSKUkCwtLaG1tRVZWVloaGjwKHSx2JGwLIuuri7Mzc1h+/btyMjIgMPhcGmuW1xc5Pylenp6cNXqTJSnZ+P9STvmFs3YXqrEv59bjWKlsHO1z0RCbUhMSkpCYWEhCgsLXWauaLXaiM5cOZN2JNGs2hKCmAtJWVkZiouLfV4gsdiRuIfUqNWJRqPxa3USKkIICU38+xsvHO0dCa0YI4SgubkZKSkpyz5T/rTBiooKLk6fkTaH2mQTGIZBVpYdctsCHA5Z3FUVAYmbIwmVaM5cSdQcSbDnTMU5UaYjAnEgJFKp1O8bHeyURKGgVVTUqh6Az51TuIRj2kiHUI2OjgaU+I
+Wb5VEIoHRaERrayuUSiU2b94c8B0aP07PsixOnDgBuVyO0dFR9PT0ICsriwuDxXsPRDwSCYsU/swV4HSIhibtw525kqhCEo+hLaGJuZAEQix2JPS4er0eg4ODKCwsRG1tbUQv5FB3JE6nE+3t7TCZTAEn/qPVRc+yLLq7u1FRUYGKioplC1egYiaVSrmu7aKiIlitVq5jm/ZAUFFRqVRR7zvikyiCFo1QETUnXLNmDTdzRa/XhzRzhWXZmH6uoSCGtuKIWORICCFgGAb9/f2ora0Nq1knUEKZymgymdDa2orU1FQ0NzcHfIcX6RwJIQSnTp2C0+lEdXW1R4uHYHdE/F1USkoKiouLOfdUGk45deoUuru7kZ2djby8vJiMsE0Uom3ayJ+5UlVVFfTMlUTtbA9WSBiGgcViEXckwRDolMRo7kio1QnDMFi/fn1URAQIfpdAh1CtXbsW69atC+pLFsnZJwzDoLu7G3q9HgqFAjk5ORE5DkUqlXILUHV1NTfClh9OycvL48IpkexHOtNyJOHgb+ZKRkYGt8PMzs5O2NBWKM2IAMQcidBEM7TFtzrJyMgIeLaDEAQqJPwS2lCHUEVqR2Kz2dDa2goA2LlzJz766COfiyu9ywzkyxZoXic1NRVr1qzhwil0t0In0+Xk5HBhsNTU1IS7yxWKeKqC8jVzpaury+XO3mKxRCxPKTShhLbMZjMAiDsSoYmWkMzMzKCrqwtr165FdXU1jh8/HjVbdyAwIWEYBp2dnZifnw/LDTkSVVuLi4tobW1FTk4ONm3aBJlM5nXxZ1mWcw6g4TxaeOFtcQtlwXf3l3J3w01OTuZ+72t2RzAkijDFekfiC08zV7q6urC0tIQPP/ww4JkrsSYUITGZTEhOTk6ofFDMzzSQCznSORJqdTI2NobNmzdj1apV3HGjGVLzJyRmsxltbW2Qy+VhD6ESekdCy47dk+qeGhJp/gk4vWAwDAOWZbl/wOn3gpZg84UlXPGjyd+1a9eCYRgYDAbMzc2hv78fdrt92W4lWBIptJUoOQc6c4VW8eXn53OfG525wi8x5s9ciTUMwwT9PTUajQmX14u5kASCXC6HzWaLyHPTDnCr1bpsdG88CQkdlFVUVISampqwQxJC7UhoUv3UqVMey475QkIFhP433bHQOza6S6HW2/S9CMWrKBBkMhlXqsqf3UEb6+hdL92txEsYSCjieUfiCRqKk8vl3Pxz+rnRpD2ducIvMY7lnX0oeR0qJIlEXAhJIFMSI7Gg861Ompubl11wkUxIe8KT+y/fUr+mpgZr164V5FhC9JHwZ73v2LEDWVlZHo8DfCIivnIi/LAWFRP6Nw6HAzabDU6nE06nk3ucUIu7++wOp9MJg8EAnU7nYgNChSWaubNIkWhC4mlR5n9udJfJn7lisViQnZ3NCUu0+41CzZGIO5IIEInQFq0MqaysRHl5uccPLRqDpvi470hoD4ZWq8XWrVsFrX4Kt4/EarW6zKP3tn2nITQqCDRkFcj5Aac/A4fDgc7OTgDgJv/R64GKktC7Fve7XpPJBJ1Ox81Fp6aFubm5yM7Odjl2oiwAK0FI3HHPiVksFm63wp+5Qv+FMjcoGELNkYg7kgggZPkvnRAYiNVJLENbdKEGTnfTC30HHE6OxH22ia8vikQigdPpDEpE+JhMJqjVaqSnp6OxsZETd/4//mfkL2EfCnwbkLKyMs60cG5uDt3d3WAYhrvjjdekrydWopC4k5qa6tJvRBsix8bG0NPT4+I8nZmZKXj4MhQhibTzbySICyGJVmiLWp0QQgKyOol2aIsKyfz8PNra2gJaqMM5ViihLZpU97WTAz5JqqelpaG7uxuTk5PIz89HXl5ewMlQvV6Pjo4OFBcXu4wWcA+BAXDJrUR6t+JuWsi3WF9cXIREIsHQ0BDy8vLiYiCUN84EIeHjbeYKvc4IIYLPXAmlITHSs0giQVwIiT+EEBJqU15QUIANGzYE9OHKZDLY7f
awjhsMUqkUdrsdx48fR3V1NUpLSyP2RQ92R0IIwdDQEIaHh5FTWoMBiwKaYQOaSpRQyKXLHkt3C7W1tSgvL8fc3Bx0Oh0GBweRnJyMvLw85OfnIycnx+PiMDk5ib6+PtTU1KC4uNjrebnnSmKxW+FbrM/OzmJgYABWq5VbnGg1kadu7VhypgmJO95mrkxPTws2cyWUhkQxtBUh5HJ5yDkSQghGRkYwODgYdLI6mqEtlmUxOjoKp9OJbdu2RcRdmE8wOxLau6KdM+CYsRDv/H0EVicDmVSCYmUqfnTFBmwoOt2F616ZJZVKXZLYDMNAr9dDq9Wiu7sbTqcTKpWKq55KTk7G4OAgJiYm0NDQAJUquAFW3hL2dIcUjdyKXC7Hxo0bXWzxaU6ObzTpz1sq0iSikETyxsp95gotMe7p6YHT6eSKLYKZuSKGtqKIv4sj1AWdWp0sLCyE1LwXLWNDOm3RarVCKpVGXESAwF+b1WpFa2srpFIpTkrX4o3+SaQmSZGXroCTJRjTm3HvC9048IVtSE2S+k2qy2QylyS20WiETqfD9PQ0+vr6uC9dTU1N2MUF/IQ94NoEGandCl+cvdni872l+EaT0bbFT0QhiVYJdlJSEgoKCrjiDk8zV/glxt7EQky2xxGBzE93h1qdJCcnY9euXSGFFKKxI1lcXERbWxuysrJQU1ODDz74IKLHowRS/kuT6nl5eVhXswE/fPQ45DIJ0pNPXzZJMglyUpMws2jDOyd1uGB9blBJddpolpmZidWrV6O1tRUsyyIjIwP9/f3o7+9Hbm4ut1sJd6H1VV4cSDNkuLjb4tPdSqxs8RNJSOiOMha9POHMXAk1R1JUVBSJlxIxEkZIgNPqHkhzkUajQWdnJ2d1EurFF2khmZ6eRldXF9cNbrVauS9MpL/g/nIk9NzoJMgFixMmmxMKmet7KZdJAQJol6whV2YtLS2hra0NKpWKs+onhGBhYQFarRYjIyMujr55eXlhL7TedivemiGDEZVAy5upt1RlZaWLE260bPETSUj4Ih9r/M1ckcvl3E6FEBLSjkQMbYWAv4uZfon8CQkhBAMDA8usTkIlUqEtQghnybJlyxYUFBRwxwNCn/McDN5yJHRA1sjIiMu5ZabIsSorBaN6M9IUn5yb1clAIgFKVakhiYhWq+UmOvJHF0skEm6hra6uhtVqhU6ng1arxalTp6BQKLgvs0qlCvv98rVb8RQCo/9fKNydcOfn513mdmRnZ3PCIlSzWjyZNvqDn3OLN/gzV/if3cjICACgo6MjqLyYKCQRwn3srSd8WZ2ESiR2JA6HAx0dHR6HUEVTSDztSGhSfWFhATt37nSxsZZJJbhh+xo8eOQk9CY70hQyOBgWVgeLzcWZ2FGuCmpxox371MHY30THlJQUF0df2nXe398Pm82GnJwcrrw4XGdYf7sVXwl7IWxn+Lb4VVVVLrb4w8PDSEpK4hamcCxAxB2J8PA/u7Vr1+K9997D6tWrodfrl81cUalUHkuMRSGJENSPydui7s/qJFSEFhI6cjY9Pd3jECr+whVp3HckNKkuk8nQ3NzsMae0r24VHE4Wf/poHHqTHTKpBBdtyMNdn6qETBr4gsSyLPr6+qDVatHU1BR0EYQnjyytVguNRsOVbdLfC+GR5b5b8VVeHAnTRm+2+IODg7BarVx8Pjc3NyjDwkQUkkQ5X+CT0l/+TpOWGHubuSKVSiMqJG+//TYefPBBtLS0YHp6Gs8//zz2798f9vPGhZCE4wAciNVJqAgZ2pqdnUVHRwdKSkpQXV3t8Tz5CbpIw39ttAEyPz/f5zhhiUSCaxtXY09tHmYWrchKSUJuRnBNW3RHZrfbsX379rB3D3yvJX7XuU6nQ2dnJ1iWdUnYh9vH4SkERkWFEAKLxcK9zkiUF/uyxeeH/FQqld8hXokmJKGETmOJew+JVCp1qeJzn7nywgsvoK+vD2NjY9xwK6ExmUzYsmULPv/5z+Pqq68W7H
njQkgCwd0mhW910tDQwCW+hESIHQnfHXfTpk0+qzFoCC8aQkKrtqamptDd3e23AZIWAbAsC4VMgrLc4OP0ZrMZarUaqamp2LZtW0QSyO5d54uLi9DpdBgfH+dmhNNmyHD7ONxDYJOTkzh16hTWr18PABFvhgS82+IPDAz4tcVPRCFJJPyV/rrPXMnIyMDBgwfx0Ucf4Utf+hJ+/vOf45JLLsF1112H5uZmQc5pz5492LNnjyDPxSdhhIS/qAdrdRIq1CIl1C8cv4/FmzuuO9EyiqQ5p56eHtTX1yM/P9/rY/l33QBCujOcn5/nbPCDHQscKvw+DmqJQXsBxsbGIJVKuZ1Kbm5uWLmGkZERjIyMoL6+Hrm5uTFphgzWFj9R5pEAK1NI+EgkEjQ0NKC+vh5PP/00nn32WSwuLuLVV1/F8ePHBROSSBEXQhJoaIthmJCsTkKFPncoQkKHUCUlJQXVxxKNJkin04ne3l4AwI4dO3zOhvbUqR4s09PT6O3tRXV1tWA2+KGQnJy8rDJKp9NhaGgInZ2dyMnJ4RbiQHMNLMuiv7+fc2im72UsmiH5eLPFn5ubQ29vL5xOJwgh0Ol0SE1NjXtb/EQSPUqoRTMmkwlFRUX41Kc+JUj+IhrEhZAEgkwmw8zMDObm5gSdy+HvmEDwfjnhDKGKtJBYLBauUx2Azw5a/k4klF0IDeuNjY2hrq4uIuHHUOFX16xbtw5msxk6nS4oPzCGYdDR0QGLxYJt27b53BnHuhnSky1+S0sLDAYDJiYmfNrixwPRqGQUmlB8tujcHbFqKwI4nU4YjUbOh0qpVEbluPQiYBgmoK5qQghGR0dx8uRJbNiwAWvWrAnpmJESkvn5ebS2tqKgoABVVVU4duyY1yoj/kIXiogwDIPu7m4sLCxg27Ztcf/FSEtLQ0lJCecHRk0mPfmBpaSkwG63o62tDTKZDNu2bQuq6z6SzZCBQDu1pVIpampqkJqaypVTu9viC+WCGy6JuCMJ1WcLgM8oQTwS90JCrU4IIVi7dm3URAT45O4wkIWdLpxzc3NhiV2khMQ9qU5DK57u9Ph3yaGIiM1mQ3t7O4DTobN4crwNBJlM5uKzRP3Apqam0NfXh7S0NNhsNmRmZqKhoSHqzZBCiQoN2br7Srm/Xlqimpubi6ysrJjsVlZ6joRiNpsB+I4UxCNxISTeFiq+1QmN6UabQCq3+EOompubw4o3Cy0k/C56flKdPwKX/1i6iNFzCVZEqPArlUrU1tYmXDjCHb4fWHl5OXQ6HTo6OpCcnAyj0Yh33nnHJWEvhB8YEFozZLB4yv25v15aojo3N4fOzk5uZgcVlmjdJJwpQmIymZCamhqx743RaMTg4CD338PDw1Cr1VCpVCgpKQn5eeNCSABXE0G6+I2OjnJWJ/39/YKP2w0Ef0JiMBigVquRl5cnyMJJK8WEwOl0oqOjA0aj0WcXPSBMUp32bpSUlKCioiLhQhH+mJ2d5fzHSkpKOONFrVaL4eFhdHV1CeoHBgTXDBnsbiWQIhL3ElV3W3w6YTAvLy+itviJKCSh5HWMRmNE57WfOHEC559/Pvff99xzDwDg5ptvxhNPPBHy88aNkFC8WZ3I5XLYbLaon4+vHQL9Mq1btw4lJSWCfPhC7UhoUj0pKQk7d+5cdufI35GEm1QHgPHxcQwMDKC2tjbhnEsDYXx8HCdPnnSxc+EbL8bKD4zfDBnsbiXYakR3W3y73c41Q6rVas7+g+5WhLTFTyRfMEo8DrU677zzIhLZiSsh8WV1Eu356b6OSy0+pqen/c59DxYhhMRgMKCtrQ2FhYXYsGGD14uZ7n7CSaoTQtDf34+ZmRk0NTVFNYcVDehkyImJCTQ2Nvp8ffHgBxZoebEQtuwKhcKjLf7Y2NiyIV7h7s4ScUfCMEzQhQpUSBJtNx83QjIxMYGenh6vVifeLFIijXuoiVbrOJ1ONDc3BzwpLZjjhSMkk5OT6OnpCWiXJJFI4HQ6Qx
YR2nBpsViwY8eOiDWGxgrqnmAwGLBt27ag7hTdmwNNJhN0Ol3U/MD4oUpfzaRCLVjutvi0+VMoW/xEFZIzYagVECdCQgiBXq/3aXXibpESLfid5ouLi2htbYVSqURTU1NELD5CFRJqoT8+Pu7XMobejSYnJ3MeW7RfItALn7oLJCcnB13+mgg4nU60t7fD4XBg+/btYZXA8gcjRcsPDPBdXky/S/zdqJC4N38uLCxw7sWh2OInopCEmiOJ91J5T8SFkEgkEmzZssXnAhrr0BYtn42EOSSfUISELnqerOnd4d+p7tixg+vu7u3thd1uR25uLhd+8bZ4LiwsQK1Wo6CgAOvXr0+4L7g/aBVecnIytm7dKvgNQzT9wIDluxXqbJCeng6pVMrt9CPVDCmVSpGTk4OcnBzOFp9WggVqi5+IQhJKjsRsNgse5YgGcSEkgRDL0Nb09DQWFxf9elIJdbxghMRsNqO1tRXJyckeren5uDcZ8ie9rV+/nusfmJyc5Cpy6G6Fxrg1Gg0nqEIVGMQTtHxZpVL5zC8JhSc/MNphT0NCQviBURiGQXt7Owgh2LZtG3ejFK1mSOC0LX5xcTGKi4s5W3y9Xo+hoSFYLBaPtviJKiRnwnREII6EJJApidHekTgcDiwsLIAQ4vdOXyiCMW2kSfVVq1b5tWLx16nuqX+AViCNjIxALpdzvRObNm3yO4gqEdHr9Whvb49p+XJycjK3yFI/MK1Wi8HBwWV+YMHG0m02G9ra2qBQKLBlyxZukYtFMySFnzuprq72aIufm5sLq9WacDk4UUjikGiHtugQKolEgqKioqh9uIH2kdDS49VlVZh0pqO7bRprclLRVKKEQu76ZQ+lU12hUHAxbho6m5+fh1wuR1dXF6anp/2GwBKJmZkZ9PT0YP369SguLo716QBw9QNbv369ix/YyZMnkZKSwn0G3vzAKBaLBS0tLcjOzsbGjRs9PpafW6E3M5FqhvSGN1t8umsxm80uu5V4RhSSOIQKSTRmKPCHUDkcjogeyx2pVOrzmLTcdnJyEulravA/H2ihWZgGIIFUCtSsysS9l6xDboZCkE512tfDsizOOussKBSKgEJgiQL1Rzt16lTcGUu6E6wfGIXeFNGcViCfDxWISDVDBgI/rOdwOKBQKJCSksIZa6ampnJ9K0qlMu5cFEJJtptMJqhUqgidUeSIGyHxd3HznXgjUS0FfNIzMDw87NJRH82dkK8cCd0ZmM1mNGzdjnsPD2FmwYrinFTIpRJYHQw6Jxfx1IdjuPuCSpeyT5pEDQaTyYS2tjZkZmZi06ZN3GfgKwSWlJTk4pobb19uPlSUNRoNtm7dGtC8mHjBnx9YRkYGJygDAwMoLS0NK1znq7w4GrsVQghSUlI4IeXb4vf19cHhcHCiolKp4iIMFmpDorgjiSBUPCIlJLQnYnFxETt37uTcN2UyGex2u+DH84Y3IeEn1Xfu3In2KSMm5y0oyk6B/F/z0lOSZFCmyvHhsB56YzEyk2Uhf6FpvmDNmjWoqqryugDxQ2Asy3JlrcFUgcUChmHQ1dUFo9EoyMjfWOIpvzU3N4fJyUkYDAbIZDJYLBZoNJqI+oFFctaKe/OkJ1v8ubk5aDQaDAwMxNwWnwptKDuSRHP+BRJISOhdTiR2B3SRVigUaG5udqnjj3ZuxpOQ6PV6tLW1YfXq1Vy5rcnOwMkSyGWuC3ySTAqLnYHJ5kR2alJId6ATExPo7+/Hhg0bsHr16qDOPdgqsFhAw3W0cinR3In9oVAoIJFIsLCwgI0bNyI1NRU6nW6ZH1h+fr4gXdTRmLXiq2qL36dTWloKh8PB7VZiZYtPX3Mo7r/xnvvxRNwISTBTEoVEp9Ohvb3dZZHmE8n5IJ5wT7bTpLr7MK+q/HRkJssxb3ZAlf6vhZAAepMdFXmpKMxKCcnu5OTJk5iamkJDQ0NYsVp/VWCxCoFRD7KMjAyXcN1KgvqC1dXVceXqOTk5UfMDA4SftR
JM+a8nW/y5uTlMT0+jv78f6enpEbfFD1VIjEajuCOJNHK5XLBeEjpne3BwELW1tV4rdWK1IyGEoK+vD1NTUx79vIqVqbhoQwFebJ+GxW5BapIMSzYnUpOkuKZhNeSy4L4cDMOgs7MTJpMpaDuQQIiHENji4iLnQRZo0jmRIIRgeHgYo6OjXn3BoukHBgQ/a4X+f3dC7SPh39BQVwFaXhxJW3x+gUswiBYpUUCoRT2YIVRC2roHAj1eS0sLLBYLdu7c6fXC+vyuEqzKSsaRHg30Jjvq12This2F2FEe3E7CarVCrVZDLpdj+/btEbc7iUUIjM4RqaioQGlp6YoUkYGBAczMzLjMjvdFLPzA6HGB4GatCNWQmJSU5GKLv7S0BJ1Ot8wWn+5WQr1OaKI9mL+nn4G4IwmDaIW2rFYrN7M8kCFUwTQICoHD4YDRaIRKpcLOnTt9LuoyqQSXbczHpRtywRIgSR58WGJxcRFqtRq5ublR6eR2J9AQWDihl8nJSfT19WHjxo1YtWpVBF5FbKHmkvPz89i2bVtIMXZvfmBarTYifmBAcLNWItHZLpFIkJWVhaysrGW2+BMTE2HZ4oc6Y17MkUSBcG1SaCd4QUEBamtrA7owoxnampubQ39/P6RSKRobGwPqVKfVLLIQ7pzooKZ4ukt3D4EZDAZotVr09fUFHQIjhODUqVMYGxsLO+cTrzAMg46ODlitVmzbtk2wsKAvP7Cenh7OD0yogVa+Zq1YrVbYbDawLAuHwxGxZkhftvj0NVNR8feaQ6nYAsSqragQjk3K+Pg4+vr6sH79eqxduzbgCz9aoS16fmvXroVGowlYRELpD+E34W3atAkFBQXhnn5EkEql3Bc32BAYy7Lo7e3lwpeJWJvvD4fDAbVaDQDYunVrxEKSvvzARkZGBPcD44fALBYLOjo6UFBQAKVS6dJgSx8biWZIX7b4Y2NjkMlkXANoTk7Osvc+lB4Su90Oh8MhCkk4RCq0RRcUjUaDpqamoO9KIx3aYlkW/f39mJqaQlNTEyQSCaanpz0+VohphnQol1arRVNTE7Kzs4V4GREnmBBYVlYWuru7YbPZsH37dr/hy0TEm29WNIikHxgfk8mElpYWl478WDRDAoHZ4lNhSU9PD7mHBEBC3vTEjZAEQrChLZvNBrVaDYZh0NzcHFIVSiRDWw6Hw2WscFpaGhYWFjwKl78BRYEer6OjAw6HAzt27EjoBdZbCKy3txc2mw1JSUkrcm48EJhvVrQQ0g+Mj9FoREtLC4qKilBdXc19jrFohvT0mvm2+Farldut0Jua1NRUMAwDp9MZ8A7NaDQCgJgjCReJROJznnAwizod25uTkxNWvwC/HFfIRclkMqG1tRVpaWnYuXMnd7F52gHx78DoOQWL2WxGW1sb0tLSIjJjI5bQEFhKSgpmZ2c576WZmRkMDAzETSOkEITimxVN+H5gTqeTK/P25wfGZ2lpCS0tLVi7dq3fm4FoNEP6IyUlZdkObXR0FFarFe+8845HW3xP0NLfRLPLB+JMSPwhl8sDMlGkQ6iqqqpQVlYW1pdNJpNxW2ihvrRzc3NQq9UoLi5ethi4N0D6s38PBIPBgPb2dhQVFWHdunVxt/gIwfz8PPeeUksXWokjdBVYrJifn0dbW1tMbe6DQS6XB+QHlpeXh+zsbK4bv7W1FWVlZSgvLw/qeJFqhgz2HFQqFYxGI+RyOSorK7khXnxbfDrEi3/9GY3GiM9r/9///V88+OCDmJmZwZYtW/Dwww9j+/btYT9vQgmJTCaD1Wr1+nuWZTEwMICJiQnBhlDxzSKFuOjGxsY4+5E1a9Ys+71UKuWEix43HBGZmppCb28v1q9f7/F4KwE6bKu6utql+x8QtgosltA+mKqqKpSUlMT6dILGmx+YTqdDW1sbV4prMBhQXl4etIh4IthmSCFFheZIqC0+vwFUr9djYGAAdrud260wDBPx0t9nnnkG99xzD373u99hx44deO
ihh3DJJZegv78/7IIbCfEVS4oyDofDZ2J7ZGQEer0ejY2Ny35H/ZOsVisaGxsF6w5lWRavvfYazjvvvLByCjTJPT097bMU1W6346233sIFF1zgMhkulMqsoaEhjI+Po66ublln/EphbGwMg4ODQVef8e+QtVotFhcX4zYENjMzg+7ubtTW1qKoqCjWpyM4LMtiYmICAwMDUCgUsNvtgvuBeTomf7dCl0GhdiuDg4NgGAbr16/3+HtCCDfES6PR4Morr0RycjIIIfjLX/6Cc889V/Ac5o4dO7Bt2zb85je/AXD6PVi7di3uvPNOfOtb3wrruRNqR+Kt/HdpaQltbW3IyMhAc3OzoPF/elGFU7lFyzRtNhuXVPcG/cLY7XYoFIqQRIR27i8uLq7Y0le+L5g3OxBfRKMRUgiob9aWLVvielZKOOj1es6qaPXq1bBYLFzCPhJ+YEBwzZCh7Fb8NSRKJBKkp6cjPT0dJSUl6O3txQ9+8AO89NJL+I//+A/odDpccMEF+OMf/yhIeb7dbkdLSwvuvfde7mdSqRQXXnghPvjgg7CfP6GExFOyXaPRoKOjA2VlZT7tzoU+bqDQEsb09HSXpLon6F1RWloa3n//feTm5qKgoAB5eXkB9wjQSjWpVIrt27evOGdb4PSXtKurixNKIXaf8RYCC8Q3ayUwOzuLzs5OF9eB1NRUrF271mVKIt8PjJ+wj5QfGBWVUMuLgx13oVQqUVdXh+HhYbzxxhvo6enBq6++KlgTrU6nA8Mwy0ZkFxYWoq+vL+znjyshCWS4Ff1QCSEYHBzEyMgIN4QqUoTaS6LT6aBWq7F27Vq/SW568RJC0NzcDKPRCK1Wi9HRUXR3d0OpVCI/Px8FBQVevzxLS0tQq9XIyckJuHM/0aAl0wzDREwow2mEFIJQfLMSERqy27x5s9e77kD8wPLz87lKvUj5gQVbXhxqHwkN423cuBEbN24M45VEl7gSEn/QnYHT6URHRweWlpZchlBFilC620dHRzEwMODTWZjiqTKLegBVVlbCYrFAq9VCq9Xi5MmTSE9P5xYyaixHPZFotUu8xPeFhPqkpaamoqGhISrhJn8hMDpgSaiwixC+WYkArdwKJmQXD35g/FJ89wmk/N1KvM1rz8vLg0wmg0ajcfm5RqMR5CY8oYSE2sh/+OGHSE5OXjaEKlIEE9rid9Jv3boVOTk5fh/vr1M9NTWVq82nNtizs7NobW2FTCZDSkoKlpaWgh5ElUjQPFheXh5qampittuKZAgsUr5Z8QZNrNfX14cVuvHmBzY2NsZ1mwvtBwb4361IJBI4nc6gj2c0GiMmJAqFAk1NTXjzzTexf/9+AKfP/80338Qdd9wR9vMnlJAsLi7C4XCguLgY69ati9piEuiOhFaO2e12v530tFKEP7cgkAuPb4PtdDrR1dWFubk5yOVyzvok2LxKvDM3N8flwcLtCxISIUNg0fLNijVjY2MYGhpCQ0OD35usYAjUDyw/Px8qlUqQghxvuxWTyQSTyQS5XA673R5wM6TZbI7oLJJ77rkHN998M7Zu3Yrt27fjoYcegslkwq233hr2c8eVkHj7otEhVCdPngSAqHf0BpIjoR3HGRkZ2LFjh9+kOr9TPRTjRYfDgc7OTlitVuzatYvblQSbV4l3pqen0dPTE/e7rXBCYDabDa2trUhJSUFdXV1CNUkGw/DwMEZGRtDY2BhxjzdvfmAnT56ExWIRzA+MQgXCZrOhs7MTq1ev5vI6gTZDmkymZclwIbnuuuug1Wpx3333YWZmBvX19Xj11VcFOWZc9ZHQ/If7z7q6umAwGLB582YcP34cF154YVQtPk6cOIHCwsJlzW6UUJLqoQy+oVgsFrS1tXELj6f3gpZQzs7OwmAweMyrxDP05mFkZCTh+2D4ITCtVusSAsvIyEBnZ2dc+GZFCmrnPz4+jsbGRmRlZcX0fPh+YHq9PmQ/ME/PS00m+euAezMkf8nlJ+w/97nPYe
fOnWH3dMSCuNqRuEMXTDqEim73gy2tCxdvoS1qx37y5Els3LjR7x2zEHYn8/PzaG9vR2Fhoc/wHr+EkuZVtFotl1fJy8tDQUEBVCpV3C1edMzw7Ozsiqhacg+BmUwmaLVajI2NwWg0QqFQIC0tjUu2xrvIBwOtrpyamsLWrVvjoqdJCD8wd6iRpruIAK65Fboz8TQZcmpqKmF3o3ElJPw3X6/XQ61Wo7Cw0GVyn0QiieroW8BzaItW19DFjsZ7CSGYXbLB4mBRkJmMNMUniblw7N+B0+WSPT09Qdtk8PMq7k65DoeDuzvOz8+PeWyezo43m83Yvn17wobkvEErj5xOJ0ZGRlBaWor09HTodDqMjo4KXgUWSwgh6O/v574j8TiL3JMfmFar9ekH5o7FYsGJEyeQn5/vNyLhPpeergnvvPMO2tra8KlPfSoyLzTCxFVoi2VZ2O12jI+Po7+/H+vXr1+2YL755pvYtm1bVLfHXV1dSE5ORnV1NYDTSXW1Wg2Hw4HGxkZusdMZbXhBPY1+jRF2hoUqTYFzqlU4uyoXhBcjDcXuhE7627RpkyAeYvR5jUYjZmdnodVqYTQaubxKfn5+1EtP6fsqkUhQX18fc1GLFNQ3y90bzFcILN69wNwhhHCDxbZu3ZqQNwR8PzCdTgeJROIywCspKQlWqxUnTpxAbm4uampqQrpB/OCDD3DVVVfhgQcewJe+9KWE3JHGlZA4nU60t7djdnbWa1XHsWPHsGXLFkErPvzR09MDmUzGVea0trYiMzMTmzdv5kJsdieLR94eRu/0ElZlJyNFLoXOaIfNyeCGrcXYVqoMSUQYhuH6Curr6yMa5rFardBqtTHJq5jNZrS2tiIrKwsbN25M6DtxXwTqm0Wb76ioxLMXmDt0t76wsICmpqaEnntDocOsqKjQkbhmsxkqlQqbN28OKUR8/Phx7Nu3Dz/84Q9x5513xu1n6o+4EhKDwYCOjg5s2bLF6x3MO++8gw0bNkTVd6i/vx8MwyA/Px/t7e0oKSlxGbYDAN1Ti/jd28NYrUxBSpIMIABLWIzozCjLTcNdnwre9puWExNCsGXLlqjekfLzKjqdDlKplKsAc7e/Dhc6O2Yl29wDn/hm1dXVBX398qvAaLl3PIbAqH2N0WhEU1NTQu2igoGOLZDJZJwvXrB+YG1tbdi7dy++853v4P/9v/+X0Nd9XOVIlEolduzY4fMNDXZKohBIpVLo9XpMTk56TarPWxxgWOIiIoQQZKbKoTc74GAIFPLALxSj0Qi1Wh2zO/Ro5VVoV3JlZSVKS0sFfAXxgxC+WYE0QtI+iVgt3izLcg2VW7duXZE+b8DpEt/u7m7k5+ejtraW+zx0Oh33efjzA+vs7MSVV16Jr3/96wkvIkCc7UgIIbDb7T4f89FHH2HNmjV+bUeEgmVZfPjhhzAajdi+fbvXRYDbkWSnQCGXcIOwRucsKMtNw1fPD9y2hDbgrV27FpWVlXF1kQmZV6Edzhs3boxo/XwsoQlnjUaDxsZGwUOT8RICYxgG7e3tXN5wpea3bDYbWlpauBs89/eW7wem0+kwPz/P+YEtLi5i06ZNGBoawmWXXYYvf/nLuP/+++Pq+x0qcbUjCeQN9WYlHwnsdjva2tpgt9uRk5Pj806yKj8dlfnp6JlaRFFWMpIVMuiNp0VxV0VOwBfL+Pg4BgYG4rYBj994V1lZyeVVaLNXWloaCgoKfOZV+LNShO5wjiei4ZvF95+KhheYJ5xOJ9RqNQghaGpqWlFjnPlQK3ZvIgL49gO79dZbMTU1BZZlceGFF+L2229fESICxNmOBDit+L5Qq9XIzs4WZIKaL5aWlrjkb25uLjQaDbZt2+bxsbTJcHbRipc6NRiYNcHBsFCmJuHc6lycXZ0LqZ8Lhjq+Tk9PR72YQCi85VWoLQUto+7p6YHBYEBDQ0Nc9BVEAr5vVmNjY0zCTd6qwIQMgVFrF6lUii1btqxoETlx4gQyMzOxad
OmkARgYGAAF154ISoqKsCyLNra2rBt2zb88Y9/TCinX0/EnZDY7Xb4OqXOzk6kpKRwpbiRYHZ2Fh0dHSgtLUVVVRWmp6cxPj6OHTt2LHssv8mQVmXNLtlhcTAufSS+cDqd6OzshMViQX19/YpwfHVfxBwOB1QqFcxmMyQSScwW12jA982KlzJmXyGwUE0NHQ4HWltbkZSUhC1btsRNwl9o6E4kPT0dmzZtCqk6a2RkBHv27MEVV1yBX//615BKpZiZmcErr7yC/fv3J+SNI5+EE5Le3l5IJBLU1NQIfmxqy0FHt9LyTI1Gg6GhIezatWvZ48PtVLdarWhra4NCoUBdXV1cLDpCQwjB3Nwcuru7ufcrlv0qkSRRfLPCrQKji2tqairq6urizh1BKOjrTEtLC7nEd3JyEhdffDEuvvhiPPLIIyvyvUq4fahMJoPD4RD8eVmWRXd3N3Q6HbZv3+5iKufJIkUIEVlYWIBarUZ+fn5MrdEjjclkQm9vL/Ly8rBhwwbY7fag8yqJALXJUCqVcT9YLJwqMJpwzsjICPkOPRGgO65wRGR6ehqXXXYZzjvvPPz2t79dse9V3O1IHA6HT6fdoaEhmEwm1NXVCXZMmlRnGAaNjY3LGqj0ej06Oztx7rnnurh5hiMiGo0G3d3dqKioQGlpacIunv4wGAxQq9UoKSlBRcXyXppA8iqJAM2pFRYWRt2dWkj8hcCSkpI4sfSWcF4JOBwOtLS0cDvLUARAo9HgsssuQ2NjI5588skVmz8CEnRHImQfydLSEvfF2Lx5s8eFiw62cp+KFoqI0PDZ8PAwNm3a5HXE6EqAdnGvX78ea9as8fgYb/0qfX19Lv0qQk28iwTz8/Noa2tDaWlpwk+n9FcFxjAM0tPTUVhYCJZlE0bog4HuRJKTk0MWEZ1OhyuuuAKbNm3CE088saJFBEhAIRGy/Hd2dhbt7e0oLy/32a9BhYQ/QySUi4tOT6T+Q7G2044ko6OjGBoaQl1dXcDeYJ4GRc3OzmJsbAw9PT3Izs7mQmDxklfx5pu1UqAhsOzsbMzPz0OlUiElJSWuGiGFxOFwcDnLLVu2hPQ9NxgM2LdvH6qqqvDnP/95ReY93Yk7IfF3NxfM2Ftv0E7joaEhbN682e/MYuo4PDc3F7LtusPhQHt7O5xOJ7Zv374i/Ic8QcuYZ2Zm0NTUFPIAIyH6VSINHbq1ceNGQeZexytGoxEtLS1YvXo1qqqqIJFIXOzwqVNuuFVgscbpdKKtrQ1yuTzkncjCwgL27duH1atX45lnnonbXbTQxF2OxOl0+hSK2dlZDAwM4Kyzzgrp+akX0NzcXECT2gghcDgcGBgYgEajAQDOcyrQGL7JZIJarUZ6errX8NlKgA4hMxqNaGhoiNiuwel0cuGWWOVVwvHNSiQWFxfR2tqKtWvXesxxURLFC8wbTqcTra2tkMvlIZcyLy0tYd++fcjKysLf//73FXuz6ImEE5K5uTl0dXXh3HPPDfq5bTYb2traQAhBQ0ODzw/aU1KdEIKFhQXMzs5idnYWDoeD29Z7m5Gu1+vR0dGB4uJi7m5uJUJ7JwghqK+vj9qdGB2jSi1bIp1X4Vv6NzQ0hOSblSgsLCygtbUV5eXlKCsrC/jvotEIKSR0JyKVSlFfXx+SiJhMJlx99dWQy+U4fPhwXM5eiSRxJySexu3yWVhYQEtLS9ADYAJJqlPck+qe7N8JIVhaWuIWMJPJBJVKxYVbkpOTMTk5ib6+PtTU1ETNGywW0EmWtEwyVnef/MFEs7OzMBqNguZV+L5ZTU1NK7YrH/ik2q6ysjKoIWruRKIRUkgYhkFra2tYImKxWHDttdfC6XTi5ZdfTviJnqGQcEJiNBrxwQcf4KKLLgr4OTUaDTo6OlBRUeFzew580h8SbFLdbDZzO5XFxUUoFAo4HA6/cycSncXFRbS1tcVl2Ss/r6LX6znzvIKCgqDzKrTPiM7YSMRBTYEyNzeH9vZ2rFu3zm
u1XajEUwiMYRi0tbUBABoaGkI6ttVqxfXXX4/FxUUcOXIk5JxgopNwQmKxWPDPf/4Tl1xyid+FgIYhTp06FVBSne5EGIYJuT+EuqAuLi4iPT0dCwsLSE9P5+6KY30HJiS0YikRemHCyavEg29WtKC2/jU1NRE3DY1lCIxhGKjVarAsi8bGxpBExGaz4d/+7d+g0Wjw+uuvJ7zNSTjEnZCwLOuzc93hcODNN9/EhRde6LM2m2EYdHd3Q6/Xo7Gx0W+prVB2J3TYzZYtW7hdCX8BS0pK4mZEK5XKuF58fUHDdom44womrxKPvlmRYnZ2Fp2dndi0aVPUbf29hcCoqAh5A8YXkYaGhpB6PBwOB2666SaMjIzgrbfeQm5uriDnlqgknJCwLIvXXnsN559/vtc7Fup3BMBvUp0+Z7id6ktLS2hra4NKpfJqj8EwDPR6PbeAAcFXgMUafrJ5y5YtUKlUsT6lsPCWV8nPz4dSqURvb2/c+2YJAW0e3bx5c1w0yUYqBEYjBk6nE42NjSGJiNPpxOc//3n09fXhrbfeiov3K9YknJAAwJEjR3DWWWd5rIyg5Yo5OTnYtGmT36Q63YkAnpPqgUDDAbS6JZDnIIRwd8WBVoDFGpZl0dfXB51Oh4aGhhWZVKR5lZmZGczPz0Mul6O4uBgFBQXIzs5O2B2kL2gfSLyWMgsVAmNZ1mX4VigiwjAMbrvtNrS1teHo0aMrun8oGBJSSN58801s27ZtWbhqZmYGnZ2dEU2quz/H2NgYhoaGwpryx586ODs767ECLNZQq3ur1RrQLi+Rob5ZdD59ovuA+YL2w9TX1yfE7jLUEBgVEbvdHvIER4ZhcOedd+K9997DsWPHVnQlZrDEnZAEMm732LFjLsOf+En1uro6vwu6EEl1eneu1WpRX18vaLUGrQDTarVYWFhAVlYWl1eJhTUI7b+hzVrxuFsSCm++WTSvQkNgdrvdZQeZiB3Mo6OjOHXqVEL3wwQSAqOz5G02W8giwrIsvva1r+HNN9/E0aNHUVpaGoFXk7gkpJC88847qKmpQX5+PtdNbTAYopZUdzgc6OjogN1uR319fURLQW02G7d46fV6pKenc3mVaFSAmUwmtLW1ITs7Gxs3blyxNthA4L5Z/LyKVqvF0tISl1eJldgHy/DwMEZGRgJyd0gUPIXAVCoVbDYbWJbFtm3bQhaRb37zm3jppZdw7NgxVFRURODs/fPII4/gkUcewcjICABg48aNuO+++7Bnz56YnA+fhBSS999/HxUVFVAqlWhra4NEIkFDQ4PfEJAQSXWz2Qy1Wo3U1FRs3rw5qq6etIR1dnbWpQIsPz8fOTmBz4UPlPn5eajV6hXflQ+E55vlrV8lPz8/7vIqhBAMDQ1hYmICTU1NKzLPBXzSMNzV1QWLxQKWZZGVlRV0FRjLsvje976Hv/3tbzh69CjWrVsXhbP3zEsvvQSZTIbq6moQQvDkk0/iwQcfRFtbW8xH9cadkAD+57Z//PHHUKlUGB8fR25uLjZu3BiVpDpdWIuKirBu3bqYLhAsy3JzPGZnZwEIWwE2OzuLrq6uFetqy0dI3yyn04m5uTlO7OMpr0IIwcmTJzE9Pb3iO/Opp57JZEJTUxMABF0FRgjBj370IzzxxBM4evQoNmzYEO2X4ReVSoUHH3wQX/jCF2J6HgkpJO+//z6WlpZQXV3td/6De1I9VBGhd6zr1q2Lu4WVXwHGr2opKCgIqQKMLqwrfV5KpH2z4imvQu1dtFotGhsbV7QXlLuIuL/PvqrAcnNzkZqaCkIIHnjgATzyyCN46623sHnz5hi9Gs8wDINnn30WN998M9ra2lBbWxvT84lLIfE2t51uy4eGhrB69Wq/Hy4hBEsWO6bmLbAxBOkKOVYrk5EsD/yukL/YbN68OS7LI/l4qwCjuxVf4T9CCAYHBzE5OYn6+vqETcAGQrR9s3zlVfLz8yO6sBNC0NvbC71ev+LtXQgh6OrqwtLSEr
Zu3epXrN2rwH7729+io6MDubm56OzsxLFjx9DY2Bils/dPZ2cnmpubYbVakZGRgQMHDuCyyy6L9WkljpAwDIPOzk7Mz88jKysLmZmZqK6u9vochBBoFix4/9Qc5owO0E1IUXYKdlaooEz1f5dOu+MXFhbQ0NCQkKEAs9nM3RHzK8DcFy++l1RDQ8OKv2ONtW9WtPIqLMuip6eHe60ruWybEILu7m4sLi6iqakppLL5yclJ3HPPPXj11VehUCiQk5ODvXv34sYbb8TZZ58dgbMODrvdjrGxMSwsLOC5557DH/7wB/zzn/8UdySecBcSq9XKOXQ2NDTg1KlTkEgkqKmp8fj3LMvCyTB4o3cW04t2rFWmQiqVgGEJxgwWrCtIx1lVvi0NbDYb2tvbASCqtuiRhFaA0TgxHQ6lUqkwNDTEWUashNfqDdrZbLfbAyrQiAY0r0I/G6HyKizLorOzkwvxxMNrjRSEEPT09GB+fh5bt24N6bUSQvDYY4/h/vvvxyuvvIKmpia8/fbbeOmll1BZWYm77rorAmceHhdeeCEqKyvx6KOPxvQ84m5CInA6j0GFhM5EyMvL48pPZTKZx6ZFflJdb7JDa3SgMDMZUunpuzuZVIK8dAUmF6ww2pzISPb88o1GI9ra2qBUKlFbW7siGs8AIDk5GWvWrMGaNWu4CrDp6WkMDw9DKpVi9erVMBqNUCqVK7LMl++b1dTUFDf9MHK5HIWFhdwcdJpX6e/vh81mc8l3BSry1GjSZrMFFOJJZPgiEqpgEkLwxBNP4Pvf/z4OHz6MXbt2AQAuuuiioJzGow3Lsn5zytEgLoWEMj09ja6uLlRVVblYj8hkMlgsFpfHus8QIQRgcVo8+MikErCEgGE9b8R0Oh06OztRUlLitzs+kZHL5UhPT8fS0hJWr16N/Px87rUTQrg74tzc3BUhpNR/Ld59s6RSKVQqFVQqFdatW8flVcbHx7m59f7yKnw/qXgSzEhA8z8GgwFbt24NKXRHCMGf//xnfOtb38KLL76Ic845JwJnGj733nsv9uzZg5KSEiwtLeHAgQM4duwYjhw5EutTi08hoWWKIyMj2LJly7LKIfe57fwmQ4lEAqlUCmW6AjmpSZgz2bEq65OLa85kR1F2CjJTlr/08fFxDAwMJKSjbbDo9Xq0t7e7dHAXFBRwFWBarRYDAwNhV4DFA2azGa2trdwOM1F2W/y59RUVFS55lcHBQY95FafTyU2qDNVPKlEghKCvrw96vT4sEXn22Wdxzz334Lnnngt6YF40mZ2dxU033YTp6WlkZ2ejrq4OR44ciYsdU1zmSNra2rgyRU8NUxMTE5iensa2bdt8dqqPzJnx4bABVgeDlCQZzHYGmSly7KpQoVj5yUVHK3hmZmZWfLUS8Ekp84YNG3zOnPBUAZaTk8Ml6xMhcUt9s1atWhXz3h8h8ZRXyc3NxcLCApKTk0Me1JQoUBGZm5sLq2Di+eefxxe/+EU888wz2Lt3r8BneeYQl0Ki0+mQnJzsNa47PT2NkZER7Ny506/dycyiFSNzZixZnchJS0JZbjryMj55XmpGaLFY0NDQsOJLI0dGRjAyMoK6urqgZyhYLBZOVGgFGC0rjscqL2++WSsNlmWh0+nQ09PD7dRDyaskCvyemK1bt4b8nT18+DBuvfVWPP3007jqqqsEPsszi7gUEqfT6RK6cmd2dhYDAwPYuXMnCCEhNxlaLBao1WooFArU1dUlZNgmUOgd3OzsrNedXjDY7XauAZJfARYtDzB/UGv/M6Ez3263o6WlBWlpadi0aRNX8h3tfpVoQAjBwMAAZmdnwxKRV199FZ/73Ofwf//3f7juuusEPsszj7gUEl/jdmkM//jx41i1ahUKCgqQm5sbdNx7YWEBarUaBQUFWL9+fcLEzUOB9uCYzeaI7Lo8eYDRnUosKsDC8c1KNGhpfGZmpkdTTavVyn028e4D5g+aO52ZmcHWrVtDNsd86623cP311+
N3v/sdbrzxxoR6D+KVhBISvt0J33rC6XRyC1cgVUYajQbd3d2orKxESUnJir6Q7HY71Go1JBJJVEbFsizrMgUy2hVgY2NjGBwcxJYtW1b8+FOLxYKWlhbk5OSgtrbW73XsKa+Sl5eXEBM6qevC9PR0WCLy9ttv49Of/jR+9atf4dZbb13R3/1okjBCwhcRfiiLEILFxUXMzs5Co9FwfkY0PsyvWiGEcPbZmzdvRn5+flRfV7Qxm81oa2vj7lajvVAQQrCwsMDlVWw2m8tnI6SoUSub8fHxM6Jgwmw2o6WlBXl5eaipqQl6QeT3q2i1Wq5fhYp+POVVqIhMTU1h69atIYfn3n//fVx99dX42c9+httuu00UEQGJSyFxn5LoTUTc4VcZaTQaWCwWqFQqFBYWIjc3FydPnoRer1+xY2L50NBdvFQr8T8brVYLo9EoWAUYTb7S/E8iWtkEg8lkQktLCwoLCwX5bKnfFP1s4imvQv31JicnwxKR48ePY9++ffjRj36EO+64I+bfh5VGXAsJ7VQPdYYI/XLMzMzAaDRCJpOhoqICq1evjqs7LqGhiebKysq4neQmVAVYPPhmRRNazlxcXIzKysqILIjxlFfhz04J9Qahra0Ne/fuxXe/+13cc889oohEgLgVErvd7tKpHmplFp3wl5aWxs3fXlxchFKp5KqMEqEfIlAmJiYwMDAQ1gz5aGO3212mQKampnI7laysLK+fezz6ZkWSxcVFtLa2cq4L0YCfV9HpdJBIJFHLq1DX7a1bt4YsIp2dnbjsssvw9a9/Hd/61rdEEYkQcSkkTqcTVquV++9Qq35o9/aaNWtcJvxZrVbubpi6CRcUFKCwsDBh72hpCIDmCOg8+0TDfTCUTCbjBJ9fAeZwOLjpmNEoIog1tCemvLwcZWVlMTmHaOZVhoeHMTo6GpaI9PT0YM+ePbj99tvx/e9/XxSRCBKXQnLzzTdjaGgI+/fvx5VXXoni4uKgL4KJiQn09/ejpqYGxcXFXh9H+yHo3XBGRgYnKolSc0+twg0GQ8La3XvCUwVYXl4ecnJyMDo6irS0NGzevDmuq42EwGAwoK2tLa56YvhzPGZnZwXNq1ARCWcUcH9/P/bs2YNbb70V//3f/y2KSISJSyGZmJjAc889h0OHDuH999/H1q1bsW/fPuzbtw+lpaV+JyKePHkSU1NTqKurg0qlCvi4DoeD+2LMzc1xIZbCwkJkZGTE5cXodDrR3t4Oh8OxosM7tAJscnIS09PTAMCFWPLz81fsjmRubg7t7e1Yv369zxuiWEPzKnS+Smpqakh5Feq8EI6IDA0N4dJLL8X111+PBx98cEX3iMULcSkkFEIIpqen8fzzz+PQoUN4++23UVdXx4kKP1wFfNJ4ZzKZUF9fH9ZdkXuTnUKhQGFhIQoKCnzG7aOJzWZDW1sb15m/kg36gE8SzYWFhSguLuZEX8gKsHiCFk1s2LAhoUxEveVV/PUSjY6O4tSpU2hqakJWVlZIxx4ZGcGePXtwxRVX4Ne//rUoIlEiroWEDyEEOp2OE5W33noLNTU1nKikpKTgS1/6Er7xjW/gvPPOE/QOlWEYLm6v1Wq5uH1hYSGUSmVMRIXOTFGpVNiwYcOK/8IYDAao1WqUlZW5jBQATleAUVGZn59HZmYml1dJlPCkOxqNBl1dXdi0aVPCFE14ItC8ytjYGIaGhsISkYmJCVxyySW4+OKL8cgjj6z470Q8kTBCwocQAoPBgL///e84ePAgjhw5ApZlUVlZicceewxNTU0Ru4j4cfvZ2VlIJBLk5+ejsLAQOTk5Ubl46aK6du3aiJWAxhP0znzdunVYs2aNz8d6qgCjZcXxspP0B7V4qaurW1FNs97yKklJSdw8+ezs7JCee3p6GpdeeinOPvts/P73v49Z3uwnP/kJDh06hL6+PqSmpmLXrl144IEHsH79+picT7RISCHh88ILL+Bzn/scLr/8cthsNrz22msoKirCvn
37sH//fjQ0NERUVObn57kGSGoHEqr/VyBQe5dAFtWVAF1UQ7kz91YBlp+fHzXRD5bJyUn09/efERYvVqsVJ0+ehEajAQCXsu9g8ioajQZ79uzB1q1b8eSTT8a0+ILmZrZt2wan04lvf/vb6OrqQk9PT8LujgMhoYWkvb0dZ511Fp566inOBtpoNOLll1/GwYMH8corr0ClUuHKK6/E/v37sW3btohdZDQZrNFoMDs7C4fDwYlKXl6eIMcdHR3F0NDQGWHvAgjrm0V3kvRumGXZoPzZosH4+DhOnjyJ+vr6oIpEEhXa80TdCELJq+h0Olx22WWora3FgQMH4i5PqNVqUVBQgH/+859xO3lRCBJaSIDTXz5vJZFmsxlHjhzBwYMH8Y9//APp6em44oorsH//fjQ3N0fsoiOEYGlpiRMVq9XqUmEU7HGpdfb09DQaGhpC3v4nCnzfrEi8Xk8eYHR+R6wqwGiiuaGhYcX7hAGfiEhDQ8OynqdA8yp6vR6XX345Kioq8Mwzz8SlW8Xg4CCqq6vR2dmJTZs2xfp0IkbCC0mgWK1WvPHGGzh06BBefPFFyOVyXHHFFbjqqqtw1llnRWzxoHFhKiomk8ll0fJ38TMMg+7ubiwtLaGhoSFk19NEIdq+WXyfKX4FGN2tRKMCjHZwNzY2hpxoTiRo+M6TiLjDz6totVpMTU3hgQcewNlnn4233noL5eXlOHToUFyWvbMsiyuvvBLz8/N49913Y306EeWMERI+DocDR48excGDB/HCCy+AYRhcfvnl2L9/P84777yIXpT8RWtpaYkrWy0oKFh2XIfDwc3frq+vj8s7LiGhvlmLi4tobGyMictANCvAqBsB9ZJa6UaiADA1NYW+vr6Qw3c6nQ6PPvoofv3rX8NsNqOqqoqr3Ny5c2dchCgpX/7yl/HKK6/g3XffXfH5zDNSSPg4nU68++67ePbZZ/HCCy/AZDLh8ssvx759+3DBBRdEdDFzNy7Mzs5GYWEhl/+gHmFnQvc23zersbExLkTTvQIsJSWFE5VwK8BouFKj0aCpqWlFJ2Ip09PT6O3tDSsHZDQacfXVV0OhUOCvf/0r3n33Xbz44ot45ZVX0N3dHTe5wzvuuAMvvvgi3n77bZSXl8f6dCLOGS8kfBiGwQcffIDnnnsOzz//PAwGAy699FLs27cPF198cUS/7DabjRMVg8EAAMjMzMSmTZtW/CJDfbOkUim2bNkSl13qQlaA0bHHOp0OTU1NKz5cCXwiIuEUTpjNZlx77bVgWRYvv/yyS9iTuoPHGkII7rzzTjz//PM4duwYqqurY31KUUEUEi+wLIvjx49zojI9PY2LL74Y+/btw549eyIWhpibm4NarUZeXh4YhoFer0d6erqL/1ci9EIEitVqRVtbG1JTUxNm58WyLAwGAyf8wVSAEUI4X7QzwfYeAGZmZtDT0xOWiFitVlx33XUwGo04cuRI3OaSvvKVr+DAgQN48cUXXXpHsrOzV/RnLQpJALAsC7VazYnKyMgILrjgAuzbtw+XX365YDMapqam0Nvbi9raWs4Sw+FwuFi1pKSkcFYtmZmZCS0qZrMZra2tUCqVqK2tjYs7ymChFWA0BGa1Wr1WgNEc0NLSEhobG1eMlYsvaIf+li1bkJeXF9Jz2Gw23HjjjdBqtXjttdfi2tna2/fx8ccfxy233BLdk4kiopAECSEE3d3dnKlkX18fzj//fOzfvx+XX345cnNzg17c6Qjg0dFRbNmyxWv8mGEY6HQ6aDQazv+LxuyjPXAoXKhvVrxMcBQCXxVgeXl5OHnyJMxmMxobG+OyykhoqIiE06HvcDhw0003YXR0FG+++eaKb9JMVEQhCQOaMD148CAOHjyIjo4OnH322di3bx+uvPJKFBQU+F0gWZbl4uXBjACmYS+NRuPi/1VQUICcnJy4Xph9+WatJGgFmEajwfz8PKRSKUpKSlBUVLRirP69MTs7i87OzrBExOl04vOf/z
z6+vpw9OjRuEmkiyxHFBKBoE10Bw8exKFDh3DixAns2rULV155Jfbt24fVq1cvWzAZhkFHRwesVisaGhpCDnXQmD0VFUIIJyoqlSquQkbB+GatBBiGgVqthsPhwOrVq6HX6zE3NydoBVi8odVq0dHRgc2bN6OgoCCk52AYBrfddhva2tpw9OhRrFq1SuCzFBESUUgiACEE4+PjnKh88MEH2LZtG2fVUlJSgomJCTz55JO49NJLBa1UooaWNLzCMEzcWIGE45uViDidTrS1tQEAGhoaOEcDGqKkTXYymYz7jOLVAyxQqIiE8xkzDIM777wT7733Ho4dOxbXc1hETiMKSYQhhGBqaoqzv3/nnXewbt06biTu4cOHI2rVsri4yHXV2+12zqolLy8vqr5EQvpmJQK0pFkmk6G+vt6rgHurAKN5lUSoYqPodDq0t7eHJSIsy+Luu+/GW2+9haNHj6K0tFTgsxSJBKKQRBFCCF555RV85jOfQW5uLqamprBhwwbs378f+/btQ01NTcRCHIQQGI1GTlQsFktU/KX43dtngk8YcLqRsbW1FcnJyairqwtYDKjwU1HhV4Dl5eXFRZOmN3Q6HTo6OlBbWxtyGIplWfznf/4nDh8+jGPHjqGiokLgsxSJFKKQRJEjR47gmmuuwQMPPICvfOUrMBgMePHFF3Hw4EG88cYbqKio4OzvN27cGNEQh9FodKkuUqlUXMxeqAWLNt5ptdqo+GbFAzabDa2trZwjQaifoafZHUqlkhP+eOpJoOOAwxWR7373u3j22WfPqEa+lYIoJFFkaGgIXV1d2Ldv37LfLSws4KWXXuIGdRUXF3OiUl9fH1FRMZvNnKgsLi5CqVRyVi3hFAB0dXVxPRPxtPBFCqvVipaWFmRlZQl+I2C1WrnPaH5+HhkZGS4eYLFK1uv1eqjV6rDGARNC8MMf/hBPPvkkjh07hpqaGoHPUiTSiEIShywtLbnMVMnLy+Ocirdt2xZRUaELlkajwcLCArKysrgGyEDFIB59syKNxWJBS0sLcnJyUFtbG9GF3W63c02q/AqwYAdChQsVkZqaGqxevTqk5yCE4Kc//Sl+97vf4ejRoyvaan0lIwpJnGM2m/Hqq69yM1UyMjK46q/m5uaIJmNtNhvXB2EwGJCRkcGJijf/L75vVn19fdwNGooEZrMZLS0tyM/Px/r166O6O4hVBZjBYEBbW1vYIvLLX/4Sv/zlL/Hmm2+ivr5e2JMUiRqikCQQVqsVr7/+OjdTRaFQcDuV3bt3R9Ts0OFwcKIyNzfH+X8VFBQgIyMDEokkIX2zwsVoNKKlpQVFRUWorq6OaT8IvwJMq9WCYRiXKj2hPg8qIuvXrw+5NJcQgt/85jd44IEHcOTIEWzbtk2QcxOJDaKQJCh2u91lpgrLsti7dy83UyWS4SSn0+li1ZKSkoKcnBxotVrk5uYmrG9WsCwtLaGlpQVr1qxBZWVlXDUVeqsAc58yGCzz8/NobW0Nq6GUEILHHnsMP/jBD/DKK6+gubk5pOcRiR9EIVkBOJ1OvPPOO9xMFYvF4jJTJZLmgAzDYGJiAoODgyCEIDk5mdupKJXKuFpchWRhYQGtra0oKytLiHkTRqMx7Aqw+fl5tLW1oaqqyut4a38QQvDEE0/g3nvvxeHDh1f0HPMzCVFIVhgMw+D999/Hc889hxdeeAHz8/O45JJLsH//flx88cWCz77g+2aVlJRAr9dzoRWJROLi/7VSdil0Qa2oqEjIhjlaUKHVarncl78KMCqc4YrI008/ja9//ev4+9//jvPPPz/clyISJySMkPz4xz/GP/7xD6jVaigUCszPz8f6lOIelmXx8ccfc/b3Go0GF110Efbv349LL7007Jkqvnyz3Du2CSEuVi2JKiq0Uqm6ujrkBTWeoLkvfgUY/ZxoBRgVkcrKSpSUlIR0HEII/va3v+HOO+/EwYMHcckllwj8SgLn7bffxoMPPoiWlhZMT0/j+eefx/
79+2N2PiuBhBGS73//+1AqlZiYmMAf//hHUUiChGVZtLW1cfb3Y2NjuPDCC7Fv3z5cdtllQZeN0tkpgdhhEEIwPz/PiYrT6UReXh4KCwtj7v8VDLTxLpwkczzDMAw3BVKr1UIqlUKpVEKn06GyshJlZWUhP/ehQ4dw22234ZlnnsHevXuFO+kQeOWVV/Dee++hqakJV199tSgkApAwQkJ54okncPfdd4tCEgaEEHR1dXGiMjAw4DJTRaVS+RSVcHyz+ElgjUYDm83GiUq0/b+Cge6+wmm8SyRYlsXk5CT6+/s5oacVYLm5uUF9TocPH8att96Kp59+GldddVWkTjkkJBKJKCQCIArJGQ4hBP39/dxMlc7OTpxzzjnYt28frrjiCpeZKizLYmhoCJOTk4L4ZlH/LyoqFosFKpWK66qPl9ntdEDTmeJaDHxSkVZWVobS0lJO/LVaLfc50WS9rwqwV199FZ/73Ofw+OOP4zOf+UwUX0FgiEIiDKKQiHBQg0Vqf9/S0oJdu3Zx4a/77rsPqamp+J//+Z+I+GbR6YIajSZi/l/BMj09jd7eXmzevPmMGaxERaS0tNRjRRp/CqSvCrA333wTN9xwAx599FF89rOfjcsKPlFIhCGmQvKtb30LDzzwgM/H9Pb2unjviEISHQghGBsbw8GDB/Hcc8/hww8/RFJSEu6880584QtfQElJSUQXBovFwjkVU/8vKirRmnU+MTGBgYGBM8b6HjhdJnzixAmUlJQE5L5rtVq5ZL3BYMBbb70Fp9OJqqoqfPe738XDDz+MW265JS5FBBCFRChiKiRarRZzc3M+H1NRUeFyNyoKSXQxmUy4+uqrMTMzg+uuuw6vv/463n33XWzZsoWzv6+oqIjoQuFuWJiVlcWJitDlzJSxsTEMDQ2hvr4eOTk5ETlGvEG79GmDZbA4HA4888wzeOSRR6BWq1FQUICbbroJV111FXbu3BmXlXqikAhDTDObtMtWJH6555574HA48M477yArKwv33nsvNBoNXnjhBRw6dAg//OEPXWaqRMJrKiUlBSUlJSgpKYHdbudEZXBw0KUHQqhw28jICIaHh9HQ0AClUinIc8Y7JpMJLS0tKC4uDklEACApKQnr1q3D8PAwfv7zn6O8vBwvvPAC9u7diyuvvBJPPPGEsCctEjckTI5kbGwMer0ef//73/Hggw/inXfeAQBUVVWdEXMuYgX11fIUTiKEQK/Xu8xUqaqq4uzvI22V4t4DkZqaioKCAhQWFnL+X8FACMHw8DDGxsbQ2NiIrKysCJ15fGEymXDixAlOREK9EWhtbcUVV1yB733ve/ja177GPY/D4YDBYAh5frvQGI1GDA4OAjg9AvkXv/gFzj//fKhUqpD7ZM50EkZIbrnlFjz55JPLfn706FGcd9550T8hERcIIS4zVV577TWsWbOGE5UtW7ZEVFSo/9fs7Cx0Oh0UCgUnKllZWX4XR0IIBgcHMTU1haampjPm5oTuRIqKilBVVRWyiHR0dODyyy/HN77xDXzzm9+M25wIABw7dsxjV/3NN98s7ppCJGGERCSxWFpawj/+8Q8cPHgQr776KvLy8nDllVfiqquuwtatWyMqKu6NdTKZzMWqxX2RI4RgYGAAGo0GTU1NXi3yVxpmsxknTpzAqlWrwnIu7unpwZ49e3DHHXfgvvvui2sREYkMopCIRByTyYRXX30Vhw4dwuHDh5GVlcXNVNm5c2dEO9tZluX8v2ZnZyGRSJCfn4/CwkJOVHp7ezE3N4empqaIJe/jDTpDpaCgAOvWrQt58e/v78eePXvwhS98Af/1X/8lisgZiigkIlHFYrFwM1X+/ve/Izk5GVdccQX2798f8ZkqLMtyVi0ajQYsyyIpKQkMw2Dr1q1nzE7EYrHgxIkTYYvI4OAg9uzZgxtuuAE/+9nP4rIqSyQ6iEIiEjPsdjveeustbqYKAG6myrnnnhvRJkSGYdDW1oalpSXIZDIX/y8hh0
DFG1REwp3mODIygksvvRT79+/HQw89JIrIGY746QvE//7v/6KsrAwpKSnYsWMHPv7441ifUtyjUChw6aWX4ve//z2mp6fxzDPPICUlBV/60pdQXl6O2267DS+//DKsVqugx2VZFl1dXXA4HNi9ezfOPvtsbN26FWlpaRgcHMSxY8fQ3t6O6elpOBwOQY8dS6xWK1paWpCXlxeWiIyPj+Oyyy7DZZdddsaLCCEEF154oUc349/+9rec0exKR9yRCMAzzzyDm266Cb/73e+wY8cOPPTQQ3j22WfR398fNyWPiQTDMHjvvfe4mSoLCwvc3e9FF10UVh6DYRh0dHTAbrejsbFxWSiNEAKTycR11Zv+f3v3HtPU3cYB/FuQgka5CUURSb0QhwiUKSpsAxSEgJugu7jFCS4kI26SkCUS3VxGNGYXjDMCU6JMzDaNDovATKdyNQpeS7USwUvYQLmKUiwgbU/P+8dezvvyenmRU3oofT4J/xxIefpPvz3n9/s9T28vpk6dOqy+UmPZ06dPcfXqVbi6usLX13fEIdLa2oqYmBiEhYXhwIED4/bO7VU0NzfD398f33//PVJSUgAAjY2N8Pf3x759+7B+/XqBKxx9FCQmsGTJEgQHByM7OxvAP994Z86cidTUVGzZskXg6iyb0WjEpUuXuJkqHR0diI6ORkJCAmJiYl5ppgrDMFCpVGAYBkFBQcNaj/nfvlIuLi7cDjB7e3s+b81sBu9EXFxceIVIe3s7YmNjERwcjPz8fAqR/3L48GFs2rQJN27cgFQqRWRkJJydnSGXy4UuzSwoSHjS6XSYNGkSCgoKhrRZSEpKQnd3N4qKioQrbpwxGo1QKpVc+/vm5mZERUUhISEBcXFxLz0vYjAYUFtbC5FIBJlMNqJ29f39/VyoaDQaODk5caEy3HG15jYwMICrV6/C2dkZ8+fPH3GIdHZ2YuXKlfDz88Nvv/02Ztv9CykhIQEajQZr1qzBjh07UFdXZzWdOyhIeGppacGMGTNQXV2NkJAQ7np6ejqqqqpw6dIlAasbvwbXOQZD5c6dO1i+fDni4+OfmanS19cHtVoNOzs7BAYGmuSb9MDAABcqjx8/xpQpU4aMqx0LBgYGcO3aNTg6OsLPz2/EIfLo0SPExcVhzpw5OH78+Jhp7z/WdHR0wM/PD48ePcKJEyesqn+X9a6SEYtmY2ODgIAAbN++HWq1GiqVCqGhocjNzcXs2bMRHx+PvLw8qNVqhISEQKlUQiaTmexxjL29PWbOnImFCxciLCwMXl5e6O7uRk1NDWpqanDv3j1otVoI9T1Np9OZJES6u7sRHx8Pb29vHDt2jELkJSQSCVJSUrjec9aEgoSnwa2i7e3tQ663t7dj2rRpAlVlXUQiEXx9ffH1119DqVTi1q1biIqKws8//4ylS5eiq6sLDMOgra1tVD7YxWIxZsyYgaCgIISHh0MqlUKr1eLSpUuorq7GnTt30NPTY7ZQ0el0uHr1KqZMmcIrRHp6erBmzRq4ubmhoKDAYjcamNOECROs8rEfBQlPYrEYCxcuRFlZGXfNaDSirKxsyKMuYh4ikQhz587Fxx9/jL6+PqxevRpbtmxBcXExXnvtNaxYsQJZWVloamoalQ92Ozs7TJ8+HYGBgYiIiMDcuXO5sxvnz59HQ0MDuru7Ry1UBu9EJk+ezCtEtFot3nvvPUyaNAknT5402wwYYpmsLzpHwRdffIGkpCQsWrQIixcvxp49e9Db24tPPvlE6NKsEsuy+OijjxAeHo79+/fDxsYGmzdvxoMHDyCXyyGXy7Ft2zbIZDKu/f2sWbNM3t7D1tYWHh4e8PDwAMMwXKuW2traIf2/nJ2dTXIWYzBEJk2ahAULFoz4Nfv6+vDBBx/A1tYWxcXFY3YjARk7aLHdRLKzs5GZmYm2tjbIZDLs3bsXS5YsEbosq9XS0oLp06c/NxxYlkV7ezsKCwshl8tRWVkJPz8/rlMxn7Yhw2E0GvH48W
O0t7ejs7MTLMtyoeLq6jqiANDr9bh27RomTpwIf3//EYfI06dPsXbtWq4/mrW00jeVjIwMnDx5EiqVSuhSzIqChFg1lmXR1dXFzVQpKyuDj48P16nY19d3VE9usyyL7u5u7gAkwzBwd3eHRCLB1KlTh7U5YDBEHBwcEBAQMOJ6BwYGsG7dOjx8+BBnzpyxmqFehD8KEkL+bXCmSnFxMTdTxdvbmwsVPh/Sw/3/PT09XKjodDq4ublBIpHAzc3tuYu4er0eSqUSYrGY18wXnU6HxMRENDc3o6ysDK6urnzfDrEiFCSEvEBPT8+QmSoSiYQLlYULF456qGi1Wi5U+vv7h7RqsbOzg8FggFKp5M7HjLQevV6P5ORkNDQ0oLy8fMwcosvJyeEeFwcGBiIrKwuLFy8WuizyHBQkhAxDb28vFAoF5HI5Tp06BScnJ26mypIlS0a9XYhWq+UOQGq1Wri4uKC/vx8ODg4ICgoa8f83GAxISUnB9evXUVFRAQ8PDxNXPjLUv86yUJAQ8or6+/tx5swZyOVylJSUwMHBYchMldE+R/DkyROoVCoYDAYwDANnZ2dusf5VtukyDINNmzahpqYGlZWV8PT0HMWqXw31r7MsFCRW6ty5c8jMzMS1a9fQ2tqKwsJCqzuNawo6nQ6lpaWQy+UoKiqCSCTCypUrsXr1aoSFhZn8EN9gzzAbGxvIZDLo9XpuUJdGo4GjoyM3q/5l23aNRiPS0tJQUVGBiooKeHt7m7ROPqh/neWhA4lWqre3F4GBgcjJyRG6FIsmFosRFxeHgwcPoqWlBUePHoW9vT1SUlIwe/ZspKSkQKFQmGSmymD34sEQsbW1hYODA7y9vREcHIy33noLnp6e6OrqwoULF3Dx4kU0Njait7d3yOsYjUakp6fj7NmzKC0tHVMhAgAPHz4EwzDPPGbz8PBAW1ubQFWRl6EDiVYqNjYWsbGxQpcxrtjZ2SEyMhKRkZHIycnB+fPnUVBQgLS0NPT09CA2NhYJCQmIiop65ZkqgxMdAbywZ5i9vT28vLzg5eUFvV6Pzs5OtLe34969exCJRFAoFHj33XdRVFSE4uJiVFRUYNasWSZ578S60R0JIaPA1tYW4eHhyMrKwt9//w2FQgFPT098+eWXmDVrFtavX48TJ05Aq9X+39cavBNhWXbYC+t2dnbw9PREUFAQIiIi4Orqivr6esTExOCnn35CbGzsqLZq4YP611keChJCRpmNjQ1CQ0Oxe/du3L17F+Xl5fDx8cGOHTsglUrx4Ycf4ujRo9BoNM98sDMMg+vXr8NoNI54d9aECRMgk8kQGhoKFxcX7Nq1C/39/YiKioJUKkVdXZ2p3qpJUP86y0OL7QQikYgW2wVgNBpx48YNnDhxAnK5HHfv3kVkZCRWrVqFt99+G2KxGJ9++inWrVuHmJiYEe8GY1kWu3fvxp49e1BeXo7AwEAA/5xkLysrw7Jly8ZcP61jx44hKSkJubm5XP+648ePo76+fsxsUSb/QUFCKEjGAJZlcevWLW5QV11dHSZPnoyJEydCoVBg7ty5I+r/xbIssrKy8MMPP+DMmTNYtGjRKFQ/Oqh/neWgICEUJGPMwMAA4uLi0NDQAHd3d9y8eROhoaFISEjAqlWrMG3atGGFCsuyyM3Nxfbt2/Hnn39i6dKlZqieWCNaI7FSWq0WKpWK61La2NgIlUqFpqYmYQuzcgzDYO3atdBoNLh58yaUSiVu376Nd955BwUFBZg3bx6io6ORnZ2N5ubmFy6WsyyLQ4cOISMjAyUlJRQiZFTRHYmVqqysxLJly565npSUhPz8fPMXRDh5eXlYvXr1M40TWZbF/fv3uZkqFy5cwOuvv87NVJFKpRCJRGBZFr/88gs2b96MkpISRERECPNGiNWgICHEArEsi7a2Nm6mSlVVFRYsWID4+HjY29tj586dkMvliI6OFrpUYgUoSAixcP89U+XIkSMoLy/Hr7/+inXr1gldGrESFC
SEjCMsy+LBgwfw8vISuhRiRShICCGE8EK7tgghhPBCQUIIIYQXChJiMb799lsEBwdjypQpkEgkSEhIQENDg9BlEWL1KEiIxaiqqsLnn3+Oixcv4uzZs9Dr9YiOjn5m3gYhxLxosZ1YrM7OTkgkElRVVSEsLEzocgixWnRHQiyWRqMBgGdOgBNCzIuCxMIxDIPQ0FCsWbNmyHWNRoOZM2fiq6++Eqiy0TU4c/yNN97AggULhC6HEKtGj7bGgdu3b0Mmk+HAgQPcaebExERcv34dV65cgVgsFrhC09u4cSMUCgXOnz9Ph+8IERgFyTixd+9eZGRkoK6uDpcvX8b777+PK1eucEOMxpNNmzahqKgI586do5njhIwBFCTjBMuyWL58OWxtbaFWq5Gamopt27YJXZZJsSyL1NRUFBYWorKyEj4+PkKXRADs3LkTp06dgkqlglgsRnd3t9AlETOjIBlH6uvr4evrC39/fyiVyhGPZh2rPvvsMxw5cgRFRUWYN28ed93JyWnMjYq1Jt988w2cnZ1x//595OXlUZBYIQqScSQ9PR05OTmwsbGBWq2GVCoVuiSTetFUwEOHDmHDhg3mLYY8Iz8/H2lpaRQkVoh2bY0T1dXV+PHHH/HHH39g8eLFSE5OfuH0PEvFsuxzfyhECBEWBck40NfXhw0bNmDjxo1YtmwZ8vLycPnyZezfv1/o0gghVoCCZBzYunUrWJbFd999BwCQSqXYtWsX0tPT8ddffwlbHLFIW7ZsgUgkeulPfX290GWSMYLWSCxcVVUVIiMjUVlZiTfffHPI72JiYmAwGFBaWvrC9QVCnqezsxNdXV0v/ZvZs2cPOaNEayTWa3xt67FC4eHhMBgMz/3d6dOnzVwNGS/c3d3h7u4udBnEQtCjLULMYN++fQgICICjoyMcHR0REhIChUIhdFkm0dTUBJVKhaamJjAMA5VKBZVKBa1WK3RpxEzo0RYhZlBSUgJbW1v4+PiAZVkcPnwYmZmZqK2thZ+fn9Dl8bJhwwYcPnz4mesVFRWIiIgwf0HE7ChICBGIq6srMjMzkZycLHQphPBCaySEmBnDMPj999/R29uLkJAQocshhDcKEkLMRK1WIyQkBE+fPsXkyZNRWFiI+fPnC10WIbzRoy1CzESn06GpqQkajQYFBQU4ePAgqqqqKEyIxaMgIUQgUVFRmDNnDnJzc4UuhRBeaPsvIQIxGo0YGBgQugxCeKM1EkLMYOvWrYiNjYW3tzeePHmCI0eOoLKykg6NknGBgoQQM+jo6EBiYiJaW1vh5OSEgIAAnD59GitWrBC6NEJ4ozUSQgghvNAaCSGEEF4oSAghhPBCQUIIIYQXChJCCCG8UJAQQgjhhYKEEEIILxQkhBBCeKEgIYQQwgsFCSGEEF4oSAghhPBCQUIIIYQXChJCCCG8/Athi9lwV9FosgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "# import torch\n", + "# import os\n", + "# import openai\n", + "# import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(projection='3d')\n", + "ww = [generate_coordinates(x / 10) for x in range(11)]\n", + "data4 = np.stack(ww)\n", + "print(data4)\n", + "print(data4.shape)\n", + "ax.scatter(data4[:,0], data4[:,2], data4[:,1])\n", + "ax.set_title('Visualization of Object Trajectory')\n", + "ax.set_xlabel('X')\n", + "ax.set_ylabel('Y')\n", + "ax.set_zlabel('Z')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "4DGen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/guidance/imagedream_utils.py b/guidance/imagedream_utils.py new file mode 100644 index 0000000..f8465f0 --- /dev/null +++ b/guidance/imagedream_utils.py @@ -0,0 +1,334 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.transforms.functional as TF + +from imagedream.camera_utils import get_camera, convert_opengl_to_blender, normalize_camera +from imagedream.model_zoo import build_model +from imagedream.ldm.models.diffusion.ddim import DDIMSampler + +from diffusers import DDIMScheduler + +class ImageDream(nn.Module): + def __init__( + self, + device, + model_name='sd-v2.1-base-4view-ipmv', + ckpt_path=None, + t_range=[0.02, 0.98], + ): + super().__init__() + + self.device = device + self.model_name = model_name + 
self.ckpt_path = ckpt_path + + self.model = build_model(self.model_name, ckpt_path=self.ckpt_path).eval().to(self.device) + self.model.device = device + for p in self.model.parameters(): + p.requires_grad_(False) + + self.dtype = torch.float32 + + self.num_train_timesteps = 1000 + self.min_step = int(self.num_train_timesteps * t_range[0]) + self.max_step = int(self.num_train_timesteps * t_range[1]) + + self.image_embeddings = {} + self.embeddings = {} + + self.scheduler = DDIMScheduler.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="scheduler", torch_dtype=self.dtype + ) + + @torch.no_grad() + def get_image_text_embeds(self, image, prompts, negative_prompts): + + image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False) + image_pil = TF.to_pil_image(image[0]) + image_embeddings = {} + ww = self.model.get_learned_image_conditioning(image_pil).repeat(5,1,1) + image_embeddings['pos'] = ww # [5, 257, 1280] + image_embeddings['neg'] = torch.zeros_like(ww) + + image_embeddings['ip_img'] = self.encode_imgs(image) + image_embeddings['neg_ip_img'] = torch.zeros_like(image_embeddings['ip_img']) + + pos_embeds = self.encode_text(prompts).repeat(5,1,1) + neg_embeds = self.encode_text(negative_prompts).repeat(5,1,1) + embeddings = {} + embeddings['pos'] = pos_embeds + embeddings['neg'] = neg_embeds + return image_embeddings, embeddings + + @torch.no_grad() + def prepare_embeds(self, image_li, prompts, negative_prompts): + return [self.get_image_text_embeds(image_li[idx:idx + 1], prompts, negative_prompts) for idx in range(len(image_li))] + # return [self.get_image_text_embeds(x, prompts, negative_prompts) for x in image_li] + + def encode_text(self, prompt): + # prompt: [str] + embeddings = self.model.get_learned_conditioning(prompt).to(self.device) + return embeddings + + @torch.no_grad() + def refine(self, pred_rgb, camera, + guidance_scale=5, steps=50, strength=0.8, + ): + + batch_size = pred_rgb.shape[0] + real_batch_size = 
batch_size // 4 + pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False) + latents = self.encode_imgs(pred_rgb_256.to(self.dtype)) + + self.scheduler.set_timesteps(steps) + init_step = int(steps * strength) + latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step]) + + camera = camera[:, [0, 2, 1, 3]] # to blender convention (flip y & z axis) + camera[:, 1] *= -1 + camera = normalize_camera(camera).view(batch_size, 16) + + # extra view + camera = camera.view(real_batch_size, 4, 16) + camera = torch.cat([camera, torch.zeros_like(camera[:, :1])], dim=1) # [rB, 5, 16] + camera = camera.view(real_batch_size * 5, 16) + + camera = camera.repeat(2, 1) + embeddings = torch.cat([self.embeddings['neg'].repeat(real_batch_size, 1, 1), self.embeddings['pos'].repeat(real_batch_size, 1, 1)], dim=0) + image_embeddings = torch.cat([self.image_embeddings['neg'].repeat(real_batch_size, 1, 1), self.image_embeddings['pos'].repeat(real_batch_size, 1, 1)], dim=0) + ip_img_embeddings= torch.cat([self.image_embeddings['neg_ip_img'].repeat(real_batch_size, 1, 1, 1), self.image_embeddings['ip_img'].repeat(real_batch_size, 1, 1, 1)], dim=0) + + context = { + "context": embeddings, + "ip": image_embeddings, + "ip_img": ip_img_embeddings, + "camera": camera, + "num_frames": 4 + 1 + } + + for i, t in enumerate(self.scheduler.timesteps[init_step:]): + + # extra view + + latents = latents.view(real_batch_size, 4, 4, 32, 32) + latents = torch.cat([latents, torch.zeros_like(latents[:, :1])], dim=1).view(-1, 4, 32, 32) + latent_model_input = torch.cat([latents] * 2) + + tt = torch.cat([t.unsqueeze(0).repeat(real_batch_size * 5)] * 2).to(self.device) + + noise_pred = self.model.apply_model(latent_model_input, tt, context) + + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + + # remove extra view + noise_pred_uncond = noise_pred_uncond.reshape(real_batch_size, 5, 4, 32, 32)[:, :-1].reshape(-1, 4, 32, 32) + 
noise_pred_cond = noise_pred_cond.reshape(real_batch_size, 5, 4, 32, 32)[:, :-1].reshape(-1, 4, 32, 32) + latents = latents.reshape(real_batch_size, 5, 4, 32, 32)[:, :-1].reshape(-1, 4, 32, 32) + + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + return imgs + + def train_step( + self, + cur_embedding, + pred_rgb, # [B, C, H, W] + camera, # [B, 4, 4] + step_ratio=None, + guidance_scale=5, + as_latent=False, + ): + image_embeddings_cur, embeddings_cur = cur_embedding + batch_size = pred_rgb.shape[0] + real_batch_size = batch_size // 4 + pred_rgb = pred_rgb.to(self.dtype) + + if as_latent: + latents = F.interpolate(pred_rgb, (32, 32), mode="bilinear", align_corners=False) * 2 - 1 + else: + # interp to 256x256 to be fed into vae. + pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode="bilinear", align_corners=False) + # encode image into latents with vae, requires grad! 
+ latents = self.encode_imgs(pred_rgb_256) + + if step_ratio is not None: + # dreamtime-like + # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio) + t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step) + t = torch.full((batch_size,), t, dtype=torch.long, device=self.device) + else: + t = torch.randint(self.min_step, self.max_step + 1, (real_batch_size,), dtype=torch.long, device=self.device).repeat(4) + + camera = camera[:, [0, 2, 1, 3]] # to blender convention (flip y & z axis) + camera[:, 1] *= -1 + camera = normalize_camera(camera).view(batch_size, 16) + + # extra view + camera = camera.view(real_batch_size, 4, 16) + camera = torch.cat([camera, torch.zeros_like(camera[:, :1])], dim=1) # [rB, 5, 16] + camera = camera.view(real_batch_size * 5, 16) + + camera = camera.repeat(2, 1) + embeddings = torch.cat([embeddings_cur['neg'].repeat(real_batch_size, 1, 1), embeddings_cur['pos'].repeat(real_batch_size, 1, 1)], dim=0) + image_embeddings = torch.cat([image_embeddings_cur['neg'].repeat(real_batch_size, 1, 1), image_embeddings_cur['pos'].repeat(real_batch_size, 1, 1)], dim=0) + ip_img_embeddings= torch.cat([image_embeddings_cur['neg_ip_img'].repeat(real_batch_size, 1, 1, 1), image_embeddings_cur['ip_img'].repeat(real_batch_size, 1, 1, 1)], dim=0) + + context = { + "context": embeddings, + "ip": image_embeddings, + "ip_img": ip_img_embeddings, + "camera": camera, + "num_frames": 4 + 1 + } + + # predict the noise residual with unet, NO grad! 
+ with torch.no_grad(): + # add noise + noise = torch.randn_like(latents) + latents_noisy = self.model.q_sample(latents, t, noise) # [B=4, 4, 32, 32] + # extra view + t = t.view(real_batch_size, 4) + t = torch.cat([t, t[:, :1]], dim=1).view(-1) + latents_noisy = latents_noisy.view(real_batch_size, 4, 4, 32, 32) + latents_noisy = torch.cat([latents_noisy, torch.zeros_like(latents_noisy[:, :1])], dim=1).view(-1, 4, 32, 32) + # pred noise + latent_model_input = torch.cat([latents_noisy] * 2) + tt = torch.cat([t] * 2) + + # import kiui + # kiui.lo(latent_model_input, t, context['context'], context['camera']) + + noise_pred = self.model.apply_model(latent_model_input, tt, context) + + # perform guidance (high scale from paper!) + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + + # remove extra view + noise_pred_uncond = noise_pred_uncond.reshape(real_batch_size, 5, 4, 32, 32)[:, :-1].reshape(-1, 4, 32, 32) + noise_pred_cond = noise_pred_cond.reshape(real_batch_size, 5, 4, 32, 32)[:, :-1].reshape(-1, 4, 32, 32) + + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + grad = (noise_pred - noise) + grad = torch.nan_to_num(grad) + + target = (latents - grad).detach() + loss = F.mse_loss(latents.float(), target, reduction='sum') / latents.shape[0] + # loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') / latents.shape[0] + + return loss + + def decode_latents(self, latents): + imgs = self.model.decode_first_stage(latents) + imgs = ((imgs + 1) / 2).clamp(0, 1) + return imgs + + def encode_imgs(self, imgs): + # imgs: [B, 3, 256, 256] + imgs = 2 * imgs - 1 + latents = self.model.get_first_stage_encoding(self.model.encode_first_stage(imgs)) + return latents # [B, 4, 32, 32] + + @torch.no_grad() + def prompt_to_img( + self, + image, + prompts, + negative_prompts="", + height=256, + width=256, + num_inference_steps=50, + guidance_scale=5.0, + latents=None, + elevation=0, + azimuth_start=0, + ): + if 
isinstance(prompts, str): + prompts = [prompts] + + if isinstance(negative_prompts, str): + negative_prompts = [negative_prompts] + + real_batch_size = len(prompts) + batch_size = len(prompts) * 5 + + # Text embeds -> img latents + sampler = DDIMSampler(self.model) + shape = [4, height // 8, width // 8] + + c_ = {"context": self.encode_text(prompts).repeat(5,1,1)} + uc_ = {"context": self.encode_text(negative_prompts).repeat(5,1,1)} + + # image embeddings + image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False) + image_pil = TF.to_pil_image(image[0]) + image_embeddings = self.model.get_learned_image_conditioning(image_pil).repeat(5,1,1).to(self.device) + c_["ip"] = image_embeddings + uc_["ip"] = torch.zeros_like(image_embeddings) + + ip_img = self.encode_imgs(image) + c_["ip_img"] = ip_img + uc_["ip_img"] = torch.zeros_like(ip_img) + + camera = get_camera(4, elevation=elevation, azimuth_start=azimuth_start, extra_view=True) + camera = camera.repeat(real_batch_size, 1).to(self.device) + + c_["camera"] = uc_["camera"] = camera + c_["num_frames"] = uc_["num_frames"] = 5 + + kiui.lo(image_embeddings, ip_img, camera) + + latents, _ = sampler.sample(S=num_inference_steps, conditioning=c_, + batch_size=batch_size, shape=shape, + verbose=False, + unconditional_guidance_scale=guidance_scale, + unconditional_conditioning=uc_, + eta=0, x_T=None) + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [4, 3, 256, 256] + + kiui.lo(latents, imgs) + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + +if __name__ == "__main__": + import argparse + import matplotlib.pyplot as plt + import kiui + + parser = argparse.ArgumentParser() + parser.add_argument("image", type=str) + parser.add_argument("prompt", type=str) + parser.add_argument("--negative", default="", type=str) + parser.add_argument("--steps", type=int, default=30) + opt = parser.parse_args() + + 
class MVDream(nn.Module):
    """Score-distillation guidance backed by the multi-view MVDream model.

    Every context built here uses ``num_frames=4``: the batch is treated as
    groups of four consecutive views, so image batches are expected to be a
    multiple of 4 (as the original ``train_step`` comment states).
    """

    # Negative prompt used when ``get_text_embeds`` is called without one.
    DEFAULT_NEGATIVE_PROMPT = "ugly, bad anatomy, blurry, pixelated obscure, unnatural colors, poor lighting, dull, and unclear, cropped, lowres, low quality, artifacts, duplicate, morbid, mutilated, poorly drawn face, deformed, dehydrated, bad proportions"

    def __init__(
        self,
        device,
        model_name='sd-v2.1-base-4view',
        ckpt_path=None,
        t_range=(0.02, 0.98),
    ):
        """Build the frozen diffusion model and the DDIM scheduler.

        Args:
            device: torch device the model is moved to.
            model_name: model identifier forwarded to ``build_model``.
            ckpt_path: optional checkpoint path forwarded to ``build_model``.
            t_range: (min, max) fraction of the 1000 training timesteps that
                ``train_step`` may sample from.
        """
        super().__init__()

        self.device = device
        self.model_name = model_name
        self.ckpt_path = ckpt_path

        self.model = build_model(self.model_name, ckpt_path=self.ckpt_path).eval().to(self.device)
        self.model.device = device
        # The diffusion model is used for guidance only; never train it.
        for p in self.model.parameters():
            p.requires_grad_(False)

        self.dtype = torch.float32

        self.num_train_timesteps = 1000
        self.min_step = int(self.num_train_timesteps * t_range[0])
        self.max_step = int(self.num_train_timesteps * t_range[1])

        self.embeddings = {}

        self.scheduler = DDIMScheduler.from_pretrained(
            "stabilityai/stable-diffusion-2-1-base", subfolder="scheduler", torch_dtype=self.dtype
        )

    @torch.no_grad()
    def get_text_embeds(self, prompts, negative_prompts=None):
        """Encode and cache positive/negative text embeddings, tiled for 4 views.

        ``negative_prompts=None`` selects ``DEFAULT_NEGATIVE_PROMPT``.
        Results are stored in ``self.embeddings['pos']`` / ``['neg']``.
        """
        if negative_prompts is None:
            negative_prompts = [self.DEFAULT_NEGATIVE_PROMPT]
        self.embeddings['pos'] = self.encode_text(prompts).repeat(4, 1, 1)
        self.embeddings['neg'] = self.encode_text(negative_prompts).repeat(4, 1, 1)

    def encode_text(self, prompt):
        """Return the model's learned conditioning for ``prompt`` (a list of str)."""
        embeddings = self.model.get_learned_conditioning(prompt).to(self.device)
        return embeddings

    def _prepare_camera(self, camera, batch_size):
        """Convert camera matrices to the flattened form the model consumes.

        Rows are permuted and one is negated ("to blender convention (flip
        y & z axis)" per the original comment), then normalised and
        flattened to [batch_size, 16].
        """
        camera = camera[:, [0, 2, 1, 3]]  # advanced indexing copies, caller's tensor is untouched
        camera[:, 1] *= -1
        return normalize_camera(camera).view(batch_size, 16)

    def _build_context(self, camera, real_batch_size):
        """Assemble the conditioning dict for a [uncond, cond] doubled batch."""
        embeddings = torch.cat(
            [
                self.embeddings['neg'].repeat(real_batch_size, 1, 1),
                self.embeddings['pos'].repeat(real_batch_size, 1, 1),
            ],
            dim=0,
        )
        return {"context": embeddings, "camera": camera, "num_frames": 4}

    def _sample_timesteps(self, batch_size, step_ratio):
        """Pick one diffusion timestep per batch entry.

        With ``step_ratio`` the timestep is annealed deterministically and
        shared by the whole batch. Otherwise one random timestep is drawn per
        object and repeated for its 4 consecutive views, so all views of an
        object are noised at the same level.
        """
        if step_ratio is not None:
            # dreamtime-like annealing
            step = int(np.clip(np.round((1 - step_ratio) * self.num_train_timesteps), self.min_step, self.max_step))
            return torch.full((batch_size,), step, dtype=torch.long, device=self.device)
        per_object = torch.randint(
            self.min_step, self.max_step + 1, (batch_size // 4,), dtype=torch.long, device=self.device
        )
        # repeat_interleave keeps the [object, view] layout; a plain repeat(4)
        # would hand different timesteps to views of the same object.
        return per_object.repeat_interleave(4)

    @torch.no_grad()
    def refine(self, pred_rgb, camera,
               guidance_scale=100, steps=50, strength=0.8,
               ):
        """Noise the rendered views and denoise them with guided DDIM steps.

        Args:
            pred_rgb: rendered views, batch a multiple of 4.
            camera: camera matrices, one per view.
            guidance_scale: classifier-free guidance scale.
            steps: number of scheduler steps.
            strength: fraction of the schedule that is skipped; denoising
                starts at ``timesteps[int(steps * strength)]``.

        Returns:
            Refined images in [0, 1].
        """
        batch_size = pred_rgb.shape[0]
        real_batch_size = batch_size // 4
        pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
        latents = self.encode_imgs(pred_rgb_256.to(self.dtype))

        self.scheduler.set_timesteps(steps)
        # Clamp so strength=1.0 runs the last step instead of indexing out of range.
        init_step = min(int(steps * strength), steps - 1)
        latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step])

        camera = self._prepare_camera(camera, batch_size).repeat(2, 1)
        context = self._build_context(camera, real_batch_size)

        for t in self.scheduler.timesteps[init_step:]:
            latent_model_input = torch.cat([latents] * 2)
            tt = torch.cat([t.unsqueeze(0).repeat(batch_size)] * 2).to(self.device)

            noise_pred = self.model.apply_model(latent_model_input, tt, context)

            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        imgs = self.decode_latents(latents)
        return imgs

    def train_step(
        self,
        pred_rgb,  # [B, C, H, W], B is multiples of 4
        camera,  # [B, 4, 4]
        step_ratio=None,
        guidance_scale=100,
        as_latent=False,
    ):
        """Compute the score-distillation loss for a batch of rendered views.

        Args:
            pred_rgb: rendered views; gradients flow back through it.
            camera: camera matrices, one per view.
            step_ratio: optional training progress in [0, 1]; when given the
                timestep is annealed instead of sampled.
            guidance_scale: classifier-free guidance scale.
            as_latent: treat ``pred_rgb`` as latents (resized to 32x32)
                instead of encoding it.

        Returns:
            Scalar loss whose gradient w.r.t. the latents equals the SDS gradient.
        """
        batch_size = pred_rgb.shape[0]
        real_batch_size = batch_size // 4
        pred_rgb = pred_rgb.to(self.dtype)

        if as_latent:
            latents = F.interpolate(pred_rgb, (32, 32), mode="bilinear", align_corners=False) * 2 - 1
        else:
            # interp to 256x256 to be fed into vae.
            pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode="bilinear", align_corners=False)
            # encode image into latents with vae, requires grad!
            latents = self.encode_imgs(pred_rgb_256)

        t = self._sample_timesteps(batch_size, step_ratio)

        camera = self._prepare_camera(camera, batch_size).repeat(2, 1)
        context = self._build_context(camera, real_batch_size)

        # predict the noise residual with unet, NO grad!
        with torch.no_grad():
            noise = torch.randn_like(latents)
            latents_noisy = self.model.q_sample(latents, t, noise)

            latent_model_input = torch.cat([latents_noisy] * 2)
            tt = torch.cat([t] * 2)
            noise_pred = self.model.apply_model(latent_model_input, tt, context)

            # perform guidance (high scale from paper!)
            noise_pred_uncond, noise_pred_pos = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_pos - noise_pred_uncond)

            grad = torch.nan_to_num(noise_pred - noise)

        # MSE against a detached target reproduces `grad` as d(loss)/d(latents).
        target = (latents - grad).detach()
        loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') / latents.shape[0]

        return loss

    def decode_latents(self, latents):
        """Decode first-stage latents into images clamped to [0, 1]."""
        imgs = self.model.decode_first_stage(latents)
        imgs = ((imgs + 1) / 2).clamp(0, 1)
        return imgs

    def encode_imgs(self, imgs):
        """Encode images in [0, 1] ([B, 3, 256, 256] per the original comment) to latents."""
        imgs = 2 * imgs - 1
        latents = self.model.get_first_stage_encoding(self.model.encode_first_stage(imgs))
        return latents  # [B, 4, 32, 32]

    @torch.no_grad()
    def prompt_to_img(
        self,
        prompts,
        negative_prompts="",
        height=256,
        width=256,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
        elevation=0,
        azimuth_start=0,
    ):
        """Sample four views per prompt with DDIM and return uint8 images.

        ``latents`` is accepted for interface compatibility but not used;
        sampling always starts from fresh noise (``x_T=None``).

        Returns:
            uint8 numpy array in [N, H, W, 3] layout.
        """
        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]

        batch_size = len(prompts) * 4

        # Text embeds -> img latents
        sampler = DDIMSampler(self.model)
        shape = [4, height // 8, width // 8]
        c_ = {"context": self.encode_text(prompts).repeat(4, 1, 1)}
        uc_ = {"context": self.encode_text(negative_prompts).repeat(4, 1, 1)}

        camera = get_camera(4, elevation=elevation, azimuth_start=azimuth_start)
        camera = camera.repeat(batch_size // 4, 1).to(self.device)

        c_["camera"] = uc_["camera"] = camera
        c_["num_frames"] = uc_["num_frames"] = 4

        latents, _ = sampler.sample(
            S=num_inference_steps,
            conditioning=c_,
            batch_size=batch_size,
            shape=shape,
            verbose=False,
            unconditional_guidance_scale=guidance_scale,
            unconditional_conditioning=uc_,
            eta=0,
            x_T=None,
        )

        # Img latents -> imgs
        imgs = self.decode_latents(latents)

        # Img to Numpy
        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        imgs = (imgs * 255).round().astype("uint8")

        return imgs
class StableDiffusion(nn.Module):
    """Stable Diffusion wrapper for score-distillation guidance and sampling.

    ``self.embeddings`` holds the classifier-free-guidance text embeddings as
    ``[negative, positive]`` stacked on dim 0; ``self.object_embeddings`` holds
    one such pair per object prompt.
    """

    def __init__(
        self,
        device,
        fp16=True,
        vram_O=False,
        sd_version="2.1",
        hf_key=None,
        t_range=(0.02, 0.98),
    ):
        """Load the pipeline components and the DDIM scheduler.

        Args:
            device: torch device for the model and cached tensors.
            fp16: load weights as float16 instead of float32.
            vram_O: enable the low-VRAM pipeline options instead of moving
                the whole pipeline to ``device``.
            sd_version: one of "2.1", "2.0", "1.5" (ignored if ``hf_key`` is set).
            hf_key: explicit Hugging Face model key overriding ``sd_version``.
            t_range: (min, max) fraction of the training timesteps that
                ``train_step`` may sample from.

        Raises:
            ValueError: if ``sd_version`` is unknown and ``hf_key`` is None.
        """
        super().__init__()

        self.device = device
        self.sd_version = sd_version

        model_keys = {
            "2.1": "stabilityai/stable-diffusion-2-1-base",
            "2.0": "stabilityai/stable-diffusion-2-base",
            "1.5": "runwayml/stable-diffusion-v1-5",
        }
        if hf_key is not None:
            print(f"[INFO] using hugging face custom model key: {hf_key}")
            model_key = hf_key
        elif self.sd_version in model_keys:
            model_key = model_keys[self.sd_version]
        else:
            raise ValueError(
                f"Stable-diffusion version {self.sd_version} not supported."
            )

        self.dtype = torch.float16 if fp16 else torch.float32

        # Create model
        pipe = StableDiffusionPipeline.from_pretrained(
            model_key, torch_dtype=self.dtype
        )

        if vram_O:
            pipe.enable_sequential_cpu_offload()
            pipe.enable_vae_slicing()
            pipe.unet.to(memory_format=torch.channels_last)
            pipe.enable_attention_slicing(1)
        else:
            pipe.to(device)

        self.vae = pipe.vae
        self.tokenizer = pipe.tokenizer
        self.text_encoder = pipe.text_encoder
        self.unet = pipe.unet

        self.scheduler = DDIMScheduler.from_pretrained(
            model_key, subfolder="scheduler", torch_dtype=self.dtype
        )

        # Only the individual components are kept.
        del pipe

        self.num_train_timesteps = self.scheduler.config.num_train_timesteps
        self.min_step = int(self.num_train_timesteps * t_range[0])
        self.max_step = int(self.num_train_timesteps * t_range[1])
        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience

        self.embeddings = None
        self.object_embeddings = []

    @torch.no_grad()
    def get_text_embeds(self, prompts, negative_prompts):
        """Encode prompts and cache ``[negative, positive]`` in ``self.embeddings``."""
        pos_embeds = self.encode_text(prompts)
        neg_embeds = self.encode_text(negative_prompts)
        self.embeddings = torch.cat([neg_embeds, pos_embeds], dim=0)

    @torch.no_grad()
    def get_objects_text_embeds(self, object_prompts, negative_prompts):
        """Append one ``[negative, positive]`` pair per object prompt.

        Pairs are appended to ``self.object_embeddings`` in prompt order;
        ``train_step(obj_id=i)`` indexes into that list.
        """
        # The negative embedding is the same for every object: encode it once.
        neg_embeds = self.encode_text(negative_prompts)
        for prompt in object_prompts:
            pos_embeds = self.encode_text([prompt])
            self.object_embeddings.append(torch.cat([neg_embeds, pos_embeds], dim=0))

    def encode_text(self, prompt):
        """Tokenize ``prompt`` (str or list of str) and return text-encoder states."""
        inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        )
        embeddings = self.text_encoder(inputs.input_ids.to(self.device))[0]
        return embeddings

    @torch.no_grad()
    def refine(self, pred_rgb,
               guidance_scale=100, steps=50, strength=0.8,
               ):
        """Noise ``pred_rgb`` and denoise it with guided DDIM steps.

        Denoising starts at ``timesteps[int(steps * strength)]``. Uses the
        embeddings cached by ``get_text_embeds``.

        Returns:
            Refined images in [0, 1].
        """
        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
        latents = self.encode_imgs(pred_rgb_512.to(self.dtype))

        self.scheduler.set_timesteps(steps)
        # Clamp so strength=1.0 runs the last step instead of indexing out of range.
        init_step = min(int(steps * strength), steps - 1)
        latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step])

        for t in self.scheduler.timesteps[init_step:]:
            latent_model_input = torch.cat([latents] * 2)

            noise_pred = self.unet(
                latent_model_input, t, encoder_hidden_states=self.embeddings,
            ).sample

            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        imgs = self.decode_latents(latents)
        return imgs

    def train_step(
        self,
        pred_rgb,
        step_ratio=None,
        guidance_scale=100,
        as_latent=False,
        background=None,
        obj_id=None,
    ):
        """Compute the score-distillation loss for rendered images.

        Args:
            pred_rgb: rendered images; gradients flow back through it.
            step_ratio: optional training progress in [0, 1]; when given the
                timestep is annealed instead of sampled.
            guidance_scale: classifier-free guidance scale.
            as_latent: treat ``pred_rgb`` as latents (resized to 64x64)
                instead of encoding it with the VAE.
            background: optional tensor whose first element is the fill
                value used when padding non-square images; 0 is used if None.
            obj_id: index into ``self.object_embeddings``; None selects
                ``self.embeddings``.

        Returns:
            Scalar loss whose gradient w.r.t. the latents equals the SDS gradient.
        """
        batch_size = pred_rgb.shape[0]
        pred_rgb = pred_rgb.to(self.dtype)

        if as_latent:
            latents = F.interpolate(pred_rgb, (64, 64), mode="bilinear", align_corners=False) * 2 - 1
        else:
            # Pad the shorter side (instead of resizing) before VAE encoding.
            height, width = pred_rgb.shape[-2], pred_rgb.shape[-1]
            if height != width:
                pad = abs((height - width) // 2)
                # F.pad takes (left, right[, top, bottom]) for the last dims.
                padding = (pad, pad) if height > width else (0, 0, pad, pad)
                fill = background[0].item() if background is not None else 0.0
                pred_rgb = F.pad(pred_rgb, padding, "constant", fill)
            # encode image into latents with vae, requires grad!
            latents = self.encode_imgs(pred_rgb)

        if step_ratio is not None:
            # dreamtime-like annealing
            step = int(np.clip(np.round((1 - step_ratio) * self.num_train_timesteps), self.min_step, self.max_step))
            t = torch.full((batch_size,), step, dtype=torch.long, device=self.device)
        else:
            t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)

        # w(t), sigma_t^2
        w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)

        text_pair = self.embeddings if obj_id is None else self.object_embeddings[obj_id]
        # The latent batch is [uncond x B, cond x B]; repeat_interleave yields
        # [neg x B, pos x B] so each half meets its matching embedding.
        text_embeddings = text_pair.repeat_interleave(batch_size, dim=0)

        # predict the noise residual with unet, NO grad!
        with torch.no_grad():
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler.add_noise(latents, noise, t)

            latent_model_input = torch.cat([latents_noisy] * 2)
            tt = torch.cat([t] * 2)

            noise_pred = self.unet(
                latent_model_input, tt, encoder_hidden_states=text_embeddings
            ).sample

            # perform guidance (high scale from paper!)
            noise_pred_uncond, noise_pred_pos = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_pos - noise_pred_uncond
            )

            grad = torch.nan_to_num(w * (noise_pred - noise))

        # MSE against a detached target reproduces `grad` as d(loss)/d(latents).
        target = (latents - grad).detach()
        loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') / latents.shape[0]

        return loss

    @torch.no_grad()
    def produce_latents(
        self,
        height=512,
        width=512,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
    ):
        """Run guided DDIM sampling with the cached ``self.embeddings``.

        Args:
            height, width: image size in pixels (latents are 1/8 of this).
            num_inference_steps: number of scheduler steps.
            guidance_scale: classifier-free guidance scale.
            latents: optional starting noise; sampled if None.

        Returns:
            Denoised latents.
        """
        if latents is None:
            latents = torch.randn(
                (
                    self.embeddings.shape[0] // 2,
                    self.unet.config.in_channels,
                    height // 8,
                    width // 8,
                ),
                device=self.device,
                dtype=self.dtype,  # match the UNet weights (fp16 or fp32)
            )

        self.scheduler.set_timesteps(num_inference_steps)

        for t in self.scheduler.timesteps:
            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
            latent_model_input = torch.cat([latents] * 2)
            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input, t, encoder_hidden_states=self.embeddings
            ).sample

            # perform guidance
            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_cond - noise_pred_uncond
            )

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        return latents

    def decode_latents(self, latents):
        """Decode scaled VAE latents into images clamped to [0, 1]."""
        latents = 1 / self.vae.config.scaling_factor * latents

        imgs = self.vae.decode(latents).sample
        imgs = (imgs / 2 + 0.5).clamp(0, 1)

        return imgs

    def encode_imgs(self, imgs):
        """Encode images in [0, 1] ([B, 3, H, W]) by sampling the VAE posterior."""
        imgs = 2 * imgs - 1

        posterior = self.vae.encode(imgs).latent_dist
        latents = posterior.sample() * self.vae.config.scaling_factor

        return latents

    def _latents_to_uint8(self, latents):
        """Decode latents and convert to a uint8 numpy array in [N, H, W, 3]."""
        imgs = self.decode_latents(latents)
        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        return (imgs * 255).round().astype("uint8")

    @torch.no_grad()
    def prompt_to_img(
        self,
        prompts,
        negative_prompts="",
        height=512,
        width=512,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
    ):
        """Generate images from text prompts.

        Overwrites ``self.embeddings`` with the encoded prompts.

        Returns:
            uint8 numpy array in [N, H, W, 3] layout.
        """
        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]

        # Prompts -> text embeds
        self.get_text_embeds(prompts, negative_prompts)

        # Text embeds -> img latents
        latents = self.produce_latents(
            height=height,
            width=width,
            latents=latents,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        )

        return self._latents_to_uint8(latents)

    @torch.no_grad()
    def generate_img(
        self,
        emb,
        height=512,
        width=512,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
    ):
        """Generate images from a precomputed positive text embedding.

        ``emb`` is a single unbatched embedding; it is paired with the
        embedding of the empty prompt and stored in ``self.embeddings``.

        Returns:
            uint8 numpy array in [N, H, W, 3] layout.
        """
        neg_prompt = self.encode_text([""])
        self.embeddings = torch.cat([neg_prompt, emb.unsqueeze(0)], dim=0)

        # Text embeds -> img latents
        latents = self.produce_latents(
            height=height,
            width=width,
            latents=latents,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        )

        return self._latents_to_uint8(latents)
def window_score(x, gamma: float = 0.6) -> torch.Tensor:
    """Cosine weighting of token offsets ``x``, scaled by ``gamma``."""
    return torch.cos(gamma * x)


# Collect similar info from attentive features for neglected concept
def sim_correction(embeddings: torch.Tensor,
                   correction_indices: List[int],
                   scores: torch.Tensor,
                   window: bool = True) -> torch.Tensor:
    """Blend selected token embeddings with similar features of later tokens.

    For every token index ``tk`` in ``correction_indices`` an element-wise
    similarity to all tokens is computed, thresholded, restricted to
    positions ``>= tk`` and optionally damped by ``window_score``. The
    similarity-weighted sum is mixed into the token with weight
    ``scores[i]``, and the token is rescaled to its original norm.

    Args:
        embeddings: token embeddings of shape (num_tokens, dim), e.g. (77, 768).
        correction_indices: token positions to correct, processed in order.
        scores: mixing weight per entry of ``correction_indices``.
        window: apply the cosine position window to the similarities.

    Returns:
        A corrected copy of ``embeddings``; the input tensor is left untouched.
    """
    # Work on a copy so callers can still compare against the original.
    embeddings = embeddings.clone()
    ntk = embeddings.shape[0]
    device = embeddings.device

    for i, tk in enumerate(correction_indices):
        alpha = scores[i]
        v = embeddings[tk].clone()

        # Element-wise similarity; values below 0.5 (including all negatives) are dropped.
        sim = torch.relu(v.unsqueeze(0) * embeddings)
        sim[torch.lt(sim, 0.5)] = 0.
        sim[:tk] = 0.  # only the token itself and its successors contribute
        sim /= sim.max().clamp_min(1e-6)

        if window:
            ws = window_score(torch.arange(0, ntk - tk).to(device), gamma=0.8)
            sim[tk:] = ws.unsqueeze(-1) * sim[tk:]

        successor = torch.sum(sim * embeddings, dim=0)
        embeddings[tk] = (1 - alpha) * embeddings[tk] + alpha * successor
        # Keep the corrected token at its original magnitude.
        embeddings[tk] *= v.norm() / embeddings[tk].norm()

    return embeddings
class Zero123(nn.Module):
    """Zero123 / Stable-Zero123 novel-view diffusion guidance."""

    def __init__(self, device, fp16=True, t_range=(0.2, 0.6), zero123_path='ashawkey/stable-zero123-diffusers'):
        """Load the Zero123 pipeline and set up the DDIM scheduler.

        Args:
            device: torch device for the pipeline and cached tensors.
            fp16: load the "fp16_ema" variant as float16 instead of float32.
            t_range: (min, max) fraction of the training timesteps that
                ``train_step`` may sample from.
            zero123_path: model path; a path containing "stable" selects the
                Stable-Zero123 camera embedding.
        """
        super().__init__()

        self.device = device
        self.fp16 = fp16
        self.dtype = torch.float16 if fp16 else torch.float32
        self.pipe = Zero123Pipeline.from_pretrained(
            zero123_path,
            variant="fp16_ema" if self.fp16 else None,
            torch_dtype=self.dtype,
        ).to(self.device)

        self.pipe.image_encoder.eval()
        self.pipe.vae.eval()
        self.pipe.unet.eval()
        self.pipe.clip_camera_projection.eval()

        self.vae = self.pipe.vae
        self.unet = self.pipe.unet

        self.pipe.set_progress_bar_config(disable=True)

        self.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
        self.num_train_timesteps = self.scheduler.config.num_train_timesteps

        self.min_step = int(self.num_train_timesteps * t_range[0])
        self.max_step = int(self.num_train_timesteps * t_range[1])
        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience

        self.use_stable_zero123 = 'stable' in zero123_path

    def get_cam_embeddings(self, polar, azimuth, radius):
        """Build the 4-value relative-camera embedding.

        Args:
            polar, azimuth: relative angles in degrees.
            radius: relative radius; ignored for Stable-Zero123, which uses
                a fixed 90 degree term instead.

        Returns:
            Tensor with two leading singleton dims ([1, 1, 4] for scalar inputs).
        """
        if self.use_stable_zero123:
            # 90 because pose0 is fixed
            # https://github.com/threestudio-project/threestudio/pull/356/files#diff-7cab41ca8761951def6987763141c5cfe7b1e3c0d174ac3cb0f5b4ca8ec8309aR220
            last = np.deg2rad(np.full_like(polar, 90))
        else:
            # original zero123 camera embedding
            last = radius
        T = np.stack([np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), last], axis=-1)
        T = torch.from_numpy(T).unsqueeze(0).unsqueeze(0).to(dtype=self.dtype, device=self.device)
        return T

    @torch.no_grad()
    def get_img_embeds(self, x):
        """Embed a reference image batch ``x`` (values in [0, 1]).

        Returns:
            ``[c, v]``: CLIP image embeddings and unscaled VAE latents of
            the image resized to 256x256.
        """
        x = F.interpolate(x, (256, 256), mode='bilinear', align_corners=False)
        x_pil = [TF.to_pil_image(image) for image in x]
        x_clip = self.pipe.feature_extractor(images=x_pil, return_tensors="pt").pixel_values.to(device=self.device, dtype=self.dtype)
        c = self.pipe.image_encoder(x_clip).image_embeds
        v = self.encode_imgs(x.to(self.dtype)) / self.vae.config.scaling_factor
        embeddings = [c, v]
        return embeddings

    @torch.no_grad()
    def get_img_embeds_pil(self, x, x_pil):
        """Same as ``get_img_embeds`` but returns the tuple ``(c, v)``.

        ``x_pil`` is accepted for interface compatibility and ignored: the
        PIL images are always rebuilt from the resized ``x``.
        """
        c, v = self.get_img_embeds(x)
        return c, v

    @torch.no_grad()
    def get_vis_image(self, pred_rgb_256, latents_noisy, t, noise_pred):
        """Concatenate input, noised and predicted-clean images along width."""
        # visualize predicted denoised image
        result_hopefully_less_noisy_image = self.decode_latents(self.pred_x0(latents_noisy, t, noise_pred))

        # visualize noisier image
        result_noisier_image = self.decode_latents(latents_noisy)

        viz_images = torch.cat([pred_rgb_256, result_noisier_image, result_hopefully_less_noisy_image], dim=-1)
        return viz_images

    def pred_x0(self, sample, timestep, model_output):
        """Recover the predicted clean sample from a model output.

        Args:
            sample: noisy latents.
            timestep: timestep indices into ``self.alphas``.
            model_output: network prediction, interpreted according to
                ``scheduler.config.prediction_type``.

        Raises:
            ValueError: for an unknown prediction type.
        """
        alpha_prod_t = self.alphas[timestep].to(self.device).view(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t

        prediction_type = self.scheduler.config.prediction_type
        if prediction_type == "epsilon":
            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
        elif prediction_type == "sample":
            pred_original_sample = model_output
        elif prediction_type == "v_prediction":
            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
        else:
            raise ValueError(
                f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
                " or `v_prediction`"
            )

        return pred_original_sample

    def train_step(self, pred_rgb, polar, azimuth, radius, embeddings, step_ratio=None, guidance_scale=2, as_latent=False):
        """Compute the score-distillation loss plus a debug visualisation.

        Args:
            pred_rgb: rendered images in [0, 1]; gradients flow through it.
            polar, azimuth, radius: relative camera pose, see ``get_cam_embeddings``.
            embeddings: ``[c, v]`` as returned by ``get_img_embeds``.
            step_ratio: optional training progress in [0, 1]; when given the
                timestep is annealed instead of sampled.
            guidance_scale: classifier-free guidance scale.
            as_latent: treat ``pred_rgb`` as latents (resized to 32x32).

        Returns:
            ``(loss, im)``: scalar loss and the image from ``get_vis_image``
            for at most the first four batch entries.
        """
        batch_size = pred_rgb.shape[0]

        if as_latent:
            latents = F.interpolate(pred_rgb, (32, 32), mode='bilinear', align_corners=False) * 2 - 1
            # No RGB render exists in latent mode; decode the latents so the
            # visualisation below still has an image to show.
            with torch.no_grad():
                vis_rgb = self.decode_latents(latents)
        else:
            pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
            latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
            vis_rgb = pred_rgb_256

        if step_ratio is not None:
            # dreamtime-like annealing
            step = int(np.clip(np.round((1 - step_ratio) * self.num_train_timesteps), self.min_step, self.max_step))
            t = torch.full((batch_size,), step, dtype=torch.long, device=self.device)
        else:
            t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)

        w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)

        with torch.no_grad():
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler.add_noise(latents, noise, t)

            x_in = torch.cat([latents_noisy] * 2)
            t_in = torch.cat([t] * 2)

            T = self.get_cam_embeddings(polar, azimuth, radius)
            cc_emb = torch.cat([embeddings[0].repeat(batch_size, 1, 1), T.repeat(batch_size, 1, 1)], dim=-1)
            cc_emb = self.pipe.clip_camera_projection(cc_emb)
            # Conditional half first, zeroed (unconditional) half second.
            cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)

            vae_emb = embeddings[1].repeat(batch_size, 1, 1, 1)
            vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)

            noise_pred = self.unet(
                torch.cat([x_in, vae_emb], dim=1),
                t_in.to(self.unet.dtype),
                encoder_hidden_states=cc_emb,
            ).sample

        noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

        grad = w * (noise_pred - noise)
        grad = torch.nan_to_num(grad)

        # MSE against a detached target reproduces `grad` as d(loss)/d(latents).
        target = (latents - grad).detach()
        loss = F.mse_loss(latents.float(), target, reduction='sum')

        im = self.get_vis_image(vis_rgb[:4], latents_noisy[:4], t[:4], noise_pred[:4])

        return loss, im

    def decode_latents(self, latents):
        """Decode scaled VAE latents into images clamped to [0, 1]."""
        latents = 1 / self.vae.config.scaling_factor * latents

        imgs = self.vae.decode(latents.to(self.dtype)).sample
        imgs = (imgs / 2 + 0.5).clamp(0, 1)

        return imgs

    def encode_imgs(self, imgs, mode=False):
        """Encode images in [0, 1] ([B, 3, H, W]) into scaled VAE latents.

        ``mode=True`` takes the posterior mode instead of sampling from it.
        """
        imgs = 2 * imgs - 1

        posterior = self.vae.encode(imgs).latent_dist
        if mode:
            latents = posterior.mode()
        else:
            latents = posterior.sample()
        latents = latents * self.vae.config.scaling_factor

        return latents
class Zero123(nn.Module):
    """Zero-1-to-3 XL guidance: SDS-style training loss and image refinement.

    Loads ``bennyguo/zero123-xl-diffusers`` through ``Zero123Pipeline`` and
    keeps handles to its VAE and UNet plus a DDIM scheduler built from the
    pipeline's scheduler config.
    """

    def __init__(self, device, fp16=True, t_range=(0.2, 0.6)):
        """Load the pipeline onto *device*.

        Args:
            device: torch device the pipeline is moved to.
            fp16: load the ``fp16_ema`` variant and run in float16 when true.
            t_range: (low, high) fractions of the training timesteps that
                ``train_step`` draws from.  The default is a tuple instead of
                the original shared list; it is only ever indexed.
        """
        super().__init__()

        self.device = device
        self.fp16 = fp16
        self.dtype = torch.float16 if fp16 else torch.float32

        zero123_path = "bennyguo/zero123-xl-diffusers"
        self.pipe = Zero123Pipeline.from_pretrained(
            zero123_path,
            variant="fp16_ema" if self.fp16 else None,
            torch_dtype=self.dtype,
        ).to(self.device)

        # Guidance only: every sub-model is switched to eval mode.
        self.pipe.image_encoder.eval()
        self.pipe.vae.eval()
        self.pipe.unet.eval()
        self.pipe.clip_camera_projection.eval()

        self.vae = self.pipe.vae
        self.unet = self.pipe.unet

        self.pipe.set_progress_bar_config(disable=True)

        self.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
        self.num_train_timesteps = self.scheduler.config.num_train_timesteps

        self.min_step = int(self.num_train_timesteps * t_range[0])
        self.max_step = int(self.num_train_timesteps * t_range[1])
        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience

    def _embed_image(self, x):
        """Return (CLIP image embedding, unscaled VAE latent) for images in [0, 1]."""
        x = F.interpolate(x, (256, 256), mode='bilinear', align_corners=False)
        x_pil = [TF.to_pil_image(image) for image in x]
        x_clip = self.pipe.feature_extractor(
            images=x_pil, return_tensors="pt"
        ).pixel_values.to(device=self.device, dtype=self.dtype)
        c = self.pipe.image_encoder(x_clip).image_embeds
        # encode_imgs multiplies by the scaling factor; the conditioning latent
        # is stored without it.
        v = self.encode_imgs(x.to(self.dtype)) / self.vae.config.scaling_factor
        return c, v

    @torch.no_grad()
    def get_img_embeds(self, x):
        """Return ``[clip_embedding, vae_latent]`` for image tensor *x* in [0, 1]."""
        c, v = self._embed_image(x)
        return [c, v]

    @torch.no_grad()
    def get_img_embeds_pil(self, x, x_pil):
        """Return ``(clip_embedding, vae_latent)`` for image tensor *x* in [0, 1].

        NOTE(review): ``x_pil`` is accepted for interface compatibility but, as
        in the original code, it is ignored and the PIL images are rebuilt
        from ``x``.  Confirm with callers whether it should be honoured.
        """
        return self._embed_image(x)

    @torch.no_grad()
    def get_vis_image(self, pred_rgb_256, latents_noisy, t, noise_pred):
        """Build a preview strip: input | decoded noisy latents | decoded x0 estimate.

        The three images are concatenated along the last (width) axis.
        """
        denoised = self.decode_latents(self.pred_x0(latents_noisy, t, noise_pred))
        noisier = self.decode_latents(latents_noisy)
        return torch.cat([pred_rgb_256, noisier, denoised], dim=-1)

    def _conditioning(self, embeddings, T, batch_size):
        """Build classifier-free-guidance conditioning (conditional half first).

        Args:
            embeddings: ``[clip_embedding, vae_latent]`` from ``get_img_embeds``.
            T: camera tensor already shaped to concatenate with the repeated
                CLIP embedding on the last axis.
            batch_size: number of samples the embeddings are repeated for.
        """
        cc_emb = torch.cat([embeddings[0].repeat(batch_size, 1, 1), T], dim=-1)
        cc_emb = self.pipe.clip_camera_projection(cc_emb)
        cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)

        vae_emb = embeddings[1].repeat(batch_size, 1, 1, 1)
        vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)
        return cc_emb, vae_emb

    def _guided_noise(self, latents, t_in, cc_emb, vae_emb, guidance_scale):
        """Run the UNet on a doubled batch and combine with classifier-free guidance."""
        x_in = torch.cat([latents] * 2)
        noise_pred = self.unet(
            torch.cat([x_in, vae_emb], dim=1),
            t_in.to(self.unet.dtype),
            encoder_hidden_states=cc_emb,
        ).sample
        noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
        return noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

    @torch.no_grad()
    def refine(self, pred_rgb, polar, azimuth, radius, embeddings,
               guidance_scale=5, steps=50, strength=0.8,
               ):
        """Denoise towards the requested relative camera pose and decode.

        Args:
            pred_rgb: image batch; only its batch size is used when
                ``strength == 0``.
            polar, azimuth, radius: per-sample sequences (degrees, degrees,
                radius delta) stacked into the camera embedding.
            embeddings: ``[clip_embedding, vae_latent]`` from ``get_img_embeds``.
            guidance_scale: classifier-free guidance weight.
            steps: number of scheduler timesteps.
            strength: 0 starts from pure noise; otherwise the first
                ``int(steps * strength)`` scheduler timesteps are skipped and
                the encoded input is noised to that timestep.

        Returns:
            Decoded images in [0, 1].
        """
        batch_size = pred_rgb.shape[0]

        self.scheduler.set_timesteps(steps)

        if strength == 0:
            init_step = 0
            # One noise latent per sample; the original hard-coded a batch of
            # 1, which mismatched the conditioning for batch_size > 1.
            latents = torch.randn((batch_size, 4, 32, 32), device=self.device, dtype=self.dtype)
        else:
            # Clamp so strength == 1.0 does not index past the last timestep.
            init_step = min(int(steps * strength), len(self.scheduler.timesteps) - 1)
            pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
            latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
            latents = self.scheduler.add_noise(
                latents, torch.randn_like(latents), self.scheduler.timesteps[init_step]
            )

        T = np.stack(
            [np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), radius],
            axis=-1,
        )
        # unsqueeze(1) inserts the middle axis so T lines up with the repeated
        # CLIP embedding; the original comment gives [1, 1, 4] for one sample.
        T = torch.from_numpy(T).unsqueeze(1).to(self.dtype).to(self.device)
        cc_emb, vae_emb = self._conditioning(embeddings, T, batch_size)

        for i, t in enumerate(self.scheduler.timesteps[init_step:]):
            print('step:', i)
            t_in = torch.cat([t.view(1)] * 2).to(self.device)
            noise_pred = self._guided_noise(latents, t_in, cc_emb, vae_emb, guidance_scale)
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        return self.decode_latents(latents)

    def pred_x0(self, sample, timestep, model_output):
        """Estimate the clean sample from a noisy one and the model output.

        Raises:
            ValueError: if the scheduler's ``prediction_type`` is not one of
                ``epsilon``, ``sample`` or ``v_prediction``.
        """
        alpha_prod_t = self.alphas[timestep].to(self.device).view(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t

        prediction_type = self.scheduler.config.prediction_type
        if prediction_type == "epsilon":
            return (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
        if prediction_type == "sample":
            return model_output
        if prediction_type == "v_prediction":
            # The original also recomputed model_output here, but the value
            # was never used, so it is dropped.
            return (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
        raise ValueError(
            f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
            " or `v_prediction`"
        )

    def train_step(self, pred_rgb, polar, azimuth, radius, embeddings, step_ratio=None, guidance_scale=2, as_latent=False):
        """Compute the score-distillation loss and a preview image.

        Args:
            pred_rgb: rendered batch in [0, 1]; treated as latents when
                ``as_latent`` is true.
            polar, azimuth, radius: camera deltas.  The camera tensor is
                reshaped with two leading singleton axes and then repeated
                over the batch, which only works for scalar values —
                presumably one pose per call; verify against callers.
            embeddings: ``[clip_embedding, vae_latent]`` from ``get_img_embeds``.
            step_ratio: training progress in [0, 1]; when given, the timestep
                is derived from it instead of being sampled uniformly.
            guidance_scale: classifier-free guidance weight.
            as_latent: skip the VAE and resize ``pred_rgb`` to 32x32 latents.

        Returns:
            ``(loss, im)``: the summed loss and the preview strip from
            ``get_vis_image`` for up to four samples.
        """
        batch_size = pred_rgb.shape[0]

        if as_latent:
            latents = F.interpolate(pred_rgb, (32, 32), mode='bilinear', align_corners=False) * 2 - 1
            # There is no RGB render in this mode; the preview image is
            # decoded from the latents further down.
            pred_rgb_256 = None
        else:
            pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
            latents = self.encode_imgs(pred_rgb_256.to(self.dtype))

        if step_ratio is not None:
            # dreamtime-like: timestep decreases as training progresses.
            t_value = int(np.clip(
                np.round((1 - step_ratio) * self.num_train_timesteps),
                self.min_step,
                self.max_step,
            ))
            t = torch.full((batch_size,), t_value, dtype=torch.long, device=self.device)
        else:
            t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)

        w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)

        # The guidance model is never differentiated; the gradient reaches the
        # renderer only through `latents` in the loss below.
        with torch.no_grad():
            noise = torch.randn_like(latents)
            latents_noisy = self.scheduler.add_noise(latents, noise, t)
            t_in = torch.cat([t] * 2)

            T = np.stack(
                [np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), radius],
                axis=-1,
            )
            T = torch.from_numpy(T).unsqueeze(0).unsqueeze(0).to(self.dtype).to(self.device)
            cc_emb, vae_emb = self._conditioning(embeddings, T.repeat(batch_size, 1, 1), batch_size)

            noise_pred = self._guided_noise(latents_noisy, t_in, cc_emb, vae_emb, guidance_scale)

            if pred_rgb_256 is None:
                # Original bug: pred_rgb_256 was unbound when as_latent=True.
                pred_rgb_256 = self.decode_latents(latents)

        grad = torch.nan_to_num(w * (noise_pred - noise))

        # d(loss)/d(latents) equals grad because the target is detached.
        target = (latents - grad).detach()
        loss = 0.5 * F.mse_loss(latents.float(), target.float(), reduction='sum')

        im = self.get_vis_image(pred_rgb_256[:4], latents_noisy[:4], t[:4], noise_pred[:4])

        return loss, im

    def decode_latents(self, latents):
        """Decode scaled VAE latents into images clamped to [0, 1]."""
        latents = 1 / self.vae.config.scaling_factor * latents
        imgs = self.vae.decode(latents.to(self.dtype)).sample
        return (imgs / 2 + 0.5).clamp(0, 1)

    def encode_imgs(self, imgs, mode=False):
        """Encode images ([B, 3, H, W] per the original comment) into scaled latents.

        Args:
            imgs: image batch, mapped with ``2 * imgs - 1`` before encoding.
            mode: take the posterior mode when true, otherwise draw a sample.
        """
        posterior = self.vae.encode(2 * imgs - 1).latent_dist
        latents = posterior.mode() if mode else posterior.sample()
        return latents * self.vae.config.scaling_factor


def _main():
    """Command-line demo: repeatedly sample a novel view of one image."""
    import argparse

    import cv2
    import matplotlib.pyplot as plt
    import numpy as np

    parser = argparse.ArgumentParser()
    parser.add_argument('input', type=str)
    parser.add_argument('--polar', type=float, default=0, help='delta polar angle in [-90, 90]')
    parser.add_argument('--azimuth', type=float, default=0, help='delta azimuth angle in [-180, 180]')
    parser.add_argument('--radius', type=float, default=0, help='delta camera radius multiplier in [-0.5, 0.5]')
    opt = parser.parse_args()

    device = torch.device('cuda')

    print(f'[INFO] loading image from {opt.input} ...')
    image = cv2.imread(opt.input, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'could not read image: {opt.input}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_AREA)
    image = image.astype(np.float32) / 255.0
    image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).contiguous().to(device)

    print(f'[INFO] loading model ...')
    zero123 = Zero123(device)

    print(f'[INFO] running embed ...')
    emb = zero123.get_img_embeds(image)

    print(f'[INFO] running model ...')
    # Deliberately endless: closing the plot window triggers the next sample.
    while True:
        outputs = zero123.refine(
            image,
            polar=[opt.polar],
            azimuth=[opt.azimuth],
            radius=[opt.radius],
            embeddings=emb,
            strength=0,
        )
        plt.imshow(outputs.float().cpu().numpy().transpose(0, 2, 3, 1)[0])
        plt.show()


if __name__ == '__main__':
    _main()
def seed_everything(seed):
    """Seed the torch CPU and CUDA generators."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


class ZeroScope(nn.Module):
    """Text-to-video guidance built on ``cerspense/zeroscope_v2_576w``.

    ``train_step`` is the active entry point (score distillation on video
    latents).  ``refine``, ``produce_latents``, ``prompt_to_img`` and
    ``generate_img`` are carried over from an image-diffusion helper; see the
    review notes on each.
    """

    DEFAULT_NEGATIVE_PROMPT = (
        'static, low motion, static statue, not moving, no motion, text, '
        'watermark, copyright, blurry, nsfw'
    )

    def __init__(
        self,
        device,
        fp16=True,
        vram_O=False,
        t_range=(0.2, 0.8),
    ):
        """Load the VAE, tokenizer, text encoder, UNet and DDIM scheduler.

        Args:
            device: torch device the pipeline is moved to.
            fp16: run the models in float16 when true.
            vram_O: accepted for interface compatibility; the low-VRAM code
                path is disabled in the original and the flag is unused.
            t_range: (low, high) fractions of the training timesteps sampled
                by ``train_step``.  Tuple default replaces the original
                mutable list; it is only indexed.
        """
        super().__init__()

        self.device = device
        model_key = 'cerspense/zeroscope_v2_576w'
        self.weights_dtype = torch.float16 if fp16 else torch.float32
        self.dtype = torch.float16 if fp16 else torch.float32

        pipe = StableDiffusionPipeline.from_pretrained(
            model_key, torch_dtype=self.dtype
        )
        pipe.to(device)

        self.vae = pipe.vae
        self.vae.eval()
        for p in self.vae.parameters():
            p.requires_grad = False
        self.tokenizer = pipe.tokenizer
        self.text_encoder = pipe.text_encoder
        self.unet = pipe.unet
        self.unet.eval()

        self.scheduler = DDIMScheduler.from_pretrained(
            model_key, subfolder="scheduler", torch_dtype=self.dtype
        )

        del pipe

        self.num_train_timesteps = self.scheduler.config.num_train_timesteps
        self.min_step = int(self.num_train_timesteps * t_range[0])
        self.max_step = int(self.num_train_timesteps * t_range[1])
        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience

        self.embeddings = None

    def encode_images(self, imgs, normalize: bool = True):
        """Encode a video batch into VAE latents, frame by frame.

        Args:
            imgs: 5-D tensor unpacked as (batch, channels, frames, height,
                width); a 4-D input gets a singleton frame axis inserted.
            normalize: map values with ``imgs * 2 - 1`` before encoding.

        Returns:
            Latents laid out (batch, latent_channels, frames, h, w) in the
            input dtype.
        """
        if len(imgs.shape) == 4:
            print("Only given an image an not video")
            imgs = imgs[:, :, None]
        batch_size, channels, num_frames, height, width = imgs.shape
        # Fold the frame axis into the batch so the 2-D VAE sees single frames.
        imgs = imgs.permute(0, 2, 1, 3, 4).reshape(
            batch_size * num_frames, channels, height, width
        )
        input_dtype = imgs.dtype
        if normalize:
            imgs = imgs * 2.0 - 1.0

        posterior = self.vae.encode(imgs.to(self.weights_dtype)).latent_dist
        latents = posterior.sample() * self.vae.config.scaling_factor

        # Unfold the frames again and move channels ahead of the frame axis.
        latents = (
            latents[None, :]
            .reshape((batch_size, num_frames, -1) + latents.shape[2:])
            .permute(0, 2, 1, 3, 4)
        )
        return latents.to(input_dtype)

    @torch.no_grad()
    def get_text_embeds(self, prompts, negative_prompts=None):
        """Return text embeddings ordered ``[positive, negative]``.

        ``negative_prompts`` defaults to ``[DEFAULT_NEGATIVE_PROMPT]``; the
        ``None`` sentinel replaces the original mutable list default.
        """
        if negative_prompts is None:
            negative_prompts = [self.DEFAULT_NEGATIVE_PROMPT]
        pos_embeds = self.encode_text(prompts)
        neg_embeds = self.encode_text(negative_prompts)
        # Positive first: compute_grad_* chunk the UNet output in this order.
        return torch.cat([pos_embeds, neg_embeds], dim=0)

    def encode_text(self, prompt):
        """Tokenize a list of prompt strings and return the text-encoder output."""
        inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        )
        return self.text_encoder(inputs.input_ids.to(self.device))[0]

    def forward_unet(
        self,
        latents,
        t,
        encoder_hidden_states,
    ):
        """Run the UNet in ``weights_dtype`` and cast the result back to the input dtype."""
        input_dtype = latents.dtype
        return self.unet(
            latents.to(self.weights_dtype),
            t.to(self.weights_dtype),
            encoder_hidden_states=encoder_hidden_states.to(self.weights_dtype),
        ).sample.to(input_dtype)

    @torch.no_grad()
    def refine(self, pred_rgb,
               guidance_scale=100, steps=50, strength=0.8,
               ):
        """Legacy image refinement loop.

        NOTE(review): this calls ``self.encode_imgs``, which is not defined on
        this class (only a commented-out version exists), and reads
        ``self.embeddings``, which ``__init__`` leaves as ``None``.  It looks
        like a leftover from an image-diffusion helper and cannot run as
        written; kept unchanged pending a decision on its intended behaviour.
        """
        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
        latents = self.encode_imgs(pred_rgb_512.to(self.dtype))

        self.scheduler.set_timesteps(steps)
        init_step = int(steps * strength)
        latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step])

        for i, t in enumerate(self.scheduler.timesteps[init_step:]):
            latent_model_input = torch.cat([latents] * 2)

            noise_pred = self.unet(
                latent_model_input, t, encoder_hidden_states=self.embeddings,
            ).sample

            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        imgs = self.decode_latents(latents)
        return imgs

    def train_step(
        self,
        pred_rgb,
        text_embs,
        step_ratio=None,
        guidance_scale=100,
        as_latent=False,
    ):
        """Compute the distillation loss for one clip of rendered frames.

        Args:
            pred_rgb: frames stacked on the first axis, (frames, C, H, W).
                The whole stack is treated as ONE clip (a leading batch axis
                of size 1 is added).
            text_embs: embeddings ordered ``[positive, negative]`` as produced
                by ``get_text_embeds``.
            step_ratio: training progress in [0, 1]; when given, the timestep
                is derived from it instead of being sampled uniformly.
            guidance_scale: guidance weight.  The original accepted this
                argument but always used 100; it is now forwarded.
            as_latent: treat ``pred_rgb`` as latents (resized to 40x72) and
                skip the VAE.

        Returns:
            Scalar loss.  With RGB input an extra reconstruction term against
            the decoded target is added; with ``as_latent`` there are no RGB
            frames, so that term is omitted (the original raised
            ``UnboundLocalError`` on ``pred_rgb_512`` here).
        """
        pred_rgb = pred_rgb.to(self.dtype)

        pred_rgb_512 = None
        if as_latent:
            latents = F.interpolate(pred_rgb, (40, 72), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None]
        else:
            # Resize to the model resolution, then (frames, C, H, W) ->
            # (1, C, frames, H, W) for encode_images.
            pred_rgb_512 = F.interpolate(pred_rgb, (320, 576), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None]
            # encode image into latents with vae, requires grad!
            latents = self.encode_images(pred_rgb_512)

        # The latents always carry a batch of one clip; the original derived
        # the batch size from a hard-coded 16-frame clip length instead.
        batch_size = latents.shape[0]

        if step_ratio is not None:
            # dreamtime-like: timestep decreases as training progresses.
            t_value = int(np.clip(
                np.round((1 - step_ratio) * self.num_train_timesteps),
                self.min_step,
                self.max_step,
            ))
            t = torch.full((batch_size,), t_value, dtype=torch.long, device=self.device)
        else:
            t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)

        grad = self.compute_grad_sds(
            latents, text_embs, t, use_csd=True, guidance_scale=guidance_scale
        ).to(latents.dtype)
        grad = torch.nan_to_num(grad)

        # d(loss)/d(latents) equals grad because the target is detached.
        target = (latents - grad).detach()
        loss = 0.5 * F.mse_loss(latents.float(), target.float(), reduction='sum') / latents.shape[0]

        if pred_rgb_512 is not None:
            # decode_latents returns (B, frames, C, H, W); swap back to the
            # (B, C, frames, H, W) layout of pred_rgb_512.
            rgb_target = self.decode_latents(target).permute(0, 2, 1, 3, 4)
            loss += 0.05 * F.mse_loss(pred_rgb_512.float(), rgb_target.detach().float(), reduction='sum') / rgb_target.shape[0]

        return loss

    @torch.no_grad()
    def produce_latents(
        self,
        height=512,
        width=512,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
    ):
        """Legacy sampling loop driven by ``self.embeddings``.

        NOTE(review): requires ``self.embeddings`` to be set (only
        ``generate_img`` does so) and chunks the UNet output as
        ``[uncond, cond]``, the opposite order of ``get_text_embeds``.
        """
        if latents is None:
            latents = torch.randn(
                (
                    self.embeddings.shape[0] // 2,
                    self.unet.in_channels,
                    height // 8,
                    width // 8,
                ),
                device=self.device,
            )

        self.scheduler.set_timesteps(num_inference_steps)

        for i, t in enumerate(self.scheduler.timesteps):
            # Doubled batch so one forward pass serves both guidance branches.
            latent_model_input = torch.cat([latents] * 2)
            noise_pred = self.unet(
                latent_model_input, t, encoder_hidden_states=self.embeddings
            ).sample

            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_cond - noise_pred_uncond
            )

            # x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        return latents

    @torch.no_grad()
    def decode_latents(self, latents):
        """Decode video latents (B, C, frames, h, w) to frames in [0, 1].

        Returns:
            Tensor laid out (B, frames, channels, H, W).
        """
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        # Fold frames into the batch for the 2-D VAE decoder.
        latents = latents.permute(0, 2, 1, 3, 4).reshape(
            batch_size * num_frames, channels, height, width
        )

        image = self.vae.decode(latents.to(self.weights_dtype)).sample
        video = image[None, :].reshape(
            (batch_size, num_frames, -1) + image.shape[2:]
        )
        return (video / 2 + 0.5).clamp(0, 1)

    def _guidance_grad(self, latents, text_embeddings, t, guidance_scale, use_csd):
        """Shared implementation behind ``compute_grad_sds`` / ``compute_grad_csd``."""
        # predict the noise residual with unet, NO grad!
        with torch.no_grad():
            noise = torch.randn_like(latents)  # TODO: use torch generator
            latents_noisy = self.scheduler.add_noise(latents, noise, t)
            latent_model_input = torch.cat([latents_noisy] * 2, dim=0)
            noise_pred = self.forward_unet(
                latent_model_input,
                torch.cat([t] * 2),
                encoder_hidden_states=text_embeddings,
            )

        # text_embeddings are ordered [positive, negative].
        noise_pred_text, noise_pred_uncond = noise_pred.chunk(2)
        delta = guidance_scale * (noise_pred_text - noise_pred_uncond)

        # w(t) = 1 - alpha_cumprod[t], reshaped to broadcast over every
        # non-batch axis.  The original used four axes, which mis-broadcasts
        # against 5-D video latents when the batch is larger than one.
        w = (1 - self.alphas[t]).view(-1, *([1] * (latents.dim() - 1)))

        if use_csd:
            return w * delta
        return w * (noise_pred_text + delta - noise)

    def compute_grad_csd(
        self,
        latents,
        text_embeddings,
        t,
        guidance_scale=100,
    ):
        """Return ``w(t) * guidance_scale * (eps_text - eps_uncond)``.

        ``guidance_scale`` defaults to the previously hard-coded 100.
        """
        return self._guidance_grad(latents, text_embeddings, t, guidance_scale, True)

    def compute_grad_sds(
        self,
        latents,
        text_embeddings,
        t,
        use_csd=False,
        guidance_scale=100,
    ):
        """Return the distillation gradient for *latents* at timestep *t*.

        With ``use_csd`` only the guidance difference is used; otherwise the
        guided prediction minus the injected noise.  ``guidance_scale``
        defaults to the previously hard-coded 100.
        """
        return self._guidance_grad(latents, text_embeddings, t, guidance_scale, use_csd)

    def prompt_to_img(
        self,
        prompts,
        negative_prompts="",
        height=512,
        width=512,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
    ):
        """Legacy text-to-image helper.

        NOTE(review): the text-embedding step is disabled in the original, so
        ``produce_latents`` runs with whatever ``self.embeddings`` holds
        (``None`` after ``__init__``), and ``decode_latents`` expects 5-D
        latents.  Kept unchanged; confirm whether this is still needed.
        """
        if isinstance(prompts, str):
            prompts = [prompts]

        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]

        latents = self.produce_latents(
            height=height,
            width=width,
            latents=latents,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        )

        imgs = self.decode_latents(latents)

        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        imgs = (imgs * 255).round().astype("uint8")

        return imgs

    @torch.no_grad()
    def generate_img(
        self,
        emb,
        height=512,
        width=512,
        num_inference_steps=50,
        guidance_scale=7.5,
        latents=None,
    ):
        """Legacy helper: sample from one prompt embedding and return uint8 images.

        Sets ``self.embeddings`` to ``[empty-prompt embedding, emb]`` before
        sampling.  NOTE(review): same 4-D / 5-D latent caveat as
        ``prompt_to_img``.
        """
        neg_prompt = self.encode_text([""])
        self.embeddings = torch.cat([neg_prompt, emb.unsqueeze(0)], dim=0)

        latents = self.produce_latents(
            height=height,
            width=width,
            latents=latents,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        )

        imgs = self.decode_latents(latents)

        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
        imgs = (imgs * 255).round().astype("uint8")

        return imgs


def window_score(x, gamma: float = 0.6) -> torch.Tensor:
    """Return ``cos(gamma * x)``, the positional weighting used by ``sim_correction``."""
    return torch.cos(gamma * x)


# Collect similar info from attentive features for neglected concept
def sim_correction(embeddings: torch.Tensor,
                   correction_indices: List[int],
                   scores: torch.Tensor,
                   window: bool = True) -> torch.Tensor:
    """Blend selected token embeddings with similarity-weighted later tokens.

    Args:
        embeddings: 2-D tensor (tokens, dim); the docstring of the original
            gives (77, 768).  Modified IN PLACE and also returned.
        correction_indices: token rows to correct.
        scores: blend weight per corrected token (0 keeps the token as is).
        window: additionally weight rows by ``window_score`` of their
            distance from the corrected token.

    Returns:
        The same ``embeddings`` tensor; each corrected row keeps its original
        norm.
    """
    ntk, dim = embeddings.shape
    device = embeddings.device

    for i, tk in enumerate(correction_indices):
        alpha = scores[i]
        v = embeddings[tk].clone()

        # Element-wise agreement between the token and every row; negative
        # and weak (< 0.5) responses and all earlier tokens are dropped.
        sim = torch.relu(v.unsqueeze(0) * embeddings)
        sim[torch.lt(sim, 0.5)] = 0.
        sim[:tk] = 0.
        sim /= sim.max().clamp_min(1e-6)

        if window:
            ws = window_score(torch.arange(0, ntk - tk).to(device), gamma=0.8)
            sim[tk:] = ws.unsqueeze(-1) * sim[tk:]

        successor = torch.sum(sim * embeddings, dim=0)
        embeddings[tk] = (1 - alpha) * embeddings[tk] + alpha * successor
        # Rescale so the corrected token keeps its original norm.
        embeddings[tk] *= v.norm() / embeddings[tk].norm()

    return embeddings


def _main():
    """Demo: optimise 16 frames (RGB or latent) against a fixed prompt."""
    import argparse
    import os

    import torchvision
    import tqdm

    @torch.no_grad()
    def save_results(results, filename, fps=10):
        video = results.permute(1, 0, 2, 3, 4)  # [t, sample_num, c, h, w]
        frame_grids = [
            torchvision.utils.make_grid(framesheet, nrow=int(video.shape[1]))
            for framesheet in video
        ]
        grid = torch.stack(frame_grids, dim=0)
        # Optimised RGB frames are unconstrained; clamp before the uint8 cast
        # so out-of-range values do not wrap around.
        grid = (grid.clamp(0, 1) * 255).to(torch.uint8).permute(0, 2, 3, 1)
        torchvision.io.write_video(filename, grid, fps=fps, video_codec='h264', options={'crf': '10'})

    parser = argparse.ArgumentParser()
    parser.add_argument("--fp16", action="store_true", help="use float16 for training")
    parser.add_argument(
        "--vram_O", action="store_true", help="optimization for low VRAM usage"
    )
    parser.add_argument("-H", type=int, default=512)
    parser.add_argument("-W", type=int, default=512)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--steps", type=int, default=50)
    parser.add_argument("--use_rgb", action="store_true", help="optimise RGB frames instead of latents")
    opt = parser.parse_args()

    seed_everything(opt.seed)

    device = torch.device("cuda")

    zs = ZeroScope(device, opt.fp16, opt.vram_O)

    embs = zs.encode_text(['a panda dancing'])
    neg_prompt = zs.encode_text([ZeroScope.DEFAULT_NEGATIVE_PROMPT])
    embeddings = torch.cat([embs, neg_prompt], dim=0)

    # write_video does not create missing directories.
    os.makedirs('./output', exist_ok=True)

    if opt.use_rgb:
        rgbs = torch.rand(1, 3, 320, 576).cuda().repeat(16, 1, 1, 1).clamp(0, 1)
        rgbs.requires_grad = True
        optimizer = torch.optim.Adam([rgbs], lr=0.1)
        for step in tqdm.tqdm(range(1000)):
            optimizer.zero_grad()
            loss_sds = zs.train_step(rgbs, embeddings)
            loss_sds.backward()
            optimizer.step()
            if step % 20 == 0:
                tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}")
                video_path = os.path.join('./output', f"sds_rgb_{step}.mp4")
                save_results(rgbs.data.cpu().unsqueeze(0), video_path, fps=10)
    else:
        rgbs = torch.randn(1, 4, 40, 72).cuda().repeat(16, 1, 1, 1)
        rgbs.requires_grad = True
        optimizer = torch.optim.Adam([rgbs], lr=0.1)
        for step in tqdm.tqdm(range(1001)):
            optimizer.zero_grad()
            loss_sds = zs.train_step(rgbs, embeddings, as_latent=True)
            loss_sds.backward()
            if step % 20 == 0:
                tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}")
            optimizer.step()
            if step % 100 == 0:
                with torch.no_grad():
                    video_path = os.path.join('./output', f"sds_{step}.mp4")
                    out = zs.decode_latents(rgbs.permute(1, 0, 2, 3)[None].detach())
                    save_results(out.data.cpu(), video_path, fps=10)


if __name__ == "__main__":
    _main()
model_key = 'cerspense/zeroscope_v2_576w' + self.weights_dtype = torch.float16 if fp16 else torch.float32 + + # if hf_key is not None: + # print(f"[INFO] using hugging face custom model key: {hf_key}") + # model_key = hf_key + # elif self.sd_version == "2.1": + # model_key = "stabilityai/stable-diffusion-2-1-base" + # elif self.sd_version == "2.0": + # model_key = "stabilityai/stable-diffusion-2-base" + # elif self.sd_version == "1.5": + # model_key = "runwayml/stable-diffusion-v1-5" + # else: + # raise ValueError( + # f"Stable-diffusion version {self.sd_version} not supported." + # ) + + self.dtype = torch.float16 if fp16 else torch.float32 + + # Create model + pipe = StableDiffusionPipeline.from_pretrained( + model_key, torch_dtype=self.dtype + ) + + # if vram_O: + # pipe.enable_sequential_cpu_offload() + # pipe.enable_vae_slicing() + # pipe.unet.to(memory_format=torch.channels_last) + # pipe.enable_attention_slicing(1) + # pipe.enable_model_cpu_offload() + # else: + pipe.to(device) + + self.vae = pipe.vae + self.vae.eval() + for p in self.vae.parameters(): + p.requires_grad = False + self.tokenizer = pipe.tokenizer + self.text_encoder = pipe.text_encoder + # self.unet = pipe.unet + # self.unet = UNet3DConditionModel_Attn.from_config(pipe.unet.config).to(device) + self.unet = UNet3DConditionModel_Attn.from_pretrained(model_key, subfolder='unet', torch_dtype=self.dtype).to(device) + self.unet.eval() + self.scheduler = DDIMScheduler.from_pretrained( + model_key, subfolder="scheduler", torch_dtype=self.dtype + ) + + del pipe + + self.num_train_timesteps = self.scheduler.config.num_train_timesteps + self.min_step = int(self.num_train_timesteps * t_range[0]) + self.max_step = int(self.num_train_timesteps * t_range[1]) + self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience + + self.embeddings = None + + def encode_images(self, imgs, normalize: bool = True): + # iamge is B, 3, N, 320, 576 + # breakpoint() + if len(imgs.shape) == 4: + print("Only 
given an image an not video") + imgs = imgs[:, :, None] + # breakpoint() + batch_size, channels, num_frames, height, width = imgs.shape + imgs = imgs.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, channels, height, width + ) + input_dtype = imgs.dtype + if normalize: + imgs = imgs * 2.0 - 1.0 + # breakpoint() + + # if self.cfg.low_ram_vae > 0: + # vnum = self.cfg.low_ram_vae + # mask_vae = torch.randperm(imgs.shape[0]) < vnum + # with torch.no_grad(): + # posterior_mask = torch.cat( + # [ + # self.vae.encode( + # imgs[~mask_vae][i : i + 1].to(self.weights_dtype) + # ).latent_dist.sample() + # for i in range(imgs.shape[0] - vnum) + # ], + # dim=0, + # ) + # posterior = torch.cat( + # [ + # self.vae.encode( + # imgs[mask_vae][i : i + 1].to(self.weights_dtype) + # ).latent_dist.sample() + # for i in range(vnum) + # ], + # dim=0, + # ) + # posterior_full = torch.zeros( + # imgs.shape[0], + # *posterior.shape[1:], + # device=posterior.device, + # dtype=posterior.dtype, + # ) + # posterior_full[~mask_vae] = posterior_mask + # posterior_full[mask_vae] = posterior + # latents = posterior_full * self.vae.config.scaling_factor + # else: + posterior = self.vae.encode(imgs.to(self.weights_dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = ( + latents[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + latents.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + return latents.to(input_dtype) + + @torch.no_grad() + def get_text_embeds(self, prompts, negative_prompts=['low motion, static statue, not moving, no motion, text, watermark, copyright, blurry, nsfw']): + pos_embeds = self.encode_text(prompts) # [1, 77, 768] + neg_embeds = self.encode_text(negative_prompts) + # self.embeddings = torch.cat([neg_embeds, pos_embeds], dim=0) # [2, 77, 768] # wrong order... 
+ # embs = zs.encode_text(['a cat running with a dog']) + # neg_prompt = zs.encode_text([""]) + # print(embs.shape, neg_prompt.shape) + embeddings = torch.cat([pos_embeds, neg_embeds], dim=0) + return embeddings + + def encode_text(self, prompt): + # prompt: [str] + inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + embeddings = self.text_encoder(inputs.input_ids.to(self.device))[0] + return embeddings + + def forward_unet( + self, + latents, + t, + encoder_hidden_states, + ): + input_dtype = latents.dtype + # print(latents.shape, latents.device, t.shape, t.device, encoder_hidden_states.shape, encoder_hidden_states.device) + res, t2d_attn_weight_listss, t2d_cross_attn_weight_listss, temp_attn_weight_listss, temp_cross_attn_weight_listss = self.unet( + latents.to(self.weights_dtype), + t.to(self.weights_dtype), + encoder_hidden_states=encoder_hidden_states.to(self.weights_dtype), + ) + res = res.sample.to(input_dtype) + return res, t2d_attn_weight_listss, t2d_cross_attn_weight_listss, temp_attn_weight_listss, temp_cross_attn_weight_listss + + @torch.no_grad() + def refine(self, pred_rgb, + guidance_scale=100, steps=50, strength=0.8, + ): + + batch_size = pred_rgb.shape[0] + pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False) + latents = self.encode_imgs(pred_rgb_512.to(self.dtype)) + # latents = torch.randn((1, 4, 48, 48), device=self.device, dtype=self.dtype) + + self.scheduler.set_timesteps(steps) + init_step = int(steps * strength) + latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step]) + + for i, t in enumerate(self.scheduler.timesteps[init_step:]): + + latent_model_input = torch.cat([latents] * 2) + ### + print('will meet error') + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=self.embeddings, + ).sample + + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + 
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + return imgs + + def train_step( + self, + pred_rgb, + text_embs, + step_ratio=None, + guidance_scale=100, + as_latent=False, + ): + + batch_size = pred_rgb.shape[0] // 16 + # batch_size = 1 + pred_rgb = pred_rgb.to(self.dtype) # B, C, H, W + + if as_latent: + latents = F.interpolate(pred_rgb, (48, 48), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None]# * 2 - 1 + else: + # interp to 512x512 to be fed into vae. + pred_rgb_512 = F.interpolate(pred_rgb, (320, 576), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None] + # encode image into latents with vae, requires grad! + latents = self.encode_images(pred_rgb_512) ### doublecheck + # print(latents.shape) + + if step_ratio is not None: + # dreamtime-like + # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio) + t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step) + t = torch.full((batch_size,), t, dtype=torch.long, device=self.device) + else: + t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device) + + # w(t), sigma_t^2 + # w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1) + + # predict the noise residual with unet, NO grad! + # with torch.no_grad(): + # # add noise + # noise = torch.randn_like(latents) + # latents_noisy = self.scheduler.add_noise(latents, noise, t) + # # pred noise + # latent_model_input = torch.cat([latents_noisy] * 2) + # tt = torch.cat([t] * 2) + + # noise_pred = self.unet( + # latent_model_input, tt, encoder_hidden_states=self.embeddings.repeat(batch_size, 1, 1) + # ).sample + + # # perform guidance (high scale from paper!) 
+ # noise_pred_uncond, noise_pred_pos = noise_pred.chunk(2) + # noise_pred = noise_pred_uncond + guidance_scale * ( + # noise_pred_pos - noise_pred_uncond + # ) + grad, t2d_attn_weight_listss, t2d_cross_attn_weight_listss, temp_attn_weight_listss, temp_cross_attn_weight_listss = self.compute_grad_sds(latents, text_embs, t) + + # grad = w * (noise_pred - noise) + # grad = torch.nan_to_num(grad) + + # seems important to avoid NaN... + # grad = grad.clamp(-1, 1) + + # for i, list in enumerate(t2d_attn_weight_listss): + # print(i, len(list)) + # for j, attn in enumerate(list): + # print(j, type(attn), attn.shape) + + # for i, list in enumerate(t2d_cross_attn_weight_listss): + # print(i, len(list)) + # for j, attn in enumerate(list): + # print(j, type(attn), attn.shape) + + bboxes = [np.array([[0.01953125, 0.08984375, 0.40039062, 0.52929688]]), np.array([[0.45898438, 0.20117188, 0.99804688, 0.79882812]])] + object_positions = [[2], [10]] + loss1 = caculate_loss_self_att(t2d_attn_weight_listss, bboxes=bboxes)*self.opt.w_l1 + loss2, min_inside, max_outside = caculate_loss_att_fixed_cnt(t2d_cross_attn_weight_listss, bboxes=bboxes, + object_positions=object_positions, t = 0) + loss2= loss2 * self.opt.w_l2 + + target = (latents - grad).detach() + loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') / latents.shape[0] + print(f"loss: {loss} loss1: {loss1} loss2: {loss2}") + return loss1+loss2 + + @torch.no_grad() + def produce_latents( + self, + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + if latents is None: + latents = torch.randn( + ( + self.embeddings.shape[0] // 2, + self.unet.in_channels, + height // 8, + width // 8, + ), + device=self.device, + ) + + self.scheduler.set_timesteps(num_inference_steps) + + for i, t in enumerate(self.scheduler.timesteps): + # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. 
+ latent_model_input = torch.cat([latents] * 2) + # predict the noise residual + ### + print('will meet error') + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=self.embeddings + ).sample + + # perform guidance + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_cond - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + return latents + + # def decode_latents(self, latents): + # latents = 1 / self.vae.config.scaling_factor * latents + + # imgs = self.vae.decode(latents).sample + # imgs = (imgs / 2 + 0.5).clamp(0, 1) + + # return imgs + def decode_latents(self, latents): + # TODO: Make decoding align with previous version + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, channels, height, width + ) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + # .permute(0, 2, 1, 3, 4) + ) + # video = video.permute(0, ) + # print(video.shape) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + # video = video.float() + video = (video / 2 + 0.5).clamp(0, 1) + return video + + def compute_grad_sds( + self, + latents, + text_embeddings, + t, + ): + # predict the noise residual with unet, NO grad! 
+ with torch.no_grad(): + # add noise + noise = torch.randn_like(latents) # TODO: use torch generator + latents_noisy = self.scheduler.add_noise(latents, noise, t) + latent_model_input = torch.cat([latents_noisy] * 2, dim=0) + noise_pred, t2d_attn_weight_listss, t2d_cross_attn_weight_listss, temp_attn_weight_listss, temp_cross_attn_weight_listss = self.forward_unet( + latent_model_input, + torch.cat([t] * 2), + encoder_hidden_states=text_embeddings, + ) + + # perform guidance (high scale from paper!) + noise_pred_text, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_text + 100 * ( + noise_pred_text - noise_pred_uncond + ) + + # if self.cfg.weighting_strategy == "sds": + # w(t), sigma_t^2 + w = (1 - self.alphas[t]).view(-1, 1, 1, 1) + # elif self.cfg.weighting_strategy == "uniform": + # w = 1 + # elif self.cfg.weighting_strategy == "fantasia3d": + # w = (self.alphas[t] ** 0.5 * (1 - self.alphas[t])).view(-1, 1, 1, 1) + # else: + # raise ValueError( + # f"Unknown weighting strategy: {self.cfg.weighting_strategy}" + # ) + + grad = w * (noise_pred - noise) + return grad, t2d_attn_weight_listss, t2d_cross_attn_weight_listss, temp_attn_weight_listss, temp_cross_attn_weight_listss + + + # def encode_imgs(self, imgs): + # # imgs: [B, 3, H, W] + + # imgs = 2 * imgs - 1 + + # posterior = self.vae.encode(imgs).latent_dist + # latents = posterior.sample() * self.vae.config.scaling_factor + + # return latents + + def prompt_to_img( + self, + prompts, + negative_prompts="", + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + if isinstance(prompts, str): + prompts = [prompts] + + if isinstance(negative_prompts, str): + negative_prompts = [negative_prompts] + + # Prompts -> text embeds + # self.get_text_embeds(prompts, negative_prompts) + + # # Text embeds -> img latents + latents = self.produce_latents( + height=height, + width=width, + latents=latents, + num_inference_steps=num_inference_steps, + 
guidance_scale=guidance_scale, + ) # [1, 4, 48, 48] + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + @torch.no_grad() + def generate_img( + self, + emb, + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + neg_prompt = self.encode_text([""]) + self.embeddings = torch.cat([neg_prompt, emb.unsqueeze(0)], dim=0) # + # Text embeds -> img latents + latents = self.produce_latents( + height=height, + width=width, + latents=latents, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) # [1, 4, 48, 48] + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + +def window_score(x, gamma: float = 0.6) -> torch.Tensor: + # return torch.exp(-torch.abs(gamma*x)) + return torch.cos(gamma*x) + + +# Collect similar info from attentive features for neglected concept +def sim_correction(embeddings: torch.Tensor, + correction_indices: List[int], + scores: torch.Tensor, + window: bool = True) -> torch.Tensor: + """ Embeddings shape (77, 768), computes similarity between embeddings, combine using similarity scores""" + ntk, dim = embeddings.shape + device = embeddings.device + + for i, tk in enumerate(correction_indices): + alpha = scores[i] + v = embeddings[tk].clone() + + sim = v.unsqueeze(0) * embeddings # nth,dim 77,768 + sim = torch.relu(sim) # 77,768 + + ind = torch.lt(sim, 0.5) # relu is not needed in this case + sim[ind] = 0. + sim[:tk] = 0. 
# 77, 768 + sim /= max(sim.max(), 1e-6) + + if window: + ws = window_score(torch.arange(0, ntk - tk).to(device), gamma=0.8) + ws = ws.unsqueeze(-1) # 77 - tk,1 + sim[tk:] = ws * sim[tk:] # 77, 768 + + successor = torch.sum(sim * embeddings, dim=0) + embeddings[tk] = (1 - alpha) * embeddings[tk] + alpha * successor + embeddings[tk] *= v.norm() / embeddings[tk].norm() + + return embeddings + +if __name__ == "__main__": + import torchvision, tqdm + @torch.no_grad() + def save_results(results, filename, fps=10): + video = results.permute(1, 0, 2, 3, 4) # [t, sample_num, c, h, w] + frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(video.shape[1])) for framesheet in video] #[3, 1*h, n*w] + grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w] + # already in [0,1] + grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) + # torchvision.io.write_video(filename, grid, fps=fps, video_codec='h264', options={'crf': '10'}) + imageio.mimwrite(filename, grid, format='gif') + # imageio.mimwrite(filename, grid, format='mp4', fps=8) + import argparse, os + import matplotlib.pyplot as plt + + parser = argparse.ArgumentParser() + # parser.add_argument("prompt", type=str) + # parser.add_argument("--negative", default="", type=str) + # parser.add_argument( + # "--sd_version", + # type=str, + # default="1.5", + # choices=["1.5", "2.0", "2.1"], + # help="stable diffusion version", + # ) + # parser.add_argument( + # "--hf_key", + # type=str, + # default=None, + # help="hugging face Stable diffusion model key", + # ) + parser.add_argument("--fp16", action="store_true", help="use float16 for training") + parser.add_argument( + "--vram_O", action="store_true", help="optimization for low VRAM usage" + ) + parser.add_argument("-H", type=int, default=512) + parser.add_argument("-W", type=int, default=512) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--steps", type=int, default=50) + parser.add_argument("--prompt", type=str, 
default='a bee is flying on the left of a flower') + parser.add_argument("--save_dir", type=str, default='output') + + parser.add_argument("--w_l1", type=float, default=1.0) + parser.add_argument("--w_l2", type=float, default=1.0) + parser.add_argument("--use_rgb", action="store_true", help="use rgb") + + opt = parser.parse_args() + + seed_everything(opt.seed) + + device = torch.device("cuda") + + zs = ZeroScope(device, opt.fp16, opt.vram_O, opt=opt) + # sd = ZeroScope(device, opt.fp16, opt.vram_O, opt.sd_version, opt.hf_key) + + # imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps) + + # visualize image + # plt.imshow(imgs[0]) + # plt.show() + embs = zs.encode_text([opt.prompt]) + neg_prompt = zs.encode_text(['static, low motion, static statue, not moving, no motion, text, watermark, copyright, blurry, nsfw']) + # print(embs.shape, neg_prompt.shape) + embeddings = torch.cat([embs, neg_prompt], dim=0) # + # embeddings = torch.cat([neg_prompt, embs], dim=0) # + # embeddings = torch.cat([neg_prompt, embs.unsqueeze(0)], dim=0) # + os.makedirs(opt.save_dir, exist_ok=True) + if opt.use_rgb: + rgbs = torch.randn(1, 3, 320, 576).cuda().repeat(16, 1, 1, 1).clamp(0, 1) + rgbs.requires_grad = True + optimizer = torch.optim.Adam([rgbs], lr=0.1) + for step in tqdm.tqdm(range(1001)): + optimizer.zero_grad() + loss_sds = zs.train_step(rgbs, embeddings) + loss_sds.backward() + optimizer.step() + if step % 100 == 0 and step > 0: + video_path = os.path.join(opt.save_dir, f"sds_rgb_{step}.gif") + save_results(rgbs.data.cpu().unsqueeze(0), video_path, fps=10) + else: + rgbs = torch.randn(1, 4, 8, 8).cuda().repeat(16, 1, 1, 1) + rgbs.requires_grad = True + optimizer = torch.optim.Adam([rgbs], lr=0.1) + for step in tqdm.tqdm(range(1001)): + optimizer.zero_grad() + loss_sds = zs.train_step(rgbs, embeddings, as_latent=True) + loss_sds.backward() + if step % 20 == 0: + tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}") + # print(f"step: {step}, loss_sds: 
{loss_sds.item()}") + optimizer.step() + with torch.no_grad(): + if step % 100 == 0: + # if step % 100 == 0 and step > 0: + video_path = os.path.join(opt.save_dir, f"sds_{step}.gif") + out = zs.decode_latents(rgbs.permute(1, 0, 2, 3)[None].detach()) + save_results(out.data.cpu(), video_path, fps=10) + + diff --git a/guidance/zeroscope_utils_hifa.py b/guidance/zeroscope_utils_hifa.py new file mode 100644 index 0000000..0afbf45 --- /dev/null +++ b/guidance/zeroscope_utils_hifa.py @@ -0,0 +1,708 @@ +from transformers import CLIPTextModel, CLIPTokenizer, logging +from diffusers import ( + AutoencoderKL, + UNet2DConditionModel, + PNDMScheduler, + DDIMScheduler, + StableDiffusionPipeline, +) +# from diffusers.utils.import_utils import is_xformers_available + +from typing import List + +# suppress partial model loading warning +logging.set_verbosity_error() + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def seed_everything(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = True + + +class ZeroScope(nn.Module): + def __init__( + self, + device, + fp16=True, + vram_O=False, + t_range=[0.2, 0.8], + # t_range=[0.02, 0.98], + ): + # # sd_version="2.1", + # hf_key=None, + super().__init__() + + self.device = device + # self.sd_version = sd_version + model_key = 'cerspense/zeroscope_v2_576w' + self.weights_dtype = torch.float16 if fp16 else torch.float32 + + # if hf_key is not None: + # print(f"[INFO] using hugging face custom model key: {hf_key}") + # model_key = hf_key + # elif self.sd_version == "2.1": + # model_key = "stabilityai/stable-diffusion-2-1-base" + # elif self.sd_version == "2.0": + # model_key = "stabilityai/stable-diffusion-2-base" + # elif self.sd_version == "1.5": + # model_key = "runwayml/stable-diffusion-v1-5" + # else: + # raise ValueError( + # f"Stable-diffusion version {self.sd_version} not supported." 
+ # ) + + self.dtype = torch.float16 if fp16 else torch.float32 + + # Create model + pipe = StableDiffusionPipeline.from_pretrained( + model_key, torch_dtype=self.dtype + ) + + # if vram_O: + # pipe.enable_sequential_cpu_offload() + # pipe.enable_vae_slicing() + # pipe.unet.to(memory_format=torch.channels_last) + # pipe.enable_attention_slicing(1) + # pipe.enable_model_cpu_offload() + # else: + pipe.to(device) + + self.vae = pipe.vae + self.vae.eval() + for p in self.vae.parameters(): + p.requires_grad = False + self.tokenizer = pipe.tokenizer + self.text_encoder = pipe.text_encoder + self.unet = pipe.unet + self.unet.eval() + + self.scheduler = DDIMScheduler.from_pretrained( + model_key, subfolder="scheduler", torch_dtype=self.dtype + ) + self.scheduler.set_timesteps(self.scheduler.config.num_train_timesteps, device=self.device) + self.scheduler.alphas_cumprod = self.scheduler.alphas_cumprod.to(device) + + del pipe + + self.num_train_timesteps = self.scheduler.config.num_train_timesteps + self.min_step = int(self.num_train_timesteps * t_range[0]) + self.max_step = int(self.num_train_timesteps * t_range[1]) + self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience + + self.embeddings = None + + def encode_images(self, imgs, normalize: bool = True): + # iamge is B, 3, N, 320, 576 + # breakpoint() + if len(imgs.shape) == 4: + print("Only given an image an not video") + imgs = imgs[:, :, None] + # breakpoint() + batch_size, channels, num_frames, height, width = imgs.shape + imgs = imgs.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, channels, height, width + ) + input_dtype = imgs.dtype + if normalize: + imgs = imgs * 2.0 - 1.0 + # breakpoint() + + # if self.cfg.low_ram_vae > 0: + # vnum = self.cfg.low_ram_vae + # mask_vae = torch.randperm(imgs.shape[0]) < vnum + # with torch.no_grad(): + # posterior_mask = torch.cat( + # [ + # self.vae.encode( + # imgs[~mask_vae][i : i + 1].to(self.weights_dtype) + # ).latent_dist.sample() + # for i 
in range(imgs.shape[0] - vnum) + # ], + # dim=0, + # ) + # posterior = torch.cat( + # [ + # self.vae.encode( + # imgs[mask_vae][i : i + 1].to(self.weights_dtype) + # ).latent_dist.sample() + # for i in range(vnum) + # ], + # dim=0, + # ) + # posterior_full = torch.zeros( + # imgs.shape[0], + # *posterior.shape[1:], + # device=posterior.device, + # dtype=posterior.dtype, + # ) + # posterior_full[~mask_vae] = posterior_mask + # posterior_full[mask_vae] = posterior + # latents = posterior_full * self.vae.config.scaling_factor + # else: + posterior = self.vae.encode(imgs.to(self.weights_dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = ( + latents[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + latents.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + return latents.to(input_dtype) + + @torch.no_grad() + def get_text_embeds(self, prompts, negative_prompts=['static, low motion, static statue, not moving, no motion, text, watermark, copyright, blurry, nsfw']): + pos_embeds = self.encode_text(prompts) # [1, 77, 768] + neg_embeds = self.encode_text(negative_prompts) + # self.embeddings = torch.cat([neg_embeds, pos_embeds], dim=0) # [2, 77, 768] # wrong order... 
+ # embs = zs.encode_text(['a cat running with a dog']) + # neg_prompt = zs.encode_text([""]) + # print(embs.shape, neg_prompt.shape) + embeddings = torch.cat([pos_embeds, neg_embeds], dim=0) + return embeddings + + def encode_text(self, prompt): + # prompt: [str] + inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + embeddings = self.text_encoder(inputs.input_ids.to(self.device))[0] + return embeddings + + def forward_unet( + self, + latents, + t, + encoder_hidden_states, + ): + input_dtype = latents.dtype + # print(latents.shape, latents.device, t.shape, t.device, encoder_hidden_states.shape, encoder_hidden_states.device) + return self.unet( + latents.to(self.weights_dtype), + t.to(self.weights_dtype), + encoder_hidden_states=encoder_hidden_states.to(self.weights_dtype), + ).sample.to(input_dtype) + + + + @torch.no_grad() + def refine(self, pred_rgb, + guidance_scale=100, steps=50, strength=0.8, + ): + + batch_size = pred_rgb.shape[0] + pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False) + latents = self.encode_imgs(pred_rgb_512.to(self.dtype)) + # latents = torch.randn((1, 4, 64, 64), device=self.device, dtype=self.dtype) + + self.scheduler.set_timesteps(steps) + init_step = int(steps * strength) + latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step]) + + for i, t in enumerate(self.scheduler.timesteps[init_step:]): + + latent_model_input = torch.cat([latents] * 2) + + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=self.embeddings, + ).sample + + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + return imgs + + def train_step( + self, + pred_rgb, 
+ text_embs, + step_ratio=None, + guidance_scale=100, + as_latent=False, + ): + # print(pred_rgb.shape) + batch_size = pred_rgb.shape[0] // 16 + # batch_size = 1 + pred_rgb = pred_rgb.to(self.dtype) # B, C, H, W + + if as_latent: + latents = F.interpolate(pred_rgb, (40, 72), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None]# * 2 - 1 + else: + # interp to 512x512 to be fed into vae. + pred_rgb_512 = F.interpolate(pred_rgb, (320, 576), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None] + # encode image into latents with vae, requires grad! + latents = self.encode_images(pred_rgb_512) + # print(latents.shape) + + if step_ratio is not None: + # dreamtime-like + # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio) + t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step) + t = torch.full((batch_size,), t, dtype=torch.long, device=self.device) + else: + t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device) + + # w(t), sigma_t^2 + # w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1) + + # predict the noise residual with unet, NO grad! + # with torch.no_grad(): + # # add noise + # noise = torch.randn_like(latents) + # latents_noisy = self.scheduler.add_noise(latents, noise, t) + # # pred noise + # latent_model_input = torch.cat([latents_noisy] * 2) + # tt = torch.cat([t] * 2) + + # noise_pred = self.unet( + # latent_model_input, tt, encoder_hidden_states=self.embeddings.repeat(batch_size, 1, 1) + # ).sample + + # # perform guidance (high scale from paper!) 
+ # noise_pred_uncond, noise_pred_pos = noise_pred.chunk(2) + # noise_pred = noise_pred_uncond + guidance_scale * ( + # noise_pred_pos - noise_pred_uncond + # ) + grad, latent_prev = self.compute_grad_hifa(latents, text_embs, t, use_csd=True) + grad = grad.to(latents.dtype) + # grad = self.compute_grad_sds(latents, text_embs, t).to(latents.dtype) + + # grad = w * (noise_pred - noise) + grad = torch.nan_to_num(grad) + + # seems important to avoid NaN... + # grad = grad.clamp(-1, 1) + + target = (latents - grad).detach() + loss = 0.5 * F.mse_loss(latents.float(), target.float(), reduction='sum') / latents.shape[0] + rgb_target = self.decode_latents(latent_prev).permute(0, 2, 1, 3, 4) + # print(latents.dtype, target.dtype, pred_rgb_512.dtype) + # print(latents.dtype, target.dtype, pred_rgb_512.dtype, rgb_target.dtype) + # print(pred_rgb_512.min(), pred_rgb_512.max(), rgb_target.min(), rgb_target.max()) + # print(pred_rgb_512.shape, rgb_target.shape) + loss2 = 0.05 * F.mse_loss(pred_rgb_512.float(), rgb_target.detach().float(), reduction="sum") / rgb_target.shape[0] + # loss2 = 512 * F.mse_loss(pred_rgb_512, rgb_target.detach(), reduction='mean') / rgb_target.shape[0] + # print(loss2.mean().item()) + loss = loss + loss2 + # loss += 0.05 * F.mse_loss(pred_rgb_512, rgb_target.half().detach(), reduction='sum') / rgb_target.shape[0] + + return loss + # return loss.half() + + @torch.no_grad() + def produce_latents( + self, + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + if latents is None: + latents = torch.randn( + ( + self.embeddings.shape[0] // 2, + self.unet.in_channels, + height // 8, + width // 8, + ), + device=self.device, + ) + + self.scheduler.set_timesteps(num_inference_steps) + + for i, t in enumerate(self.scheduler.timesteps): + # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. 
+ latent_model_input = torch.cat([latents] * 2) + # predict the noise residual + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=self.embeddings + ).sample + + # perform guidance + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_cond - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + return latents + + # def decode_latents(self, latents): + # latents = 1 / self.vae.config.scaling_factor * latents + + # imgs = self.vae.decode(latents).sample + # imgs = (imgs / 2 + 0.5).clamp(0, 1) + + # return imgs + @torch.no_grad() + def decode_latents(self, latents): + # TODO: Make decoding align with previous version + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, channels, height, width + ) + + image = self.vae.decode(latents.to(self.weights_dtype)).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + # .permute(0, 2, 1, 3, 4) + ) + # video = video.permute(0, ) + # print(video.shape) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + # video = video.float() + video = (video / 2 + 0.5).clamp(0, 1) + return video + + def compute_grad_sds( + self, + latents, + text_embeddings, + t, + ): + # predict the noise residual with unet, NO grad! 
+ with torch.no_grad(): + # add noise + noise = torch.randn_like(latents) # TODO: use torch generator + latents_noisy = self.scheduler.add_noise(latents, noise, t) + latent_model_input = torch.cat([latents_noisy] * 2, dim=0) + noise_pred = self.forward_unet( + latent_model_input, + torch.cat([t] * 2), + encoder_hidden_states=text_embeddings, + ) + + # perform guidance (high scale from paper!) + noise_pred_text, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_text + 100 * ( + noise_pred_text - noise_pred_uncond + ) + + # if self.cfg.weighting_strategy == "sds": + # w(t), sigma_t^2 + w = (1 - self.alphas[t]).view(-1, 1, 1, 1) + # elif self.cfg.weighting_strategy == "uniform": + # w = 1 + # elif self.cfg.weighting_strategy == "fantasia3d": + # w = (self.alphas[t] ** 0.5 * (1 - self.alphas[t])).view(-1, 1, 1, 1) + # else: + # raise ValueError( + # f"Unknown weighting strategy: {self.cfg.weighting_strategy}" + # ) + + grad = w * (noise_pred - noise) + return grad + + def compute_grad_hifa( + self, + latents, + text_embeddings, + t, + use_csd=False + ): + # predict the noise residual with unet, NO grad! + with torch.no_grad(): + # add noise + noise = torch.randn_like(latents) # TODO: use torch generator + latents_noisy = self.scheduler.add_noise(latents, noise, t) + latent_model_input = torch.cat([latents_noisy] * 2, dim=0) + noise_pred = self.forward_unet( + latent_model_input, + torch.cat([t] * 2), + encoder_hidden_states=text_embeddings, + ) + + # perform guidance (high scale from paper!) 
+ noise_pred_text, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_text + 100 * ( + noise_pred_text - noise_pred_uncond + ) + w = (1 - self.alphas[t]).view(-1, 1, 1, 1) + if use_csd: + grad = w * (noise_pred - noise_pred_text) + else: + grad = w * (noise_pred - noise) + + # with torch.no_grad(): + # latents = self.scheduler.step(noise_pred, t, latents, eta=1.0).prev_sample + prev_timestep = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps + alpha_prod_t = self.scheduler.alphas_cumprod[t] + alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + if self.scheduler.config.prediction_type == "epsilon": + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + pred_epsilon = noise_pred + elif self.scheduler.config.prediction_type == "sample": + pred_original_sample = noise_pred + pred_epsilon = (latents - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.scheduler.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * latents - (beta_prod_t**0.5) * noise_pred + pred_epsilon = (alpha_prod_t**0.5) * noise_pred + (beta_prod_t**0.5) * latents + if self.scheduler.config.thresholding: + pred_original_sample = self.scheduler._threshold_sample(pred_original_sample) + elif self.scheduler.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.scheduler.config.clip_sample_range, self.scheduler.config.clip_sample_range + ) + + return grad, pred_original_sample + + + # def encode_imgs(self, imgs): + # # imgs: [B, 3, H, W] + + # imgs = 2 * imgs - 1 + + # posterior = self.vae.encode(imgs).latent_dist + # latents = posterior.sample() * self.vae.config.scaling_factor + + # return latents + + def prompt_to_img( + self, + prompts, + negative_prompts="", + height=512, + width=512, + num_inference_steps=50, 
+ guidance_scale=7.5, + latents=None, + ): + if isinstance(prompts, str): + prompts = [prompts] + + if isinstance(negative_prompts, str): + negative_prompts = [negative_prompts] + + # Prompts -> text embeds + # self.get_text_embeds(prompts, negative_prompts) + + # # Text embeds -> img latents + latents = self.produce_latents( + height=height, + width=width, + latents=latents, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) # [1, 4, 64, 64] + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + @torch.no_grad() + def generate_img( + self, + emb, + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + neg_prompt = self.encode_text([""]) + self.embeddings = torch.cat([neg_prompt, emb.unsqueeze(0)], dim=0) # + # Text embeds -> img latents + latents = self.produce_latents( + height=height, + width=width, + latents=latents, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) # [1, 4, 64, 64] + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + +def window_score(x, gamma: float = 0.6) -> torch.Tensor: + # return torch.exp(-torch.abs(gamma*x)) + return torch.cos(gamma*x) + + +# Collect similar info from attentive features for neglected concept +def sim_correction(embeddings: torch.Tensor, + correction_indices: List[int], + scores: torch.Tensor, + window: bool = True) -> torch.Tensor: + """ Embeddings shape (77, 768), computes similarity between embeddings, combine using similarity scores""" + ntk, dim = embeddings.shape + device = embeddings.device + + for i, tk in enumerate(correction_indices): + alpha = scores[i] + v = 
embeddings[tk].clone() + + sim = v.unsqueeze(0) * embeddings # nth,dim 77,768 + sim = torch.relu(sim) # 77,768 + + ind = torch.lt(sim, 0.5) # relu is not needed in this case + sim[ind] = 0. + sim[:tk] = 0. # 77, 768 + sim /= max(sim.max(), 1e-6) + + if window: + ws = window_score(torch.arange(0, ntk - tk).to(device), gamma=0.8) + ws = ws.unsqueeze(-1) # 77 - tk,1 + sim[tk:] = ws * sim[tk:] # 77, 768 + + successor = torch.sum(sim * embeddings, dim=0) + embeddings[tk] = (1 - alpha) * embeddings[tk] + alpha * successor + embeddings[tk] *= v.norm() / embeddings[tk].norm() + + return embeddings + +if __name__ == "__main__": + import torchvision, tqdm + @torch.no_grad() + def save_results(results, filename, fps=10): + video = results.permute(1, 0, 2, 3, 4) # [t, sample_num, c, h, w] + frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(video.shape[1])) for framesheet in video] #[3, 1*h, n*w] + grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w] + # already in [0,1] + grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) + torchvision.io.write_video(filename, grid, fps=fps, video_codec='h264', options={'crf': '10'}) + + import argparse, os + import matplotlib.pyplot as plt + + parser = argparse.ArgumentParser() + # parser.add_argument("prompt", type=str) + # parser.add_argument("--negative", default="", type=str) + # parser.add_argument( + # "--sd_version", + # type=str, + # default="1.5", + # choices=["1.5", "2.0", "2.1"], + # help="stable diffusion version", + # ) + # parser.add_argument( + # "--hf_key", + # type=str, + # default=None, + # help="hugging face Stable diffusion model key", + # ) + parser.add_argument("--fp16", action="store_true", help="use float16 for training") + parser.add_argument( + "--vram_O", action="store_true", help="optimization for low VRAM usage" + ) + parser.add_argument("-H", type=int, default=512) + parser.add_argument("-W", type=int, default=512) + parser.add_argument("--seed", type=int, default=0) + 
parser.add_argument("--steps", type=int, default=50) + opt = parser.parse_args() + + seed_everything(opt.seed) + + device = torch.device("cuda") + + zs = ZeroScope(device, opt.fp16, opt.vram_O) + # sd = ZeroScope(device, opt.fp16, opt.vram_O, opt.sd_version, opt.hf_key) + + # imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps) + + # visualize image + # plt.imshow(imgs[0]) + # plt.show() + embs = zs.encode_text(['a panda dancing']) + # embs = zs.encode_text(['a bee flying around a flower']) + # embs = zs.encode_text(['a cat running with a dog']) + neg_prompt = zs.encode_text(['static, low motion, static statue, not moving, no motion, text, watermark, copyright, blurry, nsfw']) + # print(embs.shape, neg_prompt.shape) + embeddings = torch.cat([embs, neg_prompt], dim=0) # + # embeddings = torch.cat([neg_prompt, embs], dim=0) # + # embeddings = torch.cat([neg_prompt, embs.unsqueeze(0)], dim=0) # + # in rgb + use_rgb = True + # use_rgb = False + if use_rgb: + rgbs = torch.rand(1, 3, 320, 576).cuda().repeat(16, 1, 1, 1).clamp(0, 1) + rgbs.requires_grad = True + optimizer = torch.optim.Adam([rgbs], lr=0.1) + for step in tqdm.tqdm(range(1000)): + optimizer.zero_grad() + loss_sds = zs.train_step(rgbs, embeddings) + loss_sds.backward() + optimizer.step() + if step % 20 == 0: + tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}") + # print(f"step: {step}, loss_sds: {loss_sds.item()}") + if step % 20 == 0: + video_path = os.path.join('./output', f"sds_rgb_{step}.mp4") + save_results(rgbs.data.cpu().unsqueeze(0), video_path, fps=10) + else: + rgbs = torch.randn(1, 4, 40, 72).cuda().repeat(16, 1, 1, 1) + rgbs.requires_grad = True + optimizer = torch.optim.Adam([rgbs], lr=0.1) + for step in tqdm.tqdm(range(1001)): + optimizer.zero_grad() + loss_sds = zs.train_step(rgbs, embeddings, as_latent=True) + loss_sds.backward() + if step % 20 == 0: + tqdm.tqdm.write(f"step: {step}, loss_sds: {loss_sds.item()}") + # print(f"step: {step}, loss_sds: 
{loss_sds.item()}") + optimizer.step() + with torch.no_grad(): + if step % 100 == 0: + # if step % 100 == 0 and step > 0: + video_path = os.path.join('./output', f"sds_{step}.mp4") + out = zs.decode_latents(rgbs.permute(1, 0, 2, 3)[None].detach()) + save_results(out.data.cpu(), video_path, fps=10) + # ww = sd.encode_text('A teddy bear with a yellow bird') + # token_indices = [5, 8] + # cor_scores1 = [0.3, 0] + # from IPython import embed + # embed() + # res = sim_correction(embeddings=ww[0], correction_indices=token_indices, scores=torch.tensor(cor_scores1, device=device)) + + # imgs = sd.generate_img(res, opt.H, opt.W, opt.steps) + # from PIL import Image + # for i in range(len(imgs)): + # Image.fromarray(imgs[i]).save(f'b_{i}.png') + # imgs = sd.generate_img(ww[0], opt.H, opt.W, opt.steps) + # from PIL import Image + # for i in range(len(imgs)): + # Image.fromarray(imgs[i]).save(f'c_{i}.png') diff --git a/guidance/zeroscope_utils_mask.py b/guidance/zeroscope_utils_mask.py new file mode 100644 index 0000000..50d4998 --- /dev/null +++ b/guidance/zeroscope_utils_mask.py @@ -0,0 +1,627 @@ +from transformers import CLIPTextModel, CLIPTokenizer, logging +from diffusers import ( + AutoencoderKL, + UNet2DConditionModel, + PNDMScheduler, + DDIMScheduler, + StableDiffusionPipeline, +) +# from diffusers.utils.import_utils import is_xformers_available + +from typing import List + +# suppress partial model loading warning +logging.set_verbosity_error() + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import imageio + + +def seed_everything(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = True + + +class ZeroScope(nn.Module): + def __init__( + self, + device, + fp16=False, + vram_O=False, + t_range=[0.02, 0.98], ### why not using full range? 
+ ): + # # sd_version="2.1", + # hf_key=None, + super().__init__() + + self.device = device + # self.sd_version = sd_version + model_key = 'cerspense/zeroscope_v2_576w' + self.weights_dtype = torch.float32 + + # if hf_key is not None: + # print(f"[INFO] using hugging face custom model key: {hf_key}") + # model_key = hf_key + # elif self.sd_version == "2.1": + # model_key = "stabilityai/stable-diffusion-2-1-base" + # elif self.sd_version == "2.0": + # model_key = "stabilityai/stable-diffusion-2-base" + # elif self.sd_version == "1.5": + # model_key = "runwayml/stable-diffusion-v1-5" + # else: + # raise ValueError( + # f"Stable-diffusion version {self.sd_version} not supported." + # ) + + self.dtype = torch.float16 if fp16 else torch.float32 + + # Create model + pipe = StableDiffusionPipeline.from_pretrained( + model_key, torch_dtype=self.dtype + ) + + # if vram_O: + # pipe.enable_sequential_cpu_offload() + # pipe.enable_vae_slicing() + # pipe.unet.to(memory_format=torch.channels_last) + # pipe.enable_attention_slicing(1) + # pipe.enable_model_cpu_offload() + # else: + pipe.to(device) + + self.vae = pipe.vae + self.tokenizer = pipe.tokenizer + self.text_encoder = pipe.text_encoder + self.unet = pipe.unet + + self.scheduler = DDIMScheduler.from_pretrained( + model_key, subfolder="scheduler", torch_dtype=self.dtype + ) + + del pipe + + self.num_train_timesteps = self.scheduler.config.num_train_timesteps + self.min_step = int(self.num_train_timesteps * t_range[0]) + self.max_step = int(self.num_train_timesteps * t_range[1]) + self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience + + self.embeddings = None + + def encode_images(self, imgs, normalize: bool = True): + # iamge is B, 3, N, 320, 576 + # breakpoint() + if len(imgs.shape) == 4: + print("Only given an image an not video") + imgs = imgs[:, :, None] + # breakpoint() + batch_size, channels, num_frames, height, width = imgs.shape + imgs = imgs.permute(0, 2, 1, 3, 4).reshape( + batch_size * 
num_frames, channels, height, width + ) + input_dtype = imgs.dtype + if normalize: + imgs = imgs * 2.0 - 1.0 + # breakpoint() + + # if self.cfg.low_ram_vae > 0: + # vnum = self.cfg.low_ram_vae + # mask_vae = torch.randperm(imgs.shape[0]) < vnum + # with torch.no_grad(): + # posterior_mask = torch.cat( + # [ + # self.vae.encode( + # imgs[~mask_vae][i : i + 1].to(self.weights_dtype) + # ).latent_dist.sample() + # for i in range(imgs.shape[0] - vnum) + # ], + # dim=0, + # ) + # posterior = torch.cat( + # [ + # self.vae.encode( + # imgs[mask_vae][i : i + 1].to(self.weights_dtype) + # ).latent_dist.sample() + # for i in range(vnum) + # ], + # dim=0, + # ) + # posterior_full = torch.zeros( + # imgs.shape[0], + # *posterior.shape[1:], + # device=posterior.device, + # dtype=posterior.dtype, + # ) + # posterior_full[~mask_vae] = posterior_mask + # posterior_full[mask_vae] = posterior + # latents = posterior_full * self.vae.config.scaling_factor + # else: + posterior = self.vae.encode(imgs.to(self.weights_dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = ( + latents[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + latents.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + return latents.to(input_dtype) + + @torch.no_grad() + def get_text_embeds(self, prompts, negative_prompts=['low motion, static statue, not moving, no motion, text, watermark, copyright, blurry, nsfw']): + pos_embeds = self.encode_text(prompts) # [1, 77, 768] + neg_embeds = self.encode_text(negative_prompts) + # self.embeddings = torch.cat([neg_embeds, pos_embeds], dim=0) # [2, 77, 768] # wrong order... 
+ # embs = zs.encode_text(['a cat running with a dog']) + # neg_prompt = zs.encode_text([""]) + # print(embs.shape, neg_prompt.shape) + embeddings = torch.cat([pos_embeds, neg_embeds], dim=0) + return embeddings + + def encode_text(self, prompt): + # prompt: [str] + inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + embeddings = self.text_encoder(inputs.input_ids.to(self.device))[0] + return embeddings + + def forward_unet( + self, + latents, + t, + encoder_hidden_states, + ): + input_dtype = latents.dtype + # print(latents.shape, latents.device, t.shape, t.device, encoder_hidden_states.shape, encoder_hidden_states.device) + return self.unet( + latents.to(self.weights_dtype), + t.to(self.weights_dtype), + encoder_hidden_states=encoder_hidden_states.to(self.weights_dtype), + ).sample.to(input_dtype) + + @torch.no_grad() + def refine(self, pred_rgb, + guidance_scale=100, steps=50, strength=0.8, + ): + + batch_size = pred_rgb.shape[0] + pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False) + latents = self.encode_imgs(pred_rgb_512.to(self.dtype)) + # latents = torch.randn((1, 4, 64, 64), device=self.device, dtype=self.dtype) + + self.scheduler.set_timesteps(steps) + init_step = int(steps * strength) + latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step]) + + for i, t in enumerate(self.scheduler.timesteps[init_step:]): + + latent_model_input = torch.cat([latents] * 2) + + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=self.embeddings, + ).sample + + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + return imgs + + def train_step( + self, + pred_rgb, + 
text_embs, + mask_rgbs=None, + step_ratio=None, + guidance_scale=100, + as_latent=False, + ): + + batch_size = pred_rgb.shape[0] // 16 + # batch_size = 1 + pred_rgb = pred_rgb.to(self.dtype) # B, C, H, W + mask_rgbs = mask_rgbs.to(self.dtype) + if as_latent: + latents = F.interpolate(pred_rgb, (40, 72), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None]# * 2 - 1 + mask_rgbs = F.interpolate(mask_rgbs, (40, 72), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None]# * 2 - 1 + else: + # interp to 512x512 to be fed into vae. + pred_rgb_512 = F.interpolate(pred_rgb, (320, 576), mode="bilinear", align_corners=False).permute(1, 0, 2, 3)[None] + # encode image into latents with vae, requires grad! + latents = self.encode_images(pred_rgb_512) ### doublecheck + # print(latents.shape) + + if step_ratio is not None: + # dreamtime-like + # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio) + t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step) + t = torch.full((batch_size,), t, dtype=torch.long, device=self.device) + else: + t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device) + + # w(t), sigma_t^2 + # w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1) + + # predict the noise residual with unet, NO grad! + # with torch.no_grad(): + # # add noise + # noise = torch.randn_like(latents) + # latents_noisy = self.scheduler.add_noise(latents, noise, t) + # # pred noise + # latent_model_input = torch.cat([latents_noisy] * 2) + # tt = torch.cat([t] * 2) + + # noise_pred = self.unet( + # latent_model_input, tt, encoder_hidden_states=self.embeddings.repeat(batch_size, 1, 1) + # ).sample + + # # perform guidance (high scale from paper!) 
+ # noise_pred_uncond, noise_pred_pos = noise_pred.chunk(2) + # noise_pred = noise_pred_uncond + guidance_scale * ( + # noise_pred_pos - noise_pred_uncond + # ) + grad = self.compute_grad_sds(latents, text_embs, t, mask_rgbs) + + # grad = w * (noise_pred - noise) + # grad = torch.nan_to_num(grad) + + # seems important to avoid NaN... + # grad = grad.clamp(-1, 1) + + target = (latents*mask_rgbs - grad).detach() + loss = 0.5 * F.mse_loss((latents*mask_rgbs).float(), target, reduction='sum') / latents.shape[0] + + return loss + + @torch.no_grad() + def produce_latents( + self, + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + if latents is None: + latents = torch.randn( + ( + self.embeddings.shape[0] // 2, + self.unet.in_channels, + height // 8, + width // 8, + ), + device=self.device, + ) + + self.scheduler.set_timesteps(num_inference_steps) + + for i, t in enumerate(self.scheduler.timesteps): + # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. 
+ latent_model_input = torch.cat([latents] * 2) + # predict the noise residual + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=self.embeddings + ).sample + + # perform guidance + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_cond - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + return latents + + # def decode_latents(self, latents): + # latents = 1 / self.vae.config.scaling_factor * latents + + # imgs = self.vae.decode(latents).sample + # imgs = (imgs / 2 + 0.5).clamp(0, 1) + + # return imgs + def decode_latents(self, latents): + # TODO: Make decoding align with previous version + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, channels, height, width + ) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + # .permute(0, 2, 1, 3, 4) + ) + # video = video.permute(0, ) + # print(video.shape) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + video = (video / 2 + 0.5).clamp(0, 1) + return video + + def compute_grad_sds( + self, + latents, + text_embeddings, + t, + mask_rgbs, + ): + # predict the noise residual with unet, NO grad! + with torch.no_grad(): + # add noise + noise = torch.randn_like(latents) # TODO: use torch generator + latents_noisy = self.scheduler.add_noise(latents, noise, t) + latent_model_input = torch.cat([latents_noisy] * 2, dim=0) + noise_pred = self.forward_unet( + latent_model_input, + torch.cat([t] * 2), + encoder_hidden_states=text_embeddings, + ) + + # perform guidance (high scale from paper!) 
+ noise_pred_text, noise_pred_uncond = noise_pred.chunk(2) + + noise_pred = noise_pred_text + 100 * ( + noise_pred_text - noise_pred_uncond + ) + + # if self.cfg.weighting_strategy == "sds": + # w(t), sigma_t^2 + w = (1 - self.alphas[t]).view(-1, 1, 1, 1) + # elif self.cfg.weighting_strategy == "uniform": + # w = 1 + # elif self.cfg.weighting_strategy == "fantasia3d": + # w = (self.alphas[t] ** 0.5 * (1 - self.alphas[t])).view(-1, 1, 1, 1) + # else: + # raise ValueError( + # f"Unknown weighting strategy: {self.cfg.weighting_strategy}" + # ) + grad = w * (noise_pred - noise) * mask_rgbs + return grad + + + # def encode_imgs(self, imgs): + # # imgs: [B, 3, H, W] + + # imgs = 2 * imgs - 1 + + # posterior = self.vae.encode(imgs).latent_dist + # latents = posterior.sample() * self.vae.config.scaling_factor + + # return latents + + def prompt_to_img( + self, + prompts, + negative_prompts="", + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + if isinstance(prompts, str): + prompts = [prompts] + + if isinstance(negative_prompts, str): + negative_prompts = [negative_prompts] + + # Prompts -> text embeds + # self.get_text_embeds(prompts, negative_prompts) + + # # Text embeds -> img latents + latents = self.produce_latents( + height=height, + width=width, + latents=latents, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) # [1, 4, 64, 64] + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + @torch.no_grad() + def generate_img( + self, + emb, + height=512, + width=512, + num_inference_steps=50, + guidance_scale=7.5, + latents=None, + ): + neg_prompt = self.encode_text([""]) + self.embeddings = torch.cat([neg_prompt, emb.unsqueeze(0)], dim=0) # + # Text embeds -> img latents + latents = self.produce_latents( + height=height, + 
width=width, + latents=latents, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) # [1, 4, 64, 64] + + # Img latents -> imgs + imgs = self.decode_latents(latents) # [1, 3, 512, 512] + + # Img to Numpy + imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy() + imgs = (imgs * 255).round().astype("uint8") + + return imgs + + +def window_score(x, gamma: float = 0.6) -> torch.Tensor: + # return torch.exp(-torch.abs(gamma*x)) + return torch.cos(gamma*x) + + +# Collect similar info from attentive features for neglected concept +def sim_correction(embeddings: torch.Tensor, + correction_indices: List[int], + scores: torch.Tensor, + window: bool = True) -> torch.Tensor: + """ Embeddings shape (77, 768), computes similarity between embeddings, combine using similarity scores""" + ntk, dim = embeddings.shape + device = embeddings.device + + for i, tk in enumerate(correction_indices): + alpha = scores[i] + v = embeddings[tk].clone() + + sim = v.unsqueeze(0) * embeddings # nth,dim 77,768 + sim = torch.relu(sim) # 77,768 + + ind = torch.lt(sim, 0.5) # relu is not needed in this case + sim[ind] = 0. + sim[:tk] = 0. 
# 77, 768 + sim /= max(sim.max(), 1e-6) + + if window: + ws = window_score(torch.arange(0, ntk - tk).to(device), gamma=0.8) + ws = ws.unsqueeze(-1) # 77 - tk,1 + sim[tk:] = ws * sim[tk:] # 77, 768 + + successor = torch.sum(sim * embeddings, dim=0) + embeddings[tk] = (1 - alpha) * embeddings[tk] + alpha * successor + embeddings[tk] *= v.norm() / embeddings[tk].norm() + + return embeddings + +if __name__ == "__main__": + import torchvision, tqdm + @torch.no_grad() + def save_results(results, filename, fps=10): + video = results.permute(1, 0, 2, 3, 4) # [t, sample_num, c, h, w] + frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(video.shape[1])) for framesheet in video] #[3, 1*h, n*w] + grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w] + # already in [0,1] + grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) + # torchvision.io.write_video(filename, grid, fps=fps, video_codec='h264', options={'crf': '10'}) + imageio.mimwrite(filename, grid, format='gif') + # imageio.mimwrite(filename, grid, format='mp4', fps=8) + import argparse, os + import matplotlib.pyplot as plt + + parser = argparse.ArgumentParser() + # parser.add_argument("prompt", type=str) + # parser.add_argument("--negative", default="", type=str) + # parser.add_argument( + # "--sd_version", + # type=str, + # default="1.5", + # choices=["1.5", "2.0", "2.1"], + # help="stable diffusion version", + # ) + # parser.add_argument( + # "--hf_key", + # type=str, + # default=None, + # help="hugging face Stable diffusion model key", + # ) + parser.add_argument("--fp16", action="store_true", help="use float16 for training") + parser.add_argument( + "--vram_O", action="store_true", help="optimization for low VRAM usage" + ) + parser.add_argument("-H", type=int, default=512) + parser.add_argument("-W", type=int, default=512) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--steps", type=int, default=50) + opt = parser.parse_args() + + 
seed_everything(opt.seed) + + device = torch.device("cuda") + + zs = ZeroScope(device, opt.fp16, opt.vram_O) + # sd = ZeroScope(device, opt.fp16, opt.vram_O, opt.sd_version, opt.hf_key) + + # imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps) + + # visualize image + # plt.imshow(imgs[0]) + # plt.show() + embeddings_set = [] + embs = zs.encode_text(['one flower and one bee, the bee is flying around the flower']) + neg_prompt = zs.encode_text(["fast motion, text, watermark, copyright, blurry, nsfw"]) + # print(embs.shape, neg_prompt.shape) + embeddings_1 = torch.cat([embs, neg_prompt], dim=0) # + embeddings_set.append(embeddings_1) + + embs = zs.encode_text(['grass']) + neg_prompt = zs.encode_text(["flower, bee, fast motion, text, watermark, copyright, blurry, nsfw"]) + # print(embs.shape, neg_prompt.shape) + embeddings_2 = torch.cat([embs, neg_prompt], dim=0) # + embeddings_set.append(embeddings_2) + + # use_rgb = True + use_rgb = False + if use_rgb: + rgbs = torch.rand(1, 3, 320, 576).cuda().repeat(16, 1, 1, 1) + rgbs.requires_grad = True + optimizer = torch.optim.Adam([rgbs], lr=0.05) + for step in tqdm.tqdm(range(1000)): + optimizer.zero_grad() + loss_sds = zs.train_step(rgbs, embeddings_set) + loss_sds.backward() + optimizer.step() + if step % 100 == 0 and step > 0: + video_path = os.path.join('./output_rgb', f"sds_rgb_{step}.gif") + save_results(rgbs.data.cpu().unsqueeze(0), video_path, fps=10) + else: + rgbs = torch.randn(1, 4, 40, 72).cuda().repeat(16, 1, 1, 1) + mask_rgbs = torch.zeros(1, 4, 40, 72) + mask_rgbs[:,:,10:30,20:60] = 1 + mask_rgbs = mask_rgbs.cuda().repeat(16, 1, 1, 1) + rgbs.requires_grad = True + optimizer = torch.optim.Adam([rgbs], lr=0.05) + for step in tqdm.tqdm(range(10000)): + optimizer.zero_grad() + + loss_sds = zs.train_step(rgbs, embeddings_set[0], mask_rgbs, as_latent=True) + zs.train_step(rgbs, embeddings_set[1], 1-mask_rgbs, as_latent=True) + loss_sds.backward() + if step % 20 == 0: + tqdm.tqdm.write(f"step: 
{step}, loss_sds: {loss_sds.item()}") + # print(f"step: {step}, loss_sds: {loss_sds.item()}") + optimizer.step() + with torch.no_grad(): + if step % 100 == 0: + # if step % 100 == 0 and step > 0: + video_path = os.path.join('./output_t1', f"sds_{step}.gif") + out = zs.decode_latents(rgbs.permute(1, 0, 2, 3)[None].detach()) + save_results(out.data.cpu(), video_path, fps=10) + diff --git a/lvdm/basics.py b/lvdm/basics.py new file mode 100644 index 0000000..55572b3 --- /dev/null +++ b/lvdm/basics.py @@ -0,0 +1,100 @@ +# adopted from +# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +# and +# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +# and +# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py +# +# thanks! + +import torch.nn as nn +from utils.utils import instantiate_from_config + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. 
+ """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def nonlinearity(type='silu'): + if type == 'silu': + return nn.SiLU() + elif type == 'leaky_relu': + return nn.LeakyReLU() + + +class GroupNormSpecific(nn.GroupNorm): + def forward(self, x): + return super().forward(x).type(x.dtype) + + +def normalization(channels, num_groups=32): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNormSpecific(num_groups, channels) + + +class HybridConditioner(nn.Module): + + def __init__(self, c_concat_config, c_crossattn_config): + super().__init__() + self.concat_conditioner = instantiate_from_config(c_concat_config) + self.crossattn_conditioner = instantiate_from_config(c_crossattn_config) + + def forward(self, c_concat, c_crossattn): + c_concat = self.concat_conditioner(c_concat) + c_crossattn = self.crossattn_conditioner(c_crossattn) + return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} \ No newline at end of file diff --git a/lvdm/common.py b/lvdm/common.py new file mode 100644 index 0000000..35569b2 --- /dev/null +++ b/lvdm/common.py @@ -0,0 +1,95 @@ +import math +from inspect import isfunction +import torch +from torch import nn +import torch.distributed as dist + + +def gather_data(data, return_np=True): + ''' gather data from multiple processes to one list ''' + data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())] + dist.all_gather(data_list, data) # gather not supported with NCCL + if return_np: + data_list = [data.cpu().numpy() for data in data_list] + return data_list + +def autocast(f): + def do_autocast(*args, 
**kwargs): + with torch.cuda.amp.autocast(enabled=True, + dtype=torch.get_autocast_gpu_dtype(), + cache_enabled=torch.is_autocast_cache_enabled()): + return f(*args, **kwargs) + return do_autocast + + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + return out.reshape(b, *((1,) * (len(x_shape) - 1))) + + +def noise_like(shape, device, repeat=False): + repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) + noise = lambda: torch.randn(shape, device=device) + return repeat_noise() if repeat else noise() + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + +def exists(val): + return val is not None + +def identity(*args, **kwargs): + return nn.Identity() + +def uniq(arr): + return{el: True for el in arr}.keys() + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + +def ismap(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] > 3) + +def isimage(x): + if not isinstance(x,torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + +def shape_to_str(x): + shape_str = "x".join([str(x) for x in x.shape]) + return shape_str + +def init_(tensor): + dim = tensor.shape[-1] + std = 1 / math.sqrt(dim) + tensor.uniform_(-std, std) + return tensor + +ckpt = torch.utils.checkpoint.checkpoint +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. 
+ :param flag: if False, disable gradient checkpointing. + """ + if flag: + return ckpt(func, *inputs) + else: + return func(*inputs) + diff --git a/lvdm/distributions.py b/lvdm/distributions.py new file mode 100644 index 0000000..0b69b69 --- /dev/null +++ b/lvdm/distributions.py @@ -0,0 +1,95 @@ +import torch +import numpy as np + + +class AbstractDistribution: + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) + + def sample(self, noise=None): + if noise is None: + noise = torch.randn(self.mean.shape) + + x = self.mean + self.std * noise.to(device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum(torch.pow(self.mean, 2) + + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1,2,3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, 
logvar2): + """ + source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + torch.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) + ) diff --git a/lvdm/ema.py b/lvdm/ema.py new file mode 100644 index 0000000..c8c75af --- /dev/null +++ b/lvdm/ema.py @@ -0,0 +1,76 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError('Decay must be between 0 and 1') + + self.m_name2s_name = {} + self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) + self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates + else torch.tensor(-1,dtype=torch.int)) + + for name, p in model.named_parameters(): + if p.requires_grad: + #remove as '.'-character is not allowed in buffers + s_name = name.replace('.','') + self.m_name2s_name.update({name:s_name}) + self.register_buffer(s_name,p.clone().detach().data) + + self.collected_params = [] + + def forward(self,model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) + + one_minus_decay = 1.0 - decay + + with 
torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) + else: + assert not key in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) + else: + assert not key in self.m_name2s_name + + def store(self, parameters): + """ + Save the current parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. 
+ """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/lvdm/models/__pycache__/autoencoder.cpython-37.pyc b/lvdm/models/__pycache__/autoencoder.cpython-37.pyc new file mode 100644 index 0000000..796acd8 Binary files /dev/null and b/lvdm/models/__pycache__/autoencoder.cpython-37.pyc differ diff --git a/lvdm/models/__pycache__/ddpm3d.cpython-37.pyc b/lvdm/models/__pycache__/ddpm3d.cpython-37.pyc new file mode 100644 index 0000000..b9c4361 Binary files /dev/null and b/lvdm/models/__pycache__/ddpm3d.cpython-37.pyc differ diff --git a/lvdm/models/__pycache__/utils_diffusion.cpython-37.pyc b/lvdm/models/__pycache__/utils_diffusion.cpython-37.pyc new file mode 100644 index 0000000..7cbc65b Binary files /dev/null and b/lvdm/models/__pycache__/utils_diffusion.cpython-37.pyc differ diff --git a/lvdm/models/autoencoder.py b/lvdm/models/autoencoder.py new file mode 100644 index 0000000..cc479d8 --- /dev/null +++ b/lvdm/models/autoencoder.py @@ -0,0 +1,219 @@ +import os +from contextlib import contextmanager +import torch +import numpy as np +from einops import rearrange +import torch.nn.functional as F +import pytorch_lightning as pl +from lvdm.modules.networks.ae_modules import Encoder, Decoder +from lvdm.distributions import DiagonalGaussianDistribution +from utils.utils import instantiate_from_config + + +class AutoencoderKL(pl.LightningModule): + def __init__(self, + ddconfig, + lossconfig, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key="image", + colorize_nlabels=None, + monitor=None, + test=False, + logdir=None, + input_dim=4, + test_args=None, + ): + super().__init__() + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + assert ddconfig["double_z"] + self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, 
ddconfig["z_channels"], 1) + self.embed_dim = embed_dim + self.input_dim = input_dim + self.test = test + self.test_args = test_args + self.logdir = logdir + if colorize_nlabels is not None: + assert type(colorize_nlabels)==int + self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + if self.test: + self.init_test() + + def init_test(self,): + self.test = True + save_dir = os.path.join(self.logdir, "test") + if 'ckpt' in self.test_args: + ckpt_name = os.path.basename(self.test_args.ckpt).split('.ckpt')[0] + f'_epoch{self._cur_epoch}' + self.root = os.path.join(save_dir, ckpt_name) + else: + self.root = save_dir + if 'test_subdir' in self.test_args: + self.root = os.path.join(save_dir, self.test_args.test_subdir) + + self.root_zs = os.path.join(self.root, "zs") + self.root_dec = os.path.join(self.root, "reconstructions") + self.root_inputs = os.path.join(self.root, "inputs") + os.makedirs(self.root, exist_ok=True) + + if self.test_args.save_z: + os.makedirs(self.root_zs, exist_ok=True) + if self.test_args.save_reconstruction: + os.makedirs(self.root_dec, exist_ok=True) + if self.test_args.save_input: + os.makedirs(self.root_inputs, exist_ok=True) + assert(self.test_args is not None) + self.test_maximum = getattr(self.test_args, 'test_maximum', None) + self.count = 0 + self.eval_metrics = {} + self.decodes = [] + self.save_decode_samples = 2048 + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location="cpu") + try: + self._cur_epoch = sd['epoch'] + sd = sd["state_dict"] + except: + self._cur_epoch = 'null' + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print("Deleting key {} from state_dict.".format(k)) + del sd[k] + self.load_state_dict(sd, strict=False) + # self.load_state_dict(sd, strict=True) + print(f"Restored from {path}") + + 
def encode(self, x, **kwargs): + + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z, **kwargs): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self, input, sample_posterior=True): + posterior = self.encode(input) + if sample_posterior: + z = posterior.sample() + else: + z = posterior.mode() + dec = self.decode(z) + return dec, posterior + + def get_input(self, batch, k): + x = batch[k] + if x.dim() == 5 and self.input_dim == 4: + b,c,t,h,w = x.shape + self.b = b + self.t = t + x = rearrange(x, 'b c t h w -> (b t) c h w') + + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + + if optimizer_idx == 0: + # train encoder+decoder+logvar + aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step, + last_layer=self.get_last_layer(), split="train") + self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False) + return aeloss + + if optimizer_idx == 1: + # train the discriminator + discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step, + last_layer=self.get_last_layer(), split="train") + + self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False) + return discloss + + def validation_step(self, batch, batch_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step, + last_layer=self.get_last_layer(), split="val") + + discloss, log_dict_disc = self.loss(inputs, reconstructions, 
posterior, 1, self.global_step, + last_layer=self.get_last_layer(), split="val") + + self.log("val/rec_loss", log_dict_ae["val/rec_loss"]) + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + return self.log_dict + + def configure_optimizers(self): + lr = self.learning_rate + opt_ae = torch.optim.Adam(list(self.encoder.parameters())+ + list(self.decoder.parameters())+ + list(self.quant_conv.parameters())+ + list(self.post_quant_conv.parameters()), + lr=lr, betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(), + lr=lr, betas=(0.5, 0.9)) + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + @torch.no_grad() + def log_images(self, batch, only_inputs=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if not only_inputs: + xrec, posterior = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log["samples"] = self.decode(torch.randn_like(posterior.sample())) + log["reconstructions"] = xrec + log["inputs"] = x + return log + + def to_rgb(self, x): + assert self.image_key == "segmentation" + if not hasattr(self, "colorize"): + self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2.*(x-x.min())/(x.max()-x.min()) - 1. 
+ return x + +class IdentityFirstStage(torch.nn.Module): + def __init__(self, *args, vq_interface=False, **kwargs): + self.vq_interface = vq_interface # TODO: Should be true by default but check to not break older stuff + super().__init__() + + def encode(self, x, *args, **kwargs): + return x + + def decode(self, x, *args, **kwargs): + return x + + def quantize(self, x, *args, **kwargs): + if self.vq_interface: + return x, None, [None, None, None] + return x + + def forward(self, x, *args, **kwargs): + return x diff --git a/lvdm/models/ddpm3d.py b/lvdm/models/ddpm3d.py new file mode 100644 index 0000000..a9432f3 --- /dev/null +++ b/lvdm/models/ddpm3d.py @@ -0,0 +1,766 @@ +""" +wild mixture of +https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py +https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +https://github.com/CompVis/taming-transformers +-- merci +""" + +from functools import partial +from contextlib import contextmanager +import numpy as np +from tqdm import tqdm +from einops import rearrange, repeat +import logging +mainlogger = logging.getLogger('mainlogger') +import torch +import torch.nn as nn +from torchvision.utils import make_grid +import pytorch_lightning as pl +from utils.utils import instantiate_from_config +from lvdm.ema import LitEma +from lvdm.distributions import DiagonalGaussianDistribution +from lvdm.models.utils_diffusion import make_beta_schedule +from lvdm.modules.encoders.ip_resampler import ImageProjModel, Resampler +from lvdm.basics import disabled_train +from lvdm.common import ( + extract_into_tensor, + noise_like, + exists, + default +) + + +__conditioning_keys__ = {'concat': 'c_concat', + 'crossattn': 'c_crossattn', + 'adm': 'y'} + +class DDPM(pl.LightningModule): + # classic DDPM with Gaussian diffusion, in image space + def __init__(self, + 
unet_config, + timesteps=1000, + beta_schedule="linear", + loss_type="l2", + ckpt_path=None, + ignore_keys=[], + load_only_unet=False, + monitor=None, + use_ema=True, + first_stage_key="image", + image_size=256, + channels=3, + log_every_t=100, + clip_denoised=True, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + given_betas=None, + original_elbo_weight=0., + v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta + l_simple_weight=1., + conditioning_key=None, + parameterization="eps", # all assuming fixed variance schedules + scheduler_config=None, + use_positional_encodings=False, + learn_logvar=False, + logvar_init=0. + ): + super().__init__() + assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"' + self.parameterization = parameterization + mainlogger.info(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode") + self.cond_stage_model = None + self.clip_denoised = clip_denoised + self.log_every_t = log_every_t + self.first_stage_key = first_stage_key + self.channels = channels + self.temporal_length = unet_config.params.temporal_length + self.image_size = image_size + if isinstance(self.image_size, int): + self.image_size = [self.image_size, self.image_size] + self.use_positional_encodings = use_positional_encodings + self.model = DiffusionWrapper(unet_config, conditioning_key) + self.use_ema = use_ema + if self.use_ema: + self.model_ema = LitEma(self.model) + mainlogger.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.") + + self.use_scheduler = scheduler_config is not None + if self.use_scheduler: + self.scheduler_config = scheduler_config + + self.v_posterior = v_posterior + self.original_elbo_weight = original_elbo_weight + self.l_simple_weight = l_simple_weight + + if monitor is not None: + self.monitor = monitor + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet) + + 
self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps, + linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s) + + self.loss_type = loss_type + + self.learn_logvar = learn_logvar + self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,)) + if self.learn_logvar: + self.logvar = nn.Parameter(self.logvar, requires_grad=True) + + + def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000, + linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): + if exists(given_betas): + betas = given_betas + else: + betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, + cosine_s=cosine_s) + alphas = 1. - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + posterior_variance = (1 - self.v_posterior) * betas * (1. 
- alphas_cumprod_prev) / ( + 1. - alphas_cumprod) + self.v_posterior * betas + # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) + self.register_buffer('posterior_variance', to_torch(posterior_variance)) + # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain + self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20)))) + self.register_buffer('posterior_mean_coef1', to_torch( + betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))) + self.register_buffer('posterior_mean_coef2', to_torch( + (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod))) + + if self.parameterization == "eps": + lvlb_weights = self.betas ** 2 / ( + 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)) + elif self.parameterization == "x0": + lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod)) + else: + raise NotImplementedError("mu not supported") + # TODO how to choose this term + lvlb_weights[0] = lvlb_weights[1] + self.register_buffer('lvlb_weights', lvlb_weights, persistent=False) + assert not torch.isnan(self.lvlb_weights).all() + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.model.parameters()) + self.model_ema.copy_to(self.model) + if context is not None: + mainlogger.info(f"{context}: Switched to EMA weights") + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.model.parameters()) + if context is not None: + mainlogger.info(f"{context}: Restored training weights") + + def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + sd = torch.load(path, map_location="cpu") + if "state_dict" in list(sd.keys()): + sd = sd["state_dict"] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + mainlogger.info("Deleting key {} from 
state_dict.".format(k)) + del sd[k] + missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict( + sd, strict=False) + mainlogger.info(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys") + if len(missing) > 0: + mainlogger.info(f"Missing Keys: {missing}") + if len(unexpected) > 0: + mainlogger.info(f"Unexpected Keys: {unexpected}") + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start) + variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) + return mean, variance, log_variance + + def predict_start_from_noise(self, x_t, t, noise): + return ( + extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - + extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise + ) + + def q_posterior(self, x_start, x_t, t): + posterior_mean = ( + extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, x, t, clip_denoised: bool): + model_out = self.model(x, t) + if self.parameterization == "eps": + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == "x0": + x_recon = model_out + if clip_denoised: 
+ x_recon.clamp_(-1., 1.) + + model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t) + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, x, t, clip_denoised=True, repeat_noise=False): + b, *_, device = *x.shape, x.device + model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised) + noise = noise_like(x.shape, device, repeat_noise) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) + return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise + + @torch.no_grad() + def p_sample_loop(self, shape, return_intermediates=False): + device = self.betas.device + b = shape[0] + img = torch.randn(shape, device=device) + intermediates = [img] + for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps): + img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long), + clip_denoised=self.clip_denoised) + if i % self.log_every_t == 0 or i == self.num_timesteps - 1: + intermediates.append(img) + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample(self, batch_size=16, return_intermediates=False): + image_size = self.image_size + channels = self.channels + return self.p_sample_loop((batch_size, channels, image_size, image_size), + return_intermediates=return_intermediates) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start * + extract_into_tensor(self.scale_arr, t, x_start.shape) + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) + + def get_input(self, batch, k): + x = batch[k] + x = x.to(memory_format=torch.contiguous_format).float() + return x + + def _get_rows_from_list(self, samples): + 
n_imgs_per_row = len(samples) + denoise_grid = rearrange(samples, 'n b c h w -> b n c h w') + denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + @torch.no_grad() + def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs): + log = dict() + x = self.get_input(batch, self.first_stage_key) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + x = x.to(self.device)[:N] + log["inputs"] = x + + # get diffusion row + diffusion_row = list() + x_start = x[:n_row] + + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(x_start) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + diffusion_row.append(x_noisy) + + log["diffusion_row"] = self._get_rows_from_list(diffusion_row) + + if sample: + # get denoise row + with self.ema_scope("Plotting"): + samples, denoise_row = self.sample(batch_size=N, return_intermediates=True) + + log["samples"] = samples + log["denoise_row"] = self._get_rows_from_list(denoise_row) + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + +class LatentDiffusion(DDPM): + """main class""" + def __init__(self, + first_stage_config, + cond_stage_config, + num_timesteps_cond=None, + cond_stage_key="caption", + cond_stage_trainable=False, + cond_stage_forward=None, + conditioning_key=None, + uncond_prob=0.2, + uncond_type="empty_seq", + scale_factor=1.0, + scale_by_std=False, + encoder_type="2d", + only_model=False, + use_scale=False, + scale_a=1, + scale_b=0.3, + mid_step=400, + fix_scale_bug=False, + *args, **kwargs): + self.num_timesteps_cond = default(num_timesteps_cond, 1) + self.scale_by_std = scale_by_std + assert self.num_timesteps_cond <= 
kwargs['timesteps'] + # for backwards compatibility after implementation of DiffusionWrapper + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", []) + conditioning_key = default(conditioning_key, 'crossattn') + super().__init__(conditioning_key=conditioning_key, *args, **kwargs) + + self.cond_stage_trainable = cond_stage_trainable + self.cond_stage_key = cond_stage_key # 'caption' + + # scale factor + self.use_scale=use_scale + if self.use_scale: + self.scale_a=scale_a + self.scale_b=scale_b + if fix_scale_bug: + scale_step=self.num_timesteps-mid_step + else: #bug + scale_step = self.num_timesteps + + scale_arr1 = np.linspace(scale_a, scale_b, mid_step) + scale_arr2 = np.full(scale_step, scale_b) + scale_arr = np.concatenate((scale_arr1, scale_arr2)) + scale_arr_prev = np.append(scale_a, scale_arr[:-1]) + to_torch = partial(torch.tensor, dtype=torch.float32) + self.register_buffer('scale_arr', to_torch(scale_arr)) + + try: + self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1 + except: + self.num_downs = 0 + if not scale_by_std: + self.scale_factor = scale_factor + else: + self.register_buffer('scale_factor', torch.tensor(scale_factor)) + self.instantiate_first_stage(first_stage_config) + self.instantiate_cond_stage(cond_stage_config) + self.first_stage_config = first_stage_config + self.cond_stage_config = cond_stage_config + self.clip_denoised = False + + self.cond_stage_forward = cond_stage_forward + self.encoder_type = encoder_type + assert(encoder_type in ["2d", "3d"]) + self.uncond_prob = uncond_prob + self.classifier_free_guidance = True if uncond_prob > 0 else False + assert(uncond_type in ["zero_embed", "empty_seq"]) + self.uncond_type = uncond_type + + + self.restarted_from_ckpt = False + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys, only_model=only_model) + self.restarted_from_ckpt = True + + + def make_cond_schedule(self, ): + self.cond_ids = 
torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long) + ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long() + self.cond_ids[:self.num_timesteps_cond] = ids + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + if self.use_scale: + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start * + extract_into_tensor(self.scale_arr, t, x_start.shape) + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) + else: + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) + + + def _freeze_model(self): + for name, para in self.model.diffusion_model.named_parameters(): + para.requires_grad = False + + def instantiate_first_stage(self, config): + model = instantiate_from_config(config) + self.first_stage_model = model.eval() + self.first_stage_model.train = disabled_train + for param in self.first_stage_model.parameters(): + param.requires_grad = False + + def instantiate_cond_stage(self, config): + if not self.cond_stage_trainable: + model = instantiate_from_config(config) + self.cond_stage_model = model.eval() + self.cond_stage_model.train = disabled_train + for param in self.cond_stage_model.parameters(): + param.requires_grad = False + else: + model = instantiate_from_config(config) + self.cond_stage_model = model + + def get_learned_conditioning(self, c): + if self.cond_stage_forward is None: + if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode): + c = self.cond_stage_model.encode(c) + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + else: + c = self.cond_stage_model(c) + else: + assert hasattr(self.cond_stage_model, self.cond_stage_forward) + c = getattr(self.cond_stage_model, self.cond_stage_forward)(c) + return c + + 
### @torch.no_grad()
def encode_first_stage(self, x):
    """Encode ``x`` with the first-stage model and return scaled latents.

    When ``self.encoder_type`` is ``"2d"`` and ``x`` is 5-D, the tensor is
    rearranged from ``b c t h w`` to ``(b t) c h w`` before encoding and the
    result is rearranged back to ``b c t h w`` afterwards. In every other
    case ``x`` is encoded as given.
    """
    fold_frames = self.encoder_type == "2d" and x.dim() == 5
    if fold_frames:
        batch, frames = x.shape[0], x.shape[2]
        x = rearrange(x, 'b c t h w -> (b t) c h w')

    posterior = self.first_stage_model.encode(x)
    latents = self.get_first_stage_encoding(posterior)  ###.detach()

    if not fold_frames:
        return latents
    return rearrange(latents, '(b t) c h w -> b c t h w', b=batch, t=frames)
def apply_model(self, x_noisy, t, cond, **kwargs):
    """Run the wrapped diffusion model on ``x_noisy`` at timestep ``t``.

    ``cond`` may already be a dict of keyword conditionings (hybrid case),
    in which case it is forwarded unchanged. Any other value is wrapped in
    a list (unless it already is one) and passed as ``c_concat`` when the
    model's ``conditioning_key`` is ``'concat'``, else as ``c_crossattn``.

    If the model returns a tuple, only its first element is returned.
    """
    if not isinstance(cond, dict):
        cond_list = cond if isinstance(cond, list) else [cond]
        if self.model.conditioning_key == 'concat':
            cond = {'c_concat': cond_list}
        else:
            cond = {'c_crossattn': cond_list}

    prediction = self.model(x_noisy, t, **cond, **kwargs)
    return prediction[0] if isinstance(prediction, tuple) else prediction
def p_mean_variance(self, x, c, t, clip_denoised: bool, return_x0=False, score_corrector=None,
                    corrector_kwargs=None, **kwargs):
    """Compute the posterior mean and variance for one reverse step.

    Args:
        x: current noisy sample.
        c: conditioning forwarded to ``self.apply_model``.
        t: timesteps forwarded to the model and to ``self.q_posterior``.
        clip_denoised: if true, clamp the reconstructed x0 to ``[-1, 1]``
            in place.
        return_x0: if true, also return the reconstructed x0.
        score_corrector: optional object whose ``modify_score`` adjusts the
            model output; only allowed with ``"eps"`` parameterization.
        corrector_kwargs: optional keyword arguments for ``modify_score``;
            ``None`` is treated as "no extra arguments".
        **kwargs: forwarded to ``self.apply_model``.

    Returns:
        ``(model_mean, posterior_variance, posterior_log_variance)`` and,
        when ``return_x0`` is true, the reconstructed x0 as a fourth item.

    Raises:
        NotImplementedError: for a parameterization other than ``"eps"``
            or ``"x0"``.
    """
    model_out = self.apply_model(x, t, c, **kwargs)

    if score_corrector is not None:
        assert self.parameterization == "eps"
        # corrector_kwargs defaults to None; unpacking None with ** would
        # raise TypeError, so fall back to an empty mapping.
        model_out = score_corrector.modify_score(self, model_out, x, t, c, **(corrector_kwargs or {}))

    if self.parameterization == "eps":
        x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
    elif self.parameterization == "x0":
        x_recon = model_out
    else:
        raise NotImplementedError()

    if clip_denoised:
        x_recon.clamp_(-1., 1.)

    model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)

    if return_x0:
        return model_mean, posterior_variance, posterior_log_variance, x_recon
    return model_mean, posterior_variance, posterior_log_variance
model_log_variance).exp() * noise + + @torch.no_grad() + def p_sample_loop(self, cond, shape, return_intermediates=False, x_T=None, verbose=True, callback=None, \ + timesteps=None, mask=None, x0=None, img_callback=None, start_T=None, log_every_t=None, **kwargs): + + if not log_every_t: + log_every_t = self.log_every_t + device = self.betas.device + b = shape[0] + # sample an initial noise + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + intermediates = [img] + if timesteps is None: + timesteps = self.num_timesteps + if start_T is not None: + timesteps = min(timesteps, start_T) + + iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(range(0, timesteps)) + + if mask is not None: + assert x0 is not None + assert x0.shape[2:3] == mask.shape[2:3] # spatial size has to match + + for i in iterator: + ts = torch.full((b,), i, device=device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != 'hybrid' + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img = self.p_sample(img, cond, ts, clip_denoised=self.clip_denoised, **kwargs) + if mask is not None: + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1. 
class DiffusionWrapper(pl.LightningModule):
    """Route conditioning inputs to the wrapped diffusion model.

    ``conditioning_key`` selects how ``forward`` combines ``x`` with the
    conditioning: channel-wise concatenation (``c_concat``), cross-attention
    context (``c_crossattn``), a ``y`` vector, or extra ``s`` / ``mask``
    arguments.
    """

    def __init__(self, diff_model_config, conditioning_key):
        super().__init__()
        self.diffusion_model = instantiate_from_config(diff_model_config)
        self.conditioning_key = conditioning_key

    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None,
                c_adm=None, s=None, mask=None, **kwargs):
        """Call the diffusion model according to ``self.conditioning_key``.

        Only the ``'concat'`` and ``'crossattn'`` modes forward ``**kwargs``
        to the diffusion model; the remaining modes pass a fixed argument
        set.

        Raises:
            NotImplementedError: if ``conditioning_key`` is not one of the
                supported modes.
        """
        mode = self.conditioning_key
        net = self.diffusion_model

        def concat_channels():
            # Conditioning tensors are stacked onto x along dim 1.
            return torch.cat([x] + c_concat, dim=1)

        def concat_context():
            return torch.cat(c_crossattn, 1)

        if mode is None:
            return net(x, t)

        if mode == 'concat':
            return net(concat_channels(), t, **kwargs)

        if mode == 'crossattn':
            return net(x, t, context=concat_context(), **kwargs)

        if mode == 'hybrid':
            ## inputs are [b,c,t,h,w]: concatenate in the channel dim
            xc = concat_channels()
            cc = concat_context()
            return net(xc, t, context=cc)

        if mode == 'resblockcond':
            return net(x, t, context=c_crossattn[0])

        if mode == 'adm':
            return net(x, t, y=c_crossattn[0])

        if mode == 'hybrid-adm':
            assert c_adm is not None
            xc = concat_channels()
            cc = concat_context()
            return net(xc, t, context=cc, y=c_adm)

        if mode == 'hybrid-time':
            assert s is not None
            xc = concat_channels()
            cc = concat_context()
            return net(xc, t, context=cc, s=s)

        if mode == 'concat-time-mask':
            return net(concat_channels(), t, context=None, s=s, mask=mask)

        if mode == 'concat-adm-mask':
            # c_concat is optional here; without it x is used as-is.
            xc = concat_channels() if c_concat is not None else x
            return net(xc, t, context=None, y=s, mask=mask)

        if mode == 'hybrid-adm-mask':
            cc = concat_context()
            xc = concat_channels() if c_concat is not None else x
            return net(xc, t, context=cc, y=s, mask=mask)

        if mode == 'hybrid-time-adm':  # adm means y, e.g., class index
            assert c_adm is not None
            xc = concat_channels()
            cc = concat_context()
            return net(xc, t, context=cc, s=s, y=c_adm)

        raise NotImplementedError()
self.model.use_scale + print('DDIM scale', self.use_scale) + + if self.use_scale: + self.register_buffer('scale_arr', to_torch(self.model.scale_arr)) + ddim_scale_arr = self.scale_arr.cpu()[self.ddim_timesteps] + self.register_buffer('ddim_scale_arr', ddim_scale_arr) + ddim_scale_arr = np.asarray([self.scale_arr.cpu()[0]] + self.scale_arr.cpu()[self.ddim_timesteps[:-1]].tolist()) + self.register_buffer('ddim_scale_arr_prev', ddim_scale_arr) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta,verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. 
@torch.no_grad()
def sample(self,
           S,
           batch_size,
           shape,
           conditioning=None,
           callback=None,
           normals_sequence=None,
           img_callback=None,
           quantize_x0=False,
           eta=0.,
           mask=None,
           x0=None,
           temperature=1.,
           noise_dropout=0.,
           score_corrector=None,
           corrector_kwargs=None,
           verbose=True,
           schedule_verbose=False,
           x_T=None,
           log_every_t=100,
           unconditional_guidance_scale=1.,
           unconditional_conditioning=None,
           # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
           **kwargs
           ):
    """Build the DDIM schedule for ``S`` steps and run ``ddim_sampling``.

    Args:
        S: number of DDIM steps passed to ``make_schedule``.
        batch_size: leading dimension of the sampled tensor.
        shape: per-sample shape, either ``(C, H, W)`` or ``(C, T, H, W)``.
        conditioning: optional tensor or dict of conditionings; its batch
            size is compared with ``batch_size`` and a warning is printed
            on mismatch.
        normals_sequence: accepted but not used by this method.
        **kwargs: forwarded to ``ddim_sampling``.

    The remaining arguments are forwarded to ``ddim_sampling`` (with
    ``quantize_x0`` passed as ``quantize_denoised``).

    Returns:
        The ``(samples, intermediates)`` pair produced by ``ddim_sampling``.

    Raises:
        ValueError: if ``shape`` does not have 3 or 4 entries.
    """
    # check condition bs
    if conditioning is not None:
        if isinstance(conditioning, dict):
            first_cond = conditioning[list(conditioning.keys())[0]]
            # Values may be tensors or containers of tensors; only fall
            # back to the first element when there is no ``shape`` to read.
            if not hasattr(first_cond, "shape"):
                first_cond = first_cond[0]
            cbs = first_cond.shape[0]
            if cbs != batch_size:
                print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
        else:
            if conditioning.shape[0] != batch_size:
                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

    self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=schedule_verbose)

    # make shape
    if len(shape) == 3:
        C, H, W = shape
        size = (batch_size, C, H, W)
    elif len(shape) == 4:
        C, T, H, W = shape
        size = (batch_size, C, T, H, W)
    else:
        # Previously ``size`` was simply left unbound for other lengths.
        raise ValueError(f"shape must have 3 (C, H, W) or 4 (C, T, H, W) entries, got {tuple(shape)}")

    samples, intermediates = self.ddim_sampling(conditioning, size,
                                                callback=callback,
                                                img_callback=img_callback,
                                                quantize_denoised=quantize_x0,
                                                mask=mask, x0=x0,
                                                ddim_use_original_steps=False,
                                                noise_dropout=noise_dropout,
                                                temperature=temperature,
                                                score_corrector=score_corrector,
                                                corrector_kwargs=corrector_kwargs,
                                                x_T=x_T,
                                                log_every_t=log_every_t,
                                                unconditional_guidance_scale=unconditional_guidance_scale,
                                                unconditional_conditioning=unconditional_conditioning,
                                                verbose=verbose,
                                                **kwargs)
    return samples, intermediates
@torch.no_grad()
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                  temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                  unconditional_guidance_scale=1., unconditional_conditioning=None,
                  uc_type=None, conditional_guidance_scale_temporal=None, **kwargs):
    """Perform one DDIM update and return ``(x_prev, pred_x0)``.

    Args:
        x: current sample.
        c: conditioning (tensor or dict when guidance is used).
        t: timesteps passed to the model.
        index: position in the schedule arrays for this step.
        repeat_noise: forwarded to ``noise_like``.
        use_original_steps: read the model's full schedule instead of the
            DDIM sub-schedule registered on this sampler.
        quantize_denoised: pass the x0 prediction through
            ``first_stage_model.quantize``.
        temperature: multiplier on the sampled noise.
        noise_dropout: dropout probability applied to the noise when > 0.
        score_corrector / corrector_kwargs: optional score modifier and its
            keyword arguments (``None`` means no extra arguments).
        unconditional_guidance_scale / unconditional_conditioning: enable
            classifier-free guidance when a conditioning is given and the
            scale differs from 1.
        uc_type: ``None``, ``'cfg_original'`` or ``'cfg_ours'``; selects
            the guidance formula.
        conditional_guidance_scale_temporal: if set, adds guidance from the
            difference between the conditional prediction and one computed
            with ``no_temporal_attn=True``.
        **kwargs: forwarded to ``self.model.apply_model``.

    Raises:
        NotImplementedError: for an unsupported ``c`` type or ``uc_type``
            when guidance is active.
    """
    b, device = x.shape[0], x.device

    if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
        e_t = self.model.apply_model(x, t, c, **kwargs)  # unet denoiser
    else:
        # with unconditional condition
        if not isinstance(c, (torch.Tensor, dict)):
            raise NotImplementedError
        e_t_cond = self.model.apply_model(x, t, c, **kwargs)
        e_t_uncond = self.model.apply_model(x, t, unconditional_conditioning, **kwargs)

        # text cfg
        if uc_type is None:
            e_t = e_t_uncond + unconditional_guidance_scale * (e_t_cond - e_t_uncond)
        elif uc_type == 'cfg_original':
            e_t = e_t_cond + unconditional_guidance_scale * (e_t_cond - e_t_uncond)
        elif uc_type == 'cfg_ours':
            e_t = e_t_cond + unconditional_guidance_scale * (e_t_uncond - e_t_cond)
        else:
            raise NotImplementedError

        # temporal guidance
        if conditional_guidance_scale_temporal is not None:
            # The full-attention prediction is the conditional output already
            # computed above (same inputs), so it is reused, not recomputed.
            e_t_image = self.model.apply_model(x, t, c, no_temporal_attn=True, **kwargs)
            e_t = e_t + conditional_guidance_scale_temporal * (e_t_cond - e_t_image)

    if score_corrector is not None:
        assert self.model.parameterization == "eps"
        # corrector_kwargs defaults to None, which cannot be unpacked with **.
        e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **(corrector_kwargs or {}))

    if use_original_steps:
        alphas = self.model.alphas_cumprod
        alphas_prev = self.model.alphas_cumprod_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod
        # make_schedule registers this buffer on the sampler, not the model.
        sigmas = self.ddim_sigmas_for_original_num_steps
    else:
        alphas = self.ddim_alphas
        alphas_prev = self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.ddim_sqrt_one_minus_alphas
        sigmas = self.ddim_sigmas

    # select parameters corresponding to the currently considered timestep;
    # one broadcastable entry per batch element: (b, 1, 1, 1) for 4-D
    # inputs and (b, 1, 1, 1, 1) for 5-D video inputs.
    size = (b,) + (1,) * (x.dim() - 1)
    a_t = torch.full(size, alphas[index], device=device)
    a_prev = torch.full(size, alphas_prev[index], device=device)
    sigma_t = torch.full(size, sigmas[index], device=device)
    sqrt_one_minus_at = torch.full(size, sqrt_one_minus_alphas[index], device=device)

    # current prediction for x_0
    pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
    if quantize_denoised:
        pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
    # direction pointing to x_t
    dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t

    noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
    if noise_dropout > 0.:
        noise = torch.nn.functional.dropout(noise, p=noise_dropout)

    if self.use_scale:
        if use_original_steps:
            scale_arr = self.model.scale_arr
            scale_arr_prev = getattr(self.model, 'scale_arr_prev', None)
            if scale_arr_prev is None:
                # The model may expose only ``scale_arr``; rebuild the
                # shifted array by repeating its first entry.
                scale_arr_prev = torch.cat([scale_arr[:1], scale_arr[:-1]])
        else:
            scale_arr = self.ddim_scale_arr
            scale_arr_prev = self.ddim_scale_arr_prev
        scale_t = torch.full(size, scale_arr[index], device=device)
        scale_t_prev = torch.full(size, scale_arr_prev[index], device=device)
        pred_x0 /= scale_t
        x_prev = a_prev.sqrt() * scale_t_prev * pred_x0 + dir_xt + noise
    else:
        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise

    return x_prev, pred_x0
timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc='Decoding image', total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long) + x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning) + return x_dec + diff --git a/lvdm/models/utils_diffusion.py b/lvdm/models/utils_diffusion.py new file mode 100644 index 0000000..603fa81 --- /dev/null +++ b/lvdm/models/utils_diffusion.py @@ -0,0 +1,104 @@ +import math +import numpy as np +from einops import repeat +import torch +import torch.nn.functional as F + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. 
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Return a beta schedule as a float64 NumPy array of length ``n_timestep``.

    Args:
        schedule: one of ``"linear"``, ``"cosine"``, ``"sqrt_linear"`` or
            ``"sqrt"``.
        n_timestep: number of betas to produce.
        linear_start: first value for the linspace-based schedules.
        linear_end: last value for the linspace-based schedules.
        cosine_s: offset used by the cosine schedule.

    Schedules:
        * ``"linear"``: linspace between the square roots of the bounds,
          then squared.
        * ``"cosine"``: betas derived from a squared-cosine curve, clamped
          to ``[0, 0.999]``.
        * ``"sqrt_linear"``: plain linspace between the bounds.
        * ``"sqrt"``: square root of a plain linspace between the bounds.

    Raises:
        ValueError: if ``schedule`` is not a known name.
    """
    if schedule == "linear":
        betas = (
            torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )

    elif schedule == "cosine":
        timesteps = (
            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
        alphas = alphas / alphas[0]
        betas = 1 - alphas[1:] / alphas[:-1]
        # Clamp with torch so ``betas`` stays a tensor for the ``.numpy()``
        # call below, rather than relying on how np.clip handles tensors.
        betas = torch.clamp(betas, min=0, max=0.999)

    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")
    return betas.numpy()
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    """Select the DDIM alphas and compute the matching sigma schedule.

    Args:
        alphacums: indexable sequence of cumulative alpha products; it is
            indexed with ``ddim_timesteps`` and the indexed result must
            support ``.tolist()``.
        ddim_timesteps: indices of the timesteps used by the DDIM sampler.
        eta: multiplier on the sigma formula (``0`` gives all-zero sigmas).
        verbose: when true, print the selected alphas and sigmas.

    Returns:
        ``(sigmas, alphas, alphas_prev)`` where ``alphas_prev`` is
        ``alphas`` shifted by one step with ``alphacums[0]`` prepended.
    """
    # select alphas for computing the variance schedule
    # print(f'ddim_timesteps={ddim_timesteps}, len_alphacums={len(alphacums)}')
    alphas = alphacums[ddim_timesteps]
    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())

    # according to the formula provided in https://arxiv.org/abs/2010.02502
    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
    if verbose:
        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
        print(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev
class RelativePosition(nn.Module):
    """Learned embeddings indexed by clipped relative position.

    Adapted from
    https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py
    """

    def __init__(self, num_units, max_relative_position):
        super().__init__()
        self.num_units = num_units
        self.max_relative_position = max_relative_position
        # One row per offset in [-max_relative_position, max_relative_position].
        table = torch.Tensor(2 * max_relative_position + 1, num_units)
        self.embeddings_table = nn.Parameter(table)
        nn.init.xavier_uniform_(self.embeddings_table)

    def forward(self, length_q, length_k):
        """Return a ``(length_q, length_k, num_units)`` embedding tensor.

        Entry ``[i, j]`` is the table row for the offset ``j - i`` clipped
        to ``[-max_relative_position, max_relative_position]``.
        """
        limit = self.max_relative_position
        table = self.embeddings_table
        query_pos = torch.arange(length_q, device=table.device)
        key_pos = torch.arange(length_k, device=table.device)
        offsets = key_pos[None, :] - query_pos[:, None]
        # Shift clipped offsets into [0, 2 * limit] so they index the table.
        row_index = (torch.clamp(offsets, -limit, limit) + limit).long()
        return table[row_index]
rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) + sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale + if self.relative_position: + len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1] + k2 = self.relative_position_k(len_q, len_k) + sim2 = einsum('b t d, t s d -> b t s', q, k2) * self.scale # TODO check + sim += sim2 + del k + + if exists(mask): + ## feasible for causal attention mask only + max_neg_value = -torch.finfo(sim.dtype).max + mask = repeat(mask, 'b i j -> (b h) i j', h=h) + sim.masked_fill_(~(mask>0.5), max_neg_value) + + # attention, what we cannot get enough of + sim = sim.softmax(dim=-1) + out = torch.einsum('b i j, b j d -> b i d', sim, v) + if self.relative_position: + v2 = self.relative_position_v(len_q, len_v) + out2 = einsum('b t s, t s d -> b t d', sim, v2) # TODO check + out += out2 + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + + ## considering image token additionally + if context is not None and self.img_cross_attention: + k_ip, v_ip = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (k_ip, v_ip)) + sim_ip = torch.einsum('b i d, b j d -> b i j', q, k_ip) * self.scale + del k_ip + sim_ip = sim_ip.softmax(dim=-1) + out_ip = torch.einsum('b i j, b j d -> b i d', sim_ip, v_ip) + out_ip = rearrange(out_ip, '(b h) n d -> b n (h d)', h=h) + out = out + self.image_cross_attention_scale * out_ip + del q + + return self.to_out(out) + + def efficient_forward(self, x, context=None, mask=None): + q = self.to_q(x) + context = default(context, x) + + ## considering image token additionally + if context is not None and self.img_cross_attention: + context, context_img = context[:,:self.text_context_len,:], context[:,self.text_context_len:,:] + k = self.to_k(context) + v = self.to_v(context) + k_ip = self.to_k_ip(context_img) + v_ip = self.to_v_ip(context_img) + else: + k = self.to_k(context) + v = self.to_v(context) + + b, _, _ = q.shape + q, k, v = map( + lambda t: t.unsqueeze(3) + .reshape(b, t.shape[1], 
class BasicTransformerBlock(nn.Module):
    """Transformer block: two attention layers and a feed-forward layer.

    Each sub-layer is applied to a LayerNorm-ed input and added back to its
    input (residual connection). ``attn1`` attends to ``context`` only when
    ``disable_self_attn`` is set; otherwise it receives ``context=None``.
    ``attn2`` always receives ``context``.

    Args:
        dim: channel width of the block's input and output.
        n_heads: number of attention heads.
        d_head: width of each attention head.
        dropout: dropout rate forwarded to the attention and feed-forward layers.
        context_dim: width of the conditioning context (``None`` for none).
        gated_ff: forwarded to ``FeedForward`` as ``glu``.
        checkpoint: flag forwarded to the ``checkpoint`` helper in ``forward``.
        disable_self_attn: if True, ``attn1`` is built with ``context_dim``
            and is given the context at call time.
        attention_cls: attention class/factory; defaults to ``CrossAttention``.
        img_cross_attention: forwarded to ``attn2`` only.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
                 disable_self_attn=False, attention_cls=None, img_cross_attention=False):
        super().__init__()
        attn_cls = CrossAttention if attention_cls is None else attention_cls
        self.disable_self_attn = disable_self_attn
        self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                              context_dim=context_dim if self.disable_self_attn else None)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                              img_cross_attention=img_cross_attention)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None, mask=None):
        """Run ``_forward`` through the ``checkpoint`` helper.

        Implementation trick: the checkpoint helper doesn't support non-tensor
        (e.g. None or scalar) arguments, so only the tensors that are actually
        present are placed in the input tuple.

        When ``mask`` is given it is bound with ``partial`` and only ``x`` is
        forwarded; ``context`` is NOT passed to ``_forward`` on that path.
        """
        if mask is not None:
            forward_mask = partial(self._forward, mask=mask)
            return checkpoint(forward_mask, (x,), self.parameters(), self.checkpoint)
        # Must be a real tuple: (x,) and not (x), otherwise *input_tuple would
        # unpack x itself into multiple arguments.
        input_tuple = (x,) if context is None else (x, context)
        return checkpoint(self._forward, input_tuple, self.parameters(), self.checkpoint)

    def _forward(self, x, context=None, mask=None):
        """Apply attn1, attn2 and the feed-forward layer, each with a residual."""
        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None, mask=mask) + x
        x = self.attn2(self.norm2(x), context=context, mask=mask) + x
        x = self.ff(self.norm3(x)) + x
        return x
class TemporalTransformer(nn.Module):
    """
    Transformer block for image-like data in temporal axis.
    First, reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image

    Input and output of ``forward`` are 5-D tensors unpacked as
    ``(b, c, t, h, w)``; the output is the transformed tensor plus the input
    (residual connection).

    Args:
        in_channels: channel count ``c`` of the input.
        n_heads, d_head: attention heads and per-head width; the inner width
            is ``n_heads * d_head``.
        depth: number of ``BasicTransformerBlock`` layers.
        dropout: dropout rate forwarded to the blocks.
        context_dim: context width; forced to ``None`` when ``only_self_att``.
        use_checkpoint: forwarded to the blocks as ``checkpoint``.
        use_linear: use ``nn.Linear`` projections instead of 1x1 ``nn.Conv1d``.
        only_self_att: if True, ``context`` is ignored in ``forward``.
        causal_attention: if True, a lower-triangular mask of size
            ``temporal_length`` is applied (self-attention path only).
        relative_position: if True, blocks use ``CrossAttention`` with
            relative position enabled (requires ``temporal_length``).
        temporal_length: required by ``causal_attention`` / ``relative_position``.
    """
    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
                 use_checkpoint=True, use_linear=False, only_self_att=True, causal_attention=False,
                 relative_position=False, temporal_length=None):
        super().__init__()
        self.only_self_att = only_self_att
        self.relative_position = relative_position
        self.causal_attention = causal_attention
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
        if not use_linear:
            self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
        else:
            self.proj_in = nn.Linear(in_channels, inner_dim)

        if relative_position:
            assert(temporal_length is not None)
            attention_cls = partial(CrossAttention, relative_position=True, temporal_length=temporal_length)
        else:
            attention_cls = None
        if self.causal_attention:
            assert(temporal_length is not None)
            # Plain tensor attribute (not a registered buffer / parameter);
            # forward() moves it to the input's device on every call.
            self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))

        if self.only_self_att:
            context_dim = None
        self.transformer_blocks = nn.ModuleList([
            BasicTransformerBlock(
                inner_dim,
                n_heads,
                d_head,
                dropout=dropout,
                context_dim=context_dim,
                attention_cls=attention_cls,
                checkpoint=use_checkpoint) for d in range(depth)
        ])
        if not use_linear:
            self.proj_out = zero_module(nn.Conv1d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
        else:
            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
        self.use_linear = use_linear

    def forward(self, x, context=None):
        """Attend along the ``t`` axis independently for every (b, h, w) location.

        Args:
            x: tensor unpacked as ``(b, c, t, h, w)``.
            context: only read when ``only_self_att`` is False; it is
                rearranged with the pattern ``'(b t) l con -> b t l con'``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        b, c, t, h, w = x.shape
        x_in = x
        x = self.norm(x)
        x = rearrange(x, 'b c t h w -> (b h w) c t').contiguous()
        # Conv1d projects in channel-first layout, Linear in channel-last layout.
        if not self.use_linear:
            x = self.proj_in(x)
        x = rearrange(x, 'bhw c t -> bhw t c').contiguous()
        if self.use_linear:
            x = self.proj_in(x)

        if self.causal_attention:
            mask = self.mask.to(x.device)
            mask = repeat(mask, 'l i j -> (l bhw) i j', bhw=b*h*w)
        else:
            mask = None

        if self.only_self_att:
            ## note: if no context is given, cross-attention defaults to self-attention
            for block in self.transformer_blocks:
                x = block(x, mask=mask)
            x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
        else:
            x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
            context = rearrange(context, '(b t) l con -> b t l con', t=t).contiguous()
            for block in self.transformer_blocks:
                # calculate each batch one by one (since number in shape could not be greater than 65,535 for some package)
                for j in range(b):
                    # NOTE(review): the repeated context has t * ((h*w)//t) rows,
                    # which equals the h*w rows of x[j] only when h*w is divisible
                    # by t -- confirm against callers.
                    context_j = repeat(
                        context[j],
                        't l con -> (t r) l con', r=(h * w) // t, t=t).contiguous()
                    ## note: causal mask will not be applied in the cross-attention case
                    x[j] = block(x[j], context=context_j)

        if self.use_linear:
            x = self.proj_out(x)
            x = rearrange(x, 'b (h w) t c -> b c t h w', h=h, w=w).contiguous()
        if not self.use_linear:
            x = rearrange(x, 'b hw t c -> (b hw) c t').contiguous()
            x = self.proj_out(x)
            x = rearrange(x, '(b h w) c t -> b c t h w', b=b, h=h, w=w).contiguous()

        return x + x_in
+ self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class SpatialSelfAttention(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.q = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + self.k = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + self.v = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b,c,h,w = q.shape + q = rearrange(q, 'b c h w -> b (h w) c') + k = rearrange(k, 'b c h w -> b c (h w)') + w_ = torch.einsum('bij,bjk->bik', q, k) + + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = rearrange(v, 'b c h w -> b c (h w)') + w_ = rearrange(w_, 'b i j -> b j i') + h_ = torch.einsum('bij,bjk->bik', v, w_) + h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) + h_ = self.proj_out(h_) + + return x+h_ diff --git a/lvdm/modules/encoders/__pycache__/condition.cpython-37.pyc b/lvdm/modules/encoders/__pycache__/condition.cpython-37.pyc new file mode 100644 index 0000000..2ec5f0d Binary files /dev/null 
class ClassEmbedder(nn.Module):
    """Embed integer class labels, optionally dropping them to an "unconditional" class.

    The unconditional class is index ``n_classes - 1`` of the embedding table.

    Args:
        embed_dim: width of each class embedding.
        n_classes: number of rows in the embedding table.
        key: default key used to read the labels from the batch dict.
        ucg_rate: probability with which a label is replaced by the
            unconditional class during ``forward``.
    """

    def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1):
        super().__init__()
        self.key = key
        self.embedding = nn.Embedding(n_classes, embed_dim)
        self.n_classes = n_classes
        self.ucg_rate = ucg_rate

    def forward(self, batch, key=None, disable_dropout=False):
        """Look up embeddings for ``batch[key]``.

        Args:
            batch: mapping that holds the label tensor under ``key``.
            key: overrides ``self.key`` when not None.
            disable_dropout: if True, labels are never replaced.

        Returns:
            The embeddings of the labels with an extra axis inserted at
            position 1 (added for use in cross-attention).
        """
        lookup_key = self.key if key is None else key
        # this is for use in crossattn
        labels = batch[lookup_key][:, None]
        apply_dropout = self.ucg_rate > 0. and not disable_dropout
        if apply_dropout:
            # keep == 1 where the label survives, 0 where it is dropped.
            keep = 1. - torch.bernoulli(torch.ones_like(labels) * self.ucg_rate)
            fallback = torch.ones_like(labels) * (self.n_classes - 1)
            labels = keep * labels + (1 - keep) * fallback
            labels = labels.long()
        return self.embedding(labels)

    def get_unconditional_conditioning(self, bs, device="cuda"):
        """Return ``{self.key: tensor}`` with ``bs`` copies of the unconditional class index."""
        # The last table index (n_classes - 1) serves as the unconditional class.
        uc_class = self.n_classes - 1
        labels = torch.ones((bs,), device=device) * uc_class
        return {self.key: labels}
"hidden": + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, + return_overflowing_tokens=False, padding="max_length", return_tensors="pt") + tokens = batch_encoding["input_ids"].to(self.device) + outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer == "hidden") + if self.layer == "last": + z = outputs.last_hidden_state + elif self.layer == "pooled": + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + return z + + def encode(self, text): + return self(text) + + +class ClipImageEmbedder(nn.Module): + def __init__( + self, + model, + jit=False, + device='cuda' if torch.cuda.is_available() else 'cpu', + antialias=True, + ucg_rate=0. + ): + super().__init__() + from clip import load as load_clip + self.model, _ = load_clip(name=model, device=device, jit=jit) + + self.antialias = antialias + + self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False) + self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False) + self.ucg_rate = ucg_rate + + def preprocess(self, x): + # normalize to [0,1] + x = kornia.geometry.resize(x, (224, 224), + interpolation='bicubic', align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. + # re-normalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x, no_dropout=False): + # x is assumed to be in range [-1,1] + out = self.model.encode_image(self.preprocess(x)) + out = out.to(x.dtype) + if self.ucg_rate > 0. and not no_dropout: + out = torch.bernoulli((1. 
- self.ucg_rate) * torch.ones(out.shape[0], device=out.device))[:, None] * out + return out + + +class FrozenOpenCLIPEmbedder(AbstractEncoder): + """ + Uses the OpenCLIP transformer encoder for text + """ + LAYERS = [ + # "pooled", + "last", + "penultimate" + ] + + def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77, + freeze=True, layer="last"): + super().__init__() + assert layer in self.LAYERS + model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu')) + del model.visual + self.model = model + + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + if self.layer == "last": + self.layer_idx = 0 + elif self.layer == "penultimate": + self.layer_idx = 1 + else: + raise NotImplementedError() + + def freeze(self): + self.model = self.model.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + self.device = self.model.positional_embedding.device + tokens = open_clip.tokenize(text) + z = self.encode_with_transformer(tokens.to(self.device)) + return z + + def encode_with_transformer(self, text): + x = self.model.token_embedding(text) # [batch_size, n_ctx, d_model] + x = x + self.model.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.model.ln_final(x) + return x + + def text_transformer_forward(self, x: torch.Tensor, attn_mask=None): + for i, r in enumerate(self.model.transformer.resblocks): + if i == len(self.model.transformer.resblocks) - self.layer_idx: + break + if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(r, x, attn_mask) + else: + x = r(x, attn_mask=attn_mask) + return x + + def encode(self, text): + return self(text) + + +class FrozenOpenCLIPImageEmbedder(AbstractEncoder): + """ + Uses the OpenCLIP vision transformer 
encoder for images + """ + + def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77, + freeze=True, layer="pooled", antialias=True, ucg_rate=0.): + super().__init__() + model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), + pretrained=version, ) + del model.transformer + self.model = model + + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + if self.layer == "penultimate": + raise NotImplementedError() + self.layer_idx = 1 + + self.antialias = antialias + + self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False) + self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False) + self.ucg_rate = ucg_rate + + def preprocess(self, x): + # normalize to [0,1] + x = kornia.geometry.resize(x, (224, 224), + interpolation='bicubic', align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def freeze(self): + self.model = self.model.eval() + for param in self.parameters(): + param.requires_grad = False + + @autocast + def forward(self, image, no_dropout=False): + z = self.encode_with_vision_transformer(image) + if self.ucg_rate > 0. and not no_dropout: + z = torch.bernoulli((1. 
- self.ucg_rate) * torch.ones(z.shape[0], device=z.device))[:, None] * z + return z + + def encode_with_vision_transformer(self, img): + img = self.preprocess(img) + x = self.model.visual(img) + return x + + def encode(self, text): + return self(text) + + + +class FrozenOpenCLIPImageEmbedderV2(AbstractEncoder): + """ + Uses the OpenCLIP vision transformer encoder for images + """ + + def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", + freeze=True, layer="pooled", antialias=True): + super().__init__() + model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), + pretrained=version, ) + del model.transformer + self.model = model + self.device = device + + if freeze: + self.freeze() + self.layer = layer + if self.layer == "penultimate": + raise NotImplementedError() + self.layer_idx = 1 + + self.antialias = antialias + self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False) + self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False) + + + def preprocess(self, x): + # normalize to [0,1] + x = kornia.geometry.resize(x, (224, 224), + interpolation='bicubic', align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. 
+ # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def freeze(self): + self.model = self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def forward(self, image, no_dropout=False): + ## image: b c h w + z = self.encode_with_vision_transformer(image) + return z + + def encode_with_vision_transformer(self, x): + x = self.preprocess(x) + + # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1 + if self.model.visual.input_patchnorm: + # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)') + x = x.reshape(x.shape[0], x.shape[1], self.model.visual.grid_size[0], self.model.visual.patch_size[0], self.model.visual.grid_size[1], self.model.visual.patch_size[1]) + x = x.permute(0, 2, 4, 1, 3, 5) + x = x.reshape(x.shape[0], self.model.visual.grid_size[0] * self.model.visual.grid_size[1], -1) + x = self.model.visual.patchnorm_pre_ln(x) + x = self.model.visual.conv1(x) + else: + x = self.model.visual.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + # class embeddings and positional embeddings + x = torch.cat( + [self.model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), + x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.model.visual.positional_embedding.to(x.dtype) + + # a patch_dropout of 0. 
would mean it is disabled and this function would do nothing but return what was passed in + x = self.model.visual.patch_dropout(x) + x = self.model.visual.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.model.visual.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + return x + + +class FrozenCLIPT5Encoder(AbstractEncoder): + def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda", + clip_max_length=77, t5_max_length=77): + super().__init__() + self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length) + self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length) + print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder) * 1.e-6:.2f} M parameters, " + f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder) * 1.e-6:.2f} M params.") + + def encode(self, text): + return self(text) + + def forward(self, text): + clip_z = self.clip_encoder.encode(text) + t5_z = self.t5_encoder.encode(text) + return [clip_z, t5_z] \ No newline at end of file diff --git a/lvdm/modules/encoders/ip_resampler.py b/lvdm/modules/encoders/ip_resampler.py new file mode 100644 index 0000000..500820a --- /dev/null +++ b/lvdm/modules/encoders/ip_resampler.py @@ -0,0 +1,136 @@ +# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py +import math +import torch +import torch.nn as nn + + +class ImageProjModel(nn.Module): + """Projection Model""" + def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): + super().__init__() + self.cross_attention_dim = cross_attention_dim + self.clip_extra_context_tokens = clip_extra_context_tokens + self.proj = nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim) + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds): + #embeds = image_embeds + 
embeds = image_embeds.type(list(self.proj.parameters())[0].dtype) + clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim) + clip_extra_context_tokens = self.norm(clip_extra_context_tokens) + return clip_extra_context_tokens + +# FFN +def FeedForward(dim, mult=4): + inner_dim = int(dim * mult) + return nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, inner_dim, bias=False), + nn.GELU(), + nn.Linear(inner_dim, dim, bias=False), + ) + + +def reshape_tensor(x, heads): + bs, length, width = x.shape + #(bs, length, width) --> (bs, length, n_heads, dim_per_head) + x = x.view(bs, length, heads, -1) + # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) + x = x.transpose(1, 2) + # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) + x = x.reshape(bs, heads, length, -1) + return x + + +class PerceiverAttention(nn.Module): + def __init__(self, *, dim, dim_head=64, heads=8): + super().__init__() + self.scale = dim_head**-0.5 + self.dim_head = dim_head + self.heads = heads + inner_dim = dim_head * heads + + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) + self.to_out = nn.Linear(inner_dim, dim, bias=False) + + + def forward(self, x, latents): + """ + Args: + x (torch.Tensor): image features + shape (b, n1, D) + latent (torch.Tensor): latent features + shape (b, n2, D) + """ + x = self.norm1(x) + latents = self.norm2(latents) + + b, l, _ = latents.shape + + q = self.to_q(latents) + kv_input = torch.cat((x, latents), dim=-2) + k, v = self.to_kv(kv_input).chunk(2, dim=-1) + + q = reshape_tensor(q, self.heads) + k = reshape_tensor(k, self.heads) + v = reshape_tensor(v, self.heads) + + # attention + scale = 1 / math.sqrt(math.sqrt(self.dim_head)) + weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing 
afterwards + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + out = weight @ v + + out = out.permute(0, 2, 1, 3).reshape(b, l, -1) + + return self.to_out(out) + + +class Resampler(nn.Module): + def __init__( + self, + dim=1024, + depth=8, + dim_head=64, + heads=16, + num_queries=8, + embedding_dim=768, + output_dim=1024, + ff_mult=4, + ): + super().__init__() + + self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) + + self.proj_in = nn.Linear(embedding_dim, dim) + + self.proj_out = nn.Linear(dim, output_dim) + self.norm_out = nn.LayerNorm(output_dim) + + self.layers = nn.ModuleList([]) + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), + FeedForward(dim=dim, mult=ff_mult), + ] + ) + ) + + def forward(self, x): + + latents = self.latents.repeat(x.size(0), 1, 1) + + x = self.proj_in(x) + + for attn, ff in self.layers: + latents = attn(x, latents) + latents + latents = ff(latents) + latents + + latents = self.proj_out(latents) + return self.norm_out(latents) \ No newline at end of file diff --git a/lvdm/modules/networks/__pycache__/ae_modules.cpython-37.pyc b/lvdm/modules/networks/__pycache__/ae_modules.cpython-37.pyc new file mode 100644 index 0000000..741e439 Binary files /dev/null and b/lvdm/modules/networks/__pycache__/ae_modules.cpython-37.pyc differ diff --git a/lvdm/modules/networks/__pycache__/openaimodel3d.cpython-37.pyc b/lvdm/modules/networks/__pycache__/openaimodel3d.cpython-37.pyc new file mode 100644 index 0000000..7ddddc0 Binary files /dev/null and b/lvdm/modules/networks/__pycache__/openaimodel3d.cpython-37.pyc differ diff --git a/lvdm/modules/networks/ae_modules.py b/lvdm/modules/networks/ae_modules.py new file mode 100644 index 0000000..0c2e93f --- /dev/null +++ b/lvdm/modules/networks/ae_modules.py @@ -0,0 +1,845 @@ +# pytorch_diffusion + derived encoder decoder +import math +import torch +import numpy as np +import torch.nn as nn 
+from einops import rearrange +from utils.utils import instantiate_from_config +from lvdm.modules.attention import LinearAttention + +def nonlinearity(x): + # swish + return x*torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32): + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + + + +class LinAttnBlock(LinearAttention): + """to match AttnBlock usage""" + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +class AttnBlock(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + self.k = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + self.v = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b,c,h,w = q.shape + q = q.reshape(b,c,h*w) # bcl + q = q.permute(0,2,1) # bcl -> blc l=hw + k = k.reshape(b,c,h*w) # bcl + + w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b,c,h*w) + w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b,c,h,w) + + h_ = self.proj_out(h_) + + return x+h_ + +def make_attn(in_channels, attn_type="vanilla"): + assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown' + #print(f"making attention of type '{attn_type}' with {in_channels} in_channels") + if attn_type == "vanilla": + return 
class Downsample(nn.Module):
    """Reduce spatial size by 2, with a stride-2 3x3 conv or 2x2 average pooling."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        self.in_channels = in_channels
        if not self.with_conv:
            return
        # padding=0 on purpose: forward() applies the asymmetric padding itself,
        # because torch conv only supports symmetric padding.
        self.conv = torch.nn.Conv2d(in_channels,
                                    in_channels,
                                    kernel_size=3,
                                    stride=2,
                                    padding=0)

    def forward(self, x):
        """Downsample ``x``; the conv path zero-pads one pixel on the right and bottom first."""
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)


class Upsample(nn.Module):
    """Double spatial size by nearest-neighbour interpolation, optionally followed by a 3x3 conv."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        self.in_channels = in_channels
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        """Upsample ``x`` by a factor of 2 and apply the conv when enabled."""
        enlarged = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(enlarged) if self.with_conv else enlarged
class ResnetBlock(nn.Module):
    """Two-convolution residual block with an optional additive time embedding.

    Each 3x3 convolution is preceded by GroupNorm and a swish activation.
    When the input and output channel counts differ, the skip path goes
    through a 3x3 conv (``conv_shortcut=True``) or a 1x1 conv.
    """

    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        # GroupNorm settings match what Normalize(channels) builds.
        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels,
                                        eps=1e-6, affine=True)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1)
        # The projection only exists when a time embedding is configured.
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels,
                                        eps=1e-6, affine=True)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1)

        if self.in_channels == self.out_channels:
            return
        if self.use_conv_shortcut:
            self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                 kernel_size=3, stride=1, padding=1)
        else:
            self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                kernel_size=1, stride=1, padding=0)

    @staticmethod
    def _swish(t):
        # swish activation: t * sigmoid(t)
        return t*torch.sigmoid(t)

    def forward(self, x, temb):
        """Apply the block to ``x``; ``temb`` may be ``None`` to skip the embedding."""
        h = self.conv1(self._swish(self.norm1(x)))

        if temb is not None:
            # Add the projected embedding to every spatial position.
            h = h + self.temb_proj(self._swish(temb))[:, :, None, None]

        h = self.conv2(self.dropout(self._swish(self.norm2(h))))

        if self.in_channels != self.out_channels:
            skip = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
            x = skip(x)

        return x+h
ch_mult=(1,2,4,8), num_res_blocks, + attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, + resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"): + super().__init__() + if use_linear_attn: attn_type = "linear" + self.ch = ch + self.temb_ch = self.ch*4 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + self.use_timestep = use_timestep + if self.use_timestep: + # timestep embedding + self.temb = nn.Module() + self.temb.dense = nn.ModuleList([ + torch.nn.Linear(self.ch, + self.temb_ch), + torch.nn.Linear(self.temb_ch, + self.temb_ch), + ]) + + # downsampling + self.conv_in = torch.nn.Conv2d(in_channels, + self.ch, + kernel_size=3, + stride=1, + padding=1) + + curr_res = resolution + in_ch_mult = (1,)+tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch*in_ch_mult[i_level] + block_out = ch*ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append(ResnetBlock(in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions-1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock(in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in 
reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch*ch_mult[i_level] + skip_in = ch*ch_mult[i_level] + for i_block in range(self.num_res_blocks+1): + if i_block == self.num_res_blocks: + skip_in = ch*in_ch_mult[i_level] + block.append(ResnetBlock(in_channels=block_in+skip_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d(block_in, + out_ch, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x, t=None, context=None): + #assert x.shape[2] == x.shape[3] == self.resolution + if context is not None: + # assume aligned context, cat along channel axis + x = torch.cat((x, context), dim=1) + if self.use_timestep: + # timestep embedding + assert t is not None + temb = get_timestep_embedding(t, self.ch) + temb = self.temb.dense[0](temb) + temb = nonlinearity(temb) + temb = self.temb.dense[1](temb) + else: + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions-1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks+1): + h = self.up[i_level].block[i_block]( + torch.cat([h, 
class Encoder(nn.Module):
    """Convolutional encoder: input conv, multi-level residual stack, middle block, output conv.

    Each of the ``len(ch_mult)`` levels holds ``num_res_blocks`` ResnetBlocks
    of width ``ch * ch_mult[level]``. Attention is added after every
    ResnetBlock of a level whose current resolution is listed in
    ``attn_resolutions``. Every level except the last ends with a Downsample,
    which halves the tracked resolution. No time embedding is used
    (``temb_ch`` is 0 and ``forward`` passes ``temb=None``).

    The output has ``2*z_channels`` channels when ``double_z`` is true,
    otherwise ``z_channels``.

    ``out_ch`` and ``**ignore_kwargs`` are accepted but not referenced in
    this class.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
                 **ignore_kwargs):
        super().__init__()
        # use_linear_attn overrides whatever attn_type was passed
        if use_linear_attn: attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels,
                                       self.ch,
                                       kernel_size=3,
                                       stride=1,
                                       padding=1)

        curr_res = resolution
        # in_ch_mult[i] is the input multiplier of level i: level 0 takes
        # `ch` channels from conv_in, level i>0 takes the output of level i-1
        in_ch_mult = (1,)+tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=dropout))
                block_in = block_out
                # one attention module per res block, so forward() can index
                # attn with the same i_block
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            # plain nn.Module used as a namespace for this level's submodules
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions-1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        2*z_channels if double_z else z_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        """Encode ``x`` and return the output of ``conv_out``."""
        # timestep embedding (unused by the encoder)
        temb = None

        # downsampling; hs collects every intermediate feature map and only
        # its last entry is read back
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions-1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
ch*ch_mult[self.num_resolutions-1] + curr_res = resolution // 2**(self.num_resolutions-1) + self.z_shape = (1,z_channels,curr_res,curr_res) + print("AE working on z of shape {} = {} dimensions.".format( + self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d(z_channels, + block_in, + kernel_size=3, + stride=1, + padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock(in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch*ch_mult[i_level] + for i_block in range(self.num_res_blocks+1): + block.append(ResnetBlock(in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d(block_in, + out_ch, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, z): + #assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # print(f'decoder-input={z.shape}') + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + # print(f'decoder-conv in feat={h.shape}') + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + # print(f'decoder-mid feat={h.shape}') + + # upsampling + for i_level in 
class SimpleDecoder(nn.Module):
    """Small fixed decoder: 1x1 conv, three ResnetBlocks, 1x1 conv, one 2x upsample, output conv.

    Extra positional and keyword arguments are accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__()
        # Channel plan of the three residual blocks: c -> 2c -> 4c -> 2c.
        widths = [(in_channels, 2 * in_channels),
                  (2 * in_channels, 4 * in_channels),
                  (4 * in_channels, 2 * in_channels)]

        stages = [nn.Conv2d(in_channels, in_channels, 1)]
        for width_in, width_out in widths:
            stages.append(ResnetBlock(in_channels=width_in,
                                      out_channels=width_out,
                                      temb_channels=0, dropout=0.0))
        stages.append(nn.Conv2d(2*in_channels, in_channels, 1))
        stages.append(Upsample(in_channels, with_conv=True))
        self.model = nn.ModuleList(stages)

        # end
        self.norm_out = Normalize(in_channels)
        self.conv_out = torch.nn.Conv2d(in_channels,
                                        out_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        """Run ``x`` through every stage, then normalise, activate and project."""
        for index, layer in enumerate(self.model):
            # Entries 1-3 are the ResnetBlocks, which take a second (time
            # embedding) argument; None is passed because none is used here.
            x = layer(x, None) if index in (1, 2, 3) else layer(x)

        activated = nonlinearity(self.norm_out(x))
        return self.conv_out(activated)
class LatentRescaler(nn.Module):
    """Rescale a feature map by ``factor`` between two stacks of residual blocks.

    Pipeline: 3x3 input conv, ``depth`` ResnetBlocks, interpolation to
    ``round(size * factor)``, one AttnBlock, ``depth`` ResnetBlocks, 1x1
    output conv.
    """

    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
        super().__init__()

        def residual_stack():
            # `depth` width-preserving residual blocks without time embedding
            return nn.ModuleList([ResnetBlock(in_channels=mid_channels,
                                              out_channels=mid_channels,
                                              temb_channels=0,
                                              dropout=0.0) for _ in range(depth)])

        # residual block, interpolate, residual block
        self.factor = factor
        self.conv_in = nn.Conv2d(in_channels,
                                 mid_channels,
                                 kernel_size=3,
                                 stride=1,
                                 padding=1)
        self.res_block1 = residual_stack()
        self.attn = AttnBlock(mid_channels)
        self.res_block2 = residual_stack()

        self.conv_out = nn.Conv2d(mid_channels,
                                  out_channels,
                                  kernel_size=1,
                                  )

    def forward(self, x):
        """Return ``x`` rescaled spatially by ``self.factor`` with ``out_channels`` channels."""
        h = self.conv_in(x)
        for block in self.res_block1:
            h = block(h, None)

        # Target size is computed from the current map, rounded to whole pixels.
        target_h = int(round(h.shape[2]*self.factor))
        target_w = int(round(h.shape[3]*self.factor))
        h = torch.nn.functional.interpolate(h, size=(target_h, target_w))

        h = self.attn(h)
        for block in self.res_block2:
            h = block(h, None)
        return self.conv_out(h)
class Resize(nn.Module):
    """Rescale a tensor by a per-call ``scale_factor`` using fixed interpolation.

    :param in_channels: unused by the fixed-interpolation path; kept for
        interface compatibility with the (unimplemented) learned variant.
    :param learned: request learned resampling. Not implemented; passing a
        true value raises ``NotImplementedError``.
    :param mode: interpolation mode forwarded to
        ``torch.nn.functional.interpolate``.
    """

    # Modes for which ``interpolate`` accepts an explicit ``align_corners``.
    _ALIGNABLE_MODES = ("linear", "bilinear", "bicubic", "trilinear")

    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
        super().__init__()
        self.with_conv = learned
        self.mode = mode
        if self.with_conv:
            # The original read ``self.__class__.__name``, which Python
            # name-mangles inside a class body to ``_Resize__name`` and so
            # raised AttributeError before the intended NotImplementedError.
            print(f"Note: {self.__class__.__name__} uses learned downsampling and will ignore the fixed {mode} mode")
            raise NotImplementedError()

    def forward(self, x, scale_factor=1.0):
        """Return ``x`` unchanged for ``scale_factor == 1.0``, else interpolated."""
        if scale_factor == 1.0:
            return x
        # ``interpolate`` rejects an explicit align_corners for non-interpolating
        # modes such as "nearest" or "area", so only pass False where it is valid.
        align_corners = False if self.mode in self._ALIGNABLE_MODES else None
        return torch.nn.functional.interpolate(
            x, mode=self.mode, align_corners=align_corners, scale_factor=scale_factor)
@torch.no_grad()
def encode_with_pretrained(self, x):
    """Encode ``x`` with the frozen pretrained model, without tracking gradients.

    If ``pretrained_model.encode`` returns a distribution-like object (any
    non-tensor exposing ``mode()``), its mode is returned; a tensor result is
    returned as is.
    """
    c = self.pretrained_model.encode(x)
    # The original tested ``isinstance(c, DiagonalGaussianDistribution)``, but
    # that name is never imported in this module, so the call raised NameError.
    # Duck-type instead. Tensors must be excluded explicitly because
    # ``torch.Tensor`` also has a ``mode`` method (the statistical mode).
    if not torch.is_tensor(c) and hasattr(c, "mode"):
        c = c.mode()
    return c
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    Sequential container that routes the extra inputs each child expects:
    timestep embeddings to TimestepBlock children, context to the
    spatial/temporal transformers, and nothing extra to any other layer.
    """

    def forward(self, x, emb, context=None, batch_size=None):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb, batch_size)
                continue
            if isinstance(layer, SpatialTransformer):
                x = layer(x, context)
                continue
            if isinstance(layer, TemporalTransformer):
                # Split the folded (batch*frames) axis so the temporal layer
                # sees a separate frame axis, then fold it back afterwards.
                video = rearrange(x, '(b f) c h w -> b c f h w', b=batch_size)
                video = layer(video, context)
                x = rearrange(video, 'b c f h w -> (b f) c h w')
                continue
            x = layer(x)
        return x
+ """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode='nearest') + else: + x = F.interpolate(x, scale_factor=2, mode='nearest') + if self.use_conv: + x = self.conv(x) + return x + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. 
def __init__(
    self,
    channels,
    emb_channels,
    dropout,
    out_channels=None,
    use_scale_shift_norm=False,
    dims=2,
    use_checkpoint=False,
    use_conv=False,
    up=False,
    down=False,
    use_temporal_conv=False,
    tempspatial_aware=False
):
    """
    Build the sub-layers of the residual block.

    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout used in ``out_layers``.
    :param out_channels: if specified, the number of out channels;
        defaults to ``channels``.
    :param use_scale_shift_norm: if True, the embedding projection emits
        ``2 * out_channels`` values (a scale and a shift).
    :param dims: dimensionality passed to ``conv_nd`` and to the up/down layers.
    :param use_checkpoint: stored for use by ``forward``.
    :param use_conv: if True and the channel count changes, use a 3x3
        convolution for the skip connection instead of a 1x1 convolution.
    :param up: if True, resample with ``Upsample`` (takes precedence over ``down``).
    :param down: if True, resample with ``Downsample``.
    :param use_temporal_conv: if True, add a ``TemporalConvBlock``.
    :param tempspatial_aware: forwarded as ``spatial_aware`` to that block.
    """
    super().__init__()
    self.channels = channels
    self.emb_channels = emb_channels
    self.dropout = dropout
    self.out_channels = out_channels or channels
    self.use_conv = use_conv
    self.use_checkpoint = use_checkpoint
    self.use_scale_shift_norm = use_scale_shift_norm
    self.use_temporal_conv = use_temporal_conv

    self.in_layers = nn.Sequential(
        normalization(channels),
        nn.SiLU(),
        conv_nd(dims, channels, self.out_channels, 3, padding=1),
    )

    self.updown = up or down

    if up:
        self.h_upd = Upsample(channels, False, dims)
        self.x_upd = Upsample(channels, False, dims)
    elif down:
        self.h_upd = Downsample(channels, False, dims)
        self.x_upd = Downsample(channels, False, dims)
    else:
        self.h_upd = self.x_upd = nn.Identity()

    self.emb_layers = nn.Sequential(
        nn.SiLU(),
        nn.Linear(
            emb_channels,
            2 * self.out_channels if use_scale_shift_norm else self.out_channels,
        ),
    )
    self.out_layers = nn.Sequential(
        normalization(self.out_channels),
        nn.SiLU(),
        nn.Dropout(p=dropout),
        # Built with conv_nd(dims, ...) like in_layers and skip_connection.
        # The original hard-coded nn.Conv2d here, which only matches the
        # other layers when dims == 2.
        # NOTE(review): assumes conv_nd(2, ...) returns nn.Conv2d so existing
        # dims=2 checkpoints load unchanged - confirm against lvdm.basics.
        zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
    )

    if self.out_channels == channels:
        self.skip_connection = nn.Identity()
    elif use_conv:
        self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
    else:
        self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    if self.use_temporal_conv:
        # The misspelled attribute name is kept on purpose: renaming it would
        # change the parameter names in saved state dicts.
        # The temporal block uses a fixed dropout of 0.1, not ``dropout``.
        self.temopral_conv = TemporalConvBlock(
            self.out_channels,
            self.out_channels,
            dropout=0.1,
            spatial_aware=tempspatial_aware
        )
class TemporalConvBlock(nn.Module):
    """
    Residual stack of four GroupNorm -> SiLU -> (Dropout) -> Conv3d stages.
    The last convolution is zero-initialised, so a freshly built block
    returns its input unchanged.

    Adapted from modelscope: https://github.com/modelscope/modelscope/blob/master/modelscope/models/multi_modal/video_synthesis/unet_sd.py
    """

    def __init__(self, in_channels, out_channels=None, dropout=0.0, spatial_aware=False):
        super(TemporalConvBlock, self).__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels

        # The first two stages may also convolve spatially; the last two are
        # always temporal-only.
        temporal_kernel, temporal_padding = (3, 1, 1), (1, 0, 0)
        if spatial_aware:
            kernel_shape, padding_shape = (3, 3, 3), (1, 1, 1)
        else:
            kernel_shape, padding_shape = temporal_kernel, temporal_padding

        def back_stage(kernel, padding):
            # norm -> activation -> dropout -> conv, mapping out_channels to in_channels
            return nn.Sequential(
                nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
                nn.Conv3d(out_channels, in_channels, kernel, padding=padding))

        # conv layers
        self.conv1 = nn.Sequential(
            nn.GroupNorm(32, in_channels), nn.SiLU(),
            nn.Conv3d(in_channels, out_channels, kernel_shape, padding=padding_shape))
        self.conv2 = back_stage(kernel_shape, padding_shape)
        self.conv3 = back_stage(temporal_kernel, temporal_padding)
        self.conv4 = back_stage(temporal_kernel, temporal_padding)

        # zero out the last layer params, so the conv block is identity
        final_conv = self.conv4[-1]
        nn.init.zeros_(final_conv.weight)
        nn.init.zeros_(final_conv.bias)

    def forward(self, x):
        """Return ``x`` plus the output of the four convolution stages."""
        residual = x
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = stage(x)
        return x + residual
+ :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + """ + + def __init__(self, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0.0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + context_dim=None, + use_scale_shift_norm=False, + resblock_updown=False, + num_heads=-1, + num_head_channels=-1, + transformer_depth=1, + use_linear=False, + use_checkpoint=False, + temporal_conv=False, + tempspatial_aware=False, + temporal_attention=True, + temporal_selfatt_only=True, + use_relative_position=True, + use_causal_attention=False, + temporal_length=None, + use_fp16=False, + addition_attention=False, + use_image_attention=False, + temporal_transformer_depth=1, + fps_cond=False, + ): + super(UNetModel, self).__init__() + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.temporal_attention = temporal_attention + time_embed_dim = model_channels * 4 + self.use_checkpoint = use_checkpoint + self.dtype = torch.float16 if use_fp16 else torch.float32 + self.addition_attention=addition_attention + self.use_image_attention = use_image_attention + self.fps_cond=fps_cond + + + + self.time_embed = nn.Sequential( + linear(model_channels, 
time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + if self.fps_cond: + self.fps_embedding = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + self.input_blocks = nn.ModuleList( + [ + TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ] + ) + if self.addition_attention: + self.init_attn=TimestepEmbedSequential( + TemporalTransformer( + model_channels, + n_heads=8, + d_head=num_head_channels, + depth=transformer_depth, + context_dim=context_dim, + use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, + causal_attention=use_causal_attention, relative_position=use_relative_position, + temporal_length=temporal_length)) + + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock(ch, time_embed_dim, dropout, + out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware, + use_temporal_conv=temporal_conv + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + layers.append( + SpatialTransformer(ch, num_heads, dim_head, + depth=transformer_depth, context_dim=context_dim, use_linear=use_linear, + use_checkpoint=use_checkpoint, disable_self_attn=False, + img_cross_attention=self.use_image_attention + ) + ) + if self.temporal_attention: + layers.append( + TemporalTransformer(ch, num_heads, dim_head, + depth=temporal_transformer_depth, context_dim=context_dim, use_linear=use_linear, + use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, + causal_attention=use_causal_attention, relative_position=use_relative_position, + temporal_length=temporal_length + ) + ) + 
self.input_blocks.append(TimestepEmbedSequential(*layers)) + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock(ch, time_embed_dim, dropout, + out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True + ) + if resblock_updown + else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + layers = [ + ResBlock(ch, time_embed_dim, dropout, + dims=dims, use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware, + use_temporal_conv=temporal_conv + ), + SpatialTransformer(ch, num_heads, dim_head, + depth=transformer_depth, context_dim=context_dim, use_linear=use_linear, + use_checkpoint=use_checkpoint, disable_self_attn=False, + img_cross_attention=self.use_image_attention + ) + ] + if self.temporal_attention: + layers.append( + TemporalTransformer(ch, num_heads, dim_head, + depth=temporal_transformer_depth, context_dim=context_dim, use_linear=use_linear, + use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, + causal_attention=use_causal_attention, relative_position=use_relative_position, + temporal_length=temporal_length + ) + ) + layers.append( + ResBlock(ch, time_embed_dim, dropout, + dims=dims, use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware, + use_temporal_conv=temporal_conv + ) + ) + self.middle_block = TimestepEmbedSequential(*layers) + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(num_res_blocks + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock(ch + ich, time_embed_dim, dropout, + 
out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware, + use_temporal_conv=temporal_conv + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + layers.append( + SpatialTransformer(ch, num_heads, dim_head, + depth=transformer_depth, context_dim=context_dim, use_linear=use_linear, + use_checkpoint=use_checkpoint, disable_self_attn=False, + img_cross_attention=self.use_image_attention + ) + ) + if self.temporal_attention: + layers.append( + TemporalTransformer(ch, num_heads, dim_head, + depth=temporal_transformer_depth, context_dim=context_dim, use_linear=use_linear, + use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, + causal_attention=use_causal_attention, relative_position=use_relative_position, + temporal_length=temporal_length + ) + ) + if level and i == num_res_blocks: + out_ch = ch + layers.append( + ResBlock(ch, time_embed_dim, dropout, + out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True + ) + if resblock_updown + else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)), + ) + + def forward(self, x, timesteps, context=None, features_adapter=None, fps=16, **kwargs): + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype) + emb = self.time_embed(t_emb) + + if self.fps_cond: + if type(fps) == int: + fps = torch.full_like(timesteps, fps) + fps_emb = timestep_embedding(fps,self.model_channels, repeat_only=False).to(x.dtype) + emb += self.fps_embedding(fps_emb) + + b,_,t,_,_ 
= x.shape + ## repeat t times for context [(b t) 77 768] & time embedding + context = context.repeat_interleave(repeats=t, dim=0) + emb = emb.repeat_interleave(repeats=t, dim=0) + + ## always in shape (b t) c h w, except for temporal layer + x = rearrange(x, 'b c t h w -> (b t) c h w') + + h = x.type(self.dtype) + adapter_idx = 0 + hs = [] + for id, module in enumerate(self.input_blocks): + # print(f"in input_blocks {h.dtype} {emb.dtype} {context.dtype}") + h = module(h, emb, context=context, batch_size=b) + if id ==0 and self.addition_attention: + h = self.init_attn(h, emb, context=context, batch_size=b) + ## plug-in adapter features + if ((id+1)%3 == 0) and features_adapter is not None: + h = h + features_adapter[adapter_idx] + adapter_idx += 1 + hs.append(h) + if features_adapter is not None: + assert len(features_adapter)==adapter_idx, 'Wrong features_adapter' + + h = self.middle_block(h, emb, context=context, batch_size=b) + for module in self.output_blocks: + h = torch.cat([h, hs.pop()], dim=1) + h = module(h, emb, context=context, batch_size=b) + h = h.type(x.dtype) + y = self.out(h) + + # reshape back to (b c t h w) + y = rearrange(y, '(b t) c h w -> b c t h w', b=b) + return y + \ No newline at end of file diff --git a/lvdm/modules/x_transformer.py b/lvdm/modules/x_transformer.py new file mode 100644 index 0000000..f252ab4 --- /dev/null +++ b/lvdm/modules/x_transformer.py @@ -0,0 +1,640 @@ +"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers""" +from functools import partial +from inspect import isfunction +from collections import namedtuple +from einops import rearrange, repeat +import torch +from torch import nn, einsum +import torch.nn.functional as F + +# constants +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple('Intermediates', [ + 'pre_softmax_attn', + 'post_softmax_attn' +]) + +LayerIntermediates = namedtuple('Intermediates', [ + 'hiddens', + 'attn_intermediates' +]) + + +class 
AbsolutePositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + self.emb = nn.Embedding(max_seq_len, dim) + self.init_() + + def init_(self): + nn.init.normal_(self.emb.weight, std=0.02) + + def forward(self, x): + n = torch.arange(x.shape[1], device=x.device) + return self.emb(n)[None, :, :] + + +class FixedPositionalEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, x, seq_dim=1, offset=0): + t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset + sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) + return emb[None, :, :] + + +# helpers + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def always(val): + def inner(*args, **kwargs): + return val + return inner + + +def not_equals(val): + def inner(x): + return x != val + return inner + + +def equals(val): + def inner(x): + return x == val + return inner + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +# keyword argument helpers + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + + +def group_dict_by_key(cond, d): + return_val = [dict(), dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val,) + + +def string_begins_with(prefix, str): + return str.startswith(prefix) + + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) + kwargs_without_prefix = dict(map(lambda x: 
(x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items()))) + return kwargs_without_prefix, kwargs + + +# classes +class Scale(nn.Module): + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.value, *rest) + + +class Rezero(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + self.g = nn.Parameter(torch.zeros(1)) + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.g, *rest) + + +class ScaleNorm(nn.Module): + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSNorm(nn.Module): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class Residual(nn.Module): + def forward(self, x, residual): + return x + residual + + +class GRUGating(nn.Module): + def __init__(self, dim): + super().__init__() + self.gru = nn.GRUCell(dim, dim) + + def forward(self, x, residual): + gated_output = self.gru( + rearrange(x, 'b n d -> (b n) d'), + rearrange(residual, 'b n d -> (b n) d') + ) + + return gated_output.reshape_as(x) + + +# feedforward + +class GEGLU(nn.Module): + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) 
+ project_in = nn.Sequential( + nn.Linear(dim, inner_dim), + nn.GELU() + ) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential( + project_in, + nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out) + ) + + def forward(self, x): + return self.net(x) + + +# attention. +class Attention(nn.Module): + def __init__( + self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + mask=None, + talking_heads=False, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0., + on_attn=False + ): + super().__init__() + if use_entmax15: + raise NotImplementedError("Check out entmax activation instead of softmax activation!") + self.scale = dim_head ** -0.5 + self.heads = heads + self.causal = causal + self.mask = mask + + inner_dim = dim_head * heads + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_k = nn.Linear(dim, inner_dim, bias=False) + self.to_v = nn.Linear(dim, inner_dim, bias=False) + self.dropout = nn.Dropout(dropout) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + + # explicit topk sparse attention + self.sparse_topk = sparse_topk + + # entmax + #self.attn_fn = entmax15 if use_entmax15 else F.softmax + self.attn_fn = F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + rel_pos=None, + sinusoidal_emb=None, + prev_attn=None, + mem=None + ): + b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device + 
kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = torch.cat((mem, k_input), dim=-2) + v_input = torch.cat((mem, v_input), dim=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool()) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool()) + q_mask = rearrange(q_mask, 'b i -> b () i ()') + k_mask = rearrange(k_mask, 'b j -> b () () j') + input_mask = q_mask * k_mask + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v)) + k = torch.cat((mem_k, k), dim=-2) + v = torch.cat((mem_v, v), dim=-2) + if exists(input_mask): + input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True) + + dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots + + if talking_heads: + dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous() + + if exists(rel_pos): + dots = rel_pos(dots) + + if exists(input_mask): + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if self.causal: + i, j = dots.shape[-2:] + r = torch.arange(i, device=device) + mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j') + mask = F.pad(mask, (j - i, 0), value=False) + 
dots.masked_fill_(mask, mask_value) + del mask + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, dim=-1) + post_softmax_attn = attn + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous() + + out = einsum('b h i j, b h j d -> b h i d', attn, v) + out = rearrange(out, 'b h n d -> b n (h d)') + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, + post_softmax_attn=post_softmax_attn + ) + + return self.to_out(out), intermediates + + +class AttentionLayers(nn.Module): + def __init__( + self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rmsnorm=False, + use_rezero=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + position_infused_attn=False, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + **kwargs + ): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) + + dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + + self.has_pos_emb = position_infused_attn + self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None + self.rotary_pos_emb = always(None) + + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance' + self.rel_pos = None + + self.pre_norm = pre_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + + norm_class = ScaleNorm if 
use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f',) + default_block + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len(default_block) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f',) * (par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f',) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals('a'), layer_types))) + + for layer_type in self.layer_types: + if layer_type == 'a': + layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads=heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if isinstance(layer, Attention) and exists(branch_fn): + 
layer = branch_fn(layer) + + if gate_residual: + residual_fn = GRUGating(dim) + else: + residual_fn = Residual() + + self.layers.append(nn.ModuleList([ + norm_fn(), + layer, + residual_fn + ])) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + mems=None, + return_hiddens=False + ): + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + + for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)): + is_last = ind == (len(self.layers) - 1) + + if layer_type == 'a': + hiddens.append(x) + layer_mem = mems.pop(0) + + residual = x + + if self.pre_norm: + x = norm(x) + + if layer_type == 'a': + out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos, + prev_attn=prev_attn, mem=layer_mem) + elif layer_type == 'c': + out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn) + elif layer_type == 'f': + out = block(x) + + x = residual_fn(out, residual) + + if layer_type in ('a', 'c'): + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if not self.pre_norm and not is_last: + x = norm(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, + attn_intermediates=intermediates + ) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on encoder' + super().__init__(causal=False, **kwargs) + + + +class TransformerWrapper(nn.Module): + def __init__( + self, + *, + num_tokens, + max_seq_len, + attn_layers, + emb_dim=None, + max_mem_len=0., + emb_dropout=0., + num_memory_tokens=None, + tie_embedding=False, + use_pos_emb=True + ): + 
super().__init__() + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + self.num_tokens = num_tokens + + self.token_emb = nn.Embedding(num_tokens, emb_dim) + self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.init_() + + self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim)) + + # let funnel encoder know number of memory tokens, if specified + if hasattr(attn_layers, 'num_memory_tokens'): + attn_layers.num_memory_tokens = num_memory_tokens + + def init_(self): + nn.init.normal_(self.token_emb.weight, std=0.02) + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_mems=False, + return_attn=False, + mems=None, + **kwargs + ): + b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens + x = self.token_emb(x) + x += self.pos_emb(x) + x = self.emb_dropout(x) + + x = self.project_emb(x) + + if num_mem > 0: + mem = repeat(self.memory_tokens, 'n d -> b n d', b=b) + x = torch.cat((mem, x), dim=1) + + # auto-handle masking after appending memory tokens + if exists(mask): + mask = F.pad(mask, (num_mem, 0), value=True) + + x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + mem, x = x[:, :num_mem], x[:, num_mem:] 
+ + out = self.to_logits(x) if not return_embeddings else x + + if return_mems: + hiddens = intermediates.hiddens + new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens + new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems)) + return out, new_mems + + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + return out, attn_maps + + return out + diff --git a/mesh.py b/mesh.py new file mode 100644 index 0000000..2809934 --- /dev/null +++ b/mesh.py @@ -0,0 +1,21 @@ +import subprocess +import os + +src = '../4dfy/output/fourdfy_stage_2_low_vram/' +dst = './data/8w/' +os.makedirs(dst, exist_ok=True) + +for obj_dir in os.listdir(src): + key = obj_dir.split('@')[0] + obj_path = os.path.join(src, obj_dir, 'save/it25000-export/model.obj') + if not os.path.exists(obj_path): + print(f'no object file at {obj_path}') + continue + + dst_path = os.path.join(dst, key+'.ply') + if os.path.exists(dst_path): + continue + + command = f'python mesh2ply_8w.py {obj_path} {dst_path}' + result = subprocess.run(command, shell=True) + print(result) \ No newline at end of file diff --git a/mesh2ply.sh b/mesh2ply.sh new file mode 100644 index 0000000..a5658f2 --- /dev/null +++ b/mesh2ply.sh @@ -0,0 +1,6 @@ +python mesh2ply_8w.py ../4dfy/output/fourdfy_stage_2_low_vram/a_baby_bunny@20240228-192002/save/it25000-export/model.obj data/8w/a_baby_bunny.ply +python mesh2ply_8w.py ../4dfy/output/fourdfy_stage_2_low_vram/a_cat_walking@20240223-122044/save/it25000-export/model.obj data/8w/a_cat_walking.ply +python mesh2ply_8w.py ../4dfy/output/fourdfy_stage_2_low_vram/a_stack_of_pancakes@20240223-175313/save/it25000-export/model.obj data/8w/a_stack_of_pancakes.ply +python mesh2ply_8w.py ../4dfy/output/fourdfy_stage_2_low_vram/a_corgi@20240223-060738/save/it25000-export/model.obj data/8w/a_corgi.ply +python mesh2ply_8w.py 
../4dfy/output/fourdfy_stage_2_low_vram/a_standing_panda@20240221-125435/save/it25000-export/model.obj data/8w/a_standing_panda.ply +python mesh2ply_8w.py ../4dfy/output/fourdfy_stage_2_low_vram/a_horse_running@20240226-135936/save/it25000-export/model.obj data/8w/a_horse_running.ply diff --git a/mesh2ply_8w.py b/mesh2ply_8w.py new file mode 100644 index 0000000..3256a5f --- /dev/null +++ b/mesh2ply_8w.py @@ -0,0 +1,52 @@ +import open3d as o3d +import numpy as np +import fire + +def obj_to_ply(obj_filename, ply_filename): + """ + Convert .obj file to .ply point cloud using Open3D. + """ + # Read the .obj file + mesh = o3d.io.read_triangle_mesh(obj_filename) + # print(mesh.vertices) + # print(mesh.faces) + + # Check if the mesh contains vertex normals + if not mesh.has_vertex_normals(): + mesh.compute_vertex_normals() + + # Extract point cloud from the mesh + # pcd = mesh.sample_points_uniformly(number_of_points=2000) + pcd = mesh.sample_points_poisson_disk(number_of_points=80000) + + # Save the point cloud to a .ply file + o3d.io.write_point_cloud(ply_filename, pcd) + print(f"Point cloud saved to {ply_filename}") + +# Replace 'path_to_obj.obj' with the path to your .obj file and +# 'output_path.ply' with the desired output path for the .ply file. +# obj_filename = './outputs/magic123-coarse-sd/new.png-a_panda_dancing@20231022-112824/save/it10000-export/model.obj' +# ply_filename = 'a.ply' +if __name__ == '__main__': + # obj_to_ply(obj_filename, ply_filename) + fire.Fire(obj_to_ply) +# def load_obj(filename): +# """ +# Load the OBJ file and return vertices and faces. 
+# """ +# vertices = [] +# faces = [] + +# with open(filename, 'r') as file: +# for line in file: +# if line.startswith('v '): +# vertices.append(list(map(float, line.strip().split()[1:4]))) +# elif line.startswith('f'): +# face = [int(i.split('/')[0]) for i in line.strip().split()[1:]] +# faces.append(face) + +# return np.array(vertices), np.array(faces) + +# res = load_obj(obj_filename) +# print(res[0].shape) +# print(res[1].shape) \ No newline at end of file diff --git a/render_comp_video.py b/render_comp_video.py new file mode 100644 index 0000000..46141ab --- /dev/null +++ b/render_comp_video.py @@ -0,0 +1,231 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# +import imageio +import numpy as np +import torch +from scene.comp_scene import Scene +import os +import cv2 +from tqdm import tqdm +from os import makedirs +from gaussian_renderer.comp_renderer import render +# import torchvision +from utils.general_utils import safe_state +from argparse import ArgumentParser +from arguments import ModelParams, PipelineParams,OptimizationParams, get_combined_args, ModelHiddenParams +from scene.gaussian_model_nogrid import GaussianModel_nogrid as GaussianModel +from time import time +from scipy.spatial.transform import Rotation as R + +def prepare_offset(rotation, translation): + def func(pts): + return (torch.from_numpy(rotation).float().cuda() @ pts.permute(1, 0)).permute(1, 0) + torch.from_numpy(translation).float().cuda() + return func + +def find_rotation_matrix(v1, v2): + """ + Find the rotation matrix that aligns v1 to v2. + + Parameters: + - v1: The initial vector. + - v2: The target vector. + + Returns: + - The rotation matrix that rotates v1 to align with v2. 
+ """ + # Normalize the target vector + if np.linalg.norm(v2) > 1e-3: + v2_normalized = v2 / np.linalg.norm(v2) + else: + v2_normalized = v2 + + # Axis of rotation (cross product of v1 and v2) + axis = np.cross(v1, v2_normalized) + + if np.linalg.norm(axis) < 1e-6: + if np.dot(v1, v2) >= 0: + # The vectors are parallel, no rotation needed + rotation_matrix = np.eye(3) + else: + # The vectors are anti-parallel, rotate 180 degrees around any orthogonal axis + rotation_matrix = R.from_euler('x', 180, degrees=True).as_matrix() + + else: + # Angle of rotation + angle = np.arccos(np.dot(v1, v2_normalized)) + + # Handle the case where the rotation is undefined because the vectors are parallel/anti-parallel + + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + + # Rodrigues' rotation formula components + K = np.array([[0, -axis[2], axis[1]], + [axis[2], 0, -axis[0]], + [-axis[1], axis[0], 0]]) + I = np.identity(3) + + # Rotation matrix + rotation_matrix = I + np.sin(angle) * K + (1 - np.cos(angle)) * np.dot(K, K) + + return rotation_matrix # [3, 3] + +def get_rotation(prev_pos, next_pos): + new_vec = next_pos - prev_pos + canonical = np.array([1, 0, 0]) + return find_rotation_matrix(canonical, new_vec) + + +def query_trajectory(generate_coordinates, t0, fps, frame_num): + # get_location = lambda t: np.array((R * np.sin(2 * np.pi * t * rot_speed), 0, R * np.cos(2 * np.pi * t * rot_speed))) + translation_list = [generate_coordinates(t0 + i * fps) for i in range(frame_num)] + return translation_list + +# def query_trajectory(t0, fps, frame_num): +# R = 0.5 +# rot_speed = 1 / 3 +# get_location = lambda t: np.array((R * np.sin(2 * np.pi * t * rot_speed), 0, R * np.cos(2 * np.pi * t * rot_speed))) +# translation_list = [get_location(t0 + i * fps) for i in range(frame_num)] +# return translation_list + +to8b = lambda x : (255*np.clip(x.cpu().numpy(),0,1)).astype(np.uint8) +def render_set_fixcam(model_path, name, iteration, views, gaussians, pipeline, 
def render_set_fixtime(model_path, name, iteration, views, gaussians, pipeline, background,multiview_video, fname='video_rgb.mp4', func=None, scales=None, pre_scale=False, time_idx=8):
    """Render a fixed-time sweep over the first 100 views and save it as a video.

    For every ``idx`` in ``range(100)`` the camera ``views[idx]['cur_cam']`` is
    rendered at the single time value ``time_idx / 100`` with the object offset
    ``func[time_idx]``; the frames are written to
    ``<model_path>/<name>/ours_<iteration>/<fname>`` at 8 fps.

    ``multiview_video`` is accepted for signature compatibility but is not
    used. ``func`` must be an indexable collection of offset callables even
    though its default is ``None``.
    """
    out_dir = os.path.join(model_path, name, "ours_{}".format(iteration))
    render_path = os.path.join(out_dir, "renders")
    gts_path = os.path.join(out_dir, "gt")

    # Both directories are created, although this function only writes the video.
    makedirs(render_path, exist_ok=True)
    makedirs(gts_path, exist_ok=True)
    render_images = []
    print(len(views))

    fnum = 100
    time1 = time()
    for idx in tqdm(range (fnum)):
        view = views[idx]
        # NOTE(review): time is normalised by fnum (100) here, whereas
        # render_set_fixcam normalises by its own frame count (48) and func
        # holds one entry per trajectory step -- confirm this is intended.
        ww = torch.tensor([time_idx / fnum]).unsqueeze(0)

        rendering = render(view['cur_cam'], gaussians, pipeline, background, time=ww, stage='fine', offset=[lambda x:x, func[time_idx]], scales_list=scales, pre_scale=pre_scale)["render"]
        # Only the 8-bit HWC copy is kept; the raw render tensors are not
        # retained, since nothing read them afterwards.
        render_images.append(to8b(rendering).transpose(1,2,0))
    time2=time()
    # Report throughput for the frames actually rendered (previously this used
    # len(views) - 1, which is unrelated to the number of frames in the loop).
    print("FPS:",len(render_images)/(time2-time1))
    print('Len', len(render_images))
    imageio.mimwrite(os.path.join(out_dir, fname), render_images, fps=8, quality=8)
if __name__ == "__main__":
    # Script entry point: build the argument parser, optionally merge a config
    # file into the parsed arguments, then render the videos.

    # Set up command line argument parser
    parser = ArgumentParser(description="Testing script parameters")
    # Each parameter group is constructed with the shared parser; the groups'
    # values are pulled back out of `args` with .extract() below.
    model = ModelParams(parser)
    op = OptimizationParams(parser)
    pipeline = PipelineParams(parser)
    hyperparam = ModelHiddenParams(parser)
    parser.add_argument("--iteration", default=-1, type=int)
    # --skip_train / --skip_test / --multiview_video are forwarded to
    # render_sets, which currently only acts on --skip_video.
    parser.add_argument("--skip_train", action="store_true")
    parser.add_argument("--skip_test", action="store_true")
    parser.add_argument("--quiet", action="store_true")
    parser.add_argument("--skip_video", action="store_true")
    parser.add_argument('--multiview_video',default=False,action="store_true")
    parser.add_argument("--configs", type=str)
    args = get_combined_args(parser)
    print("Rendering " , args.model_path)
    if args.configs:
        # Imported lazily so mmengine is only required when a config is given.
        # (mmengine.Config is used in place of the older mmcv.Config loader.)
        import mmengine
        from utils.params_utils import merge_hparams
        config = mmengine.Config.fromfile(args.configs)
        # presumably overlays values from the config file onto args -- verify
        # against utils.params_utils.merge_hparams
        args = merge_hparams(args, config)
    # Initialize system state (RNG)
    safe_state(args.quiet)

    render_sets(model.extract(args), hyperparam.extract(args), op.extract(args),args.iteration, pipeline.extract(args), args.skip_train, args.skip_test, args.skip_video,args.multiview_video)
class Scene:
    """Bundle one Gaussian model with its train / test / video camera datasets."""

    gaussians : GaussianModel

    def __init__(self, args : ModelParams, gaussians : GaussianModel, load_iteration=None,shuffle=True, resolution_scales=[1.0], load_coarse=False):
        """Create the camera datasets and initialise or restore the Gaussians.

        :param args: model parameters; ``model_path``, ``frame_num``, ``name``,
            ``rife``, ``static`` and ``cloud_path`` are read here.
        :param gaussians: the Gaussian model to populate.
        :param load_iteration: checkpoint iteration to restore; ``-1`` selects
            the highest iteration found under ``<model_path>/point_cloud``; a
            falsy value initialises from ``args.cloud_path`` instead.

        ``shuffle``, ``resolution_scales`` and ``load_coarse`` are accepted but
        not used by this constructor.
        """
        self.model_path = args.model_path
        self.loaded_iter = None
        self.gaussians = gaussians

        if load_iteration:
            wants_latest = load_iteration == -1
            self.loaded_iter = (
                searchForMaxIteration(os.path.join(self.model_path, "point_cloud"))
                if wants_latest
                else load_iteration
            )
            print("Loading trained model at iteration {}".format(self.loaded_iter))

        self.train_cameras = {}
        self.test_cameras = {}
        self.video_cameras = {}
        self.cameras_extent = 1 # scene_info.nerf_normalization["radius"]

        print("Loading Training Cameras")
        dataset_cls = Text4Ddataset
        shared_kwargs = dict(frame_num=args.frame_num, name=args.name, rife=args.rife, static=args.static)
        self.train_camera = dataset_cls(split='train', **shared_kwargs)
        print("Loading Test Cameras")
        self.maxtime = self.train_camera.pose0_num
        self.test_camera = dataset_cls(split='test', **shared_kwargs)
        print("Loading Video Cameras")
        self.video_cameras = dataset_cls(split='video', **shared_kwargs)

        # Bounding box handed to the deformation grid, when the network has one.
        upper_corner = [2.5, 2.5, 2.5]
        lower_corner = [-2.5, -2.5, -2.5]
        deformation_net = self.gaussians._deformation.deformation_net
        if not deformation_net.no_grid:
            deformation_net.grid.set_aabb(upper_corner, lower_corner)

        if not self.loaded_iter:
            # Fresh start: initialise from the provided point cloud.
            self.gaussians.load_3studio_ply(args.cloud_path, spatial_lr_scale=1, time_line=self.maxtime, step=1, position_scale=1/2.5, load_color=True) ## 4dfy
            return

        # Restore a saved checkpoint.
        checkpoint_dir = os.path.join(self.model_path, "point_cloud", "iteration_" + str(self.loaded_iter))
        self.gaussians.load_ply(os.path.join(checkpoint_dir, "point_cloud.ply"))
        self.gaussians.load_model(checkpoint_dir)

    def save(self, iteration, stage):
        """Write the point cloud and deformation state for ``iteration``.

        The "coarse" stage is stored under ``coarse_iteration_<n>``; every other
        stage under ``iteration_<n>``.
        """
        folder_template = (
            "point_cloud/coarse_iteration_{}" if stage == "coarse" else "point_cloud/iteration_{}"
        )
        point_cloud_path = os.path.join(self.model_path, folder_template.format(iteration))
        self.gaussians.save_ply(os.path.join(point_cloud_path, "point_cloud.ply"))
        self.gaussians.save_deformation(point_cloud_path)

    def getTrainCameras(self, scale=1.0):
        """Return the training dataset (``scale`` is ignored)."""
        return self.train_camera

    def getTestCameras(self, scale=1.0):
        """Return the test dataset (``scale`` is ignored)."""
        return self.test_camera

    def getVideoCameras(self, scale=1.0):
        """Return the video dataset (``scale`` is ignored)."""
        return self.video_cameras
import numpy as np
from scipy.spatial.transform import Rotation as R

import torch


def dot(x, y):
    """Dot product over the last axis, kept as a size-1 axis (numpy or torch)."""
    if isinstance(x, np.ndarray):
        return np.sum(x * y, -1, keepdims=True)
    else:
        return torch.sum(x * y, -1, keepdim=True)


def length(x, eps=1e-20):
    """Euclidean length over the last axis; the squared length is floored at ``eps``."""
    if isinstance(x, np.ndarray):
        return np.sqrt(np.maximum(np.sum(x * x, axis=-1, keepdims=True), eps))
    else:
        return torch.sqrt(torch.clamp(dot(x, x), min=eps))


def safe_normalize(x, eps=1e-20):
    """Divide ``x`` by its length; ``eps`` keeps zero vectors from dividing by zero."""
    return x / length(x, eps)


def look_at(campos, target, opengl=True):
    # campos: [N, 3], camera/eye position
    # target: [N, 3], object to look at
    # return: [N, 3, 3], rotation matrix
    # NOTE(review): for 1-D inputs (as passed by orbit_camera) the stacked
    # vectors become the COLUMNS of the result; for batched [N, 3] inputs
    # axis=1 makes them the ROWS of each matrix -- confirm before using the
    # batched path.
    if not opengl:
        # camera forward aligns with -z
        forward_vector = safe_normalize(target - campos)
        up_vector = np.array([0, 1, 0], dtype=np.float32)
        right_vector = safe_normalize(np.cross(forward_vector, up_vector))
        up_vector = safe_normalize(np.cross(right_vector, forward_vector))
    else:
        # camera forward aligns with +z
        forward_vector = safe_normalize(campos - target)
        up_vector = np.array([0, 1, 0], dtype=np.float32)
        right_vector = safe_normalize(np.cross(up_vector, forward_vector))
        up_vector = safe_normalize(np.cross(forward_vector, right_vector))
    rotation = np.stack([right_vector, up_vector, forward_vector], axis=1)
    return rotation


# elevation & azimuth to pose (cam2world) matrix
def orbit_camera(elevation, azimuth, radius=1, is_degree=True, target=None, opengl=True, jitter=False):
    # radius: scalar
    # elevation: scalar, in (-90, 90), from +y to -y is (-90, 90)
    # azimuth: scalar, in (-180, 180), from +z to +x is (0, 90)
    # target: optional [3] look-at point (defaults to the origin); never modified
    # jitter: add small random noise to the camera position and the target
    # return: [4, 4], camera pose matrix
    if is_degree:
        elevation = np.deg2rad(elevation)
        azimuth = np.deg2rad(azimuth)
    x = radius * np.cos(elevation) * np.sin(azimuth)
    y = - radius * np.sin(elevation)
    z = radius * np.cos(elevation) * np.cos(azimuth)
    if target is None:
        target = np.zeros([3], dtype=np.float32)
    campos = np.array([x, y, z]) + target  # [3]
    T = np.eye(4, dtype=np.float32)
    if jitter:
        # https://github.com/ashawkey/stable-dreamfusion/blob/5550b91862a3af7842bb04875b7f1211e5095a63/nerf/provider.py#L121
        campos += np.random.rand(*campos.shape) * 0.015
        # Build a new array instead of `target += ...`: the in-place form
        # silently modified the array the caller passed in.
        target = target + np.random.randn(*campos.shape) * 0.015
    T[:3, :3] = look_at(campos, target, opengl)
    T[:3, 3] = campos
    return T
class MiniCam:
    """Lightweight camera record built from precomputed transform matrices."""

    def __init__(self, width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform, time):
        # Image size and field of view.
        self.image_width, self.image_height = width, height
        self.FoVy, self.FoVx = fovy, fovx
        # Clipping planes.
        self.znear, self.zfar = znear, zfar
        # Transforms are stored exactly as given.
        self.world_view_transform = world_view_transform
        self.full_proj_transform = full_proj_transform
        # The camera centre is read from the last row of the inverted view matrix.
        inverse_view = torch.inverse(self.world_view_transform)
        self.camera_center = inverse_view[3][:3]
        self.time = time
def qvec2rotmat(qvec):
    """Build the 3x3 rotation matrix for a 4-element quaternion.

    Element 0 of ``qvec`` is used as the scalar part and elements 1..3 as the
    vector part in the formula below.
    """
    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
    row0 = [1 - 2 * y**2 - 2 * z**2,
            2 * x * y - 2 * w * z,
            2 * z * x + 2 * w * y]
    row1 = [2 * x * y + 2 * w * z,
            1 - 2 * x**2 - 2 * z**2,
            2 * y * z - 2 * w * x]
    row2 = [2 * z * x - 2 * w * y,
            2 * y * z + 2 * w * x,
            1 - 2 * x**2 - 2 * y**2]
    return np.array([row0, row1, row2])
def read_points3D_text(path):
    """
    Read 3D points from a text points file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)

    Blank lines and lines starting with "#" are skipped. On every other line,
    whitespace-separated fields 1..3 are parsed as xyz (float), fields 4..6 as
    rgb (int) and field 7 as the error (float); any further fields are ignored.

    :param path: path to the text file.
    :return: ``(xyzs, rgbs, errors)`` with shapes ``(N, 3)``, ``(N, 3)`` and
        ``(N,)``, or ``(None, None, None)`` when the file holds no data lines.
    """
    xyz_rows = []
    rgb_rows = []
    error_values = []
    with open(path, "r") as fid:
        for line in fid:
            line = line.strip()
            if not line or line[0] == "#":
                continue
            elems = line.split()
            xyz_rows.append(tuple(map(float, elems[1:4])))
            rgb_rows.append(tuple(map(int, elems[4:7])))
            error_values.append(float(elems[7]))

    if not xyz_rows:
        return None, None, None

    # Build each array once; growing them with np.append per line re-copied
    # every previously read point on each iteration.
    return np.array(xyz_rows), np.array(rgb_rows), np.array(error_values)
def read_intrinsics_binary(path_to_model_file):
    """
    Read camera intrinsics from a binary cameras file.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)

    :param path_to_model_file: path to the binary file.
    :return: dict mapping camera id to a ``Camera`` record.
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        # The file starts with the camera count.
        (num_cameras,) = read_next_bytes(fid, 8, "Q")
        for _ in range(num_cameras):
            # Fixed-size record: camera id, model id, width, height.
            camera_id, model_id, width, height = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ")
            camera_model = CAMERA_MODEL_IDS[model_id]
            # The parameter count depends on the camera model.
            param_count = camera_model.num_params
            raw_params = read_next_bytes(fid, num_bytes=8*param_count,
                                         format_char_sequence="d"*param_count)
            cameras[camera_id] = Camera(id=camera_id,
                                        model=camera_model.model_name,
                                        width=width,
                                        height=height,
                                        params=np.array(raw_params))
        # Duplicate ids would collapse dict entries and trip this check.
        assert len(cameras) == num_cameras
    return cameras
class Scene:
    """Bundle several Gaussian models (one per point cloud) with the camera datasets."""

    gaussians : GaussianModel

    def __init__(self, args : ModelParams, gaussians : GaussianModel, load_iteration=None,shuffle=True, resolution_scales=[1.0], load_coarse=False):
        """Create the camera datasets and initialise or restore every Gaussian model.

        :param args: model parameters; ``model_path``, ``cloud_path`` (one path
            per object), ``frame_num``, ``name``, ``rife`` and ``static`` are
            read here.
        :param gaussians: iterable of Gaussian models, indexed in step with
            ``args.cloud_path``.
        :param load_iteration: checkpoint iteration to restore; ``-1`` selects
            the highest iteration found under ``<model_path>/point_cloud``; a
            falsy value initialises each model from its point cloud instead.

        ``shuffle``, ``resolution_scales`` and ``load_coarse`` are accepted but
        not used by this constructor.
        """
        self.model_path = args.model_path
        self.loaded_iter = None
        self.gaussians = gaussians
        # File name (with extension) of every object's point cloud.
        self.names = [os.path.basename(cloud) for cloud in args.cloud_path]

        if load_iteration:
            wants_latest = load_iteration == -1
            self.loaded_iter = (
                searchForMaxIteration(os.path.join(self.model_path, "point_cloud"))
                if wants_latest
                else load_iteration
            )
            print("Loading trained model at iteration {}".format(self.loaded_iter))

        self.train_cameras = {}
        self.test_cameras = {}
        self.video_cameras = {}
        self.cameras_extent = 1 # scene_info.nerf_normalization["radius"]

        print("Loading Training Cameras")
        dataset_cls = Text4Ddataset
        shared_kwargs = dict(frame_num=args.frame_num, name=args.name, rife=args.rife, static=args.static)
        self.train_camera = dataset_cls(split='train', **shared_kwargs)
        print("Loading Test Cameras")
        self.maxtime = self.train_camera.pose0_num
        self.test_camera = dataset_cls(split='test', **shared_kwargs)
        print("Loading Video Cameras")
        self.video_cameras = dataset_cls(split='video', **shared_kwargs)

        if self.loaded_iter:
            # Restore: each object has "<name>.ply" plus a sub directory "<name>".
            checkpoint_dir = os.path.join(self.model_path, "point_cloud", "iteration_" + str(self.loaded_iter))
            for idx, model in enumerate(self.gaussians):
                ply_name = self.names[idx]
                model.load_ply(os.path.join(checkpoint_dir, f"{ply_name}"))
                model.load_model(os.path.join(checkpoint_dir, f"{ply_name.replace('.ply', '')}"))
        else:
            # Fresh start: initialise every object from its own point cloud.
            for idx, model in enumerate(self.gaussians):
                model.load_3studio_ply(args.cloud_path[idx], spatial_lr_scale=1, time_line=self.maxtime, pts_num=int(2.5e4), position_scale=1/2.5, load_color=True) ## 4dfy

    def save(self, iteration, stage):
        """Save every model's point cloud and deformation state for ``iteration``.

        ``stage`` is accepted but does not affect the output location.
        """
        # TODO: save offset and rotation for objects
        point_cloud_path = os.path.join(self.model_path, "point_cloud/iteration_{}".format(iteration))
        for idx, model in enumerate(self.gaussians):
            ply_name = self.names[idx]  # comes with .ply
            model.save_ply(os.path.join(point_cloud_path, f"{ply_name}"))
            model.save_deformation(os.path.join(point_cloud_path, f"{ply_name.replace('.ply', '')}"))

    def getTrainCameras(self, scale=1.0):
        """Return the training dataset (``scale`` is ignored)."""
        return self.train_camera

    def getTestCameras(self, scale=1.0):
        """Return the test dataset (``scale`` is ignored)."""
        return self.test_camera

    def getVideoCameras(self, scale=1.0):
        """Return the video dataset (``scale`` is ignored)."""
        return self.video_cameras

    def get_total_points(self):
        """Return the summed point count over all Gaussian models."""
        counts = [model.get_xyz.shape[0] for model in self.gaussians]
        return sum(counts)
def getProjectionMatrix(znear, zfar, fovX, fovY):
    """Build a 4x4 perspective projection matrix.

    Args:
        znear: Near clipping distance.
        zfar: Far clipping distance; must differ from ``znear`` or the
            depth terms divide by zero.
        fovX: Horizontal field of view, fed to ``math.tan`` (radians).
        fovY: Vertical field of view, fed to ``math.tan`` (radians).

    Returns:
        A ``(4, 4)`` tensor in torch's default dtype; every entry that is
        not assigned below stays zero.
    """
    tanHalfFovY = math.tan(fovY / 2)
    tanHalfFovX = math.tan(fovX / 2)

    P = torch.zeros(4, 4)

    z_sign = 1.0

    P[0, 0] = 1 / tanHalfFovX
    P[1, 1] = 1 / tanHalfFovY
    P[3, 2] = z_sign
    P[2, 2] = z_sign * zfar / (zfar - znear)
    P[2, 3] = -(zfar * znear) / (zfar - znear)
    return P


class MiniCam:
    """Minimal camera: image size, FoVs, clip planes and derived transforms."""

    def __init__(self, c2w, width, height, fovy, fovx, znear, zfar):
        """Derive the view / projection transforms from a camera-to-world pose.

        Args:
            c2w: 4x4 camera-to-world pose; should be in NeRF convention.
                Indexed as ``c2w[:3, 3]``, so a NumPy array is expected.
            width: Image width, stored as ``image_width``.
            height: Image height, stored as ``image_height``.
            fovy: Vertical field of view (radians).
            fovx: Horizontal field of view (radians).
            znear: Near clipping distance.
            zfar: Far clipping distance.
        """
        self.image_width = width
        self.image_height = height
        self.FoVy = fovy
        self.FoVx = fovx
        self.znear = znear
        self.zfar = zfar

        # np.linalg.inv returns a fresh array, so the caller's pose is not
        # modified by the in-place sign flips below.
        w2c = np.linalg.inv(c2w)

        # rectify...
        w2c[1:3, :3] *= -1
        w2c[:3, 3] *= -1

        self.projection_matrix = (
            getProjectionMatrix(
                znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
            )
            .transpose(0, 1)
            # .cuda()
        )
        # A float64 pose would give a float64 view matrix, and `@` does not
        # promote dtypes, so the product below would raise. Build the view
        # matrix in the projection matrix's dtype instead; float32 poses are
        # unaffected.
        self.world_view_transform = torch.tensor(
            w2c, dtype=self.projection_matrix.dtype
        ).transpose(0, 1)  # .cuda()
        self.full_proj_transform = self.world_view_transform @ self.projection_matrix
        self.camera_center = -torch.tensor(c2w[:3, 3])  # .cuda()
+# +# For inquiries contact george.drettakis@inria.fr +# + +import os +import sys +from PIL import Image +from typing import NamedTuple +from scene.colmap_loader import read_extrinsics_text, read_intrinsics_text, qvec2rotmat, \ + read_extrinsics_binary, read_intrinsics_binary, read_points3D_binary, read_points3D_text +from scene.hyper_loader import Load_hyper_data, format_hyper_data +import torchvision.transforms as transforms +import copy +from utils.graphics_utils import getWorld2View2, focal2fov, fov2focal +import numpy as np +import torch +import json +from pathlib import Path +from plyfile import PlyData, PlyElement +from utils.sh_utils import SH2RGB +from scene.gaussian_model import BasicPointCloud +from utils.general_utils import PILtoTorch +from tqdm import tqdm +class CameraInfo(NamedTuple): + uid: int + R: np.array + T: np.array + FovY: np.array + FovX: np.array + image: np.array + image_path: str + image_name: str + width: int + height: int + time : float + +class SceneInfo(NamedTuple): + point_cloud: BasicPointCloud + train_cameras: list + test_cameras: list + video_cameras: list + nerf_normalization: dict + ply_path: str + maxtime: int + +def getNerfppNorm(cam_info): + def get_center_and_diag(cam_centers): + cam_centers = np.hstack(cam_centers) + avg_cam_center = np.mean(cam_centers, axis=1, keepdims=True) + center = avg_cam_center + dist = np.linalg.norm(cam_centers - center, axis=0, keepdims=True) + diagonal = np.max(dist) + return center.flatten(), diagonal + + cam_centers = [] + + for cam in cam_info: + W2C = getWorld2View2(cam.R, cam.T) + C2W = np.linalg.inv(W2C) + cam_centers.append(C2W[:3, 3:4]) + + center, diagonal = get_center_and_diag(cam_centers) + radius = diagonal * 1.1 + + translate = -center + + return {"translate": translate, "radius": radius} + +def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder): + cam_infos = [] + for idx, key in enumerate(cam_extrinsics): + sys.stdout.write('\r') + # the exact output you're looking 
for: + sys.stdout.write("Reading camera {}/{}".format(idx+1, len(cam_extrinsics))) + sys.stdout.flush() + + extr = cam_extrinsics[key] + intr = cam_intrinsics[extr.camera_id] + height = intr.height + width = intr.width + + uid = intr.id + R = np.transpose(qvec2rotmat(extr.qvec)) + T = np.array(extr.tvec) + + if intr.model in ["SIMPLE_PINHOLE", "SIMPLE_RADIAL"]: + focal_length_x = intr.params[0] + FovY = focal2fov(focal_length_x, height) + FovX = focal2fov(focal_length_x, width) + elif intr.model=="PINHOLE": + focal_length_x = intr.params[0] + focal_length_y = intr.params[1] + FovY = focal2fov(focal_length_y, height) + FovX = focal2fov(focal_length_x, width) + elif intr.model == "OPENCV": + focal_length_x = intr.params[0] + focal_length_y = intr.params[1] + FovY = focal2fov(focal_length_y, height) + FovX = focal2fov(focal_length_x, width) + else: + assert False, "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!" + + image_path = os.path.join(images_folder, os.path.basename(extr.name)) + image_name = os.path.basename(image_path).split(".")[0] + image = Image.open(image_path) + image = PILtoTorch(image,None) + cam_info = CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image, + image_path=image_path, image_name=image_name, width=width, height=height, + time = 0) + cam_infos.append(cam_info) + sys.stdout.write('\n') + return cam_infos + +def fetchPly(path): + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T + colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0 + normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T + return BasicPointCloud(points=positions, colors=colors, normals=normals) + +def storePly(path, xyz, rgb): + # Define the dtype for the structured array + dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'), + ('red', 
'u1'), ('green', 'u1'), ('blue', 'u1')] + + normals = np.zeros_like(xyz) + + elements = np.empty(xyz.shape[0], dtype=dtype) + attributes = np.concatenate((xyz, normals, rgb), axis=1) + elements[:] = list(map(tuple, attributes)) + + # Create the PlyData object and write to file + vertex_element = PlyElement.describe(elements, 'vertex') + ply_data = PlyData([vertex_element]) + ply_data.write(path) + +def readColmapSceneInfo(path, images, eval, llffhold=8): + try: + cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin") + cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.bin") + cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file) + cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file) + except: + cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.txt") + cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.txt") + cam_extrinsics = read_extrinsics_text(cameras_extrinsic_file) + cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file) + + reading_dir = "images" if images == None else images + cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir)) + cam_infos = sorted(cam_infos_unsorted.copy(), key = lambda x : x.image_name) + + if eval: + train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0] + test_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold == 0] + else: + train_cam_infos = cam_infos + test_cam_infos = [] + + nerf_normalization = getNerfppNorm(train_cam_infos) + + ply_path = os.path.join(path, "sparse/0/points3D.ply") + bin_path = os.path.join(path, "sparse/0/points3D.bin") + txt_path = os.path.join(path, "sparse/0/points3D.txt") + if not os.path.exists(ply_path): + print("Converting point3d.bin to .ply, will happen only the first time you open the scene.") + try: + xyz, rgb, _ = read_points3D_binary(bin_path) + except: + xyz, rgb, _ = 
read_points3D_text(txt_path) + storePly(ply_path, xyz, rgb) + + try: + pcd = fetchPly(ply_path) + + except: + pcd = None + + scene_info = SceneInfo(point_cloud=pcd, + train_cameras=train_cam_infos, + test_cameras=test_cam_infos, + video_cameras=train_cam_infos, + maxtime=0, + nerf_normalization=nerf_normalization, + ply_path=ply_path) + return scene_info +def generateCamerasFromTransforms(path, template_transformsfile, extension, maxtime): + trans_t = lambda t : torch.Tensor([ + [1,0,0,0], + [0,1,0,0], + [0,0,1,t], + [0,0,0,1]]).float() + + rot_phi = lambda phi : torch.Tensor([ + [1,0,0,0], + [0,np.cos(phi),-np.sin(phi),0], + [0,np.sin(phi), np.cos(phi),0], + [0,0,0,1]]).float() + + rot_theta = lambda th : torch.Tensor([ + [np.cos(th),0,-np.sin(th),0], + [0,1,0,0], + [np.sin(th),0, np.cos(th),0], + [0,0,0,1]]).float() + def pose_spherical(theta, phi, radius): + c2w = trans_t(radius) + c2w = rot_phi(phi/180.*np.pi) @ c2w + c2w = rot_theta(theta/180.*np.pi) @ c2w + c2w = torch.Tensor(np.array([[-1,0,0,0],[0,0,1,0],[0,1,0,0],[0,0,0,1]])) @ c2w + return c2w + cam_infos = [] + # generate render poses and times + render_poses = torch.stack([pose_spherical(angle, -30.0, 4.0) for angle in np.linspace(-180,180,40+1)[:-1]], 0) + render_times = torch.linspace(0,maxtime,render_poses.shape[0]) + with open(os.path.join(path, template_transformsfile)) as json_file: + template_json = json.load(json_file) + fovx = template_json["camera_angle_x"] + # load a single image to get image info. 
+ for idx, frame in enumerate(template_json["frames"]): + cam_name = os.path.join(path, frame["file_path"] + extension) + image_path = os.path.join(path, cam_name) + image_name = Path(cam_name).stem + image = Image.open(image_path) + im_data = np.array(image.convert("RGBA")) + image = PILtoTorch(image,(800,800)) + break + # format information + for idx, (time, poses) in enumerate(zip(render_times,render_poses)): + time = time/maxtime + matrix = np.linalg.inv(np.array(poses)) + R = -np.transpose(matrix[:3,:3]) + R[:,0] = -R[:,0] + T = -matrix[:3, 3] + fovy = focal2fov(fov2focal(fovx, image.shape[1]), image.shape[2]) + FovY = fovy + FovX = fovx + cam_infos.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image, + image_path=None, image_name=None, width=image.shape[1], height=image.shape[2], + time = time)) + return cam_infos +def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png", mapper = {}): + cam_infos = [] + + with open(os.path.join(path, transformsfile)) as json_file: + contents = json.load(json_file) + fovx = contents["camera_angle_x"] + + frames = contents["frames"] + for idx, frame in enumerate(frames): + cam_name = os.path.join(path, frame["file_path"] + extension) + time = mapper[frame["time"]] + matrix = np.linalg.inv(np.array(frame["transform_matrix"])) + R = -np.transpose(matrix[:3,:3]) + R[:,0] = -R[:,0] + T = -matrix[:3, 3] + + image_path = os.path.join(path, cam_name) + image_name = Path(cam_name).stem + image = Image.open(image_path) + + im_data = np.array(image.convert("RGBA")) + + bg = np.array([1,1,1]) if white_background else np.array([0, 0, 0]) + + norm_data = im_data / 255.0 + arr = norm_data[:,:,:3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4]) + image = Image.fromarray(np.array(arr*255.0, dtype=np.byte), "RGB") + image = PILtoTorch(image,(800,800)) + fovy = focal2fov(fov2focal(fovx, image.shape[1]), image.shape[2]) + FovY = fovy + FovX = fovx + + cam_infos.append(CameraInfo(uid=idx, 
def read_timeline(path):
    """Collect a scene's frame timestamps and normalize them by the maximum.

    Reads ``transforms_train.json`` and ``transforms_test.json`` under
    *path* and gathers the ``"time"`` value of every entry in ``"frames"``.

    Args:
        path: Directory containing the two transforms files.

    Returns:
        ``(timestamp_mapper, max_time_float)``: the mapper sends each
        distinct raw timestamp (inserted in ascending order) to
        ``timestamp / max_time_float``; ``max_time_float`` is the largest
        raw timestamp.

    Raises:
        ValueError: If neither file lists any frame.
    """
    time_line = set()
    for transforms_name in ("transforms_train.json", "transforms_test.json"):
        with open(os.path.join(path, transforms_name)) as json_file:
            frames = json.load(json_file)["frames"]
        time_line.update(frame["time"] for frame in frames)

    if not time_line:
        raise ValueError(f"no frames with a 'time' entry found under {path!r}")

    max_time_float = max(time_line)
    timestamp_mapper = {}
    for stamp in sorted(time_line):
        # A static scene has every timestamp equal to 0; map it to 0.0
        # instead of dividing by zero.
        timestamp_mapper[stamp] = stamp / max_time_float if max_time_float else 0.0

    return timestamp_mapper, max_time_float
def format_infos(dataset,split):
    """Build one ``CameraInfo`` per dataset entry for the ``"train"`` split.

    Args:
        dataset: Indexable dataset exposing ``image_times``, ``focal`` and
            ``load_pose(idx)``; ``dataset[0][0]`` is used as the reference
            image for every camera.
        split: Only ``"train"`` produces cameras; any other value returns
            an empty list.

    Returns:
        List of ``CameraInfo`` records, one per index of *dataset*.
    """
    # loading
    cameras = []
    # assumes the image is laid out (C, H, W), matching the width/height
    # passed to CameraInfo below -- TODO confirm against the dataset class
    image = dataset[0][0]
    if split == "train":
        for idx in tqdm(range(len(dataset))):
            image_path = None
            image_name = f"{idx}"
            time = dataset.image_times[idx]
            # matrix = np.linalg.inv(np.array(pose))
            R,T = dataset.load_pose(idx)
            # Horizontal FoV comes from the width (shape[2]) and vertical
            # FoV from the height (shape[1]), in line with the width/height
            # arguments below and with format_render_poses.
            FovX = focal2fov(dataset.focal[0], image.shape[2])
            FovY = focal2fov(dataset.focal[0], image.shape[1])
            cameras.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
                                image_path=image_path, image_name=image_name, width=image.shape[2], height=image.shape[1],
                                time = time))

    return cameras
maxtime=max_time + ) + + return scene_info +def format_render_poses(poses,data_infos): + cameras = [] + tensor_to_pil = transforms.ToPILImage() + len_poses = len(poses) + times = [i/len_poses for i in range(len_poses)] + image = data_infos[0][0] + for idx, p in tqdm(enumerate(poses)): + # image = None + image_path = None + image_name = f"{idx}" + time = times[idx] + pose = np.eye(4) + pose[:3,:] = p[:3,:] + # matrix = np.linalg.inv(np.array(pose)) + R = pose[:3,:3] + R = - R + R[:,0] = -R[:,0] + T = -pose[:3,3].dot(R) + FovX = focal2fov(data_infos.focal[0], image.shape[2]) + FovY = focal2fov(data_infos.focal[0], image.shape[1]) + cameras.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image, + image_path=image_path, image_name=image_name, width=image.shape[2], height=image.shape[1], + time = time)) + return cameras + + +def readdynerfInfo(datadir,use_bg_points,eval): + # loading all the data follow hexplane format + ply_path = os.path.join(datadir, "points3d.ply") + + from scene.neural_3D_dataset_NDC import Neural3D_NDC_Dataset + train_dataset = Neural3D_NDC_Dataset( + datadir, + "train", + 1.0, + time_scale=1, + scene_bbox_min=[-2.5, -2.0, -1.0], + scene_bbox_max=[2.5, 2.0, 1.0], + eval_index=0, + ) + test_dataset = Neural3D_NDC_Dataset( + datadir, + "test", + 1.0, + time_scale=1, + scene_bbox_min=[-2.5, -2.0, -1.0], + scene_bbox_max=[2.5, 2.0, 1.0], + eval_index=0, + ) + train_cam_infos = format_infos(train_dataset,"train") + + # test_cam_infos = format_infos(test_dataset,"test") + val_cam_infos = format_render_poses(test_dataset.val_poses,test_dataset) + nerf_normalization = getNerfppNorm(train_cam_infos) + # create pcd + # if not os.path.exists(ply_path): + # Since this data set has no colmap data, we start with random points + num_pts = 2000 + print(f"Generating random point cloud ({num_pts})...") + threshold = 3 + # xyz_max = np.array([1.5*threshold, 1.5*threshold, 1.5*threshold]) + # xyz_min = np.array([-1.5*threshold, -1.5*threshold, 
-3*threshold]) + xyz_max = np.array([1.5*threshold, 1.5*threshold, 1.5*threshold]) + xyz_min = np.array([-1.5*threshold, -1.5*threshold, -1.5*threshold]) + # We create random points inside the bounds of the synthetic Blender scenes + xyz = (np.random.random((num_pts, 3)))* (xyz_max-xyz_min) + xyz_min + print("point cloud initialization:",xyz.max(axis=0),xyz.min(axis=0)) + shs = np.random.random((num_pts, 3)) / 255.0 + pcd = BasicPointCloud(points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3))) + storePly(ply_path, xyz, SH2RGB(shs) * 255) + try: + # xyz = np.load + pcd = fetchPly(ply_path) + except: + pcd = None + scene_info = SceneInfo(point_cloud=pcd, + train_cameras=train_dataset, + test_cameras=test_dataset, + video_cameras=val_cam_infos, + nerf_normalization=nerf_normalization, + ply_path=ply_path, + maxtime=300 + ) + return scene_info +sceneLoadTypeCallbacks = { + "Colmap": readColmapSceneInfo, + "Blender" : readNerfSyntheticInfo, + "dynerf" : readdynerfInfo, + "nerfies": readHyperDataInfos, # NeRFies & HyperNeRF dataset proposed by [https://github.com/google/hypernerf/releases/tag/v0.1] +} diff --git a/scene/deformation.py b/scene/deformation.py new file mode 100644 index 0000000..ee63b94 --- /dev/null +++ b/scene/deformation.py @@ -0,0 +1,260 @@ +import functools +import math +import os +import time +# from tkinter import W + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.cpp_extension import load +import torch.nn.init as init +from collections import OrderedDict +from scene.hexplane import HexPlaneField + +class Deformation(nn.Module): + def __init__(self, D=8, W=256, input_ch=27, input_ch_time=9, skips=[], args=None): + super(Deformation, self).__init__() + self.D = D + self.W = W + self.input_ch = input_ch + self.input_ch_time = input_ch_time + self.skips = skips + self.grid_merge = args.grid_merge + + self.no_grid = args.no_grid + self.grid = HexPlaneField(args.bounds, 
args.kplanes_config, args.multires, grid_merge=args.grid_merge) + self.pos_deform, self.scales_deform, self.rotations_deform, self.opacity_deform ,self.color_deform= self.create_net() + # self.pos_deform.fc1.weight.data.zero_() + # self.pos_deform.fc1.bias.data.zero_() + # self.scales_deform.fc1.weight.data.zero_() + # self.scales_deform.fc1.bias.data.zero_() + # self.rotations_deform.fc1.weight.data.zero_() + # self.rotations_deform.fc1.bias.data.zero_() + # self.opacity_deform.fc1.weight.data.zero_() + # self.opacity_deform.fc1.bias.data.zero_() + # self.color_deform.fc1.weight.data.zero_() + # self.color_deform.fc1.bias.data.zero_() + + self.args = args + def create_net(self): + + mlp_out_dim = 0 + if self.no_grid: + self.feature_out = [nn.Linear(4,self.W)] + else: + if self.grid_merge == 'cat': + self.feature_out = [nn.Linear(mlp_out_dim + self.grid.feat_dim * 6, self.W)] + else: + self.feature_out = [nn.Linear(mlp_out_dim + self.grid.feat_dim, self.W)] + + for i in range(self.D-1): + self.feature_out.append(nn.ReLU()) + self.feature_out.append(nn.Linear(self.W,self.W)) + self.feature_out = nn.Sequential(*self.feature_out) + output_dim = self.W + # pose, scale, rotation, opacity + return \ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 3)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 3) + ),\ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 1)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 1) + ),\ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 4)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 4) + ), \ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, 
self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 1)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 1) + ),\ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 3)), + ('act4',nn.Tanh()) + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 1) + ) + + def query_time(self, rays_pts_emb, scales_emb, rotations_emb, time_emb): + + if self.no_grid: + h = torch.cat([rays_pts_emb[:,:3],time_emb[:,:1]],-1) + else: + grid_feature = self.grid(rays_pts_emb[:,:3], time_emb[:,:1]) + + h = grid_feature + + h = self.feature_out(h) + + return h + + def forward(self, rays_pts_emb, scales_emb=None, rotations_emb=None, opacity = None,color=None, time_emb=None): + # if time_emb.sum() == 0: + # # if time_emb is None: + # return self.forward_static(rays_pts_emb[:,:3], scales_emb, rotations_emb, opacity, time_emb) + # # return self.forward_static(rays_pts_emb[:,:3]) + # else: + return self.forward_dynamic(rays_pts_emb, scales_emb, rotations_emb, opacity, color,time_emb) + + def forward_static(self, pts, scales, rotations, opacity, time): + # def forward_static(self, rays_pts_emb): + return pts, scales, rotations, opacity + # print('??????? 
forward_static') + # grid_feature = self.grid(rays_pts_emb[:,:3]) + # dx = self.static_mlp(grid_feature) + # return rays_pts_emb[:, :3] + dx + def forward_dynamic(self,rays_pts_emb, scales_emb, rotations_emb, opacity_emb,color_emb, time_emb): + hidden = self.query_time(rays_pts_emb, scales_emb, rotations_emb, time_emb).float() + dx = self.pos_deform(hidden) + # dx = F.sigmoid(dx) * 2 - 1 # [make sure range is (-1, 1)] + dx = F.tanh(dx / 0.5) * 0.5 + # dx = dx * (time_emb ** 0.35) + pts = rays_pts_emb[:, :3] + dx + # print('deformation', rays_pts_emb[:, :3].shape, dx.shape) + # print('dx', dx.min(), dx.max(), dx.mean()) + # print(scales_emb.shape, rotations_emb.shape, opacity_emb.shape, time_emb.shape) + # print('no_ds', self.args.no_ds, self.args.no_dr, self.args.no_do) + if self.args.no_ds: + scales = scales_emb[:,:3] + else: + ds = self.scales_deform(hidden) + scales = scales_emb[:,:3] + ds + + if self.args.no_dr: + rotations = rotations_emb[:,:4] + else: + #print('dr======================================') + dr = self.rotations_deform(hidden) #[40000, 4] + rotations = rotations_emb[:,:4] + dr #([40000, 3]+[40000, 4]=[40000, 4] + # print('rotations_emb[:,:3] shape===',rotations_emb[:,:3].shape) + # print('dr shape=======',dr.shape) + # print('rotations shape=======',rotations.shape) + + if self.args.no_do: + opacity = opacity_emb[:,:1] + else: + do = self.opacity_deform(hidden) + opacity = opacity_emb[:,:1] + do + + if self.args.no_dc: + # print('no dc======================================') + color=color_emb[:,:3] + else: + # print('dc======================================') + # print('hidden shape=======',hidden.shape) + dc = self.color_deform(hidden) #[40000, 256]->[40000, 3] + color = color_emb[:,:3] + dc #[40000, 3]+[40000, 3] + # print('color_emb[:,:3] shape===',color_emb[:,:3].shape) + # print('dc shape=======',dc.shape) + # print('color shape=======',color.shape) + # hidden shape======= torch.Size([40000, 256]) [11/11 13:55:16] + # color_emb[:,:3] 
shape=== torch.Size([40000, 1, 3]) [11/11 13:55:16] + # dc shape======= torch.Size([40000, 3]) [11/11 13:55:16] + # color shape======= torch.Size([40000, 40000, 3]) [11/11 13:55:16] + # color_final shape torch.Size([40000, 1, 3]) [11/11 13:55:16] + # color_deform shape torch.Size([40000, 40000, 3]) [11/11 13:55:16] + # deformation_point shape torch.Size([40000]) [11/11 13:55:16] + # + do + # print("deformation value:","pts:",torch.abs(dx).mean(),"rotation:",torch.abs(dr).mean()) + + return pts, scales, rotations, opacity,color + def get_mlp_parameters(self): + parameter_list = [] + for name, param in self.named_parameters(): + if "grid" not in name: + parameter_list.append(param) + return parameter_list + def get_grid_parameters(self): + return list(self.grid.parameters() ) + # + list(self.timegrid.parameters()) +class deform_network(nn.Module): + def __init__(self, args) : + super(deform_network, self).__init__() + net_width = args.net_width + timebase_pe = args.timebase_pe + defor_depth= args.defor_depth + posbase_pe= args.posebase_pe + scale_rotation_pe = args.scale_rotation_pe + opacity_pe = args.opacity_pe + timenet_width = args.timenet_width + timenet_output = args.timenet_output + times_ch = 2*timebase_pe+1 + # self.timenet = nn.Sequential( + # nn.Linear(times_ch, timenet_width), nn.ReLU(), + # nn.Linear(timenet_width, timenet_output)) + self.deformation_net = Deformation(W=net_width, D=defor_depth, input_ch=(4+3)+((4+3)*scale_rotation_pe)*2, input_ch_time=timenet_output, args=args) + # self.register_buffer('time_poc', torch.FloatTensor([(2**i) for i in range(timebase_pe)])) + self.register_buffer('pos_poc', torch.FloatTensor([(2**i) for i in range(posbase_pe)])) + self.register_buffer('rotation_scaling_poc', torch.FloatTensor([(2**i) for i in range(scale_rotation_pe)])) + self.register_buffer('opacity_poc', torch.FloatTensor([(2**i) for i in range(opacity_pe)])) + self.apply(initialize_weights) + # print(self) + + def forward(self, point, scales=None, 
def initialize_weights(m):
    """Xavier-initialize the weight of an ``nn.Linear`` module.

    Intended for ``module.apply(initialize_weights)``; modules that are not
    ``nn.Linear`` are left untouched.

    Args:
        m: Any module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        # init.constant_(m.weight, 0)
        init.xavier_uniform_(m.weight,gain=1)
        # The bias keeps its PyTorch default initialization. The earlier
        # `if m.bias is not None` branch only re-drew the weights a second
        # time and never touched the bias, so it has been dropped.
        # init.constant_(m.bias, 0)
def __init__(self, basis_function='sin_cos'): + super().__init__() + self.func = basis_function + + L = 4 # 32 dim in total, same as AYG + freq_bands = 2.**(np.linspace(0, L-1, L)) + self.freq_bands = freq_bands * math.pi + + def __call__(self, p): + if self.func == 'sin_cos': + out = [] + p = p / 3.0 # change to the range [-1, 1] + for freq in self.freq_bands: + out.append(torch.sin(freq * p)) + out.append(torch.cos(freq * p)) + p = torch.cat(out, dim=-1) + return p + + +class Deformation(nn.Module): + def __init__(self, D=8, W=256, input_ch=27, input_ch_time=9, skips=[], args=None): + super(Deformation, self).__init__() + self.D = D + self.W = W + self.input_ch = input_ch + self.input_ch_time = input_ch_time + self.skips = skips + self.grid_merge = args.grid_merge + + self.no_grid = True + self.pe = positional_encoding() + # self.grid = HexPlaneField(args.bounds, args.kplanes_config, args.multires, grid_merge=args.grid_merge) + self.pos_deform, self.scales_deform, self.rotations_deform, self.opacity_deform ,self.color_deform= self.create_net() + # self.pos_deform.fc1.weight.data.zero_() + # self.pos_deform.fc1.bias.data.zero_() + # self.scales_deform.fc1.weight.data.zero_() + # self.scales_deform.fc1.bias.data.zero_() + # self.rotations_deform.fc1.weight.data.zero_() + # self.rotations_deform.fc1.bias.data.zero_() + # self.opacity_deform.fc1.weight.data.zero_() + # self.opacity_deform.fc1.bias.data.zero_() + # self.color_deform.fc1.weight.data.zero_() + # self.color_deform.fc1.bias.data.zero_() + + self.args = args + def create_net(self): + + mlp_out_dim = 0 + if self.no_grid: + self.feature_out = [nn.Linear(32,self.W)] + else: + raise NotImplementedError + if self.grid_merge == 'cat': + self.feature_out = [nn.Linear(mlp_out_dim + self.grid.feat_dim * 6, self.W)] + else: + self.feature_out = [nn.Linear(mlp_out_dim + self.grid.feat_dim, self.W)] + + for i in range(self.D-1): + if i % 2 == 0: + self.feature_out.append(nn.LayerNorm(self.W)) + 
self.feature_out.append(nn.ReLU()) + self.feature_out.append(nn.Linear(self.W,self.W)) + self.feature_out = nn.Sequential(*self.feature_out) + output_dim = self.W + # pose, scale, rotation, opacity + return \ + nn.Sequential( + OrderedDict([ + # ('act0', nn.ReLU()), + # ('fc2', nn.Linear(self.W, self.W)), + ('norm', nn.LayerNorm(self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 3)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 3) + ),\ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 1)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 1) + ),\ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 4)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 4) + ), \ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 1)), + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 1) + ),\ + nn.Sequential( + OrderedDict([ + ('act0', nn.ReLU()), + ('fc2', nn.Linear(self.W, self.W)), + ('act3', nn.ReLU()), + ('fc1', nn.Linear(self.W, 3)), + ('act4',nn.Tanh()) + ]) + # nn.ReLU(),nn.Linear(self.W,self.W),nn.ReLU(),nn.Linear(self.W, 1) + ) + + def query_time(self, rays_pts_emb, scales_emb, rotations_emb, time_emb): + + if self.no_grid: + h = torch.cat([rays_pts_emb[:,:3],time_emb[:,:1] * 3],-1) + else: + raise NotImplementedError + grid_feature = self.grid(rays_pts_emb[:,:3], time_emb[:,:1]) + + h = grid_feature + + # print('input feature for grid', h.shape) + h = self.pe(h) + # print('position embedding', h.shape) + h = self.feature_out(h) + # print('embedded shared feature', h.shape) + + return h + + def forward(self, rays_pts_emb, scales_emb=None, rotations_emb=None, opacity = None,color=None, 
time_emb=None): + # if time_emb.sum() == 0: + # # if time_emb is None: + # return self.forward_static(rays_pts_emb[:,:3], scales_emb, rotations_emb, opacity, time_emb) + # # return self.forward_static(rays_pts_emb[:,:3]) + # else: + return self.forward_dynamic(rays_pts_emb, scales_emb, rotations_emb, opacity, color,time_emb) + + def forward_static(self, pts, scales, rotations, opacity, time): + # def forward_static(self, rays_pts_emb): + return pts, scales, rotations, opacity + # print('??????? forward_static') + # grid_feature = self.grid(rays_pts_emb[:,:3]) + # dx = self.static_mlp(grid_feature) + # return rays_pts_emb[:, :3] + dx + def forward_dynamic(self,rays_pts_emb, scales_emb, rotations_emb, opacity_emb,color_emb, time_emb): + hidden = self.query_time(rays_pts_emb, scales_emb, rotations_emb, time_emb).float() + dx = self.pos_deform(hidden) + # dx = F.sigmoid(dx) * 2 - 1 # [make sure range is (-1, 1)] + dx = F.tanh(dx / 0.5) * 0.2 + # dx = F.tanh(dx / 0.5) * 0.5 + # dx = dx * (time_emb ** 0.35) + pts = rays_pts_emb[:, :3] + dx + # print('deformation', rays_pts_emb[:, :3].shape, dx.shape) + # print('dx', dx.min(), dx.max(), dx.mean()) + # print(scales_emb.shape, rotations_emb.shape, opacity_emb.shape, time_emb.shape) + # print('no_ds', self.args.no_ds, self.args.no_dr, self.args.no_do) + if self.args.no_ds: + scales = scales_emb[:,:3] + else: + ds = self.scales_deform(hidden) + scales = scales_emb[:,:3] + ds + + if self.args.no_dr: + rotations = rotations_emb[:,:4] + else: + #print('dr======================================') + dr = self.rotations_deform(hidden) #[40000, 4] + rotations = rotations_emb[:,:4] + dr #([40000, 3]+[40000, 4]=[40000, 4] + # print('rotations_emb[:,:3] shape===',rotations_emb[:,:3].shape) + # print('dr shape=======',dr.shape) + # print('rotations shape=======',rotations.shape) + + if self.args.no_do: + opacity = opacity_emb[:,:1] + else: + do = self.opacity_deform(hidden) + opacity = opacity_emb[:,:1] + do + + if self.args.no_dc: + # 
print('no dc======================================') + color=color_emb[:,:3] + else: + # print('dc======================================') + # print('hidden shape=======',hidden.shape) + dc = self.color_deform(hidden) #[40000, 256]->[40000, 3] + color = color_emb[:,:3] + dc #[40000, 3]+[40000, 3] + # print('color_emb[:,:3] shape===',color_emb[:,:3].shape) + # print('dc shape=======',dc.shape) + # print('color shape=======',color.shape) + # hidden shape======= torch.Size([40000, 256]) [11/11 13:55:16] + # color_emb[:,:3] shape=== torch.Size([40000, 1, 3]) [11/11 13:55:16] + # dc shape======= torch.Size([40000, 3]) [11/11 13:55:16] + # color shape======= torch.Size([40000, 40000, 3]) [11/11 13:55:16] + # color_final shape torch.Size([40000, 1, 3]) [11/11 13:55:16] + # color_deform shape torch.Size([40000, 40000, 3]) [11/11 13:55:16] + # deformation_point shape torch.Size([40000]) [11/11 13:55:16] + # + do + # print("deformation value:","pts:",torch.abs(dx).mean(),"rotation:",torch.abs(dr).mean()) + + return pts, scales, rotations, opacity,color + def get_mlp_parameters(self): + parameter_list = [] + for name, param in self.named_parameters(): + if "grid" not in name: + parameter_list.append(param) + return parameter_list + def get_grid_parameters(self): + return list(self.grid.parameters() ) + # + list(self.timegrid.parameters()) +class deform_network(nn.Module): + def __init__(self, args) : + super(deform_network, self).__init__() + net_width = args.net_width + timebase_pe = args.timebase_pe + defor_depth= args.defor_depth + posbase_pe= args.posebase_pe + scale_rotation_pe = args.scale_rotation_pe + opacity_pe = args.opacity_pe + timenet_width = args.timenet_width + timenet_output = args.timenet_output + times_ch = 2*timebase_pe+1 + # self.timenet = nn.Sequential( + # nn.Linear(times_ch, timenet_width), nn.ReLU(), + # nn.Linear(timenet_width, timenet_output)) + self.deformation_net = Deformation(W=net_width, D=defor_depth, input_ch=(4+3)+((4+3)*scale_rotation_pe)*2, 
input_ch_time=timenet_output, args=args) + # self.register_buffer('time_poc', torch.FloatTensor([(2**i) for i in range(timebase_pe)])) + self.register_buffer('pos_poc', torch.FloatTensor([(2**i) for i in range(posbase_pe)])) + self.register_buffer('rotation_scaling_poc', torch.FloatTensor([(2**i) for i in range(scale_rotation_pe)])) + self.register_buffer('opacity_poc', torch.FloatTensor([(2**i) for i in range(opacity_pe)])) + # self.apply(initialize_weights) + # print(self) + nn.init.constant_(self.deformation_net.pos_deform.fc1.weight, 0) + nn.init.constant_(self.deformation_net.pos_deform.fc1.bias, 0) + + def forward(self, point, scales=None, rotations=None, opacity=None,color=None, times_sel=None): + # raise NotImplementedError + # print('>>>>> time', times_sel) + if times_sel is not None: + means3D_, scales_, rotations_, opacity_ ,color_= self.forward_dynamic(point, scales, rotations, opacity,color, times_sel) + # return means3D_, scales, rotations, opacity + return means3D_, scales_, rotations_, opacity_,color_ + else: + raise NotImplementedError + return self.forward_static(point) + + + def forward_static(self, points): + points = self.deformation_net(points) + return points + def forward_dynamic(self, point, scales=None, rotations=None, opacity=None,color=None, times_sel=None): + # times_emb = poc_fre(times_sel, self.time_poc) + + means3D, scales, rotations, opacity,color = self.deformation_net( point, + scales, + rotations, + opacity, + color, + # times_feature, + times_sel) + return means3D, scales, rotations, opacity,color + def get_mlp_parameters(self): + return self.deformation_net.get_mlp_parameters() + # + list(self.timenet.parameters()) + def get_grid_parameters(self): + return self.deformation_net.get_grid_parameters() + +# def initialize_weights(m): +# # pass +# self.pos_deform + # if isinstance(m, nn.Linear): + # # init.constant_(m.weight, 0) + # init.xavier_uniform_(m.weight,gain=1) + # if m.bias is not None: + # 
init.xavier_uniform_(m.weight,gain=1) + # # init.constant_(m.bias, 0) \ No newline at end of file diff --git a/scene/gaussian_model.py b/scene/gaussian_model.py new file mode 100644 index 0000000..e033f8b --- /dev/null +++ b/scene/gaussian_model.py @@ -0,0 +1,1833 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# + +import torch +import numpy as np +from utils.general_utils import inverse_sigmoid, get_expon_lr_func, build_rotation +from torch import nn +import os +from utils.system_utils import mkdir_p +from plyfile import PlyData, PlyElement +from random import randint +from utils.sh_utils import RGB2SH +from simple_knn._C import distCUDA2 +from utils.graphics_utils import BasicPointCloud +from utils.general_utils import strip_symmetric, build_scaling_rotation +from scene.deformation import deform_network +from scene.regulation import compute_plane_smoothness + + +def sh2rgb(x): + return x * 0.28209479177387814 + 0.5 + +class GaussianModel: + + def setup_functions(self): + def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation): + L = build_scaling_rotation(scaling_modifier * scaling, rotation) + actual_covariance = L @ L.transpose(1, 2) + symm = strip_symmetric(actual_covariance) + return symm + + self.scaling_activation = torch.exp + self.scaling_inverse_activation = torch.log + + self.covariance_activation = build_covariance_from_scaling_rotation + + self.opacity_activation = torch.sigmoid + self.inverse_opacity_activation = inverse_sigmoid + + self.rotation_activation = torch.nn.functional.normalize + + + def __init__(self, sh_degree : int, args): + self.active_sh_degree = 0 + self.max_sh_degree = sh_degree + self._xyz = torch.empty(0) + # self._deformation = torch.empty(0) + self._deformation = 
deform_network(args) + # self.grid = TriPlaneGrid() + self._features_dc = torch.empty(0) + self._features_rest = torch.empty(0) + self._scaling = torch.empty(0) + self._rotation = torch.empty(0) + self._opacity = torch.empty(0) + self.max_radii2D = torch.empty(0) + self.xyz_gradient_accum = torch.empty(0) + self.denom = torch.empty(0) + self.optimizer = None + self.percent_dense = 0 + self.spatial_lr_scale = 0 + self._deformation_table = torch.empty(0) + self.setup_functions() + + def capture(self): + return ( + self.active_sh_degree, + self._xyz, + self._deformation.state_dict(), + self._deformation_table, + # self.grid, + self._features_dc, + self._features_rest, + self._scaling, + self._rotation, + self._opacity, + self.max_radii2D, + self.xyz_gradient_accum, + self.denom, + self.optimizer.state_dict(), + self.spatial_lr_scale, + ) + + def restore(self, model_args, training_args): + (self.active_sh_degree, + self._xyz, + self._deformation_table, + self._deformation, + # self.grid, + self._features_dc, + self._features_rest, + self._scaling, + self._rotation, + self._opacity, + self.max_radii2D, + xyz_gradient_accum, + denom, + opt_dict, + self.spatial_lr_scale) = model_args + self.training_setup(training_args) + self.xyz_gradient_accum = xyz_gradient_accum + self.denom = denom + self.optimizer.load_state_dict(opt_dict) + + @property + def get_scaling(self): + #return self._scaling + + return self.scaling_activation(self._scaling) + + @property + def get_rotation(self): + #return self._rotation + return self.rotation_activation(self._rotation) + + @property + def get_xyz(self): + return self._xyz + + @property + def get_features(self): + features_dc = self._features_dc + features_rest = self._features_rest + return torch.cat((features_dc, features_rest), dim=1) + + @property + def get_features_dc(self): + features_dc = self._features_dc + return features_dc + + @property + def get_features_rest(self): + features_rest = self._features_rest + return features_rest + 
+ + + @property + def get_opacity(self): + return self.opacity_activation(self._opacity) + + def get_covariance(self, scaling_modifier = 1): + return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation) + + def oneupSHdegree(self): + if self.active_sh_degree < self.max_sh_degree: + self.active_sh_degree += 1 + + def load_colmap_ply(self, path, spatial_lr_scale=1, time_line=4): + # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107 + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] + # positions = positions[::2] + print('Loaded points from ply ', positions.shape) + colors = np.zeros_like(positions) + 0.5 + pcd = BasicPointCloud(points=positions, colors=colors, normals=None) + self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line) + + def load_3studio_ply(self, path, spatial_lr_scale=1, time_line=4, step=1, position_scale=1, load_color=True): + # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107 + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['x'], vertices['z'], -vertices['y']]).T # [N, 3] # image dream axis + # positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] # 3studio coord is this + positions = positions[::step] * position_scale + print('Loaded points from ply ', positions.shape) + # positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T # [N, 3] + # positions=np.concatenate([-positions[:,0:1],-positions[:,1:2],-positions[:,2:3]],1)#*train_dataset.scale_factor) + if load_color: + colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0 + else: + colors = np.zeros_like(positions) + 0.5 + + colors = colors[::step] + # normals = 
np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T + pcd = BasicPointCloud(points=positions, colors=colors, normals=None) + self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line) + + + def random_init(self, num_pts, lr=10, radius=1): + phis = np.random.random((num_pts,)) * 2 * np.pi + costheta = np.random.random((num_pts,)) * 2 - 1 + thetas = np.arccos(costheta) + mu = np.random.random((num_pts,)) + radius = radius * np.cbrt(mu) + x = radius * np.sin(thetas) * np.cos(phis) + y = radius * np.sin(thetas) * np.sin(phis) + z = radius * np.cos(thetas) + xyz = np.stack((x, y, z), axis=1) + # xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3 + + shs = np.random.random((num_pts, 3)) / 255.0 + pcd = BasicPointCloud( + points=xyz, colors=sh2rgb(shs), normals=np.zeros((num_pts, 3)) + ) + self.create_from_pcd(pcd, lr, 4) # 4 not used + + + def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float, time_line: int): + self.spatial_lr_scale = spatial_lr_scale + fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda() + fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda()) + features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda() + features[:, :3, 0 ] = fused_color + features[:, 3:, 1:] = 0.0 + + print("Number of points at initialisation : ", fused_point_cloud.shape[0]) + + dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001) + # scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 1) + scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3) + #scales = torch.ones_like(scales ) * 0.03 + rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda") + rots[:, 0] = 1 + + opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda")) + + self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True)) + self._deformation = 
self._deformation.to("cuda") + # self.grid = self.grid.to("cuda") + self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True)) + self._scaling = nn.Parameter(scales.requires_grad_(True)) + self._rotation = nn.Parameter(rots.requires_grad_(True)) + self._opacity = nn.Parameter(opacities.requires_grad_(True)) + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0) + def training_setup(self, training_args): + self.percent_dense = training_args.percent_dense + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda") + + + l = [ + {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"}, + {'params': list(self._deformation.get_mlp_parameters()), 'lr': training_args.deformation_lr_init * self.spatial_lr_scale, "name": "deformation"}, + {'params': list(self._deformation.get_grid_parameters()), 'lr': training_args.grid_lr_init * self.spatial_lr_scale, "name": "grid"}, + {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"}, + {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"}, + {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"}, + {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"}, + {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"} + + ] + + self.optimizer = torch.optim.Adam(l, lr=0.0) + # self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15) + self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale, + 
lr_final=training_args.position_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.position_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + self.deformation_scheduler_args = get_expon_lr_func(lr_init=training_args.deformation_lr_init*self.spatial_lr_scale, + lr_final=training_args.deformation_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.deformation_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + self.grid_scheduler_args = get_expon_lr_func(lr_init=training_args.grid_lr_init*self.spatial_lr_scale, + lr_final=training_args.grid_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.deformation_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + + def update_learning_rate(self, iteration): + ''' Learning rate scheduling per step ''' + for param_group in self.optimizer.param_groups: + if param_group["name"] == "xyz": + lr = self.xyz_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + if "grid" in param_group["name"]: + lr = self.grid_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + elif param_group["name"] == "deformation": + lr = self.deformation_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + + def construct_list_of_attributes(self): + l = ['x', 'y', 'z', 'nx', 'ny', 'nz'] + # All channels except the 3 DC + for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]): + l.append('f_dc_{}'.format(i)) + for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]): + l.append('f_rest_{}'.format(i)) + l.append('opacity') + for i in range(self._scaling.shape[1]): + l.append('scale_{}'.format(i)) + for i in range(self._rotation.shape[1]): + l.append('rot_{}'.format(i)) + return l + # def compute_deformation(self,time): + + # deform = self._deformation[:,:,:time].sum(dim=-1) + # xyz = self._xyz + deform + # return xyz + # def save_ply_dynamic(path): + # for time in range(self._deformation.shape(-1)): + # xyz = 
self.compute_deformation(time) + def load_model(self, path): + print("loading model from exists{}".format(path)) + weight_dict = torch.load(os.path.join(path,"deformation.pth"),map_location="cuda") + self._deformation.load_state_dict(weight_dict) + self._deformation = self._deformation.to("cuda") + self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0) + self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda") + if os.path.exists(os.path.join(path, "deformation_table.pth")): + self._deformation_table = torch.load(os.path.join(path, "deformation_table.pth"),map_location="cuda") + if os.path.exists(os.path.join(path, "deformation_accum.pth")): + self._deformation_accum = torch.load(os.path.join(path, "deformation_accum.pth"),map_location="cuda") + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + # print(self._deformation.deformation_net.grid.) + def save_deformation(self, path): + torch.save(self._deformation.state_dict(),os.path.join(path, "deformation.pth")) + torch.save(self._deformation_table,os.path.join(path, "deformation_table.pth")) + torch.save(self._deformation_accum,os.path.join(path, "deformation_accum.pth")) + def save_ply(self, path): + mkdir_p(os.path.dirname(path)) + + xyz = self._xyz.detach().cpu().numpy() + normals = np.zeros_like(xyz) + f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + opacities = self._opacity.detach().cpu().numpy() + scale = self._scaling.detach().cpu().numpy() + rotation = self._rotation.detach().cpu().numpy() + + dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()] + + elements = np.empty(xyz.shape[0], dtype=dtype_full) + attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1) + elements[:] = list(map(tuple, attributes)) + el = 
PlyElement.describe(elements, 'vertex') + PlyData([el]).write(path) + + def reset_opacity(self): + opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01)) + optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity") + self._opacity = optimizable_tensors["opacity"] + + def load_ply(self, path): + plydata = PlyData.read(path) + + xyz = np.stack((np.asarray(plydata.elements[0]["x"]), + np.asarray(plydata.elements[0]["y"]), + np.asarray(plydata.elements[0]["z"])), axis=1) + opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis] + + features_dc = np.zeros((xyz.shape[0], 3, 1)) + features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"]) + features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"]) + features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"]) + + extra_f_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_")] + extra_f_names = sorted(extra_f_names, key = lambda x: int(x.split('_')[-1])) + assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3 + features_extra = np.zeros((xyz.shape[0], len(extra_f_names))) + for idx, attr_name in enumerate(extra_f_names): + features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name]) + # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC) + features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1)) + + scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")] + scale_names = sorted(scale_names, key = lambda x: int(x.split('_')[-1])) + scales = np.zeros((xyz.shape[0], len(scale_names))) + for idx, attr_name in enumerate(scale_names): + scales[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")] + rot_names = sorted(rot_names, key = lambda x: int(x.split('_')[-1])) + rots = np.zeros((xyz.shape[0], 
len(rot_names))) + for idx, attr_name in enumerate(rot_names): + rots[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True)) + self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True)) + self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True)) + self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True)) + self.active_sh_degree = self.max_sh_degree + + def replace_tensor_to_optimizer(self, tensor, name): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if group["name"] == name: + stored_state = self.optimizer.state.get(group['params'][0], None) + stored_state["exp_avg"] = torch.zeros_like(tensor) + stored_state["exp_avg_sq"] = torch.zeros_like(tensor) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(tensor.requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def _prune_optimizer(self, mask): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"]) > 1: + continue + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + stored_state["exp_avg"] = stored_state["exp_avg"][mask] + stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask] + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True))) + 
self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def prune_points(self, mask): + valid_points_mask = ~mask + optimizable_tensors = self._prune_optimizer(valid_points_mask) + + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + self._deformation_accum = self._deformation_accum[valid_points_mask] + self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask] + self._deformation_table = self._deformation_table[valid_points_mask] + self.denom = self.denom[valid_points_mask] + self.max_radii2D = self.max_radii2D[valid_points_mask] + + def cat_tensors_to_optimizer(self, tensors_dict): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"])>1:continue + assert len(group["params"]) == 1 + extension_tensor = tensors_dict[group["name"]] + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + + stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0) + stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), 
dim=0).requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + + return optimizable_tensors + + def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table): + d = {"xyz": new_xyz, + "f_dc": new_features_dc, + "f_rest": new_features_rest, + "opacity": new_opacities, + "scaling" : new_scaling, + "rotation" : new_rotation, + # "deformation": new_deformation + } + + optimizable_tensors = self.cat_tensors_to_optimizer(d) + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + # self._deformation = optimizable_tensors["deformation"] + + self._deformation_table = torch.cat([self._deformation_table,new_deformation_table],-1) + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0], 3), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + + def densify_and_split(self, grads, grad_threshold, scene_extent, N=2): + n_init_points = self.get_xyz.shape[0] + # Extract points that satisfy the gradient condition + padded_grad = torch.zeros((n_init_points), device="cuda") + padded_grad[:grads.shape[0]] = grads.squeeze() + print('split', padded_grad.mean(), grad_threshold) + selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent) + if not selected_pts_mask.any(): + return + stds = self.get_scaling[selected_pts_mask].repeat(N,1) + means =torch.zeros((stds.size(0), 3),device="cuda") + samples = 
torch.normal(mean=means, std=stds) + rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1) + new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1) + new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N)) + new_rotation = self._rotation[selected_pts_mask].repeat(N,1) + new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1) + new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1) + new_opacity = self._opacity[selected_pts_mask].repeat(N,1) + new_deformation_table = self._deformation_table[selected_pts_mask].repeat(N) + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation, new_deformation_table) + + prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool))) + self.prune_points(prune_filter) + + def densify_and_clone(self, grads, grad_threshold, scene_extent): + # Extract points that satisfy the gradient condition + print('clone', torch.norm(grads, dim=-1).mean(), grad_threshold) + selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent) + + new_xyz = self._xyz[selected_pts_mask] + # - 0.001 * self._xyz.grad[selected_pts_mask] + new_features_dc = self._features_dc[selected_pts_mask] + new_features_rest = self._features_rest[selected_pts_mask] + new_opacities = self._opacity[selected_pts_mask] + new_scaling = self._scaling[selected_pts_mask] + new_rotation = self._rotation[selected_pts_mask] + new_deformation_table = self._deformation_table[selected_pts_mask] + + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table) + def prune(self, max_grad, min_opacity, 
extent, max_screen_size): + prune_mask = (self.get_opacity < min_opacity).squeeze() + # prune_mask_2 = torch.logical_and(self.get_opacity <= inverse_sigmoid(0.101 , dtype=torch.float, device="cuda"), self.get_opacity >= inverse_sigmoid(0.999 , dtype=torch.float, device="cuda")) + # prune_mask = torch.logical_or(prune_mask, prune_mask_2) + # deformation_sum = abs(self._deformation).sum(dim=-1).mean(dim=-1) + # deformation_mask = (deformation_sum < torch.quantile(deformation_sum, torch.tensor([0.5]).to("cuda"))) + # prune_mask = prune_mask & deformation_mask + if max_screen_size: + big_points_vs = self.max_radii2D > max_screen_size + big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent + prune_mask = torch.logical_or(prune_mask, big_points_vs) + + prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws) + self.prune_points(prune_mask) + + torch.cuda.empty_cache() + def densify(self, max_grad, min_opacity, extent, max_screen_size): + grads = self.xyz_gradient_accum / self.denom + grads[grads.isnan()] = 0.0 + + self.densify_and_clone(grads, max_grad, extent) + self.densify_and_split(grads, max_grad, extent) + def standard_constaint(self): + + means3D = self._xyz.detach() + scales = self._scaling.detach() + rotations = self._rotation.detach() + opacity = self._opacity.detach() + color=self._ + time = torch.tensor(0).to("cuda").repeat(means3D.shape[0],1) + means3D_deform, scales_deform, rotations_deform, _ = self._deformation(means3D, scales, rotations, opacity, time) + position_error = (means3D_deform - means3D)**2 + rotation_error = (rotations_deform - rotations)**2 + scaling_erorr = (scales_deform - scales)**2 + return position_error.mean() + rotation_error.mean() + scaling_erorr.mean() + + + def add_densification_stats(self, viewspace_point_tensor, update_filter): + self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor[update_filter,:2], dim=-1, keepdim=True) + self.denom[update_filter] += 1 + 
# NOTE(review): this span was recovered from a whitespace-collapsed patch, so
# the block nesting below is reconstructed from the statements themselves.
# The five functions that precede ``class MLP`` take ``self`` and follow the
# methods of a class whose header lies above this span -- presumably they
# belong inside that class body; confirm against the original file.

# def update_deformation_table(self,threshold):
#     # print("origin deformation point nums:",self._deformation_table.sum())
#     self._deformation_table = torch.gt(self._deformation_accum.max(dim=-1).values/100,threshold)
# NOTE(review): the decorator originally preceded the commented-out function
# above; with that definition commented out it binds to the function below.
@torch.no_grad()
def print_deformation_weight_grad(self):
    """Print gradient statistics for the trainable deformation parameters.

    For every parameter of ``self._deformation`` that requires grad, prints
    ``None`` when no gradient has been populated, or the mean/min/max of the
    gradient when its mean is non-zero.  A separator line is printed last.
    """
    for name, weight in self._deformation.named_parameters():
        if not weight.requires_grad:
            continue
        if weight.grad is None:
            print(name," :",weight.grad)
        elif weight.grad.mean() != 0:
            print(name," :",weight.grad.mean(), weight.grad.min(), weight.grad.max())
    # NOTE(review): assumed to print once after the loop -- the original
    # indentation was lost; confirm.
    print("-"*50)


def _plane_regulation(self):
    """Sum ``compute_plane_smoothness`` over grids 0, 1 and 3 of each level.

    Levels of ``self._deformation.deformation_net.grid.grids`` that contain
    exactly three grids contribute nothing.
    """
    multi_res_grids = self._deformation.deformation_net.grid.grids
    total = 0
    # model.grids is 6 x [1, rank * F_dim, reso, reso]
    for grids in multi_res_grids:
        plane_ids = [] if len(grids) == 3 else [0, 1, 3]
        for grid_id in plane_ids:
            total += compute_plane_smoothness(grids[grid_id])
    return total


def _time_regulation(self):
    """Sum ``compute_plane_smoothness`` over grids 2, 4 and 5 of each level.

    Levels that contain exactly three grids contribute nothing.
    """
    multi_res_grids = self._deformation.deformation_net.grid.grids
    total = 0
    # model.grids is 6 x [1, rank * F_dim, reso, reso]
    for grids in multi_res_grids:
        time_ids = [] if len(grids) == 3 else [2, 4, 5]
        for grid_id in time_ids:
            total += compute_plane_smoothness(grids[grid_id])
    return total


def _l1_regulation(self):
    """Sum the mean of ``|1 - grid|`` over grids 2, 4 and 5 of each level.

    Levels that contain exactly three grids are skipped.
    """
    # model.grids is 6 x [1, rank * F_dim, reso, reso]
    multi_res_grids = self._deformation.deformation_net.grid.grids

    total = 0.0
    for grids in multi_res_grids:
        if len(grids) == 3:
            continue
        # These are the spatiotemporal grids
        for grid_id in (2, 4, 5):
            total += torch.abs(1 - grids[grid_id]).mean()
    return total


def compute_regulation(self, time_smoothness_weight, l1_time_planes_weight, plane_tv_weight):
    """Return the weighted sum of the three grid regularisation terms.

    ``plane_tv_weight`` scales ``_plane_regulation``, ``time_smoothness_weight``
    scales ``_time_regulation`` and ``l1_time_planes_weight`` scales
    ``_l1_regulation``.
    """
    plane_term = plane_tv_weight * self._plane_regulation()
    time_term = time_smoothness_weight * self._time_regulation()
    l1_term = l1_time_planes_weight * self._l1_regulation()
    return plane_term + time_term + l1_term


class MLP(nn.Module):
    """Three-layer perceptron mapping 2 input features to 3 outputs in (0, 1).

    Layers: Linear(2, 32) -> ReLU -> Linear(32, 32) -> ReLU -> Linear(32, 3)
    -> Sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.nn = nn.Sequential(
            nn.Linear(2, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 3),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Apply the sequential network to ``x`` (last dimension must be 2)."""
        return self.nn(x)


class GaussianModel_bg:
    """Gaussian point model that owns a deformation network and an ``MLP``.

    Per-point parameters (position, SH features, scaling, rotation, opacity)
    are stored in raw form and exposed through activation-applying properties.
    ``bg_net`` is an ``MLP`` instance created in ``__init__``; presumably it
    models a background colour -- its use is not visible in this class span.
    """

    def setup_functions(self):
        """Bind the activation functions and their inverses onto the instance."""
        def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation):
            # Covariance is L @ L^T, reduced by strip_symmetric.
            L = build_scaling_rotation(scaling_modifier * scaling, rotation)
            actual_covariance = L @ L.transpose(1, 2)
            symm = strip_symmetric(actual_covariance)
            return symm

        # Scaling is stored in log space.
        self.scaling_activation = torch.exp
        self.scaling_inverse_activation = torch.log

        self.covariance_activation = build_covariance_from_scaling_rotation

        # Opacity is stored as a logit.
        self.opacity_activation = torch.sigmoid
        self.inverse_opacity_activation = inverse_sigmoid

        self.rotation_activation = torch.nn.functional.normalize

    def __init__(self, sh_degree : int, args):
        """Create an empty model.

        Args:
            sh_degree: maximum spherical-harmonics degree; the active degree
                starts at 0 and is raised by ``oneupSHdegree``.
            args: forwarded unchanged to ``deform_network``.
        """
        self.active_sh_degree = 0
        self.max_sh_degree = sh_degree
        self._xyz = torch.empty(0)
        # self._deformation = torch.empty(0)
        self._deformation = deform_network(args)
        self.bg_net = MLP()
        # self.grid = TriPlaneGrid()
        self._features_dc = torch.empty(0)
        self._features_rest = torch.empty(0)
        self._scaling = torch.empty(0)
        self._rotation = torch.empty(0)
        self._opacity = torch.empty(0)
        self.max_radii2D = torch.empty(0)
        self.xyz_gradient_accum = torch.empty(0)
        self.denom = torch.empty(0)
        self.optimizer = None
        self.percent_dense = 0
        self.spatial_lr_scale = 0
        self._deformation_table = torch.empty(0)
        self.setup_functions()

    def capture(self):
        """Return the model state as a tuple; ``restore`` reads the same order.

        The deformation network is captured as its ``state_dict`` and the
        optimizer as its ``state_dict``; ``training_setup`` must therefore
        have been called first so that ``self.optimizer`` exists.
        """
        return (
            self.active_sh_degree,
            self._xyz,
            self._deformation.state_dict(),
            self._deformation_table,
            # self.grid,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            self.optimizer.state_dict(),
            self.spatial_lr_scale,
        )

    def restore(self, model_args, training_args):
        """Load a tuple produced by ``capture`` and rebuild the optimizer.

        Args:
            model_args: the tuple returned by ``capture``.
            training_args: forwarded to ``training_setup``.
        """
        # Unpack in exactly the order capture() emits: the deformation state
        # dict comes before the deformation table.
        (self.active_sh_degree,
         self._xyz,
         deformation_state,
         self._deformation_table,
         # self.grid,
         self._features_dc,
         self._features_rest,
         self._scaling,
         self._rotation,
         self._opacity,
         self.max_radii2D,
         xyz_gradient_accum,
         denom,
         opt_dict,
         self.spatial_lr_scale) = model_args
        # Keep the network module and load its weights; replacing the module
        # by the state dict would break training_setup, which asks the module
        # for its parameters.
        self._deformation.load_state_dict(deformation_state)
        self.training_setup(training_args)
        # training_setup resets the accumulators, so restore them afterwards.
        self.xyz_gradient_accum = xyz_gradient_accum
        self.denom = denom
        self.optimizer.load_state_dict(opt_dict)

    @property
    def get_scaling(self):
        """Scaling after ``scaling_activation`` (``torch.exp``)."""
        #return self._scaling
        return self.scaling_activation(self._scaling)

    @property
    def get_rotation(self):
        """Rotation after ``rotation_activation`` (normalisation)."""
        #return self._rotation
        return self.rotation_activation(self._rotation)

    @property
    def get_xyz(self):
        """Raw point positions."""
        return self._xyz

    @property
    def get_features(self):
        """DC and remaining SH features concatenated along dimension 1."""
        features_dc = self._features_dc
        features_rest = self._features_rest
        return torch.cat((features_dc, features_rest), dim=1)

    @property
    def get_features_dc(self):
        """DC SH features."""
        return self._features_dc

    @property
    def get_features_rest(self):
        """Non-DC SH features."""
        return self._features_rest

    @property
    def get_opacity(self):
        """Opacity after ``opacity_activation`` (``torch.sigmoid``)."""
        return self.opacity_activation(self._opacity)

    def get_covariance(self, scaling_modifier = 1):
        """Return the covariance built from activated scaling and raw rotation."""
        return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation)

    def oneupSHdegree(self):
        """Raise the active SH degree by one, capped at ``max_sh_degree``."""
        if self.active_sh_degree < self.max_sh_degree:
            self.active_sh_degree += 1

    def load_colmap_ply(self, path, spatial_lr_scale=1, time_line=4):
        """Initialise the model from the ``vertex`` element of a PLY file.

        Positions are assembled in (y, z, x) order and every point receives
        the constant colour 0.5.
        """
        # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107
        plydata = PlyData.read(path)
        vertices = plydata['vertex']
        positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T  # [N, 3]
        # positions = positions[::2]
        print('Loaded points from ply ', positions.shape)
        colors = np.zeros_like(positions) + 0.5
        pcd = BasicPointCloud(points=positions, colors=colors, normals=None)
        self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line)

    def load_3studio_ply(self, path, spatial_lr_scale=1, time_line=4, step=1, position_scale=1, load_color=True):
        """Initialise the model from a PLY file, optionally sub-sampled.

        Args:
            path: PLY file to read.
            spatial_lr_scale: forwarded to ``create_from_pcd``.
            time_line: forwarded to ``create_from_pcd``.
            step: keep every ``step``-th vertex.
            position_scale: factor applied to the kept positions.
            load_color: read ``red``/``green``/``blue`` (divided by 255) when
                true, otherwise use the constant colour 0.5.
        """
        # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107
        plydata = PlyData.read(path)
        vertices = plydata['vertex']
        positions = np.vstack([vertices['x'], vertices['z'], -vertices['y']]).T  # [N, 3] # image dream axis
        # positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] # 3studio coord is this
        positions = positions[::step] * position_scale
        print('Loaded points from ply ', positions.shape)
        # positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T # [N, 3]
        # positions=np.concatenate([-positions[:,0:1],-positions[:,1:2],-positions[:,2:3]],1)#*train_dataset.scale_factor)
        if load_color:
            colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0
            # Colours read from the file cover every vertex, so apply the same
            # stride that was applied to the positions.
            colors = colors[::step]
        else:
            # Built from the already strided positions: striding again would
            # leave fewer colours than points.
            colors = np.zeros_like(positions) + 0.5

        # normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T
        pcd = BasicPointCloud(points=positions, colors=colors, normals=None)
        self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line)

    def random_init(self, num_pts, lr=10, radius=1):
        """Initialise the model from ``num_pts`` random points inside a ball.

        Args:
            num_pts: number of points to draw.
            lr: passed to ``create_from_pcd`` as ``spatial_lr_scale``.
            radius: radius of the ball; the cube root of a uniform sample
                scales each point's distance from the origin.
        """
        phis = np.random.random((num_pts,)) * 2 * np.pi
        costheta = np.random.random((num_pts,)) * 2 - 1
        thetas = np.arccos(costheta)
        mu = np.random.random((num_pts,))
        radius = radius * np.cbrt(mu)
        x = radius * np.sin(thetas) * np.cos(phis)
        y = radius * np.sin(thetas) * np.sin(phis)
        z = radius * np.cos(thetas)
        xyz = np.stack((x, y, z), axis=1)
        # xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3

        shs = np.random.random((num_pts, 3)) / 255.0
        pcd = BasicPointCloud(
            points=xyz, colors=sh2rgb(shs), normals=np.zeros((num_pts, 3))
        )
        self.create_from_pcd(pcd, lr, 4)  # 4 not used

    def create_from_pcd(self, pcd : "BasicPointCloud", spatial_lr_scale : float, time_line: int):
        """Create all per-point parameters on the GPU from a point cloud.

        Args:
            pcd: point cloud providing ``points`` and ``colors``.
            spatial_lr_scale: stored on the instance for ``training_setup``.
            time_line: not read by this method.
        """
        self.spatial_lr_scale = spatial_lr_scale
        fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda()
        fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())
        features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda()
        features[:, :3, 0 ] = fused_color
        # features has exactly 3 channels, so this slice is empty.
        features[:, 3:, 1:] = 0.0

        print("Number of points at initialisation : ", fused_point_cloud.shape[0])

        # Clamp so that the log below stays finite.
        dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001)
        # scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 1)
        scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3)
        #scales = torch.ones_like(scales ) * 0.03
        rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
        rots[:, 0] = 1

        # Every point starts at opacity 0.1, stored as a logit.
        opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"))

        self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
        self._deformation = self._deformation.to("cuda")
        # self.grid = self.grid.to("cuda")
        self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 2).contiguous().requires_grad_(True))
        self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True))
        self._scaling = nn.Parameter(scales.requires_grad_(True))
        self._rotation = nn.Parameter(rots.requires_grad_(True))
        self._opacity = nn.Parameter(opacities.requires_grad_(True))
        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
        # All-True boolean table, one entry per point.
        self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0)

    def training_setup(self, training_args):
        """Reset the accumulators and build the Adam optimizer and schedulers.

        Args:
            training_args: object providing ``percent_dense`` and the
                learning-rate settings read below.
        """
        self.percent_dense = training_args.percent_dense
        self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
        self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
        self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda")

        l = [
            {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"},
            {'params': list(self._deformation.get_mlp_parameters()), 'lr': training_args.deformation_lr_init * self.spatial_lr_scale, "name": "deformation"},
            {'params': list(self._deformation.get_grid_parameters()), 'lr': training_args.grid_lr_init * self.spatial_lr_scale, "name": "grid"},
            {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"},
            {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"},
            {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"},
            {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"},
            {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"}
        ]

        # Per-group rates above take precedence over the default lr of 0.
        self.optimizer = torch.optim.Adam(l, lr=0.0)
        # self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
        self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale,
                                                    lr_final=training_args.position_lr_final*self.spatial_lr_scale,
                                                    lr_delay_mult=training_args.position_lr_delay_mult,
                                                    max_steps=training_args.position_lr_max_steps)
        self.deformation_scheduler_args = get_expon_lr_func(lr_init=training_args.deformation_lr_init*self.spatial_lr_scale,
                                                    lr_final=training_args.deformation_lr_final*self.spatial_lr_scale,
                                                    lr_delay_mult=training_args.deformation_lr_delay_mult,
                                                    max_steps=training_args.position_lr_max_steps)
        self.grid_scheduler_args = get_expon_lr_func(lr_init=training_args.grid_lr_init*self.spatial_lr_scale,
                                                    lr_final=training_args.grid_lr_final*self.spatial_lr_scale,
                                                    lr_delay_mult=training_args.deformation_lr_delay_mult,
                                                    max_steps=training_args.position_lr_max_steps)

    def update_learning_rate(self, iteration):
        ''' Learning rate scheduling per step '''
        for param_group in self.optimizer.param_groups:
            group_name = param_group["name"]
            # Only these three groups are scheduled; the others keep the rate
            # given in training_setup.
            if group_name == "xyz":
                param_group['lr'] = self.xyz_scheduler_args(iteration)
            elif "grid" in group_name:
                param_group['lr'] = self.grid_scheduler_args(iteration)
            elif group_name == "deformation":
                param_group['lr'] = self.deformation_scheduler_args(iteration)

    def construct_list_of_attributes(self):
        """Return the per-vertex attribute names in the order they are stored.

        Position and normal names come first, followed by one name per DC
        feature, per remaining feature, ``opacity``, per scaling component
        and per rotation component.
        """
        l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
        # All channels except the 3 DC
        for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]):
            l.append('f_dc_{}'.format(i))
        for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]):
            l.append('f_rest_{}'.format(i))
        l.append('opacity')
        for i in range(self._scaling.shape[1]):
            l.append('scale_{}'.format(i))
        for i in range(self._rotation.shape[1]):
            l.append('rot_{}'.format(i))
        return l

    # def compute_deformation(self,time):

    #     deform = self._deformation[:,:,:time].sum(dim=-1)
    #     xyz = self._xyz + deform
    #     return xyz
    # def save_ply_dynamic(path):
    #     for time in range(self._deformation.shape(-1)):
    #         xyz = self.compute_deformation(time)
    def load_model(self, path):
        """Load deformation weights and optional side tables from ``path``.

        ``deformation.pth`` is required.  ``deformation_table.pth`` and
        ``deformation_accum.pth`` replace the freshly created defaults only
        when the files exist.
        """
        print("loading model from exists{}".format(path))
        weight_dict = torch.load(os.path.join(path,"deformation.pth"),map_location="cuda")
        self._deformation.load_state_dict(weight_dict)
        self._deformation = self._deformation.to("cuda")
        # Defaults sized to the current point count.
        self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0)
        self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda")
        if os.path.exists(os.path.join(path, "deformation_table.pth")):
            self._deformation_table = torch.load(os.path.join(path, "deformation_table.pth"),map_location="cuda")
        if os.path.exists(os.path.join(path, "deformation_accum.pth")):
            self._deformation_accum = torch.load(os.path.join(path, "deformation_accum.pth"),map_location="cuda")
        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
        # print(self._deformation.deformation_net.grid.)
+ def save_deformation(self, path): + torch.save(self._deformation.state_dict(),os.path.join(path, "deformation.pth")) + torch.save(self._deformation_table,os.path.join(path, "deformation_table.pth")) + torch.save(self._deformation_accum,os.path.join(path, "deformation_accum.pth")) + def save_ply(self, path): + mkdir_p(os.path.dirname(path)) + + xyz = self._xyz.detach().cpu().numpy() + normals = np.zeros_like(xyz) + f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + opacities = self._opacity.detach().cpu().numpy() + scale = self._scaling.detach().cpu().numpy() + rotation = self._rotation.detach().cpu().numpy() + + dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()] + + elements = np.empty(xyz.shape[0], dtype=dtype_full) + attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1) + elements[:] = list(map(tuple, attributes)) + el = PlyElement.describe(elements, 'vertex') + PlyData([el]).write(path) + + def reset_opacity(self): + opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01)) + optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity") + self._opacity = optimizable_tensors["opacity"] + + def load_ply(self, path): + plydata = PlyData.read(path) + + xyz = np.stack((np.asarray(plydata.elements[0]["x"]), + np.asarray(plydata.elements[0]["y"]), + np.asarray(plydata.elements[0]["z"])), axis=1) + opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis] + + features_dc = np.zeros((xyz.shape[0], 3, 1)) + features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"]) + features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"]) + features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"]) + + extra_f_names = [p.name for p in plydata.elements[0].properties if 
p.name.startswith("f_rest_")] + extra_f_names = sorted(extra_f_names, key = lambda x: int(x.split('_')[-1])) + assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3 + features_extra = np.zeros((xyz.shape[0], len(extra_f_names))) + for idx, attr_name in enumerate(extra_f_names): + features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name]) + # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC) + features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1)) + + scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")] + scale_names = sorted(scale_names, key = lambda x: int(x.split('_')[-1])) + scales = np.zeros((xyz.shape[0], len(scale_names))) + for idx, attr_name in enumerate(scale_names): + scales[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")] + rot_names = sorted(rot_names, key = lambda x: int(x.split('_')[-1])) + rots = np.zeros((xyz.shape[0], len(rot_names))) + for idx, attr_name in enumerate(rot_names): + rots[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True)) + self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True)) + self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True)) + self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True)) + self.active_sh_degree = self.max_sh_degree + + def replace_tensor_to_optimizer(self, 
tensor, name): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if group["name"] == name: + stored_state = self.optimizer.state.get(group['params'][0], None) + stored_state["exp_avg"] = torch.zeros_like(tensor) + stored_state["exp_avg_sq"] = torch.zeros_like(tensor) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(tensor.requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def _prune_optimizer(self, mask): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"]) > 1: + continue + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + stored_state["exp_avg"] = stored_state["exp_avg"][mask] + stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask] + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True))) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def prune_points(self, mask): + valid_points_mask = ~mask + optimizable_tensors = self._prune_optimizer(valid_points_mask) + + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + self._deformation_accum = self._deformation_accum[valid_points_mask] + self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask] + self._deformation_table = self._deformation_table[valid_points_mask] + 
self.denom = self.denom[valid_points_mask] + self.max_radii2D = self.max_radii2D[valid_points_mask] + + def cat_tensors_to_optimizer(self, tensors_dict): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"])>1:continue + assert len(group["params"]) == 1 + extension_tensor = tensors_dict[group["name"]] + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + + stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0) + stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + + return optimizable_tensors + + def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table): + d = {"xyz": new_xyz, + "f_dc": new_features_dc, + "f_rest": new_features_rest, + "opacity": new_opacities, + "scaling" : new_scaling, + "rotation" : new_rotation, + # "deformation": new_deformation + } + + optimizable_tensors = self.cat_tensors_to_optimizer(d) + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + # self._deformation = optimizable_tensors["deformation"] + + self._deformation_table = 
torch.cat([self._deformation_table,new_deformation_table],-1) + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0], 3), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + + def densify_and_split(self, grads, grad_threshold, scene_extent, N=2): + n_init_points = self.get_xyz.shape[0] + # Extract points that satisfy the gradient condition + padded_grad = torch.zeros((n_init_points), device="cuda") + padded_grad[:grads.shape[0]] = grads.squeeze() + print('split', padded_grad.mean(), grad_threshold) + selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent) + if not selected_pts_mask.any(): + return + stds = self.get_scaling[selected_pts_mask].repeat(N,1) + means =torch.zeros((stds.size(0), 3),device="cuda") + samples = torch.normal(mean=means, std=stds) + rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1) + new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1) + new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N)) + new_rotation = self._rotation[selected_pts_mask].repeat(N,1) + new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1) + new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1) + new_opacity = self._opacity[selected_pts_mask].repeat(N,1) + new_deformation_table = self._deformation_table[selected_pts_mask].repeat(N) + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation, new_deformation_table) + + prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), 
device="cuda", dtype=bool))) + self.prune_points(prune_filter) + + def densify_and_clone(self, grads, grad_threshold, scene_extent): + # Extract points that satisfy the gradient condition + print('clone', torch.norm(grads, dim=-1).mean(), grad_threshold) + selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent) + + new_xyz = self._xyz[selected_pts_mask] + # - 0.001 * self._xyz.grad[selected_pts_mask] + new_features_dc = self._features_dc[selected_pts_mask] + new_features_rest = self._features_rest[selected_pts_mask] + new_opacities = self._opacity[selected_pts_mask] + new_scaling = self._scaling[selected_pts_mask] + new_rotation = self._rotation[selected_pts_mask] + new_deformation_table = self._deformation_table[selected_pts_mask] + + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table) + def prune(self, max_grad, min_opacity, extent, max_screen_size): + prune_mask = (self.get_opacity < min_opacity).squeeze() + # prune_mask_2 = torch.logical_and(self.get_opacity <= inverse_sigmoid(0.101 , dtype=torch.float, device="cuda"), self.get_opacity >= inverse_sigmoid(0.999 , dtype=torch.float, device="cuda")) + # prune_mask = torch.logical_or(prune_mask, prune_mask_2) + # deformation_sum = abs(self._deformation).sum(dim=-1).mean(dim=-1) + # deformation_mask = (deformation_sum < torch.quantile(deformation_sum, torch.tensor([0.5]).to("cuda"))) + # prune_mask = prune_mask & deformation_mask + if max_screen_size: + big_points_vs = self.max_radii2D > max_screen_size + big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent + prune_mask = torch.logical_or(prune_mask, big_points_vs) + + prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws) + self.prune_points(prune_mask) + + 
torch.cuda.empty_cache() + def densify(self, max_grad, min_opacity, extent, max_screen_size): + grads = self.xyz_gradient_accum / self.denom + grads[grads.isnan()] = 0.0 + + self.densify_and_clone(grads, max_grad, extent) + self.densify_and_split(grads, max_grad, extent) + def standard_constaint(self): + + means3D = self._xyz.detach() + scales = self._scaling.detach() + rotations = self._rotation.detach() + opacity = self._opacity.detach() + color=self._ + time = torch.tensor(0).to("cuda").repeat(means3D.shape[0],1) + means3D_deform, scales_deform, rotations_deform, _ = self._deformation(means3D, scales, rotations, opacity, time) + position_error = (means3D_deform - means3D)**2 + rotation_error = (rotations_deform - rotations)**2 + scaling_erorr = (scales_deform - scales)**2 + return position_error.mean() + rotation_error.mean() + scaling_erorr.mean() + + + def add_densification_stats(self, viewspace_point_tensor, update_filter): + self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor[update_filter,:2], dim=-1, keepdim=True) + self.denom[update_filter] += 1 + @torch.no_grad() + # def update_deformation_table(self,threshold): + # # print("origin deformation point nums:",self._deformation_table.sum()) + # self._deformation_table = torch.gt(self._deformation_accum.max(dim=-1).values/100,threshold) + def print_deformation_weight_grad(self): + for name, weight in self._deformation.named_parameters(): + if weight.requires_grad: + if weight.grad is None: + print(name," :",weight.grad) + else: + if weight.grad.mean() != 0: + print(name," :",weight.grad.mean(), weight.grad.min(), weight.grad.max()) + print("-"*50) + def _plane_regulation(self): + multi_res_grids = self._deformation.deformation_net.grid.grids + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids = [0,1,3] + for grid_id in time_grids: + total += compute_plane_smoothness(grids[grid_id]) 
+ return total + def _time_regulation(self): + multi_res_grids = self._deformation.deformation_net.grid.grids + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids =[2, 4, 5] + for grid_id in time_grids: + total += compute_plane_smoothness(grids[grid_id]) + return total + def _l1_regulation(self): + # model.grids is 6 x [1, rank * F_dim, reso, reso] + multi_res_grids = self._deformation.deformation_net.grid.grids + + total = 0.0 + for grids in multi_res_grids: + if len(grids) == 3: + continue + else: + # These are the spatiotemporal grids + spatiotemporal_grids = [2, 4, 5] + for grid_id in spatiotemporal_grids: + total += torch.abs(1 - grids[grid_id]).mean() + return total + def compute_regulation(self, time_smoothness_weight, l1_time_planes_weight, plane_tv_weight): + return plane_tv_weight * self._plane_regulation() + time_smoothness_weight * self._time_regulation() + l1_time_planes_weight * self._l1_regulation() + + +class GaussianModel_nogrid: + + def setup_functions(self): + def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation): + L = build_scaling_rotation(scaling_modifier * scaling, rotation) + actual_covariance = L @ L.transpose(1, 2) + symm = strip_symmetric(actual_covariance) + return symm + + self.scaling_activation = torch.exp + self.scaling_inverse_activation = torch.log + + self.covariance_activation = build_covariance_from_scaling_rotation + + self.opacity_activation = torch.sigmoid + self.inverse_opacity_activation = inverse_sigmoid + + self.rotation_activation = torch.nn.functional.normalize + + + def __init__(self, sh_degree : int, args): + self.active_sh_degree = 0 + self.max_sh_degree = sh_degree + self._xyz = torch.empty(0) + # self._deformation = torch.empty(0) + self._deformation = deform_network(args) + # self.grid = TriPlaneGrid() + self._features_dc = torch.empty(0) + self._features_rest = torch.empty(0) + 
self._scaling = torch.empty(0) + self._rotation = torch.empty(0) + self._opacity = torch.empty(0) + self.max_radii2D = torch.empty(0) + self.xyz_gradient_accum = torch.empty(0) + self.denom = torch.empty(0) + self.optimizer = None + self.percent_dense = 0 + self.spatial_lr_scale = 0 + self._deformation_table = torch.empty(0) + self.setup_functions() + + def capture(self): + return ( + self.active_sh_degree, + self._xyz, + self._deformation.state_dict(), + self._deformation_table, + # self.grid, + self._features_dc, + self._features_rest, + self._scaling, + self._rotation, + self._opacity, + self.max_radii2D, + self.xyz_gradient_accum, + self.denom, + self.optimizer.state_dict(), + self.spatial_lr_scale, + ) + + def restore(self, model_args, training_args): + (self.active_sh_degree, + self._xyz, + self._deformation_table, + self._deformation, + # self.grid, + self._features_dc, + self._features_rest, + self._scaling, + self._rotation, + self._opacity, + self.max_radii2D, + xyz_gradient_accum, + denom, + opt_dict, + self.spatial_lr_scale) = model_args + self.training_setup(training_args) + self.xyz_gradient_accum = xyz_gradient_accum + self.denom = denom + self.optimizer.load_state_dict(opt_dict) + + @property + def get_scaling(self): + #return self._scaling + + return self.scaling_activation(self._scaling) + + @property + def get_rotation(self): + #return self._rotation + return self.rotation_activation(self._rotation) + + @property + def get_xyz(self): + return self._xyz + + @property + def get_features(self): + features_dc = self._features_dc + features_rest = self._features_rest + return torch.cat((features_dc, features_rest), dim=1) + + @property + def get_features_dc(self): + features_dc = self._features_dc + return features_dc + + @property + def get_features_rest(self): + features_rest = self._features_rest + return features_rest + + + + @property + def get_opacity(self): + return self.opacity_activation(self._opacity) + + def get_covariance(self, 
scaling_modifier = 1): + return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation) + + def oneupSHdegree(self): + if self.active_sh_degree < self.max_sh_degree: + self.active_sh_degree += 1 + + def load_colmap_ply(self, path, spatial_lr_scale=1, time_line=4): + # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107 + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] + # positions = positions[::2] + print('Loaded points from ply ', positions.shape) + colors = np.zeros_like(positions) + 0.5 + pcd = BasicPointCloud(points=positions, colors=colors, normals=None) + self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line) + + def load_3studio_ply(self, path, spatial_lr_scale=1, time_line=4, step=1, position_scale=1, load_color=True): + # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107 + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['x'], vertices['z'], -vertices['y']]).T # [N, 3] # image dream axis + # positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] # 3studio coord is this + positions = positions[::step] * position_scale + print('Loaded points from ply ', positions.shape) + # positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T # [N, 3] + # positions=np.concatenate([-positions[:,0:1],-positions[:,1:2],-positions[:,2:3]],1)#*train_dataset.scale_factor) + if load_color: + colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0 + else: + colors = np.zeros_like(positions) + 0.5 + + colors = colors[::step] + # normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T + pcd = BasicPointCloud(points=positions, colors=colors, normals=None) + 
self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line) + + + def random_init(self, num_pts, lr=10, radius=1): + phis = np.random.random((num_pts,)) * 2 * np.pi + costheta = np.random.random((num_pts,)) * 2 - 1 + thetas = np.arccos(costheta) + mu = np.random.random((num_pts,)) + radius = radius * np.cbrt(mu) + x = radius * np.sin(thetas) * np.cos(phis) + y = radius * np.sin(thetas) * np.sin(phis) + z = radius * np.cos(thetas) + xyz = np.stack((x, y, z), axis=1) + # xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3 + + shs = np.random.random((num_pts, 3)) / 255.0 + pcd = BasicPointCloud( + points=xyz, colors=sh2rgb(shs), normals=np.zeros((num_pts, 3)) + ) + self.create_from_pcd(pcd, lr, 4) # 4 not used + + + def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float, time_line: int): + self.spatial_lr_scale = spatial_lr_scale + fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda() + fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda()) + features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda() + features[:, :3, 0 ] = fused_color + features[:, 3:, 1:] = 0.0 + + print("Number of points at initialisation : ", fused_point_cloud.shape[0]) + + dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001) + # scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 1) + scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3) + #scales = torch.ones_like(scales ) * 0.03 + rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda") + rots[:, 0] = 1 + + opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda")) + + self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True)) + self._deformation = self._deformation.to("cuda") + # self.grid = self.grid.to("cuda") + self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 
2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True)) + self._scaling = nn.Parameter(scales.requires_grad_(True)) + self._rotation = nn.Parameter(rots.requires_grad_(True)) + self._opacity = nn.Parameter(opacities.requires_grad_(True)) + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0) + def training_setup(self, training_args): + self.percent_dense = training_args.percent_dense + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda") + + + l = [ + {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"}, + {'params': list(self._deformation.get_mlp_parameters()), 'lr': training_args.deformation_lr_init * self.spatial_lr_scale, "name": "deformation"}, + {'params': list(self._deformation.get_grid_parameters()), 'lr': training_args.grid_lr_init * self.spatial_lr_scale, "name": "grid"}, + {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"}, + {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"}, + {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"}, + {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"}, + {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"} + + ] + + self.optimizer = torch.optim.Adam(l, lr=0.0) + # self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15) + self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale, + lr_final=training_args.position_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.position_lr_delay_mult, + 
max_steps=training_args.position_lr_max_steps) + self.deformation_scheduler_args = get_expon_lr_func(lr_init=training_args.deformation_lr_init*self.spatial_lr_scale, + lr_final=training_args.deformation_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.deformation_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + self.grid_scheduler_args = get_expon_lr_func(lr_init=training_args.grid_lr_init*self.spatial_lr_scale, + lr_final=training_args.grid_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.deformation_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + + def update_learning_rate(self, iteration): + ''' Learning rate scheduling per step ''' + for param_group in self.optimizer.param_groups: + if param_group["name"] == "xyz": + lr = self.xyz_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + if "grid" in param_group["name"]: + lr = self.grid_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + elif param_group["name"] == "deformation": + lr = self.deformation_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + + def construct_list_of_attributes(self): + l = ['x', 'y', 'z', 'nx', 'ny', 'nz'] + # All channels except the 3 DC + for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]): + l.append('f_dc_{}'.format(i)) + for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]): + l.append('f_rest_{}'.format(i)) + l.append('opacity') + for i in range(self._scaling.shape[1]): + l.append('scale_{}'.format(i)) + for i in range(self._rotation.shape[1]): + l.append('rot_{}'.format(i)) + return l + # def compute_deformation(self,time): + + # deform = self._deformation[:,:,:time].sum(dim=-1) + # xyz = self._xyz + deform + # return xyz + # def save_ply_dynamic(path): + # for time in range(self._deformation.shape(-1)): + # xyz = self.compute_deformation(time) + def load_model(self, path): + print("loading model from exists{}".format(path)) + weight_dict 
= torch.load(os.path.join(path,"deformation.pth"),map_location="cuda") + self._deformation.load_state_dict(weight_dict) + self._deformation = self._deformation.to("cuda") + self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0) + self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda") + if os.path.exists(os.path.join(path, "deformation_table.pth")): + self._deformation_table = torch.load(os.path.join(path, "deformation_table.pth"),map_location="cuda") + if os.path.exists(os.path.join(path, "deformation_accum.pth")): + self._deformation_accum = torch.load(os.path.join(path, "deformation_accum.pth"),map_location="cuda") + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + # print(self._deformation.deformation_net.grid.) + def save_deformation(self, path): + torch.save(self._deformation.state_dict(),os.path.join(path, "deformation.pth")) + torch.save(self._deformation_table,os.path.join(path, "deformation_table.pth")) + torch.save(self._deformation_accum,os.path.join(path, "deformation_accum.pth")) + def save_ply(self, path): + mkdir_p(os.path.dirname(path)) + + xyz = self._xyz.detach().cpu().numpy() + normals = np.zeros_like(xyz) + f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + opacities = self._opacity.detach().cpu().numpy() + scale = self._scaling.detach().cpu().numpy() + rotation = self._rotation.detach().cpu().numpy() + + dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()] + + elements = np.empty(xyz.shape[0], dtype=dtype_full) + attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1) + elements[:] = list(map(tuple, attributes)) + el = PlyElement.describe(elements, 'vertex') + PlyData([el]).write(path) + + def reset_opacity(self): + opacities_new = 
inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01)) + optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity") + self._opacity = optimizable_tensors["opacity"] + + def load_ply(self, path): + plydata = PlyData.read(path) + + xyz = np.stack((np.asarray(plydata.elements[0]["x"]), + np.asarray(plydata.elements[0]["y"]), + np.asarray(plydata.elements[0]["z"])), axis=1) + opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis] + + features_dc = np.zeros((xyz.shape[0], 3, 1)) + features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"]) + features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"]) + features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"]) + + extra_f_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_")] + extra_f_names = sorted(extra_f_names, key = lambda x: int(x.split('_')[-1])) + assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3 + features_extra = np.zeros((xyz.shape[0], len(extra_f_names))) + for idx, attr_name in enumerate(extra_f_names): + features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name]) + # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC) + features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1)) + + scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")] + scale_names = sorted(scale_names, key = lambda x: int(x.split('_')[-1])) + scales = np.zeros((xyz.shape[0], len(scale_names))) + for idx, attr_name in enumerate(scale_names): + scales[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")] + rot_names = sorted(rot_names, key = lambda x: int(x.split('_')[-1])) + rots = np.zeros((xyz.shape[0], len(rot_names))) + for idx, attr_name in enumerate(rot_names): + rots[:, idx] = 
np.asarray(plydata.elements[0][attr_name]) + + self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True)) + self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True)) + self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True)) + self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True)) + self.active_sh_degree = self.max_sh_degree + + def replace_tensor_to_optimizer(self, tensor, name): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if group["name"] == name: + stored_state = self.optimizer.state.get(group['params'][0], None) + stored_state["exp_avg"] = torch.zeros_like(tensor) + stored_state["exp_avg_sq"] = torch.zeros_like(tensor) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(tensor.requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def _prune_optimizer(self, mask): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"]) > 1: + continue + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + stored_state["exp_avg"] = stored_state["exp_avg"][mask] + stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask] + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True))) + self.optimizer.state[group['params'][0]] = stored_state + + 
optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def prune_points(self, mask): + valid_points_mask = ~mask + optimizable_tensors = self._prune_optimizer(valid_points_mask) + + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + self._deformation_accum = self._deformation_accum[valid_points_mask] + self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask] + self._deformation_table = self._deformation_table[valid_points_mask] + self.denom = self.denom[valid_points_mask] + self.max_radii2D = self.max_radii2D[valid_points_mask] + + def cat_tensors_to_optimizer(self, tensors_dict): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"])>1:continue + assert len(group["params"]) == 1 + extension_tensor = tensors_dict[group["name"]] + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + + stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0) + stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + optimizable_tensors[group["name"]] = 
group["params"][0] + + return optimizable_tensors + + def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table): + d = {"xyz": new_xyz, + "f_dc": new_features_dc, + "f_rest": new_features_rest, + "opacity": new_opacities, + "scaling" : new_scaling, + "rotation" : new_rotation, + # "deformation": new_deformation + } + + optimizable_tensors = self.cat_tensors_to_optimizer(d) + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + # self._deformation = optimizable_tensors["deformation"] + + self._deformation_table = torch.cat([self._deformation_table,new_deformation_table],-1) + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0], 3), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + + def densify_and_split(self, grads, grad_threshold, scene_extent, N=2): + n_init_points = self.get_xyz.shape[0] + # Extract points that satisfy the gradient condition + padded_grad = torch.zeros((n_init_points), device="cuda") + padded_grad[:grads.shape[0]] = grads.squeeze() + print('split', padded_grad.mean(), grad_threshold) + selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent) + if not selected_pts_mask.any(): + return + stds = self.get_scaling[selected_pts_mask].repeat(N,1) + means =torch.zeros((stds.size(0), 3),device="cuda") + samples = torch.normal(mean=means, std=stds) + rots = 
build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1) + new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1) + new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N)) + new_rotation = self._rotation[selected_pts_mask].repeat(N,1) + new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1) + new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1) + new_opacity = self._opacity[selected_pts_mask].repeat(N,1) + new_deformation_table = self._deformation_table[selected_pts_mask].repeat(N) + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation, new_deformation_table) + + prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool))) + self.prune_points(prune_filter) + + def densify_and_clone(self, grads, grad_threshold, scene_extent): + # Extract points that satisfy the gradient condition + print('clone', torch.norm(grads, dim=-1).mean(), grad_threshold) + selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent) + + new_xyz = self._xyz[selected_pts_mask] + # - 0.001 * self._xyz.grad[selected_pts_mask] + new_features_dc = self._features_dc[selected_pts_mask] + new_features_rest = self._features_rest[selected_pts_mask] + new_opacities = self._opacity[selected_pts_mask] + new_scaling = self._scaling[selected_pts_mask] + new_rotation = self._rotation[selected_pts_mask] + new_deformation_table = self._deformation_table[selected_pts_mask] + + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table) + def prune(self, max_grad, min_opacity, extent, max_screen_size): + prune_mask = 
(self.get_opacity < min_opacity).squeeze() + # prune_mask_2 = torch.logical_and(self.get_opacity <= inverse_sigmoid(0.101 , dtype=torch.float, device="cuda"), self.get_opacity >= inverse_sigmoid(0.999 , dtype=torch.float, device="cuda")) + # prune_mask = torch.logical_or(prune_mask, prune_mask_2) + # deformation_sum = abs(self._deformation).sum(dim=-1).mean(dim=-1) + # deformation_mask = (deformation_sum < torch.quantile(deformation_sum, torch.tensor([0.5]).to("cuda"))) + # prune_mask = prune_mask & deformation_mask + if max_screen_size: + big_points_vs = self.max_radii2D > max_screen_size + big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent + prune_mask = torch.logical_or(prune_mask, big_points_vs) + + prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws) + self.prune_points(prune_mask) + + torch.cuda.empty_cache() + def densify(self, max_grad, min_opacity, extent, max_screen_size): + grads = self.xyz_gradient_accum / self.denom + grads[grads.isnan()] = 0.0 + + self.densify_and_clone(grads, max_grad, extent) + self.densify_and_split(grads, max_grad, extent) + def standard_constaint(self): + + means3D = self._xyz.detach() + scales = self._scaling.detach() + rotations = self._rotation.detach() + opacity = self._opacity.detach() + color=self._ + time = torch.tensor(0).to("cuda").repeat(means3D.shape[0],1) + means3D_deform, scales_deform, rotations_deform, _ = self._deformation(means3D, scales, rotations, opacity, time) + position_error = (means3D_deform - means3D)**2 + rotation_error = (rotations_deform - rotations)**2 + scaling_erorr = (scales_deform - scales)**2 + return position_error.mean() + rotation_error.mean() + scaling_erorr.mean() + + + def add_densification_stats(self, viewspace_point_tensor, update_filter): + self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor[update_filter,:2], dim=-1, keepdim=True) + self.denom[update_filter] += 1 + @torch.no_grad() + # def 
update_deformation_table(self,threshold): + # # print("origin deformation point nums:",self._deformation_table.sum()) + # self._deformation_table = torch.gt(self._deformation_accum.max(dim=-1).values/100,threshold) + def print_deformation_weight_grad(self): + for name, weight in self._deformation.named_parameters(): + if weight.requires_grad: + if weight.grad is None: + print(name," :",weight.grad) + else: + if weight.grad.mean() != 0: + print(name," :",weight.grad.mean(), weight.grad.min(), weight.grad.max()) + print("-"*50) + def _plane_regulation(self): + multi_res_grids = self._deformation.deformation_net.grid.grids + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids = [0,1,3] + for grid_id in time_grids: + total += compute_plane_smoothness(grids[grid_id]) + return total + def _time_regulation(self): + multi_res_grids = self._deformation.deformation_net.grid.grids + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids =[2, 4, 5] + for grid_id in time_grids: + total += compute_plane_smoothness(grids[grid_id]) + return total + def _l1_regulation(self): + # model.grids is 6 x [1, rank * F_dim, reso, reso] + multi_res_grids = self._deformation.deformation_net.grid.grids + + total = 0.0 + for grids in multi_res_grids: + if len(grids) == 3: + continue + else: + # These are the spatiotemporal grids + spatiotemporal_grids = [2, 4, 5] + for grid_id in spatiotemporal_grids: + total += torch.abs(1 - grids[grid_id]).mean() + return total + def compute_regulation(self, time_smoothness_weight, l1_time_planes_weight, plane_tv_weight): + return plane_tv_weight * self._plane_regulation() + time_smoothness_weight * self._time_regulation() + l1_time_planes_weight * self._l1_regulation() diff --git a/scene/gaussian_model_nogrid.py b/scene/gaussian_model_nogrid.py new file 
mode 100644 index 0000000..530030c --- /dev/null +++ b/scene/gaussian_model_nogrid.py @@ -0,0 +1,657 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# + +import torch +import numpy as np +from utils.general_utils import inverse_sigmoid, get_expon_lr_func, build_rotation +from torch import nn +import os +from utils.system_utils import mkdir_p +from plyfile import PlyData, PlyElement +from random import randint +from utils.sh_utils import RGB2SH +from simple_knn._C import distCUDA2 +from utils.graphics_utils import BasicPointCloud +from utils.general_utils import strip_symmetric, build_scaling_rotation +from scene.deformation_nogrid import deform_network +from scene.regulation import compute_plane_smoothness + + +def sh2rgb(x): + return x * 0.28209479177387814 + 0.5 + +class GaussianModel_nogrid: + + def setup_functions(self): + def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation): + L = build_scaling_rotation(scaling_modifier * scaling, rotation) + actual_covariance = L @ L.transpose(1, 2) + symm = strip_symmetric(actual_covariance) + return symm + + self.scaling_activation = torch.exp + self.scaling_inverse_activation = torch.log + + self.covariance_activation = build_covariance_from_scaling_rotation + + self.opacity_activation = torch.sigmoid + self.inverse_opacity_activation = inverse_sigmoid + + self.rotation_activation = torch.nn.functional.normalize + + + def __init__(self, sh_degree : int, args): + self.active_sh_degree = 0 + self.max_sh_degree = sh_degree + self._xyz = torch.empty(0) + # self._deformation = torch.empty(0) + self._deformation = deform_network(args) + # self.grid = TriPlaneGrid() + self._features_dc = torch.empty(0) + self._features_rest = torch.empty(0) + self._scaling = 
torch.empty(0) + self._rotation = torch.empty(0) + self._opacity = torch.empty(0) + self.max_radii2D = torch.empty(0) + self.xyz_gradient_accum = torch.empty(0) + self.denom = torch.empty(0) + self.optimizer = None + self.percent_dense = 0 + self.spatial_lr_scale = 0 + self._deformation_table = torch.empty(0) + self.setup_functions() + + def capture(self): + return ( + self.active_sh_degree, + self._xyz, + self._deformation.state_dict(), + self._deformation_table, + # self.grid, + self._features_dc, + self._features_rest, + self._scaling, + self._rotation, + self._opacity, + self.max_radii2D, + self.xyz_gradient_accum, + self.denom, + self.optimizer.state_dict(), + self.spatial_lr_scale, + ) + + def restore(self, model_args, training_args): + (self.active_sh_degree, + self._xyz, + self._deformation_table, + self._deformation, + # self.grid, + self._features_dc, + self._features_rest, + self._scaling, + self._rotation, + self._opacity, + self.max_radii2D, + xyz_gradient_accum, + denom, + opt_dict, + self.spatial_lr_scale) = model_args + self.training_setup(training_args) + self.xyz_gradient_accum = xyz_gradient_accum + self.denom = denom + self.optimizer.load_state_dict(opt_dict) + + @property + def get_scaling(self): + #return self._scaling + + return self.scaling_activation(self._scaling) + + @property + def get_rotation(self): + #return self._rotation + return self.rotation_activation(self._rotation) + + @property + def get_xyz(self): + return self._xyz + + @property + def get_features(self): + features_dc = self._features_dc + features_rest = self._features_rest + return torch.cat((features_dc, features_rest), dim=1) + + @property + def get_features_dc(self): + features_dc = self._features_dc + return features_dc + + @property + def get_features_rest(self): + features_rest = self._features_rest + return features_rest + + + + @property + def get_opacity(self): + return self.opacity_activation(self._opacity) + + def get_covariance(self, scaling_modifier = 1): + 
return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation) + + def oneupSHdegree(self): + if self.active_sh_degree < self.max_sh_degree: + self.active_sh_degree += 1 + + def load_colmap_ply(self, path, spatial_lr_scale=1, time_line=4): + # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107 + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] + # positions = positions[::2] + print('Loaded points from ply ', positions.shape) + colors = np.zeros_like(positions) + 0.5 + pcd = BasicPointCloud(points=positions, colors=colors, normals=None) + self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line) + + def load_3studio_ply(self, path, spatial_lr_scale=1, time_line=4, pts_num=1, position_scale=1, load_color=True): + # https://github.com/graphdeco-inria/gaussian-splatting/blob/f11001b46c5c73a0a7d553353c898efd68412abe/scene/dataset_readers.py#L107 + plydata = PlyData.read(path) + vertices = plydata['vertex'] + positions = np.vstack([vertices['x'], vertices['z'], -vertices['y']]).T # [N, 3] # image dream axis + # positions = np.vstack([vertices['y'], vertices['z'], vertices['x']]).T # [N, 3] # 3studio coord is this + tot_num = positions.shape[0] + new_idx = np.random.permutation(tot_num)[:pts_num] + # positions = positions[::step] * position_scale + positions = positions[new_idx] * position_scale + print('Loaded points from ply ', positions.shape) + # positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T # [N, 3] + # positions=np.concatenate([-positions[:,0:1],-positions[:,1:2],-positions[:,2:3]],1)#*train_dataset.scale_factor) + if load_color: + colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0 + else: + colors = np.zeros_like(positions) + 0.5 + + colors = colors[new_idx] + # colors = colors[::step] + # 
normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T + pcd = BasicPointCloud(points=positions, colors=colors, normals=None) + self.create_from_pcd(pcd, spatial_lr_scale=spatial_lr_scale, time_line=time_line) + + + def random_init(self, num_pts, lr=10, radius=1): + phis = np.random.random((num_pts,)) * 2 * np.pi + costheta = np.random.random((num_pts,)) * 2 - 1 + thetas = np.arccos(costheta) + mu = np.random.random((num_pts,)) + radius = radius * np.cbrt(mu) + x = radius * np.sin(thetas) * np.cos(phis) + y = radius * np.sin(thetas) * np.sin(phis) + z = radius * np.cos(thetas) + xyz = np.stack((x, y, z), axis=1) + # xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3 + + shs = np.random.random((num_pts, 3)) / 255.0 + pcd = BasicPointCloud( + points=xyz, colors=sh2rgb(shs), normals=np.zeros((num_pts, 3)) + ) + self.create_from_pcd(pcd, lr, 4) # 4 not used + + + def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float, time_line: int): + self.spatial_lr_scale = spatial_lr_scale + fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda() + fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda()) + features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda() + features[:, :3, 0 ] = fused_color + features[:, 3:, 1:] = 0.0 + + print("Number of points at initialisation : ", fused_point_cloud.shape[0]) + + dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001) + # scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 1) + scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3) + #scales = torch.ones_like(scales ) * 0.03 + rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda") + rots[:, 0] = 1 + + opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda")) + + self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True)) + self._deformation = 
self._deformation.to("cuda") + # self.grid = self.grid.to("cuda") + self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True)) + self._scaling = nn.Parameter(scales.requires_grad_(True)) + self._rotation = nn.Parameter(rots.requires_grad_(True)) + self._opacity = nn.Parameter(opacities.requires_grad_(True)) + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + self._deformation_table = torch.gt(torch.ones((self.get_xyz.shape[0]),device="cuda"),0) + def training_setup(self, training_args): + self.percent_dense = training_args.percent_dense + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0],3),device="cuda") + + + l = [ + {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"}, + {'params': list(self._deformation.get_mlp_parameters()), 'lr': training_args.deformation_lr_init * self.spatial_lr_scale, "name": "deformation"}, + # {'params': list(self._deformation.get_grid_parameters()), 'lr': training_args.grid_lr_init * self.spatial_lr_scale, "name": "grid"}, + {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"}, + {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"}, + {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"}, + {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"}, + {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"} + + ] + + self.optimizer = torch.optim.Adam(l, lr=0.0) + # self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15) + self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale, + 
lr_final=training_args.position_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.position_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + self.deformation_scheduler_args = get_expon_lr_func(lr_init=training_args.deformation_lr_init*self.spatial_lr_scale, + lr_final=training_args.deformation_lr_final*self.spatial_lr_scale, + lr_delay_mult=training_args.deformation_lr_delay_mult, + max_steps=training_args.position_lr_max_steps) + # self.grid_scheduler_args = get_expon_lr_func(lr_init=training_args.grid_lr_init*self.spatial_lr_scale, + # lr_final=training_args.grid_lr_final*self.spatial_lr_scale, + # lr_delay_mult=training_args.deformation_lr_delay_mult, + # max_steps=training_args.position_lr_max_steps) + self.gs_neighbor_dist, self.gs_neighbor = self.get_k_nearest_neighbor() + + def update_learning_rate(self, iteration): + ''' Learning rate scheduling per step ''' + for param_group in self.optimizer.param_groups: + if param_group["name"] == "xyz": + lr = self.xyz_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + # if "grid" in param_group["name"]: + # lr = self.grid_scheduler_args(iteration) + # param_group['lr'] = lr + # return lr + elif param_group["name"] == "deformation": + lr = self.deformation_scheduler_args(iteration) + param_group['lr'] = lr + # return lr + + def construct_list_of_attributes(self): + l = ['x', 'y', 'z', 'nx', 'ny', 'nz'] + # All channels except the 3 DC + for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]): + l.append('f_dc_{}'.format(i)) + for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]): + l.append('f_rest_{}'.format(i)) + l.append('opacity') + for i in range(self._scaling.shape[1]): + l.append('scale_{}'.format(i)) + for i in range(self._rotation.shape[1]): + l.append('rot_{}'.format(i)) + return l + # def compute_deformation(self,time): + + # deform = self._deformation[:,:,:time].sum(dim=-1) + # xyz = self._xyz + deform + # return xyz + # def 
# Remainder of a commented-out draft (save_ply_dynamic) that begins at the end of the previous line:
#     for time in range(self._deformation.shape(-1)):
#         xyz = self.compute_deformation(time)
def load_model(self, path):
    """Restore the deformation module and its per-point buffers from ``path``.

    ``deformation.pth`` must exist inside ``path``; its contents are passed
    to ``self._deformation.load_state_dict``.  ``deformation_table.pth`` and
    ``deformation_accum.pth`` are optional: when a file is absent the
    corresponding buffer is rebuilt with one row per point currently
    reported by ``self.get_xyz``.  All tensors are created on / mapped to
    the "cuda" device.

    Args:
        path: Directory that holds the saved ``*.pth`` files.
    """
    print("loading model from exists{}".format(path))

    # Network weights first, then make sure the module itself lives on the GPU.
    state = torch.load(os.path.join(path, "deformation.pth"), map_location="cuda")
    self._deformation.load_state_dict(state)
    self._deformation = self._deformation.to("cuda")

    num_points = self.get_xyz.shape[0]
    table_file = os.path.join(path, "deformation_table.pth")
    accum_file = os.path.join(path, "deformation_accum.pth")

    # Per-point boolean table: use the saved one if available, otherwise all True.
    if os.path.exists(table_file):
        self._deformation_table = torch.load(table_file, map_location="cuda")
    else:
        self._deformation_table = torch.gt(torch.ones(num_points, device="cuda"), 0)

    # Per-point accumulator: use the saved one if available, otherwise zeros of width 3.
    if os.path.exists(accum_file):
        self._deformation_accum = torch.load(accum_file, map_location="cuda")
    else:
        self._deformation_accum = torch.zeros((num_points, 3), device="cuda")

    # The 2D radii buffer is always reset, never read from disk.
    self.max_radii2D = torch.zeros(num_points, device="cuda")
+ def save_deformation(self, path): + mkdir_p((path)) + # mkdir_p(os.path.dirname(path)) + torch.save(self._deformation.state_dict(),os.path.join(path, "deformation.pth")) + torch.save(self._deformation_table,os.path.join(path, "deformation_table.pth")) + torch.save(self._deformation_accum,os.path.join(path, "deformation_accum.pth")) + def save_ply(self, path): + mkdir_p(os.path.dirname(path)) + + xyz = self._xyz.detach().cpu().numpy() + normals = np.zeros_like(xyz) + f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() + opacities = self._opacity.detach().cpu().numpy() + scale = self._scaling.detach().cpu().numpy() + rotation = self._rotation.detach().cpu().numpy() + + dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()] + + elements = np.empty(xyz.shape[0], dtype=dtype_full) + attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1) + elements[:] = list(map(tuple, attributes)) + el = PlyElement.describe(elements, 'vertex') + PlyData([el]).write(path) + + def reset_opacity(self): + opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01)) + optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity") + self._opacity = optimizable_tensors["opacity"] + + def load_ply(self, path): + plydata = PlyData.read(path) + + xyz = np.stack((np.asarray(plydata.elements[0]["x"]), + np.asarray(plydata.elements[0]["y"]), + np.asarray(plydata.elements[0]["z"])), axis=1) + opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis] + + features_dc = np.zeros((xyz.shape[0], 3, 1)) + features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"]) + features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"]) + features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"]) + + extra_f_names 
= [p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_")] + extra_f_names = sorted(extra_f_names, key = lambda x: int(x.split('_')[-1])) + assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3 + features_extra = np.zeros((xyz.shape[0], len(extra_f_names))) + for idx, attr_name in enumerate(extra_f_names): + features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name]) + # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC) + features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1)) + + scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")] + scale_names = sorted(scale_names, key = lambda x: int(x.split('_')[-1])) + scales = np.zeros((xyz.shape[0], len(scale_names))) + for idx, attr_name in enumerate(scale_names): + scales[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")] + rot_names = sorted(rot_names, key = lambda x: int(x.split('_')[-1])) + rots = np.zeros((xyz.shape[0], len(rot_names))) + for idx, attr_name in enumerate(rot_names): + rots[:, idx] = np.asarray(plydata.elements[0][attr_name]) + + self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True)) + self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) + self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True)) + self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True)) + self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True)) + self.active_sh_degree = 
self.max_sh_degree + + def replace_tensor_to_optimizer(self, tensor, name): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if group["name"] == name: + stored_state = self.optimizer.state.get(group['params'][0], None) + stored_state["exp_avg"] = torch.zeros_like(tensor) + stored_state["exp_avg_sq"] = torch.zeros_like(tensor) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(tensor.requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def _prune_optimizer(self, mask): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"]) > 1: + continue + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + stored_state["exp_avg"] = stored_state["exp_avg"][mask] + stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask] + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True))) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + return optimizable_tensors + + def prune_points(self, mask): + valid_points_mask = ~mask + optimizable_tensors = self._prune_optimizer(valid_points_mask) + + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + self._deformation_accum = self._deformation_accum[valid_points_mask] + self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask] + 
self._deformation_table = self._deformation_table[valid_points_mask] + self.denom = self.denom[valid_points_mask] + self.max_radii2D = self.max_radii2D[valid_points_mask] + + def cat_tensors_to_optimizer(self, tensors_dict): + optimizable_tensors = {} + for group in self.optimizer.param_groups: + if len(group["params"])>1:continue + assert len(group["params"]) == 1 + extension_tensor = tensors_dict[group["name"]] + stored_state = self.optimizer.state.get(group['params'][0], None) + if stored_state is not None: + + stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0) + stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0) + + del self.optimizer.state[group['params'][0]] + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + self.optimizer.state[group['params'][0]] = stored_state + + optimizable_tensors[group["name"]] = group["params"][0] + else: + group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True)) + optimizable_tensors[group["name"]] = group["params"][0] + + return optimizable_tensors + + def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table): + d = {"xyz": new_xyz, + "f_dc": new_features_dc, + "f_rest": new_features_rest, + "opacity": new_opacities, + "scaling" : new_scaling, + "rotation" : new_rotation, + # "deformation": new_deformation + } + + optimizable_tensors = self.cat_tensors_to_optimizer(d) + self._xyz = optimizable_tensors["xyz"] + self._features_dc = optimizable_tensors["f_dc"] + self._features_rest = optimizable_tensors["f_rest"] + self._opacity = optimizable_tensors["opacity"] + self._scaling = optimizable_tensors["scaling"] + self._rotation = optimizable_tensors["rotation"] + # self._deformation = optimizable_tensors["deformation"] + 
+ self._deformation_table = torch.cat([self._deformation_table,new_deformation_table],-1) + self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self._deformation_accum = torch.zeros((self.get_xyz.shape[0], 3), device="cuda") + self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") + self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") + + def densify_and_split(self, grads, grad_threshold, scene_extent, N=2): + n_init_points = self.get_xyz.shape[0] + # Extract points that satisfy the gradient condition + padded_grad = torch.zeros((n_init_points), device="cuda") + padded_grad[:grads.shape[0]] = grads.squeeze() + print('split', padded_grad.mean(), grad_threshold) + selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent) + if not selected_pts_mask.any(): + return + stds = self.get_scaling[selected_pts_mask].repeat(N,1) + means =torch.zeros((stds.size(0), 3),device="cuda") + samples = torch.normal(mean=means, std=stds) + rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1) + new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1) + new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N)) + new_rotation = self._rotation[selected_pts_mask].repeat(N,1) + new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1) + new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1) + new_opacity = self._opacity[selected_pts_mask].repeat(N,1) + new_deformation_table = self._deformation_table[selected_pts_mask].repeat(N) + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation, new_deformation_table) + + prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * 
selected_pts_mask.sum(), device="cuda", dtype=bool))) + self.prune_points(prune_filter) + + def densify_and_clone(self, grads, grad_threshold, scene_extent): + # Extract points that satisfy the gradient condition + print('clone', torch.norm(grads, dim=-1).mean(), grad_threshold) + selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False) + selected_pts_mask = torch.logical_and(selected_pts_mask, + torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent) + + new_xyz = self._xyz[selected_pts_mask] + # - 0.001 * self._xyz.grad[selected_pts_mask] + new_features_dc = self._features_dc[selected_pts_mask] + new_features_rest = self._features_rest[selected_pts_mask] + new_opacities = self._opacity[selected_pts_mask] + new_scaling = self._scaling[selected_pts_mask] + new_rotation = self._rotation[selected_pts_mask] + new_deformation_table = self._deformation_table[selected_pts_mask] + + self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_deformation_table) + def prune(self, max_grad, min_opacity, extent, max_screen_size): + prune_mask = (self.get_opacity < min_opacity).squeeze() + # prune_mask_2 = torch.logical_and(self.get_opacity <= inverse_sigmoid(0.101 , dtype=torch.float, device="cuda"), self.get_opacity >= inverse_sigmoid(0.999 , dtype=torch.float, device="cuda")) + # prune_mask = torch.logical_or(prune_mask, prune_mask_2) + # deformation_sum = abs(self._deformation).sum(dim=-1).mean(dim=-1) + # deformation_mask = (deformation_sum < torch.quantile(deformation_sum, torch.tensor([0.5]).to("cuda"))) + # prune_mask = prune_mask & deformation_mask + if max_screen_size: + big_points_vs = self.max_radii2D > max_screen_size + big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent + prune_mask = torch.logical_or(prune_mask, big_points_vs) + + prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws) + 
self.prune_points(prune_mask) + + torch.cuda.empty_cache() + def densify(self, max_grad, min_opacity, extent, max_screen_size): + grads = self.xyz_gradient_accum / self.denom + grads[grads.isnan()] = 0.0 + + self.densify_and_clone(grads, max_grad, extent) + self.densify_and_split(grads, max_grad, extent) + def standard_constaint(self): + + means3D = self._xyz.detach() + scales = self._scaling.detach() + rotations = self._rotation.detach() + opacity = self._opacity.detach() + color=self._ + time = torch.tensor(0).to("cuda").repeat(means3D.shape[0],1) + means3D_deform, scales_deform, rotations_deform, _ = self._deformation(means3D, scales, rotations, opacity, time) + position_error = (means3D_deform - means3D)**2 + rotation_error = (rotations_deform - rotations)**2 + scaling_erorr = (scales_deform - scales)**2 + return position_error.mean() + rotation_error.mean() + scaling_erorr.mean() + + + def add_densification_stats(self, viewspace_point_tensor, update_filter): + self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor[update_filter,:2], dim=-1, keepdim=True) + self.denom[update_filter] += 1 + @torch.no_grad() + # def update_deformation_table(self,threshold): + # # print("origin deformation point nums:",self._deformation_table.sum()) + # self._deformation_table = torch.gt(self._deformation_accum.max(dim=-1).values/100,threshold) + def print_deformation_weight_grad(self): + for name, weight in self._deformation.named_parameters(): + if weight.requires_grad: + if weight.grad is None: + print(name," :",weight.grad) + else: + if weight.grad.mean() != 0: + print(name," :",weight.grad.mean(), weight.grad.min(), weight.grad.max()) + print("-"*50) + def _plane_regulation(self): + multi_res_grids = self._deformation.deformation_net.grid.grids + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids = [0,1,3] + for grid_id in time_grids: + total += 
compute_plane_smoothness(grids[grid_id]) + return total + def _time_regulation(self): + multi_res_grids = self._deformation.deformation_net.grid.grids + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids =[2, 4, 5] + for grid_id in time_grids: + total += compute_plane_smoothness(grids[grid_id]) + return total + def _l1_regulation(self): + # model.grids is 6 x [1, rank * F_dim, reso, reso] + multi_res_grids = self._deformation.deformation_net.grid.grids + + total = 0.0 + for grids in multi_res_grids: + if len(grids) == 3: + continue + else: + # These are the spatiotemporal grids + spatiotemporal_grids = [2, 4, 5] + for grid_id in spatiotemporal_grids: + total += torch.abs(1 - grids[grid_id]).mean() + return total + def compute_regulation(self, time_smoothness_weight, l1_time_planes_weight, plane_tv_weight): + return plane_tv_weight * self._plane_regulation() + time_smoothness_weight * self._time_regulation() + l1_time_planes_weight * self._l1_regulation() + + def get_k_nearest_neighbor(self, k=40): + bs = 1000 + index_list = [] + dist_list = [] + for i in range(0, self._xyz.shape[0], bs): + x = self._xyz[i:i+bs] + dists = torch.cdist(x, self._xyz) + topdist, topk = torch.topk(dists, k + 1, dim=1, largest=False) + topk = topk[:, 1:] + topdist = topdist[:, 1:] + + index_list.append(topk) + dist_list.append(topdist) + + index_list = torch.cat(index_list) + dist_list = torch.cat(dist_list) + print('neighbors', dist_list.shape, index_list.shape) + return dist_list, index_list + + def get_nn_loss(self, dx): + # idx = torch.randperm(40) + neighbors = dx[self.gs_neighbor] + # print('neighbors dx', neighbors.shape) + return (dx.reshape(-1, 1, 3) - neighbors).pow(2).sum(-1).sum(-1).mean() diff --git a/scene/hexplane.py b/scene/hexplane.py new file mode 100644 index 0000000..668732b --- /dev/null +++ b/scene/hexplane.py @@ -0,0 +1,221 @@ +import itertools +import logging as 
log +from typing import Optional, Union, List, Dict, Sequence, Iterable, Collection, Callable + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def get_normalized_directions(directions): + """SH encoding must be in the range [0, 1] + + Args: + directions: batch of directions + """ + return (directions + 1.0) / 2.0 + + +def normalize_aabb(pts, aabb): + return (pts - aabb[0]) * (2.0 / (aabb[1] - aabb[0])) - 1.0 +def grid_sample_wrapper(grid: torch.Tensor, coords: torch.Tensor, align_corners: bool = True) -> torch.Tensor: + grid_dim = coords.shape[-1] + + if grid.dim() == grid_dim + 1: + # no batch dimension present, need to add it + grid = grid.unsqueeze(0) + if coords.dim() == 2: + coords = coords.unsqueeze(0) + + if grid_dim == 2 or grid_dim == 3: + grid_sampler = F.grid_sample + else: + raise NotImplementedError(f"Grid-sample was called with {grid_dim}D data but is only " + f"implemented for 2 and 3D data.") + + coords = coords.view([coords.shape[0]] + [1] * (grid_dim - 1) + list(coords.shape[1:])) + B, feature_dim = grid.shape[:2] + n = coords.shape[-2] + interp = grid_sampler( + grid, # [B, feature_dim, reso, ...] + coords, # [B, 1, ..., n, grid_dim] + align_corners=align_corners, + mode='bilinear', padding_mode='border') + interp = interp.view(B, feature_dim, n).transpose(-1, -2) # [B, n, feature_dim] + interp = interp.squeeze() # [B?, n, feature_dim?] 
+ return interp + +def init_grid_param( + grid_nd: int, + in_dim: int, + out_dim: int, + reso: Sequence[int], + a: float = 0.1, + b: float = 0.5): + assert in_dim == len(reso), "Resolution must have same number of elements as input-dimension" + has_time_planes = in_dim == 4 + assert grid_nd <= in_dim + coo_combs = list(itertools.combinations(range(in_dim), grid_nd)) + grid_coefs = nn.ParameterList() + for ci, coo_comb in enumerate(coo_combs): + new_grid_coef = nn.Parameter(torch.empty( + [1, out_dim] + [reso[cc] for cc in coo_comb[::-1]] + )) + if has_time_planes and 3 in coo_comb: # Initialize time planes to 1 + # print('time planes', new_grid_coef.shape) + # nn.init.normal_(new_grid_coef) + nn.init.ones_(new_grid_coef) + else: + nn.init.uniform_(new_grid_coef, a=a, b=b) + grid_coefs.append(new_grid_coef) + + return grid_coefs + + +def interpolate_ms_features(pts: torch.Tensor, + ms_grids: Collection[Iterable[nn.Module]], + grid_dimensions: int, + concat_features: bool, + num_levels: Optional[int], + grid_merge: str, + ) -> torch.Tensor: + coo_combs = list(itertools.combinations( + range(pts.shape[-1]), grid_dimensions) + ) + if num_levels is None: + num_levels = len(ms_grids) + multi_scale_interp = [] if concat_features else 0. + grid: nn.ParameterList + for scale_id, grid in enumerate(ms_grids[:num_levels]): + if grid_merge == 'cat': + interp_space = [] + else: + interp_space = 1. 
+ for ci, coo_comb in enumerate(coo_combs): + # interpolate in plane + feature_dim = grid[ci].shape[1] # shape of grid[ci]: 1, out_dim, *reso + interp_out_plane = ( + grid_sample_wrapper(grid[ci], pts[..., coo_comb]) + .view(-1, feature_dim) + ) + # print(ci, coo_comb, interp_out_plane) + # compute product over planes + if grid_merge == 'plus': + interp_space = interp_space + interp_out_plane + elif grid_merge == 'mul': + interp_space = interp_space * interp_out_plane + elif grid_merge == 'cat': + interp_space.append(interp_out_plane) + else: + raise NotImplementedError + + # combine over scales + # print('length per scale', len(interp_space)) # is 6 + if concat_features: + if grid_merge == 'cat': + for cur in interp_space: + multi_scale_interp.append(cur) + else: + multi_scale_interp.append(interp_space) + else: + raise NotImplementedError + multi_scale_interp = multi_scale_interp + interp_space + + if concat_features: + multi_scale_interp = torch.cat(multi_scale_interp, dim=-1) + return multi_scale_interp + + +class HexPlaneField(nn.Module): + def __init__( + self, + bounds, + planeconfig, + multires, + grid_merge, + ) -> None: + super().__init__() + aabb = torch.tensor([[bounds,bounds,bounds], + [-bounds,-bounds,-bounds]]) + self.aabb = nn.Parameter(aabb, requires_grad=False) + self.grid_config = [planeconfig] + self.multiscale_res_multipliers = multires + self.concat_features = True + self.grid_merge = grid_merge + + # 1. 
Init planes + self.grids = nn.ModuleList() + self.feat_dim = 0 + for res in self.multiscale_res_multipliers: + # initialize coordinate grid + config = self.grid_config[0].copy() + # Resolution fix: multi-res only on spatial planes + config["resolution"] = [ + r * res for r in config["resolution"][:3] + ] + config["resolution"][3:] + gp = init_grid_param( + grid_nd=config["grid_dimensions"], + in_dim=config["input_coordinate_dim"], + out_dim=config["output_coordinate_dim"], + reso=config["resolution"], + ) + # shape[1] is out-dim - Concatenate over feature len for each scale + if self.concat_features: + self.feat_dim += gp[-1].shape[1] + else: + self.feat_dim = gp[-1].shape[1] + self.grids.append(gp) + # print(f"Initialized model grids: {self.grids}") + # print("feature_dim:",self.feat_dim) + + + def set_aabb(self,xyz_max, xyz_min): + aabb = torch.tensor([ + xyz_max, + xyz_min + ]) + self.aabb = nn.Parameter(aabb,requires_grad=True) + print("Voxel Plane: set aabb=",self.aabb) + + def get_density(self, pts: torch.Tensor, timestamps: Optional[torch.Tensor] = None): + """Computes and returns the densities.""" + + pts = normalize_aabb(pts, self.aabb) + pts = torch.cat((pts, timestamps), dim=-1) # [n_rays, n_samples, 4] + # print('pts', pts, self.concat_features) + + pts = pts.reshape(-1, pts.shape[-1]) + features = interpolate_ms_features( + pts, ms_grids=self.grids, # noqa + grid_dimensions=self.grid_config[0]["grid_dimensions"], + concat_features=self.concat_features, num_levels=None, grid_merge=self.grid_merge) + # print('hexplane features', features.shape) + if len(features) < 1: + raise NotImplementedError + features = torch.zeros((0, 1)).to(features.device) + + return features + + def forward(self, + pts: torch.Tensor, + timestamps: Optional[torch.Tensor] = None): + + features = self.get_density(pts, timestamps) + + return features + +if __name__ == '__main__': + kplanes_config = { + 'grid_dimensions': 2, + 'input_coordinate_dim': 4, + 'output_coordinate_dim': 32, 
+ 'resolution': [64, 64, 64, 16] + # 'resolution': [64, 64, 64, 150] + } + grid = HexPlaneField(2.5, kplanes_config, [1, 2, 4, 8 ]) + pts = torch.randn(1, 3) + for idx in range(16): + feat = grid(pts, torch.tensor([idx / 16]).unsqueeze(0).repeat(pts.shape[0],1)) + print(feat.std(), feat.mean()) + # print(feat.shape, feat[:,:5]) + # print() diff --git a/scene/hyper_loader.py b/scene/hyper_loader.py new file mode 100644 index 0000000..dd12027 --- /dev/null +++ b/scene/hyper_loader.py @@ -0,0 +1,188 @@ +import warnings + +warnings.filterwarnings("ignore") + +import json +import os +import random + +import numpy as np +import torch +from PIL import Image +import math +from tqdm import tqdm +from scene.utils import Camera +from typing import NamedTuple +from torch.utils.data import Dataset +from utils.general_utils import PILtoTorch +# from scene.dataset_readers import +from utils.graphics_utils import getWorld2View2, focal2fov, fov2focal +import copy +class CameraInfo(NamedTuple): + uid: int + R: np.array + T: np.array + FovY: np.array + FovX: np.array + image: np.array + image_path: str + image_name: str + width: int + height: int + time : float + + +class Load_hyper_data(Dataset): + def __init__(self, + datadir, + ratio=1.0, + use_bg_points=False, + split="train" + ): + + from .utils import Camera + datadir = os.path.expanduser(datadir) + with open(f'{datadir}/scene.json', 'r') as f: + scene_json = json.load(f) + with open(f'{datadir}/metadata.json', 'r') as f: + meta_json = json.load(f) + with open(f'{datadir}/dataset.json', 'r') as f: + dataset_json = json.load(f) + + self.near = scene_json['near'] + self.far = scene_json['far'] + self.coord_scale = scene_json['scale'] + self.scene_center = scene_json['center'] + + self.all_img = dataset_json['ids'] + self.val_id = dataset_json['val_ids'] + self.split = split + if len(self.val_id) == 0: + self.i_train = np.array([i for i in np.arange(len(self.all_img)) if + (i%4 == 0)]) + self.i_test = self.i_train+2 + self.i_test 
= self.i_test[:-1,] + else: + self.train_id = dataset_json['train_ids'] + self.i_test = [] + self.i_train = [] + for i in range(len(self.all_img)): + id = self.all_img[i] + if id in self.val_id: + self.i_test.append(i) + if id in self.train_id: + self.i_train.append(i) + + + self.all_cam = [meta_json[i]['camera_id'] for i in self.all_img] + self.all_time = [meta_json[i]['warp_id'] for i in self.all_img] + max_time = max(self.all_time) + self.all_time = [meta_json[i]['warp_id']/max_time for i in self.all_img] + self.selected_time = set(self.all_time) + self.ratio = ratio + self.max_time = max(self.all_time) + self.min_time = min(self.all_time) + self.i_video = [i for i in range(len(self.all_img))] + self.i_video.sort() + # all poses + self.all_cam_params = [] + for im in self.all_img: + camera = Camera.from_json(f'{datadir}/camera/{im}.json') + camera = camera.scale(ratio) + camera.position -= self.scene_center + camera.position *= self.coord_scale + self.all_cam_params.append(camera) + + self.all_img = [f'{datadir}/rgb/{int(1/ratio)}x/{i}.png' for i in self.all_img] + self.h, self.w = self.all_cam_params[0].image_shape + self.map = {} + self.image_one = Image.open(self.all_img[0]) + self.image_one_torch = PILtoTorch(self.image_one,None).to(torch.float32) + + def __getitem__(self, index): + if self.split == "train": + return self.load_raw(self.i_train[index]) + + elif self.split == "test": + return self.load_raw(self.i_test[index]) + elif self.split == "video": + return self.load_video(self.i_video[index]) + def __len__(self): + if self.split == "train": + return len(self.i_train) + elif self.split == "test": + return len(self.i_test) + elif self.split == "video": + # return len(self.i_video) + return len(self.video_v2) + def load_video(self, idx): + if idx in self.map.keys(): + return self.map[idx] + camera = self.all_cam_params[idx] + w = self.image_one.size[0] + h = self.image_one.size[1] + # image = PILtoTorch(image,None) + # image = image.to(torch.float32) + 
time = self.all_time[idx] + R = camera.orientation.T + T = - camera.position @ R + FovY = focal2fov(camera.focal_length, self.h) + FovX = focal2fov(camera.focal_length, self.w) + image_path = "/".join(self.all_img[idx].split("/")[:-1]) + image_name = self.all_img[idx].split("/")[-1] + caminfo = CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=self.image_one_torch, + image_path=image_path, image_name=image_name, width=w, height=h, time=time, + ) + self.map[idx] = caminfo + return caminfo + def load_raw(self, idx): + if idx in self.map.keys(): + return self.map[idx] + camera = self.all_cam_params[idx] + image = Image.open(self.all_img[idx]) + w = image.size[0] + h = image.size[1] + image = PILtoTorch(image,None) + image = image.to(torch.float32) + time = self.all_time[idx] + R = camera.orientation.T + T = - camera.position @ R + FovY = focal2fov(camera.focal_length, self.h) + FovX = focal2fov(camera.focal_length, self.w) + image_path = "/".join(self.all_img[idx].split("/")[:-1]) + image_name = self.all_img[idx].split("/")[-1] + caminfo = CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image, + image_path=image_path, image_name=image_name, width=w, height=h, time=time, + ) + self.map[idx] = caminfo + return caminfo + + +def format_hyper_data(data_class, split): + if split == "train": + data_idx = data_class.i_train + elif split == "test": + data_idx = data_class.i_test + # dataset = data_class.copy() + # dataset.mode = split + cam_infos = [] + for uid, index in tqdm(enumerate(data_idx)): + camera = data_class.all_cam_params[index] + # image = Image.open(data_class.all_img[index]) + # image = PILtoTorch(image,None) + time = data_class.all_time[index] + R = camera.orientation.T + T = - camera.position @ R + FovY = focal2fov(camera.focal_length, data_class.h) + FovX = focal2fov(camera.focal_length, data_class.w) + image_path = "/".join(data_class.all_img[index].split("/")[:-1]) + image_name = data_class.all_img[index].split("/")[-1] + cam_info = 
CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=None, + image_path=image_path, image_name=image_name, width=int(data_class.w), height=int(data_class.h), time=time, + ) + cam_infos.append(cam_info) + return cam_infos + # matrix = np.linalg.inv(np.array(poses)) + # R = -np.transpose(matrix[:3,:3]) + # R[:,0] = -R[:,0] + # T = -matrix[:3, 3] \ No newline at end of file diff --git a/scene/i2v_dataset.py b/scene/i2v_dataset.py new file mode 100644 index 0000000..10ff3ba --- /dev/null +++ b/scene/i2v_dataset.py @@ -0,0 +1,589 @@ +from torch.utils.data import Dataset +# from scene.cameras import Camera +import numpy as np +from utils.general_utils import PILtoTorch +from utils.graphics_utils import fov2focal, focal2fov +import torch +from utils.camera_utils import loadCam +from utils.graphics_utils import focal2fov + +from torchvision.transforms import ToTensor +from PIL import Image +import glob +from scene.cam_utils import orbit_camera +import math, os + +def getProjectionMatrix(znear, zfar, fovX, fovY): + tanHalfFovY = math.tan((fovY / 2)) + tanHalfFovX = math.tan((fovX / 2)) + + P = torch.zeros(4, 4) + + z_sign = 1.0 + + P[0, 0] = 1 / tanHalfFovX + P[1, 1] = 1 / tanHalfFovY + P[3, 2] = z_sign + P[2, 2] = z_sign * zfar / (zfar - znear) + P[2, 3] = -(zfar * znear) / (zfar - znear) + return P + + +class MiniCam: + def __init__(self, c2w, width, height, fovy, fovx, znear, zfar): + # c2w (pose) should be in NeRF convention. + + self.image_width = width + self.image_height = height + self.FoVy = fovy + self.FoVx = fovx + self.znear = znear + self.zfar = zfar + + w2c = np.linalg.inv(c2w) + + # rectify... 
+ w2c[1:3, :3] *= -1 + w2c[:3, 3] *= -1 + + self.world_view_transform = torch.tensor(w2c).transpose(0, 1)#.cuda() + self.projection_matrix = ( + getProjectionMatrix( + znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy + ) + .transpose(0, 1) + # .cuda() + ) + self.full_proj_transform = self.world_view_transform @ self.projection_matrix + self.camera_center = -torch.tensor(c2w[:3, 3])#.cuda() + + +class FourDGSdataset(Dataset): + def __init__( + self, + split, + frame_num = 16, + name='panda', + rife=False, + static=False, + ): + self.split = split + # self.args = args + + # https://github.com/threestudio-project/threestudio/blob/main/configs/magic123-coarse-sd.yaml#L22 + self.radius = 2.5 + self.W = 512 + self.H = 512 + self.fovy = np.deg2rad(40) + self.fovx = np.deg2rad(40) + # self.fovy = np.deg2rad(49.1) + # self.fovx = np.deg2rad(49.1) + # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61 + self.near = 0.01 + self.far = 100 + self.T = ToTensor() + self.len_pose0 = frame_num + self.name=name + self.rife=rife + self.static=static + + pose0_dir=f'data/{self.name}_pose0/' + # pose0_dir=f'data/{self.name}_rgba_pose0/' + + frame_list = range(frame_num) + pose0_im_names = [pose0_dir + f'{x}.png' for x in frame_list] + idx_list = range(frame_num) + if not os.path.exists(pose0_im_names[0]): # check 0 index + pose0_im_names = pose0_im_names[1:] + [pose0_dir + f'{frame_num}.png'] # use 1 index + idx_list = list(idx_list)[1:] + [frame_num] + + base_dir=f'./data/{self.name}_sync' + + syncdreamer_im = [] + # for fname in t0_im_names: + assert self.static==False + if self.static==False: + for frame_idx in idx_list: + # for frame_idx in range(1, frame_num + 1): + li = [] + for view_idx in range(16): + fname = os.path.join(base_dir, f"{frame_idx}_0_{view_idx}_rgba.png") + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + # use RGBA + ww = self.T(im) + 
assert ww.shape[0] == 4 + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + li.append(ww) + li = torch.stack(li, dim=0)#.permute(0, 2, 3, 1) + syncdreamer_im.append(li) + self.syncdreamer_im = torch.stack(syncdreamer_im, 0) # [fn, 16, 3, 512, 512] + else: + #sync only read frame0 + # (dejia): not used + for frame_idx in range(frame_num): + li = [] + frame_idx=0 + for view_idx in range(16): + fname = os.path.join(base_dir, f"{frame_idx}_0_{view_idx}_rgba.png") + # fname = os.path.join(base_dir, f"{self.name}{frame_idx}_0_{view_idx}_rgba.png") + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + # use RGBA + ww = self.T(im) + assert ww.shape[0] == 4 + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + li.append(ww) + li = torch.stack(li, dim=0)#.permute(0, 2, 3, 1) + syncdreamer_im.append(li) + self.syncdreamer_im = torch.stack(syncdreamer_im, 0) # [fn, 16, 3, 512, 512] + + print(f"syncdreamer images loaded {self.syncdreamer_im.shape}.") + + self.pose0_im_list = [] + # TODO: should images be RGBA when input?? + for fname in pose0_im_names: + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + ww = self.T(im) + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + self.pose0_im_list.append(ww) + # self.pose0_im_list.append(self.T(im)) + while len(self.pose0_im_list) < self.len_pose0: + self.pose0_im_list.append(ww) + self.pose0_im_list = torch.stack(self.pose0_im_list, dim=0)#.permute(0, 2, 3, 1) + # self.pose0_im_list = self.pose0_im_list.expand(fn, 3, 256, 256) + print(f"Pose0 images loaded {self.pose0_im_list.shape}") + self.syncdreamer_im = torch.cat([self.pose0_im_list.unsqueeze(1), self.syncdreamer_im], 1) + print(f"New syncdreamer shape {self.syncdreamer_im.shape}") + self.max_frames = self.pose0_im_list.shape[0] + print(f"Loaded SDS Dataset. 
Max {self.max_frames} frames.") + + # self.t0_num = self.t0_im_list.shape[0] + self.pose0_num = self.pose0_im_list.shape[0] + if self.split == 'train': + self.t0_num = 16 + 1 # fixed + else: + self.t0_num = 100 + self.len_ = (self.t0_num) * (self.pose0_num) + + pose0_pose = orbit_camera(0, 0, self.radius) + self.pose0_cam = MiniCam( + pose0_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + self.t0_pose = [self.pose0_cam] + [MiniCam( + # self.t0_pose = [MiniCam( + orbit_camera(-30, azimuth, self.radius), + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) for azimuth in np.concatenate([np.arange(0, 180, 22.5), np.arange(-180, 0, 22.5)])] + + # we sample (pose, t) + def __getitem__(self, index): + if self.split == 'train': + t0_idx = index // self.pose0_num + pose0_idx = index % self.pose0_num + time = torch.tensor([pose0_idx]).unsqueeze(0)#.expand(1, self.W * self.H) + else: + t0_idx = index # self.t0_num // 2 + pose0_idx = 1 + time = torch.tensor([pose0_idx]).unsqueeze(0) + + out = { + # timestamp is per pixel + "time": time / self.pose0_num, + 'pose0': self.pose0_im_list[pose0_idx], + 'pose0_idx': pose0_idx, + 't0_idx': t0_idx, + 't0_weight': min(abs(t0_idx), abs(self.t0_num - t0_idx)), + # 't0': self.t0_im_list[t0_idx].view(-1, 3), + # 'pose0': self.pose0_im_list[pose0_idx].view(-1, 3), + # 'bg_color': torch.ones((1, 3), dtype=torch.float32), + "pose0_cam": self.pose0_cam, + } + #t0_idx=0 + if self.split == 'train': + out['t0'] = self.syncdreamer_im[0][t0_idx] + out['gtim'] = self.syncdreamer_im[pose0_idx][t0_idx] # coarse stage + + t0_cam = self.t0_pose[t0_idx] + out['t0_cam'] = t0_cam + # out['sync_cam'] = self.sync_pose + + + + ## for render.py multiview_video + + ver = 0 + hor = (index / 100) * 360 + # ver = np.random.randint(-45, 45) + # hor = np.random.randint(-180, 180) + pose = orbit_camera(0 + ver, hor, self.radius) + out['hor'] = hor + 
out['ver'] = ver + + cur_cam = MiniCam( + pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + out['cur_cam'] = cur_cam + + # for fine stage, random seq + + rand_seq = [] + ver_list = [] + hor_list = [] + # for i in range(self.pose0_num - 1): + for i in range(self.pose0_num): + ver = np.random.randint(-30, 30) + hor = np.random.randint(-180, 180) + cur_pose = orbit_camera(ver, hor, self.radius) + ver_list.append(ver) + hor_list.append(hor) + # cur_pose = orbit_camera(ver_offset[i], hor_offset[i], self.radius) + rand_seq.append(MiniCam( + cur_pose if self.split == 'train' else pose, + # cur_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + )) + out['rand_poses'] = rand_seq + out['rand_ver'] = np.array(ver_list) + out['rand_hor'] = np.array(hor_list) + # out['rand_ver'] = ver_offset + # out['rand_hor'] = hor_offset + + back_pose=orbit_camera(0, 180, self.radius) + out['back_cam']=MiniCam( + back_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + side_pose=orbit_camera(0, 90, self.radius) + out['side_cam']=MiniCam( + side_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + side_pose=orbit_camera(0, 70, self.radius) + out['side_cam2']=MiniCam( + side_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + front_pose=orbit_camera(0, 0, self.radius) + out['front_cam']=MiniCam( + front_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + return out + + def __len__(self): + # we sample (pose, t) + if self.split == 'train': + return self.len_ + if self.split == 'test': + return self.pose0_num + # return self.t0_num + if self.split == 'video': + return 100 + + +class ImageDreamdataset(Dataset): + def __init__( + self, + 
split, + frame_num = 16, + name='panda', + rife=False, + static=False, + ): + self.split = split + # self.args = args + + # https://github.com/threestudio-project/threestudio/blob/main/configs/magic123-coarse-sd.yaml#L22 + # self.radius = 2.5 + self.radius = 2.0 ## imagedream https://github.com/bytedance/ImageDream/blob/13e05566ca27c66b6bc5b3ee42bc68ddfb471585/configs/imagedream-sd21-shading.yaml#L20 + self.W = 512 + self.H = 512 + self.fovy = np.deg2rad(40) + self.fovx = np.deg2rad(40) + # self.fovy = np.deg2rad(49.1) + # self.fovx = np.deg2rad(49.1) + # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61 + self.near = 0.01 + self.far = 100 + self.T = ToTensor() + self.len_pose0 = frame_num + self.name=name + self.rife=rife + self.static=static + + pose0_dir=f'./data/ImageDream/{self.name}/rgba/' + + frame_list = range(frame_num) + pose0_im_names = [pose0_dir + f'{x}.png' for x in frame_list] + idx_list = range(frame_num) + if not os.path.exists(pose0_im_names[0]): # check 0 index + pose0_im_names = pose0_im_names[1:] + [pose0_dir + f'{frame_num}.png'] # use 1 index + idx_list = list(idx_list)[1:] + [frame_num] + + base_dir=f'./data/output_svd/{self.name}' + syncdreamer_im = [] + assert self.static==False + if self.static==False: + for frame_idx in idx_list: + li = [] + for view_idx in range(4): + #view_idx=0 + fname = os.path.join(base_dir, f"{frame_idx}_{view_idx}_rgba.png") + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + # use RGBA + ww = self.T(im) + assert ww.shape[0] == 4 + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + li.append(ww) + li = torch.stack(li, dim=0)#.permute(0, 2, 3, 1) + syncdreamer_im.append(li) + self.syncdreamer_im = torch.stack(syncdreamer_im, 0) # [fn, 16, 3, 512, 512] + else: + raise NotImplementedError + + + + + print(f"imagedream images loaded {self.syncdreamer_im.shape}.") + + self.pose0_im_list = [] + # TODO: should images be 
RGBA when input?? + for fname in pose0_im_names: + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + ww = self.T(im) + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + self.pose0_im_list.append(ww) + # self.pose0_im_list.append(self.T(im)) + while len(self.pose0_im_list) < self.len_pose0: + self.pose0_im_list.append(ww) + self.pose0_im_list = torch.stack(self.pose0_im_list, dim=0)#.permute(0, 2, 3, 1) + # self.pose0_im_list = self.pose0_im_list.expand(fn, 3, 256, 256) + print(f"Pose0 images loaded {self.pose0_im_list.shape}") + # self.syncdreamer_im = torch.cat([self.pose0_im_list.unsqueeze(1), self.syncdreamer_im], 1) + print(f"New syncdreamer shape {self.syncdreamer_im.shape}") + self.max_frames = self.pose0_im_list.shape[0] + print(f"Loaded SDS Dataset. Max {self.max_frames} frames.") + + # self.t0_num = self.t0_im_list.shape[0] + self.pose0_num = self.pose0_im_list.shape[0] + if self.split == 'train': + self.t0_num = 4# + 1 # fixed + else: + self.t0_num = 100 + self.len_ = (self.t0_num) * (self.pose0_num) + + # NOTE: this is different!! 
+ pose0_pose = orbit_camera(0, 90, self.radius) + self.pose0_cam = MiniCam( + pose0_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + # self.t0_pose = [self.pose0_cam] + [MiniCam( + self.t0_pose = [MiniCam( + orbit_camera(0, azimuth, self.radius), + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) for azimuth in np.concatenate([np.arange(0, 180, 90), np.arange(-180, 0, 90)])] + + # we sample (pose, t) + def __getitem__(self, index): + if self.split == 'train': + t0_idx = index // self.pose0_num + pose0_idx = index % self.pose0_num + time = torch.tensor([pose0_idx]).unsqueeze(0)#.expand(1, self.W * self.H) + else: + t0_idx = index # self.t0_num // 2 + pose0_idx = 1 + time = torch.tensor([pose0_idx]).unsqueeze(0) + + out = { + # timestamp is per pixel + "time": time / self.pose0_num, + 'pose0': self.pose0_im_list[pose0_idx], + 'pose0_idx': pose0_idx, + 't0_idx': t0_idx, + 't0_weight': min(abs(t0_idx), abs(self.t0_num - t0_idx)), + # 't0': self.t0_im_list[t0_idx].view(-1, 3), + # 'pose0': self.pose0_im_list[pose0_idx].view(-1, 3), + # 'bg_color': torch.ones((1, 3), dtype=torch.float32), + "pose0_cam": self.pose0_cam, + } + #t0_idx=0 + if self.split == 'train': + out['t0'] = self.syncdreamer_im[0][t0_idx] + out['gtim'] = self.syncdreamer_im[pose0_idx][t0_idx] # coarse stage + + t0_cam = self.t0_pose[t0_idx] + out['t0_cam'] = t0_cam + + ## for render.py multiview_video + ver = 0 + hor = (index / 100) * 360 + pose = orbit_camera(0 + ver, hor, self.radius) + out['hor'] = hor + out['ver'] = ver + + cur_cam = MiniCam( + pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + out['cur_cam'] = cur_cam + + # for fine stage, random seq + + rand_seq = [] + ver_list = [] + hor_list = [] + # for i in range(self.pose0_num - 1): + for i in range(self.pose0_num): + ver = np.random.randint(-30, 30) + hor = 
np.random.randint(-180, 180) + cur_pose = orbit_camera(ver, hor, self.radius) + ver_list.append(ver) + hor_list.append(hor) + # cur_pose = orbit_camera(ver_offset[i], hor_offset[i], self.radius) + rand_seq.append(MiniCam( + cur_pose if self.split == 'train' else pose, + # cur_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + )) + out['rand_poses'] = rand_seq + out['rand_ver'] = np.array(ver_list) + out['rand_hor'] = np.array(hor_list) + # out['rand_ver'] = ver_offset + # out['rand_hor'] = hor_offset + + back_pose=orbit_camera(0, 180, self.radius) + out['back_cam']=MiniCam( + back_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + side_pose=orbit_camera(0, 90, self.radius) + out['side_cam']=MiniCam( + side_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + side_pose=orbit_camera(0, 70, self.radius) + out['side_cam2']=MiniCam( + side_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + front_pose=orbit_camera(0, 0, self.radius) + out['front_cam']=MiniCam( + front_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + + ver = np.random.randint(-30, 30) + hor = np.random.randint(-180, 180) + li = [orbit_camera(ver, hor, self.radius)] + for view_i in range(1, 4): + li.append(orbit_camera(ver, hor + 90 * view_i, self.radius)) + out['dream_pose_mat'] = torch.from_numpy(np.stack(li, axis=0)) + out['dream_pose'] = [MiniCam( + cur_pose, + # cur_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) for cur_pose in li] + return out + + def __len__(self): + # we sample (pose, t) + if self.split == 'train': + return self.len_ + if self.split == 'test': + return self.pose0_num + # return self.t0_num + if self.split == 'video': + 
return 100 diff --git a/scene/neural_3D_dataset_NDC.py b/scene/neural_3D_dataset_NDC.py new file mode 100644 index 0000000..63bbcad --- /dev/null +++ b/scene/neural_3D_dataset_NDC.py @@ -0,0 +1,376 @@ +import concurrent.futures +import gc +import glob +import os + +import cv2 +import numpy as np +import torch +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms as T +from tqdm import tqdm + + +def normalize(v): + """Normalize a vector.""" + return v / np.linalg.norm(v) + + +def average_poses(poses): + """ + Calculate the average pose, which is then used to center all poses + using @center_poses. Its computation is as follows: + 1. Compute the center: the average of pose centers. + 2. Compute the z axis: the normalized average z axis. + 3. Compute axis y': the average y axis. + 4. Compute x' = y' cross product z, then normalize it as the x axis. + 5. Compute the y axis: z cross product x. + + Note that at step 3, we cannot directly use y' as y axis since it's + not necessarily orthogonal to z axis. We need to pass from x to y. + Inputs: + poses: (N_images, 3, 4) + Outputs: + pose_avg: (3, 4) the average pose + """ + # 1. Compute the center + center = poses[..., 3].mean(0) # (3) + + # 2. Compute the z axis + z = normalize(poses[..., 2].mean(0)) # (3) + + # 3. Compute axis y' (no need to normalize as it's not the final output) + y_ = poses[..., 1].mean(0) # (3) + + # 4. Compute the x axis + x = normalize(np.cross(z, y_)) # (3) + + # 5. Compute the y axis (as z and x are normalized, y is already of norm 1) + y = np.cross(x, z) # (3) + + pose_avg = np.stack([x, y, z, center], 1) # (3, 4) + + return pose_avg + + +def center_poses(poses, blender2opencv): + """ + Center the poses so that we can use NDC. 
+ See https://github.com/bmild/nerf/issues/34 + Inputs: + poses: (N_images, 3, 4) + Outputs: + poses_centered: (N_images, 3, 4) the centered poses + pose_avg: (3, 4) the average pose + """ + poses = poses @ blender2opencv + pose_avg = average_poses(poses) # (3, 4) + pose_avg_homo = np.eye(4) + pose_avg_homo[ + :3 + ] = pose_avg # convert to homogeneous coordinate for faster computation + pose_avg_homo = pose_avg_homo + # by simply adding 0, 0, 0, 1 as the last row + last_row = np.tile(np.array([0, 0, 0, 1]), (len(poses), 1, 1)) # (N_images, 1, 4) + poses_homo = np.concatenate( + [poses, last_row], 1 + ) # (N_images, 4, 4) homogeneous coordinate + + poses_centered = np.linalg.inv(pose_avg_homo) @ poses_homo # (N_images, 4, 4) + # poses_centered = poses_centered @ blender2opencv + poses_centered = poses_centered[:, :3] # (N_images, 3, 4) + + return poses_centered, pose_avg_homo + + +def viewmatrix(z, up, pos): + vec2 = normalize(z) + vec1_avg = up + vec0 = normalize(np.cross(vec1_avg, vec2)) + vec1 = normalize(np.cross(vec2, vec0)) + m = np.eye(4) + m[:3] = np.stack([-vec0, vec1, vec2, pos], 1) + return m + + +def render_path_spiral(c2w, up, rads, focal, zdelta, zrate, N_rots=2, N=120): + render_poses = [] + rads = np.array(list(rads) + [1.0]) + + for theta in np.linspace(0.0, 2.0 * np.pi * N_rots, N + 1)[:-1]: + c = np.dot( + c2w[:3, :4], + np.array([np.cos(theta), -np.sin(theta), -np.sin(theta * zrate), 1.0]) + * rads, + ) + z = normalize(c - np.dot(c2w[:3, :4], np.array([0, 0, -focal, 1.0]))) + render_poses.append(viewmatrix(z, up, c)) + return render_poses + + + +def process_video(video_data_save, video_path, img_wh, downsample, transform): + """ + Load video_path data to video_data_save tensor. 
+ """ + video_frames = cv2.VideoCapture(video_path) + count = 0 + video_images_path = video_path.split('.')[0] + image_path = os.path.join(video_images_path,"images") + + if not os.path.exists(image_path): + os.makedirs(image_path) + while video_frames.isOpened(): + ret, video_frame = video_frames.read() + if ret: + video_frame = cv2.cvtColor(video_frame, cv2.COLOR_BGR2RGB) + video_frame = Image.fromarray(video_frame) + if downsample != 1.0: + + img = video_frame.resize(img_wh, Image.LANCZOS) + img.save(os.path.join(image_path,"%04d.png"%count)) + + img = transform(img) + video_data_save[count] = img.permute(1,2,0) + count += 1 + else: + break + + else: + images_path = os.listdir(image_path) + images_path.sort() + + for path in images_path: + img = Image.open(os.path.join(image_path,path)) + if downsample != 1.0: + img = img.resize(img_wh, Image.LANCZOS) + img = transform(img) + video_data_save[count] = img.permute(1,2,0) + count += 1 + + video_frames.release() + print(f"Video {video_path} processed.") + return None + + +# define a function to process all videos +def process_videos(videos, skip_index, img_wh, downsample, transform, num_workers=1): + """ + A multi-threaded function to load all videos fastly and memory-efficiently. + To save memory, we pre-allocate a tensor to store all the images and spawn multi-threads to load the images into this tensor. 
+ """ + all_imgs = torch.zeros(len(videos) - 1, 300, img_wh[-1] , img_wh[-2], 3) + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + # start a thread for each video + current_index = 0 + futures = [] + for index, video_path in enumerate(videos): + # skip the video with skip_index (eval video) + if index == skip_index: + continue + else: + future = executor.submit( + process_video, + all_imgs[current_index], + video_path, + img_wh, + downsample, + transform, + ) + futures.append(future) + current_index += 1 + return all_imgs + +def get_spiral(c2ws_all, near_fars, rads_scale=1.0, N_views=120): + """ + Generate a set of poses using NeRF's spiral camera trajectory as validation poses. + """ + # center pose + c2w = average_poses(c2ws_all) + + # Get average pose + up = normalize(c2ws_all[:, :3, 1].sum(0)) + + # Find a reasonable "focus depth" for this dataset + dt = 0.75 + close_depth, inf_depth = near_fars.min() * 0.9, near_fars.max() * 5.0 + focal = 1.0 / ((1.0 - dt) / close_depth + dt / inf_depth) + + # Get radii for spiral path + zdelta = near_fars.min() * 0.2 + tt = c2ws_all[:, :3, 3] + rads = np.percentile(np.abs(tt), 90, 0) * rads_scale + render_poses = render_path_spiral( + c2w, up, rads, focal, zdelta, zrate=0.5, N=N_views + ) + return np.stack(render_poses) + + +class Neural3D_NDC_Dataset(Dataset): + def __init__( + self, + datadir, + split="train", + downsample=1.0, + is_stack=True, + cal_fine_bbox=False, + N_vis=-1, + time_scale=1.0, + scene_bbox_min=[-1.0, -1.0, -1.0], + scene_bbox_max=[1.0, 1.0, 1.0], + N_random_pose=1000, + bd_factor=0.75, + eval_step=1, + eval_index=0, + sphere_scale=1.0, + ): + self.img_wh = ( + int(1352 / downsample), + int(1014 / downsample), + ) # According to the neural 3D paper, the default resolution is 1024x768 + self.root_dir = datadir + self.split = split + self.downsample = 2704 / self.img_wh[0] + self.is_stack = is_stack + self.N_vis = N_vis + self.time_scale = time_scale + self.scene_bbox = 
torch.tensor([scene_bbox_min, scene_bbox_max]) + + self.world_bound_scale = 1.1 + self.bd_factor = bd_factor + self.eval_step = eval_step + self.eval_index = eval_index + self.blender2opencv = np.eye(4) + self.transform = T.ToTensor() + + self.near = 0.0 + self.far = 1.0 + self.near_far = [self.near, self.far] # NDC near far is [0, 1.0] + self.white_bg = False + self.ndc_ray = True + self.depth_data = False + + self.load_meta() + print(f"meta data loaded, total image:{len(self)}") + + def load_meta(self): + """ + Load meta data from the dataset. + """ + # Read poses and video file paths. + poses_arr = np.load(os.path.join(self.root_dir, "poses_bounds.npy")) + poses = poses_arr[:, :-2].reshape([-1, 3, 5]) # (N_cams, 3, 5) + self.near_fars = poses_arr[:, -2:] + videos = glob.glob(os.path.join(self.root_dir, "cam*")) + videos = sorted(videos) + assert len(videos) == poses_arr.shape[0] + + H, W, focal = poses[0, :, -1] + focal = focal / self.downsample + self.focal = [focal, focal] + poses = np.concatenate([poses[..., 1:2], -poses[..., :1], poses[..., 2:4]], -1) + poses, _ = center_poses( + poses, self.blender2opencv + ) # Re-center poses so that the average is near the center. + + near_original = self.near_fars.min() + scale_factor = near_original * 0.75 + self.near_fars /= ( + scale_factor # rescale nearest plane so that it is at z = 4/3. + ) + poses[..., 3] /= scale_factor + + # Sample N_views poses for validation - NeRF-like camera trajectory. 
+ N_views = 120 + self.val_poses = get_spiral(poses, self.near_fars, N_views=N_views) + # self.val_poses = self.directions + W, H = self.img_wh + poses_i_train = [] + + for i in range(len(poses)): + if i != self.eval_index: + poses_i_train.append(i) + self.poses = poses[poses_i_train] + self.poses_all = poses + self.image_paths, self.image_poses, self.image_times, N_cam, N_time = self.load_images_path(videos, self.split) + self.cam_number = N_cam + self.time_number = N_time + def get_val_pose(self): + render_poses = self.val_poses + render_times = torch.linspace(0.0, 1.0, render_poses.shape[0]) * 2.0 - 1.0 + return render_poses, self.time_scale * render_times + def load_images_path(self,videos,split): + image_paths = [] + image_poses = [] + image_times = [] + N_cams = 0 + N_time = 0 + countss = 300 + for index, video_path in enumerate(videos): + + if index == self.eval_index: + if split =="train": + continue + else: + if split == "test": + continue + N_cams +=1 + count = 0 + video_images_path = video_path.split('.')[0] + image_path = os.path.join(video_images_path,"images") + video_frames = cv2.VideoCapture(video_path) + if not os.path.exists(image_path): + print(f"no images saved in {image_path}, extract images from video.") + os.makedirs(image_path) + this_count = 0 + while video_frames.isOpened(): + ret, video_frame = video_frames.read() + if this_count >= countss:break + if ret: + video_frame = cv2.cvtColor(video_frame, cv2.COLOR_BGR2RGB) + video_frame = Image.fromarray(video_frame) + if self.downsample != 1.0: + + img = video_frame.resize(self.img_wh, Image.LANCZOS) + img.save(os.path.join(image_path,"%04d.png"%count)) + + # img = transform(img) + count += 1 + this_count+=1 + else: + break + + images_path = os.listdir(image_path) + images_path.sort() + this_count = 0 + for idx, path in enumerate(images_path): + if this_count >=countss:break + image_paths.append(os.path.join(image_path,path)) + pose = np.array(self.poses_all[index]) + R = pose[:3,:3] + R = -R + 
R[:,0] = -R[:,0] + T = -pose[:3,3].dot(R) + image_times.append(idx/countss) + image_poses.append((R,T)) + # if self.downsample != 1.0: + # img = video_frame.resize(self.img_wh, Image.LANCZOS) + # img.save(os.path.join(image_path,"%04d.png"%count)) + this_count+=1 + N_time = len(images_path) + + # video_data_save[count] = img.permute(1,2,0) + # count += 1 + return image_paths, image_poses, image_times, N_cams, N_time + def __len__(self): + return len(self.image_paths) + def __getitem__(self,index): + img = Image.open(self.image_paths[index]) + img = img.resize(self.img_wh, Image.LANCZOS) + + img = self.transform(img) + return img, self.image_poses[index], self.image_times[index] + def load_pose(self,index): + return self.image_poses[index] + diff --git a/scene/regulation.py b/scene/regulation.py new file mode 100644 index 0000000..80583a3 --- /dev/null +++ b/scene/regulation.py @@ -0,0 +1,176 @@ +import abc +import os +from typing import Sequence + +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.optim.lr_scheduler +from torch import nn + + + +def compute_plane_tv(t): + batch_size, c, h, w = t.shape + count_h = batch_size * c * (h - 1) * w + count_w = batch_size * c * h * (w - 1) + h_tv = torch.square(t[..., 1:, :] - t[..., :h-1, :]).sum() + w_tv = torch.square(t[..., :, 1:] - t[..., :, :w-1]).sum() + return 2 * (h_tv / count_h + w_tv / count_w) # This is summing over batch and c instead of avg + + +def compute_plane_smoothness(t): + batch_size, c, h, w = t.shape + # Convolve with a second derivative filter, in the time dimension which is dimension 2 + first_difference = t[..., 1:, :] - t[..., :h-1, :] # [batch, c, h-1, w] + second_difference = first_difference[..., 1:, :] - first_difference[..., :h-2, :] # [batch, c, h-2, w] + # Take the L2 norm of the result + return torch.square(second_difference).mean() + + +class Regularizer(): + def __init__(self, reg_type, initialization): + self.reg_type = reg_type + self.initialization = 
initialization + self.weight = float(self.initialization) + self.last_reg = None + + def step(self, global_step): + pass + + def report(self, d): + if self.last_reg is not None: + d[self.reg_type].update(self.last_reg.item()) + + def regularize(self, *args, **kwargs) -> torch.Tensor: + out = self._regularize(*args, **kwargs) * self.weight + self.last_reg = out.detach() + return out + + @abc.abstractmethod + def _regularize(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError() + + def __str__(self): + return f"Regularizer({self.reg_type}, weight={self.weight})" + + +class PlaneTV(Regularizer): + def __init__(self, initial_value, what: str = 'field'): + if what not in {'field', 'proposal_network'}: + raise ValueError(f'what must be one of "field" or "proposal_network" ' + f'but {what} was passed.') + name = f'planeTV-{what[:2]}' + super().__init__(name, initial_value) + self.what = what + + def step(self, global_step): + pass + + def _regularize(self, model, **kwargs): + multi_res_grids: Sequence[nn.ParameterList] + if self.what == 'field': + multi_res_grids = model.field.grids + elif self.what == 'proposal_network': + multi_res_grids = [p.grids for p in model.proposal_networks] + else: + raise NotImplementedError(self.what) + total = 0 + # Note: input to compute_plane_tv should be of shape [batch_size, c, h, w] + for grids in multi_res_grids: + if len(grids) == 3: + spatial_grids = [0, 1, 2] + else: + spatial_grids = [0, 1, 3] # These are the spatial grids; the others are spatiotemporal + for grid_id in spatial_grids: + total += compute_plane_tv(grids[grid_id]) + for grid in grids: + # grid: [1, c, h, w] + total += compute_plane_tv(grid) + return total + + +class TimeSmoothness(Regularizer): + def __init__(self, initial_value, what: str = 'field'): + if what not in {'field', 'proposal_network'}: + raise ValueError(f'what must be one of "field" or "proposal_network" ' + f'but {what} was passed.') + name = f'time-smooth-{what[:2]}' + 
super().__init__(name, initial_value) + self.what = what + + def _regularize(self, model, **kwargs) -> torch.Tensor: + multi_res_grids: Sequence[nn.ParameterList] + if self.what == 'field': + multi_res_grids = model.field.grids + elif self.what == 'proposal_network': + multi_res_grids = [p.grids for p in model.proposal_networks] + else: + raise NotImplementedError(self.what) + total = 0 + # model.grids is 6 x [1, rank * F_dim, reso, reso] + for grids in multi_res_grids: + if len(grids) == 3: + time_grids = [] + else: + time_grids = [2, 4, 5] + for grid_id in time_grids: + total += compute_plane_smoothness(grids[grid_id]) + return torch.as_tensor(total) + + + +class L1ProposalNetwork(Regularizer): + def __init__(self, initial_value): + super().__init__('l1-proposal-network', initial_value) + + def _regularize(self, model, **kwargs) -> torch.Tensor: + grids = [p.grids for p in model.proposal_networks] + total = 0.0 + for pn_grids in grids: + for grid in pn_grids: + total += torch.abs(grid).mean() + return torch.as_tensor(total) + + +class DepthTV(Regularizer): + def __init__(self, initial_value): + super().__init__('tv-depth', initial_value) + + def _regularize(self, model, model_out, **kwargs) -> torch.Tensor: + depth = model_out['depth'] + tv = compute_plane_tv( + depth.reshape(64, 64)[None, None, :, :] + ) + return tv + + +class L1TimePlanes(Regularizer): + def __init__(self, initial_value, what='field'): + if what not in {'field', 'proposal_network'}: + raise ValueError(f'what must be one of "field" or "proposal_network" ' + f'but {what} was passed.') + super().__init__(f'l1-time-{what[:2]}', initial_value) + self.what = what + + def _regularize(self, model, **kwargs) -> torch.Tensor: + # model.grids is 6 x [1, rank * F_dim, reso, reso] + multi_res_grids: Sequence[nn.ParameterList] + if self.what == 'field': + multi_res_grids = model.field.grids + elif self.what == 'proposal_network': + multi_res_grids = [p.grids for p in model.proposal_networks] + else: + raise 
NotImplementedError(self.what) + + total = 0.0 + for grids in multi_res_grids: + if len(grids) == 3: + continue + else: + # These are the spatiotemporal grids + spatiotemporal_grids = [2, 4, 5] + for grid_id in spatiotemporal_grids: + total += torch.abs(1 - grids[grid_id]).mean() + return torch.as_tensor(total) + diff --git a/scene/text_dataset.py b/scene/text_dataset.py new file mode 100644 index 0000000..a88092c --- /dev/null +++ b/scene/text_dataset.py @@ -0,0 +1,786 @@ +from torch.utils.data import Dataset +# from scene.cameras import Camera +import numpy as np +from utils.general_utils import PILtoTorch +from utils.graphics_utils import fov2focal, focal2fov +import torch +from utils.camera_utils import loadCam +from utils.graphics_utils import focal2fov + +from torchvision.transforms import ToTensor +from PIL import Image +import glob +from scene.cam_utils import orbit_camera +import math, os + +def getProjectionMatrix(znear, zfar, fovX, fovY): + tanHalfFovY = math.tan((fovY / 2)) + tanHalfFovX = math.tan((fovX / 2)) + + P = torch.zeros(4, 4) + + z_sign = 1.0 + + P[0, 0] = 1 / tanHalfFovX + P[1, 1] = 1 / tanHalfFovY + P[3, 2] = z_sign + P[2, 2] = z_sign * zfar / (zfar - znear) + P[2, 3] = -(zfar * znear) / (zfar - znear) + return P + + +class MiniCam: + def __init__(self, c2w, width, height, fovy, fovx, znear, zfar): + # c2w (pose) should be in NeRF convention. + + self.image_width = width + self.image_height = height + self.FoVy = fovy + self.FoVx = fovx + self.znear = znear + self.zfar = zfar + + w2c = np.linalg.inv(c2w) + + # rectify... 
class FourDGSdataset(Dataset):
    """Dataset of (view, frame) samples built from pre-generated RGBA images.

    Reads ``frame_num`` pose-0 frames from ``data/{name}_pose0/`` and 16 views
    per frame from ``./data/{name}_sync``, composites them on a white
    background and exposes fixed and random cameras for each sample.
    """

    # (output key, azimuth in degrees) of the fixed elevation-0 cameras.
    _FIXED_VIEWS = (
        ('back_cam', 180),
        ('side_cam', 90),
        ('side_cam2', 70),
        ('front_cam', 0),
    )

    def __init__(
        self,
        split,
        frame_num = 16,
        name='panda',
        rife=False,
        static=False,
    ):
        """Load all images into memory and pre-build the fixed cameras.

        Args:
            split: 'train', 'test' or 'video'; selects indexing and length.
            frame_num: Number of pose-0 frames to load.
            name: Asset name used to build the data directories.
            rife: Stored on the instance; not used in this class.
            static: Must be False (asserted below).
        """
        self.split = split

        # https://github.com/threestudio-project/threestudio/blob/main/configs/magic123-coarse-sd.yaml#L22
        self.radius = 4
        self.W = 512
        self.H = 512
        self.fovy = np.deg2rad(40)
        self.fovx = np.deg2rad(40)
        # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61
        self.near = 0.01
        self.far = 100
        self.T = ToTensor()
        self.len_pose0 = frame_num
        self.name = name
        self.rife = rife
        self.static = static

        pose0_dir = f'data/{self.name}_pose0/'
        idx_list = list(range(frame_num))
        pose0_im_names = [pose0_dir + f'{x}.png' for x in idx_list]
        if not os.path.exists(pose0_im_names[0]):
            # No '0.png' on disk: assume the frames are 1-indexed instead.
            pose0_im_names = pose0_im_names[1:] + [pose0_dir + f'{frame_num}.png']
            idx_list = idx_list[1:] + [frame_num]

        base_dir = f'./data/{self.name}_sync'

        assert self.static==False
        # The static path (only reachable when asserts are disabled) reads
        # frame 0 for every time step; the author marks it as not used.
        frame_ids = idx_list if self.static==False else [0] * frame_num
        per_frame = []
        for frame_idx in frame_ids:
            views = [
                self._load_on_white(
                    os.path.join(base_dir, f"{frame_idx}_0_{view_idx}_rgba.png"),
                    require_rgba=True,
                )
                for view_idx in range(16)
            ]
            per_frame.append(torch.stack(views, dim=0))
        # [frames, 16 views, 4 channels, ...]; presumably (H, W) last -- the
        # exact layout is whatever ToTensor produces.
        self.syncdreamer_im = torch.stack(per_frame, 0)

        print(f"syncdreamer images loaded {self.syncdreamer_im.shape}.")

        # TODO: should images be RGBA when input??
        pose0_images = [self._load_on_white(fname) for fname in pose0_im_names]
        while len(pose0_images) < self.len_pose0:
            # Pad with the last loaded frame.
            pose0_images.append(pose0_images[-1])
        self.pose0_im_list = torch.stack(pose0_images, dim=0)
        print(f"Pose0 images loaded {self.pose0_im_list.shape}")
        # Prepend the pose-0 frame as view 0 of every time step (16 -> 17 views).
        self.syncdreamer_im = torch.cat([self.pose0_im_list.unsqueeze(1), self.syncdreamer_im], 1)
        print(f"New syncdreamer shape {self.syncdreamer_im.shape}")
        self.max_frames = self.pose0_im_list.shape[0]
        print(f"Loaded SDS Dataset. Max {self.max_frames} frames.")

        self.pose0_num = self.pose0_im_list.shape[0]
        if self.split == 'train':
            self.t0_num = 16 + 1 # fixed
        else:
            self.t0_num = 100
        self.len_ = (self.t0_num) * (self.pose0_num)

        self.pose0_cam = self._make_cam(orbit_camera(0, 0, self.radius))
        azimuths = np.concatenate([np.arange(0, 180, 22.5), np.arange(-180, 0, 22.5)])
        self.t0_pose = [self.pose0_cam] + [
            self._make_cam(orbit_camera(-30, azimuth, self.radius))
            for azimuth in azimuths
        ]

    def _load_on_white(self, fname, require_rgba=False):
        """Open, resize and composite an image onto white using its last channel."""
        im = Image.open(fname).resize((self.W, self.H))
        ww = self.T(im)
        if require_rgba:
            assert ww.shape[0] == 4
        ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:])
        return ww

    def _make_cam(self, pose):
        """Wrap a pose in a MiniCam using this dataset's intrinsics."""
        return MiniCam(
            pose,
            self.H, # NOTE: order might be wrong
            self.W,
            self.fovy,
            self.fovx,
            self.near,
            self.far,
        )

    # we sample (pose, t)
    def __getitem__(self, index):
        """Return the sample dict (images, indices and cameras) for ``index``."""
        is_train = self.split == 'train'
        if is_train:
            t0_idx = index // self.pose0_num
            pose0_idx = index % self.pose0_num
        else:
            t0_idx = index
            pose0_idx = 1
        time = torch.tensor([pose0_idx]).unsqueeze(0)

        out = {
            # timestamp is per pixel
            "time": time / self.pose0_num,
            'pose0': self.pose0_im_list[pose0_idx],
            'pose0_idx': pose0_idx,
            't0_idx': t0_idx,
            't0_weight': min(abs(t0_idx), abs(self.t0_num - t0_idx)),
            "pose0_cam": self.pose0_cam,
        }
        if is_train:
            out['t0'] = self.syncdreamer_im[0][t0_idx]
            out['gtim'] = self.syncdreamer_im[pose0_idx][t0_idx] # coarse stage
            out['t0_cam'] = self.t0_pose[t0_idx]

        ## for render.py multiview_video
        ver = 0
        hor = (index / 100) * 360
        pose = orbit_camera(0 + ver, hor, self.radius)
        out['hor'] = hor
        out['ver'] = ver
        out['cur_cam'] = self._make_cam(pose)

        # for fine stage, random seq
        rand_seq = []
        ver_list = []
        hor_list = []
        for _ in range(self.pose0_num):
            # Angles are drawn for every split so the RNG stream is unchanged.
            rand_ver = np.random.randint(-30, 30)
            rand_hor = np.random.randint(-180, 180)
            cur_pose = orbit_camera(rand_ver, rand_hor, self.radius)
            ver_list.append(rand_ver)
            hor_list.append(rand_hor)
            rand_seq.append(self._make_cam(cur_pose if is_train else pose))
        out['rand_poses'] = rand_seq
        out['rand_ver'] = np.array(ver_list)
        out['rand_hor'] = np.array(hor_list)

        for key, azimuth in self._FIXED_VIEWS:
            out[key] = self._make_cam(orbit_camera(0, azimuth, self.radius))
        return out

    def __len__(self):
        """Return the number of samples for the configured split.

        Raises:
            ValueError: If ``split`` is not 'train', 'test' or 'video'.
        """
        # we sample (pose, t)
        if self.split == 'train':
            return self.len_
        if self.split == 'test':
            return self.pose0_num
        if self.split == 'video':
            return 100
        raise ValueError(f"Unknown split {self.split!r}; expected 'train', 'test' or 'video'.")
class ImageDreamdataset(Dataset):
    """Dataset of (view, frame) samples with four views per frame.

    Reads ``frame_num`` pose-0 frames from ``./data/ImageDream/{name}/rgba/``
    and four views per frame from ``./data/output_svd/{name}``, composites
    them on a white background and exposes fixed, random and four-view
    cameras for each sample.
    """

    # (output key, azimuth in degrees) of the fixed elevation-0 cameras.
    _FIXED_VIEWS = (
        ('back_cam', 180),
        ('side_cam', 90),
        ('side_cam2', 70),
        ('front_cam', 0),
    )

    def __init__(
        self,
        split,
        frame_num = 16,
        name='panda',
        rife=False,
        static=False,
    ):
        """Load all images into memory and pre-build the fixed cameras.

        Args:
            split: 'train', 'test' or 'video'; selects indexing and length.
            frame_num: Number of pose-0 frames to load.
            name: Asset name used to build the data directories.
            rife: Stored on the instance; not used in this class.
            static: Must be False; the static path is not implemented.
        """
        self.split = split

        # https://github.com/threestudio-project/threestudio/blob/main/configs/magic123-coarse-sd.yaml#L22
        self.radius = 2.0 ## imagedream https://github.com/bytedance/ImageDream/blob/13e05566ca27c66b6bc5b3ee42bc68ddfb471585/configs/imagedream-sd21-shading.yaml#L20
        self.W = 512
        self.H = 512
        self.fovy = np.deg2rad(40)
        self.fovx = np.deg2rad(40)
        # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61
        self.near = 0.01
        self.far = 100
        self.T = ToTensor()
        self.len_pose0 = frame_num
        self.name = name
        self.rife = rife
        self.static = static

        pose0_dir = f'./data/ImageDream/{self.name}/rgba/'
        idx_list = list(range(frame_num))
        pose0_im_names = [pose0_dir + f'{x}.png' for x in idx_list]
        if not os.path.exists(pose0_im_names[0]):
            # No '0.png' on disk: assume the frames are 1-indexed instead.
            pose0_im_names = pose0_im_names[1:] + [pose0_dir + f'{frame_num}.png']
            idx_list = idx_list[1:] + [frame_num]

        base_dir = f'./data/output_svd/{self.name}'

        assert self.static==False
        if not self.static==False:
            # Only reachable when asserts are disabled.
            raise NotImplementedError
        per_frame = []
        for frame_idx in idx_list:
            views = [
                self._load_on_white(
                    os.path.join(base_dir, f"{frame_idx}_{view_idx}_rgba.png"),
                    require_rgba=True,
                )
                for view_idx in range(4)
            ]
            per_frame.append(torch.stack(views, dim=0))
        # [frames, 4 views, 4 channels, ...]; presumably (H, W) last -- the
        # exact layout is whatever ToTensor produces.
        self.syncdreamer_im = torch.stack(per_frame, 0)

        print(f"imagedream images loaded {self.syncdreamer_im.shape}.")

        # TODO: should images be RGBA when input??
        pose0_images = [self._load_on_white(fname) for fname in pose0_im_names]
        while len(pose0_images) < self.len_pose0:
            # Pad with the last loaded frame.
            pose0_images.append(pose0_images[-1])
        self.pose0_im_list = torch.stack(pose0_images, dim=0)
        print(f"Pose0 images loaded {self.pose0_im_list.shape}")
        # Unlike FourDGSdataset, the pose-0 frame is NOT prepended as a view.
        print(f"New syncdreamer shape {self.syncdreamer_im.shape}")
        self.max_frames = self.pose0_im_list.shape[0]
        print(f"Loaded SDS Dataset. Max {self.max_frames} frames.")

        self.pose0_num = self.pose0_im_list.shape[0]
        if self.split == 'train':
            self.t0_num = 4 # fixed
        else:
            self.t0_num = 100
        self.len_ = (self.t0_num) * (self.pose0_num)

        # NOTE: this is different!! (the pose-0 camera sits at azimuth 90)
        self.pose0_cam = self._make_cam(orbit_camera(0, 90, self.radius))
        azimuths = np.concatenate([np.arange(0, 180, 90), np.arange(-180, 0, 90)])
        self.t0_pose = [
            self._make_cam(orbit_camera(0, azimuth, self.radius))
            for azimuth in azimuths
        ]

    def _load_on_white(self, fname, require_rgba=False):
        """Open, resize and composite an image onto white using its last channel."""
        im = Image.open(fname).resize((self.W, self.H))
        ww = self.T(im)
        if require_rgba:
            assert ww.shape[0] == 4
        ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:])
        return ww

    def _make_cam(self, pose):
        """Wrap a pose in a MiniCam using this dataset's intrinsics."""
        return MiniCam(
            pose,
            self.H, # NOTE: order might be wrong
            self.W,
            self.fovy,
            self.fovx,
            self.near,
            self.far,
        )

    # we sample (pose, t)
    def __getitem__(self, index):
        """Return the sample dict (images, indices and cameras) for ``index``."""
        is_train = self.split == 'train'
        if is_train:
            t0_idx = index // self.pose0_num
            pose0_idx = index % self.pose0_num
        else:
            t0_idx = index
            pose0_idx = 1
        time = torch.tensor([pose0_idx]).unsqueeze(0)

        out = {
            # timestamp is per pixel
            "time": time / self.pose0_num,
            'pose0': self.pose0_im_list[pose0_idx],
            'pose0_idx': pose0_idx,
            't0_idx': t0_idx,
            't0_weight': min(abs(t0_idx), abs(self.t0_num - t0_idx)),
            "pose0_cam": self.pose0_cam,
        }
        if is_train:
            out['t0'] = self.syncdreamer_im[0][t0_idx]
            out['gtim'] = self.syncdreamer_im[pose0_idx][t0_idx] # coarse stage
            out['t0_cam'] = self.t0_pose[t0_idx]

        ## for render.py multiview_video
        ver = 0
        hor = (index / 100) * 360
        pose = orbit_camera(0 + ver, hor, self.radius)
        out['hor'] = hor
        out['ver'] = ver
        out['cur_cam'] = self._make_cam(pose)

        # for fine stage, random seq
        rand_seq = []
        ver_list = []
        hor_list = []
        for _ in range(self.pose0_num):
            # Angles are drawn for every split so the RNG stream is unchanged.
            rand_ver = np.random.randint(-30, 30)
            rand_hor = np.random.randint(-180, 180)
            cur_pose = orbit_camera(rand_ver, rand_hor, self.radius)
            ver_list.append(rand_ver)
            hor_list.append(rand_hor)
            rand_seq.append(self._make_cam(cur_pose if is_train else pose))
        out['rand_poses'] = rand_seq
        out['rand_ver'] = np.array(ver_list)
        out['rand_hor'] = np.array(hor_list)

        for key, azimuth in self._FIXED_VIEWS:
            out[key] = self._make_cam(orbit_camera(0, azimuth, self.radius))

        # Four cameras at one random elevation, spaced 90 degrees in azimuth.
        dream_ver = np.random.randint(-30, 30)
        dream_hor = np.random.randint(-180, 180)
        dream_poses = [orbit_camera(dream_ver, dream_hor, self.radius)]
        for view_i in range(1, 4):
            dream_poses.append(orbit_camera(dream_ver, dream_hor + 90 * view_i, self.radius))
        out['dream_pose_mat'] = torch.from_numpy(np.stack(dream_poses, axis=0))
        out['dream_pose'] = [self._make_cam(cur_pose) for cur_pose in dream_poses]
        return out

    def __len__(self):
        """Return the number of samples for the configured split.

        Raises:
            ValueError: If ``split`` is not 'train', 'test' or 'video'.
        """
        # we sample (pose, t)
        if self.split == 'train':
            return self.len_
        if self.split == 'test':
            return self.pose0_num
        if self.split == 'video':
            return 100
        raise ValueError(f"Unknown split {self.split!r}; expected 'train', 'test' or 'video'.")
class Text4Ddataset(Dataset):
    """Image-free dataset that only yields cameras and time stamps.

    Every sample carries a smooth random camera trajectory of ``pose0_num``
    steps plus four cameras spaced 90 degrees apart in azimuth.
    """

    def __init__(
        self,
        split,
        frame_num = 24,
        name='panda',
        rife=False,
        static=False,
    ):
        """Set the intrinsics and pre-build the fixed cameras.

        Args:
            split: 'train', 'test' or 'video'; selects indexing and length.
            frame_num: Accepted for signature compatibility; not used here.
            name: Stored on the instance.
            rife: Stored on the instance.
            static: Stored on the instance.
        """
        self.split = split

        # https://github.com/threestudio-project/threestudio/blob/main/configs/magic123-coarse-sd.yaml#L22
        self.radius = 4
        # Original author's remark: W and H are swapped.
        self.W = 320
        self.H = 576
        self.fovy = np.deg2rad(40) # 30 .. 60
        self.fovx = np.deg2rad(40)
        # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61
        self.near = 0.01
        self.far = 100
        self.T = ToTensor()
        self.name = name
        self.rife = rife
        self.static = static

        self.max_frames = 16

        print(f"Loaded SDS Dataset. Max {self.max_frames} frames.")

        self.pose0_num = self.max_frames
        if self.split == 'train':
            self.t0_num = 4 # fixed
        else:
            self.t0_num = 100
        self.len_ = (self.t0_num) * (self.pose0_num)

        # NOTE: this is different!! (the pose-0 camera sits at azimuth 90)
        self.pose0_pose = orbit_camera(0, 90, self.radius)
        self.pose0_cam = self._make_cam(self.pose0_pose)
        azimuths = np.concatenate([np.arange(0, 180, 90), np.arange(-180, 0, 90)])
        self.t0_pose = [
            self._make_cam(orbit_camera(0, azimuth, self.radius))
            for azimuth in azimuths
        ]

    def _make_cam(self, pose):
        """Wrap a pose in a MiniCam using the dataset's *current* fovy/fovx."""
        return MiniCam(
            pose,
            self.H, # NOTE: order might be wrong
            self.W,
            self.fovy,
            self.fovx,
            self.near,
            self.far,
        )

    # we sample (pose, t)
    def __getitem__(self, index):
        """Return the camera/time sample dict for ``index``.

        Side effect: ``self.fovy`` and ``self.fovx`` are overwritten with a
        fresh random field of view on every call.
        """
        self.fovy = np.deg2rad(np.random.random() * 30 + 30) # 30, 60
        self.fovx = self.fovy

        is_train = self.split == 'train'
        if is_train:
            t0_idx = index // self.pose0_num
            pose0_idx = index % self.pose0_num
        else:
            t0_idx = index
            pose0_idx = 1
        time = torch.tensor([pose0_idx]).unsqueeze(0)

        out = {
            # timestamp is per pixel
            "time": time / self.pose0_num,
            'pose0_idx': pose0_idx,
            't0_idx': t0_idx,
            't0_weight': min(abs(t0_idx), abs(self.t0_num - t0_idx)),
            "pose0_cam": self.pose0_cam,
        }
        if is_train:
            out['t0_cam'] = self.t0_pose[t0_idx]

        ## for render.py multiview_video
        ver = 0
        hor = (index / 100) * 360
        pose = orbit_camera(0 + ver, hor, self.radius)
        out['hor'] = hor
        out['ver'] = ver
        out['cur_cam'] = MiniCam(
            pose,
            self.H, # NOTE: order might be wrong
            self.W,
            np.deg2rad(40), # fix fov for test time
            np.deg2rad(40),
            self.near,
            self.far,
        )

        # for fine stage, random seq: a constant-velocity walk in
        # (elevation, azimuth, distance) from a random start.
        elevation_range_delta = (-40 / 4, 40 / 4)
        azimuth_range_delta = (-60 / 4, 60 / 4)
        dist_range_delta = (-2 / 4, 2 / 4)
        # The result is unused, but the draw is kept so the RNG stream that
        # follows is unchanged.
        np.random.random(3)
        acc_ver = np.random.random() * 45 - 45 # [-45, 0]
        acc_hor = np.random.random() * 360 - 180
        acc_dist = self.radius
        ele_delta = np.random.random() * (elevation_range_delta[1] - elevation_range_delta[0]) + elevation_range_delta[0]
        azi_delta = np.random.random() * (azimuth_range_delta[1] - azimuth_range_delta[0]) + azimuth_range_delta[0]
        dist_delta = np.random.random() * (dist_range_delta[1] - dist_range_delta[0]) + dist_range_delta[0]

        rand_seq = []
        ver_list = []
        hor_list = []
        dist_list = []
        for _ in range(self.pose0_num):
            acc_ver += ele_delta
            acc_hor += azi_delta
            acc_dist += dist_delta
            cur_pose = orbit_camera(acc_ver, acc_hor, acc_dist, jitter=True)
            ver_list.append(acc_ver)
            hor_list.append(acc_hor)
            dist_list.append(acc_dist)
            rand_seq.append(self._make_cam(cur_pose if is_train else pose))
        out['rand_poses'] = rand_seq
        out['rand_ver'] = np.array(ver_list)
        out['rand_hor'] = np.array(hor_list)
        out['rand_dist'] = np.array(dist_list)

        # Four cameras at one random elevation, spaced 90 degrees in azimuth.
        dream_ver = np.random.randint(-30, 30)
        dream_hor = np.random.randint(-180, 180)
        dream_poses = [orbit_camera(dream_ver, dream_hor, self.radius)]
        for view_i in range(1, 4):
            dream_poses.append(orbit_camera(dream_ver, dream_hor + 90 * view_i, self.radius))
        out['dream_pose_mat'] = torch.from_numpy(np.stack(dream_poses, axis=0)).float()
        out['dream_pose'] = [self._make_cam(cur_pose) for cur_pose in dream_poses]
        return out

    def __len__(self):
        """Return the number of samples for the configured split.

        Raises:
            ValueError: If ``split`` is not 'train', 'test' or 'video'.
        """
        # we sample (pose, t)
        if self.split == 'train':
            return self.len_
        if self.split == 'test':
            return self.pose0_num
        if self.split == 'video':
            return 100
        raise ValueError(f"Unknown split {self.split!r}; expected 'train', 'test' or 'video'.")
self.far, + ) for cur_pose in li] + return out + + def __len__(self): + # we sample (pose, t) + if self.split == 'train': + return self.len_ + if self.split == 'test': + return self.pose0_num + # return self.t0_num + if self.split == 'video': + return 100 diff --git a/scene/utils.py b/scene/utils.py new file mode 100644 index 0000000..d6edf7e --- /dev/null +++ b/scene/utils.py @@ -0,0 +1,429 @@ +import copy +import json +import math +import os +import pathlib +from typing import Any, Callable, List, Optional, Text, Tuple, Union + +import numpy as np +import scipy.signal +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +PRNGKey = Any +Shape = Tuple[int] +Dtype = Any # this could be a real type? +Array = Any +Activation = Callable[[Array], Array] +Initializer = Callable[[PRNGKey, Shape, Dtype], Array] +Normalizer = Callable[[], Callable[[Array], Array]] +PathType = Union[Text, pathlib.PurePosixPath] + +from pathlib import PurePosixPath as GPath + + +def _compute_residual_and_jacobian( + x: np.ndarray, + y: np.ndarray, + xd: np.ndarray, + yd: np.ndarray, + k1: float = 0.0, + k2: float = 0.0, + k3: float = 0.0, + p1: float = 0.0, + p2: float = 0.0, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, + np.ndarray]: + """Auxiliary function of radial_and_tangential_undistort().""" + + r = x * x + y * y + d = 1.0 + r * (k1 + r * (k2 + k3 * r)) + + fx = d * x + 2 * p1 * x * y + p2 * (r + 2 * x * x) - xd + fy = d * y + 2 * p2 * x * y + p1 * (r + 2 * y * y) - yd + + # Compute derivative of d over [x, y] + d_r = (k1 + r * (2.0 * k2 + 3.0 * k3 * r)) + d_x = 2.0 * x * d_r + d_y = 2.0 * y * d_r + + # Compute derivative of fx over x and y. + fx_x = d + d_x * x + 2.0 * p1 * y + 6.0 * p2 * x + fx_y = d_y * x + 2.0 * p1 * x + 2.0 * p2 * y + + # Compute derivative of fy over x and y. 
def _radial_and_tangential_undistort(
    xd: np.ndarray,
    yd: np.ndarray,
    k1: float = 0,
    k2: float = 0,
    k3: float = 0,
    p1: float = 0,
    p2: float = 0,
    eps: float = 1e-9,
    max_iterations=10) -> Tuple[np.ndarray, np.ndarray]:
    """Computes undistorted (x, y) from (xd, yd).

    Runs ``max_iterations`` Newton steps on the residual returned by
    ``_compute_residual_and_jacobian``; there is no early exit.

    Args:
        xd, yd: Distorted coordinates (arrays of equal shape).
        k1, k2, k3: Radial distortion coefficients.
        p1, p2: Tangential distortion coefficients.
        eps: Elements whose Jacobian determinant magnitude is not above this
            value get a zero update for that iteration.
        max_iterations: Number of Newton iterations to run.

    Returns:
        Tuple ``(x, y)`` of undistorted coordinates.
    """
    # Initialize from the distorted point.
    x = xd.copy()
    y = yd.copy()

    for _ in range(max_iterations):
        fx, fy, fx_x, fx_y, fy_x, fy_y = _compute_residual_and_jacobian(
            x=x, y=y, xd=xd, yd=yd, k1=k1, k2=k2, k3=k3, p1=p1, p2=p2)
        denominator = fy_x * fx_y - fx_x * fy_y
        x_numerator = fx * fy_y - fy * fx_y
        y_numerator = fy * fx_x - fx * fy_x
        # Skip the update wherever the 2x2 system is (near-)singular.
        solvable = np.abs(denominator) > eps
        no_step = np.zeros_like(denominator)
        step_x = np.where(solvable, x_numerator / denominator, no_step)
        step_y = np.where(solvable, y_numerator / denominator, no_step)

        x = x + step_x
        y = y + step_y

    return x, y


class Camera:
    """Class to handle camera geometry.

    Stores extrinsics (``orientation``, ``position``), intrinsics
    (``focal_length``, ``principal_point``, ``skew``, ``pixel_aspect_ratio``),
    distortion coefficients and ``image_size`` as numpy arrays.
    """

    def __init__(self,
                 orientation: np.ndarray,
                 position: np.ndarray,
                 focal_length: Union[np.ndarray, float],
                 principal_point: np.ndarray,
                 image_size: np.ndarray,
                 skew: Union[np.ndarray, float] = 0.0,
                 pixel_aspect_ratio: Union[np.ndarray, float] = 1.0,
                 radial_distortion: Optional[np.ndarray] = None,
                 tangential_distortion: Optional[np.ndarray] = None,
                 dtype=np.float32):
        """Constructor for camera class.

        All float parameters are converted to arrays of ``dtype``;
        ``image_size`` is stored as ``np.uint32``. Missing distortion
        coefficients default to zeros (3 radial, 2 tangential).
        """
        if radial_distortion is None:
            radial_distortion = np.array([0.0, 0.0, 0.0], dtype)
        if tangential_distortion is None:
            tangential_distortion = np.array([0.0, 0.0], dtype)

        self.orientation = np.array(orientation, dtype)
        self.position = np.array(position, dtype)
        self.focal_length = np.array(focal_length, dtype)
        self.principal_point = np.array(principal_point, dtype)
        self.skew = np.array(skew, dtype)
        self.pixel_aspect_ratio = np.array(pixel_aspect_ratio, dtype)
        self.radial_distortion = np.array(radial_distortion, dtype)
        self.tangential_distortion = np.array(tangential_distortion, dtype)
        self.image_size = np.array(image_size, np.uint32)
        self.dtype = dtype

    @classmethod
    def from_json(cls, path: 'PathType'):
        """Loads a JSON camera into memory."""
        path = GPath(path)
        with open(path, 'r') as fp:
            camera_json = json.load(fp)

        # Fix old camera JSON.
        if 'tangential' in camera_json:
            camera_json['tangential_distortion'] = camera_json['tangential']

        return cls(
            orientation=np.asarray(camera_json['orientation']),
            position=np.asarray(camera_json['position']),
            focal_length=camera_json['focal_length'],
            principal_point=np.asarray(camera_json['principal_point']),
            skew=camera_json['skew'],
            pixel_aspect_ratio=camera_json['pixel_aspect_ratio'],
            radial_distortion=np.asarray(camera_json['radial_distortion']),
            tangential_distortion=np.asarray(camera_json['tangential_distortion']),
            image_size=np.asarray(camera_json['image_size']),
        )

    def to_json(self):
        """Returns the camera parameters as a dict of JSON-friendly values."""
        return {
            k: (v.tolist() if hasattr(v, 'tolist') else v)
            for k, v in self.get_parameters().items()
        }

    def get_parameters(self):
        """Returns the camera parameters as a dict of numpy arrays."""
        return {
            'orientation': self.orientation,
            'position': self.position,
            'focal_length': self.focal_length,
            'principal_point': self.principal_point,
            'skew': self.skew,
            'pixel_aspect_ratio': self.pixel_aspect_ratio,
            'radial_distortion': self.radial_distortion,
            'tangential_distortion': self.tangential_distortion,
            'image_size': self.image_size,
        }

    @property
    def scale_factor_x(self):
        return self.focal_length

    @property
    def scale_factor_y(self):
        return self.focal_length * self.pixel_aspect_ratio

    @property
    def principal_point_x(self):
        return self.principal_point[0]

    @property
    def principal_point_y(self):
        return self.principal_point[1]

    @property
    def has_tangential_distortion(self):
        """True if any tangential distortion coefficient is non-zero."""
        return bool(np.any(self.tangential_distortion != 0.0))

    @property
    def has_radial_distortion(self):
        """True if any radial distortion coefficient is non-zero."""
        return bool(np.any(self.radial_distortion != 0.0))

    @property
    def image_size_y(self):
        return self.image_size[1]

    @property
    def image_size_x(self):
        return self.image_size[0]

    @property
    def image_shape(self):
        """Returns ``(image_size_y, image_size_x)``."""
        return self.image_size_y, self.image_size_x

    @property
    def optical_axis(self):
        """Third row of the orientation matrix."""
        return self.orientation[2, :]

    @property
    def translation(self):
        return -np.matmul(self.orientation, self.position)

    def pixel_to_local_rays(self, pixels: np.ndarray):
        """Returns the local ray directions for the provided pixels."""
        y = ((pixels[..., 1] - self.principal_point_y) / self.scale_factor_y)
        x = ((pixels[..., 0] - self.principal_point_x - y * self.skew) /
             self.scale_factor_x)

        if self.has_radial_distortion or self.has_tangential_distortion:
            x, y = _radial_and_tangential_undistort(
                x,
                y,
                k1=self.radial_distortion[0],
                k2=self.radial_distortion[1],
                k3=self.radial_distortion[2],
                p1=self.tangential_distortion[0],
                p2=self.tangential_distortion[1])

        dirs = np.stack([x, y, np.ones_like(x)], axis=-1)
        return dirs / np.linalg.norm(dirs, axis=-1, keepdims=True)

    def pixels_to_rays(self, pixels: np.ndarray) -> np.ndarray:
        """Returns the rays for the provided pixels.

        Args:
          pixels: [A1, ..., An, 2] tensor or np.array containing 2d pixel
            positions.

        Returns:
          An array containing the normalized ray directions in world
          coordinates.

        Raises:
          ValueError: If the last dimension is not 2 or the dtype of
            ``pixels`` differs from the camera dtype.
        """
        if pixels.shape[-1] != 2:
            raise ValueError('The last dimension of pixels must be 2.')
        if pixels.dtype != self.dtype:
            raise ValueError(f'pixels dtype ({pixels.dtype!r}) must match camera '
                             f'dtype ({self.dtype!r})')

        batch_shape = pixels.shape[:-1]
        pixels = np.reshape(pixels, (-1, 2))

        local_rays_dir = self.pixel_to_local_rays(pixels)
        rays_dir = np.matmul(self.orientation.T, local_rays_dir[..., np.newaxis])
        rays_dir = np.squeeze(rays_dir, axis=-1)

        # Normalize rays.
        rays_dir /= np.linalg.norm(rays_dir, axis=-1, keepdims=True)
        rays_dir = rays_dir.reshape((*batch_shape, 3))
        return rays_dir

    def pixels_to_points(self, pixels: np.ndarray, depth: np.ndarray):
        """Back-projects pixels to 3D points using depth along the optical axis."""
        rays_through_pixels = self.pixels_to_rays(pixels)
        cosa = np.matmul(rays_through_pixels, self.optical_axis)
        points = (
            rays_through_pixels * depth[..., np.newaxis] / cosa[..., np.newaxis] +
            self.position)
        return points

    def points_to_local_points(self, points: np.ndarray):
        """Transforms world-space points into the camera's local frame."""
        translated_points = points - self.position
        local_points = (np.matmul(self.orientation, translated_points.T)).T
        return local_points

    def project(self, points: np.ndarray):
        """Projects a 3D point (x,y,z) to a pixel position (x,y)."""
        batch_shape = points.shape[:-1]
        points = points.reshape((-1, 3))
        local_points = self.points_to_local_points(points)

        # Get normalized local pixel positions.
        x = local_points[..., 0] / local_points[..., 2]
        y = local_points[..., 1] / local_points[..., 2]
        r2 = x**2 + y**2

        # Apply radial distortion.
        distortion = 1.0 + r2 * (
            self.radial_distortion[0] + r2 *
            (self.radial_distortion[1] + self.radial_distortion[2] * r2))

        # Apply tangential distortion.
        x_times_y = x * y
        x = (
            x * distortion + 2.0 * self.tangential_distortion[0] * x_times_y +
            self.tangential_distortion[1] * (r2 + 2.0 * x**2))
        y = (
            y * distortion + 2.0 * self.tangential_distortion[1] * x_times_y +
            self.tangential_distortion[0] * (r2 + 2.0 * y**2))

        # Map the distorted ray to the image plane and return the depth.
        pixel_x = self.focal_length * x + self.skew * y + self.principal_point_x
        pixel_y = (self.focal_length * self.pixel_aspect_ratio * y +
                   self.principal_point_y)

        pixels = np.stack([pixel_x, pixel_y], axis=-1)
        return pixels.reshape((*batch_shape, 2))

    def get_pixel_centers(self):
        """Returns the pixel centers."""
        xx, yy = np.meshgrid(np.arange(self.image_size_x, dtype=self.dtype),
                             np.arange(self.image_size_y, dtype=self.dtype))
        return np.stack([xx, yy], axis=-1) + 0.5

    def scale(self, scale: float):
        """Scales the camera.

        Returns a new camera whose focal length, principal point and image
        size are multiplied by ``scale`` (image size is rounded).

        Raises:
          ValueError: If ``scale`` is not positive.
        """
        if scale <= 0:
            raise ValueError('scale needs to be positive.')

        new_camera = Camera(
            orientation=self.orientation.copy(),
            position=self.position.copy(),
            focal_length=self.focal_length * scale,
            principal_point=self.principal_point.copy() * scale,
            skew=self.skew,
            pixel_aspect_ratio=self.pixel_aspect_ratio,
            radial_distortion=self.radial_distortion.copy(),
            tangential_distortion=self.tangential_distortion.copy(),
            image_size=np.array((int(round(self.image_size[0] * scale)),
                                 int(round(self.image_size[1] * scale)))),
            # Keep the source precision; previously the scaled camera fell
            # back to the float32 default.
            dtype=self.dtype,
        )
        return new_camera

    def look_at(self, position, look_at, up, eps=1e-6):
        """Creates a copy of the camera which looks at a given point.

        Returns a new camera positioned at `position` while looking at
        `look_at`. Camera intrinsics are copied by this method. A common
        value for `up` is (0, 1, 0).

        Args:
          position: A (3,) numpy array representing the position of the
            camera.
          look_at: A (3,) numpy array representing the location the camera
            looks at.
          up: A (3,) numpy array representing the up direction, whose
            projection is parallel to the y-axis of the image plane.
          eps: a small number to prevent divides by zero.

        Returns:
          A new camera that is copied from the original but is positioned and
          looks at the provided coordinates.

        Raises:
          ValueError: If the camera position and look at position are very
            close to each other or if the up-vector is parallel to the
            requested optical axis.
        """

        look_at_camera = self.copy()
        optical_axis = np.asarray(look_at) - np.asarray(position)
        norm = np.linalg.norm(optical_axis)
        if norm < eps:
            raise ValueError('The camera center and look at position are too close.')
        # Out-of-place division: in-place `/=` fails on integer arrays.
        optical_axis = optical_axis / norm

        right_vector = np.cross(optical_axis, up)
        norm = np.linalg.norm(right_vector)
        if norm < eps:
            raise ValueError('The up-vector is parallel to the optical axis.')
        right_vector = right_vector / norm

        # The three directions here are orthogonal to each other and form a
        # right handed coordinate system.
        camera_rotation = np.identity(3)
        camera_rotation[0, :] = right_vector
        camera_rotation[1, :] = np.cross(optical_axis, right_vector)
        camera_rotation[2, :] = optical_axis

        look_at_camera.position = position
        look_at_camera.orientation = camera_rotation
        return look_at_camera

    def crop_image_domain(
        self, left: int = 0, right: int = 0, top: int = 0, bottom: int = 0):
        """Returns a copy of the camera with adjusted image bounds.

        Args:
          left: number of pixels by which to reduce (or augment, if negative)
            the image domain at the associated boundary.
          right: likewise.
          top: likewise.
          bottom: likewise.

        The crop parameters may not cause the camera image domain dimensions
        to become non-positive.

        Returns:
          A camera with adjusted image dimensions. The focal length is
          unchanged, and the principal point is updated to preserve the
          original principal axis.

        Raises:
          ValueError: If the crop leaves a non-positive image dimension.
        """

        crop_left_top = np.array([left, top])
        crop_right_bottom = np.array([right, bottom])
        new_resolution = self.image_size - crop_left_top - crop_right_bottom
        new_principal_point = self.principal_point - crop_left_top
        if np.any(new_resolution <= 0):
            raise ValueError('Crop would result in non-positive image dimensions.')

        new_camera = self.copy()
        new_camera.image_size = np.array([int(new_resolution[0]),
                                          int(new_resolution[1])])
        new_camera.principal_point = np.array([new_principal_point[0],
                                               new_principal_point[1]])
        return new_camera

    def copy(self):
        """Returns a deep copy of this camera."""
        return copy.deepcopy(self)
def mse2psnr(x):
    """Convert a mean-squared-error tensor to PSNR: ``-10 * log10(x)``."""
    return -10. * torch.log10(x)


def to8b(x):
    """Clip ``x`` to [0, 1], scale by 255 and cast to ``np.uint8``."""
    return (255*np.clip(x,0,1)).astype(np.uint8)


def getProjectionMatrix(znear, zfar, fovX, fovY):
    """Return a 4x4 perspective projection matrix as a float32 torch tensor.

    Args:
        znear: Near clipping distance.
        zfar: Far clipping distance. Must differ from ``znear``; the depth
            terms divide by ``zfar - znear``.
        fovX: Horizontal field of view, in radians.
        fovY: Vertical field of view, in radians.

    Returns:
        ``torch.Tensor`` of shape (4, 4). Entries that are not assigned
        below stay zero.
    """
    tanHalfFovY = math.tan((fovY / 2))
    tanHalfFovX = math.tan((fovX / 2))

    z_sign = 1.0

    P = torch.zeros(4, 4)
    P[0, 0] = 1 / tanHalfFovX
    P[1, 1] = 1 / tanHalfFovY
    P[3, 2] = z_sign
    P[2, 2] = z_sign * zfar / (zfar - znear)
    P[2, 3] = -(zfar * znear) / (zfar - znear)
    return P


class MiniCam:
    """Minimal camera record: image size, FoV, clip planes and transforms.

    Attributes set by the constructor:
        image_width, image_height: stored verbatim from ``width``/``height``.
        FoVy, FoVx, znear, zfar: stored verbatim.
        world_view_transform: transposed, sign-rectified inverse of ``c2w``.
        projection_matrix: transposed result of ``getProjectionMatrix``.
        full_proj_transform: ``world_view_transform @ projection_matrix``.
        camera_center: negated translation column of ``c2w``.
    """

    def __init__(self, c2w, width, height, fovy, fovx, znear, zfar):
        """Build the camera from a 4x4 camera-to-world matrix.

        Args:
            c2w: 4x4 camera-to-world pose (array-like). The original author
                notes it should be in NeRF convention.
            width: Value stored as ``image_width``.
            height: Value stored as ``image_height``.
            fovy: Vertical field of view in radians.
            fovx: Horizontal field of view in radians.
            znear: Near clipping distance.
            zfar: Far clipping distance.
        """
        self.image_width = width
        self.image_height = height
        self.FoVy = fovy
        self.FoVx = fovx
        self.znear = znear
        self.zfar = zfar

        c2w = np.asarray(c2w)

        # Invert in the input precision, then cast to float32: the projection
        # matrix is float32 and ``@`` does not promote mixed dtypes, so a
        # float64 pose used to raise a dtype-mismatch error below.
        w2c = np.linalg.inv(c2w).astype(np.float32)

        # rectify: flip the sign of rotation rows 1-2 and of the translation.
        w2c[1:3, :3] *= -1
        w2c[:3, 3] *= -1

        self.world_view_transform = torch.tensor(w2c).transpose(0, 1)
        self.projection_matrix = getProjectionMatrix(
            znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
        ).transpose(0, 1)
        self.full_proj_transform = self.world_view_transform @ self.projection_matrix
        self.camera_center = -torch.tensor(c2w[:3, 3], dtype=torch.float32)
+ + self.image_width = width + self.image_height = height + self.FoVy = fovy + self.FoVx = fovx + self.znear = znear + self.zfar = zfar + + w2c = np.linalg.inv(c2w) + + # rectify... + w2c[1:3, :3] *= -1 + w2c[:3, 3] *= -1 + + self.world_view_transform = torch.tensor(w2c).transpose(0, 1)#.cuda() + self.projection_matrix = ( + getProjectionMatrix( + znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy + ) + .transpose(0, 1) + # .cuda() + ) + self.full_proj_transform = self.world_view_transform @ self.projection_matrix + self.camera_center = -torch.tensor(c2w[:3, 3])#.cuda() + + +class FourDGSdataset(Dataset): + def __init__( + self, + split, + frame_num = 16, + name='panda', + rife=False, + static=False, + ): + self.split = split + # self.args = args + + # https://github.com/threestudio-project/threestudio/blob/main/configs/magic123-coarse-sd.yaml#L22 + self.radius = 2.5 + self.W = 512 + self.H = 512 + self.fovy = np.deg2rad(40) + self.fovx = np.deg2rad(40) + # self.fovy = np.deg2rad(49.1) + # self.fovx = np.deg2rad(49.1) + # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61 + self.near = 0.01 + self.far = 100 + self.T = ToTensor() + self.len_pose0 = frame_num + self.name=name + self.rife=rife + self.static=static + + # load t=0 sequences + dir=f'data/{self.name}_static_rgba/' + #dir = 'data/panda_static_rgba/' # generated from new.png + t0_im_names = [dir + str(x) + '_rgba.png' for x in range(1, 101)] + # t0_im_names = glob.glob(dir + '/*.png') + self.t0_im_list = [] + # TODO: should images be RGBA when input?? 
+ for fname in t0_im_names: + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + # use RGBA + ww = self.T(im) + assert ww.shape[0] == 4 + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + self.t0_im_list.append(ww) + self.t0_im_list = torch.stack(self.t0_im_list, dim=0)#.permute(0, 2, 3, 1) + + print(f"T0 images loaded {self.t0_im_list.shape}.") + + # load pose0 (canonical pose) frames + # dir = 'data/panda_im/' + # pose0_im_names = [dir + x for x in ['new.png', '1.png', '2.png', '3.png']] + #dir = 'data/panda_rgba_pose0/' + dir=f'data/{self.name}_rgba_pose0/' + if self.rife==False: + if frame_num==4: + if self.name=='panda': + frame_list=[0,12,14,15] + # elif self.name=='rose': + # frame_list=[0,6,13,22] + else: + frame_list=range(frame_num) + pose0_im_names = [dir + f'{x}.png' for x in frame_list] + # pose0_im_names = [dir + f'frame_{x}_rgba.png' for x in frame_list] + else: + if self.name=='astronaut': + frame_list= [0] + list(range(12, 27)) + elif self.name=='kitten': + frame_list= [0] + list(range(16, 23))+ list(range(24, 32)) + else: + frame_list=range(frame_num) + pose0_im_names = [dir + f'{x}.png' for x in frame_list] + #pose0_im_names = [dir + f'frame_{x}_rgba.png' for x in range(frame_num)] + else: + + dir=f'data/{self.name}_rife/' + frame_list=range(frame_num) + pose0_im_names = [dir + f'img{x}.png' for x in frame_list] + + if self.static: + dir=f'data/{self.name}_rgba_pose0/' + frame_list=range(frame_num) + pose0_im_names = [dir + f'{0}.png' for _ in frame_list] + + + # pose0_im_names = pose0_im_names[:2] + # pose0_im_names = glob.glob(dir + '/*.png') + self.pose0_im_list = [] + # TODO: should images be RGBA when input?? 
+ for fname in pose0_im_names: + im = Image.open(fname).resize((self.W, self.H))#.convert('RGB') + ww = self.T(im) + ww[:3] = ww[:3] * ww[-1:] + (1 - ww[-1:]) + self.pose0_im_list.append(ww) + # self.pose0_im_list.append(self.T(im)) + while len(self.pose0_im_list) < self.len_pose0: + self.pose0_im_list.append(ww) + self.pose0_im_list = torch.stack(self.pose0_im_list, dim=0)#.permute(0, 2, 3, 1) + # self.pose0_im_list = self.pose0_im_list.expand(16, 3, 256, 256) + print(f"Pose0 images loaded {self.pose0_im_list.shape}") + self.max_frames = self.pose0_im_list.shape[0] + print(f"Loaded SDS Dataset. Max {self.max_frames} frames.") + + self.t0_num = self.t0_im_list.shape[0] + self.pose0_num = self.pose0_im_list.shape[0] + self.len_ = (self.t0_num) * (self.pose0_num) + + pose0_pose = orbit_camera(0, 0, self.radius) + self.pose0_cam = MiniCam( + pose0_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + # we sample (pose, t) + def __getitem__(self, index): + if self.split == 'train': + t0_idx = index // self.pose0_num + pose0_idx = index % self.pose0_num + time = torch.tensor([pose0_idx]).unsqueeze(0)#.expand(1, self.W * self.H) + else: + t0_idx = index # self.t0_num // 2 + pose0_idx = 1 + time = torch.tensor([pose0_idx]).unsqueeze(0) + + # return Camera(R=R,T=T,FoVx=FovX,FoVy=FovY,image=image,gt_alpha_mask=None, + # image_name=f"{index}",uid=index,data_device=torch.device("cuda"),time=time) + out = { + # timestamp is per pixel + "time": time / self.pose0_num, + 't0': self.t0_im_list[t0_idx], + 'pose0': self.pose0_im_list[pose0_idx], + # 't0': self.t0_im_list[t0_idx].view(-1, 3), + # 'pose0': self.pose0_im_list[pose0_idx].view(-1, 3), + # 'bg_color': torch.ones((1, 3), dtype=torch.float32), + "pose0_cam": self.pose0_cam, + } + + t0_pose = orbit_camera(0, (t0_idx / self.t0_num) * 360, self.radius) + ver = 0 + hor = (t0_idx / self.t0_num) * 360 + # ver = np.random.randint(-45, 45) + # hor = np.random.randint(-180, 
180) + pose = orbit_camera(0 + ver, hor, self.radius) + out['hor'] = hor + out['ver'] = ver + + cur_cam = MiniCam( + pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + t0_cam = MiniCam( + t0_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + ) + out['cur_cam'] = cur_cam + out['t0_cam'] = t0_cam + + # rand_seq = [t0_cam] + # start from cur_cam, generate 6 sets of offsets + # rand_seq = [cur_cam] + # ver_offset = [np.random.randint(-10, 10) for i in range(self.pose0_num - 1)] + # hor_offset = [np.random.randint(-10, 10) for i in range(self.pose0_num - 1)] + # ver_offset = np.cumsum(ver_offset) + ver + # hor_offset = np.cumsum(hor_offset) + hor + # ver_offset = np.clip(ver_offset, -15, 45) + + rand_seq = [] + ver_list = [] + hor_list = [] + # for i in range(self.pose0_num - 1): + for i in range(self.pose0_num): + ver = np.random.randint(-30, 30) + hor = np.random.randint(-180, 180) + cur_pose = orbit_camera(ver, hor, self.radius) + ver_list.append(ver) + hor_list.append(hor) + # cur_pose = orbit_camera(ver_offset[i], hor_offset[i], self.radius) + rand_seq.append(MiniCam( + cur_pose if self.split == 'train' else pose, + # cur_pose, + self.H, # NOTE: order might be wrong + self.W, + self.fovy, + self.fovx, + self.near, + self.far, + )) + out['rand_poses'] = rand_seq + out['rand_ver'] = np.array(ver_list) + out['rand_hor'] = np.array(hor_list) + # out['rand_ver'] = ver_offset + # out['rand_hor'] = hor_offset + + return out + + def __len__(self): + # we sample (pose, t) + if self.split == 'train': + return self.len_ + if self.split == 'test': + return self.pose0_num + # return self.t0_num + if self.split == 'video': + return 100 diff --git a/train_comp.py b/train_comp.py new file mode 100644 index 0000000..a4fc56a --- /dev/null +++ b/train_comp.py @@ -0,0 +1,525 @@ +import numpy as np +import random +import os +import torch + +from random import randint 
+from utils.loss_utils import l1_loss, ssim, l2_loss, lpips_loss +from gaussian_renderer.comp_renderer import render as render_comp +from gaussian_renderer import render as render_single +import sys +from scene.comp_scene import Scene +from scene.gaussian_model_nogrid import GaussianModel_nogrid as GaussianModel +from utils.general_utils import safe_state +import uuid +from tqdm import tqdm +from utils.image_utils import psnr +from argparse import ArgumentParser, Namespace +from arguments import ModelParams, PipelineParams, OptimizationParams, ModelHiddenParams +from torch.utils.data import DataLoader +from utils.timer import Timer +from importlib import import_module + +# import lpips +import gc +from torchvision import transforms as T +from utils.scene_utils import render_training_image +from time import time +to8b = lambda x : (255*np.clip(x.cpu().numpy(),0,1)).astype(np.uint8) + +try: + from torch.utils.tensorboard import SummaryWriter + TENSORBOARD_FOUND = True +except ImportError: + TENSORBOARD_FOUND = False + +# from guidance.zero123_utils import Zero123 +# from guidance.zeroscope_utils_hifa import ZeroScope +# from guidance.zeroscope_utils import ZeroScope +# from guidance.mvdream_utils import MVDream +from guidance.sd_utils import StableDiffusion + +from PIL import Image +from torchvision.transforms import ToTensor +from plyfile import PlyData +from scipy.spatial.transform import Rotation as R + +def prepare_offset(rotation, translation): + def func(pts): + return (torch.from_numpy(rotation).float().cuda().detach() @ pts.permute(1, 0)).permute(1, 0) + torch.from_numpy(translation).float().cuda().detach() + return func + +def find_rotation_matrix(v1, v2): + """ + Find the rotation matrix that aligns v1 to v2. + + Parameters: + - v1: The initial vector. + - v2: The target vector. + + Returns: + - The rotation matrix that rotates v1 to align with v2. 
+ """ + # Normalize the target vector + if np.linalg.norm(v2) > 1e-3: + v2_normalized = v2 / np.linalg.norm(v2) + else: + v2_normalized = v2 + + # Axis of rotation (cross product of v1 and v2) + axis = np.cross(v1, v2_normalized) + + if np.linalg.norm(axis) < 1e-6: + if np.dot(v1, v2) >= 0: + # The vectors are parallel, no rotation needed + rotation_matrix = np.eye(3) + else: + # The vectors are anti-parallel, rotate 180 degrees around any orthogonal axis + rotation_matrix = R.from_euler('x', 180, degrees=True).as_matrix() + + else: + # Angle of rotation + angle = np.arccos(np.dot(v1, v2_normalized)) + + # Handle the case where the rotation is undefined because the vectors are parallel/anti-parallel + + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + + # Rodrigues' rotation formula components + K = np.array([[0, -axis[2], axis[1]], + [axis[2], 0, -axis[0]], + [-axis[1], axis[0], 0]]) + I = np.identity(3) + + # Rotation matrix + rotation_matrix = I + np.sin(angle) * K + (1 - np.cos(angle)) * np.dot(K, K) + + return rotation_matrix # [3, 3] + +def get_rotation(prev_pos, next_pos): + new_vec = next_pos - prev_pos + canonical = np.array([1, 0, 0]) + # canonical = np.array([0, 0, 1]) + return find_rotation_matrix(canonical, new_vec) + +# Constants +g = 9.81 # acceleration due to gravity, m/s^2 + +# Initial horizontal velocity calculation +vx = 2 # m/s, to cover 4 meters in 2 seconds + +# To calculate the initial vertical velocity, we use the equation of motion at the peak (1 second into the jump) +# At the peak, vertical velocity (v) = 0, acceleration (a) = -g, time (t) = 1 sec +# We rearrange the equation v = u + at to find u: u = v - at +vy_initial = 0 - (-g) * 1 # Initial vertical velocity + + # return np.array((x, z, y)) + # return np.array((x, y, z)) + +def query_trajectory(generate_coordinates, t0, fps, frame_num): + # get_location = lambda t: np.array((R * np.sin(2 * np.pi * t * rot_speed), 0, R * np.cos(2 * np.pi * t * rot_speed))) + 
translation_list = [generate_coordinates(t0 + i * fps) for i in range(frame_num)] + return translation_list + +def scene_reconstruction(dataset, opt, hyper, pipe, testing_iterations, saving_iterations, + checkpoint_iterations, checkpoint, debug_from, + gaussians, scene, stage, tb_writer, train_iter,timer, args): + first_iter = 0 + + torch.cuda.empty_cache() + gc.collect() + print(f'Start training of stage {stage}: ') + obj_prompts = [] + if opt.video_sds_type == 'zeroscope': + from guidance.zeroscope_utils import ZeroScope + zeroscope = ZeroScope('cuda', fp16=True) + emb_zs = zeroscope.get_text_embeds([opt.prompt]) + for ww in opt.obj_prompt: + obj_prompts.append(zeroscope.get_text_embeds([ww])) + else: + from VideoCrafter.scripts.evaluation.videocrafter2_utils import VideoCrafter2 + from omegaconf import OmegaConf + vc_model_config = OmegaConf.load('VideoCrafter/configs/inference_t2v_512_v2.0.yaml').pop("model", OmegaConf.create()) + vc2 = VideoCrafter2(vc_model_config, ckpt_path='model.ckpt', weights_dtype=torch.float16, device='cuda') + emb_zs = vc2.model.get_learned_conditioning([opt.prompt]) + neg_emb_zs = vc2.model.get_learned_conditioning(["text, watermark, copyright, blurry, nsfw"]) + cond = {"c_crossattn": [emb_zs], "fps": torch.tensor([6]*emb_zs.shape[0]).to(vc2.model.device).long()} + un_cond = {"c_crossattn": [neg_emb_zs], "fps": torch.tensor([6]*emb_zs.shape[0]).to(vc2.model.device).long()} + + for ww in opt.obj_prompt: + emb_zs = vc2.model.get_learned_conditioning([ww]) + obj_prompts.append({"c_crossattn": [emb_zs], "fps": torch.tensor([6]*emb_zs.shape[0]).to(vc2.model.device).long()}) + + sd = StableDiffusion('cuda', fp16=True, sd_version='2.1') + sd.get_text_embeds([opt.prompt], negative_prompts=['static statue, text, watermark, copyright, blurry, nsfw']) + sd.get_objects_text_embeds(opt.obj_prompt, negative_prompts=['static statue, text, watermark, copyright, blurry, nsfw']) + + stage_ = ['fine'] + train_iter_ = [opt.iterations] + white_bg = 
torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda", requires_grad=False) + black_bg = torch.tensor([0, 0, 0], dtype=torch.float32, device="cuda", requires_grad=False) + + for cur_stage, train_iter in zip(stage_, train_iter_): + for gs in gaussians: + gs.training_setup(opt) + if checkpoint: + (model_params, first_iter) = torch.load(checkpoint) + for gs in gaussians: + gs.restore(model_params, opt) + iter_start = torch.cuda.Event(enable_timing = True) + iter_end = torch.cuda.Event(enable_timing = True) + viewpoint_stack = None + ema_loss_for_log = 0.0 + + final_iter = train_iter + + progress_bar = tqdm(range(first_iter, final_iter), desc=f"[{args.expname}] Training progress") + offset_list = [] + for gs in gaussians: + offset_list.append(lambda x:x) + + func_name = opt.func_name + p, m = func_name.rsplit('.', 1) + mod = import_module(p) + generate_coordinates = getattr(mod, m) + + translation_list = query_trajectory(generate_coordinates, 0, 1 / 16, 16 + 1) + rotation_list = [get_rotation(translation_list[i], translation_list[i + 1]) for i in range(len(translation_list) - 1)] + func = [prepare_offset(rotation_list[i], translation_list[i]) for i in range(len(rotation_list))] + + for iteration in range(first_iter, final_iter+1): + stage = cur_stage + loss_weight = 1 + if np.random.random() < 0.5: + background = white_bg + else: + background = black_bg + + iter_start.record() + for gs in gaussians: + gs.update_learning_rate(iteration) + if not viewpoint_stack: + viewpoint_stack = scene.getTrainCameras() + viewpoint_stack_loader = DataLoader(viewpoint_stack, batch_size=1,shuffle=True,num_workers=4,collate_fn=list) + frame_num = viewpoint_stack.pose0_num + + loader = iter(viewpoint_stack_loader) + if True: + try: + data = next(loader) + except StopIteration: + print("reset dataloader") + batch_size = 1 + loader = iter(viewpoint_stack_loader) + if (iteration - 1) == debug_from: + pipe.debug = True + images = [] + radii_list = [] + visibility_filter_list = [] + 
viewspace_point_tensor_list = [] + dx = [] + out_pts = [] + viewpoint_cam = data[0]['rand_poses'] + fps = 1 / frame_num + t0 = 0 + sds_idx_list = range(frame_num) + + if np.random.random() < 0.8: + use_comp = True + else: + use_comp = False + for i in sds_idx_list: + time = torch.tensor([t0 + i * fps]).unsqueeze(0).float() + offset_list[-1] = func[i] + if use_comp: + render_pkg = render_comp(viewpoint_cam[0], gaussians, pipe, background, stage=stage, time=time, offset=offset_list, scales_list=opt.scales, pre_scale=opt.pre_scale) + else: + # render individual object + gs_idx = random.choice(range(len(gaussians))) + render_pkg = render_single(viewpoint_cam[0], (gaussians[gs_idx]), pipe, background, stage=stage, time=time, offset=offset_list[gs_idx], scales_preset=opt.scales[gs_idx], pre_scale=opt.pre_scale) + + image, viewspace_point_tensor, visibility_filter, radii = render_pkg["render"], render_pkg["viewspace_points"], render_pkg["visibility_filter"], render_pkg["radii"] + fg_mask = render_pkg['alpha'] + rgba = torch.cat([image, fg_mask], dim=0) + images.append(rgba.unsqueeze(0)) + if 'dx' in render_pkg: + dx.append(render_pkg['dx']) + radii_list.append(radii.unsqueeze(0)) + visibility_filter_list.append(visibility_filter.unsqueeze(0)) + viewspace_point_tensor_list.append(viewspace_point_tensor) + radii = torch.cat(radii_list,0).max(dim=0).values + visibility_filter = torch.cat(visibility_filter_list).any(dim=0) + image_tensor = torch.cat(images,0) + # print('output', image_tensor.shape) # B, C, H, W + if len(out_pts): + out_pts = torch.stack(out_pts, 0) + + if use_comp: + if opt.video_sds_type == 'zeroscope': + loss = zeroscope.train_step(image_tensor[:, :3], emb_zs) + else: + loss = vc2.train_step(image_tensor[:, :3].unsqueeze(0).permute(0, 2, 1, 3, 4), cond, un_cond, cfg=opt.cfg, cfg_temporal=opt.cfg_temporal, as_latent=False) + + # img loss for comp renderings + randints = list(range(16)) + np.random.shuffle(randints) + img_loss = 
sd.train_step(image_tensor[randints[0]:randints[0]+1, :3], background=background) + sd.train_step(image_tensor[randints[1]:randints[1]+1, :3], background=background) \ + + sd.train_step(image_tensor[randints[2]:randints[2]+1, :3], background=background) + sd.train_step(image_tensor[randints[3]:randints[3]+1, :3], background=background) + print(f"origin loss is {loss}, image_loss with weight {opt.image_weight} is {img_loss * opt.image_weight}") + loss = img_loss * opt.image_weight + loss * loss_weight + + if opt.with_reg: + dx_nn_loss = [] + for cur_dx in dx: + tot = cur_dx.shape[0] + dx_nn_loss.append(gaussians[0].get_nn_loss(cur_dx[:tot//2])) + dx_nn_loss.append(gaussians[1].get_nn_loss(cur_dx[tot//2:])) + + # values inside the list are already mean-ed + loss_nn = torch.stack(dx_nn_loss).sum() + tb_writer.add_scalar(f'{stage}/dx_nn_comp', loss_nn.item(), iteration) + print(f'in comp loss_nn with weight {opt.nn_weight} is {loss_nn * opt.nn_weight}') + loss += loss_nn * opt.nn_weight + else: + # print(len(obj_prompts), gs_idx) + if opt.video_sds_type == 'zeroscope': + loss = zeroscope.train_step(image_tensor[:, :3], obj_prompts[gs_idx]) + else: + loss = vc2.train_step(image_tensor[:, :3].unsqueeze(0).permute(0, 2, 1, 3, 4), obj_prompts[gs_idx], un_cond, cfg=opt.cfg, cfg_temporal=opt.cfg_temporal, as_latent=False) + + randints = list(range(16)) + np.random.shuffle(randints) + img_loss = sd.train_step(image_tensor[randints[0]:randints[0]+1, :3], background=background, obj_id=gs_idx) + sd.train_step(image_tensor[randints[1]:randints[1]+1, :3], background=background, obj_id=gs_idx) \ + + sd.train_step(image_tensor[randints[2]:randints[2]+1, :3], background=background, obj_id=gs_idx) + sd.train_step(image_tensor[randints[3]:randints[3]+1, :3], background=background, obj_id=gs_idx) + print(f"origin loss is {loss}, image_loss with weight {opt.image_weight} is {img_loss * opt.image_weight}") + loss = img_loss * opt.image_weight + loss * loss_weight + + if opt.with_reg: + 
dx_nn_loss = [] + for cur_dx in dx: + dx_nn_loss.append(gaussians[gs_idx].get_nn_loss(cur_dx)) + loss_nn = torch.stack(dx_nn_loss).sum() + tb_writer.add_scalar(f'{stage}/dx_nn_sep', loss_nn.item(), iteration) + print(f'in seperate loss_nn with weight {opt.nn_weight} is {loss_nn * opt.nn_weight}') + loss += loss_nn * opt.nn_weight + + if stage == 'fine': + if (not use_comp) and gs_idx == 0: + loss_dx0 = torch.stack(dx).mean().abs() + tb_writer.add_scalar(f'{stage}/loss_dx0_mean', loss_dx0.item(), iteration) + loss_dx0 = torch.stack(dx).abs().sum() + loss += loss_dx0 * opt.loss_dx_weight + tb_writer.add_scalar(f'{stage}/loss_dx-first', loss_dx0.item(), iteration) + else: + loss_dx0 = torch.stack(dx) + loss_dx0 = loss_dx0[:, :int(gaussians[0]._xyz.shape[0])] + loss_dx0 = torch.stack(dx).abs().sum() + loss += loss_dx0 * opt.loss_dx_weight + tb_writer.add_scalar(f'{stage}/loss_dx-first', loss_dx0.item(), iteration) + + if stage == "fine" and hyper.time_smoothness_weight != 0: + tv_loss = torch.sum([gs.compute_regulation(hyper.time_smoothness_weight, hyper.plane_tv_weight, hyper.l1_time_planes) for gs in gaussians]) + loss += tv_loss + tb_writer.add_scalar(f'{stage}/loss_tv', tv_loss.item(), iteration) + loss.backward() + viewspace_point_tensor_grad = torch.zeros_like(viewspace_point_tensor) + for idx in range(0, len(viewspace_point_tensor_list)): + if viewspace_point_tensor_list[idx].grad is not None: + viewspace_point_tensor_grad = viewspace_point_tensor_grad + viewspace_point_tensor_list[idx].grad + iter_end.record() + with torch.no_grad(): + ema_loss_for_log = 0.4 * loss.item() + 0.6 * ema_loss_for_log + + total_point = sum([gs._xyz.shape[0] for gs in gaussians]) + if iteration % 10 == 0: + progress_bar.set_postfix({"Loss": f"{ema_loss_for_log:.{7}f}", + "point":f"{total_point}"}) + progress_bar.update(10) + if iteration == opt.iterations: + progress_bar.close() + timer.pause() + training_report(tb_writer, iteration, loss, l1_loss, iter_start.elapsed_time(iter_end), 
testing_iterations, scene, render_comp, pipe, background, stage, func, scales=opt.scales) + if (iteration in saving_iterations): + print("\n[ITER {}] Saving Gaussians".format(iteration)) + scene.save(iteration, stage) + timer.start() + + if iteration < opt.iterations: + for gs in gaussians: + gs.optimizer.step() + gs.optimizer.zero_grad(set_to_none = True) + +def training(dataset, hyper, opt, pipe, testing_iterations, saving_iterations, checkpoint_iterations, checkpoint, debug_from, expname, args): + tb_writer = prepare_output_and_logger(expname) + gaussians = [GaussianModel(dataset.sh_degree, hyper) for __ in dataset.cloud_path] # init one GS model for each ply (object) + dataset.model_path = args.model_path + timer = Timer() + scene = Scene(dataset, gaussians,load_coarse=None) + timer.start() + scene_reconstruction(dataset, opt, hyper, pipe, testing_iterations, saving_iterations, + checkpoint_iterations, checkpoint, debug_from, + gaussians, scene, "coarse", tb_writer, opt.coarse_iterations,timer, args) + +from datetime import datetime + +def prepare_output_and_logger(expname): + if not args.model_path: + unique_str = str(datetime.today().strftime('%Y-%m-%d')) + '/' + expname + '_' + datetime.today().strftime('%H:%M:%S') + args.model_path = os.path.join("./output/", unique_str) + print("Output folder: {}".format(args.model_path)) + os.makedirs(args.model_path, exist_ok = True) + with open(os.path.join(args.model_path, "cfg_args"), 'w') as cfg_log_f: + cfg_log_f.write(str(Namespace(**vars(args)))) + tb_writer = None + if TENSORBOARD_FOUND: + tb_writer = SummaryWriter(args.model_path) + else: + print("Tensorboard not available: not logging progress") + return tb_writer + +def training_report(tb_writer, iteration, loss, l1_loss, elapsed, testing_iterations, scene : Scene, renderFunc, pipe, bg, stage, func, scales): + if tb_writer: + # tb_writer.add_scalar(f'{stage}/train_loss_patches/l1_loss', Ll1.item(), iteration) + 
tb_writer.add_scalar(f'{stage}/train_loss_patchestotal_loss', loss.item(), iteration) + tb_writer.add_scalar(f'{stage}/iter_time', elapsed, iteration) + ww = iteration if stage == 'static' else iteration + offset_list = [] + for gs in scene.gaussians: + offset_list.append(lambda x:x) + + if iteration % 100 == 0 and ww in testing_iterations: + # if stage == 'fine': + # if ww in testing_iterations: + torch.cuda.empty_cache() + train_set = scene.getTrainCameras() + validation_configs = [{'name': 'train', 'cameras' : [train_set[idx % len(train_set)] for idx in range(10, 5000, 299)]}] + for config in validation_configs: + if config['cameras'] and len(config['cameras']) > 0: + l1_test = 0.0 + psnr_test = 0.0 + ti = (torch.tensor([0]).unsqueeze(0)) + cam_li = config['cameras'][0]['rand_poses'] + im_li = [] + num = len(cam_li) + for tii in range(num): + offset_list[-1] = func[tii] + if stage == 'static': + ti = (torch.tensor([tii * 0]).unsqueeze(0).cuda()) + else: + ti = (torch.tensor([tii / num]).unsqueeze(0).cuda()) + viewpoint = cam_li[tii] + image = torch.clamp(renderFunc(viewpoint, scene.gaussians,stage=stage, pipe=pipe, bg_color=bg, time=ti, offset=offset_list, scales_list=scales)["render"], 0.0, 1.0) + im_li.append(image) + ww = len(im_li) // 2 + r1 = torch.cat(im_li[:ww], dim=-1) + r2 = torch.cat(im_li[ww:], dim=-1) + im_li = torch.cat([r1, r2], dim=-2) + if tb_writer: + tb_writer.add_image(f"rand_seq/{stage}", im_li, global_step=iteration) + l1_test = 0.0 + psnr_test = 0.0 + ti = (torch.tensor([0]).unsqueeze(0)) + cam_li = config['cameras'][0]['rand_poses'] + im_li = [] + num = len(cam_li) + for tii in range(num): + offset_list[-1] = func[tii] + if stage == 'static': + ti = (torch.tensor([tii * 0]).unsqueeze(0).cuda()) + else: + ti = (torch.tensor([tii / num]).unsqueeze(0).cuda()) + viewpoint = cam_li[0] + image = torch.clamp(renderFunc(viewpoint, scene.gaussians,stage=stage, pipe=pipe, bg_color=bg, time=ti, offset=offset_list, scales_list=scales)["render"], 0.0, 
1.0) + im_li.append(image) + ww = len(im_li) // 2 + r1 = torch.cat(im_li[:ww], dim=-1) + r2 = torch.cat(im_li[ww:], dim=-1) + im_li = torch.cat([r1, r2], dim=-2) + if tb_writer: + tb_writer.add_image(f"static_seq/{stage}", im_li, global_step=iteration) + print("\n[ITER {}] Evaluating {}".format(iteration, config['name'])) + if tb_writer: + tb_writer.add_scalar(f'{stage}/total_points', scene.get_total_points(), iteration) + torch.cuda.empty_cache() +def setup_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True +if __name__ == "__main__": + torch.cuda.empty_cache() + parser = ArgumentParser(description="Training script parameters") + setup_seed(6666) + lp = ModelParams(parser) + op = OptimizationParams(parser) + pp = PipelineParams(parser) + hp = ModelHiddenParams(parser) + parser.add_argument('--ip', type=str, default="127.0.0.1") + parser.add_argument('--port', type=int, default=6009) + parser.add_argument('--debug_from', type=int, default=-1) + parser.add_argument('--detect_anomaly', action='store_true', default=False) + parser.add_argument("--test_iterations", nargs="+", type=int, default=[i*50 for i in range(0,300)]) + parser.add_argument("--save_iterations", nargs="+", type=int, default=[2500, 3000, 3500, 4000, 4500, 5000, 7000, 8000, 9000, 14000, 20000, 30_000,45000,60000]) + parser.add_argument("--quiet", action="store_true") + parser.add_argument("--checkpoint_iterations", nargs="+", type=int, default=[]) + parser.add_argument("--start_checkpoint", type=str, default = None) + parser.add_argument('-e', "--expname", type=str, default = "") + parser.add_argument("--configs", type=str, default = "arguments/comp.py") + parser.add_argument("--yyypath", type=str, default = "") + parser.add_argument("--t0_frame0_rate", type=float, default = 1) + parser.add_argument("--name_override", type=str, default="") + parser.add_argument("--sds_ratio_override", 
type=float, default=-1) + parser.add_argument("--sds_weight_override", type=float, default=-1) + parser.add_argument("--iteration", default=-1, type=int) + parser.add_argument('--image_weight_override', type=float, default=-1) + parser.add_argument('--nn_weight_override', type=float, default=-1) + parser.add_argument('--cfg_override', type=float, default=-1) + parser.add_argument('--cfg_temporal_override', type=float, default=-1) + parser.add_argument('--loss_dx_weight_override', type=float, default=-1) + parser.add_argument('--with_reg_override', action='store_true', default=False) + + args = parser.parse_args(sys.argv[1:]) + args.save_iterations.append(args.iterations - 1) + if args.configs: + # import mmcv + import mmengine + from utils.params_utils import merge_hparams + # config = mmcv.Config.fromfile(args.configs) + config = mmengine.Config.fromfile(args.configs) + args = merge_hparams(args, config) + if args.name_override != '': + args.name = args.name_override + if args.sds_ratio_override != -1: + args.fine_rand_rate = args.sds_ratio_override + if args.sds_weight_override != -1: + args.lambda_zero123 = args.sds_weight_override + if args.image_weight_override != -1: + args.image_weight = args.image_weight_override + if args.nn_weight_override != -1: + args.nn_weight = args.nn_weight_override + if args.cfg_override != -1: + args.cfg = args.cfg_override + if args.cfg_temporal_override != -1: + args.cfg_temporal = args.cfg_temporal_override + if args.loss_dx_weight_override != -1: + args.loss_dx_weight = args.loss_dx_weight_override + if args.with_reg_override: + args.with_reg = args.with_reg_override + + # print(args.name) + print("Optimizing " + args.model_path) + safe_state(args.quiet) + torch.autograd.set_detect_anomaly(args.detect_anomaly) + timer1 = Timer() + timer1.start() + print('Configs: ', args) + training(lp.extract(args), hp.extract(args), op.extract(args), pp.extract(args), args.test_iterations, args.save_iterations, args.checkpoint_iterations, 
args.start_checkpoint, args.debug_from, args.expname, args) + print("\nTraining complete.") + print('training time:',timer1.get_elapsed_time()) + from render_comp import render_sets + + render_sets(lp.extract(args), hp.extract(args), op.extract(args), args.iterations, pp.extract(args), skip_train=True, skip_test=True, skip_video=False, multiview_video=True) + print("\Rendering complete.") + \ No newline at end of file diff --git a/train_comp.sh b/train_comp.sh new file mode 100644 index 0000000..48a0d04 --- /dev/null +++ b/train_comp.sh @@ -0,0 +1,4 @@ +# python train_comp.py --configs arguments/comp_butterfly_flower_zs.py -e butterflyflower_zs --image_weight_override 0.02 --nn_weight 1000 --with_reg --cfg_override 100.0 --loss_dx_weight_override 0.005 +# python train_comp.py --configs arguments/comp_butterfly_flower_vc.py -e butterflyflower_vc --image_weight_override 0.05 --nn_weight 1000 --with_reg --cfg_override 20.0 --loss_dx_weight_override 0.005 +# python train_comp.py --configs arguments/comp_fish_rock_zs.py -e fishrock_zs --image_weight_override 0.02 --nn_weight 1000 --with_reg --cfg_override 100.0 --loss_dx_weight_override 0.01 +# python train_comp.py --configs arguments/comp_fish_rock_vc.py -e fishrock_vc --image_weight_override 0.05 --nn_weight 1000 --with_reg --cfg_override 20.0 --loss_dx_weight_override 0.01 diff --git a/traj_funcs/butterfly_flower.py b/traj_funcs/butterfly_flower.py new file mode 100644 index 0000000..2c372ba --- /dev/null +++ b/traj_funcs/butterfly_flower.py @@ -0,0 +1,6 @@ +import numpy as np + +def generate_coordinates(timestep): # [0, 1] + x = -2 + timestep*1.7 + y = 0.6 - timestep*0.4 + return np.array([x, y, 0]) # x y z \ No newline at end of file diff --git a/traj_funcs/fish_rock.py b/traj_funcs/fish_rock.py new file mode 100644 index 0000000..c827f29 --- /dev/null +++ b/traj_funcs/fish_rock.py @@ -0,0 +1,6 @@ +import numpy as np + +def generate_coordinates(t): # [0, 1] + z = 0.8 * np.sin(t*np.pi/2) + x = -1.5 * 
np.cos(t*np.pi/2) + return np.array([x, 0, z]) # x y z \ No newline at end of file diff --git a/utils/__pycache__/camera_utils.cpython-37.pyc b/utils/__pycache__/camera_utils.cpython-37.pyc new file mode 100644 index 0000000..ac5d892 Binary files /dev/null and b/utils/__pycache__/camera_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/general_utils.cpython-37.pyc b/utils/__pycache__/general_utils.cpython-37.pyc new file mode 100644 index 0000000..faa3432 Binary files /dev/null and b/utils/__pycache__/general_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/graphics_utils.cpython-37.pyc b/utils/__pycache__/graphics_utils.cpython-37.pyc new file mode 100644 index 0000000..8519c77 Binary files /dev/null and b/utils/__pycache__/graphics_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/image_utils.cpython-37.pyc b/utils/__pycache__/image_utils.cpython-37.pyc new file mode 100644 index 0000000..640e824 Binary files /dev/null and b/utils/__pycache__/image_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/loss_utils.cpython-37.pyc b/utils/__pycache__/loss_utils.cpython-37.pyc new file mode 100644 index 0000000..02f6171 Binary files /dev/null and b/utils/__pycache__/loss_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/params_utils.cpython-37.pyc b/utils/__pycache__/params_utils.cpython-37.pyc new file mode 100644 index 0000000..aec7f7a Binary files /dev/null and b/utils/__pycache__/params_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/scene_utils.cpython-37.pyc b/utils/__pycache__/scene_utils.cpython-37.pyc new file mode 100644 index 0000000..de24c23 Binary files /dev/null and b/utils/__pycache__/scene_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/sh_utils.cpython-37.pyc b/utils/__pycache__/sh_utils.cpython-37.pyc new file mode 100644 index 0000000..14ba87d Binary files /dev/null and b/utils/__pycache__/sh_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/system_utils.cpython-37.pyc 
b/utils/__pycache__/system_utils.cpython-37.pyc new file mode 100644 index 0000000..53f2c56 Binary files /dev/null and b/utils/__pycache__/system_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/timer.cpython-37.pyc b/utils/__pycache__/timer.cpython-37.pyc new file mode 100644 index 0000000..561c601 Binary files /dev/null and b/utils/__pycache__/timer.cpython-37.pyc differ diff --git a/utils/__pycache__/utils.cpython-37.pyc b/utils/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000..fb5162a Binary files /dev/null and b/utils/__pycache__/utils.cpython-37.pyc differ diff --git a/utils/camera_utils.py b/utils/camera_utils.py new file mode 100644 index 0000000..4a23c1e --- /dev/null +++ b/utils/camera_utils.py @@ -0,0 +1,65 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# + +from scene.cameras import Camera +import numpy as np +from utils.general_utils import PILtoTorch +from utils.graphics_utils import fov2focal + +WARNED = False + +def loadCam(args, id, cam_info, resolution_scale): + + + # resized_image_rgb = PILtoTorch(cam_info.image, resolution) + + # gt_image = resized_image_rgb[:3, ...] + # loaded_mask = None + + # if resized_image_rgb.shape[1] == 4: + # loaded_mask = resized_image_rgb[3:4, ...] 
+ + return Camera(colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T, + FoVx=cam_info.FovX, FoVy=cam_info.FovY, + image=cam_info.image, gt_alpha_mask=None, + image_name=cam_info.image_name, uid=id, data_device=args.data_device, + time = cam_info.time, +) + +def cameraList_from_camInfos(cam_infos, resolution_scale, args): + camera_list = [] + + for id, c in enumerate(cam_infos): + camera_list.append(loadCam(args, id, c, resolution_scale)) + + return camera_list + +def camera_to_JSON(id, camera : Camera): + Rt = np.zeros((4, 4)) + Rt[:3, :3] = camera.R.transpose() + Rt[:3, 3] = camera.T + Rt[3, 3] = 1.0 + + W2C = np.linalg.inv(Rt) + pos = W2C[:3, 3] + rot = W2C[:3, :3] + serializable_array_2d = [x.tolist() for x in rot] + camera_entry = { + 'id' : id, + 'img_name' : camera.image_name, + 'width' : camera.width, + 'height' : camera.height, + 'position': pos.tolist(), + 'rotation': serializable_array_2d, + 'fy' : fov2focal(camera.FovY, camera.height), + 'fx' : fov2focal(camera.FovX, camera.width) + } + return camera_entry diff --git a/utils/general_utils.py b/utils/general_utils.py new file mode 100644 index 0000000..e6a8a81 --- /dev/null +++ b/utils/general_utils.py @@ -0,0 +1,136 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. 
+# +# For inquiries contact george.drettakis@inria.fr +# + +import torch +import sys +from datetime import datetime +import numpy as np +import random + +def inverse_sigmoid(x): + return torch.log(x/(1-x)) + +def PILtoTorch(pil_image, resolution): + if resolution is not None: + resized_image_PIL = pil_image.resize(resolution) + else: + resized_image_PIL = pil_image + resized_image = torch.from_numpy(np.array(resized_image_PIL)) / 255.0 + if len(resized_image.shape) == 3: + return resized_image.permute(2, 0, 1) + else: + return resized_image.unsqueeze(dim=-1).permute(2, 0, 1) + +def get_expon_lr_func( + lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000 +): + """ + Copied from Plenoxels + + Continuous learning rate decay function. Adapted from JaxNeRF + The returned rate is lr_init when step=0 and lr_final when step=max_steps, and + is log-linearly interpolated elsewhere (equivalent to exponential decay). + If lr_delay_steps>0 then the learning rate will be scaled by some smooth + function of lr_delay_mult, such that the initial learning rate is + lr_init*lr_delay_mult at the beginning of optimization but will be eased back + to the normal learning rate when steps>lr_delay_steps. + :param conf: config subtree 'lr' or similar + :param max_steps: int, the number of steps during optimization. + :return HoF which takes step as input + """ + + def helper(step): + if step < 0 or (lr_init == 0.0 and lr_final == 0.0): + # Disable this parameter + return 0.0 + if lr_delay_steps > 0: + # A kind of reverse cosine decay. 
+ delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin( + 0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1) + ) + else: + delay_rate = 1.0 + t = np.clip(step / max_steps, 0, 1) + log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t) + return delay_rate * log_lerp + + return helper + +def strip_lowerdiag(L): + uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda") + + uncertainty[:, 0] = L[:, 0, 0] + uncertainty[:, 1] = L[:, 0, 1] + uncertainty[:, 2] = L[:, 0, 2] + uncertainty[:, 3] = L[:, 1, 1] + uncertainty[:, 4] = L[:, 1, 2] + uncertainty[:, 5] = L[:, 2, 2] + return uncertainty + +def strip_symmetric(sym): + return strip_lowerdiag(sym) + +def build_rotation(r): + norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3]) + + q = r / norm[:, None] + + R = torch.zeros((q.size(0), 3, 3), device='cuda') + + r = q[:, 0] + x = q[:, 1] + y = q[:, 2] + z = q[:, 3] + + R[:, 0, 0] = 1 - 2 * (y*y + z*z) + R[:, 0, 1] = 2 * (x*y - r*z) + R[:, 0, 2] = 2 * (x*z + r*y) + R[:, 1, 0] = 2 * (x*y + r*z) + R[:, 1, 1] = 1 - 2 * (x*x + z*z) + R[:, 1, 2] = 2 * (y*z - r*x) + R[:, 2, 0] = 2 * (x*z - r*y) + R[:, 2, 1] = 2 * (y*z + r*x) + R[:, 2, 2] = 1 - 2 * (x*x + y*y) + return R + +def build_scaling_rotation(s, r): + L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda") + R = build_rotation(r) + + L[:,0,0] = s[:,0] + L[:,1,1] = s[:,1] + L[:,2,2] = s[:,2] + + L = R @ L + return L + +def safe_state(silent): + old_f = sys.stdout + class F: + def __init__(self, silent): + self.silent = silent + + def write(self, x): + if not self.silent: + if x.endswith("\n"): + old_f.write(x.replace("\n", " [{}]\n".format(str(datetime.now().strftime("%d/%m %H:%M:%S"))))) + else: + old_f.write(x) + + def flush(self): + old_f.flush() + + sys.stdout = F(silent) + + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.set_device(torch.device("cuda:0")) diff --git a/utils/graphics_utils.py b/utils/graphics_utils.py 
new file mode 100644 index 0000000..b4627d8 --- /dev/null +++ b/utils/graphics_utils.py @@ -0,0 +1,77 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# + +import torch +import math +import numpy as np +from typing import NamedTuple + +class BasicPointCloud(NamedTuple): + points : np.array + colors : np.array + normals : np.array + +def geom_transform_points(points, transf_matrix): + P, _ = points.shape + ones = torch.ones(P, 1, dtype=points.dtype, device=points.device) + points_hom = torch.cat([points, ones], dim=1) + points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0)) + + denom = points_out[..., 3:] + 0.0000001 + return (points_out[..., :3] / denom).squeeze(dim=0) + +def getWorld2View(R, t): + Rt = np.zeros((4, 4)) + Rt[:3, :3] = R.transpose() + Rt[:3, 3] = t + Rt[3, 3] = 1.0 + return np.float32(Rt) + +def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0): + Rt = np.zeros((4, 4)) + Rt[:3, :3] = R.transpose() + Rt[:3, 3] = t + Rt[3, 3] = 1.0 + + C2W = np.linalg.inv(Rt) + cam_center = C2W[:3, 3] + cam_center = (cam_center + translate) * scale + C2W[:3, 3] = cam_center + Rt = np.linalg.inv(C2W) + return np.float32(Rt) + +def getProjectionMatrix(znear, zfar, fovX, fovY): + tanHalfFovY = math.tan((fovY / 2)) + tanHalfFovX = math.tan((fovX / 2)) + + top = tanHalfFovY * znear + bottom = -top + right = tanHalfFovX * znear + left = -right + + P = torch.zeros(4, 4) + + z_sign = 1.0 + + P[0, 0] = 2.0 * znear / (right - left) + P[1, 1] = 2.0 * znear / (top - bottom) + P[0, 2] = (right + left) / (right - left) + P[1, 2] = (top + bottom) / (top - bottom) + P[3, 2] = z_sign + P[2, 2] = z_sign * zfar / (zfar - znear) + P[2, 3] = -(zfar * znear) / (zfar - znear) + return P + +def fov2focal(fov, pixels): + 
return pixels / (2 * math.tan(fov / 2)) + +def focal2fov(focal, pixels): + return 2*math.atan(pixels/(2*focal)) \ No newline at end of file diff --git a/utils/image_utils.py b/utils/image_utils.py new file mode 100644 index 0000000..b150699 --- /dev/null +++ b/utils/image_utils.py @@ -0,0 +1,19 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# + +import torch + +def mse(img1, img2): + return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) +@torch.no_grad() +def psnr(img1, img2): + mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) + return 20 * torch.log10(1.0 / torch.sqrt(mse)) diff --git a/utils/loss_utils.py b/utils/loss_utils.py new file mode 100644 index 0000000..6c1b773 --- /dev/null +++ b/utils/loss_utils.py @@ -0,0 +1,69 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. 
+# +# For inquiries contact george.drettakis@inria.fr +# + +import torch +import torch.nn.functional as F +from torch.autograd import Variable +from math import exp +import lpips +def lpips_loss(img1, img2, lpips_model): + a, b, _, __ = img2.shape + ww = img1[:a, :3] + loss = lpips_model(ww * 2 - 1,img2[:, :3] * 2 - 1) + return loss.mean() +def l1_loss(network_output, gt): + return torch.abs((network_output - gt)).mean() + +def l2_loss(network_output, gt): + return ((network_output - gt) ** 2).mean() + +def gaussian(window_size, sigma): + gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)]) + return gauss / gauss.sum() + +def create_window(window_size, channel): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) + window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) + return window + +def ssim(img1, img2, window_size=11, size_average=True): + channel = img1.size(-3) + window = create_window(window_size, channel) + + if img1.is_cuda: + window = window.cuda(img1.get_device()) + window = window.type_as(img1) + + return _ssim(img1, img2, window, window_size, channel, size_average) + +def _ssim(img1, img2, window, window_size, channel, size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq + sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq + sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 + + C1 = 0.01 ** 2 + C2 = 0.03 ** 2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + 
if size_average: + return ssim_map.mean() + else: + return ssim_map.mean(1).mean(1).mean(1) + diff --git a/utils/params_utils.py b/utils/params_utils.py new file mode 100644 index 0000000..6f2ea64 --- /dev/null +++ b/utils/params_utils.py @@ -0,0 +1,9 @@ +def merge_hparams(args, config): + params = ["OptimizationParams", "ModelHiddenParams", "ModelParams", "PipelineParams"] + for param in params: + if param in config.keys(): + for key, value in config[param].items(): + if hasattr(args, key): + setattr(args, key, value) + + return args \ No newline at end of file diff --git a/utils/scene_utils.py b/utils/scene_utils.py new file mode 100644 index 0000000..402e3c3 --- /dev/null +++ b/utils/scene_utils.py @@ -0,0 +1,97 @@ +import torch +import os +from PIL import Image, ImageDraw, ImageFont +from matplotlib import pyplot as plt +plt.rcParams['font.sans-serif'] = ['Times New Roman'] + +import numpy as np + +import copy +@torch.no_grad() +def render_training_image(scene, gaussians, viewpoints, render_func, pipe, background, stage, iteration, time_now): + def render(gaussians, viewpoint, path, scaling): + # scaling_copy = gaussians._scaling + render_pkg = render_func(viewpoint, gaussians, pipe, background, stage=stage) + label1 = f"stage:{stage},iter:{iteration}" + times = time_now/60 + if times < 1: + end = "min" + else: + end = "mins" + label2 = "time:%.2f" % times + end + image = render_pkg["render"] + depth = render_pkg["depth"] + image_np = image.permute(1, 2, 0).cpu().numpy() # 转换通道顺序为 (H, W, 3) + depth_np = depth.permute(1, 2, 0).cpu().numpy() + depth_np /= depth_np.max() + depth_np = np.repeat(depth_np, 3, axis=2) + image_np = np.concatenate((image_np, depth_np), axis=1) + image_with_labels = Image.fromarray((np.clip(image_np,0,1) * 255).astype('uint8')) # 转换为8位图像 + # 创建PIL图像对象的副本以绘制标签 + draw1 = ImageDraw.Draw(image_with_labels) + + # 选择字体和字体大小 + font = ImageFont.truetype('./utils/TIMES.TTF', size=40) # 请将路径替换为您选择的字体文件路径 + + # 选择文本颜色 + text_color = (255, 0, 0) # 
白色 + + # 选择标签的位置(左上角坐标) + label1_position = (10, 10) + label2_position = (image_with_labels.width - 100 - len(label2) * 10, 10) # 右上角坐标 + + # 在图像上添加标签 + draw1.text(label1_position, label1, fill=text_color, font=font) + draw1.text(label2_position, label2, fill=text_color, font=font) + + image_with_labels.save(path) + render_base_path = os.path.join(scene.model_path, f"{stage}_render") + point_cloud_path = os.path.join(render_base_path,"pointclouds") + image_path = os.path.join(render_base_path,"images") + if not os.path.exists(os.path.join(scene.model_path, f"{stage}_render")): + os.makedirs(render_base_path) + if not os.path.exists(point_cloud_path): + os.makedirs(point_cloud_path) + if not os.path.exists(image_path): + os.makedirs(image_path) + # image:3,800,800 + + # point_save_path = os.path.join(point_cloud_path,f"{iteration}.jpg") + for idx in range(len(viewpoints)): + image_save_path = os.path.join(image_path,f"{iteration}_{idx}.jpg") + # time = torch.tensor([idx]).unsqueeze(0) + # render(gaussians,viewpoints[idx]['pose0_cam'],image_save_path,scaling=1,time=time) + render(gaussians,viewpoints[idx]['t0_cam'],image_save_path,scaling = 1) + # render(gaussians,point_save_path,scaling = 0.1) + # 保存带有标签的图像 + + + + pc_mask = gaussians.get_opacity + pc_mask = pc_mask > 0.1 + xyz = gaussians.get_xyz.detach()[pc_mask.squeeze()].cpu().permute(1,0).numpy() + # visualize_and_save_point_cloud(xyz, viewpoint.R, viewpoint.T, point_save_path) + # 如果需要,您可以将PIL图像转换回PyTorch张量 + # return image + # image_with_labels_tensor = torch.tensor(image_with_labels, dtype=torch.float32).permute(2, 0, 1) / 255.0 +def visualize_and_save_point_cloud(point_cloud, R, T, filename): + # 创建3D散点图 + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + R = R.T + # 应用旋转和平移变换 + T = -R.dot(T) + transformed_point_cloud = np.dot(R, point_cloud) + T.reshape(-1, 1) + # pcd = o3d.geometry.PointCloud() + # pcd.points = o3d.utility.Vector3dVector(transformed_point_cloud.T) # 转置点云数据以匹配Open3D的格式 + # 
transformed_point_cloud[2,:] = -transformed_point_cloud[2,:] + # 可视化点云 + ax.scatter(transformed_point_cloud[0], transformed_point_cloud[1], transformed_point_cloud[2], c='g', marker='o') + ax.axis("off") + # ax.set_xlabel('X Label') + # ax.set_ylabel('Y Label') + # ax.set_zlabel('Z Label') + + # 保存渲染结果为图片 + plt.savefig(filename) + diff --git a/utils/sh_utils.py b/utils/sh_utils.py new file mode 100644 index 0000000..bbca7d1 --- /dev/null +++ b/utils/sh_utils.py @@ -0,0 +1,118 @@ +# Copyright 2021 The PlenOctree Authors. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +import torch + +C0 = 0.28209479177387814 +C1 = 0.4886025119029199 +C2 = [ + 1.0925484305920792, + -1.0925484305920792, + 0.31539156525252005, + -1.0925484305920792, + 0.5462742152960396 +] +C3 = [ + -0.5900435899266435, + 2.890611442640554, + -0.4570457994644658, + 0.3731763325901154, + -0.4570457994644658, + 1.445305721320277, + -0.5900435899266435 +] +C4 = [ + 2.5033429417967046, + -1.7701307697799304, + 0.9461746957575601, + -0.6690465435572892, + 0.10578554691520431, + -0.6690465435572892, + 0.47308734787878004, + -1.7701307697799304, + 0.6258357354491761, +] + + +def eval_sh(deg, sh, dirs): + """ + Evaluate spherical harmonics at unit directions + using hardcoded SH polynomials. + Works with torch/np/jnp. + ... Can be 0 or more batch dimensions. + Args: + deg: int SH deg. Currently, 0-3 supported + sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2] + dirs: jnp.ndarray unit directions [..., 3] + Returns: + [..., C] + """ + assert deg <= 4 and deg >= 0 + coeff = (deg + 1) ** 2 + assert sh.shape[-1] >= coeff + + result = C0 * sh[..., 0] + if deg > 0: + x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3] + result = (result - + C1 * y * sh[..., 1] + + C1 * z * sh[..., 2] - + C1 * x * sh[..., 3]) + + if deg > 1: + xx, yy, zz = x * x, y * y, z * z + xy, yz, xz = x * y, y * z, x * z + result = (result + + C2[0] * xy * sh[..., 4] + + C2[1] * yz * sh[..., 5] + + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] + + C2[3] * xz * sh[..., 7] + + C2[4] * (xx - yy) * sh[..., 8]) + + if deg > 2: + result = (result + + C3[0] * y * (3 * xx - yy) * sh[..., 9] + + C3[1] * xy * z * sh[..., 10] + + C3[2] * y * (4 * zz - xx - yy)* sh[..., 11] + + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] + + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] + + C3[5] * z * (xx - yy) * sh[..., 14] + + C3[6] * x * (xx - 3 * yy) * sh[..., 15]) + + if deg > 3: + result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] + + C4[1] * yz * (3 * xx - yy) * sh[..., 17] + + C4[2] * xy * (7 * zz - 1) * 
sh[..., 18] + + C4[3] * yz * (7 * zz - 3) * sh[..., 19] + + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] + + C4[5] * xz * (7 * zz - 3) * sh[..., 21] + + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] + + C4[7] * xz * (xx - 3 * yy) * sh[..., 23] + + C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24]) + return result + +def RGB2SH(rgb): + return (rgb - 0.5) / C0 + +def SH2RGB(sh): + return sh * C0 + 0.5 \ No newline at end of file diff --git a/utils/system_utils.py b/utils/system_utils.py new file mode 100644 index 0000000..a51329a --- /dev/null +++ b/utils/system_utils.py @@ -0,0 +1,28 @@ +# +# Copyright (C) 2023, Inria +# GRAPHDECO research group, https://team.inria.fr/graphdeco +# All rights reserved. +# +# This software is free for non-commercial, research and evaluation use +# under the terms of the LICENSE.md file. +# +# For inquiries contact george.drettakis@inria.fr +# + +from errno import EEXIST +from os import makedirs, path +import os + +def mkdir_p(folder_path): + # Creates a directory. 
equivalent to using mkdir -p on the command line + try: + makedirs(folder_path, exist_ok=True) + except OSError as exc: # Python >2.5 + if exc.errno == EEXIST and path.isdir(folder_path): + pass + else: + raise + +def searchForMaxIteration(folder): + saved_iters = [int(fname.split("_")[-1]) for fname in os.listdir(folder)] + return max(saved_iters) diff --git a/utils/timer.py b/utils/timer.py new file mode 100644 index 0000000..c01ff93 --- /dev/null +++ b/utils/timer.py @@ -0,0 +1,24 @@ +import time +class Timer: + def __init__(self): + self.start_time = None + self.elapsed = 0 + self.paused = False + + def start(self): + if self.start_time is None: + self.start_time = time.time() + elif self.paused: + self.start_time = time.time() - self.elapsed + self.paused = False + + def pause(self): + if not self.paused: + self.elapsed = time.time() - self.start_time + self.paused = True + + def get_elapsed_time(self): + if self.paused: + return self.elapsed + else: + return time.time() - self.start_time \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..c73b93e --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,77 @@ +import importlib +import numpy as np +import cv2 +import torch +import torch.distributed as dist + + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") + return total_params + + +def check_istarget(name, para_list): + """ + name: full name of source para + para_list: partial name of target para + """ + istarget=False + for para in para_list: + if para in name: + return True + return istarget + + +def instantiate_from_config(config): + if not "target" in config: + if config == '__is_first_stage__': + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return 
get_obj_from_str(config["target"])(**config.get("params", dict())) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def load_npz_from_dir(data_dir): + data = [np.load(os.path.join(data_dir, data_name))['arr_0'] for data_name in os.listdir(data_dir)] + data = np.concatenate(data, axis=0) + return data + + +def load_npz_from_paths(data_paths): + data = [np.load(data_path)['arr_0'] for data_path in data_paths] + data = np.concatenate(data, axis=0) + return data + + +def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None): + h, w = image.shape[:2] + if resize_short_edge is not None: + k = resize_short_edge / min(h, w) + else: + k = max_resolution / (h * w) + k = k**0.5 + h = int(np.round(h * k / 64)) * 64 + w = int(np.round(w * k / 64)) * 64 + image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4) + return image + + +def setup_dist(args): + if dist.is_initialized(): + return + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + 'nccl', + init_method='env://' + ) \ No newline at end of file