nud7ha9 commited on
Commit 942190e · verified · 1 Parent(s): dbf193b

Upload 73 files

Browse files
Files changed (showing 50 of 73)
  1. .gitattributes +14 -0
  2. LICENSE.txt +201 -0
  3. README.md +467 -0
  4. app.py +787 -0
  5. assets/efficiency.png +0 -0
  6. assets/logo.png +3 -0
  7. assets/logo2.jpeg +3 -0
  8. assets/none_quant_efficiency.png +3 -0
  9. assets/pipe.png +3 -0
  10. examples/multi/1/1.WAV +3 -0
  11. examples/multi/1/2.WAV +3 -0
  12. examples/multi/1/multi1.png +3 -0
  13. examples/multi/2/1.wav +3 -0
  14. examples/multi/2/multi2.png +3 -0
  15. examples/multi/3/1-man.WAV +3 -0
  16. examples/multi/3/1-woman.WAV +3 -0
  17. examples/multi/3/multi3.png +3 -0
  18. examples/multitalk_example_1.json +13 -0
  19. examples/multitalk_example_2.json +9 -0
  20. examples/multitalk_example_3.json +9 -0
  21. examples/multitalk_example_tts_1.json +11 -0
  22. examples/single/1.wav +3 -0
  23. examples/single/single1.png +3 -0
  24. examples/single_example_1.json +7 -0
  25. examples/single_example_tts_1.json +9 -0
  26. generate_multitalk.py +638 -0
  27. kokoro/__init__.py +23 -0
  28. kokoro/__main__.py +148 -0
  29. kokoro/custom_stft.py +197 -0
  30. kokoro/istftnet.py +421 -0
  31. kokoro/model.py +155 -0
  32. kokoro/modules.py +183 -0
  33. kokoro/pipeline.py +445 -0
  34. requirements.txt +18 -0
  35. src/audio_analysis/torch_utils.py +20 -0
  36. src/audio_analysis/wav2vec2.py +125 -0
  37. src/utils.py +60 -0
  38. src/vram_management/__init__.py +1 -0
  39. src/vram_management/layers.py +243 -0
  40. wan/__init__.py +6 -0
  41. wan/configs/__init__.py +58 -0
  42. wan/configs/shared_config.py +19 -0
  43. wan/configs/wan_i2v_14B.py +24 -0
  44. wan/configs/wan_multitalk_14B.py +36 -0
  45. wan/configs/wan_t2v_14B.py +29 -0
  46. wan/configs/wan_t2v_1_3B.py +29 -0
  47. wan/distributed/__init__.py +0 -0
  48. wan/distributed/fsdp.py +43 -0
  49. wan/distributed/xdit_context_parallel.py +550 -0
  50. wan/first_last_frame2video.py +377 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/logo.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/logo2.jpeg filter=lfs diff=lfs merge=lfs -text
38
+ assets/none_quant_efficiency.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/pipe.png filter=lfs diff=lfs merge=lfs -text
40
+ examples/multi/1/1.WAV filter=lfs diff=lfs merge=lfs -text
41
+ examples/multi/1/2.WAV filter=lfs diff=lfs merge=lfs -text
42
+ examples/multi/1/multi1.png filter=lfs diff=lfs merge=lfs -text
43
+ examples/multi/2/1.wav filter=lfs diff=lfs merge=lfs -text
44
+ examples/multi/2/multi2.png filter=lfs diff=lfs merge=lfs -text
45
+ examples/multi/3/1-man.WAV filter=lfs diff=lfs merge=lfs -text
46
+ examples/multi/3/1-woman.WAV filter=lfs diff=lfs merge=lfs -text
47
+ examples/multi/3/multi3.png filter=lfs diff=lfs merge=lfs -text
48
+ examples/single/1.wav filter=lfs diff=lfs merge=lfs -text
49
+ examples/single/single1.png filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,467 @@
1
+ <div align="center">
2
+
3
+ <p align="center">
4
+ <img src="assets/logo2.jpeg" alt="MultiTalk" width="240"/>
5
+ </p>
6
+
7
+ <h1>Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation</h1>
8
+
9
+
10
+ [Zhe Kong*](https://scholar.google.com/citations?user=4X3yLwsAAAAJ&hl=zh-CN) · [Feng Gao*](https://scholar.google.com/citations?user=lFkCeoYAAAAJ) ·[Yong Zhang](https://yzhang2016.github.io/)<sup>&#9993;</sup> · [Zhuoliang Kang](https://scholar.google.com/citations?user=W1ZXjMkAAAAJ&hl=en) · [Xiaoming Wei](https://scholar.google.com/citations?user=JXV5yrZxj5MC&hl=zh-CN) · [Xunliang Cai](https://openreview.net/profile?id=~Xunliang_Cai1)
11
+
12
+ [Guanying Chen](https://guanyingc.github.io/) · [Wenhan Luo](https://whluo.github.io/)<sup>&#9993;</sup>
13
+
14
+ <sup>*</sup>Equal Contribution
15
+ <sup>&#9993;</sup>Corresponding Authors
16
+
17
+
18
+ <a href='https://meigen-ai.github.io/multi-talk/'><img src='https://img.shields.io/badge/Project-Page-green'></a>
19
+ <a href='https://arxiv.org/abs/2505.22647'><img src='https://img.shields.io/badge/Technique-Report-red'></a>
20
+ <a href='https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
21
+ </div>
22
+
23
+ > **TL;DR:** MultiTalk is an audio-driven framework for multi-person conversational video generation. It enables the creation of videos featuring multi-person conversation 💬, singing 🎤, interaction control 👬, and cartoon characters 🙊.
24
+
25
+ <p align="center">
26
+ <img src="assets/pipe.png">
27
+ </p>
28
+
29
+ ## Video Demos
30
+
31
+ <table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
32
+ <tr>
33
+ <td>
34
+ <video src="https://github.com/user-attachments/assets/e55952e6-e1b2-44a5-9887-a89307a378da" width="320" controls loop></video>
35
+ </td>
36
+ <td>
37
+ <video src="https://github.com/user-attachments/assets/f0396c19-d459-42aa-9d78-34fdea10de18" width="320" controls loop></video>
38
+ </td>
39
+ <td>
40
+ <video src="https://github.com/user-attachments/assets/3576fd04-3e5f-4933-ac7b-1c4e6a601379" width="320" controls loop></video>
41
+ </td>
42
+ </tr>
43
+ <tr>
44
+ <td>
45
+ <video src="https://github.com/user-attachments/assets/5589056e-3202-442d-a62a-2cad7a7ecb19" width="320" controls loop></video>
46
+ </td>
47
+ <td>
48
+ <video src="https://github.com/user-attachments/assets/554bfbe7-0090-492c-94be-329f5e39e175" width="320" controls loop></video>
49
+ </td>
50
+ <td>
51
+ <video src="https://github.com/user-attachments/assets/9e961f35-9413-4846-a806-8186d54061da" width="320" controls loop></video>
52
+ </td>
53
+ </tr>
54
+ <tr>
55
+ <td>
56
+ <video src="https://github.com/user-attachments/assets/342595ab-cf75-4872-8182-f20fe8c95611" width="320" controls loop></video>
57
+ </td>
58
+ <td>
59
+ <video src="https://github.com/user-attachments/assets/6476f9f0-35e0-4484-91a4-8aa646aa994a" width="320" controls loop></video>
60
+ </td>
61
+ <td>
62
+ <video src="https://github.com/user-attachments/assets/d8fc8e94-0cba-4c25-9f3a-a8d7e0a785e1" width="320" controls loop></video>
63
+ </td>
64
+ </tr>
65
+ </table>
66
+
67
+
68
+
69
+
70
+
71
+ ## ✨ Key Features
72
+
73
+ We propose **MultiTalk**, a novel framework for audio-driven multi-person conversational video generation. Given a multi-stream audio input, a reference image, and a prompt, MultiTalk generates a video of interactions that follow the prompt, with lip motions consistent with the audio.
74
+
75
+ > - 💬 **Realistic Conversations** - Supports single- and multi-person generation
76
+ > - 👥 **Interactive Character Control** - Direct virtual humans via prompts
77
+ > - 🎤 **Generalization Performance** - Supports the generation of cartoon characters and singing
78
+ > - 📺 **Resolution Flexibility**: 480p & 720p output at arbitrary aspect ratios
79
+ > - ⏱️ **Long Video Generation**: Supports video generation of up to 15 seconds
80
+
81
+ ## 🔥 Latest News
82
+
83
+ * July 11, 2025: 🔥🔥 `MultiTalk` supports INT8 [quantization](https://github.com/huggingface/optimum-quanto) and [SageAttention2.2](https://github.com/thu-ml/SageAttention), and updates the CFG strategy (2 NFE per step) for FusionX LoRA.
84
+ * July 01, 2025: 🔥🔥 `MultiTalk` supports TTS audio input, [FusioniX](https://huggingface.co/vrgamedevgirl84/Wan14BT2VFusioniX/blob/main/FusionX_LoRa/Wan2.1_I2V_14B_FusionX_LoRA.safetensors) and [lightx2v](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors) LoRA acceleration (requires only 4~8 steps), and Gradio.
85
+ * June 14, 2025: 🔥🔥 We release `MultiTalk` with support for `multi-GPU inference`, `teacache acceleration`, `APG` and `low-VRAM inference` (enabling 480P video generation on a single RTX 4090). [APG](https://arxiv.org/abs/2410.02416) is used to alleviate color error accumulation in long video generation. TeaCache increases speed by approximately 2~3x.
86
+ * June 9, 2025: 🔥🔥 We release the [weights](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk) and inference code of **MultiTalk**
87
+ * May 29, 2025: We release the [Technique-Report](https://arxiv.org/abs/2505.22647) of **MultiTalk**
88
+ * May 29, 2025: We release the [project page](https://meigen-ai.github.io/multi-talk/) of **MultiTalk**
89
+
90
+ ## 🌐 Community Works
91
+ - [Wan2GP](https://github.com/deepbeepmeep/Wan2GP): thanks to [deepbeepmeep](https://github.com/deepbeepmeep) for providing [Wan2GP](https://github.com/deepbeepmeep/Wan2GP), which enables MultiTalk on very low-VRAM hardware (8 GB of VRAM) and combines it with the capabilities of Vace.
92
+ - [Replicate](https://replicate.com/zsxkib/multitalk): thanks to [zsxkib](https://github.com/zsxkib) for bringing MultiTalk to the Replicate platform; try it! Please refer to [cog-MultiTalk](https://github.com/zsxkib/cog-MultiTalk) for details.
93
+ - [Gradio Demo](https://github.com/MeiGen-AI/MultiTalk): thanks to [fffiloni](https://github.com/fffiloni) for developing this Gradio demo on Hugging Face. Please refer to the [issue](https://github.com/MeiGen-AI/MultiTalk/issues/39) for details.
94
+ - [ComfyUI](https://github.com/kijai/ComfyUI-WanVideoWrapper/tree/multitalk): thanks to [kijai](https://github.com/kijai) for integrating MultiTalk into ComfyUI-WanVideoWrapper. [Rudra](https://github.com/Rudra-ai-coder) found that MultiTalk can be combined with Wanx T2V and VACE; see the [issue](https://github.com/kijai/ComfyUI-WanVideoWrapper/issues/635).
95
+ - [Google Colab example](https://colab.research.google.com/drive/185OyRIpJDlpnRjhBRb7FnaRlq11BLZTa?usp=sharing), an example of inference on an A100, provided by [Braffolk](https://github.com/Braffolk).
96
+
97
+ ## 📑 Todo List
98
+
99
+ - [x] Release the technical report
100
+ - [x] Inference
101
+ - [x] Checkpoints
102
+ - [x] Multi-GPU Inference
103
+ - [ ] Inference acceleration
104
+ - [x] TeaCache
105
+ - [x] int8 quantization
106
+ - [ ] LCM distillation
107
+ - [ ] Sparse Attention
108
+ - [x] Run with very low VRAM
109
+ - [x] TTS integration
110
+ - [x] Gradio demo
111
+ - [ ] ComfyUI
112
+ - [ ] 1.3B model
113
+
114
+ ## Quick Start
115
+
116
+ ### 🛠️Installation
117
+
118
+ #### 1. Create a conda environment and install pytorch, xformers
119
+ ```
120
+ conda create -n multitalk python=3.10
121
+ conda activate multitalk
122
+ pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121
123
+ pip install -U xformers==0.0.28 --index-url https://download.pytorch.org/whl/cu121
124
+ ```
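+ > Note: the `cu121` wheels above assume a CUDA 12.1-compatible driver; if your setup differs, adjust the `--index-url` to match your CUDA version.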
125
+ #### 2. Flash-attn installation:
126
+ ```
127
+ pip install misaki[en]
128
+ pip install ninja
129
+ pip install psutil
130
+ pip install packaging
131
+ pip install flash_attn==2.7.4.post1
132
+ ```
133
+
134
+ #### 3. Other dependencies
135
+ ```
136
+ pip install -r requirements.txt
137
+ conda install -c conda-forge librosa
138
+ ```
139
+
140
+ #### 4. FFmpeg installation
141
+ ```
142
+ conda install -c conda-forge ffmpeg
143
+ ```
144
+ or
145
+ ```
146
+ sudo yum install ffmpeg ffmpeg-devel
147
+ ```
148
+
149
+ ### 🧱Model Preparation
150
+
151
+ #### 1. Model Download
152
+
153
+ | Models | Download Link | Notes |
154
+ | --------------|-------------------------------------------------------------------------------|-------------------------------|
155
+ | Wan2.1-I2V-14B-480P | 🤗 [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P) | Base model
156
+ | chinese-wav2vec2-base | 🤗 [Huggingface](https://huggingface.co/TencentGameMate/chinese-wav2vec2-base) | Audio encoder
157
+ | Kokoro-82M | 🤗 [Huggingface](https://huggingface.co/hexgrad/Kokoro-82M) | TTS weights
158
+ | MeiGen-MultiTalk | 🤗 [Huggingface](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk) | Our audio condition weights
159
+
160
+ Download models using huggingface-cli:
161
+ ``` sh
162
+ huggingface-cli download Wan-AI/Wan2.1-I2V-14B-480P --local-dir ./weights/Wan2.1-I2V-14B-480P
163
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base --local-dir ./weights/chinese-wav2vec2-base
164
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base model.safetensors --revision refs/pr/1 --local-dir ./weights/chinese-wav2vec2-base
165
+ huggingface-cli download hexgrad/Kokoro-82M --local-dir ./weights/Kokoro-82M
166
+ huggingface-cli download MeiGen-AI/MeiGen-MultiTalk --local-dir ./weights/MeiGen-MultiTalk
167
+ ```
168
+
169
+ #### 2. Link or Copy MultiTalk Model to Wan2.1-I2V-14B-480P Directory
170
+
171
+ Link through:
172
+ ```
173
+ mv weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json_old
174
+ sudo ln -s {Absolute path}/weights/MeiGen-MultiTalk/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/
175
+ sudo ln -s {Absolute path}/weights/MeiGen-MultiTalk/multitalk.safetensors weights/Wan2.1-I2V-14B-480P/
176
+ ```
177
+
178
+ Or, copy through:
179
+ ```
180
+ mv weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json_old
181
+ cp weights/MeiGen-MultiTalk/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/
182
+ cp weights/MeiGen-MultiTalk/multitalk.safetensors weights/Wan2.1-I2V-14B-480P/
183
+ ```
184
+ ### 🔑 Quick Inference
185
+
186
+ Our model is compatible with both 480P and 720P resolutions. The current code only supports 480P inference. 720P inference requires multiple GPUs, and we will provide an update soon.
187
+ > Some tips:
188
+ > - Lip synchronization accuracy: Audio CFG works optimally between 3 and 5. Increase the audio CFG value for better synchronization.
189
+ > - Video clip length: The model was trained on 81-frame videos at 25 FPS. For optimal prompt-following performance, generate clips of 81 frames. Generating up to 201 frames is possible, though longer clips may reduce prompt-following performance.
190
+ > - Long video generation: Audio CFG influences color-tone consistency across segments. Set this value to 3 to alleviate tonal variations.
191
+ > - Sampling steps: To generate a video quickly, you can reduce the sampling steps to as few as 10; this will not hurt lip-synchronization accuracy, but it does affect motion and visual quality. More sampling steps yield better video quality.
192
+ > - TeaCache acceleration: The optimal range for `--teacache_thresh` is between 0.2 and 0.5. Increasing this value improves acceleration further but may degrade the quality of the generated video.
193
+
194
+ #### Usage of MultiTalk
195
+ ```
196
+ --mode streaming: long video generation.
197
+ --mode clip: generate short video with one chunk.
198
+ --use_teacache: run with TeaCache.
199
+ --size multitalk-480: generate 480P video.
200
+ --size multitalk-720: generate 720P video.
201
+ --use_apg: run with APG.
202
+ --teacache_thresh: A coefficient used for TeaCache acceleration.
203
+ --sample_text_guide_scale: When not using LoRA, the optimal value is 5. After applying LoRA, the recommended value is 1.
204
+ --sample_audio_guide_scale: When not using LoRA, the optimal value is 4. After applying LoRA, the recommended value is 2.
205
+ ```
206
+
207
+ #### 1. Single-Person
208
+
209
+ ##### 1) Run with single GPU
210
+
211
+
212
+ ```
213
+ python generate_multitalk.py \
214
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
215
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
216
+ --input_json examples/single_example_1.json \
217
+ --sample_steps 40 \
218
+ --mode streaming \
219
+ --use_teacache \
220
+ --save_file single_long_exp
221
+ ```
222
+
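+
+ The meta file passed via `--input_json` mirrors the `input_data` dictionary assembled in `app.py` (a prompt, a conditioning image, and per-person conditioning audio). As a rough, hypothetical sketch only (field names inferred from `generate_video` in `app.py`; the JSON files under `examples/` are the authoritative reference), a single-person meta file could be produced like this:
+
+ ```python
+ # Hypothetical sketch: build a single-person meta file.
+ # Keys are inferred from the input_data dict in app.py's generate_video;
+ # the paths below are placeholders taken from the examples/ folder.
+ import json
+
+ meta = {
+     "prompt": "A woman is talking to the camera in a cozy room.",
+     "cond_image": "examples/single/single1.png",          # reference image
+     "cond_audio": {"person1": "examples/single/1.wav"},   # one entry per speaker
+ }
+
+ with open("my_single_example.json", "w") as f:
+     json.dump(meta, f, indent=2)
+ ```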
223
+ ##### 2) Run with very low VRAM
224
+
225
+ If you want to run with very low VRAM, set `--num_persistent_param_in_dit 0`:
226
+
227
+
228
+ ```
229
+ python generate_multitalk.py \
230
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
231
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
232
+ --input_json examples/single_example_1.json \
233
+ --sample_steps 40 \
234
+ --mode streaming \
235
+ --num_persistent_param_in_dit 0 \
236
+ --use_teacache \
237
+ --save_file single_long_lowvram_exp
238
+ ```
239
+
240
+ ##### 3) Multi-GPU inference
241
+
242
+ ```
243
+ GPU_NUM=8
244
+ torchrun --nproc_per_node=$GPU_NUM --standalone generate_multitalk.py \
245
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
246
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
247
+ --dit_fsdp --t5_fsdp \
248
+ --ulysses_size=$GPU_NUM \
249
+ --input_json examples/single_example_1.json \
250
+ --sample_steps 40 \
251
+ --mode streaming \
252
+ --use_teacache \
253
+ --save_file single_long_multigpu_exp
254
+ ```
255
+
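+ > Note: based on the checks in `app.py`, the product of `--ulysses_size` and `--ring_size` (default 1) must equal the number of launched processes, and the DiT attention head count must be divisible by `--ulysses_size`.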
256
+ ##### 4) Run with TTS
257
+ ```
258
+ python generate_multitalk.py \
259
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
260
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
261
+ --input_json examples/single_example_tts_1.json \
262
+ --sample_steps 40 \
263
+ --mode streaming \
264
+ --num_persistent_param_in_dit 0 \
265
+ --use_teacache \
266
+ --save_file single_long_lowvram_tts_exp \
267
+ --audio_mode tts
268
+ ```
269
+
270
+
271
+ #### 2. Multi-Person
272
+
273
+ ##### 1) Run with single GPU
274
+
275
+ ```
276
+ python generate_multitalk.py \
277
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
278
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
279
+ --input_json examples/multitalk_example_2.json \
280
+ --sample_steps 40 \
281
+ --mode streaming \
282
+ --use_teacache \
283
+ --save_file multi_long_exp
284
+ ```
285
+ ##### 2) Run with very low VRAM
286
+
287
+
288
+ ```
289
+ python generate_multitalk.py \
290
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
291
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
292
+ --input_json examples/multitalk_example_2.json \
293
+ --sample_steps 40 \
294
+ --mode streaming \
295
+ --num_persistent_param_in_dit 0 \
296
+ --use_teacache \
297
+ --save_file multi_long_lowvram_exp
298
+ ```
299
+
300
+ ##### 3) Multi-GPU inference
301
+
302
+ ```
303
+ GPU_NUM=8
304
+ torchrun --nproc_per_node=$GPU_NUM --standalone generate_multitalk.py \
305
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
306
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
307
+ --dit_fsdp --t5_fsdp --ulysses_size=$GPU_NUM \
308
+ --input_json examples/multitalk_example_2.json \
309
+ --sample_steps 40 \
310
+ --mode streaming --use_teacache \
311
+ --save_file multi_long_multigpu_exp
312
+ ```
313
+
314
+ ##### 4) Run with TTS
315
+
316
+ ```
317
+ python generate_multitalk.py \
318
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
319
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
320
+ --input_json examples/multitalk_example_tts_1.json \
321
+ --sample_steps 40 \
322
+ --mode streaming \
323
+ --num_persistent_param_in_dit 0 \
324
+ --use_teacache \
325
+ --save_file multi_long_lowvram_tts_exp \
326
+ --audio_mode tts
327
+ ```
328
+
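+ For multi-person TTS, the text in the meta file is expected to interleave speaker tags, e.g. `(s1) Hello, how are you? (s2) I am fine, thanks.` (format inferred from the tag-parsing regex in `process_tts_multi` in `app.py`).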
329
+
330
+ #### 3. Run with FusioniX and CausVid (requires only 4~8 steps)
331
+
332
+ [FusioniX](https://huggingface.co/vrgamedevgirl84/Wan14BT2VFusioniX/blob/main/FusionX_LoRa/Wan2.1_I2V_14B_FusionX_LoRA.safetensors) requires 8 steps and [lightx2v](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors) requires only 4 steps.
333
+
334
+ ```
335
+ python generate_multitalk.py \
336
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
337
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
338
+ --input_json examples/single_example_1.json \
339
+ --lora_dir weights/Wan2.1_I2V_14B_FusionX_LoRA.safetensors \
340
+ --lora_scale 1.0 \
341
+ --sample_text_guide_scale 1.0 \
342
+ --sample_audio_guide_scale 2.0 \
343
+ --sample_steps 8 \
344
+ --mode streaming \
345
+ --num_persistent_param_in_dit 0 \
346
+ --save_file single_long_lowvram_fusionx_exp \
347
+ --sample_shift 2
348
+ ```
349
+
350
+ or
351
+
352
+ ```
353
+ python generate_multitalk.py \
354
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
355
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
356
+ --input_json examples/multitalk_example_2.json \
357
+ --lora_dir weights/Wan2.1_I2V_14B_FusionX_LoRA.safetensors \
358
+ --lora_scale 1.0 \
359
+ --sample_text_guide_scale 1.0 \
360
+ --sample_audio_guide_scale 2.0 \
361
+ --sample_steps 8 \
362
+ --mode streaming \
363
+ --num_persistent_param_in_dit 0 \
364
+ --save_file multi_long_lowvram_fusionx_exp
365
+
366
+ ```
367
+
368
+ #### 4. Run with the quantized model (only supports single-GPU inference)
369
+
370
+ ```
371
+ python generate_multitalk.py \
372
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
373
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
374
+ --input_json examples/multitalk_example_2.json \
375
+ --sample_steps 40 \
376
+ --mode streaming \
377
+ --use_teacache \
378
+ --quant int8 \
379
+ --quant_dir weights/MeiGen-MultiTalk \
380
+ --num_persistent_param_in_dit 0 \
381
+ --save_file multi_long_lowvram_exp_quant
382
+ ```
383
+
384
+ Run with LoRA:
385
+
386
+ ```
387
+ python generate_multitalk.py \
388
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
389
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
390
+ --input_json examples/multitalk_example_1.json \
391
+ --quant int8 \
392
+ --quant_dir weights/MeiGen-MultiTalk \
393
+ --lora_dir weights/MeiGen-MultiTalk/quant_models/quant_model_int8_FusionX.safetensors \
394
+ --sample_text_guide_scale 1.0 \
395
+ --sample_audio_guide_scale 2.0 \
396
+ --sample_steps 8 \
397
+ --mode streaming \
398
+ --num_persistent_param_in_dit 0 \
399
+ --save_file multi_long_lowvram_fusionx_exp_quant \
400
+ --sample_shift 2
401
+ ```
402
+
403
+ #### 5. Run with Gradio
404
+
405
+
406
+
407
+ ```
408
+ python app.py \
409
+ --lora_dir weights/Wan2.1_I2V_14B_FusionX_LoRA.safetensors \
410
+ --lora_scale 1.0 \
411
+ --num_persistent_param_in_dit 0 \
412
+ --sample_shift 2
413
+ ```
414
+
415
+ or
416
+
417
+ ```
418
+ python app.py --num_persistent_param_in_dit 0
419
+ ```
420
+
421
+ or
422
+
423
+ ```
424
+ python app.py \
425
+ --quant int8 \
426
+ --quant_dir weights/MeiGen-MultiTalk \
427
+ --lora_dir weights/MeiGen-MultiTalk/quant_models/quant_model_int8_FusionX.safetensors \
428
+ --sample_shift 2 \
429
+ --num_persistent_param_in_dit 0
430
+ ```
431
+
432
+ ## 🚀Computational Efficiency
433
+
434
+ #### 1) Non-quantized model results
435
+
436
+ The results are evaluated on A100 GPUs for multi-person generation. Single-person generation uses less memory and provides faster inference.
437
+ <p align="center">
438
+ <img src="assets/efficiency.png">
439
+ </p>
440
+ TeaCache is capable of increasing speed by approximately 2~3x.
441
+
442
+ #### 2) Quantized model results
443
+
444
+ <p align="center">
445
+ <img src="assets/none_quant_efficiency.png">
446
+ </p>
447
+
448
+
449
+ ## 📚 Citation
450
+
451
+ If you find our work useful in your research, please consider citing:
452
+
453
+ ```
454
+ @article{kong2025let,
455
+ title={Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation},
456
+ author={Kong, Zhe and Gao, Feng and Zhang, Yong and Kang, Zhuoliang and Wei, Xiaoming and Cai, Xunliang and Chen, Guanying and Luo, Wenhan},
457
+ journal={arXiv preprint arXiv:2505.22647},
458
+ year={2025}
459
+ }
460
+ ```
461
+
462
+ ## 📜 License
463
+ The models in this repository are licensed under the Apache 2.0 License. We claim no rights over your generated contents,
464
+ granting you the freedom to use them while ensuring that your usage complies with the provisions of this license.
465
+ You are fully accountable for your use of the models, which must not involve sharing any content that violates applicable laws,
466
+ causes harm to individuals or groups, disseminates personal information intended for harm, spreads misinformation, or targets vulnerable populations.
467
+
app.py ADDED
@@ -0,0 +1,787 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import argparse
3
+ import logging
4
+ import os
5
+ os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
6
+ import sys
7
+ import json
8
+ import warnings
9
+ from datetime import datetime
10
+
11
+ import gradio as gr
12
+ warnings.filterwarnings('ignore')
13
+
14
+ import random
15
+
16
+ import torch
17
+ import torch.distributed as dist
18
+ from PIL import Image
19
+ import subprocess
20
+
21
+ import wan
22
+ from wan.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS
23
+ from wan.utils.utils import cache_image, cache_video, str2bool
24
+ from wan.utils.multitalk_utils import save_video_ffmpeg
25
+ from kokoro import KPipeline
26
+ from transformers import Wav2Vec2FeatureExtractor
27
+ from src.audio_analysis.wav2vec2 import Wav2Vec2Model
28
+
29
+ import librosa
30
+ import pyloudnorm as pyln
31
+ import numpy as np
32
+ from einops import rearrange
33
+ import soundfile as sf
34
+ import re
35
+
36
+ def _validate_args(args):
37
+ # Basic check
38
+ assert args.ckpt_dir is not None, "Please specify the checkpoint directory."
39
+ assert args.task in WAN_CONFIGS, f"Unsupported task: {args.task}"
40
+
41
+ # The default sampling steps are 40 for image-to-video tasks and 50 for text-to-video tasks.
42
+ if args.sample_steps is None:
43
+ args.sample_steps = 40
44
+
45
+ if args.sample_shift is None:
46
+ if args.size == 'multitalk-480':
47
+ args.sample_shift = 7
48
+ elif args.size == 'multitalk-720':
49
+ args.sample_shift = 11
50
+ else:
51
+ raise NotImplementedError(f'Unsupported size: {args.size}')
52
+
53
+ args.base_seed = args.base_seed if args.base_seed >= 0 else random.randint(
54
+ 0, 99999999)
55
+ # Size check
56
+ assert args.size in SUPPORTED_SIZES[
57
+ args.
58
+ task], f"Unsupport size {args.size} for task {args.task}, supported sizes are: {', '.join(SUPPORTED_SIZES[args.task])}"
59
+
60
+
61
+ def _parse_args():
62
+ parser = argparse.ArgumentParser(
63
+ description="Generate an image or video from a text prompt or image using Wan"
64
+ )
65
+ parser.add_argument(
66
+ "--task",
67
+ type=str,
68
+ default="multitalk-14B",
69
+ choices=list(WAN_CONFIGS.keys()),
70
+ help="The task to run.")
71
+ parser.add_argument(
72
+ "--size",
73
+ type=str,
74
+ default="multitalk-480",
75
+ choices=list(SIZE_CONFIGS.keys()),
76
+ help="The bucket size of the generated video. The aspect ratio of the output video will follow that of the input image."
77
+ )
78
+ parser.add_argument(
79
+ "--frame_num",
80
+ type=int,
81
+ default=81,
82
+ help="How many frames to generate in one clip. The number should be 4n+1."
83
+ )
84
+ parser.add_argument(
85
+ "--ckpt_dir",
86
+ type=str,
87
+ default='./weights/Wan2.1-I2V-14B-480P',
88
+ help="The path to the Wan checkpoint directory.")
89
+ parser.add_argument(
90
+ "--quant_dir",
91
+ type=str,
92
+ default=None,
93
+ help="The path to the Wan quant checkpoint directory.")
94
+ parser.add_argument(
95
+ "--wav2vec_dir",
96
+ type=str,
97
+ default='./weights/chinese-wav2vec2-base',
98
+ help="The path to the wav2vec checkpoint directory.")
99
+ parser.add_argument(
100
+ "--lora_dir",
101
+ type=str,
102
+ nargs='+',
103
+ default=None,
104
+ help="The path to the LoRA checkpoint directory.")
105
+ parser.add_argument(
106
+ "--lora_scale",
107
+ type=float,
108
+ nargs='+',
109
+ default=[1.2],
110
+ help="Controls how much to influence the outputs with the LoRA parameters. Accepts multiple float values."
111
+ )
112
+ parser.add_argument(
113
+ "--offload_model",
114
+ type=str2bool,
115
+ default=None,
116
+ help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage."
117
+ )
118
+ parser.add_argument(
119
+ "--ulysses_size",
120
+ type=int,
121
+ default=1,
122
+ help="The size of the ulysses parallelism in DiT.")
123
+ parser.add_argument(
124
+ "--ring_size",
125
+ type=int,
126
+ default=1,
127
+ help="The size of the ring attention parallelism in DiT.")
128
+ parser.add_argument(
129
+ "--t5_fsdp",
130
+ action="store_true",
131
+ default=False,
132
+ help="Whether to use FSDP for T5.")
133
+ parser.add_argument(
134
+ "--t5_cpu",
135
+ action="store_true",
136
+ default=False,
137
+ help="Whether to place T5 model on CPU.")
138
+ parser.add_argument(
139
+ "--dit_fsdp",
140
+ action="store_true",
141
+ default=False,
142
+ help="Whether to use FSDP for DiT.")
143
+ parser.add_argument(
144
+ "--save_file",
145
+ type=str,
146
+ default=None,
147
+ help="The file to save the generated image or video to.")
148
+ parser.add_argument(
149
+ "--audio_save_dir",
150
+ type=str,
151
+ default='save_audio/gradio',
152
+ help="The path to save the audio embedding.")
153
+ parser.add_argument(
154
+ "--base_seed",
155
+ type=int,
156
+ default=42,
157
+ help="The seed to use for generating the image or video.")
158
+ parser.add_argument(
159
+ "--input_json",
160
+ type=str,
161
+ default='examples.json',
162
+ help="[meta file] The condition path to generate the video.")
163
+ parser.add_argument(
164
+ "--motion_frame",
165
+ type=int,
166
+ default=25,
167
+ help="Driven frame length used in the mode of long video generation.")
168
+ parser.add_argument(
169
+ "--mode",
170
+ type=str,
171
+ default="streaming",
172
+ choices=['clip', 'streaming'],
173
+ help="clip: generate one video chunk, streaming: long video generation")
174
+ parser.add_argument(
175
+ "--sample_steps", type=int, default=None, help="The sampling steps.")
176
+ parser.add_argument(
177
+ "--sample_shift",
178
+ type=float,
179
+ default=None,
180
+ help="Sampling shift factor for flow matching schedulers.")
181
+ parser.add_argument(
182
+ "--sample_text_guide_scale",
183
+ type=float,
184
+ default=5.0,
185
+ help="Classifier free guidance scale for text control.")
186
+ parser.add_argument(
187
+ "--sample_audio_guide_scale",
188
+ type=float,
189
+ default=4.0,
190
+ help="Classifier free guidance scale for audio control.")
191
+ parser.add_argument(
192
+ "--num_persistent_param_in_dit",
193
+ type=int,
194
+ default=None,
195
+ required=False,
196
+ help="Maximum number of parameters kept persistent in GPU memory; use a smaller number to reduce the VRAM required.",
197
+ )
198
+ parser.add_argument(
199
+ "--use_teacache",
200
+ action="store_true",
201
+ default=False,
202
+ help="Enable teacache for video generation."
203
+ )
204
+ parser.add_argument(
205
+ "--teacache_thresh",
206
+ type=float,
207
+ default=0.2,
208
+ help="Threshold for teacache."
209
+ )
210
+ parser.add_argument(
211
+ "--use_apg",
212
+ action="store_true",
213
+ default=False,
214
+ help="Enable adaptive projected guidance for video generation (APG)."
215
+ )
216
+ parser.add_argument(
217
+ "--apg_momentum",
218
+ type=float,
219
+ default=-0.75,
220
+ help="Momentum used in adaptive projected guidance (APG)."
221
+ )
222
+ parser.add_argument(
223
+ "--apg_norm_threshold",
224
+ type=float,
225
+ default=55,
226
+ help="Norm threshold used in adaptive projected guidance (APG)."
227
+ )
228
+ parser.add_argument(
229
+ "--color_correction_strength",
230
+ type=float,
231
+ default=1.0,
232
+ help="strength for color correction [0.0 -- 1.0]."
233
+ )
234
+
235
+ parser.add_argument(
236
+ "--quant",
237
+ type=str,
238
+ default=None,
239
+ help="Quantization type, must be 'int8' or 'fp8'."
240
+ )
241
+ args = parser.parse_args()
242
+ _validate_args(args)
243
+ return args
244
+
245
+
246
+ def custom_init(device, wav2vec):
247
+ audio_encoder = Wav2Vec2Model.from_pretrained(wav2vec, local_files_only=True).to(device)
248
+ audio_encoder.feature_extractor._freeze_parameters()
249
+ wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec, local_files_only=True)
250
+ return wav2vec_feature_extractor, audio_encoder
251
+
252
+ def loudness_norm(audio_array, sr=16000, lufs=-23):
253
+ meter = pyln.Meter(sr)
254
+ loudness = meter.integrated_loudness(audio_array)
255
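+ # guard: near-silent input yields an extreme integrated loudness (approaching -inf), so skip normalization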
+ if abs(loudness) > 100:
256
+ return audio_array
257
+ normalized_audio = pyln.normalize.loudness(audio_array, loudness, lufs)
258
+ return normalized_audio
259
+
260
+ def audio_prepare_multi(left_path, right_path, audio_type, sample_rate=16000):
261
+ if not (left_path=='None' or right_path=='None'):
262
+ human_speech_array1 = audio_prepare_single(left_path)
263
+ human_speech_array2 = audio_prepare_single(right_path)
264
+ elif left_path=='None':
265
+ human_speech_array2 = audio_prepare_single(right_path)
266
+ human_speech_array1 = np.zeros(human_speech_array2.shape[0])
267
+ elif right_path=='None':
268
+ human_speech_array1 = audio_prepare_single(left_path)
269
+ human_speech_array2 = np.zeros(human_speech_array1.shape[0])
270
+
271
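+ # 'para' keeps the two speakers overlapping in time; 'add' plays speaker 1 first, then speaker 2 (each padded with silence)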
+ if audio_type=='para':
272
+ new_human_speech1 = human_speech_array1
273
+ new_human_speech2 = human_speech_array2
274
+ elif audio_type=='add':
275
+ new_human_speech1 = np.concatenate([human_speech_array1[: human_speech_array1.shape[0]], np.zeros(human_speech_array2.shape[0])])
276
+ new_human_speech2 = np.concatenate([np.zeros(human_speech_array1.shape[0]), human_speech_array2[:human_speech_array2.shape[0]]])
277
+ sum_human_speechs = new_human_speech1 + new_human_speech2
278
+ return new_human_speech1, new_human_speech2, sum_human_speechs
279
+
280
+ def _init_logging(rank):
281
+ # logging
282
+ if rank == 0:
283
+ # set format
284
+ logging.basicConfig(
285
+ level=logging.INFO,
286
+ format="[%(asctime)s] %(levelname)s: %(message)s",
287
+ handlers=[logging.StreamHandler(stream=sys.stdout)])
288
+ else:
289
+ logging.basicConfig(level=logging.ERROR)
290
+
291
+ def get_embedding(speech_array, wav2vec_feature_extractor, audio_encoder, sr=16000, device='cpu'):
292
+ audio_duration = len(speech_array) / sr
293
+ video_length = audio_duration * 25 # Assume the video fps is 25
294
+
295
+ # wav2vec_feature_extractor
296
+ audio_feature = np.squeeze(
297
+ wav2vec_feature_extractor(speech_array, sampling_rate=sr).input_values
298
+ )
299
+ audio_feature = torch.from_numpy(audio_feature).float().to(device=device)
300
+ audio_feature = audio_feature.unsqueeze(0)
301
+
302
+ # audio encoder
303
+ with torch.no_grad():
304
+ embeddings = audio_encoder(audio_feature, seq_len=int(video_length), output_hidden_states=True)
305
+
306
+ if len(embeddings) == 0:
307
+ print("Fail to extract audio embedding")
308
+ return None
309
+
310
+ audio_emb = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
311
+ audio_emb = rearrange(audio_emb, "b s d -> s b d")
312
+
313
+ audio_emb = audio_emb.cpu().detach()
314
+ return audio_emb
315
+
316
+ def extract_audio_from_video(filename, sample_rate):
317
+ raw_audio_path = filename.split('/')[-1].split('.')[0]+'.wav'
318
+ ffmpeg_command = [
319
+ "ffmpeg",
320
+ "-y",
321
+ "-i",
322
+ str(filename),
323
+ "-vn",
324
+ "-acodec",
325
+ "pcm_s16le",
326
+ "-ar",
327
+ "16000",
328
+ "-ac",
329
+ "2",
330
+ str(raw_audio_path),
331
+ ]
332
+ subprocess.run(ffmpeg_command, check=True)
333
+ human_speech_array, sr = librosa.load(raw_audio_path, sr=sample_rate)
334
+ human_speech_array = loudness_norm(human_speech_array, sr)
335
+ os.remove(raw_audio_path)
336
+
337
+ return human_speech_array
338
+
339
+ def audio_prepare_single(audio_path, sample_rate=16000):
340
+ ext = os.path.splitext(audio_path)[1].lower()
341
+ if ext in ['.mp4', '.mov', '.avi', '.mkv']:
342
+ human_speech_array = extract_audio_from_video(audio_path, sample_rate)
343
+ return human_speech_array
344
+ else:
345
+ human_speech_array, sr = librosa.load(audio_path, sr=sample_rate)
346
+ human_speech_array = loudness_norm(human_speech_array, sr)
347
+ return human_speech_array
348
+
349
+ def process_tts_single(text, save_dir, voice1):
350
+ s1_sentences = []
351
+
352
+ pipeline = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
353
+
354
+ voice_tensor = torch.load(voice1, weights_only=True)
355
+ generator = pipeline(
356
+ text, voice=voice_tensor, # <= change voice here
357
+ speed=1, split_pattern=r'\n+'
358
+ )
359
+ audios = []
360
+ for i, (gs, ps, audio) in enumerate(generator):
361
+ audios.append(audio)
362
+ audios = torch.concat(audios, dim=0)
363
+ s1_sentences.append(audios)
364
+ s1_sentences = torch.concat(s1_sentences, dim=0)
365
+ save_path1 =f'{save_dir}/s1.wav'
366
+ sf.write(save_path1, s1_sentences, 24000) # save each audio file
367
+ s1, _ = librosa.load(save_path1, sr=16000)
368
+ return s1, save_path1
369
+
370
+
371
+
372
+ def process_tts_multi(text, save_dir, voice1, voice2):
373
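+ # split the script on speaker tags such as "(s1) ..." / "(s2) ...", capturing the tag number and the following speech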
+ pattern = r'\(s(\d+)\)\s*(.*?)(?=\s*\(s\d+\)|$)'
374
+ matches = re.findall(pattern, text, re.DOTALL)
375
+
376
+ s1_sentences = []
377
+ s2_sentences = []
378
+
379
+ pipeline = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
380
+ for idx, (speaker, content) in enumerate(matches):
381
+ if speaker == '1':
382
+ voice_tensor = torch.load(voice1, weights_only=True)
383
+ generator = pipeline(
384
+ content, voice=voice_tensor, # <= change voice here
385
+ speed=1, split_pattern=r'\n+'
386
+ )
387
+ audios = []
388
+ for i, (gs, ps, audio) in enumerate(generator):
389
+ audios.append(audio)
390
+ audios = torch.concat(audios, dim=0)
391
+ s1_sentences.append(audios)
392
+ s2_sentences.append(torch.zeros_like(audios))
393
+ elif speaker == '2':
394
+ voice_tensor = torch.load(voice2, weights_only=True)
395
+ generator = pipeline(
396
+ content, voice=voice_tensor, # <= change voice here
397
+ speed=1, split_pattern=r'\n+'
398
+ )
399
+ audios = []
400
+ for i, (gs, ps, audio) in enumerate(generator):
401
+ audios.append(audio)
402
+ audios = torch.concat(audios, dim=0)
403
+ s2_sentences.append(audios)
404
+ s1_sentences.append(torch.zeros_like(audios))
405
+
406
+ s1_sentences = torch.concat(s1_sentences, dim=0)
407
+ s2_sentences = torch.concat(s2_sentences, dim=0)
408
+ sum_sentences = s1_sentences + s2_sentences
409
+ save_path1 =f'{save_dir}/s1.wav'
410
+ save_path2 =f'{save_dir}/s2.wav'
411
+ save_path_sum = f'{save_dir}/sum.wav'
412
+ sf.write(save_path1, s1_sentences, 24000) # save each audio file
413
+ sf.write(save_path2, s2_sentences, 24000)
414
+ sf.write(save_path_sum, sum_sentences, 24000)
415
+
416
+ s1, _ = librosa.load(save_path1, sr=16000)
417
+ s2, _ = librosa.load(save_path2, sr=16000)
418
+ # sum, _ = librosa.load(save_path_sum, sr=16000)
419
+ return s1, s2, save_path_sum
420
+
421
+ def run_graio_demo(args):
422
+ rank = int(os.getenv("RANK", 0))
423
+ world_size = int(os.getenv("WORLD_SIZE", 1))
424
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
425
+ device = local_rank
426
+ _init_logging(rank)
427
+
428
+ if args.offload_model is None:
429
+ args.offload_model = False if world_size > 1 else True
430
+ logging.info(
431
+ f"offload_model is not specified, set to {args.offload_model}.")
432
+ if world_size > 1:
433
+ torch.cuda.set_device(local_rank)
434
+ dist.init_process_group(
435
+ backend="nccl",
436
+ init_method="env://",
437
+ rank=rank,
438
+ world_size=world_size)
439
+ else:
440
+ assert not (
441
+ args.t5_fsdp or args.dit_fsdp
442
+ ), f"t5_fsdp and dit_fsdp are not supported in non-distributed environments."
443
+ assert not (
444
+ args.ulysses_size > 1 or args.ring_size > 1
445
+ ), f"context parallel are not supported in non-distributed environments."
446
+
447
+ if args.ulysses_size > 1 or args.ring_size > 1:
448
+ assert args.ulysses_size * args.ring_size == world_size, "The product of ulysses_size and ring_size should equal the world size."
449
+ from xfuser.core.distributed import (
450
+ init_distributed_environment,
451
+ initialize_model_parallel,
452
+ )
453
+ init_distributed_environment(
454
+ rank=dist.get_rank(), world_size=dist.get_world_size())
455
+
456
+ initialize_model_parallel(
457
+ sequence_parallel_degree=dist.get_world_size(),
458
+ ring_degree=args.ring_size,
459
+ ulysses_degree=args.ulysses_size,
460
+ )
461
+
462
+
463
+ cfg = WAN_CONFIGS[args.task]
464
+ if args.ulysses_size > 1:
465
+ assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."
466
+
467
+ logging.info(f"Generation job args: {args}")
468
+ logging.info(f"Generation model config: {cfg}")
469
+
470
+ if dist.is_initialized():
471
+ base_seed = [args.base_seed] if rank == 0 else [None]
472
+ dist.broadcast_object_list(base_seed, src=0)
473
+ args.base_seed = base_seed[0]
474
+
475
+ assert args.task == "multitalk-14B", 'You should choose multitalk in args.task.'
476
+
477
+
478
+
479
+ wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
480
+ os.makedirs(args.audio_save_dir,exist_ok=True)
481
+
482
+
483
+ logging.info("Creating MultiTalk pipeline.")
484
+ # wan_i2v = None
485
+ wan_i2v = wan.MultiTalkPipeline(
486
+ config=cfg,
487
+ checkpoint_dir=args.ckpt_dir,
488
+ quant_dir=args.quant_dir,
489
+ device_id=device,
490
+ rank=rank,
491
+ t5_fsdp=args.t5_fsdp,
492
+ dit_fsdp=args.dit_fsdp,
493
+ use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
494
+ t5_cpu=args.t5_cpu,
495
+ lora_dir=args.lora_dir,
496
+ lora_scales=args.lora_scale,
497
+ quant=args.quant
498
+ )
499
+
500
+ if args.num_persistent_param_in_dit is not None:
501
+ wan_i2v.vram_management = True
502
+ wan_i2v.enable_vram_management(
503
+ num_persistent_param_in_dit=args.num_persistent_param_in_dit
504
+ )
505
+
506
+
507
+
508
+ def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2,
509
+ sd_steps, seed, text_guide_scale, audio_guide_scale, mode_selector, tts_text, resolution_select, human1_voice, human2_voice):
510
+ input_data = {}
511
+ input_data["prompt"] = img2vid_prompt
512
+ input_data["cond_image"] = img2vid_image
513
+ person = {}
514
+ if mode_selector == "Single Person(Local File)":
515
+ person['person1'] = img2vid_audio_1
516
+ elif mode_selector == "Single Person(TTS)":
517
+ tts_audio = {}
518
+ tts_audio['text'] = tts_text
519
+ tts_audio['human1_voice'] = human1_voice
520
+ input_data["tts_audio"] = tts_audio
521
+ elif mode_selector == "Multi Person(Local File, audio add)":
522
+ person['person1'] = img2vid_audio_1
523
+ person['person2'] = img2vid_audio_2
524
+ input_data["audio_type"] = 'add'
525
+ elif mode_selector == "Multi Person(Local File, audio parallel)":
526
+ person['person1'] = img2vid_audio_1
527
+ person['person2'] = img2vid_audio_2
528
+ input_data["audio_type"] = 'para'
529
+ else:
530
+ tts_audio = {}
531
+ tts_audio['text'] = tts_text
532
+ tts_audio['human1_voice'] = human1_voice
533
+ tts_audio['human2_voice'] = human2_voice
534
+ input_data["tts_audio"] = tts_audio
535
+
536
+ input_data["cond_audio"] = person
537
+
538
+ if 'Local File' in mode_selector:
539
+ if len(input_data['cond_audio'])==2:
540
+ new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
541
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
542
+ audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
543
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
544
+ emb2_path = os.path.join(args.audio_save_dir, '2.pt')
545
+ sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
546
+ sf.write(sum_audio, sum_human_speechs, 16000)
547
+ torch.save(audio_embedding_1, emb1_path)
548
+ torch.save(audio_embedding_2, emb2_path)
549
+ input_data['cond_audio']['person1'] = emb1_path
550
+ input_data['cond_audio']['person2'] = emb2_path
551
+ input_data['video_audio'] = sum_audio
552
+ elif len(input_data['cond_audio'])==1:
553
+ human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
554
+ audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
555
+ emb_path = os.path.join(args.audio_save_dir, '1.pt')
556
+ sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
557
+ sf.write(sum_audio, human_speech, 16000)
558
+ torch.save(audio_embedding, emb_path)
559
+ input_data['cond_audio']['person1'] = emb_path
560
+ input_data['video_audio'] = sum_audio
561
+ elif 'TTS' in mode_selector:
562
+ if 'human2_voice' not in input_data['tts_audio'].keys():
563
+ new_human_speech1, sum_audio = process_tts_single(input_data['tts_audio']['text'], args.audio_save_dir, input_data['tts_audio']['human1_voice'])
564
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
565
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
566
+ torch.save(audio_embedding_1, emb1_path)
567
+ input_data['cond_audio']['person1'] = emb1_path
568
+ input_data['video_audio'] = sum_audio
569
+ else:
570
+ new_human_speech1, new_human_speech2, sum_audio = process_tts_multi(input_data['tts_audio']['text'], args.audio_save_dir, input_data['tts_audio']['human1_voice'], input_data['tts_audio']['human2_voice'])
571
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
572
+ audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
573
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
574
+ emb2_path = os.path.join(args.audio_save_dir, '2.pt')
575
+ torch.save(audio_embedding_1, emb1_path)
576
+ torch.save(audio_embedding_2, emb2_path)
577
+ input_data['cond_audio']['person1'] = emb1_path
578
+ input_data['cond_audio']['person2'] = emb2_path
579
+ input_data['video_audio'] = sum_audio
580
+
581
+
582
+ # if len(input_data['cond_audio'])==2:
583
+ # new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
584
+ # audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
585
+ # audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
586
+ # emb1_path = os.path.join(args.audio_save_dir, '1.pt')
587
+ # emb2_path = os.path.join(args.audio_save_dir, '2.pt')
588
+ # sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
589
+ # sf.write(sum_audio, sum_human_speechs, 16000)
590
+ # torch.save(audio_embedding_1, emb1_path)
591
+ # torch.save(audio_embedding_2, emb2_path)
592
+ # input_data['cond_audio']['person1'] = emb1_path
593
+ # input_data['cond_audio']['person2'] = emb2_path
594
+ # input_data['video_audio'] = sum_audio
595
+ # elif len(input_data['cond_audio'])==1:
596
+ # human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
597
+ # audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
598
+ # emb_path = os.path.join(args.audio_save_dir, '1.pt')
599
+ # sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
600
+ # sf.write(sum_audio, human_speech, 16000)
601
+ # torch.save(audio_embedding, emb_path)
602
+ # input_data['cond_audio']['person1'] = emb_path
603
+ # input_data['video_audio'] = sum_audio
604
+
605
+ logging.info("Generating video ...")
606
+ video = wan_i2v.generate(
607
+ input_data,
608
+ size_buckget=resolution_select,
609
+ motion_frame=args.motion_frame,
610
+ frame_num=args.frame_num,
611
+ shift=args.sample_shift,
612
+ sampling_steps=sd_steps,
613
+ text_guide_scale=text_guide_scale,
614
+ audio_guide_scale=audio_guide_scale,
615
+ seed=seed,
616
+ n_prompt=n_prompt,
617
+ offload_model=args.offload_model,
618
+ max_frames_num=args.frame_num if args.mode == 'clip' else 1000,
619
+ color_correction_strength = args.color_correction_strength,
620
+ extra_args=args,
621
+ )
622
+
623
+
624
+ if args.save_file is None:
625
+ formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
626
+ formatted_prompt = input_data['prompt'].replace(" ", "_").replace("/",
627
+ "_")[:50]
628
+ args.save_file = f"{args.task}_{args.size.replace('*','x') if sys.platform=='win32' else args.size}_{args.ulysses_size}_{args.ring_size}_{formatted_prompt}_{formatted_time}"
629
+
630
+ logging.info(f"Saving generated video to {args.save_file}.mp4")
631
+ save_video_ffmpeg(video, args.save_file, [input_data['video_audio']], high_quality_save=False)
632
+ logging.info("Finished.")
633
+
634
+ return args.save_file + '.mp4'
635
+
636
+ def toggle_audio_mode(mode):
637
+ if 'TTS' in mode:
638
+ return [
639
+ gr.Audio(visible=False, interactive=False),
640
+ gr.Audio(visible=False, interactive=False),
641
+ gr.Textbox(visible=True, interactive=True)
642
+ ]
643
+ elif 'Single' in mode:
644
+ return [
645
+ gr.Audio(visible=True, interactive=True),
646
+ gr.Audio(visible=False, interactive=False),
647
+ gr.Textbox(visible=False, interactive=False)
648
+ ]
649
+ else:
650
+ return [
651
+ gr.Audio(visible=True, interactive=True),
652
+ gr.Audio(visible=True, interactive=True),
653
+ gr.Textbox(visible=False, interactive=False)
654
+ ]
655
+
656
+
657
+ with gr.Blocks() as demo:
658
+
659
+ gr.Markdown("""
660
+ <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
661
+ MeiGen-MultiTalk
662
+ </div>
663
+ <div style="text-align: center; font-size: 16px; font-weight: normal; margin-bottom: 20px;">
664
+ Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation.
665
+ </div>
666
+ <div style="display: flex; justify-content: center; gap: 10px; flex-wrap: wrap;">
667
+ <a href='https://meigen-ai.github.io/multi-talk/'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
668
+ <a href='https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
669
+ <a href='https://arxiv.org/abs/2505.22647'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
670
+ </div>
671
+
672
+
673
+ """)
674
+
675
+ with gr.Row():
676
+ with gr.Column(scale=1):
677
+ img2vid_image = gr.Image(
678
+ type="filepath",
679
+ label="Upload Input Image",
680
+ elem_id="image_upload",
681
+ )
682
+ img2vid_prompt = gr.Textbox(
683
+ label="Prompt",
684
+ placeholder="Describe the video you want to generate",
685
+ )
686
+
687
+
688
+ with gr.Accordion("Audio Options", open=True):
689
+ mode_selector = gr.Radio(
690
+ choices=["Single Person(Local File)", "Single Person(TTS)", "Multi Person(Local File, audio add)", "Multi Person(Local File, audio parallel)", "Multi Person(TTS)"],
691
+ label="Select person and audio mode.",
692
+ value="Single Person(Local File)"
693
+ )
694
+ resolution_select = gr.Radio(
695
+ choices=["multitalk-480", "multitalk-720"],
696
+ label="Select resolution.",
697
+ value="multitalk-480"
698
+ )
699
+ img2vid_audio_1 = gr.Audio(label="Conditioning Audio for speaker 1", type="filepath", visible=True)
700
+ img2vid_audio_2 = gr.Audio(label="Conditioning Audio for speaker 2", type="filepath", visible=False)
701
+ tts_text = gr.Textbox(
702
+ label="Text for TTS",
703
+ placeholder="Refer to the format in the examples",
704
+ visible=False,
705
+ interactive=False
706
+ )
707
+ mode_selector.change(
708
+ fn=toggle_audio_mode,
709
+ inputs=mode_selector,
710
+ outputs=[img2vid_audio_1, img2vid_audio_2, tts_text]
711
+ )
712
+
713
+ with gr.Accordion("Advanced Options", open=False):
714
+ with gr.Row():
715
+ sd_steps = gr.Slider(
716
+ label="Diffusion steps",
717
+ minimum=1,
718
+ maximum=1000,
719
+ value=8,
720
+ step=1)
721
+ seed = gr.Slider(
722
+ label="Seed",
723
+ minimum=-1,
724
+ maximum=2147483647,
725
+ step=1,
726
+ value=42)
727
+ with gr.Row():
728
+ text_guide_scale = gr.Slider(
729
+ label="Text Guide scale",
730
+ minimum=0,
731
+ maximum=20,
732
+ value=1.0,
733
+ step=1)
734
+ audio_guide_scale = gr.Slider(
735
+ label="Audio Guide scale",
736
+ minimum=0,
737
+ maximum=20,
738
+ value=2.0,
739
+ step=1)
740
+ with gr.Row():
741
+ human1_voice = gr.Textbox(
742
+ label="Voice for the left person",
743
+ value="weights/Kokoro-82M/voices/am_adam.pt",
744
+ )
745
+ human2_voice = gr.Textbox(
746
+ label="Voice for right person",
747
+ value="weights/Kokoro-82M/voices/af_heart.pt"
748
+ )
749
+ # with gr.Row():
750
+ n_prompt = gr.Textbox(
751
+ label="Negative Prompt",
752
+ placeholder="Describe the negative prompt you want to add",
753
+ value="bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
754
+ )
755
+
756
+ run_i2v_button = gr.Button("Generate Video")
757
+
758
+ with gr.Column(scale=2):
759
+ result_gallery = gr.Video(
760
+ label='Generated Video', interactive=False, height=600, )
761
+
762
+ gr.Examples(
763
+ examples = [
764
+ ["examples/single/single1.png", "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.", "Single Person(Local File)", "examples/single/1.wav", None, None],
765
+ ["examples/single/single1.png", "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.", "Single Person(TTS)", None, None, "Welcome to multi-talk, this is an audio-driven video generation model for multiple person."],
766
+ ["examples/multi/1/multi1.png", "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.", "Multi Person(Local File, audio add)", "examples/multi/1/1.WAV", "examples/multi/1/2.WAV", None],
767
+ ["examples/multi/3/multi3.png", "In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.", "Multi Person(Local File, audio parallel)", "examples/multi/3/1-man.WAV", "examples/multi/3/1-woman.WAV", None],
768
+ ["examples/multi/1/multi1.png", "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.", "Multi Person(TTS)", None, None, "(s1) do you know multi-talk? (s2) yes, I know it, that's amazing! (s1) Me too."],
769
+ ],
770
+ inputs = [img2vid_image, img2vid_prompt, mode_selector, img2vid_audio_1, img2vid_audio_2, tts_text],
771
+ )
772
+
773
+
774
+ run_i2v_button.click(
775
+ fn=generate_video,
776
+ inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2,sd_steps, seed, text_guide_scale, audio_guide_scale, mode_selector, tts_text, resolution_select, human1_voice, human2_voice],
777
+ outputs=[result_gallery],
778
+ )
779
+ demo.launch(server_name="0.0.0.0", debug=True, server_port=8418)
780
+
781
+
782
+
783
+
784
+ if __name__ == "__main__":
785
+ args = _parse_args()
786
+ run_graio_demo(args)
787
+
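Note on the handler above: whatever audio mode is selected, generate_video normalizes its inputs into one input_data layout before calling wan_i2v.generate: raw or TTS audio is converted to wav2vec embeddings saved as .pt files under args.audio_save_dir, and cond_audio ends up pointing at those files. A rough sketch of the resulting dictionary (field names come from the code above; the concrete paths and the default save_audio directory are illustrative assumptions):

    input_data = {
        "prompt": "...",                          # text prompt from the UI
        "cond_image": "/path/to/reference.png",   # uploaded conditioning image
        "cond_audio": {
            "person1": "save_audio/1.pt",         # wav2vec embedding saved with torch.save
            "person2": "save_audio/2.pt",         # present only in multi-person modes
        },
        "audio_type": "add",                      # 'add' or 'para'; local-file multi modes only
        "video_audio": "save_audio/sum.wav",      # mixed waveform muxed into the final video
    }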
assets/efficiency.png ADDED
assets/logo.png ADDED

Git LFS Details

  • SHA256: 2fb97620f1515b94de007f5b5cde23e51aaa84a5cdc1eb91c021bb46b4cae3f0
  • Pointer size: 132 Bytes
  • Size of remote file: 3.31 MB
assets/logo2.jpeg ADDED

Git LFS Details

  • SHA256: 984efa12db10f378f37ba0576be90517658ed5c4a4146f2483121e9ae8fbd800
  • Pointer size: 131 Bytes
  • Size of remote file: 446 kB
assets/none_quant_efficiency.png ADDED

Git LFS Details

  • SHA256: a08c6e0ba766b37d9001e80c62706b155eed6edd1fd8cbc32f3730f947f301da
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
assets/pipe.png ADDED

Git LFS Details

  • SHA256: dca19575d5c512b93d0eab2359cc75878da2064d4ef0e1f44aaf6accc04d6e0a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
examples/multi/1/1.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8397a9b3c0add26384afe7e544e36cbc4806d8f2d7c705e11bb2897dc1bc993b
3
+ size 315436
examples/multi/1/2.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:753120ceadbdab3ce206423a1419f73018695682787414ca2f4613306be50bfc
3
+ size 544812
examples/multi/1/multi1.png ADDED

Git LFS Details

  • SHA256: 210b89972b810e760d15828323186771a56f1220e806b09fe06b0584a9f55537
  • Pointer size: 132 Bytes
  • Size of remote file: 3 MB
examples/multi/2/1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eb6a408a8b5b33a732378e2a38e7412ba273186b85c324ec6a099d23fe38af
3
+ size 1273592
examples/multi/2/multi2.png ADDED

Git LFS Details

  • SHA256: 95bce20a7ed654515b33cdbda435fa152405215ae7e098ee16c8e04c4eac3f49
  • Pointer size: 132 Bytes
  • Size of remote file: 2.88 MB
examples/multi/3/1-man.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d304fd88850d6673649d1844db2894e03bf5a775123048eebcb01ab3b79bff5e
3
+ size 1503276
examples/multi/3/1-woman.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e1ebd7ae1587ebc7f0986f8b61e7fcc99c6fb57fbb15ab9373968e701afc8bf
3
+ size 1503276
examples/multi/3/multi3.png ADDED

Git LFS Details

  • SHA256: dbe21be842c62c26cba264630b96aa775cc739dfc3b02cbc8da44a9eb8b9671e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.58 MB
examples/multitalk_example_1.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "prompt": "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.",
3
+ "cond_image": "examples/multi/1/multi1.png",
4
+ "audio_type": "add",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/1/1.WAV",
7
+ "person2": "examples/multi/1/2.WAV"
8
+ },
9
+ "bbox": {
10
+ "person1": [160, 120, 1280, 1080],
11
+ "person2": [160, 1320, 1280, 2280]
12
+ }
13
+ }
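This meta file is what generate_multitalk.py (shown later in this commit) reads when it is passed through --input_json. A minimal sketch of that consumption, assuming the example path above; the key checks simply mirror the fields the script uses:

    import json

    with open("examples/multitalk_example_1.json", "r", encoding="utf-8") as f:
        input_data = json.load(f)

    # Two cond_audio entries plus audio_type ('add' or 'para') select the
    # two-speaker local-file path; bbox optionally gives a four-number region
    # of the conditioning image for each person.
    assert {"prompt", "cond_image", "cond_audio"} <= input_data.keys()
    assert set(input_data["cond_audio"]) == {"person1", "person2"}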
examples/multitalk_example_2.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "prompt": "In a cozy recording studio, a man and a woman are singing together with passion and emotion. The man, with short brown hair, wears a light gray button-up shirt, his expression filled with concentration and warmth. The woman, with long wavy brown hair, dons a sleeveless dress adorned with small polka dots, her eyes closed as she belts out a heartfelt melody. The studio is equipped with professional microphones, and the background features soundproofing panels, creating an intimate and focused atmosphere. A close-up shot captures their expressions and the intensity of their performance.",
3
+ "cond_image": "examples/multi/2/multi2.png",
4
+ "audio_type": "para",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/2/1.wav",
7
+ "person2": "examples/multi/2/1.wav"
8
+ }
9
+ }
examples/multitalk_example_3.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "prompt": "In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.",
3
+ "cond_image": "examples/multi/3/multi3.png",
4
+ "audio_type": "para",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/3/1-man.WAV",
7
+ "person2": "examples/multi/3/1-woman.WAV"
8
+ }
9
+ }
examples/multitalk_example_tts_1.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "prompt": "In a cozy recording studio, a man and a woman are singing together with passion and emotion. The man, with short brown hair, wears a light gray button-up shirt, his expression filled with concentration and warmth. The woman, with long wavy brown hair, dons a sleeveless dress adorned with small polka dots, her eyes closed as she belts out a heartfelt melody. The studio is equipped with professional microphones, and the background features soundproofing panels, creating an intimate and focused atmosphere. A close-up shot captures their expressions and the intensity of their performance.",
3
+ "cond_image": "examples/multi/1/multi1.png",
4
+ "audio_type": "para",
5
+ "tts_audio": {
6
+ "text": "(s1) do you know multi-talk? (s2) yes, I know it, that's amazing! (s1) Me too.",
7
+ "human1_voice": "weights/Kokoro-82M/voices/af_heart.pt",
8
+ "human2_voice": "weights/Kokoro-82M/voices/am_adam.pt"
9
+ },
10
+ "cond_audio":{}
11
+ }
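The text field follows a simple speaker-tag convention: segments prefixed with (s1) are synthesized with human1_voice and segments prefixed with (s2) with human2_voice. A small sketch using the exact pattern from process_tts_multi in generate_multitalk.py (later in this commit) to show how the example line splits:

    import re

    pattern = r'\(s(\d+)\)\s*(.*?)(?=\s*\(s\d+\)|$)'
    text = "(s1) do you know multi-talk? (s2) yes, I know it, that's amazing! (s1) Me too."
    print(re.findall(pattern, text, re.DOTALL))
    # [('1', 'do you know multi-talk?'), ('2', "yes, I know it, that's amazing!"), ('1', 'Me too.')]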
examples/single/1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba2733897f561f747e6508734bff4eeee29d0a73638e5c39c0c0b806701d4e8b
3
+ size 1888320
examples/single/single1.png ADDED

Git LFS Details

  • SHA256: 5a47d458721c4a7419d3c8ef9a5c3d89cf161ab31de9451b9bb4f321a37bc705
  • Pointer size: 132 Bytes
  • Size of remote file: 2.79 MB
examples/single_example_1.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "prompt": "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.",
3
+ "cond_image": "examples/single/single1.png",
4
+ "cond_audio": {
5
+ "person1": "examples/single/1.wav"
6
+ }
7
+ }
examples/single_example_tts_1.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "prompt": "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.",
3
+ "cond_image": "examples/single/single1.png",
4
+ "tts_audio": {
5
+ "text": "Welcome to multi-talk, this is an audio-driven video generation model for multiple person.",
6
+ "human1_voice": "weights/Kokoro-82M/voices/af_heart.pt"
7
+ },
8
+ "cond_audio":{}
9
+ }
generate_multitalk.py ADDED
@@ -0,0 +1,638 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import argparse
3
+ import logging
4
+ import os
5
+ import sys
6
+ import json
7
+ import warnings
8
+ from datetime import datetime
9
+
10
+ warnings.filterwarnings('ignore')
11
+
12
+ import random
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+ from PIL import Image
17
+ import subprocess
18
+
19
+ import wan
20
+ from wan.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS
21
+ from wan.utils.utils import cache_image, cache_video, str2bool
22
+ from wan.utils.multitalk_utils import save_video_ffmpeg
23
+ from kokoro import KPipeline
24
+ from transformers import Wav2Vec2FeatureExtractor
25
+ from src.audio_analysis.wav2vec2 import Wav2Vec2Model
26
+
27
+ import librosa
28
+ import pyloudnorm as pyln
29
+ import numpy as np
30
+ from einops import rearrange
31
+ import soundfile as sf
32
+ import re
33
+
34
+
35
+ def _validate_args(args):
36
+ # Basic check
37
+ assert args.ckpt_dir is not None, "Please specify the checkpoint directory."
38
+ assert args.task in WAN_CONFIGS, f"Unsupported task: {args.task}"
39
+
40
+ # Default sampling steps for this audio-driven image-to-video task.
41
+ if args.sample_steps is None:
42
+ args.sample_steps = 40
43
+
44
+ if args.sample_shift is None:
45
+ if args.size == 'multitalk-480':
46
+ args.sample_shift = 7
47
+ elif args.size == 'multitalk-720':
48
+ args.sample_shift = 11
49
+ else:
50
+ raise NotImplementedError(f'Unsupported size: {args.size}')
51
+
52
+ args.base_seed = args.base_seed if args.base_seed >= 0 else random.randint(
53
+ 0, 99999999)
54
+ # Size check
55
+ assert args.size in SUPPORTED_SIZES[
56
+ args.
57
+ task], f"Unsupport size {args.size} for task {args.task}, supported sizes are: {', '.join(SUPPORTED_SIZES[args.task])}"
58
+
59
+
60
+ def _parse_args():
61
+ parser = argparse.ArgumentParser(
62
+ description="Generate a image or video from a text prompt or image using Wan"
63
+ )
64
+ parser.add_argument(
65
+ "--task",
66
+ type=str,
67
+ default="multitalk-14B",
68
+ choices=list(WAN_CONFIGS.keys()),
69
+ help="The task to run.")
70
+ parser.add_argument(
71
+ "--size",
72
+ type=str,
73
+ default="multitalk-480",
74
+ choices=list(SIZE_CONFIGS.keys()),
75
+ help="The buckget size of the generated video. The aspect ratio of the output video will follow that of the input image."
76
+ )
77
+ parser.add_argument(
78
+ "--frame_num",
79
+ type=int,
80
+ default=81,
81
+ help="How many frames to be generated in one clip. The number should be 4n+1"
82
+ )
83
+ parser.add_argument(
84
+ "--ckpt_dir",
85
+ type=str,
86
+ default=None,
87
+ help="The path to the Wan checkpoint directory.")
88
+ parser.add_argument(
89
+ "--quant_dir",
90
+ type=str,
91
+ default=None,
92
+ help="The path to the Wan quant checkpoint directory.")
93
+ parser.add_argument(
94
+ "--wav2vec_dir",
95
+ type=str,
96
+ default=None,
97
+ help="The path to the wav2vec checkpoint directory.")
98
+ parser.add_argument(
99
+ "--lora_dir",
100
+ type=str,
101
+ nargs='+',
102
+ default=None,
103
+ help="The paths to the LoRA checkpoint files."
104
+ )
105
+ parser.add_argument(
106
+ "--lora_scale",
107
+ type=float,
108
+ nargs='+',
109
+ default=[1.2],
110
+ help="Controls how much to influence the outputs with the LoRA parameters. Accepts multiple float values."
111
+ )
112
+ parser.add_argument(
113
+ "--offload_model",
114
+ type=str2bool,
115
+ default=None,
116
+ help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage."
117
+ )
118
+ parser.add_argument(
119
+ "--ulysses_size",
120
+ type=int,
121
+ default=1,
122
+ help="The size of the ulysses parallelism in DiT.")
123
+ parser.add_argument(
124
+ "--ring_size",
125
+ type=int,
126
+ default=1,
127
+ help="The size of the ring attention parallelism in DiT.")
128
+ parser.add_argument(
129
+ "--t5_fsdp",
130
+ action="store_true",
131
+ default=False,
132
+ help="Whether to use FSDP for T5.")
133
+ parser.add_argument(
134
+ "--t5_cpu",
135
+ action="store_true",
136
+ default=False,
137
+ help="Whether to place T5 model on CPU.")
138
+ parser.add_argument(
139
+ "--dit_fsdp",
140
+ action="store_true",
141
+ default=False,
142
+ help="Whether to use FSDP for DiT.")
143
+ parser.add_argument(
144
+ "--save_file",
145
+ type=str,
146
+ default=None,
147
+ help="The file to save the generated image or video to.")
148
+ parser.add_argument(
149
+ "--audio_save_dir",
150
+ type=str,
151
+ default='save_audio',
152
+ help="The path to save the audio embedding.")
153
+ parser.add_argument(
154
+ "--base_seed",
155
+ type=int,
156
+ default=42,
157
+ help="The seed to use for generating the image or video.")
158
+ parser.add_argument(
159
+ "--input_json",
160
+ type=str,
161
+ default='examples.json',
162
+ help="[meta file] The condition path to generate the video.")
163
+ parser.add_argument(
164
+ "--motion_frame",
165
+ type=int,
166
+ default=25,
167
+ help="Driven frame length used in the mode of long video genration.")
168
+ parser.add_argument(
169
+ "--mode",
170
+ type=str,
171
+ default="clip",
172
+ choices=['clip', 'streaming'],
173
+ help="clip: generate one video chunk, streaming: long video generation")
174
+ parser.add_argument(
175
+ "--sample_steps", type=int, default=None, help="The sampling steps.")
176
+ parser.add_argument(
177
+ "--sample_shift",
178
+ type=float,
179
+ default=None,
180
+ help="Sampling shift factor for flow matching schedulers.")
181
+ parser.add_argument(
182
+ "--sample_text_guide_scale",
183
+ type=float,
184
+ default=5.0,
185
+ help="Classifier free guidance scale for text control.")
186
+ parser.add_argument(
187
+ "--sample_audio_guide_scale",
188
+ type=float,
189
+ default=4.0,
190
+ help="Classifier free guidance scale for audio control.")
191
+ parser.add_argument(
192
+ "--num_persistent_param_in_dit",
193
+ type=int,
194
+ default=None,
195
+ required=False,
196
+ help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required",
197
+ )
198
+ parser.add_argument(
199
+ "--audio_mode",
200
+ type=str,
201
+ default="localfile",
202
+ choices=['localfile', 'tts'],
203
+ help="localfile: audio from local wav file, tts: audio from TTS")
204
+ parser.add_argument(
205
+ "--use_teacache",
206
+ action="store_true",
207
+ default=False,
208
+ help="Enable teacache for video generation."
209
+ )
210
+ parser.add_argument(
211
+ "--teacache_thresh",
212
+ type=float,
213
+ default=0.2,
214
+ help="Threshold for teacache."
215
+ )
216
+ parser.add_argument(
217
+ "--use_apg",
218
+ action="store_true",
219
+ default=False,
220
+ help="Enable adaptive projected guidance for video generation (APG)."
221
+ )
222
+ parser.add_argument(
223
+ "--apg_momentum",
224
+ type=float,
225
+ default=-0.75,
226
+ help="Momentum used in adaptive projected guidance (APG)."
227
+ )
228
+ parser.add_argument(
229
+ "--apg_norm_threshold",
230
+ type=float,
231
+ default=55,
232
+ help="Norm threshold used in adaptive projected guidance (APG)."
233
+ )
234
+ parser.add_argument(
235
+ "--color_correction_strength",
236
+ type=float,
237
+ default=1.0,
238
+ help="strength for color correction [0.0 -- 1.0]."
239
+ )
240
+
241
+ parser.add_argument(
242
+ "--quant",
243
+ type=str,
244
+ default=None,
245
+ help="Quantization type, must be 'int8' or 'fp8'."
246
+ )
247
+
248
+ args = parser.parse_args()
249
+
250
+ _validate_args(args)
251
+
252
+ return args
253
+
254
+ def custom_init(device, wav2vec):
255
+ audio_encoder = Wav2Vec2Model.from_pretrained(wav2vec, local_files_only=True).to(device)
256
+ audio_encoder.feature_extractor._freeze_parameters()
257
+ wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec, local_files_only=True)
258
+ return wav2vec_feature_extractor, audio_encoder
259
+
260
+ def loudness_norm(audio_array, sr=16000, lufs=-23):
261
+ meter = pyln.Meter(sr)
262
+ loudness = meter.integrated_loudness(audio_array)
263
+ if abs(loudness) > 100:
264
+ return audio_array
265
+ normalized_audio = pyln.normalize.loudness(audio_array, loudness, lufs)
266
+ return normalized_audio
267
+
268
+ def audio_prepare_multi(left_path, right_path, audio_type, sample_rate=16000):
269
+
270
+ if not (left_path=='None' or right_path=='None'):
271
+ human_speech_array1 = audio_prepare_single(left_path)
272
+ human_speech_array2 = audio_prepare_single(right_path)
273
+ elif left_path=='None':
274
+ human_speech_array2 = audio_prepare_single(right_path)
275
+ human_speech_array1 = np.zeros(human_speech_array2.shape[0])
276
+ elif right_path=='None':
277
+ human_speech_array1 = audio_prepare_single(left_path)
278
+ human_speech_array2 = np.zeros(human_speech_array1.shape[0])
279
+
280
+ if audio_type=='para':
281
+ new_human_speech1 = human_speech_array1
282
+ new_human_speech2 = human_speech_array2
283
+ elif audio_type=='add':
284
+ new_human_speech1 = np.concatenate([human_speech_array1[: human_speech_array1.shape[0]], np.zeros(human_speech_array2.shape[0])])
285
+ new_human_speech2 = np.concatenate([np.zeros(human_speech_array1.shape[0]), human_speech_array2[:human_speech_array2.shape[0]]])
286
+ sum_human_speechs = new_human_speech1 + new_human_speech2
287
+ return new_human_speech1, new_human_speech2, sum_human_speechs
288
+
289
+ def _init_logging(rank):
290
+ # logging
291
+ if rank == 0:
292
+ # set format
293
+ logging.basicConfig(
294
+ level=logging.INFO,
295
+ format="[%(asctime)s] %(levelname)s: %(message)s",
296
+ handlers=[logging.StreamHandler(stream=sys.stdout)])
297
+ else:
298
+ logging.basicConfig(level=logging.ERROR)
299
+
300
+ def get_embedding(speech_array, wav2vec_feature_extractor, audio_encoder, sr=16000, device='cpu'):
301
+ audio_duration = len(speech_array) / sr
302
+ video_length = audio_duration * 25 # Assume the video fps is 25
303
+
304
+ # wav2vec_feature_extractor
305
+ audio_feature = np.squeeze(
306
+ wav2vec_feature_extractor(speech_array, sampling_rate=sr).input_values
307
+ )
308
+ audio_feature = torch.from_numpy(audio_feature).float().to(device=device)
309
+ audio_feature = audio_feature.unsqueeze(0)
310
+
311
+ # audio encoder
312
+ with torch.no_grad():
313
+ embeddings = audio_encoder(audio_feature, seq_len=int(video_length), output_hidden_states=True)
314
+
315
+ if len(embeddings) == 0:
316
+ print("Fail to extract audio embedding")
317
+ return None
318
+
319
+ audio_emb = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
320
+ audio_emb = rearrange(audio_emb, "b s d -> s b d")
321
+
322
+ audio_emb = audio_emb.cpu().detach()
323
+ return audio_emb
324
+
325
+ def extract_audio_from_video(filename, sample_rate):
326
+ raw_audio_path = filename.split('/')[-1].split('.')[0]+'.wav'
327
+ ffmpeg_command = [
328
+ "ffmpeg",
329
+ "-y",
330
+ "-i",
331
+ str(filename),
332
+ "-vn",
333
+ "-acodec",
334
+ "pcm_s16le",
335
+ "-ar",
336
+ "16000",
337
+ "-ac",
338
+ "2",
339
+ str(raw_audio_path),
340
+ ]
341
+ subprocess.run(ffmpeg_command, check=True)
342
+ human_speech_array, sr = librosa.load(raw_audio_path, sr=sample_rate)
343
+ human_speech_array = loudness_norm(human_speech_array, sr)
344
+ os.remove(raw_audio_path)
345
+
346
+ return human_speech_array
347
+
348
+ def audio_prepare_single(audio_path, sample_rate=16000):
349
+ ext = os.path.splitext(audio_path)[1].lower()
350
+ if ext in ['.mp4', '.mov', '.avi', '.mkv']:
351
+ human_speech_array = extract_audio_from_video(audio_path, sample_rate)
352
+ return human_speech_array
353
+ else:
354
+ human_speech_array, sr = librosa.load(audio_path, sr=sample_rate)
355
+ human_speech_array = loudness_norm(human_speech_array, sr)
356
+ return human_speech_array
357
+
358
+ def process_tts_single(text, save_dir, voice1):
359
+ s1_sentences = []
360
+
361
+ pipeline = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
362
+
363
+ voice_tensor = torch.load(voice1, weights_only=True)
364
+ generator = pipeline(
365
+ text, voice=voice_tensor, # <= change voice here
366
+ speed=1, split_pattern=r'\n+'
367
+ )
368
+ audios = []
369
+ for i, (gs, ps, audio) in enumerate(generator):
370
+ audios.append(audio)
371
+ audios = torch.concat(audios, dim=0)
372
+ s1_sentences.append(audios)
373
+ s1_sentences = torch.concat(s1_sentences, dim=0)
374
+ save_path1 =f'{save_dir}/s1.wav'
375
+ sf.write(save_path1, s1_sentences, 24000) # save each audio file
376
+ s1, _ = librosa.load(save_path1, sr=16000)
377
+ return s1, save_path1
378
+
379
+
380
+
381
+ def process_tts_multi(text, save_dir, voice1, voice2):
382
+ pattern = r'\(s(\d+)\)\s*(.*?)(?=\s*\(s\d+\)|$)'
383
+ matches = re.findall(pattern, text, re.DOTALL)
384
+
385
+ s1_sentences = []
386
+ s2_sentences = []
387
+
388
+ pipeline = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
389
+ for idx, (speaker, content) in enumerate(matches):
390
+ if speaker == '1':
391
+ voice_tensor = torch.load(voice1, weights_only=True)
392
+ generator = pipeline(
393
+ content, voice=voice_tensor, # <= change voice here
394
+ speed=1, split_pattern=r'\n+'
395
+ )
396
+ audios = []
397
+ for i, (gs, ps, audio) in enumerate(generator):
398
+ audios.append(audio)
399
+ audios = torch.concat(audios, dim=0)
400
+ s1_sentences.append(audios)
401
+ s2_sentences.append(torch.zeros_like(audios))
402
+ elif speaker == '2':
403
+ voice_tensor = torch.load(voice2, weights_only=True)
404
+ generator = pipeline(
405
+ content, voice=voice_tensor, # <= change voice here
406
+ speed=1, split_pattern=r'\n+'
407
+ )
408
+ audios = []
409
+ for i, (gs, ps, audio) in enumerate(generator):
410
+ audios.append(audio)
411
+ audios = torch.concat(audios, dim=0)
412
+ s2_sentences.append(audios)
413
+ s1_sentences.append(torch.zeros_like(audios))
414
+
415
+ s1_sentences = torch.concat(s1_sentences, dim=0)
416
+ s2_sentences = torch.concat(s2_sentences, dim=0)
417
+ sum_sentences = s1_sentences + s2_sentences
418
+ save_path1 =f'{save_dir}/s1.wav'
419
+ save_path2 =f'{save_dir}/s2.wav'
420
+ save_path_sum = f'{save_dir}/sum.wav'
421
+ sf.write(save_path1, s1_sentences, 24000) # save each audio file
422
+ sf.write(save_path2, s2_sentences, 24000)
423
+ sf.write(save_path_sum, sum_sentences, 24000)
424
+
425
+ s1, _ = librosa.load(save_path1, sr=16000)
426
+ s2, _ = librosa.load(save_path2, sr=16000)
427
+ # sum, _ = librosa.load(save_path_sum, sr=16000)
428
+ return s1, s2, save_path_sum
429
+
430
+ def generate(args):
431
+ rank = int(os.getenv("RANK", 0))
432
+ world_size = int(os.getenv("WORLD_SIZE", 1))
433
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
434
+ device = local_rank
435
+ _init_logging(rank)
436
+
437
+ if args.offload_model is None:
438
+ args.offload_model = False if world_size > 1 else True
439
+ logging.info(
440
+ f"offload_model is not specified, set to {args.offload_model}.")
441
+ if world_size > 1:
442
+ torch.cuda.set_device(local_rank)
443
+ dist.init_process_group(
444
+ backend="nccl",
445
+ init_method="env://",
446
+ rank=rank,
447
+ world_size=world_size)
448
+ else:
449
+ assert not (
450
+ args.t5_fsdp or args.dit_fsdp
451
+ ), f"t5_fsdp and dit_fsdp are not supported in non-distributed environments."
452
+ assert not (
453
+ args.ulysses_size > 1 or args.ring_size > 1
454
+ ), f"context parallel are not supported in non-distributed environments."
455
+
456
+ if args.ulysses_size > 1 or args.ring_size > 1:
457
+ assert args.ulysses_size * args.ring_size == world_size, "The product of ulysses_size and ring_size must equal the world size."
458
+ from xfuser.core.distributed import (
459
+ init_distributed_environment,
460
+ initialize_model_parallel,
461
+ )
462
+ init_distributed_environment(
463
+ rank=dist.get_rank(), world_size=dist.get_world_size())
464
+
465
+ initialize_model_parallel(
466
+ sequence_parallel_degree=dist.get_world_size(),
467
+ ring_degree=args.ring_size,
468
+ ulysses_degree=args.ulysses_size,
469
+ )
470
+
471
+ # TODO: use prompt refine
472
+ # if args.use_prompt_extend:
473
+ # if args.prompt_extend_method == "dashscope":
474
+ # prompt_expander = DashScopePromptExpander(
475
+ # model_name=args.prompt_extend_model,
476
+ # is_vl="i2v" in args.task or "flf2v" in args.task)
477
+ # elif args.prompt_extend_method == "local_qwen":
478
+ # prompt_expander = QwenPromptExpander(
479
+ # model_name=args.prompt_extend_model,
480
+ # is_vl="i2v" in args.task,
481
+ # device=rank)
482
+ # else:
483
+ # raise NotImplementedError(
484
+ # f"Unsupport prompt_extend_method: {args.prompt_extend_method}")
485
+
486
+ cfg = WAN_CONFIGS[args.task]
487
+ if args.ulysses_size > 1:
488
+ assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."
489
+
490
+ logging.info(f"Generation job args: {args}")
491
+ logging.info(f"Generation model config: {cfg}")
492
+
493
+ if dist.is_initialized():
494
+ base_seed = [args.base_seed] if rank == 0 else [None]
495
+ dist.broadcast_object_list(base_seed, src=0)
496
+ args.base_seed = base_seed[0]
497
+
498
+ assert args.task == "multitalk-14B", 'You should choose multitalk in args.task.'
499
+
500
+
501
+ # TODO: add prompt refine
502
+ # img = Image.open(args.image).convert("RGB")
503
+ # if args.use_prompt_extend:
504
+ # logging.info("Extending prompt ...")
505
+ # if rank == 0:
506
+ # prompt_output = prompt_expander(
507
+ # args.prompt,
508
+ # tar_lang=args.prompt_extend_target_lang,
509
+ # image=img,
510
+ # seed=args.base_seed)
511
+ # if prompt_output.status == False:
512
+ # logging.info(
513
+ # f"Extending prompt failed: {prompt_output.message}")
514
+ # logging.info("Falling back to original prompt.")
515
+ # input_prompt = args.prompt
516
+ # else:
517
+ # input_prompt = prompt_output.prompt
518
+ # input_prompt = [input_prompt]
519
+ # else:
520
+ # input_prompt = [None]
521
+ # if dist.is_initialized():
522
+ # dist.broadcast_object_list(input_prompt, src=0)
523
+ # args.prompt = input_prompt[0]
524
+ # logging.info(f"Extended prompt: {args.prompt}")
525
+
526
+ # read input files
527
+
528
+
529
+
530
+ with open(args.input_json, 'r', encoding='utf-8') as f:
531
+ input_data = json.load(f)
532
+
533
+ wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
534
+ args.audio_save_dir = os.path.join(args.audio_save_dir, input_data['cond_image'].split('/')[-1].split('.')[0])
535
+ os.makedirs(args.audio_save_dir,exist_ok=True)
536
+
537
+ if args.audio_mode=='localfile':
538
+ if len(input_data['cond_audio'])==2:
539
+ new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
540
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
541
+ audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
542
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
543
+ emb2_path = os.path.join(args.audio_save_dir, '2.pt')
544
+ sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
545
+ sf.write(sum_audio, sum_human_speechs, 16000)
546
+ torch.save(audio_embedding_1, emb1_path)
547
+ torch.save(audio_embedding_2, emb2_path)
548
+ input_data['cond_audio']['person1'] = emb1_path
549
+ input_data['cond_audio']['person2'] = emb2_path
550
+ input_data['video_audio'] = sum_audio
551
+ elif len(input_data['cond_audio'])==1:
552
+ human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
553
+ audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
554
+ emb_path = os.path.join(args.audio_save_dir, '1.pt')
555
+ sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
556
+ sf.write(sum_audio, human_speech, 16000)
557
+ torch.save(audio_embedding, emb_path)
558
+ input_data['cond_audio']['person1'] = emb_path
559
+ input_data['video_audio'] = sum_audio
560
+ elif args.audio_mode=='tts':
561
+ if 'human2_voice' not in input_data['tts_audio'].keys():
562
+ new_human_speech1, sum_audio = process_tts_single(input_data['tts_audio']['text'], args.audio_save_dir, input_data['tts_audio']['human1_voice'])
563
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
564
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
565
+ torch.save(audio_embedding_1, emb1_path)
566
+ input_data['cond_audio']['person1'] = emb1_path
567
+ input_data['video_audio'] = sum_audio
568
+ else:
569
+ new_human_speech1, new_human_speech2, sum_audio = process_tts_multi(input_data['tts_audio']['text'], args.audio_save_dir, input_data['tts_audio']['human1_voice'], input_data['tts_audio']['human2_voice'])
570
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
571
+ audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
572
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
573
+ emb2_path = os.path.join(args.audio_save_dir, '2.pt')
574
+ torch.save(audio_embedding_1, emb1_path)
575
+ torch.save(audio_embedding_2, emb2_path)
576
+ input_data['cond_audio']['person1'] = emb1_path
577
+ input_data['cond_audio']['person2'] = emb2_path
578
+ input_data['video_audio'] = sum_audio
579
+
580
+
581
+ logging.info("Creating MultiTalk pipeline.")
582
+ wan_i2v = wan.MultiTalkPipeline(
583
+ config=cfg,
584
+ checkpoint_dir=args.ckpt_dir,
585
+ quant_dir=args.quant_dir,
586
+ device_id=device,
587
+ rank=rank,
588
+ t5_fsdp=args.t5_fsdp,
589
+ dit_fsdp=args.dit_fsdp,
590
+ use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
591
+ t5_cpu=args.t5_cpu,
592
+ lora_dir=args.lora_dir,
593
+ lora_scales=args.lora_scale,
594
+ quant=args.quant
595
+ )
596
+
597
+
598
+ if args.num_persistent_param_in_dit is not None:
599
+ wan_i2v.vram_management = True
600
+ wan_i2v.enable_vram_management(
601
+ num_persistent_param_in_dit=args.num_persistent_param_in_dit
602
+ )
603
+
604
+ logging.info("Generating video ...")
605
+ video = wan_i2v.generate(
606
+ input_data,
607
+ size_buckget=args.size,
608
+ motion_frame=args.motion_frame,
609
+ frame_num=args.frame_num,
610
+ shift=args.sample_shift,
611
+ sampling_steps=args.sample_steps,
612
+ text_guide_scale=args.sample_text_guide_scale,
613
+ audio_guide_scale=args.sample_audio_guide_scale,
614
+ seed=args.base_seed,
615
+ offload_model=args.offload_model,
616
+ max_frames_num=args.frame_num if args.mode == 'clip' else 1000,
617
+ color_correction_strength = args.color_correction_strength,
618
+ extra_args=args,
619
+ )
620
+
621
+
622
+ if rank == 0:
623
+
624
+ if args.save_file is None:
625
+ formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
626
+ formatted_prompt = input_data['prompt'].replace(" ", "_").replace("/",
627
+ "_")[:50]
628
+ args.save_file = f"{args.task}_{args.size.replace('*','x') if sys.platform=='win32' else args.size}_{args.ulysses_size}_{args.ring_size}_{formatted_prompt}_{formatted_time}"
629
+
630
+ logging.info(f"Saving generated video to {args.save_file}.mp4")
631
+ save_video_ffmpeg(video, args.save_file, [input_data['video_audio']], high_quality_save=False)
632
+
633
+ logging.info("Finished.")
634
+
635
+
636
+ if __name__ == "__main__":
637
+ args = _parse_args()
638
+ generate(args)
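For reference, the audio_type switch handled by audio_prepare_multi above can be summarized with a short sketch (illustrative arrays only): 'add' places the two speakers one after the other on a shared timeline by zero-padding each track, while 'para' keeps them overlapping.

    import numpy as np

    speech1 = np.random.rand(16000)   # 1 s of speaker 1 at 16 kHz (placeholder data)
    speech2 = np.random.rand(16000)   # 1 s of speaker 2

    # audio_type == 'add': sequential turns, each track padded with silence
    new1 = np.concatenate([speech1, np.zeros(len(speech2))])
    new2 = np.concatenate([np.zeros(len(speech1)), speech2])

    # audio_type == 'para': both tracks stay as-is and overlap in time
    # new1, new2 = speech1, speech2

    mixed = new1 + new2               # this sum is what gets written to sum.wav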
kokoro/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ __version__ = '0.9.4'
2
+
3
+ from loguru import logger
4
+ import sys
5
+
6
+ # Remove default handler
7
+ logger.remove()
8
+
9
+ # Add custom handler with clean format including module and line number
10
+ logger.add(
11
+ sys.stderr,
12
+ format="<green>{time:HH:mm:ss}</green> | <cyan>{module:>16}:{line}</cyan> | <level>{level: >8}</level> | <level>{message}</level>",
13
+ colorize=True,
14
+ level="INFO" # "DEBUG" to enable logger.debug("message") and up prints
15
+ # "ERROR" to enable only logger.error("message") prints
16
+ # etc
17
+ )
18
+
19
+ # Disable before release or as needed
20
+ logger.disable("kokoro")
21
+
22
+ from .model import KModel
23
+ from .pipeline import KPipeline
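Since the package disables its own logger on import (logger.disable("kokoro") above), downstream code that wants to see Kokoro's log output has to re-enable it. A one-line sketch, assuming loguru's standard enable/disable mechanism:

    from loguru import logger

    logger.enable("kokoro")   # turn Kokoro's INFO-level messages back on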
kokoro/__main__.py ADDED
@@ -0,0 +1,148 @@
1
+ """Kokoro TTS CLI
2
+ Example usage:
3
+ python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
4
+
5
+ echo "Bom dia mundo, como vão vocês" > text.txt
6
+ python3 -m kokoro -i text.txt -l p --voice pm_alex > audio.wav
7
+
8
+ Common issues:
9
+ pip not installed: `uv pip install pip`
10
+ (Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
11
+
12
+ espeak not installed: `apt-get install espeak-ng`
13
+ """
14
+
15
+ import argparse
16
+ import wave
17
+ from pathlib import Path
18
+ from typing import Generator, TYPE_CHECKING
19
+
20
+ import numpy as np
21
+ from loguru import logger
22
+
23
+ languages = [
24
+ "a", # American English
25
+ "b", # British English
26
+ "h", # Hindi
27
+ "e", # Spanish
28
+ "f", # French
29
+ "i", # Italian
30
+ "p", # Brazilian Portuguese
31
+ "j", # Japanese
32
+ "z", # Mandarin Chinese
33
+ ]
34
+
35
+ if TYPE_CHECKING:
36
+ from kokoro import KPipeline
37
+
38
+
39
+ def generate_audio(
40
+ text: str, kokoro_language: str, voice: str, speed=1
41
+ ) -> Generator["KPipeline.Result", None, None]:
42
+ from kokoro import KPipeline
43
+
44
+ if not voice.startswith(kokoro_language):
45
+ logger.warning(f"Voice {voice} is not made for language {kokoro_language}")
46
+ pipeline = KPipeline(lang_code=kokoro_language)
47
+ yield from pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+")
48
+
49
+
50
+ def generate_and_save_audio(
51
+ output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
52
+ ) -> None:
53
+ with wave.open(str(output_file.resolve()), "wb") as wav_file:
54
+ wav_file.setnchannels(1) # Mono audio
55
+ wav_file.setsampwidth(2) # 2 bytes per sample (16-bit audio)
56
+ wav_file.setframerate(24000) # Sample rate
57
+
58
+ for result in generate_audio(
59
+ text, kokoro_language=kokoro_language, voice=voice, speed=speed
60
+ ):
61
+ logger.debug(result.phonemes)
62
+ if result.audio is None:
63
+ continue
64
+ audio_bytes = (result.audio.numpy() * 32767).astype(np.int16).tobytes()
65
+ wav_file.writeframes(audio_bytes)
66
+
67
+
68
+ def main() -> None:
69
+ parser = argparse.ArgumentParser()
70
+ parser.add_argument(
71
+ "-m",
72
+ "--voice",
73
+ default="af_heart",
74
+ help="Voice to use",
75
+ )
76
+ parser.add_argument(
77
+ "-l",
78
+ "--language",
79
+ help="Language to use (defaults to the one corresponding to the voice)",
80
+ choices=languages,
81
+ )
82
+ parser.add_argument(
83
+ "-o",
84
+ "--output-file",
85
+ "--output_file",
86
+ type=Path,
87
+ help="Path to output WAV file",
88
+ required=True,
89
+ )
90
+ parser.add_argument(
91
+ "-i",
92
+ "--input-file",
93
+ "--input_file",
94
+ type=Path,
95
+ help="Path to input text file (default: stdin)",
96
+ )
97
+ parser.add_argument(
98
+ "-t",
99
+ "--text",
100
+ help="Text to use instead of reading from stdin",
101
+ )
102
+ parser.add_argument(
103
+ "-s",
104
+ "--speed",
105
+ type=float,
106
+ default=1.0,
107
+ help="Speech speed",
108
+ )
109
+ parser.add_argument(
110
+ "--debug",
111
+ action="store_true",
112
+ help="Print DEBUG messages to console",
113
+ )
114
+ args = parser.parse_args()
115
+ if args.debug:
116
+ logger.level("DEBUG")
117
+ logger.debug(args)
118
+
119
+ lang = args.language or args.voice[0]
120
+
121
+ if args.text is not None and args.input_file is not None:
122
+ raise Exception("You cannot specify both 'text' and 'input_file'")
123
+ elif args.text:
124
+ text = args.text
125
+ elif args.input_file:
126
+ file: Path = args.input_file
127
+ text = file.read_text()
128
+ else:
129
+ import sys
130
+ print("Press Ctrl+D to stop reading input and start generating", flush=True)
131
+ text = '\n'.join(sys.stdin)
132
+
133
+ logger.debug(f"Input text: {text!r}")
134
+
135
+ out_file: Path = args.output_file
136
+ if not out_file.suffix == ".wav":
137
+ logger.warning("The output file name should end with .wav")
138
+ generate_and_save_audio(
139
+ output_file=out_file,
140
+ text=text,
141
+ kokoro_language=lang,
142
+ voice=args.voice,
143
+ speed=args.speed,
144
+ )
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
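Besides the CLI usage shown in the docstring at the top of this file, the helper defined above can also be called directly from Python. A minimal sketch; the output path, text, and voice name are illustrative:

    from pathlib import Path
    from kokoro.__main__ import generate_and_save_audio

    generate_and_save_audio(
        output_file=Path("hello.wav"),
        text="Hello from Kokoro.",
        kokoro_language="a",   # American English, per the languages list above
        voice="af_heart",
        speed=1.0,
    )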
kokoro/custom_stft.py ADDED
@@ -0,0 +1,197 @@
1
+ from attr import attr
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ class CustomSTFT(nn.Module):
8
+ """
9
+ STFT/iSTFT without unfold/complex ops, using conv1d and conv_transpose1d.
10
+
11
+ - forward STFT => Real-part conv1d + Imag-part conv1d
12
+ - inverse STFT => Real-part conv_transpose1d + Imag-part conv_transpose1d + sum
13
+ - avoids F.unfold, so easier to export to ONNX
14
+ - uses replicate or constant padding for 'center=True' to approximate 'reflect'
15
+ (reflect is not supported for dynamic shapes in ONNX)
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ filter_length=800,
21
+ hop_length=200,
22
+ win_length=800,
23
+ window="hann",
24
+ center=True,
25
+ pad_mode="replicate", # or 'constant'
26
+ ):
27
+ super().__init__()
28
+ self.filter_length = filter_length
29
+ self.hop_length = hop_length
30
+ self.win_length = win_length
31
+ self.n_fft = filter_length
32
+ self.center = center
33
+ self.pad_mode = pad_mode
34
+
35
+ # Number of frequency bins for real-valued STFT with onesided=True
36
+ self.freq_bins = self.n_fft // 2 + 1
37
+
38
+ # Build window
39
+ assert window == 'hann', window
40
+ window_tensor = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
41
+ if self.win_length < self.n_fft:
42
+ # Zero-pad up to n_fft
43
+ extra = self.n_fft - self.win_length
44
+ window_tensor = F.pad(window_tensor, (0, extra))
45
+ elif self.win_length > self.n_fft:
46
+ window_tensor = window_tensor[: self.n_fft]
47
+ self.register_buffer("window", window_tensor)
48
+
49
+ # Precompute forward DFT (real, imag)
50
+ # PyTorch stft uses e^{-j 2 pi k n / N} => real=cos(...), imag=-sin(...)
51
+ n = np.arange(self.n_fft)
52
+ k = np.arange(self.freq_bins)
53
+ angle = 2 * np.pi * np.outer(k, n) / self.n_fft # shape (freq_bins, n_fft)
54
+ dft_real = np.cos(angle)
55
+ dft_imag = -np.sin(angle) # note negative sign
56
+
57
+ # Combine window and dft => shape (freq_bins, filter_length)
58
+ # We'll make 2 conv weight tensors of shape (freq_bins, 1, filter_length).
59
+ forward_window = window_tensor.numpy() # shape (n_fft,)
60
+ forward_real = dft_real * forward_window # (freq_bins, n_fft)
61
+ forward_imag = dft_imag * forward_window
62
+
63
+ # Convert to PyTorch
64
+ forward_real_torch = torch.from_numpy(forward_real).float()
65
+ forward_imag_torch = torch.from_numpy(forward_imag).float()
66
+
67
+ # Register as Conv1d weight => (out_channels, in_channels, kernel_size)
68
+ # out_channels = freq_bins, in_channels=1, kernel_size=n_fft
69
+ self.register_buffer(
70
+ "weight_forward_real", forward_real_torch.unsqueeze(1)
71
+ )
72
+ self.register_buffer(
73
+ "weight_forward_imag", forward_imag_torch.unsqueeze(1)
74
+ )
75
+
76
+ # Precompute inverse DFT
77
+ # Real iFFT formula => scale = 1/n_fft, doubling for bins 1..freq_bins-2 if n_fft even, etc.
78
+ # For simplicity, we won't do the "DC/nyquist not doubled" approach here.
79
+ # If you want perfect real iSTFT, you can add that logic.
80
+ # This version just yields good approximate reconstruction with Hann + typical overlap.
81
+ inv_scale = 1.0 / self.n_fft
82
+ n = np.arange(self.n_fft)
83
+ angle_t = 2 * np.pi * np.outer(n, k) / self.n_fft # shape (n_fft, freq_bins)
84
+ idft_cos = np.cos(angle_t).T # => (freq_bins, n_fft)
85
+ idft_sin = np.sin(angle_t).T # => (freq_bins, n_fft)
86
+
87
+ # Multiply by window again for typical overlap-add
88
+ # We also incorporate the scale factor 1/n_fft
89
+ inv_window = window_tensor.numpy() * inv_scale
90
+ backward_real = idft_cos * inv_window # (freq_bins, n_fft)
91
+ backward_imag = idft_sin * inv_window
92
+
93
+ # We'll implement iSTFT as real+imag conv_transpose with stride=hop.
94
+ self.register_buffer(
95
+ "weight_backward_real", torch.from_numpy(backward_real).float().unsqueeze(1)
96
+ )
97
+ self.register_buffer(
98
+ "weight_backward_imag", torch.from_numpy(backward_imag).float().unsqueeze(1)
99
+ )
100
+
101
+
102
+
103
+ def transform(self, waveform: torch.Tensor):
104
+ """
105
+ Forward STFT => returns magnitude, phase
106
+ Output shape => (batch, freq_bins, frames)
107
+ """
108
+ # waveform shape => (B, T). conv1d expects (B, 1, T).
109
+ # Optional center pad
110
+ if self.center:
111
+ pad_len = self.n_fft // 2
112
+ waveform = F.pad(waveform, (pad_len, pad_len), mode=self.pad_mode)
113
+
114
+ x = waveform.unsqueeze(1) # => (B, 1, T)
115
+ # Convolution to get real part => shape (B, freq_bins, frames)
116
+ real_out = F.conv1d(
117
+ x,
118
+ self.weight_forward_real,
119
+ bias=None,
120
+ stride=self.hop_length,
121
+ padding=0,
122
+ )
123
+ # Imag part
124
+ imag_out = F.conv1d(
125
+ x,
126
+ self.weight_forward_imag,
127
+ bias=None,
128
+ stride=self.hop_length,
129
+ padding=0,
130
+ )
131
+
132
+ # magnitude, phase
133
+ magnitude = torch.sqrt(real_out**2 + imag_out**2 + 1e-14)
134
+ phase = torch.atan2(imag_out, real_out)
135
+ # Handle the case where imag_out is 0 and real_out is negative to correct ONNX atan2 to match PyTorch
136
+ # In this case, PyTorch returns pi, ONNX returns -pi
137
+ correction_mask = (imag_out == 0) & (real_out < 0)
138
+ phase[correction_mask] = torch.pi
139
+ return magnitude, phase
140
+
141
+
142
+ def inverse(self, magnitude: torch.Tensor, phase: torch.Tensor, length=None):
143
+ """
144
+ Inverse STFT => returns waveform shape (B, T).
145
+ """
146
+ # magnitude, phase => (B, freq_bins, frames)
147
+ # Re-create real/imag => shape (B, freq_bins, frames)
148
+ real_part = magnitude * torch.cos(phase)
149
+ imag_part = magnitude * torch.sin(phase)
150
+
151
+ # conv_transpose1d expects (B, in_channels, input_length); here in_channels = freq_bins
152
+ # and input_length = frames, so real_part/imag_part already have the layout
153
+ # conv_transpose1d needs and no reshape is required.
154
+ real_part = real_part # (B, freq_bins, frames)
155
+ imag_part = imag_part
156
+
157
+ # real iSTFT => convolve with "backward_real", "backward_imag", and sum
158
+ # We'll do 2 conv_transpose calls, each giving (B, 1, time),
159
+ # then add them => (B, 1, time).
160
+ real_rec = F.conv_transpose1d(
161
+ real_part,
162
+ self.weight_backward_real, # shape (freq_bins, 1, filter_length)
163
+ bias=None,
164
+ stride=self.hop_length,
165
+ padding=0,
166
+ )
167
+ imag_rec = F.conv_transpose1d(
168
+ imag_part,
169
+ self.weight_backward_imag,
170
+ bias=None,
171
+ stride=self.hop_length,
172
+ padding=0,
173
+ )
174
+ # sum => (B, 1, time)
175
+ waveform = real_rec - imag_rec # typical real iFFT has minus for imaginary part
176
+
177
+ # If we used "center=True" in forward, we should remove pad
178
+ if self.center:
179
+ pad_len = self.n_fft // 2
180
+ # Because of transposed convolution, total length might have extra samples
181
+ # We remove `pad_len` from start & end if possible
182
+ waveform = waveform[..., pad_len:-pad_len]
183
+
184
+ # If a specific length is desired, clamp
185
+ if length is not None:
186
+ waveform = waveform[..., :length]
187
+
188
+ # shape => (B, T)
189
+ return waveform
190
+
191
+ def forward(self, x: torch.Tensor):
192
+ """
193
+ Full STFT -> iSTFT pass: returns time-domain reconstruction.
194
+ Same interface as your original code.
195
+ """
196
+ mag, phase = self.transform(x)
197
+ return self.inverse(mag, phase, length=x.shape[-1])
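A minimal sanity-check sketch for CustomSTFT, assuming the default filter_length=800 / hop_length=200 Hann configuration; reconstruction is only approximate because the inverse skips the DC/Nyquist weighting mentioned in the comments above.

import torch
from kokoro.custom_stft import CustomSTFT

stft = CustomSTFT()                      # conv1d-based STFT/iSTFT, no complex ops
wave = torch.randn(1, 24000)             # 1 s of noise at 24 kHz, shape (B, T)

mag, phase = stft.transform(wave)        # (1, 401, 121): n_fft//2+1 bins x frames
recon = stft.inverse(mag, phase, length=wave.shape[-1])

# conv_transpose1d keeps a channel dim, so recon is (1, 1, 24000)
print(mag.shape, recon.shape, (recon - wave).abs().mean())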
kokoro/istftnet.py ADDED
@@ -0,0 +1,421 @@
1
+ # ADAPTED from https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
2
+ from kokoro.custom_stft import CustomSTFT
3
+ from torch.nn.utils import weight_norm
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+
10
+ # https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py
11
+ def init_weights(m, mean=0.0, std=0.01):
12
+ classname = m.__class__.__name__
13
+ if classname.find("Conv") != -1:
14
+ m.weight.data.normal_(mean, std)
15
+
16
+ def get_padding(kernel_size, dilation=1):
17
+ return int((kernel_size*dilation - dilation)/2)
18
+
19
+
20
+ class AdaIN1d(nn.Module):
21
+ def __init__(self, style_dim, num_features):
22
+ super().__init__()
23
+ # affine should be False; however, a bug in the old torch.onnx.export (not the newer dynamo exporter) loses the channel dimension when affine=False. With affine=True there are additional learnable parameters, but that shouldn't matter here since we run in inference mode
24
+ self.norm = nn.InstanceNorm1d(num_features, affine=True)
25
+ self.fc = nn.Linear(style_dim, num_features*2)
26
+
27
+ def forward(self, x, s):
28
+ h = self.fc(s)
29
+ h = h.view(h.size(0), h.size(1), 1)
30
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
31
+ return (1 + gamma) * self.norm(x) + beta
32
+
33
+
34
+ class AdaINResBlock1(nn.Module):
35
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
36
+ super(AdaINResBlock1, self).__init__()
37
+ self.convs1 = nn.ModuleList([
38
+ weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
39
+ padding=get_padding(kernel_size, dilation[0]))),
40
+ weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
41
+ padding=get_padding(kernel_size, dilation[1]))),
42
+ weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
43
+ padding=get_padding(kernel_size, dilation[2])))
44
+ ])
45
+ self.convs1.apply(init_weights)
46
+ self.convs2 = nn.ModuleList([
47
+ weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=1,
48
+ padding=get_padding(kernel_size, 1))),
49
+ weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=1,
50
+ padding=get_padding(kernel_size, 1))),
51
+ weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=1,
52
+ padding=get_padding(kernel_size, 1)))
53
+ ])
54
+ self.convs2.apply(init_weights)
55
+ self.adain1 = nn.ModuleList([
56
+ AdaIN1d(style_dim, channels),
57
+ AdaIN1d(style_dim, channels),
58
+ AdaIN1d(style_dim, channels),
59
+ ])
60
+ self.adain2 = nn.ModuleList([
61
+ AdaIN1d(style_dim, channels),
62
+ AdaIN1d(style_dim, channels),
63
+ AdaIN1d(style_dim, channels),
64
+ ])
65
+ self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
66
+ self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
67
+
68
+ def forward(self, x, s):
69
+ for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
70
+ xt = n1(x, s)
71
+ xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
72
+ xt = c1(xt)
73
+ xt = n2(xt, s)
74
+ xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
75
+ xt = c2(xt)
76
+ x = xt + x
77
+ return x
78
+
79
+
80
+ class TorchSTFT(nn.Module):
81
+ def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
82
+ super().__init__()
83
+ self.filter_length = filter_length
84
+ self.hop_length = hop_length
85
+ self.win_length = win_length
86
+ assert window == 'hann', window
87
+ self.window = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
88
+
89
+ def transform(self, input_data):
90
+ forward_transform = torch.stft(
91
+ input_data,
92
+ self.filter_length, self.hop_length, self.win_length, window=self.window.to(input_data.device),
93
+ return_complex=True)
94
+ return torch.abs(forward_transform), torch.angle(forward_transform)
95
+
96
+ def inverse(self, magnitude, phase):
97
+ inverse_transform = torch.istft(
98
+ magnitude * torch.exp(phase * 1j),
99
+ self.filter_length, self.hop_length, self.win_length, window=self.window.to(magnitude.device))
100
+ return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
101
+
102
+ def forward(self, input_data):
103
+ self.magnitude, self.phase = self.transform(input_data)
104
+ reconstruction = self.inverse(self.magnitude, self.phase)
105
+ return reconstruction
106
+
107
+
108
+ class SineGen(nn.Module):
109
+ """ Definition of sine generator
110
+ SineGen(samp_rate, upsample_scale, harmonic_num=0,
111
+ sine_amp=0.1, noise_std=0.003,
112
+ voiced_threshold=0,
113
+ flag_for_pulse=False)
114
+ samp_rate: sampling rate in Hz
115
+ harmonic_num: number of harmonic overtones (default 0)
116
+ sine_amp: amplitude of sine waveform (default 0.1)
117
+ noise_std: std of Gaussian noise (default 0.003)
118
+ voiced_threshold: F0 threshold for U/V classification (default 0)
119
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
120
+ Note: when flag_for_pulse is True, the first time step of a voiced
121
+ segment is always sin(torch.pi) or cos(0)
122
+ """
123
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
124
+ sine_amp=0.1, noise_std=0.003,
125
+ voiced_threshold=0,
126
+ flag_for_pulse=False):
127
+ super(SineGen, self).__init__()
128
+ self.sine_amp = sine_amp
129
+ self.noise_std = noise_std
130
+ self.harmonic_num = harmonic_num
131
+ self.dim = self.harmonic_num + 1
132
+ self.sampling_rate = samp_rate
133
+ self.voiced_threshold = voiced_threshold
134
+ self.flag_for_pulse = flag_for_pulse
135
+ self.upsample_scale = upsample_scale
136
+
137
+ def _f02uv(self, f0):
138
+ # generate uv signal
139
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
140
+ return uv
141
+
142
+ def _f02sine(self, f0_values):
143
+ """ f0_values: (batchsize, length, dim)
144
+ where dim indicates fundamental tone and overtones
145
+ """
146
+ # convert to F0 in rad. The integer part n can be ignored
147
+ # because 2 * torch.pi * n doesn't affect phase
148
+ rad_values = (f0_values / self.sampling_rate) % 1
149
+ # initial phase noise (no noise for fundamental component)
150
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
151
+ rand_ini[:, 0] = 0
152
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
153
+ # instantaneous phase: sine[t] = sin(2*pi * sum_{i=1}^{t} rad)
154
+ if not self.flag_for_pulse:
155
+ rad_values = F.interpolate(rad_values.transpose(1, 2), scale_factor=1/self.upsample_scale, mode="linear").transpose(1, 2)
156
+ phase = torch.cumsum(rad_values, dim=1) * 2 * torch.pi
157
+ phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
158
+ sines = torch.sin(phase)
159
+ else:
160
+ # If necessary, make sure that the first time step of every
161
+ # voiced segment is sin(pi) or cos(0)
162
+ # This is used for pulse-train generation
163
+ # identify the last time step in unvoiced segments
164
+ uv = self._f02uv(f0_values)
165
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
166
+ uv_1[:, -1, :] = 1
167
+ u_loc = (uv < 1) * (uv_1 > 0)
168
+ # get the instantaneous phase
169
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
170
+ # different batch needs to be processed differently
171
+ for idx in range(f0_values.shape[0]):
172
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
173
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
174
+ # stores the accumulation of i.phase within
175
+ # each voiced segment
176
+ tmp_cumsum[idx, :, :] = 0
177
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
178
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
179
+ # within the previous voiced segment.
180
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
181
+ # get the sines
182
+ sines = torch.cos(i_phase * 2 * torch.pi)
183
+ return sines
184
+
185
+ def forward(self, f0):
186
+ """ sine_tensor, uv = forward(f0)
187
+ input F0: tensor(batchsize=1, length, dim=1)
188
+ f0 for unvoiced steps should be 0
189
+ output sine_tensor: tensor(batchsize=1, length, dim)
190
+ output uv: tensor(batchsize=1, length, 1)
191
+ """
192
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
193
+ # fundamental component
194
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
195
+ # generate sine waveforms
196
+ sine_waves = self._f02sine(fn) * self.sine_amp
197
+ # generate uv signal
198
+ # uv = torch.ones(f0.shape)
199
+ # uv = uv * (f0 > self.voiced_threshold)
200
+ uv = self._f02uv(f0)
201
+ # noise: for unvoiced should be similar to sine_amp
202
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
203
+ # for voiced regions is self.noise_std
204
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
205
+ noise = noise_amp * torch.randn_like(sine_waves)
206
+ # first: set the unvoiced part to 0 by uv
207
+ # then: additive noise
208
+ sine_waves = sine_waves * uv + noise
209
+ return sine_waves, uv, noise
210
+
211
+
212
+ class SourceModuleHnNSF(nn.Module):
213
+ """ SourceModule for hn-nsf
214
+ SourceModule(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
216
+ add_noise_std=0.003, voiced_threshod=0)
217
+ sampling_rate: sampling rate in Hz
217
+ harmonic_num: number of harmonic above F0 (default: 0)
218
+ sine_amp: amplitude of sine source signal (default: 0.1)
219
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
220
+ note that amplitude of noise in unvoiced is decided
221
+ by sine_amp
222
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
223
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
224
+ F0_sampled (batchsize, length, 1)
225
+ Sine_source (batchsize, length, 1)
226
+ noise_source (batchsize, length, 1)
227
+ uv (batchsize, length, 1)
228
+ """
229
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
230
+ add_noise_std=0.003, voiced_threshod=0):
231
+ super(SourceModuleHnNSF, self).__init__()
232
+ self.sine_amp = sine_amp
233
+ self.noise_std = add_noise_std
234
+ # to produce sine waveforms
235
+ self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
236
+ sine_amp, add_noise_std, voiced_threshod)
237
+ # to merge source harmonics into a single excitation
238
+ self.l_linear = nn.Linear(harmonic_num + 1, 1)
239
+ self.l_tanh = nn.Tanh()
240
+
241
+ def forward(self, x):
242
+ """
243
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
244
+ F0_sampled (batchsize, length, 1)
245
+ Sine_source (batchsize, length, 1)
246
+ noise_source (batchsize, length, 1)
247
+ """
248
+ # source for harmonic branch
249
+ with torch.no_grad():
250
+ sine_wavs, uv, _ = self.l_sin_gen(x)
251
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
252
+ # source for noise branch, in the same shape as uv
253
+ noise = torch.randn_like(uv) * self.sine_amp / 3
254
+ return sine_merge, noise, uv
255
+
256
+
257
+ class Generator(nn.Module):
258
+ def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, disable_complex=False):
259
+ super(Generator, self).__init__()
260
+ self.num_kernels = len(resblock_kernel_sizes)
261
+ self.num_upsamples = len(upsample_rates)
262
+ self.m_source = SourceModuleHnNSF(
263
+ sampling_rate=24000,
264
+ upsample_scale=math.prod(upsample_rates) * gen_istft_hop_size,
265
+ harmonic_num=8, voiced_threshod=10)
266
+ self.f0_upsamp = nn.Upsample(scale_factor=math.prod(upsample_rates) * gen_istft_hop_size)
267
+ self.noise_convs = nn.ModuleList()
268
+ self.noise_res = nn.ModuleList()
269
+ self.ups = nn.ModuleList()
270
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
271
+ self.ups.append(weight_norm(
272
+ nn.ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
273
+ k, u, padding=(k-u)//2)))
274
+ self.resblocks = nn.ModuleList()
275
+ for i in range(len(self.ups)):
276
+ ch = upsample_initial_channel//(2**(i+1))
277
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes,resblock_dilation_sizes)):
278
+ self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
279
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
280
+ if i + 1 < len(upsample_rates):
281
+ stride_f0 = math.prod(upsample_rates[i + 1:])
282
+ self.noise_convs.append(nn.Conv1d(
283
+ gen_istft_n_fft + 2, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
284
+ self.noise_res.append(AdaINResBlock1(c_cur, 7, [1,3,5], style_dim))
285
+ else:
286
+ self.noise_convs.append(nn.Conv1d(gen_istft_n_fft + 2, c_cur, kernel_size=1))
287
+ self.noise_res.append(AdaINResBlock1(c_cur, 11, [1,3,5], style_dim))
288
+ self.post_n_fft = gen_istft_n_fft
289
+ self.conv_post = weight_norm(nn.Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
290
+ self.ups.apply(init_weights)
291
+ self.conv_post.apply(init_weights)
292
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
293
+ self.stft = (
294
+ CustomSTFT(filter_length=gen_istft_n_fft, hop_length=gen_istft_hop_size, win_length=gen_istft_n_fft)
295
+ if disable_complex
296
+ else TorchSTFT(filter_length=gen_istft_n_fft, hop_length=gen_istft_hop_size, win_length=gen_istft_n_fft)
297
+ )
298
+
299
+ def forward(self, x, s, f0):
300
+ with torch.no_grad():
301
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
302
+ har_source, noi_source, uv = self.m_source(f0)
303
+ har_source = har_source.transpose(1, 2).squeeze(1)
304
+ har_spec, har_phase = self.stft.transform(har_source)
305
+ har = torch.cat([har_spec, har_phase], dim=1)
306
+ for i in range(self.num_upsamples):
307
+ x = F.leaky_relu(x, negative_slope=0.1)
308
+ x_source = self.noise_convs[i](har)
309
+ x_source = self.noise_res[i](x_source, s)
310
+ x = self.ups[i](x)
311
+ if i == self.num_upsamples - 1:
312
+ x = self.reflection_pad(x)
313
+ x = x + x_source
314
+ xs = None
315
+ for j in range(self.num_kernels):
316
+ if xs is None:
317
+ xs = self.resblocks[i*self.num_kernels+j](x, s)
318
+ else:
319
+ xs += self.resblocks[i*self.num_kernels+j](x, s)
320
+ x = xs / self.num_kernels
321
+ x = F.leaky_relu(x)
322
+ x = self.conv_post(x)
323
+ spec = torch.exp(x[:,:self.post_n_fft // 2 + 1, :])
324
+ phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])
325
+ return self.stft.inverse(spec, phase)
326
+
327
+
328
+ class UpSample1d(nn.Module):
329
+ def __init__(self, layer_type):
330
+ super().__init__()
331
+ self.layer_type = layer_type
332
+
333
+ def forward(self, x):
334
+ if self.layer_type == 'none':
335
+ return x
336
+ else:
337
+ return F.interpolate(x, scale_factor=2, mode='nearest')
338
+
339
+
340
+ class AdainResBlk1d(nn.Module):
341
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), upsample='none', dropout_p=0.0):
342
+ super().__init__()
343
+ self.actv = actv
344
+ self.upsample_type = upsample
345
+ self.upsample = UpSample1d(upsample)
346
+ self.learned_sc = dim_in != dim_out
347
+ self._build_weights(dim_in, dim_out, style_dim)
348
+ self.dropout = nn.Dropout(dropout_p)
349
+ if upsample == 'none':
350
+ self.pool = nn.Identity()
351
+ else:
352
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
353
+
354
+ def _build_weights(self, dim_in, dim_out, style_dim):
355
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
356
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
357
+ self.norm1 = AdaIN1d(style_dim, dim_in)
358
+ self.norm2 = AdaIN1d(style_dim, dim_out)
359
+ if self.learned_sc:
360
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
361
+
362
+ def _shortcut(self, x):
363
+ x = self.upsample(x)
364
+ if self.learned_sc:
365
+ x = self.conv1x1(x)
366
+ return x
367
+
368
+ def _residual(self, x, s):
369
+ x = self.norm1(x, s)
370
+ x = self.actv(x)
371
+ x = self.pool(x)
372
+ x = self.conv1(self.dropout(x))
373
+ x = self.norm2(x, s)
374
+ x = self.actv(x)
375
+ x = self.conv2(self.dropout(x))
376
+ return x
377
+
378
+ def forward(self, x, s):
379
+ out = self._residual(x, s)
380
+ out = (out + self._shortcut(x)) * torch.rsqrt(torch.tensor(2))
381
+ return out
382
+
383
+
384
+ class Decoder(nn.Module):
385
+ def __init__(self, dim_in, style_dim, dim_out,
386
+ resblock_kernel_sizes,
387
+ upsample_rates,
388
+ upsample_initial_channel,
389
+ resblock_dilation_sizes,
390
+ upsample_kernel_sizes,
391
+ gen_istft_n_fft, gen_istft_hop_size,
392
+ disable_complex=False):
393
+ super().__init__()
394
+ self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
395
+ self.decode = nn.ModuleList()
396
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
397
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
398
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
399
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
400
+ self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
401
+ self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
402
+ self.asr_res = nn.Sequential(weight_norm(nn.Conv1d(512, 64, kernel_size=1)))
403
+ self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
404
+ upsample_initial_channel, resblock_dilation_sizes,
405
+ upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, disable_complex=disable_complex)
406
+
407
+ def forward(self, asr, F0_curve, N, s):
408
+ F0 = self.F0_conv(F0_curve.unsqueeze(1))
409
+ N = self.N_conv(N.unsqueeze(1))
410
+ x = torch.cat([asr, F0, N], axis=1)
411
+ x = self.encode(x, s)
412
+ asr_res = self.asr_res(asr)
413
+ res = True
414
+ for block in self.decode:
415
+ if res:
416
+ x = torch.cat([x, asr_res, F0, N], axis=1)
417
+ x = block(x, s)
418
+ if block.upsample_type != "none":
419
+ res = False
420
+ x = self.generator(x, s, F0_curve)
421
+ return x
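A small shape sketch for the two building blocks used throughout the Generator above: AdaIN1d predicts a per-channel scale and shift from the style vector, and AdaINResBlock1 wraps it with the Snake nonlinearity x + (1/a)*sin(a*x)**2. The sizes below are toy values, not Kokoro's actual config.

import torch
from kokoro.istftnet import AdaIN1d, AdaINResBlock1

style_dim, channels, frames = 128, 64, 50           # toy sizes
x = torch.randn(1, channels, frames)                # feature map (B, C, T)
s = torch.randn(1, style_dim)                       # style vector

adain = AdaIN1d(style_dim, channels)
block = AdaINResBlock1(channels, kernel_size=3, dilation=(1, 3, 5), style_dim=style_dim)

print(adain(x, s).shape)   # torch.Size([1, 64, 50]): style-conditioned instance norm
print(block(x, s).shape)   # torch.Size([1, 64, 50]): residual block preserves the shape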
kokoro/model.py ADDED
@@ -0,0 +1,155 @@
1
+ from .istftnet import Decoder
2
+ from .modules import CustomAlbert, ProsodyPredictor, TextEncoder
3
+ from dataclasses import dataclass
4
+ from huggingface_hub import hf_hub_download
5
+ from loguru import logger
6
+ from transformers import AlbertConfig
7
+ from typing import Dict, Optional, Union
8
+ import json
9
+ import torch
10
+ import os
11
+
12
+ class KModel(torch.nn.Module):
13
+ '''
14
+ KModel is a torch.nn.Module with 2 main responsibilities:
15
+ 1. Init weights, downloading config.json + model.pth from HF if needed
16
+ 2. forward(phonemes: str, ref_s: FloatTensor) -> (audio: FloatTensor)
17
+
18
+ You likely only need one KModel instance, and it can be reused across
19
+ multiple KPipelines to avoid redundant memory allocation.
20
+
21
+ Unlike KPipeline, KModel is language-blind.
22
+
23
+ KModel stores self.vocab and thus knows how to map phonemes -> input_ids,
24
+ so there is no need to repeatedly download config.json outside of KModel.
25
+ '''
26
+
27
+ MODEL_NAMES = {
28
+ 'hexgrad/Kokoro-82M': 'kokoro-v1_0.pth',
29
+ 'hexgrad/Kokoro-82M-v1.1-zh': 'kokoro-v1_1-zh.pth',
30
+ }
31
+
32
+ def __init__(
33
+ self,
34
+ repo_id: Optional[str] = None,
35
+ config: Union[Dict, str, None] = None,
36
+ model: Optional[str] = None,
37
+ disable_complex: bool = False
38
+ ):
39
+ super().__init__()
40
+ if repo_id is None:
41
+ repo_id = 'hexgrad/Kokoro-82M'
42
+ print(f"WARNING: Defaulting repo_id to {repo_id}. Pass repo_id='{repo_id}' to suppress this warning.")
43
+ self.repo_id = repo_id
44
+ if not isinstance(config, dict):
45
+ if not config:
46
+ logger.debug("No config provided, downloading from HF")
47
+ config = hf_hub_download(repo_id=repo_id, filename='config.json')
48
+ with open(config, 'r', encoding='utf-8') as r:
49
+ config = json.load(r)
50
+ logger.debug(f"Loaded config: {config}")
51
+ self.vocab = config['vocab']
52
+ self.bert = CustomAlbert(AlbertConfig(vocab_size=config['n_token'], **config['plbert']))
53
+ self.bert_encoder = torch.nn.Linear(self.bert.config.hidden_size, config['hidden_dim'])
54
+ self.context_length = self.bert.config.max_position_embeddings
55
+ self.predictor = ProsodyPredictor(
56
+ style_dim=config['style_dim'], d_hid=config['hidden_dim'],
57
+ nlayers=config['n_layer'], max_dur=config['max_dur'], dropout=config['dropout']
58
+ )
59
+ self.text_encoder = TextEncoder(
60
+ channels=config['hidden_dim'], kernel_size=config['text_encoder_kernel_size'],
61
+ depth=config['n_layer'], n_symbols=config['n_token']
62
+ )
63
+ self.decoder = Decoder(
64
+ dim_in=config['hidden_dim'], style_dim=config['style_dim'],
65
+ dim_out=config['n_mels'], disable_complex=disable_complex, **config['istftnet']
66
+ )
67
+ if not model:
68
+ try:
69
+ model = hf_hub_download(repo_id=repo_id, filename=KModel.MODEL_NAMES[repo_id])
70
+ except Exception:
71
+ model = os.path.join(repo_id, 'kokoro-v1_0.pth')
72
+ for key, state_dict in torch.load(model, map_location='cpu', weights_only=True).items():
73
+ assert hasattr(self, key), key
74
+ try:
75
+ getattr(self, key).load_state_dict(state_dict)
76
+ except Exception:
77
+ logger.debug(f"Did not load {key} from state_dict")
78
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
79
+ getattr(self, key).load_state_dict(state_dict, strict=False)
80
+
81
+ @property
82
+ def device(self):
83
+ return self.bert.device
84
+
85
+ @dataclass
86
+ class Output:
87
+ audio: torch.FloatTensor
88
+ pred_dur: Optional[torch.LongTensor] = None
89
+
90
+ @torch.no_grad()
91
+ def forward_with_tokens(
92
+ self,
93
+ input_ids: torch.LongTensor,
94
+ ref_s: torch.FloatTensor,
95
+ speed: float = 1
96
+ ) -> tuple[torch.FloatTensor, torch.LongTensor]:
97
+ input_lengths = torch.full(
98
+ (input_ids.shape[0],),
99
+ input_ids.shape[-1],
100
+ device=input_ids.device,
101
+ dtype=torch.long
102
+ )
103
+
104
+ text_mask = torch.arange(input_lengths.max()).unsqueeze(0).expand(input_lengths.shape[0], -1).type_as(input_lengths)
105
+ text_mask = torch.gt(text_mask+1, input_lengths.unsqueeze(1)).to(self.device)
106
+ bert_dur = self.bert(input_ids, attention_mask=(~text_mask).int())
107
+ d_en = self.bert_encoder(bert_dur).transpose(-1, -2)
108
+ s = ref_s[:, 128:]
109
+ d = self.predictor.text_encoder(d_en, s, input_lengths, text_mask)
110
+ x, _ = self.predictor.lstm(d)
111
+ duration = self.predictor.duration_proj(x)
112
+ duration = torch.sigmoid(duration).sum(axis=-1) / speed
113
+ pred_dur = torch.round(duration).clamp(min=1).long().squeeze()
114
+ indices = torch.repeat_interleave(torch.arange(input_ids.shape[1], device=self.device), pred_dur)
115
+ pred_aln_trg = torch.zeros((input_ids.shape[1], indices.shape[0]), device=self.device)
116
+ pred_aln_trg[indices, torch.arange(indices.shape[0])] = 1
117
+ pred_aln_trg = pred_aln_trg.unsqueeze(0).to(self.device)
118
+ en = d.transpose(-1, -2) @ pred_aln_trg
119
+ F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
120
+ t_en = self.text_encoder(input_ids, input_lengths, text_mask)
121
+ asr = t_en @ pred_aln_trg
122
+ audio = self.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze()
123
+ return audio, pred_dur
124
+
125
+ def forward(
126
+ self,
127
+ phonemes: str,
128
+ ref_s: torch.FloatTensor,
129
+ speed: float = 1,
130
+ return_output: bool = False
131
+ ) -> Union['KModel.Output', torch.FloatTensor]:
132
+ input_ids = list(filter(lambda i: i is not None, map(lambda p: self.vocab.get(p), phonemes)))
133
+ logger.debug(f"phonemes: {phonemes} -> input_ids: {input_ids}")
134
+ assert len(input_ids)+2 <= self.context_length, (len(input_ids)+2, self.context_length)
135
+ input_ids = torch.LongTensor([[0, *input_ids, 0]]).to(self.device)
136
+ ref_s = ref_s.to(self.device)
137
+ audio, pred_dur = self.forward_with_tokens(input_ids, ref_s, speed)
138
+ audio = audio.squeeze().cpu()
139
+ pred_dur = pred_dur.cpu() if pred_dur is not None else None
140
+ logger.debug(f"pred_dur: {pred_dur}")
141
+ return self.Output(audio=audio, pred_dur=pred_dur) if return_output else audio
142
+
143
+ class KModelForONNX(torch.nn.Module):
144
+ def __init__(self, kmodel: KModel):
145
+ super().__init__()
146
+ self.kmodel = kmodel
147
+
148
+ def forward(
149
+ self,
150
+ input_ids: torch.LongTensor,
151
+ ref_s: torch.FloatTensor,
152
+ speed: float = 1
153
+ ) -> tuple[torch.FloatTensor, torch.LongTensor]:
154
+ waveform, duration = self.kmodel.forward_with_tokens(input_ids, ref_s, speed)
155
+ return waveform, duration
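A hedged usage sketch for KModel. The phoneme string is just an assumed example, and the random ref_s only exercises the tensor shapes (forward slices ref_s[:, :128] for the decoder and ref_s[:, 128:] for the prosody predictor), so the resulting audio will not sound meaningful; real 256-dim style vectors come from the voice packs loaded by KPipeline.

import torch
from kokoro.model import KModel

model = KModel(repo_id='hexgrad/Kokoro-82M').eval()   # downloads config.json + weights from HF
phonemes = 'həlˈoʊ wˈɜɹld'                            # assumed example; unknown symbols are filtered out
ref_s = torch.randn(1, 256)                           # placeholder style vector, not a real voice
out = model(phonemes, ref_s, return_output=True)
print(out.audio.shape, out.pred_dur)                  # 24 kHz mono waveform, per-token durations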
kokoro/modules.py ADDED
@@ -0,0 +1,183 @@
1
+ # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
+ from .istftnet import AdainResBlk1d
3
+ from torch.nn.utils import weight_norm
4
+ from transformers import AlbertModel
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+
11
+ class LinearNorm(nn.Module):
12
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
13
+ super(LinearNorm, self).__init__()
14
+ self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
15
+ nn.init.xavier_uniform_(self.linear_layer.weight, gain=nn.init.calculate_gain(w_init_gain))
16
+
17
+ def forward(self, x):
18
+ return self.linear_layer(x)
19
+
20
+
21
+ class LayerNorm(nn.Module):
22
+ def __init__(self, channels, eps=1e-5):
23
+ super().__init__()
24
+ self.channels = channels
25
+ self.eps = eps
26
+ self.gamma = nn.Parameter(torch.ones(channels))
27
+ self.beta = nn.Parameter(torch.zeros(channels))
28
+
29
+ def forward(self, x):
30
+ x = x.transpose(1, -1)
31
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
+ return x.transpose(1, -1)
33
+
34
+
35
+ class TextEncoder(nn.Module):
36
+ def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
37
+ super().__init__()
38
+ self.embedding = nn.Embedding(n_symbols, channels)
39
+ padding = (kernel_size - 1) // 2
40
+ self.cnn = nn.ModuleList()
41
+ for _ in range(depth):
42
+ self.cnn.append(nn.Sequential(
43
+ weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
44
+ LayerNorm(channels),
45
+ actv,
46
+ nn.Dropout(0.2),
47
+ ))
48
+ self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
49
+
50
+ def forward(self, x, input_lengths, m):
51
+ x = self.embedding(x) # [B, T, emb]
52
+ x = x.transpose(1, 2) # [B, emb, T]
53
+ m = m.unsqueeze(1)
54
+ x.masked_fill_(m, 0.0)
55
+ for c in self.cnn:
56
+ x = c(x)
57
+ x.masked_fill_(m, 0.0)
58
+ x = x.transpose(1, 2) # [B, T, chn]
59
+ lengths = input_lengths if input_lengths.device == torch.device('cpu') else input_lengths.to('cpu')
60
+ x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
61
+ self.lstm.flatten_parameters()
62
+ x, _ = self.lstm(x)
63
+ x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
64
+ x = x.transpose(-1, -2)
65
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]], device=x.device)
66
+ x_pad[:, :, :x.shape[-1]] = x
67
+ x = x_pad
68
+ x.masked_fill_(m, 0.0)
69
+ return x
70
+
71
+
72
+ class AdaLayerNorm(nn.Module):
73
+ def __init__(self, style_dim, channels, eps=1e-5):
74
+ super().__init__()
75
+ self.channels = channels
76
+ self.eps = eps
77
+ self.fc = nn.Linear(style_dim, channels*2)
78
+
79
+ def forward(self, x, s):
80
+ x = x.transpose(-1, -2)
81
+ x = x.transpose(1, -1)
82
+ h = self.fc(s)
83
+ h = h.view(h.size(0), h.size(1), 1)
84
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
85
+ gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
86
+ x = F.layer_norm(x, (self.channels,), eps=self.eps)
87
+ x = (1 + gamma) * x + beta
88
+ return x.transpose(1, -1).transpose(-1, -2)
89
+
90
+
91
+ class ProsodyPredictor(nn.Module):
92
+ def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
93
+ super().__init__()
94
+ self.text_encoder = DurationEncoder(sty_dim=style_dim, d_model=d_hid,nlayers=nlayers, dropout=dropout)
95
+ self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
96
+ self.duration_proj = LinearNorm(d_hid, max_dur)
97
+ self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
98
+ self.F0 = nn.ModuleList()
99
+ self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
100
+ self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
101
+ self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
102
+ self.N = nn.ModuleList()
103
+ self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
104
+ self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
105
+ self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
106
+ self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
107
+ self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
108
+
109
+ def forward(self, texts, style, text_lengths, alignment, m):
110
+ d = self.text_encoder(texts, style, text_lengths, m)
111
+ m = m.unsqueeze(1)
112
+ lengths = text_lengths if text_lengths.device == torch.device('cpu') else text_lengths.to('cpu')
113
+ x = nn.utils.rnn.pack_padded_sequence(d, lengths, batch_first=True, enforce_sorted=False)
114
+ self.lstm.flatten_parameters()
115
+ x, _ = self.lstm(x)
116
+ x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
117
+ x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]], device=x.device)
118
+ x_pad[:, :x.shape[1], :] = x
119
+ x = x_pad
120
+ duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=False))
121
+ en = (d.transpose(-1, -2) @ alignment)
122
+ return duration.squeeze(-1), en
123
+
124
+ def F0Ntrain(self, x, s):
125
+ x, _ = self.shared(x.transpose(-1, -2))
126
+ F0 = x.transpose(-1, -2)
127
+ for block in self.F0:
128
+ F0 = block(F0, s)
129
+ F0 = self.F0_proj(F0)
130
+ N = x.transpose(-1, -2)
131
+ for block in self.N:
132
+ N = block(N, s)
133
+ N = self.N_proj(N)
134
+ return F0.squeeze(1), N.squeeze(1)
135
+
136
+
137
+ class DurationEncoder(nn.Module):
138
+ def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
139
+ super().__init__()
140
+ self.lstms = nn.ModuleList()
141
+ for _ in range(nlayers):
142
+ self.lstms.append(nn.LSTM(d_model + sty_dim, d_model // 2, num_layers=1, batch_first=True, bidirectional=True, dropout=dropout))
143
+ self.lstms.append(AdaLayerNorm(sty_dim, d_model))
144
+ self.dropout = dropout
145
+ self.d_model = d_model
146
+ self.sty_dim = sty_dim
147
+
148
+ def forward(self, x, style, text_lengths, m):
149
+ masks = m
150
+ x = x.permute(2, 0, 1)
151
+ s = style.expand(x.shape[0], x.shape[1], -1)
152
+ x = torch.cat([x, s], axis=-1)
153
+ x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
154
+ x = x.transpose(0, 1)
155
+ x = x.transpose(-1, -2)
156
+ for block in self.lstms:
157
+ if isinstance(block, AdaLayerNorm):
158
+ x = block(x.transpose(-1, -2), style).transpose(-1, -2)
159
+ x = torch.cat([x, s.permute(1, 2, 0)], axis=1)
160
+ x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
161
+ else:
162
+ lengths = text_lengths if text_lengths.device == torch.device('cpu') else text_lengths.to('cpu')
163
+ x = x.transpose(-1, -2)
164
+ x = nn.utils.rnn.pack_padded_sequence(
165
+ x, lengths, batch_first=True, enforce_sorted=False)
166
+ block.flatten_parameters()
167
+ x, _ = block(x)
168
+ x, _ = nn.utils.rnn.pad_packed_sequence(
169
+ x, batch_first=True)
170
+ x = F.dropout(x, p=self.dropout, training=False)
171
+ x = x.transpose(-1, -2)
172
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]], device=x.device)
173
+ x_pad[:, :, :x.shape[-1]] = x
174
+ x = x_pad
175
+
176
+ return x.transpose(-1, -2)
177
+
178
+
179
+ # https://github.com/yl4579/StyleTTS2/blob/main/Utils/PLBERT/util.py
180
+ class CustomAlbert(AlbertModel):
181
+ def forward(self, *args, **kwargs):
182
+ outputs = super().forward(*args, **kwargs)
183
+ return outputs.last_hidden_state
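A minimal shape check for TextEncoder with toy hyper-parameters (not Kokoro's real config). The mask marks padded positions with True, and those steps come back zeroed.

import torch
from kokoro.modules import TextEncoder

enc = TextEncoder(channels=64, kernel_size=5, depth=2, n_symbols=100).eval()
tokens = torch.randint(0, 100, (2, 12))                        # (B, T) token ids
lengths = torch.tensor([12, 9])                                # true length per item
mask = torch.arange(12).unsqueeze(0) >= lengths.unsqueeze(1)   # True where padded
with torch.no_grad():
    out = enc(tokens, lengths, mask)
print(out.shape)   # torch.Size([2, 64, 12])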
kokoro/pipeline.py ADDED
@@ -0,0 +1,445 @@
1
+ from .model import KModel
2
+ from dataclasses import dataclass
3
+ from huggingface_hub import hf_hub_download
4
+ from loguru import logger
5
+ from misaki import en, espeak
6
+ from typing import Callable, Generator, List, Optional, Tuple, Union
7
+ import re
8
+ import torch
9
+ import os
10
+
11
+ ALIASES = {
12
+ 'en-us': 'a',
13
+ 'en-gb': 'b',
14
+ 'es': 'e',
15
+ 'fr-fr': 'f',
16
+ 'hi': 'h',
17
+ 'it': 'i',
18
+ 'pt-br': 'p',
19
+ 'ja': 'j',
20
+ 'zh': 'z',
21
+ }
22
+
23
+ LANG_CODES = dict(
24
+ # pip install misaki[en]
25
+ a='American English',
26
+ b='British English',
27
+
28
+ # espeak-ng
29
+ e='es',
30
+ f='fr-fr',
31
+ h='hi',
32
+ i='it',
33
+ p='pt-br',
34
+
35
+ # pip install misaki[ja]
36
+ j='Japanese',
37
+
38
+ # pip install misaki[zh]
39
+ z='Mandarin Chinese',
40
+ )
41
+
42
+ class KPipeline:
43
+ '''
44
+ KPipeline is a language-aware support class with 2 main responsibilities:
45
+ 1. Perform language-specific G2P, mapping (and chunking) text -> phonemes
46
+ 2. Manage and store voices, lazily downloaded from HF if needed
47
+
48
+ You are expected to have one KPipeline per language. If you have multiple
49
+ KPipelines, you should reuse one KModel instance across all of them.
50
+
51
+ KPipeline is designed to work with a KModel, but this is not required.
52
+ There are 2 ways to pass an existing model into a pipeline:
53
+ 1. On init: us_pipeline = KPipeline(lang_code='a', model=model)
54
+ 2. On call: us_pipeline(text, voice, model=model)
55
+
56
+ By default, KPipeline will automatically initialize its own KModel. To
57
+ suppress this, construct a "quiet" KPipeline with model=False.
58
+
59
+ A "quiet" KPipeline yields (graphemes, phonemes, None) without generating
60
+ any audio. You can use this to phonemize and chunk your text in advance.
61
+
62
+ A "loud" KPipeline _with_ a model yields (graphemes, phonemes, audio).
63
+ '''
64
+ def __init__(
65
+ self,
66
+ lang_code: str,
67
+ repo_id: Optional[str] = None,
68
+ model: Union[KModel, bool] = True,
69
+ trf: bool = False,
70
+ en_callable: Optional[Callable[[str], str]] = None,
71
+ device: Optional[str] = None
72
+ ):
73
+ """Initialize a KPipeline.
74
+
75
+ Args:
76
+ lang_code: Language code for G2P processing
77
+ model: KModel instance, True to create new model, False for no model
78
+ trf: Whether to use transformer-based G2P
79
+ device: Override default device selection ('cuda' or 'cpu', or None for auto)
80
+ If None, will auto-select cuda if available
81
+ If 'cuda' and not available, will explicitly raise an error
82
+ """
83
+ if repo_id is None:
84
+ repo_id = 'hexgrad/Kokoro-82M'
85
+ print(f"WARNING: Defaulting repo_id to {repo_id}. Pass repo_id='{repo_id}' to suppress this warning.")
86
+ config=None
87
+ else:
88
+ config = os.path.join(repo_id, 'config.json')
89
+ self.repo_id = repo_id
90
+ lang_code = lang_code.lower()
91
+ lang_code = ALIASES.get(lang_code, lang_code)
92
+ assert lang_code in LANG_CODES, (lang_code, LANG_CODES)
93
+ self.lang_code = lang_code
94
+ self.model = None
95
+ if isinstance(model, KModel):
96
+ self.model = model
97
+ elif model:
98
+ if device == 'cuda' and not torch.cuda.is_available():
99
+ raise RuntimeError("CUDA requested but not available")
100
+ if device == 'mps' and not torch.backends.mps.is_available():
101
+ raise RuntimeError("MPS requested but not available")
102
+ if device == 'mps' and os.environ.get('PYTORCH_ENABLE_MPS_FALLBACK') != '1':
103
+ raise RuntimeError("MPS requested but fallback not enabled")
104
+ if device is None:
105
+ if torch.cuda.is_available():
106
+ device = 'cuda'
107
+ elif os.environ.get('PYTORCH_ENABLE_MPS_FALLBACK') == '1' and torch.backends.mps.is_available():
108
+ device = 'mps'
109
+ else:
110
+ device = 'cpu'
111
+ try:
112
+ self.model = KModel(repo_id=repo_id, config=config).to(device).eval()
113
+ except RuntimeError as e:
114
+ if device == 'cuda':
115
+ raise RuntimeError(f"""Failed to initialize model on CUDA: {e}.
116
+ Try setting device='cpu' or check CUDA installation.""")
117
+ raise
118
+ self.voices = {}
119
+ if lang_code in 'ab':
120
+ try:
121
+ fallback = espeak.EspeakFallback(british=lang_code=='b')
122
+ except Exception as e:
123
+ logger.warning("EspeakFallback not enabled: OOD words will be skipped")
124
+ logger.warning(str(e))
125
+ fallback = None
126
+ self.g2p = en.G2P(trf=trf, british=lang_code=='b', fallback=fallback, unk='')
127
+ elif lang_code == 'j':
128
+ try:
129
+ from misaki import ja
130
+ self.g2p = ja.JAG2P()
131
+ except ImportError:
132
+ logger.error("You need to `pip install misaki[ja]` to use lang_code='j'")
133
+ raise
134
+ elif lang_code == 'z':
135
+ try:
136
+ from misaki import zh
137
+ self.g2p = zh.ZHG2P(
138
+ version=None if repo_id.endswith('/Kokoro-82M') else '1.1',
139
+ en_callable=en_callable
140
+ )
141
+ except ImportError:
142
+ logger.error("You need to `pip install misaki[zh]` to use lang_code='z'")
143
+ raise
144
+ else:
145
+ language = LANG_CODES[lang_code]
146
+ logger.warning(f"Using EspeakG2P(language='{language}'). Chunking logic not yet implemented, so long texts may be truncated unless you split them with '\\n'.")
147
+ self.g2p = espeak.EspeakG2P(language=language)
148
+
149
+ def load_single_voice(self, voice: str):
150
+ if voice in self.voices:
151
+ return self.voices[voice]
152
+ if voice.endswith('.pt'):
153
+ f = voice
154
+ else:
155
+ f = hf_hub_download(repo_id=self.repo_id, filename=f'voices/{voice}.pt')
156
+ if not voice.startswith(self.lang_code):
157
+ v = LANG_CODES.get(voice, voice)
158
+ p = LANG_CODES.get(self.lang_code, self.lang_code)
159
+ logger.warning(f'Language mismatch, loading {v} voice into {p} pipeline.')
160
+ pack = torch.load(f, weights_only=True)
161
+ self.voices[voice] = pack
162
+ return pack
163
+
164
+ """
165
+ load_voice is a helper function that lazily downloads and loads a voice:
166
+ Single voice can be requested (e.g. 'af_bella') or multiple voices (e.g. 'af_bella,af_jessica').
167
+ If multiple voices are requested, they are averaged.
168
+ Delimiter is optional and defaults to ','.
169
+ """
170
+ def load_voice(self, voice: Union[str, torch.FloatTensor], delimiter: str = ",") -> torch.FloatTensor:
171
+ if isinstance(voice, torch.FloatTensor):
172
+ return voice
173
+ if voice in self.voices:
174
+ return self.voices[voice]
175
+ logger.debug(f"Loading voice: {voice}")
176
+ packs = [self.load_single_voice(v) for v in voice.split(delimiter)]
177
+ if len(packs) == 1:
178
+ return packs[0]
179
+ self.voices[voice] = torch.mean(torch.stack(packs), dim=0)
180
+ return self.voices[voice]
181
+
182
+ @staticmethod
183
+ def tokens_to_ps(tokens: List[en.MToken]) -> str:
184
+ return ''.join(t.phonemes + (' ' if t.whitespace else '') for t in tokens).strip()
185
+
186
+ @staticmethod
187
+ def waterfall_last(
188
+ tokens: List[en.MToken],
189
+ next_count: int,
190
+ waterfall: List[str] = ['!.?…', ':;', ',—'],
191
+ bumps: List[str] = [')', '”']
192
+ ) -> int:
193
+ for w in waterfall:
194
+ z = next((i for i, t in reversed(list(enumerate(tokens))) if t.phonemes in set(w)), None)
195
+ if z is None:
196
+ continue
197
+ z += 1
198
+ if z < len(tokens) and tokens[z].phonemes in bumps:
199
+ z += 1
200
+ if next_count - len(KPipeline.tokens_to_ps(tokens[:z])) <= 510:
201
+ return z
202
+ return len(tokens)
203
+
204
+ @staticmethod
205
+ def tokens_to_text(tokens: List[en.MToken]) -> str:
206
+ return ''.join(t.text + t.whitespace for t in tokens).strip()
207
+
208
+ def en_tokenize(
209
+ self,
210
+ tokens: List[en.MToken]
211
+ ) -> Generator[Tuple[str, str, List[en.MToken]], None, None]:
212
+ tks = []
213
+ pcount = 0
214
+ for t in tokens:
215
+ # American English: ɾ => T
216
+ t.phonemes = '' if t.phonemes is None else t.phonemes#.replace('ɾ', 'T')
217
+ next_ps = t.phonemes + (' ' if t.whitespace else '')
218
+ next_pcount = pcount + len(next_ps.rstrip())
219
+ if next_pcount > 510:
220
+ z = KPipeline.waterfall_last(tks, next_pcount)
221
+ text = KPipeline.tokens_to_text(tks[:z])
222
+ logger.debug(f"Chunking text at {z}: '{text[:30]}{'...' if len(text) > 30 else ''}'")
223
+ ps = KPipeline.tokens_to_ps(tks[:z])
224
+ yield text, ps, tks[:z]
225
+ tks = tks[z:]
226
+ pcount = len(KPipeline.tokens_to_ps(tks))
227
+ if not tks:
228
+ next_ps = next_ps.lstrip()
229
+ tks.append(t)
230
+ pcount += len(next_ps)
231
+ if tks:
232
+ text = KPipeline.tokens_to_text(tks)
233
+ ps = KPipeline.tokens_to_ps(tks)
234
+ yield ''.join(text).strip(), ''.join(ps).strip(), tks
235
+
236
+ @staticmethod
237
+ def infer(
238
+ model: KModel,
239
+ ps: str,
240
+ pack: torch.FloatTensor,
241
+ speed: Union[float, Callable[[int], float]] = 1
242
+ ) -> KModel.Output:
243
+ if callable(speed):
244
+ speed = speed(len(ps))
245
+ return model(ps, pack[len(ps)-1], speed, return_output=True)
246
+
247
+ def generate_from_tokens(
248
+ self,
249
+ tokens: Union[str, List[en.MToken]],
250
+ voice: str,
251
+ speed: float = 1,
252
+ model: Optional[KModel] = None
253
+ ) -> Generator['KPipeline.Result', None, None]:
254
+ """Generate audio from either raw phonemes or pre-processed tokens.
255
+
256
+ Args:
257
+ tokens: Either a phoneme string or list of pre-processed MTokens
258
+ voice: The voice to use for synthesis
259
+ speed: Speech speed modifier (default: 1)
260
+ model: Optional KModel instance (uses pipeline's model if not provided)
261
+
262
+ Yields:
263
+ KPipeline.Result containing the input tokens and generated audio
264
+
265
+ Raises:
266
+ ValueError: If no voice is provided or token sequence exceeds model limits
267
+ """
268
+ model = model or self.model
269
+ if model and voice is None:
270
+ raise ValueError('Specify a voice: pipeline.generate_from_tokens(..., voice="af_heart")')
271
+
272
+ pack = self.load_voice(voice).to(model.device) if model else None
273
+
274
+ # Handle raw phoneme string
275
+ if isinstance(tokens, str):
276
+ logger.debug("Processing phonemes from raw string")
277
+ if len(tokens) > 510:
278
+ raise ValueError(f'Phoneme string too long: {len(tokens)} > 510')
279
+ output = KPipeline.infer(model, tokens, pack, speed) if model else None
280
+ yield self.Result(graphemes='', phonemes=tokens, output=output)
281
+ return
282
+
283
+ logger.debug("Processing MTokens")
284
+ # Handle pre-processed tokens
285
+ for gs, ps, tks in self.en_tokenize(tokens):
286
+ if not ps:
287
+ continue
288
+ elif len(ps) > 510:
289
+ logger.warning(f"Unexpected len(ps) == {len(ps)} > 510 and ps == '{ps}'")
290
+ logger.warning("Truncating to 510 characters")
291
+ ps = ps[:510]
292
+ output = KPipeline.infer(model, ps, pack, speed) if model else None
293
+ if output is not None and output.pred_dur is not None:
294
+ KPipeline.join_timestamps(tks, output.pred_dur)
295
+ yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output)
296
+
297
+ @staticmethod
298
+ def join_timestamps(tokens: List[en.MToken], pred_dur: torch.LongTensor):
299
+ # Multiply by 600 to go from pred_dur frames to sample_rate 24000
300
+ # Equivalent to dividing pred_dur frames by 40 to get timestamp in seconds
301
+ # We will count nice round half-frames, so the divisor is 80
302
+ MAGIC_DIVISOR = 80
303
+ if not tokens or len(pred_dur) < 3:
304
+ # We expect at least 3: <bos>, token, <eos>
305
+ return
306
+ # We track 2 counts, measured in half-frames: (left, right)
307
+ # This way we can cut space characters in half
308
+ # TODO: Is -3 an appropriate offset?
309
+ left = right = 2 * max(0, pred_dur[0].item() - 3)
310
+ # Updates:
311
+ # left = right + (2 * token_dur) + space_dur
312
+ # right = left + space_dur
313
+ i = 1
314
+ for t in tokens:
315
+ if i >= len(pred_dur)-1:
316
+ break
317
+ if not t.phonemes:
318
+ if t.whitespace:
319
+ i += 1
320
+ left = right + pred_dur[i].item()
321
+ right = left + pred_dur[i].item()
322
+ i += 1
323
+ continue
324
+ j = i + len(t.phonemes)
325
+ if j >= len(pred_dur):
326
+ break
327
+ t.start_ts = left / MAGIC_DIVISOR
328
+ token_dur = pred_dur[i: j].sum().item()
329
+ space_dur = pred_dur[j].item() if t.whitespace else 0
330
+ left = right + (2 * token_dur) + space_dur
331
+ t.end_ts = left / MAGIC_DIVISOR
332
+ right = left + space_dur
333
+ i = j + (1 if t.whitespace else 0)
334
+
335
+ @dataclass
336
+ class Result:
337
+ graphemes: str
338
+ phonemes: str
339
+ tokens: Optional[List[en.MToken]] = None
340
+ output: Optional[KModel.Output] = None
341
+ text_index: Optional[int] = None
342
+
343
+ @property
344
+ def audio(self) -> Optional[torch.FloatTensor]:
345
+ return None if self.output is None else self.output.audio
346
+
347
+ @property
348
+ def pred_dur(self) -> Optional[torch.LongTensor]:
349
+ return None if self.output is None else self.output.pred_dur
350
+
351
+ ### MARK: BEGIN BACKWARD COMPAT ###
352
+ def __iter__(self):
353
+ yield self.graphemes
354
+ yield self.phonemes
355
+ yield self.audio
356
+
357
+ def __getitem__(self, index):
358
+ return [self.graphemes, self.phonemes, self.audio][index]
359
+
360
+ def __len__(self):
361
+ return 3
362
+ #### MARK: END BACKWARD COMPAT ####
363
+
364
+ def __call__(
365
+ self,
366
+ text: Union[str, List[str]],
367
+ voice: Optional[str] = None,
368
+ speed: Union[float, Callable[[int], float]] = 1,
369
+ split_pattern: Optional[str] = r'\n+',
370
+ model: Optional[KModel] = None
371
+ ) -> Generator['KPipeline.Result', None, None]:
372
+ model = model or self.model
373
+ if model and voice is None:
374
+ raise ValueError('Specify a voice: en_us_pipeline(text="Hello world!", voice="af_heart")')
375
+ pack = self.load_voice(voice).to(model.device) if model else None
376
+
377
+ # Convert input to list of segments
378
+ if isinstance(text, str):
379
+ text = re.split(split_pattern, text.strip()) if split_pattern else [text]
380
+
381
+ # Process each segment
382
+ for graphemes_index, graphemes in enumerate(text):
383
+ if not graphemes.strip(): # Skip empty segments
384
+ continue
385
+
386
+ # English processing (unchanged)
387
+ if self.lang_code in 'ab':
388
+ logger.debug(f"Processing English text: {graphemes[:50]}{'...' if len(graphemes) > 50 else ''}")
389
+ _, tokens = self.g2p(graphemes)
390
+ for gs, ps, tks in self.en_tokenize(tokens):
391
+ if not ps:
392
+ continue
393
+ elif len(ps) > 510:
394
+ logger.warning(f"Unexpected len(ps) == {len(ps)} > 510 and ps == '{ps}'")
395
+ ps = ps[:510]
396
+ output = KPipeline.infer(model, ps, pack, speed) if model else None
397
+ if output is not None and output.pred_dur is not None:
398
+ KPipeline.join_timestamps(tks, output.pred_dur)
399
+ yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output, text_index=graphemes_index)
400
+
401
+ # Non-English processing with chunking
402
+ else:
403
+ # Split long text into smaller chunks (roughly 400 characters each)
404
+ # Using sentence boundaries when possible
405
+ chunk_size = 400
406
+ chunks = []
407
+
408
+ # Try to split on sentence boundaries first
409
+ sentences = re.split(r'([.!?]+)', graphemes)
410
+ current_chunk = ""
411
+
412
+ for i in range(0, len(sentences), 2):
413
+ sentence = sentences[i]
414
+ # Add the punctuation back if it exists
415
+ if i + 1 < len(sentences):
416
+ sentence += sentences[i + 1]
417
+
418
+ if len(current_chunk) + len(sentence) <= chunk_size:
419
+ current_chunk += sentence
420
+ else:
421
+ if current_chunk:
422
+ chunks.append(current_chunk.strip())
423
+ current_chunk = sentence
424
+
425
+ if current_chunk:
426
+ chunks.append(current_chunk.strip())
427
+
428
+ # If no chunks were created (no sentence boundaries), fall back to character-based chunking
429
+ if not chunks:
430
+ chunks = [graphemes[i:i+chunk_size] for i in range(0, len(graphemes), chunk_size)]
431
+
432
+ # Process each chunk
433
+ for chunk in chunks:
434
+ if not chunk.strip():
435
+ continue
436
+
437
+ ps, _ = self.g2p(chunk)
438
+ if not ps:
439
+ continue
440
+ elif len(ps) > 510:
441
+ logger.warning(f'Truncating len(ps) == {len(ps)} > 510')
442
+ ps = ps[:510]
443
+
444
+ output = KPipeline.infer(model, ps, pack, speed) if model else None
445
+ yield self.Result(graphemes=chunk, phonemes=ps, output=output, text_index=graphemes_index)
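A hedged end-to-end sketch of the pipeline, assuming the misaki/espeak G2P extras used above are installed and that the 'af_heart' voice referenced in the error messages exists in the hexgrad/Kokoro-82M repo; soundfile is just one possible way to write the 24 kHz output and is not in requirements.txt.

import soundfile as sf
from kokoro.pipeline import KPipeline

pipeline = KPipeline(lang_code='a', repo_id='hexgrad/Kokoro-82M')
for i, result in enumerate(pipeline("Hello world! This is a Kokoro test.", voice='af_heart')):
    print(i, result.graphemes, '->', result.phonemes)
    if result.audio is not None:
        sf.write(f'chunk_{i}.wav', result.audio.numpy(), 24000)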
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ opencv-python>=4.9.0.80
2
+ diffusers>=0.31.0
3
+ transformers>=4.49.0
4
+ tokenizers>=0.20.3
5
+ accelerate>=1.1.1
6
+ tqdm
7
+ imageio
8
+ easydict
9
+ ftfy
10
+ dashscope
11
+ imageio-ffmpeg
12
+ scikit-image
13
+ loguru
14
+ gradio>=5.0.0
15
+ numpy>=1.23.5,<2
16
+ xfuser>=0.4.1
17
+ pyloudnorm
18
+ optimum-quanto==0.2.6
src/audio_analysis/torch_utils.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+
5
+ def get_mask_from_lengths(lengths, max_len=None):
6
+ lengths = lengths.to(torch.long)
7
+ if max_len is None:
8
+ max_len = torch.max(lengths).item()
9
+
10
+ ids = torch.arange(0, max_len).unsqueeze(0).expand(lengths.shape[0], -1).to(lengths.device)
11
+ mask = ids < lengths.unsqueeze(1).expand(-1, max_len)
12
+
13
+ return mask
14
+
15
+
16
+ def linear_interpolation(features, seq_len):
17
+ features = features.transpose(1, 2)
18
+ output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
19
+ return output_features.transpose(1, 2)
20
+
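A tiny worked example for the two helpers above: get_mask_from_lengths builds a boolean validity mask from per-item lengths, and linear_interpolation resamples features along the time axis to a target seq_len (presumably to align audio features with video frames).

import torch
from src.audio_analysis.torch_utils import get_mask_from_lengths, linear_interpolation

print(get_mask_from_lengths(torch.tensor([2, 4])))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])

feats = torch.randn(1, 50, 768)                         # (batch, time, channels)
print(linear_interpolation(feats, seq_len=81).shape)    # torch.Size([1, 81, 768])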
src/audio_analysis/wav2vec2.py ADDED
@@ -0,0 +1,125 @@
1
+ from transformers import Wav2Vec2Config, Wav2Vec2Model
2
+ from transformers.modeling_outputs import BaseModelOutput
3
+
4
+ from src.audio_analysis.torch_utils import linear_interpolation
5
+
6
+ # the implementation of Wav2Vec2Model is borrowed from
7
+ # https://github.com/huggingface/transformers/blob/HEAD/src/transformers/models/wav2vec2/modeling_wav2vec2.py
8
+ # initialize our encoder with the pre-trained wav2vec 2.0 weights.
9
+ class Wav2Vec2Model(Wav2Vec2Model):
10
+ def __init__(self, config: Wav2Vec2Config):
11
+ super().__init__(config)
12
+
13
+ def forward(
14
+ self,
15
+ input_values,
16
+ seq_len,
17
+ attention_mask=None,
18
+ mask_time_indices=None,
19
+ output_attentions=None,
20
+ output_hidden_states=None,
21
+ return_dict=None,
22
+ ):
23
+ self.config.output_attentions = True
24
+
25
+ output_hidden_states = (
26
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
27
+ )
28
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
29
+
30
+ extract_features = self.feature_extractor(input_values)
31
+ extract_features = extract_features.transpose(1, 2)
32
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
33
+
34
+ if attention_mask is not None:
35
+ # compute reduced attention_mask corresponding to feature vectors
36
+ attention_mask = self._get_feature_vector_attention_mask(
37
+ extract_features.shape[1], attention_mask, add_adapter=False
38
+ )
39
+
40
+ hidden_states, extract_features = self.feature_projection(extract_features)
41
+ hidden_states = self._mask_hidden_states(
42
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
43
+ )
44
+
45
+ encoder_outputs = self.encoder(
46
+ hidden_states,
47
+ attention_mask=attention_mask,
48
+ output_attentions=output_attentions,
49
+ output_hidden_states=output_hidden_states,
50
+ return_dict=return_dict,
51
+ )
52
+
53
+ hidden_states = encoder_outputs[0]
54
+
55
+ if self.adapter is not None:
56
+ hidden_states = self.adapter(hidden_states)
57
+
58
+ if not return_dict:
59
+ return (hidden_states, ) + encoder_outputs[1:]
60
+ return BaseModelOutput(
61
+ last_hidden_state=hidden_states,
62
+ hidden_states=encoder_outputs.hidden_states,
63
+ attentions=encoder_outputs.attentions,
64
+ )
65
+
66
+
67
+ def feature_extract(
68
+ self,
69
+ input_values,
70
+ seq_len,
71
+ ):
72
+ extract_features = self.feature_extractor(input_values)
73
+ extract_features = extract_features.transpose(1, 2)
74
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
75
+
76
+ return extract_features
77
+
78
+ def encode(
79
+ self,
80
+ extract_features,
81
+ attention_mask=None,
82
+ mask_time_indices=None,
83
+ output_attentions=None,
84
+ output_hidden_states=None,
85
+ return_dict=None,
86
+ ):
87
+ self.config.output_attentions = True
88
+
89
+ output_hidden_states = (
90
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
91
+ )
92
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
93
+
94
+ if attention_mask is not None:
95
+ # compute reduced attention_mask corresponding to feature vectors
96
+ attention_mask = self._get_feature_vector_attention_mask(
97
+ extract_features.shape[1], attention_mask, add_adapter=False
98
+ )
99
+
100
+
101
+ hidden_states, extract_features = self.feature_projection(extract_features)
102
+ hidden_states = self._mask_hidden_states(
103
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
104
+ )
105
+
106
+ encoder_outputs = self.encoder(
107
+ hidden_states,
108
+ attention_mask=attention_mask,
109
+ output_attentions=output_attentions,
110
+ output_hidden_states=output_hidden_states,
111
+ return_dict=return_dict,
112
+ )
113
+
114
+ hidden_states = encoder_outputs[0]
115
+
116
+ if self.adapter is not None:
117
+ hidden_states = self.adapter(hidden_states)
118
+
119
+ if not return_dict:
120
+ return (hidden_states, ) + encoder_outputs[1:]
121
+ return BaseModelOutput(
122
+ last_hidden_state=hidden_states,
123
+ hidden_states=encoder_outputs.hidden_states,
124
+ attentions=encoder_outputs.attentions,
125
+ )
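This subclass keeps the stock Hugging Face weights but resamples the CNN features to a caller-supplied seq_len before the transformer encoder. A minimal sketch of calling it, assuming a standard wav2vec2 checkpoint such as facebook/wav2vec2-base-960h (the checkpoint used by the actual pipeline may differ):

import torch
from src.audio_analysis.wav2vec2 import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()
waveform = torch.randn(1, 16000)       # one second of 16 kHz audio
with torch.no_grad():
    out = model(waveform, seq_len=25)  # CNN features interpolated to 25 frames
print(out.last_hidden_state.shape)     # torch.Size([1, 25, 768])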
src/utils.py ADDED
@@ -0,0 +1,60 @@
1
+ from contextlib import contextmanager
2
+
3
+ import torch
4
+
5
+ @contextmanager
6
+ def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
7
+ old_register_parameter = torch.nn.Module.register_parameter
8
+ if include_buffers:
9
+ old_register_buffer = torch.nn.Module.register_buffer
10
+
11
+ def register_empty_parameter(module, name, param):
12
+ old_register_parameter(module, name, param)
13
+ if param is not None:
14
+ param_cls = type(module._parameters[name])
15
+ kwargs = module._parameters[name].__dict__
16
+ kwargs["requires_grad"] = param.requires_grad
17
+ module._parameters[name] = param_cls(
18
+ module._parameters[name].to(device), **kwargs
19
+ )
20
+
21
+ def register_empty_buffer(module, name, buffer, persistent=True):
22
+ old_register_buffer(module, name, buffer, persistent=persistent)
23
+ if buffer is not None:
24
+ module._buffers[name] = module._buffers[name].to(device)
25
+
26
+ def patch_tensor_constructor(fn):
27
+ def wrapper(*args, **kwargs):
28
+ kwargs["device"] = device
29
+ return fn(*args, **kwargs)
30
+
31
+ return wrapper
32
+
33
+ if include_buffers:
34
+ tensor_constructors_to_patch = {
35
+ torch_function_name: getattr(torch, torch_function_name)
36
+ for torch_function_name in ["empty", "zeros", "ones", "full"]
37
+ }
38
+ else:
39
+ tensor_constructors_to_patch = {}
40
+
41
+ try:
42
+ torch.nn.Module.register_parameter = register_empty_parameter
43
+ if include_buffers:
44
+ torch.nn.Module.register_buffer = register_empty_buffer
45
+ for torch_function_name in tensor_constructors_to_patch.keys():
46
+ setattr(
47
+ torch,
48
+ torch_function_name,
49
+ patch_tensor_constructor(getattr(torch, torch_function_name)),
50
+ )
51
+ yield
52
+ finally:
53
+ torch.nn.Module.register_parameter = old_register_parameter
54
+ if include_buffers:
55
+ torch.nn.Module.register_buffer = old_register_buffer
56
+ for (
57
+ torch_function_name,
58
+ old_torch_function,
59
+ ) in tensor_constructors_to_patch.items():
60
+ setattr(torch, torch_function_name, old_torch_function)
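A sketch of how this context manager is typically used: build a large module on the meta device so that no real memory is allocated, then attach real weights afterwards (the Linear layer below is just an illustration):

import torch
from src.utils import init_weights_on_device

with init_weights_on_device(device=torch.device("meta")):
    layer = torch.nn.Linear(4096, 4096)  # parameters are created on the meta device

print(layer.weight.device)               # meta; real weights get loaded or assigned later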
src/vram_management/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .layers import *
src/vram_management/layers.py ADDED
@@ -0,0 +1,243 @@
1
+ import copy
2
+
3
+ import torch
4
+
5
+ from src.utils import init_weights_on_device
6
+ import optimum.quanto.nn.qlinear as qlinear
7
+
8
+ def cast_to(weight, dtype, device):
9
+ r = torch.empty_like(weight, dtype=dtype, device=device)
10
+ r.copy_(weight)
11
+ return r
12
+
13
+ def cast_to_device(weight, device):
14
+ if hasattr(weight, '__class__') and 'optimum.quanto' in str(weight.__class__):
15
+ return weight.to(device)
16
+ else:
17
+ r = torch.empty_like(weight, device=device)
18
+ r.copy_(weight)
19
+ return r
20
+
21
+ class AutoWrappedModule(torch.nn.Module):
22
+ def __init__(
23
+ self,
24
+ module: torch.nn.Module,
25
+ offload_dtype,
26
+ offload_device,
27
+ onload_dtype,
28
+ onload_device,
29
+ computation_dtype,
30
+ computation_device,
31
+ ):
32
+ super().__init__()
33
+ self.module = module.to(dtype=offload_dtype, device=offload_device)
34
+ self.offload_dtype = offload_dtype
35
+ self.offload_device = offload_device
36
+ self.onload_dtype = onload_dtype
37
+ self.onload_device = onload_device
38
+ self.computation_dtype = computation_dtype
39
+ self.computation_device = computation_device
40
+ self.state = 0
41
+
42
+ def offload(self):
43
+ if self.state == 1 and (
44
+ self.offload_dtype != self.onload_dtype
45
+ or self.offload_device != self.onload_device
46
+ ):
47
+ self.module.to(dtype=self.offload_dtype, device=self.offload_device)
48
+ self.state = 0
49
+
50
+ def onload(self):
51
+ if self.state == 0 and (
52
+ self.offload_dtype != self.onload_dtype
53
+ or self.offload_device != self.onload_device
54
+ ):
55
+ self.module.to(dtype=self.onload_dtype, device=self.onload_device)
56
+ self.state = 1
57
+
58
+ def forward(self, *args, **kwargs):
59
+ if (
60
+ self.onload_dtype == self.computation_dtype
61
+ and self.onload_device == self.computation_device
62
+ ):
63
+ module = self.module
64
+ else:
65
+ module = copy.deepcopy(self.module).to(
66
+ dtype=self.computation_dtype, device=self.computation_device
67
+ )
68
+ return module(*args, **kwargs)
69
+
70
+
71
+
72
+ class AutoWrappedQLinear(qlinear.QLinear):
73
+ def __init__(
74
+ self,
75
+ module: qlinear.QLinear,
76
+ offload_dtype,
77
+ offload_device,
78
+ onload_dtype,
79
+ onload_device,
80
+ computation_dtype,
81
+ computation_device,
82
+ ):
83
+ with init_weights_on_device(device=torch.device("meta")):
84
+ super().__init__(
85
+ in_features=module.in_features,
86
+ out_features=module.out_features,
87
+ bias=module.bias is not None,
88
+ device=offload_device,
89
+ )
90
+ self.weight = module.weight
91
+ self.bias = module.bias
92
+ self.offload_device = offload_device
93
+
94
+ self.onload_device = onload_device
95
+ self.computation_device = computation_device
96
+ self.state = 0
97
+
98
+ def offload(self):
99
+ if self.state == 1 and (
100
+ self.offload_device != self.onload_device
101
+ ):
102
+ self.to(device=self.offload_device)
103
+ self.state = 0
104
+
105
+ def onload(self):
106
+ if self.state == 0 and (
107
+ self.offload_device != self.onload_device
108
+ ):
109
+ self.to(device=self.onload_device)
110
+ self.state = 1
111
+
112
+ def forward(self, x, *args, **kwargs):
113
+ if (
114
+ self.onload_device == self.computation_device
115
+ ):
116
+
117
+ return torch.nn.functional.linear(x, self.weight, bias=self.bias)
118
+ else:
119
+
120
+ qweight = cast_to_device(self.weight, self.computation_device)
121
+ bias = (
122
+ None
123
+ if self.bias is None
124
+ else cast_to_device(self.bias, self.computation_device)
125
+ )
126
+ return torch.nn.functional.linear(x, qweight, bias)
127
+
128
+ class AutoWrappedLinear(torch.nn.Linear):
129
+ def __init__(
130
+ self,
131
+ module: torch.nn.Linear,
132
+ offload_dtype,
133
+ offload_device,
134
+ onload_dtype,
135
+ onload_device,
136
+ computation_dtype,
137
+ computation_device,
138
+ ):
139
+ with init_weights_on_device(device=torch.device("meta")):
140
+ super().__init__(
141
+ in_features=module.in_features,
142
+ out_features=module.out_features,
143
+ bias=module.bias is not None,
144
+ dtype=offload_dtype,
145
+ device=offload_device,
146
+ )
147
+ self.weight = module.weight
148
+ self.bias = module.bias
149
+ self.offload_dtype = offload_dtype
150
+ self.offload_device = offload_device
151
+ self.onload_dtype = onload_dtype
152
+ self.onload_device = onload_device
153
+ self.computation_dtype = computation_dtype
154
+ self.computation_device = computation_device
155
+ self.state = 0
156
+
157
+ def offload(self):
158
+ if self.state == 1 and (
159
+ self.offload_dtype != self.onload_dtype
160
+ or self.offload_device != self.onload_device
161
+ ):
162
+ self.to(dtype=self.offload_dtype, device=self.offload_device)
163
+ self.state = 0
164
+
165
+ def onload(self):
166
+ if self.state == 0 and (
167
+ self.offload_dtype != self.onload_dtype
168
+ or self.offload_device != self.onload_device
169
+ ):
170
+ self.to(dtype=self.onload_dtype, device=self.onload_device)
171
+ self.state = 1
172
+
173
+ def forward(self, x, *args, **kwargs):
174
+ if (
175
+ self.onload_dtype == self.computation_dtype
176
+ and self.onload_device == self.computation_device
177
+ ):
178
+ weight, bias = self.weight, self.bias
179
+ else:
180
+ weight = cast_to(
181
+ self.weight, self.computation_dtype, self.computation_device
182
+ )
183
+ bias = (
184
+ None
185
+ if self.bias is None
186
+ else cast_to(self.bias, self.computation_dtype, self.computation_device)
187
+ )
188
+ return torch.nn.functional.linear(x, weight, bias)
189
+
190
+
191
+ def enable_vram_management_recursively(
192
+ model: torch.nn.Module,
193
+ module_map: dict,
194
+ module_config: dict,
195
+ max_num_param=None,
196
+ overflow_module_config: dict = None,
197
+ total_num_param=0,
198
+ ):
199
+ for name, module in model.named_children():
200
+ for source_module, target_module in module_map.items():
201
+ if isinstance(module, source_module):
202
+ num_param = sum(p.numel() for p in module.parameters())
203
+ # print(str(module) + ':' + str(num_param))
204
+ if (
205
+ max_num_param is not None
206
+ and total_num_param + num_param > max_num_param
207
+ ):
208
+ # print(str(module) + '-->\t\t num:' + str(num_param) + "\t total:" + str(total_num_param))
209
+ module_config_ = overflow_module_config
210
+ else:
211
+ module_config_ = module_config
212
+ module_ = target_module(module, **module_config_)
213
+ setattr(model, name, module_)
214
+ total_num_param += num_param
215
+ break
216
+ else:
217
+ total_num_param = enable_vram_management_recursively(
218
+ module,
219
+ module_map,
220
+ module_config,
221
+ max_num_param,
222
+ overflow_module_config,
223
+ total_num_param,
224
+ )
225
+ return total_num_param
226
+
227
+
228
+ def enable_vram_management(
229
+ model: torch.nn.Module,
230
+ module_map: dict,
231
+ module_config: dict,
232
+ max_num_param=None,
233
+ overflow_module_config: dict = None,
234
+ ):
235
+ enable_vram_management_recursively(
236
+ model,
237
+ module_map,
238
+ module_config,
239
+ max_num_param,
240
+ overflow_module_config,
241
+ total_num_param=0,
242
+ )
243
+ model.vram_management_enabled = True
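A minimal sketch of wiring the wrappers above onto a small model; the dtypes, devices and module_map here are illustrative, the real pipeline supplies its own configuration:

import torch
from src.vram_management.layers import AutoWrappedLinear, enable_vram_management

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))
enable_vram_management(
    model,
    module_map={torch.nn.Linear: AutoWrappedLinear},
    module_config=dict(
        offload_dtype=torch.float32, offload_device="cpu",
        onload_dtype=torch.float32, onload_device="cpu",
        computation_dtype=torch.float32, computation_device="cpu",
    ),
)
# Every Linear is now an AutoWrappedLinear exposing offload()/onload() hooks.
y = model(torch.randn(2, 64))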
wan/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from . import configs, distributed, modules
2
+ from .first_last_frame2video import WanFLF2V
3
+ from .image2video import WanI2V
4
+ from .text2video import WanT2V
5
+ from .vace import WanVace, WanVaceMP
6
+ from .multitalk import MultiTalkPipeline
wan/configs/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import copy
3
+ import os
4
+
5
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
6
+
7
+ from .wan_i2v_14B import i2v_14B
8
+ from .wan_t2v_1_3B import t2v_1_3B
9
+ from .wan_t2v_14B import t2v_14B
10
+ from .wan_multitalk_14B import multitalk_14B
11
+
12
+ # the config of t2i_14B is the same as t2v_14B
13
+ t2i_14B = copy.deepcopy(t2v_14B)
14
+ t2i_14B.__name__ = 'Config: Wan T2I 14B'
15
+
16
+ # the config of flf2v_14B is the same as i2v_14B
17
+ flf2v_14B = copy.deepcopy(i2v_14B)
18
+ flf2v_14B.__name__ = 'Config: Wan FLF2V 14B'
19
+ flf2v_14B.sample_neg_prompt = "镜头切换," + flf2v_14B.sample_neg_prompt
20
+
21
+ WAN_CONFIGS = {
22
+ 't2v-14B': t2v_14B,
23
+ 't2v-1.3B': t2v_1_3B,
24
+ 'i2v-14B': i2v_14B,
25
+ 't2i-14B': t2i_14B,
26
+ 'flf2v-14B': flf2v_14B,
27
+ 'vace-1.3B': t2v_1_3B,
28
+ 'vace-14B': t2v_14B,
29
+ 'multitalk-14B': multitalk_14B,
30
+ }
31
+
32
+ SIZE_CONFIGS = {
33
+ '720*1280': (720, 1280),
34
+ '1280*720': (1280, 720),
35
+ '480*832': (480, 832),
36
+ '832*480': (832, 480),
37
+ '1024*1024': (1024, 1024),
38
+ 'multitalk-480': (640, 640),
39
+ 'multitalk-720': (960, 960),
40
+ }
41
+
42
+ MAX_AREA_CONFIGS = {
43
+ '720*1280': 720 * 1280,
44
+ '1280*720': 1280 * 720,
45
+ '480*832': 480 * 832,
46
+ '832*480': 832 * 480,
47
+ }
48
+
49
+ SUPPORTED_SIZES = {
50
+ 't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
51
+ 't2v-1.3B': ('480*832', '832*480'),
52
+ 'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
53
+ 'flf2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
54
+ 't2i-14B': tuple(SIZE_CONFIGS.keys()),
55
+ 'vace-1.3B': ('480*832', '832*480'),
56
+ 'vace-14B': ('720*1280', '1280*720', '480*832', '832*480'),
57
+ 'multitalk-14B': ('multitalk-480', 'multitalk-720'),
58
+ }
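These dictionaries are how a task name is resolved at inference time; for example (assuming the package and its dependencies are importable):

from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, SUPPORTED_SIZES

cfg = WAN_CONFIGS['multitalk-14B']
print(cfg.dim, cfg.num_heads)              # 5120 40
print(SUPPORTED_SIZES['multitalk-14B'])    # ('multitalk-480', 'multitalk-720')
print(SIZE_CONFIGS['multitalk-480'])       # (640, 640)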
wan/configs/shared_config.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ #------------------------ Wan shared config ------------------------#
6
+ wan_shared_cfg = EasyDict()
7
+
8
+ # t5
9
+ wan_shared_cfg.t5_model = 'umt5_xxl'
10
+ wan_shared_cfg.t5_dtype = torch.bfloat16
11
+ wan_shared_cfg.text_len = 512
12
+
13
+ # transformer
14
+ wan_shared_cfg.param_dtype = torch.bfloat16
15
+
16
+ # inference
17
+ wan_shared_cfg.num_train_timesteps = 1000
18
+ wan_shared_cfg.sample_fps = 16
19
+ wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
wan/configs/wan_i2v_14B.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import wan_shared_cfg
6
+
7
+ #------------------------ Wan I2V 14B ------------------------#
8
+
9
+ i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
10
+ i2v_14B.update(wan_shared_cfg)
11
+ i2v_14B.sample_neg_prompt = "镜头晃动," + i2v_14B.sample_neg_prompt
12
+
13
+ i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
14
+ i2v_14B.t5_tokenizer = 'google/umt5-xxl'
15
+
16
+ # clip
17
+ i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
18
+ i2v_14B.clip_dtype = torch.float16
19
+ i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
20
+ i2v_14B.clip_tokenizer = 'xlm-roberta-large'
21
+
22
+ # vae
23
+ i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
24
+ i2v_14B.vae_stride = (4, 8, 8)
wan/configs/wan_multitalk_14B.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import wan_shared_cfg
6
+
7
+ #------------------------ Wan MultiTalk 14B ------------------------#
8
+
9
+ multitalk_14B = EasyDict(__name__='Config: Wan MultiTalk AI2V 14B')
10
+ multitalk_14B.update(wan_shared_cfg)
11
+ multitalk_14B.sample_neg_prompt = 'bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards'
12
+
13
+ multitalk_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
14
+ multitalk_14B.t5_tokenizer = 'google/umt5-xxl'
15
+
16
+ # clip
17
+ multitalk_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
18
+ multitalk_14B.clip_dtype = torch.float16
19
+ multitalk_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
20
+ multitalk_14B.clip_tokenizer = 'xlm-roberta-large'
21
+
22
+ # vae
23
+ multitalk_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
24
+ multitalk_14B.vae_stride = (4, 8, 8)
25
+
26
+ # transformer
27
+ multitalk_14B.patch_size = (1, 2, 2)
28
+ multitalk_14B.dim = 5120
29
+ multitalk_14B.ffn_dim = 13824
30
+ multitalk_14B.freq_dim = 256
31
+ multitalk_14B.num_heads = 40
32
+ multitalk_14B.num_layers = 40
33
+ multitalk_14B.window_size = (-1, -1)
34
+ multitalk_14B.qk_norm = True
35
+ multitalk_14B.cross_attn_norm = True
36
+ multitalk_14B.eps = 1e-6
wan/configs/wan_t2v_14B.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import wan_shared_cfg
5
+
6
+ #------------------------ Wan T2V 14B ------------------------#
7
+
8
+ t2v_14B = EasyDict(__name__='Config: Wan T2V 14B')
9
+ t2v_14B.update(wan_shared_cfg)
10
+
11
+ # t5
12
+ t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
+ t2v_14B.t5_tokenizer = 'google/umt5-xxl'
14
+
15
+ # vae
16
+ t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
+ t2v_14B.vae_stride = (4, 8, 8)
18
+
19
+ # transformer
20
+ t2v_14B.patch_size = (1, 2, 2)
21
+ t2v_14B.dim = 5120
22
+ t2v_14B.ffn_dim = 13824
23
+ t2v_14B.freq_dim = 256
24
+ t2v_14B.num_heads = 40
25
+ t2v_14B.num_layers = 40
26
+ t2v_14B.window_size = (-1, -1)
27
+ t2v_14B.qk_norm = True
28
+ t2v_14B.cross_attn_norm = True
29
+ t2v_14B.eps = 1e-6
wan/configs/wan_t2v_1_3B.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import wan_shared_cfg
5
+
6
+ #------------------------ Wan T2V 1.3B ------------------------#
7
+
8
+ t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B')
9
+ t2v_1_3B.update(wan_shared_cfg)
10
+
11
+ # t5
12
+ t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
+ t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'
14
+
15
+ # vae
16
+ t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
+ t2v_1_3B.vae_stride = (4, 8, 8)
18
+
19
+ # transformer
20
+ t2v_1_3B.patch_size = (1, 2, 2)
21
+ t2v_1_3B.dim = 1536
22
+ t2v_1_3B.ffn_dim = 8960
23
+ t2v_1_3B.freq_dim = 256
24
+ t2v_1_3B.num_heads = 12
25
+ t2v_1_3B.num_layers = 30
26
+ t2v_1_3B.window_size = (-1, -1)
27
+ t2v_1_3B.qk_norm = True
28
+ t2v_1_3B.cross_attn_norm = True
29
+ t2v_1_3B.eps = 1e-6
wan/distributed/__init__.py ADDED
File without changes
wan/distributed/fsdp.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ from functools import partial
4
+
5
+ import torch
6
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
8
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
9
+ from torch.distributed.utils import _free_storage
10
+
11
+
12
+ def shard_model(
13
+ model,
14
+ device_id,
15
+ param_dtype=torch.bfloat16,
16
+ reduce_dtype=torch.float32,
17
+ buffer_dtype=torch.float32,
18
+ process_group=None,
19
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
20
+ sync_module_states=True,
21
+ ):
22
+ model = FSDP(
23
+ module=model,
24
+ process_group=process_group,
25
+ sharding_strategy=sharding_strategy,
26
+ auto_wrap_policy=partial(
27
+ lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
28
+ # mixed_precision=MixedPrecision(
29
+ # param_dtype=param_dtype,
30
+ # reduce_dtype=reduce_dtype,
31
+ # buffer_dtype=buffer_dtype),
32
+ device_id=device_id,
33
+ sync_module_states=sync_module_states)
34
+ return model
35
+
36
+
37
+ def free_model(model):
38
+ for m in model.modules():
39
+ if isinstance(m, FSDP):
40
+ _free_storage(m._handle.flat_param.data)
41
+ del model
42
+ gc.collect()
43
+ torch.cuda.empty_cache()
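shard_model wraps each transformer block in its own FSDP unit. A sketch of how it might be driven, assuming a torchrun launch with one GPU per process and a model that exposes a .blocks ModuleList (TinyDiT below is a stand-in, not the real WanModel):

import torch
import torch.distributed as dist
from wan.distributed.fsdp import shard_model

class TinyDiT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = torch.nn.ModuleList(torch.nn.Linear(64, 64) for _ in range(4))
    def forward(self, x):
        for blk in self.blocks:
            x = blk(x)
        return x

dist.init_process_group(backend="nccl")
device_id = dist.get_rank() % torch.cuda.device_count()
model = shard_model(TinyDiT(), device_id=device_id)  # each block becomes an FSDP unit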
wan/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,550 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.cuda.amp as amp
6
+ from xfuser.core.distributed import (
7
+ get_sequence_parallel_rank,
8
+ get_sequence_parallel_world_size,
9
+ get_sp_group,
10
+ )
11
+ from einops import rearrange
12
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
13
+ import xformers.ops
14
+
15
+ from ..modules.model import sinusoidal_embedding_1d
16
+ from ..utils.multitalk_utils import get_attn_map_with_target, split_token_counts_and_frame_ids, normalize_and_scale
17
+ from ..modules.attention import SingleStreamAttention, SingleStreamMutiAttention
18
+
19
+
20
+ def pad_freqs(original_tensor, target_len):
21
+ seq_len, s1, s2 = original_tensor.shape
22
+ pad_size = target_len - seq_len
23
+ padding_tensor = torch.ones(
24
+ pad_size,
25
+ s1,
26
+ s2,
27
+ dtype=original_tensor.dtype,
28
+ device=original_tensor.device)
29
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
30
+ return padded_tensor
31
+
32
+
33
+ @amp.autocast(enabled=False)
34
+ def rope_apply(x, grid_sizes, freqs):
35
+ """
36
+ x: [B, L, N, C].
37
+ grid_sizes: [B, 3].
38
+ freqs: [M, C // 2].
39
+ """
40
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
41
+ # split freqs
42
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1) # [[N, head_dim/2], [N, head_dim/2], [N, head_dim/2]] # T H W polar coordinates
43
+
44
+ # loop over samples
45
+ output = []
46
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
47
+ seq_len = f * h * w
48
+
49
+ # precompute multipliers
50
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
51
+ s, n, -1, 2)) # [L, N, C/2] # polar coordinates
52
+ freqs_i = torch.cat([
53
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
54
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
55
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
56
+ ],
57
+ dim=-1).reshape(seq_len, 1, -1) # seq_lens, 1, 3 * dim / 2 (T H W)
58
+
59
+ # apply rotary embedding
60
+ sp_size = get_sequence_parallel_world_size()
61
+ sp_rank = get_sequence_parallel_rank()
62
+ freqs_i = pad_freqs(freqs_i, s * sp_size)
63
+ s_per_rank = s
64
+ freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
65
+ s_per_rank), :, :]
66
+ x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
67
+ x_i = torch.cat([x_i, x[i, s:]])
68
+
69
+ # append to collection
70
+ output.append(x_i)
71
+ return torch.stack(output).float()
72
+
73
+
74
+ def usp_dit_forward_vace(self, x, vace_context, seq_len, kwargs):
75
+ # embeddings
76
+ c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
77
+ c = [u.flatten(2).transpose(1, 2) for u in c]
78
+ c = torch.cat([
79
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
80
+ for u in c
81
+ ])
82
+
83
+ # arguments
84
+ new_kwargs = dict(x=x)
85
+ new_kwargs.update(kwargs)
86
+
87
+ # Context Parallel
88
+ c = torch.chunk(
89
+ c, get_sequence_parallel_world_size(),
90
+ dim=1)[get_sequence_parallel_rank()]
91
+
92
+ hints = []
93
+ for block in self.vace_blocks:
94
+ c, c_skip = block(c, **new_kwargs)
95
+ hints.append(c_skip)
96
+ return hints
97
+
98
+
99
+ def usp_dit_forward(
100
+ self,
101
+ x,
102
+ t,
103
+ context,
104
+ seq_len,
105
+ vace_context=None,
106
+ vace_context_scale=1.0,
107
+ clip_fea=None,
108
+ y=None,
109
+ ):
110
+ """
111
+ x: A list of videos each with shape [C, T, H, W].
112
+ t: [B].
113
+ context: A list of text embeddings each with shape [L, C].
114
+ """
115
+ if self.model_type == 'i2v':
116
+ assert clip_fea is not None and y is not None
117
+ # params
118
+ device = self.patch_embedding.weight.device
119
+ if self.freqs.device != device:
120
+ self.freqs = self.freqs.to(device)
121
+
122
+ if self.model_type != 'vace' and y is not None:
123
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
124
+
125
+ # embeddings
126
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
127
+ grid_sizes = torch.stack(
128
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
129
+ x = [u.flatten(2).transpose(1, 2) for u in x]
130
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
131
+ assert seq_lens.max() <= seq_len
132
+ x = torch.cat([
133
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
134
+ for u in x
135
+ ])
136
+
137
+ # time embeddings
138
+ with amp.autocast(dtype=torch.float32):
139
+ e = self.time_embedding(
140
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
141
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
142
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
143
+
144
+ # context
145
+ context_lens = None
146
+ context = self.text_embedding(
147
+ torch.stack([
148
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
149
+ for u in context
150
+ ]))
151
+
152
+ if self.model_type != 'vace' and clip_fea is not None:
153
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
154
+ context = torch.concat([context_clip, context], dim=1)
155
+
156
+ # arguments
157
+ kwargs = dict(
158
+ e=e0,
159
+ seq_lens=seq_lens,
160
+ grid_sizes=grid_sizes,
161
+ freqs=self.freqs,
162
+ context=context,
163
+ context_lens=context_lens)
164
+
165
+ # Context Parallel
166
+ x = torch.chunk(
167
+ x, get_sequence_parallel_world_size(),
168
+ dim=1)[get_sequence_parallel_rank()]
169
+
170
+ for block in self.blocks:
171
+ x = block(x, **kwargs)
172
+
173
+ # head
174
+ x = self.head(x, e)
175
+
176
+ # Context Parallel
177
+ x = get_sp_group().all_gather(x, dim=1)
178
+
179
+ # unpatchify
180
+ x = self.unpatchify(x, grid_sizes)
181
+ return [u.float() for u in x]
182
+
183
+
184
+ def usp_attn_forward(self,
185
+ x,
186
+ seq_lens,
187
+ grid_sizes,
188
+ freqs,
189
+ dtype=torch.bfloat16):
190
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
191
+ half_dtypes = (torch.float16, torch.bfloat16)
192
+
193
+ def half(x):
194
+ return x if x.dtype in half_dtypes else x.to(dtype)
195
+
196
+ # query, key, value function
197
+ def qkv_fn(x):
198
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
199
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
200
+ v = self.v(x).view(b, s, n, d)
201
+ return q, k, v
202
+
203
+ q, k, v = qkv_fn(x)
204
+ q = rope_apply(q, grid_sizes, freqs)
205
+ k = rope_apply(k, grid_sizes, freqs)
206
+
207
+ # TODO: We should use unpadded q, k, v for attention.
208
+ # k_lens = seq_lens // get_sequence_parallel_world_size()
209
+ # if k_lens is not None:
210
+ # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
211
+ # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
212
+ # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
213
+
214
+ x = xFuserLongContextAttention()(
215
+ None,
216
+ query=half(q),
217
+ key=half(k),
218
+ value=half(v),
219
+ window_size=self.window_size)
220
+
221
+ # TODO: padding after attention.
222
+ # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
223
+
224
+ # output
225
+ x = x.flatten(2)
226
+ x = self.o(x)
227
+ return x
228
+
229
+
230
+
231
+
232
+ def usp_dit_forward_multitalk(
233
+ self,
234
+ x,
235
+ t,
236
+ context,
237
+ seq_len,
238
+ clip_fea=None,
239
+ y=None,
240
+ audio=None,
241
+ ref_target_masks=None,
242
+ ):
243
+ """
244
+ x: A list of videos each with shape [C, T, H, W].
245
+ t: [B].
246
+ context: A list of text embeddings each with shape [L, C].
247
+ """
248
+
249
+ assert clip_fea is not None and y is not None
250
+ # params
251
+ device = self.patch_embedding.weight.device
252
+ if self.freqs.device != device:
253
+ self.freqs = self.freqs.to(device)
254
+
255
+ _, T, H, W = x[0].shape
256
+ N_t = T // self.patch_size[0]
257
+ N_h = H // self.patch_size[1]
258
+ N_w = W // self.patch_size[2]
259
+
260
+ if y is not None:
261
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
262
+ x[0] = x[0].to(context[0].dtype)
263
+
264
+ # embeddings
265
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
266
+ grid_sizes = torch.stack(
267
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
268
+ x = [u.flatten(2).transpose(1, 2) for u in x]
269
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
270
+ assert seq_lens.max() <= seq_len
271
+ x = torch.cat([
272
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
273
+ for u in x
274
+ ])
275
+
276
+ # time embeddings
277
+ with amp.autocast(dtype=torch.float32):
278
+ e = self.time_embedding(
279
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
280
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
281
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
282
+
283
+ # context
284
+ context_lens = None
285
+ context = self.text_embedding(
286
+ torch.stack([
287
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
288
+ for u in context
289
+ ]))
290
+
291
+ if clip_fea is not None:
292
+ context_clip = self.img_emb(clip_fea)
293
+ context = torch.concat([context_clip, context], dim=1)
294
+
295
+ # get audio token
296
+ audio_cond = audio.to(device=x.device, dtype=x.dtype)
297
+ first_frame_audio_emb_s = audio_cond[:, :1, ...]
298
+ latter_frame_audio_emb = audio_cond[:, 1:, ...]
299
+ latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=self.vae_scale)
300
+ middle_index = self.audio_window // 2
301
+ latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...]
302
+ latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
303
+ latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...]
304
+ latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
305
+ latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...]
306
+ latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
307
+ latter_frame_audio_emb_s = torch.concat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2)
308
+ audio_embedding = self.audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s)
309
+ human_num = len(audio_embedding)
310
+ audio_embedding = torch.concat(audio_embedding.split(1), dim=2).to(x.dtype)
311
+
312
+
313
+ # convert ref_target_masks to token_ref_target_masks
314
+ if ref_target_masks is not None:
315
+ ref_target_masks = ref_target_masks.unsqueeze(0).to(torch.float32)
316
+ token_ref_target_masks = nn.functional.interpolate(ref_target_masks, size=(N_h, N_w), mode='nearest')
317
+ token_ref_target_masks = token_ref_target_masks.squeeze(0)
318
+ token_ref_target_masks = (token_ref_target_masks > 0)
319
+ token_ref_target_masks = token_ref_target_masks.view(token_ref_target_masks.shape[0], -1)
320
+ token_ref_target_masks = token_ref_target_masks.to(x.dtype)
321
+
322
+ if self.enable_teacache:
323
+ modulated_inp = e0 if self.use_ret_steps else e
324
+ if self.cnt%3==0: # cond
325
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
326
+ should_calc_cond = True
327
+ self.accumulated_rel_l1_distance_cond = 0
328
+ else:
329
+ rescale_func = np.poly1d(self.coefficients)
330
+ self.accumulated_rel_l1_distance_cond += rescale_func(((modulated_inp-self.previous_e0_cond).abs().mean() / self.previous_e0_cond.abs().mean()).cpu().item())
331
+ # print("accumulated_rel_l1_distance_even", self.accumulated_rel_l1_distance_even)
332
+ if self.accumulated_rel_l1_distance_cond < self.teacache_thresh:
333
+ should_calc_cond = False
334
+ else:
335
+ should_calc_cond = True
336
+ self.accumulated_rel_l1_distance_cond = 0
337
+ self.previous_e0_cond = modulated_inp.clone()
338
+ elif self.cnt%3==1: # drop_text
339
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
340
+ should_calc_drop_text = True
341
+ self.accumulated_rel_l1_distance_drop_text = 0
342
+ else:
343
+ rescale_func = np.poly1d(self.coefficients)
344
+ self.accumulated_rel_l1_distance_drop_text += rescale_func(((modulated_inp-self.previous_e0_drop_text).abs().mean() / self.previous_e0_drop_text.abs().mean()).cpu().item())
345
+ if self.accumulated_rel_l1_distance_drop_text < self.teacache_thresh:
346
+ should_calc_drop_text = False
347
+ else:
348
+ should_calc_drop_text = True
349
+ self.accumulated_rel_l1_distance_drop_text = 0
350
+ self.previous_e0_drop_text = modulated_inp.clone()
351
+ else: # uncond
352
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
353
+ should_calc_uncond = True
354
+ self.accumulated_rel_l1_distance_uncond = 0
355
+ else:
356
+ rescale_func = np.poly1d(self.coefficients)
357
+ self.accumulated_rel_l1_distance_uncond += rescale_func(((modulated_inp-self.previous_e0_uncond).abs().mean() / self.previous_e0_uncond.abs().mean()).cpu().item())
358
+ if self.accumulated_rel_l1_distance_uncond < self.teacache_thresh:
359
+ should_calc_uncond = False
360
+ else:
361
+ should_calc_uncond = True
362
+ self.accumulated_rel_l1_distance_uncond = 0
363
+ self.previous_e0_uncond = modulated_inp.clone()
364
+
365
+ # Context Parallel
366
+ x = torch.chunk(
367
+ x, get_sequence_parallel_world_size(),
368
+ dim=1)[get_sequence_parallel_rank()]
369
+
370
+ # arguments
371
+ kwargs = dict(
372
+ e=e0,
373
+ seq_lens=seq_lens,
374
+ grid_sizes=grid_sizes,
375
+ freqs=self.freqs,
376
+ context=context,
377
+ context_lens=context_lens,
378
+ audio_embedding=audio_embedding,
379
+ ref_target_masks=token_ref_target_masks,
380
+ human_num=human_num,
381
+ )
382
+
383
+ if self.enable_teacache:
384
+ if self.cnt%3==0:
385
+ if not should_calc_cond:
386
+ x += self.previous_residual_cond
387
+ else:
388
+ ori_x = x.clone()
389
+ for block in self.blocks:
390
+ x = block(x, **kwargs)
391
+ self.previous_residual_cond = x - ori_x
392
+ elif self.cnt%3==1:
393
+ if not should_calc_drop_text:
394
+ x += self.previous_residual_drop_text
395
+ else:
396
+ ori_x = x.clone()
397
+ for block in self.blocks:
398
+ x = block(x, **kwargs)
399
+ self.previous_residual_drop_text = x - ori_x
400
+ else:
401
+ if not should_calc_uncond:
402
+ x += self.previous_residual_uncond
403
+ else:
404
+ ori_x = x.clone()
405
+ for block in self.blocks:
406
+ x = block(x, **kwargs)
407
+ self.previous_residual_uncond = x - ori_x
408
+ else:
409
+ for block in self.blocks:
410
+ x = block(x, **kwargs)
411
+
412
+ # head
413
+ x = self.head(x, e)
414
+
415
+ # Context Parallel
416
+ x = get_sp_group().all_gather(x, dim=1)
417
+
418
+ # unpatchify
419
+ x = self.unpatchify(x, grid_sizes)
420
+ if self.enable_teacache:
421
+ self.cnt += 1
422
+ if self.cnt >= self.num_steps:
423
+ self.cnt = 0
424
+
425
+ return torch.stack(x).float()
426
+
427
+
428
+ def usp_attn_forward_multitalk(self,
429
+ x,
430
+ seq_lens,
431
+ grid_sizes,
432
+ freqs,
433
+ dtype=torch.bfloat16,
434
+ ref_target_masks=None):
435
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
436
+ half_dtypes = (torch.float16, torch.bfloat16)
437
+
438
+ def half(x):
439
+ return x if x.dtype in half_dtypes else x.to(dtype)
440
+
441
+ # query, key, value function
442
+ def qkv_fn(x):
443
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
444
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
445
+ v = self.v(x).view(b, s, n, d)
446
+ return q, k, v
447
+
448
+ q, k, v = qkv_fn(x)
449
+ q = rope_apply(q, grid_sizes, freqs)
450
+ k = rope_apply(k, grid_sizes, freqs)
451
+
452
+
453
+ x = xFuserLongContextAttention()(
454
+ None,
455
+ query=half(q),
456
+ key=half(k),
457
+ value=half(v),
458
+ window_size=self.window_size)
459
+
460
+
461
+ # output
462
+ x = x.flatten(2)
463
+ x = self.o(x)
464
+
465
+ with torch.no_grad():
466
+ x_ref_attn_map = get_attn_map_with_target(q.type_as(x), k.type_as(x), grid_sizes[0],
467
+ ref_target_masks=ref_target_masks, enable_sp=True)
468
+
469
+ return x, x_ref_attn_map
470
+
471
+
472
+
473
+
474
+ def usp_crossattn_multi_forward_multitalk(self,
475
+ x: torch.Tensor,
476
+ encoder_hidden_states: torch.Tensor, # 1, 21, 64, C
477
+ shape=None,
478
+ x_ref_attn_map=None,
479
+ human_num=None) -> torch.Tensor:
480
+
481
+ N_t, N_h, N_w = shape
482
+ sp_size = get_sequence_parallel_world_size()
483
+ sp_rank = get_sequence_parallel_rank()
484
+ audio_tokens_per_frame = 32
485
+ visual_seqlen, frame_ids = split_token_counts_and_frame_ids(N_t, N_h * N_w, sp_size, sp_rank)
486
+ encoder_hidden_states = encoder_hidden_states[:, min(frame_ids):max(frame_ids)+1, ...]
487
+ encoder_hidden_states = rearrange(encoder_hidden_states, "B T N C -> B (T N) C")
488
+ N_a = len(frame_ids)
489
+ kv_seq = [audio_tokens_per_frame * human_num] * N_a
490
+
491
+ if human_num == 1:
492
+ return super(SingleStreamMutiAttention, self).forward(x, encoder_hidden_states, shape, enable_sp=True, kv_seq=kv_seq)
493
+
494
+
495
+ # get q for hidden_state
496
+ B, N, C = x.shape
497
+ q = self.q_linear(x)
498
+ q_shape = (B, N, self.num_heads, self.head_dim)
499
+ q = q.view(q_shape).permute((0, 2, 1, 3))
500
+
501
+ if self.qk_norm:
502
+ q = self.q_norm(q)
503
+
504
+ max_values = x_ref_attn_map.max(1).values[:, None, None]
505
+ min_values = x_ref_attn_map.min(1).values[:, None, None]
506
+ max_min_values = torch.cat([max_values, min_values], dim=2)
507
+ max_min_values = get_sp_group().all_gather(max_min_values, dim=1)
508
+
509
+ human1_max_value, human1_min_value = max_min_values[0, :, 0].max(), max_min_values[0, :, 1].min()
510
+ human2_max_value, human2_min_value = max_min_values[1, :, 0].max(), max_min_values[1, :, 1].min()
511
+
512
+ human1 = normalize_and_scale(x_ref_attn_map[0], (human1_min_value, human1_max_value), (self.rope_h1[0], self.rope_h1[1]))
513
+ human2 = normalize_and_scale(x_ref_attn_map[1], (human2_min_value, human2_max_value), (self.rope_h2[0], self.rope_h2[1]))
514
+ back = torch.full((x_ref_attn_map.size(1),), self.rope_bak, dtype=human1.dtype).to(human1.device)
515
+ max_indices = x_ref_attn_map.argmax(dim=0)
516
+ normalized_map = torch.stack([human1, human2, back], dim=1)
517
+ normalized_pos = normalized_map[range(x_ref_attn_map.size(1)), max_indices] # N
518
+ q = self.rope_1d(q, normalized_pos)
519
+
520
+ encoder_kv = self.kv_linear(encoder_hidden_states)
521
+ encoder_kv_shape = (B, encoder_hidden_states.size(1), 2, self.num_heads, self.head_dim)
522
+ encoder_kv = encoder_kv.view(encoder_kv_shape).permute((2, 0, 3, 1, 4))
523
+ encoder_k, encoder_v = encoder_kv.unbind(0) # B H N C
524
+
525
+ if self.qk_norm:
526
+ encoder_k = self.add_k_norm(encoder_k)
527
+
528
+ # position embedding for condition audio embeddings
529
+ per_frame = torch.zeros(audio_tokens_per_frame * human_num, dtype=encoder_k.dtype).to(encoder_k.device)
530
+ per_frame[:audio_tokens_per_frame] = (self.rope_h1[0] + self.rope_h1[1]) / 2
531
+ per_frame[audio_tokens_per_frame:] = (self.rope_h2[0] + self.rope_h2[1]) / 2
532
+ encoder_pos = torch.concat([per_frame]*N_a, dim=0)
533
+ encoder_k = self.rope_1d(encoder_k, encoder_pos)
534
+
535
+ # get attn
536
+ q = rearrange(q, "B H M K -> B M H K")
537
+ encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
538
+ encoder_v = rearrange(encoder_v, "B H M K -> B M H K")
539
+ attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(visual_seqlen, kv_seq)
540
+ x = xformers.ops.memory_efficient_attention(q, encoder_k, encoder_v, attn_bias=attn_bias, op=None,)
541
+ x = rearrange(x, "B M H K -> B H M K")
542
+
543
+ # linear transform
544
+ x_output_shape = (B, N, C)
545
+ x = x.transpose(1, 2)
546
+ x = x.reshape(x_output_shape)
547
+ x = self.proj(x)
548
+ x = self.proj_drop(x)
549
+
550
+ return x
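The context-parallel pattern used throughout this file is: shard the token dimension across sequence-parallel ranks before the blocks, then all-gather afterwards. A self-contained sketch with plain tensors (no xfuser; world size hard-coded for illustration):

import torch

def split_for_rank(x, sp_size, sp_rank):
    # Mirrors: torch.chunk(x, get_sequence_parallel_world_size(), dim=1)[get_sequence_parallel_rank()]
    return torch.chunk(x, sp_size, dim=1)[sp_rank]

x = torch.randn(1, 8, 16)  # [batch, seq_len, dim]
shards = [split_for_rank(x, sp_size=4, sp_rank=r) for r in range(4)]
# each shard is [1, 2, 16]; the all_gather at the end of the forward reassembles them
assert torch.equal(torch.cat(shards, dim=1), x)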
wan/first_last_frame2video.py ADDED
@@ -0,0 +1,377 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ import logging
4
+ import math
5
+ import os
6
+ import random
7
+ import sys
8
+ import types
9
+ from contextlib import contextmanager
10
+ from functools import partial
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.cuda.amp as amp
15
+ import torch.distributed as dist
16
+ import torchvision.transforms.functional as TF
17
+ from tqdm import tqdm
18
+
19
+ from .distributed.fsdp import shard_model
20
+ from .modules.clip import CLIPModel
21
+ from .modules.model import WanModel
22
+ from .modules.t5 import T5EncoderModel
23
+ from .modules.vae import WanVAE
24
+ from .utils.fm_solvers import (
25
+ FlowDPMSolverMultistepScheduler,
26
+ get_sampling_sigmas,
27
+ retrieve_timesteps,
28
+ )
29
+ from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
30
+
31
+
32
+ class WanFLF2V:
33
+
34
+ def __init__(
35
+ self,
36
+ config,
37
+ checkpoint_dir,
38
+ device_id=0,
39
+ rank=0,
40
+ t5_fsdp=False,
41
+ dit_fsdp=False,
42
+ use_usp=False,
43
+ t5_cpu=False,
44
+ init_on_cpu=True,
45
+ ):
46
+ r"""
47
+ Initializes the first-last-frame-to-video (FLF2V) generation model components.
48
+
49
+ Args:
50
+ config (EasyDict):
51
+ Object containing model parameters initialized from config.py
52
+ checkpoint_dir (`str`):
53
+ Path to directory containing model checkpoints
54
+ device_id (`int`, *optional*, defaults to 0):
55
+ Id of target GPU device
56
+ rank (`int`, *optional*, defaults to 0):
57
+ Process rank for distributed training
58
+ t5_fsdp (`bool`, *optional*, defaults to False):
59
+ Enable FSDP sharding for T5 model
60
+ dit_fsdp (`bool`, *optional*, defaults to False):
61
+ Enable FSDP sharding for DiT model
62
+ use_usp (`bool`, *optional*, defaults to False):
63
+ Enable distribution strategy of USP.
64
+ t5_cpu (`bool`, *optional*, defaults to False):
65
+ Whether to place T5 model on CPU. Only works without t5_fsdp.
66
+ init_on_cpu (`bool`, *optional*, defaults to True):
67
+ Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
68
+ """
69
+ self.device = torch.device(f"cuda:{device_id}")
70
+ self.config = config
71
+ self.rank = rank
72
+ self.use_usp = use_usp
73
+ self.t5_cpu = t5_cpu
74
+
75
+ self.num_train_timesteps = config.num_train_timesteps
76
+ self.param_dtype = config.param_dtype
77
+
78
+ shard_fn = partial(shard_model, device_id=device_id)
79
+ self.text_encoder = T5EncoderModel(
80
+ text_len=config.text_len,
81
+ dtype=config.t5_dtype,
82
+ device=torch.device('cpu'),
83
+ checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
84
+ tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
85
+ shard_fn=shard_fn if t5_fsdp else None,
86
+ )
87
+
88
+ self.vae_stride = config.vae_stride
89
+ self.patch_size = config.patch_size
90
+ self.vae = WanVAE(
91
+ vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
92
+ device=self.device)
93
+
94
+ self.clip = CLIPModel(
95
+ dtype=config.clip_dtype,
96
+ device=self.device,
97
+ checkpoint_path=os.path.join(checkpoint_dir,
98
+ config.clip_checkpoint),
99
+ tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))
100
+
101
+ logging.info(f"Creating WanModel from {checkpoint_dir}")
102
+ self.model = WanModel.from_pretrained(checkpoint_dir)
103
+ self.model.eval().requires_grad_(False)
104
+
105
+ if t5_fsdp or dit_fsdp or use_usp:
106
+ init_on_cpu = False
107
+
108
+ if use_usp:
109
+ from xfuser.core.distributed import get_sequence_parallel_world_size
110
+
111
+ from .distributed.xdit_context_parallel import (
112
+ usp_attn_forward,
113
+ usp_dit_forward,
114
+ )
115
+ for block in self.model.blocks:
116
+ block.self_attn.forward = types.MethodType(
117
+ usp_attn_forward, block.self_attn)
118
+ self.model.forward = types.MethodType(usp_dit_forward, self.model)
119
+ self.sp_size = get_sequence_parallel_world_size()
120
+ else:
121
+ self.sp_size = 1
122
+
123
+ if dist.is_initialized():
124
+ dist.barrier()
125
+ if dit_fsdp:
126
+ self.model = shard_fn(self.model)
127
+ else:
128
+ if not init_on_cpu:
129
+ self.model.to(self.device)
130
+
131
+ self.sample_neg_prompt = config.sample_neg_prompt
132
+
133
+ def generate(self,
134
+ input_prompt,
135
+ first_frame,
136
+ last_frame,
137
+ max_area=720 * 1280,
138
+ frame_num=81,
139
+ shift=16,
140
+ sample_solver='unipc',
141
+ sampling_steps=50,
142
+ guide_scale=5.5,
143
+ n_prompt="",
144
+ seed=-1,
145
+ offload_model=True):
146
+ r"""
147
+ Generates video frames from input first-last frame and text prompt using diffusion process.
148
+
149
+ Args:
150
+ input_prompt (`str`):
151
+ Text prompt for content generation.
152
+ first_frame (PIL.Image.Image):
153
+ Input image tensor. Shape: [3, H, W]
154
+ last_frame (PIL.Image.Image):
155
+ Input image tensor. Shape: [3, H, W]
156
+ [NOTE] If the sizes of first_frame and last_frame are mismatched, last_frame will be cropped & resized
157
+ to match first_frame.
158
+ max_area (`int`, *optional*, defaults to 720*1280):
159
+ Maximum pixel area for latent space calculation. Controls video resolution scaling
160
+ frame_num (`int`, *optional*, defaults to 81):
161
+ How many frames to sample from a video. The number should be 4n+1
162
+ shift (`float`, *optional*, defaults to 16):
163
+ Noise schedule shift parameter. Affects temporal dynamics
164
+ [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
165
+ sample_solver (`str`, *optional*, defaults to 'unipc'):
166
+ Solver used to sample the video.
167
+ sampling_steps (`int`, *optional*, defaults to 50):
168
+ Number of diffusion sampling steps. Higher values improve quality but slow generation
169
+ guide_scale (`float`, *optional*, defaults to 5.5):
170
+ Classifier-free guidance scale. Controls prompt adherence vs. creativity
171
+ n_prompt (`str`, *optional*, defaults to ""):
172
+ Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
173
+ seed (`int`, *optional*, defaults to -1):
174
+ Random seed for noise generation. If -1, use random seed
175
+ offload_model (`bool`, *optional*, defaults to True):
176
+ If True, offloads models to CPU during generation to save VRAM
177
+
178
+ Returns:
179
+ torch.Tensor:
180
+ Generated video frames tensor. Dimensions: (C, N, H, W) where:
181
+ - C: Color channels (3 for RGB)
182
+ - N: Number of frames (81)
183
+ - H: Frame height (from max_area)
184
+ - W: Frame width (from max_area)
185
+ """
186
+ first_frame_size = first_frame.size
187
+ last_frame_size = last_frame.size
188
+ first_frame = TF.to_tensor(first_frame).sub_(0.5).div_(0.5).to(
189
+ self.device)
190
+ last_frame = TF.to_tensor(last_frame).sub_(0.5).div_(0.5).to(
191
+ self.device)
192
+
193
+ F = frame_num
194
+ first_frame_h, first_frame_w = first_frame.shape[1:]
195
+ aspect_ratio = first_frame_h / first_frame_w
196
+ lat_h = round(
197
+ np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
198
+ self.patch_size[1] * self.patch_size[1])
199
+ lat_w = round(
200
+ np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
201
+ self.patch_size[2] * self.patch_size[2])
202
+ first_frame_h = lat_h * self.vae_stride[1]
203
+ first_frame_w = lat_w * self.vae_stride[2]
204
+ if first_frame_size != last_frame_size:
205
+ # 1. resize
206
+ last_frame_resize_ratio = max(
207
+ first_frame_size[0] / last_frame_size[0],
208
+ first_frame_size[1] / last_frame_size[1])
209
+ last_frame_size = [
210
+ round(last_frame_size[0] * last_frame_resize_ratio),
211
+ round(last_frame_size[1] * last_frame_resize_ratio),
212
+ ]
213
+ # 2. center crop
214
+ last_frame = TF.center_crop(last_frame, last_frame_size)
215
+
216
+ max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
217
+ self.patch_size[1] * self.patch_size[2])
218
+ max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size
219
+
220
+ seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
221
+ seed_g = torch.Generator(device=self.device)
222
+ seed_g.manual_seed(seed)
223
+ noise = torch.randn(
224
+ 16, (F - 1) // 4 + 1,
225
+ lat_h,
226
+ lat_w,
227
+ dtype=torch.float32,
228
+ generator=seed_g,
229
+ device=self.device)
230
+
231
+ msk = torch.ones(1, 81, lat_h, lat_w, device=self.device)
232
+ msk[:, 1:-1] = 0
233
+ msk = torch.concat([
234
+ torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
235
+ ],
236
+ dim=1)
237
+ msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
238
+ msk = msk.transpose(1, 2)[0]
239
+
240
+ if n_prompt == "":
241
+ n_prompt = self.sample_neg_prompt
242
+
243
+ # preprocess
244
+ if not self.t5_cpu:
245
+ self.text_encoder.model.to(self.device)
246
+ context = self.text_encoder([input_prompt], self.device)
247
+ context_null = self.text_encoder([n_prompt], self.device)
248
+ if offload_model:
249
+ self.text_encoder.model.cpu()
250
+ else:
251
+ context = self.text_encoder([input_prompt], torch.device('cpu'))
252
+ context_null = self.text_encoder([n_prompt], torch.device('cpu'))
253
+ context = [t.to(self.device) for t in context]
254
+ context_null = [t.to(self.device) for t in context_null]
255
+
256
+ self.clip.model.to(self.device)
257
+ clip_context = self.clip.visual(
258
+ [first_frame[:, None, :, :], last_frame[:, None, :, :]])
259
+ if offload_model:
260
+ self.clip.model.cpu()
261
+
262
+ y = self.vae.encode([
263
+ torch.concat([
264
+ torch.nn.functional.interpolate(
265
+ first_frame[None].cpu(),
266
+ size=(first_frame_h, first_frame_w),
267
+ mode='bicubic').transpose(0, 1),
268
+ torch.zeros(3, F - 2, first_frame_h, first_frame_w),
269
+ torch.nn.functional.interpolate(
270
+ last_frame[None].cpu(),
271
+ size=(first_frame_h, first_frame_w),
272
+ mode='bicubic').transpose(0, 1),
273
+ ],
274
+ dim=1).to(self.device)
275
+ ])[0]
276
+ y = torch.concat([msk, y])
277
+
278
+ @contextmanager
279
+ def noop_no_sync():
280
+ yield
281
+
282
+ no_sync = getattr(self.model, 'no_sync', noop_no_sync)
283
+
284
+ # evaluation mode
285
+ with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():
286
+
287
+ if sample_solver == 'unipc':
288
+ sample_scheduler = FlowUniPCMultistepScheduler(
289
+ num_train_timesteps=self.num_train_timesteps,
290
+ shift=1,
291
+ use_dynamic_shifting=False)
292
+ sample_scheduler.set_timesteps(
293
+ sampling_steps, device=self.device, shift=shift)
294
+ timesteps = sample_scheduler.timesteps
295
+ elif sample_solver == 'dpm++':
296
+ sample_scheduler = FlowDPMSolverMultistepScheduler(
297
+ num_train_timesteps=self.num_train_timesteps,
298
+ shift=1,
299
+ use_dynamic_shifting=False)
300
+ sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
301
+ timesteps, _ = retrieve_timesteps(
302
+ sample_scheduler,
303
+ device=self.device,
304
+ sigmas=sampling_sigmas)
305
+ else:
306
+ raise NotImplementedError("Unsupported solver.")
307
+
308
+ # sample videos
309
+ latent = noise
310
+
311
+ arg_c = {
312
+ 'context': [context[0]],
313
+ 'clip_fea': clip_context,
314
+ 'seq_len': max_seq_len,
315
+ 'y': [y],
316
+ }
317
+
318
+ arg_null = {
319
+ 'context': context_null,
320
+ 'clip_fea': clip_context,
321
+ 'seq_len': max_seq_len,
322
+ 'y': [y],
323
+ }
324
+
325
+ if offload_model:
326
+ torch.cuda.empty_cache()
327
+
328
+ self.model.to(self.device)
329
+ for _, t in enumerate(tqdm(timesteps)):
330
+ latent_model_input = [latent.to(self.device)]
331
+ timestep = [t]
332
+
333
+ timestep = torch.stack(timestep).to(self.device)
334
+
335
+ noise_pred_cond = self.model(
336
+ latent_model_input, t=timestep, **arg_c)[0].to(
337
+ torch.device('cpu') if offload_model else self.device)
338
+ if offload_model:
339
+ torch.cuda.empty_cache()
340
+ noise_pred_uncond = self.model(
341
+ latent_model_input, t=timestep, **arg_null)[0].to(
342
+ torch.device('cpu') if offload_model else self.device)
343
+ if offload_model:
344
+ torch.cuda.empty_cache()
345
+ noise_pred = noise_pred_uncond + guide_scale * (
346
+ noise_pred_cond - noise_pred_uncond)
347
+
348
+ latent = latent.to(
349
+ torch.device('cpu') if offload_model else self.device)
350
+
351
+ temp_x0 = sample_scheduler.step(
352
+ noise_pred.unsqueeze(0),
353
+ t,
354
+ latent.unsqueeze(0),
355
+ return_dict=False,
356
+ generator=seed_g)[0]
357
+ latent = temp_x0.squeeze(0)
358
+
359
+ x0 = [latent.to(self.device)]
360
+ del latent_model_input, timestep
361
+
362
+ if offload_model:
363
+ self.model.cpu()
364
+ torch.cuda.empty_cache()
365
+
366
+ if self.rank == 0:
367
+ videos = self.vae.decode(x0)
368
+
369
+ del noise, latent
370
+ del sample_scheduler
371
+ if offload_model:
372
+ gc.collect()
373
+ torch.cuda.synchronize()
374
+ if dist.is_initialized():
375
+ dist.barrier()
376
+
377
+ return videos[0] if self.rank == 0 else None
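A hedged sketch of driving WanFLF2V end to end; the checkpoint directory, image paths and prompt are placeholders, and the model weights must already be downloaded:

from PIL import Image
from wan import WanFLF2V
from wan.configs import WAN_CONFIGS

pipe = WanFLF2V(WAN_CONFIGS['flf2v-14B'], checkpoint_dir='./Wan2.1-FLF2V-14B', device_id=0)
video = pipe.generate(
    "a slow dolly shot across a rainy street",
    first_frame=Image.open("first.png"),
    last_frame=Image.open("last.png"),
    frame_num=81,        # must be 4n+1
    sampling_steps=50,
    seed=42,
)
# On rank 0, `video` is a (C, N, H, W) float tensor ready for the repo's saving utilities.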