NEED HELP -> Facing Kernel Size Issue
CODE
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
model = AutoModelForCausalLM.from_pretrained(
"maya-research/maya1",
torch_dtype=torch.bfloat16,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to("cuda")
description = "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing."
text = "Hello! This is Maya1 the best open source voice AI model with emotions."
prompt = f'<description="{description}"> {text}'
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.inference_mode():
outputs = model.generate(
**inputs,
max_new_tokens=500,
temperature=0.4,
top_p=0.9,
do_sample=True
)
generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
frames = len(snac_tokens) // 7
codes = [[], [], []]
for i in range(frames):
s = snac_tokens[i*7:(i+1)*7]
codes[0].append((s[0]-128266) % 4096)
codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
codes[2].extend([(s[2]-128266) % 4096, (s[3]-128266) % 4096, (s[5]-128266) % 4096, (s[6]-128266) % 4096])
Generate final audio with SNAC decoder
codes_tensor = [torch.tensor(c, dtype=torch.long, device="cuda").unsqueeze(0) for c in codes]
with torch.inference_mode():
audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
Save your emotional voice output
sf.write("output.wav", audio, 24000)
print("Voice generated successfully! Play output.wav")
RESULT
Traceback (most recent call last):
File "/home/ravneet/Documents/Maya/maya.py", line 51, in
audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^
File "/home/ravneet/Documents/Maya/maya1/lib/python3.13/site-packages/snac/vq.py", line 95, in from_codes
z_q_i = self.quantizers[i].out_proj(z_p_i)
File "/home/ravneet/Documents/Maya/maya1/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/ravneet/Documents/Maya/maya1/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ravneet/Documents/Maya/maya1/lib/python3.13/site-packages/torch/nn/modules/conv.py", line 371, in forward
return self._conv_forward(input, self.weight, self.bias)
~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ravneet/Documents/Maya/maya1/lib/python3.13/site-packages/torch/nn/modules/conv.py", line 366, in _conv_forward
return F.conv1d(
~~~~~~~~^
input, weight, bias, self.stride, self.padding, self.dilation, self.groups
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
RuntimeError: Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size
HF-Space - https://huggingface.co/spaces/maya-research/maya1
Repo with fastAPI implementation - https://github.com/MayaResearch/maya1-fastapi