## Usage

### Transformers.js
If you haven't already, you can install the Transformers.js JavaScript library from NPM using:
```bash
npm i @huggingface/transformers
```
You can then generate audio as follows:
```js
import { pipeline } from '@huggingface/transformers';

const tts = await pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-2-ONNX');

const input_text = '<en>This is really cool!</en>'; // Language options: "en", "ko", "es", "pt", "fr"
const audio = await tts(input_text, {
  speaker_embeddings: 'https://huggingface.co/onnx-community/Supertonic-TTS-2-ONNX/resolve/main/voices/F1.bin',
  num_inference_steps: 5, // Higher = better quality (typically 1-50)
  speed: 1.05, // Higher = faster speech (typically 0.8-1.2)
});

await audio.save('output.wav'); // or `audio.toBlob()`
```
### ONNX Runtime
First, let's create a helper class, `SupertonicTTS`:
```python
import os
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer


class SupertonicTTS:
    SAMPLE_RATE = 44100
    CHUNK_COMPRESS_FACTOR = 6
    BASE_CHUNK_SIZE = 512
    LATENT_DIM = 24
    STYLE_DIM = 128
    LATENT_SIZE = BASE_CHUNK_SIZE * CHUNK_COMPRESS_FACTOR
    LANGUAGES = ["en", "ko", "es", "pt", "fr"]

    def __init__(self, model_path):
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        # Load ONNX sessions
        self.text_encoder = ort.InferenceSession(os.path.join(self.model_path, "onnx", "text_encoder.onnx"))
        self.latent_denoiser = ort.InferenceSession(os.path.join(self.model_path, "onnx", "latent_denoiser.onnx"))
        self.voice_decoder = ort.InferenceSession(os.path.join(self.model_path, "onnx", "voice_decoder.onnx"))

    def _load_style(self, voice: str) -> np.ndarray:
        voice_path = os.path.join(self.model_path, "voices", f"{voice}.bin")
        if not os.path.exists(voice_path):
            raise ValueError(f"Voice '{voice}' not found.")
        style_vec = np.fromfile(voice_path, dtype=np.float32)
        return style_vec.reshape(1, -1, self.STYLE_DIM)

    def generate(self, text: list[str], *, voice: str = "M1", speed: float = 1.0, steps: int = 5, language: str = "en") -> list[np.ndarray]:
        if language not in self.LANGUAGES:
            raise ValueError(f"Language '{language}' not supported. Choose from {self.LANGUAGES}.")

        # 1. Prepare text inputs
        text = [f"<{language}>{t}</{language}>" for t in text]
        inputs = self.tokenizer(text, return_tensors="np", padding=True, truncation=True)
        input_ids = inputs["input_ids"]
        attn_mask = inputs["attention_mask"]
        batch_size = input_ids.shape[0]

        # 2. Prepare style
        style = self._load_style(voice).repeat(batch_size, axis=0)

        # 3. Text encoding
        last_hidden_state, raw_durations = self.text_encoder.run(
            None,
            {"input_ids": input_ids, "attention_mask": attn_mask, "style": style},
        )
        durations = (raw_durations / speed * self.SAMPLE_RATE).astype(np.int64)

        # 4. Latent preparation
        latent_lengths = (durations + self.LATENT_SIZE - 1) // self.LATENT_SIZE
        max_len = latent_lengths.max()
        latent_mask = (np.arange(max_len) < latent_lengths[:, None]).astype(np.int64)
        latents = np.random.randn(batch_size, self.LATENT_DIM * self.CHUNK_COMPRESS_FACTOR, max_len).astype(np.float32)
        latents *= latent_mask[:, None, :]

        # 5. Denoising loop
        num_inference_steps = np.full(batch_size, steps, dtype=np.float32)
        for step in range(steps):
            timestep = np.full(batch_size, step, dtype=np.float32)
            latents = self.latent_denoiser.run(
                None,
                {
                    "noisy_latents": latents,
                    "latent_mask": latent_mask,
                    "style": style,
                    "encoder_outputs": last_hidden_state,
                    "attention_mask": attn_mask,
                    "timestep": timestep,
                    "num_inference_steps": num_inference_steps,
                },
            )[0]

        # 6. Decode latents to audio
        waveforms = self.voice_decoder.run(None, {"latents": latents})[0]

        # 7. Post-process: trim padding and return a list of arrays
        results = []
        for i, length in enumerate(latent_mask.sum(axis=1) * self.LATENT_SIZE):
            results.append(waveforms[i, :length])
        return results
```
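To make the sizing constants concrete: each latent frame covers `BASE_CHUNK_SIZE * CHUNK_COMPRESS_FACTOR = 512 * 6 = 3072` audio samples at 44.1 kHz (roughly 70 ms), and `generate` rounds each predicted duration up to a whole number of frames; shorter items in a batch are zero-padded to the longest item and trimmed back in step 7. A quick back-of-the-envelope check (the 2-second duration is just an illustrative value):

```python
SAMPLE_RATE = 44100
LATENT_SIZE = 512 * 6  # 3072 samples per latent frame (~69.7 ms)

predicted_seconds = 2.0  # hypothetical duration predicted by the text encoder
samples = int(predicted_seconds * SAMPLE_RATE)       # 88200
frames = (samples + LATENT_SIZE - 1) // LATENT_SIZE  # ceil division -> 29
print(frames, frames * LATENT_SIZE)                  # 29 frames -> 89088 samples
```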
Next, download the model repository (via `git clone`, `huggingface_hub`, or any other method):
```python
# (Optional) Download model files (or use an existing local directory)
from huggingface_hub import snapshot_download

model_id = "onnx-community/Supertonic-TTS-2-ONNX"
local_dir = "supertonic-2"
snapshot_download(model_id, local_dir=local_dir)
```
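As a sanity check, you can list the inputs each ONNX graph expects and confirm they match the feed dictionaries in `generate`. A minimal sketch using standard `onnxruntime` introspection, assuming the snapshot layout above (`onnx/*.onnx`):

```python
import onnxruntime as ort

# Print the expected input names for each of the three graphs.
for name in ("text_encoder", "latent_denoiser", "voice_decoder"):
    session = ort.InferenceSession(f"{local_dir}/onnx/{name}.onnx")
    print(name, "->", [i.name for i in session.get_inputs()])
```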
We can then use the model as follows:
```python
# Initialize TTS
tts = SupertonicTTS(local_dir)

# Generate audio
prompts = [
    "Once upon a time, there was a brave knight.",
    "Refactoring code makes it much easier to read!",
    "I love this!",
]
audio_data = tts.generate(prompts, voice="M1", speed=1.0, steps=10, language="en")

# (Optional) Save to files
import soundfile as sf

for i, audio in enumerate(audio_data):
    filename = f"output_{i}.wav"
    sf.write(filename, audio, tts.SAMPLE_RATE)
    print(f"Saved {filename}")
```
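The same helper handles the other supported languages and voices. A hypothetical variation using the `F1` voice (the embedding referenced in the Transformers.js example) with Spanish input; adjust the voice name to match whatever `.bin` files exist in your `voices/` directory:

```python
# Hypothetical example: Spanish speech with the "F1" voice
# (voices/F1.bin is the embedding used in the Transformers.js snippet above).
spanish_audio = tts.generate(
    ["¡Hola! Esto es realmente genial."],
    voice="F1",
    speed=1.0,
    steps=10,
    language="es",
)
sf.write("output_es.wav", spanish_audio[0], tts.SAMPLE_RATE)
```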