Usage

Transformers.js

If you haven't already, you can install the Transformers.js JavaScript library from NPM using:

npm i @huggingface/transformers

You can then generate audio as follows:

import { pipeline } from '@huggingface/transformers';

const tts = await pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-2-ONNX');

const input_text = '<en>This is really cool!</en>'; // Language options: "en", "ko", "es", "pt", "fr"
const audio = await tts(input_text, {
    speaker_embeddings: 'https://huggingface.co/onnx-community/Supertonic-TTS-2-ONNX/resolve/main/voices/F1.bin',
    num_inference_steps: 5, // Higher = better quality (typically 1-50)
    speed: 1.05, // Higher = faster speech (typically 0.8-1.2)
});
await audio.save('output.wav'); // or `audio.toBlob()`;
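
To try a different voice, point speaker_embeddings at another .bin file in the repository's voices/ folder (e.g. M1.bin); to switch languages, wrap the input text in the matching tag (e.g. <ko>...</ko>).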

ONNX Runtime

First, let's create a helper class, SupertonicTTS:

import os
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

class SupertonicTTS:
    SAMPLE_RATE = 44100
    CHUNK_COMPRESS_FACTOR = 6
    BASE_CHUNK_SIZE = 512
    LATENT_DIM = 24
    STYLE_DIM = 128
    LATENT_SIZE = BASE_CHUNK_SIZE * CHUNK_COMPRESS_FACTOR
    LANGUAGES = ["en", "ko", "es", "pt", "fr"]

    def __init__(self, model_path):
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        # Load ONNX sessions
        self.text_encoder = ort.InferenceSession(os.path.join(self.model_path, "onnx", "text_encoder.onnx"))
        self.latent_denoiser = ort.InferenceSession(os.path.join(self.model_path, "onnx", "latent_denoiser.onnx"))
        self.voice_decoder = ort.InferenceSession(os.path.join(self.model_path, "onnx", "voice_decoder.onnx"))

    def _load_style(self, voice: str) -> np.ndarray:
        voice_path = os.path.join(self.model_path, "voices", f"{voice}.bin")
        if not os.path.exists(voice_path):
            raise ValueError(f"Voice '{voice}' not found.")

        style_vec = np.fromfile(voice_path, dtype=np.float32)
        return style_vec.reshape(1, -1, self.STYLE_DIM)

    def generate(self, text: list[str], *, voice: str = "M1", speed: float = 1.0, steps: int = 5, language: str = "en") -> list[np.ndarray]:
        if language not in self.LANGUAGES:
            raise ValueError(f"Language '{language}' not supported. Choose from {self.LANGUAGES}.")

        # 1. Prepare Text Inputs
        text = [f"<{language}>{t}</{language}>" for t in text]
        inputs = self.tokenizer(text, return_tensors="np", padding=True, truncation=True)
        input_ids = inputs["input_ids"]

        attn_mask = inputs["attention_mask"]
        batch_size = input_ids.shape[0]

        # 2. Prepare Style
        style = self._load_style(voice).repeat(batch_size, axis=0)

        # 3. Text Encoding
        last_hidden_state, raw_durations = self.text_encoder.run(
            None, 
            {"input_ids": input_ids, "attention_mask": attn_mask, "style": style}
        )
        durations = (raw_durations / speed * self.SAMPLE_RATE).astype(np.int64)  # predicted duration (seconds) -> target length in samples, adjusted for speed

        # 4. Latent Preparation
        latent_lengths = (durations + self.LATENT_SIZE - 1) // self.LATENT_SIZE
        max_len = latent_lengths.max()
        latent_mask = (np.arange(max_len) < latent_lengths[:, None]).astype(np.int64)
        latents = np.random.randn(batch_size, self.LATENT_DIM * self.CHUNK_COMPRESS_FACTOR, max_len).astype(np.float32)
        latents *= latent_mask[:, None, :]

        # 5. Denoising Loop
        num_inference_steps = np.full(batch_size, steps, dtype=np.float32)
        for step in range(steps):
            timestep = np.full(batch_size, step, dtype=np.float32)
            latents = self.latent_denoiser.run(
                None,
                {
                    "noisy_latents": latents,
                    "latent_mask": latent_mask,
                    "style": style,
                    "encoder_outputs": last_hidden_state,
                    "attention_mask": attn_mask,
                    "timestep": timestep,
                    "num_inference_steps": num_inference_steps,
                },
            )[0]

        # 6. Decode Latents to Audio
        waveforms = self.voice_decoder.run(None, {"latents": latents})[0]

        # 7. Post-process: Trim padding and return list of arrays
        results = []
        for i, length in enumerate(latent_mask.sum(axis=1) * self.LATENT_SIZE):
            results.append(waveforms[i, :length])

        return results
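
In short, generation runs in three stages: the text encoder turns the tokenized, language-tagged text and the style vector into hidden states plus a per-item duration; the latent denoiser iteratively refines random latents (masked to the predicted length) over the requested number of steps; and the voice decoder converts the final latents into 44.1 kHz waveforms, which are trimmed back to their unpadded lengths.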

Next, download a local copy of this repository (using whichever method you prefer: git clone, huggingface_hub, etc.):

# (Optional) Download model files (or use existing local directory)
from huggingface_hub import snapshot_download
model_id = "onnx-community/Supertonic-TTS-2-ONNX"
local_dir = "supertonic-2"
snapshot_download(model_id, local_dir=local_dir)

We can then use the model as follows:

# Initialize TTS
tts = SupertonicTTS(local_dir)

# Generate audio
prompts = [
    "Once upon a time, there was a brave knight.",
    "Refactoring code makes it much easier to read!",
    "I love this!"
]
audio_data = tts.generate(prompts, voice="M1", speed=1.0, steps=10, language="en")

# (Optional) Save to files
import soundfile as sf
for i, audio in enumerate(audio_data):
    filename = f"output_{i}.wav"
    sf.write(filename, audio, tts.SAMPLE_RATE)
    print(f"Saved {filename}")