How to Build Your Own TTS System Using Open Source Models
Building your own TTS system is not about reinventing the model architecture — it's about assembling the right components into a production-grade pipeline that handles text normalization, model inference, audio post-processing, and serving under real load. This guide walks through building a complete TTS API service using open-source models, deployable on a GPU instance or a powerful CPU machine.
Architecture Overview
A production TTS service has these layers:
- Text Normalization: Convert numbers, abbreviations, dates, currency to speakable form ("$1,500" → "one thousand five hundred dollars")
- Phonemization: Convert normalized text to phonemes (language-specific pronunciation units)
- Acoustic Model: Convert phoneme sequence to mel spectrogram
- Vocoder: Convert mel spectrogram to raw waveform audio
- Serving Layer: REST API with caching, rate limiting, audio format conversion
Modern end-to-end models like Kokoro and XTTS v2 handle steps 2–4 internally. You mostly need to handle normalization and serving.
Step 1: Text Normalization
pip install nemo_text_processing
from nemo_text_processing.text_normalization.normalize import Normalizer

# 'cased' preserves capitalization cues (e.g. "Dr.", "M.I.T.") for the grammars.
norm = Normalizer(input_case='cased', lang='en')

examples = [
    "The Q4 2025 revenue was $94.9B, up 4.5% YoY.",
    "Meeting at 3:30pm on Jan 15th, 2026.",
    "Dr. Smith has a Ph.D. from M.I.T.",
]

# Print each raw sentence next to its speakable form.
for raw in examples:
    spoken = norm.normalize(raw, verbose=False)
    print(f"IN: {raw}")
    print(f"OUT: {spoken}")
    print()
# Output:
# IN: The Q4 2025 revenue was $94.9B, up 4.5% YoY.
# OUT: The Q four twenty twenty five revenue was ninety four point nine billion dollars, up four point five percent year over year.
Step 2: TTS Inference with Kokoro
# tts_engine.py
import threading
import numpy as np
from kokoro import KPipeline
class TTSEngine:
    """Thread-safe singleton TTS engine with per-language pipeline caching.

    The singleton guarantees model weights are loaded once per process and
    shared across all request handlers.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        # Double-checked locking: lock-free fast path once the instance exists,
        # lock taken only for the first construction.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._pipelines = {}
        return cls._instance

    def get_pipeline(self, lang_code: str = 'a') -> KPipeline:
        """Return the pipeline for *lang_code*, creating it on first use.

        Creation is guarded by the class lock: without it, two concurrent
        requests for the same language could each load a KPipeline, wasting
        memory and violating the thread-safety claim above.
        """
        if lang_code not in self._pipelines:
            with self._lock:
                if lang_code not in self._pipelines:
                    self._pipelines[lang_code] = KPipeline(lang_code=lang_code)
        return self._pipelines[lang_code]

    def synthesize(
        self,
        text: str,
        voice: str = 'af_heart',
        speed: float = 1.0,
        lang_code: str = 'a',
    ) -> np.ndarray:
        """Synthesize *text* and return the waveform as a float32 ndarray.

        Concatenates the per-chunk audio the pipeline yields. Returns an
        empty array when the pipeline yields no chunks (e.g. empty text).
        """
        pipeline = self.get_pipeline(lang_code)
        # Gather chunks and concatenate once in numpy -- avoids the original
        # extend(audio.tolist()) round-trip through Python floats, which
        # boxes every sample individually.
        chunks = [
            np.asarray(audio, dtype=np.float32)
            for _, _, audio in pipeline(text, voice=voice, speed=speed)
        ]
        if not chunks:
            return np.array([], dtype=np.float32)
        return np.concatenate(chunks)
# Singleton engine -- loaded once at import, shared across requests
# (TTSEngine.__new__ returns the same instance on every call).
engine = TTSEngine()
Step 3: FastAPI Serving Layer
# main.py - Production TTS API
import io
import hashlib
import soundfile as sf
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from functools import lru_cache
from tts_engine import engine
from nemo_text_processing.text_normalization.normalize import Normalizer
app = FastAPI(title="Open Source TTS API")
# Module-level normalizer: constructed once at import and reused by every
# request handler (see /synthesize below).
normalizer = Normalizer(input_case='cased', lang='en')
class TTSRequest(BaseModel):
    """Request body for POST /synthesize."""
    text: str                # raw input; normalized server-side before synthesis
    voice: str = "af_heart"  # Kokoro voice identifier
    speed: float = 1.0       # playback-rate multiplier passed to the pipeline
    format: str = "mp3"      # mp3 or wav
@lru_cache(maxsize=512)
def cached_synthesize(text_hash: str, text: str, voice: str, speed: float):
    """Cache synthesis results to avoid re-generating identical requests.

    NOTE(review): lru_cache keys on ALL arguments, so `text` is already part
    of the cache key and `text_hash` is redundant (harmless, but either the
    hash or the text alone would suffice). maxsize=512 bounds memory, since
    each cached value is a full audio array.
    """
    return engine.synthesize(text, voice=voice, speed=speed)
@app.post("/synthesize")
def synthesize(req: TTSRequest):
    """Generate speech for *req.text* and stream it back as WAV or MP3.

    Deliberately a plain ``def`` (not ``async def``): synthesis is blocking,
    CPU/GPU-bound work, and FastAPI runs sync endpoints in its threadpool.
    As ``async def`` this handler would block the event loop for the entire
    generation, stalling every other request.

    Raises:
        HTTPException 400: text over 5000 chars, or unknown format.
    """
    # Validate up front instead of silently defaulting unknown formats to MP3.
    if req.format not in ("wav", "mp3"):
        raise HTTPException(status_code=400, detail="format must be 'mp3' or 'wav'")
    if len(req.text) > 5000:
        raise HTTPException(status_code=400, detail="Text exceeds 5000 character limit")

    # Normalize text before synthesis so numbers/abbreviations are spoken correctly.
    normalized_text = normalizer.normalize(req.text, verbose=False)

    # Cache key based on content. MD5 is fine here -- it's a cache key, not a
    # security boundary. Fields are delimited so adjacent values can't merge
    # ambiguously ("a"+"1.0" vs "a1"+".0").
    cache_key = hashlib.md5(
        f"{normalized_text}|{req.voice}|{req.speed}".encode()
    ).hexdigest()
    audio_array = cached_synthesize(cache_key, normalized_text, req.voice, req.speed)

    # Convert to the requested container format (Kokoro outputs 24 kHz audio).
    buffer = io.BytesIO()
    if req.format == "wav":
        sf.write(buffer, audio_array, 24000, format='WAV')
        media_type = "audio/wav"
    else:
        # pydub shells out to ffmpeg (installed in the Docker image) for MP3.
        from pydub import AudioSegment
        wav_buf = io.BytesIO()
        sf.write(wav_buf, audio_array, 24000, format='WAV')
        wav_buf.seek(0)
        buffer = io.BytesIO()
        AudioSegment.from_wav(wav_buf).export(buffer, format="mp3", bitrate="128k")
        media_type = "audio/mpeg"

    buffer.seek(0)
    return StreamingResponse(buffer, media_type=media_type)
Step 4: Docker Deployment
# Dockerfile
FROM python:3.11-slim

# ffmpeg: needed by pydub for MP3 export; libsndfile1: needed by soundfile.
# --no-install-recommends keeps the image lean; apt lists are removed in the
# same layer so they never bloat the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies before copying source so Docker layer caching
# survives code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download model weights during build (not at runtime)
RUN python -c "from kokoro import KPipeline; KPipeline(lang_code='a')"

COPY . .
EXPOSE 8000
# NOTE: each uvicorn worker is a separate process and loads its own copy of
# the model (and its own lru_cache) -- size worker count to available memory.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
# requirements.txt:
# fastapi>=0.110.0
# uvicorn[standard]
# kokoro
# numpy
# soundfile
# pydub
# nemo_text_processing
# pydantic>=2.0
# Deploy:
# docker build -t my-tts-api .
# docker run -p 8000:8000 my-tts-api
# Test:
# curl -X POST http://localhost:8000/synthesize \
# -H "Content-Type: application/json" \
# -d '{"text": "Hello world!", "voice": "af_heart"}' \
# --output speech.mp3
Production Hardening
- Request queue: Use Celery + Redis for async generation of long texts to avoid HTTP timeouts
- Rate limiting: Implement per-IP rate limits (e.g., 60 requests/minute) using slowapi
- Audio CDN caching: Hash the request parameters, store generated audio in S3, and return CDN URLs — this eliminates re-generation for repeated content
- Model warm-up: Pre-run a silent synthesis at startup to fully initialize the model weights before serving real requests
Conclusion
Building a production TTS API on open-source models is straightforward once you understand the pipeline components. The text normalization step is often the most underestimated — getting "Dr. Smith earned $1.2M in Q3" to sound correct requires proper normalization before synthesis, and handling edge cases in normalization is where most of the real engineering work lives.