How to Build a Speech Recognition System from Scratch
"From scratch" in 2026 doesn't mean implementing a neural architecture from first principles — it means assembling a production-grade speech recognition pipeline from pre-trained components, wiring it into an audio input stack, and making it robust to real-world conditions. This guide walks through building a complete, production-ready STT service using faster-whisper, with audio preprocessing, WebSocket streaming, speaker diarization, and a REST API wrapping the whole system.
System Architecture
Audio Input (mic / file / WebSocket)
↓
Audio Preprocessing (resampling, noise reduction, VAD)
↓
ASR Model (faster-whisper)
↓
Post-processing (punctuation, capitalization, diarization)
↓
Output (JSON transcript with timestamps and speaker IDs)
Step 1: Audio Preprocessing
Whisper expects 16kHz mono audio. Microphone inputs and recordings often come in different formats. A robust preprocessing layer handles resampling and format normalization before passing to the model.
pip install faster-whisper librosa soundfile noisereduce
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
def preprocess_audio(
    input_path: str,
    output_path: str | None = None,
    target_sr: int = 16000,
    denoise: bool = True,
) -> np.ndarray:
    """Prepare an audio file for Whisper.

    Steps: load as mono, resample to ``target_sr`` (Whisper expects 16 kHz),
    optionally apply spectral noise reduction, then normalize the RMS level.

    Args:
        input_path: Path to any format librosa/audioread can decode.
        output_path: If given, also write the processed audio to this path.
        target_sr: Target sample rate in Hz (Whisper models expect 16000).
        denoise: Apply stationary spectral noise reduction.

    Returns:
        float32 numpy array, mono, at ``target_sr``.
    """
    # Load at the native sample rate; mono=True downmixes multi-channel input.
    audio, original_sr = librosa.load(input_path, sr=None, mono=True)

    # Resample only when needed -- resampling is comparatively expensive.
    if original_sr != target_sr:
        audio = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)

    # Stationary spectral gating suits constant background hum/hiss.
    if denoise:
        audio = nr.reduce_noise(y=audio, sr=target_sr, stationary=True)

    # Simple RMS level normalization. NOTE: this is NOT -23 LUFS loudness
    # normalization (that would require a loudness meter such as pyloudnorm);
    # the previous comment overstated what this step does.
    rms = np.sqrt(np.mean(audio**2))
    if rms > 0:
        target_rms = 0.05
        audio = audio * (target_rms / rms)
        # Guard against clipping introduced by the gain step.
        peak = np.max(np.abs(audio))
        if peak > 1.0:
            audio = audio / peak

    if output_path:
        sf.write(output_path, audio, target_sr)

    return audio.astype(np.float32)  # float32, 16kHz, mono


# Usage
audio_array = preprocess_audio("meeting_recording.mp4", denoise=True)
Step 2: Core ASR with faster-whisper
from faster_whisper import WhisperModel
import numpy as np
class STTEngine:
    """ASR engine wrapping faster-whisper.

    Intended to be instantiated once at module import and shared (model
    loading is expensive). Note: it is not an enforced singleton -- nothing
    prevents constructing additional instances.
    """

    def __init__(self, model_size: str = "base", device: str = "auto"):
        """Load the Whisper model.

        Args:
            model_size: faster-whisper model name ("tiny", "base", "small", ...).
            device: "cuda", "cpu", or "auto" (CUDA if available, else CPU).
        """
        if device == "auto":
            # Local import keeps torch optional when the device is explicit.
            import torch
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 on GPU for speed; int8 quantization keeps CPU memory low.
        compute_type = "float16" if device == "cuda" else "int8"
        print(f"Loading Whisper {model_size} on {device} ({compute_type})")
        self.model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type,
        )

    def transcribe(
        self,
        audio: np.ndarray | str,
        language: str | None = None,
        word_timestamps: bool = False,
    ) -> dict:
        """
        Transcribe audio to text with optional word-level timestamps.

        Args:
            audio: numpy array (float32, 16kHz) or path to audio file
            language: ISO 639-1 code (None = auto-detect)
            word_timestamps: Include word-level timing

        Returns:
            dict with 'text', 'segments', 'language',
            'language_confidence', and 'duration' (seconds)
        """
        segments, info = self.model.transcribe(
            audio,
            language=language,
            beam_size=5,
            vad_filter=True,  # skip non-speech to cut latency and hallucinations
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=word_timestamps,
        )
        result_segments = []
        text_parts = []
        # `segments` is a generator: decoding actually happens in this loop.
        for seg in segments:
            segment_data = {
                "start": round(seg.start, 2),
                "end": round(seg.end, 2),
                "text": seg.text.strip(),
                # no_speech_prob is the model's "this is silence" score,
                # so 1 - p serves as a rough speech-confidence proxy.
                "confidence": round(1 - seg.no_speech_prob, 3),
            }
            if word_timestamps and seg.words:
                segment_data["words"] = [
                    {"word": w.word, "start": round(w.start, 2), "end": round(w.end, 2)}
                    for w in seg.words
                ]
            result_segments.append(segment_data)
            text_parts.append(seg.text)
        return {
            "text": "".join(text_parts).strip(),
            "segments": result_segments,
            "language": info.language,
            "language_confidence": round(info.language_probability, 3),
            # Was documented but never returned: total audio duration (s).
            "duration": round(info.duration, 2),
        }


# Initialize once, use everywhere
stt = STTEngine(model_size="base", device="auto")
Step 3: Speaker Diarization
pip install pyannote.audio
# Note: requires free HuggingFace token + accepting pyannote license
from pyannote.audio import Pipeline
import torch
class DiarizationEngine:
    """Speaker diarization built on pyannote's pretrained pipeline."""

    def __init__(self, hf_token: str):
        """Load the pretrained pipeline (needs an HF token + accepted license)."""
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token,
        )
        # Run inference on the GPU whenever one is present.
        if torch.cuda.is_available():
            self.pipeline = self.pipeline.to(torch.device("cuda"))

    def diarize(self, audio_path: str, num_speakers: int = None) -> list:
        """Returns list of {speaker, start, end} segments.

        Passing ``num_speakers=None`` lets the pipeline estimate the count.
        """
        annotation = self.pipeline(
            audio_path,
            num_speakers=num_speakers,  # None = auto-detect
        )
        return [
            {
                "speaker": label,
                "start": round(turn.start, 2),
                "end": round(turn.end, 2),
            }
            for turn, _, label in annotation.itertracks(yield_label=True)
        ]
def merge_transcription_and_diarization(
    transcript_segments: list,
    diarization_segments: list,
) -> list:
    """Attach a 'speaker' label to each transcript segment (in place).

    A segment is matched to the first diarization turn whose time span
    contains the segment's midpoint; segments with no containing turn are
    labeled "UNKNOWN". Returns the same (mutated) transcript_segments list.
    """
    for segment in transcript_segments:
        midpoint = (segment["start"] + segment["end"]) / 2
        # First containing turn wins, mirroring the diarization ordering.
        speaker = next(
            (
                turn["speaker"]
                for turn in diarization_segments
                if turn["start"] <= midpoint <= turn["end"]
            ),
            None,
        )
        segment["speaker"] = speaker or "UNKNOWN"
    return transcript_segments
Step 4: FastAPI Service
# main.py
import tempfile, os
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from stt_engine import stt, preprocess_audio
app = FastAPI(title="Open Source STT Service")


@app.post("/transcribe")
async def transcribe(
    audio: UploadFile = File(...),
    language: str = Form(None),
    word_timestamps: bool = Form(False),
    denoise: bool = Form(True),
):
    """Transcribe an uploaded audio file and return the JSON transcript.

    Form fields:
        audio: the audio/video file to transcribe (any decodable format)
        language: ISO 639-1 code, omitted = auto-detect
        word_timestamps: include word-level timing in segments
        denoise: run spectral noise reduction before transcription
    """
    # Preserve the upload's real extension so downstream decoders get the
    # correct container hint (previously everything was saved as ".wav",
    # even .mp3/.mp4 uploads).
    suffix = os.path.splitext(audio.filename or "")[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(await audio.read())
        tmp_path = tmp.name
    try:
        # Resample/denoise to the 16 kHz mono float32 Whisper expects.
        audio_array = preprocess_audio(tmp_path, denoise=denoise)
        result = stt.transcribe(
            audio_array,
            language=language,
            word_timestamps=word_timestamps,
        )
        return JSONResponse(result)
    finally:
        # Always remove the temp file, even when transcription fails.
        os.unlink(tmp_path)
# Run with: uvicorn main:app --host 0.0.0.0 --port 8001 --workers 1
# Test with:
# curl -X POST http://localhost:8001/transcribe \
# -F "audio=@meeting.mp3" \
# -F "word_timestamps=true"
Conclusion
A production-grade STT service built on faster-whisper with proper audio preprocessing and speaker diarization is genuinely competitive with paid APIs for most use cases — and runs at a fraction of the per-minute cost. The audio preprocessing pipeline (noise reduction, resampling, VAD) is where the real robustness work happens. Whisper handles remarkably diverse audio quality gracefully, but feeding it clean, normalized audio can meaningfully improve accuracy on challenging recordings — measure the gain on your own data, as it varies widely with recording conditions.