feat: add PCM streaming + Kokoro voice name support

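- POST /audio/speech with response_format=pcm now streams raw 16-bit
  PCM (24kHz mono) via a Flask generator when HIP graphs are enabled
  (HIP_GRAPHS=1) — compatible with the customtts extension's streaming
  mode; without HIP graphs the request falls back to the regular MP3 path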
- resolve_voice() handles:
    * Standard OpenAI names (alloy, echo, ...)
    * Kokoro blend syntax: 'af_bella+bf_emma+af_nicole' (picks first)
    * Kokoro prefix heuristic: af_/bf_/am_/bm_ → Ryan, zf_/zm_ → Vivian
    * Explicit Kokoro aliases for common voices (bella, emma, sky, etc.)
    * Graceful fallback to alloy for unknown voices
- app.run(threaded=True) to support concurrent streaming connections
This commit is contained in:
2026-03-25 21:39:56 -07:00
parent d3ca5ab0b2
commit fef6a1b74c

View File

@@ -26,7 +26,8 @@ if os.getenv("AOTRITON", "0") == "1":
import io
import logging
import subprocess
import tempfile
import time

import numpy as np
import soundfile as sf
import torch
from flask import Flask, Response, abort, jsonify, request, send_file, stream_with_context
from flask_cors import CORS
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -43,15 +44,62 @@ USE_GRAPHS = os.getenv("HIP_GRAPHS", "1") == "1" and torch.cuda.is_available()
# Lookup table from a requested voice name to the Qwen3-TTS settings used for
# synthesis: speaker, language and an instruct prompt (empty string = none).
# Each row below is (voice name, speaker, language, instruct).
VOICE_MAP = {
    voice: {"speaker": speaker, "language": language, "instruct": instruct}
    for voice, speaker, language, instruct in (
        # ── Standard OpenAI voices ──────────────────────────────────────────
        ("alloy", "Ryan", "English", ""),
        ("echo", "Ryan", "English", "Speak in a calm, measured tone."),
        ("fable", "Ryan", "English", "Speak warmly and expressively."),
        ("onyx", "Ryan", "English", "Speak with a deep, authoritative voice."),
        ("nova", "Vivian", "Chinese", ""),
        ("shimmer", "Vivian", "Chinese", "Speak gently and softly."),
        # ── Kokoro voice aliases (customtts extension) ──────────────────────
        # Kokoro names look like {af|bf|am|bm}_{name}: a/b = American/British,
        # f/m = female/male. Every English Kokoro voice listed here, female or
        # male, is served by Ryan (noted in the original as the only English
        # speaker in the 0.6B model); only the instruct prompt varies per name.
        ("af_bella", "Ryan", "English", "Speak warmly and expressively."),
        ("af_nicole", "Ryan", "English", "Speak in a calm, measured tone."),
        ("af_sarah", "Ryan", "English", ""),
        ("af_sky", "Ryan", "English", "Speak gently and softly."),
        ("bf_emma", "Ryan", "English", "Speak warmly and expressively."),
        ("bf_isabella", "Ryan", "English", "Speak gently and softly."),
        ("am_adam", "Ryan", "English", "Speak with a deep, authoritative voice."),
        ("am_michael", "Ryan", "English", ""),
        ("bm_george", "Ryan", "English", "Speak with a deep, authoritative voice."),
        ("bm_lewis", "Ryan", "English", "Speak in a calm, measured tone."),
    )
}

# Key into VOICE_MAP used as the default voice.
DEFAULT_VOICE = "alloy"
# Fallback table for Kokoro-style names that have no explicit VOICE_MAP entry,
# keyed by name prefix. af_/bf_ = female English, am_/bm_ = male English,
# zf_/zm_ = Chinese. Prefix matches never carry an instruct prompt.
_KOKORO_PREFIXES = {
    **{
        prefix: {"speaker": "Ryan", "language": "English", "instruct": ""}
        for prefix in ("af_", "bf_", "am_", "bm_")
    },
    **{
        prefix: {"speaker": "Vivian", "language": "Chinese", "instruct": ""}
        for prefix in ("zf_", "zm_")
    },
}
def resolve_voice(raw: str) -> dict:
    """Resolve a requested voice string to a Qwen3-TTS speaker config.

    Resolution order:
      1. Exact (case-insensitive) match in ``VOICE_MAP`` — standard names such
         as "alloy" and explicit Kokoro aliases such as "af_bella".
      2. Kokoro prefix heuristic via ``_KOKORO_PREFIXES`` (e.g. "af_").
      3. Fallback to ``VOICE_MAP[DEFAULT_VOICE]`` with a warning.

    Kokoro blends such as "af_bella+bf_emma+af_nicole" are reduced to their
    first component before lookup.

    Args:
        raw: Voice name as supplied by the client. ``None`` and other
            non-string values are tolerated and treated as an unknown voice.

    Returns:
        The config dict with "speaker", "language" and "instruct" keys. This
        is the shared table entry, not a copy, so callers must not mutate it.
    """
    # A request may carry a null or non-string voice; fall back instead of
    # raising AttributeError on .split().
    if not isinstance(raw, str):
        log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
        return VOICE_MAP[DEFAULT_VOICE]

    # Take only the first voice in a + blend.
    name = raw.split("+")[0].strip().lower()

    if name in VOICE_MAP:
        return VOICE_MAP[name]

    # Try the Kokoro prefix heuristic for names without an explicit alias.
    for prefix, info in _KOKORO_PREFIXES.items():
        if name.startswith(prefix):
            log.debug("Kokoro prefix match %r -> %s", name, info["speaker"])
            return info

    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
    return VOICE_MAP[DEFAULT_VOICE]
# ── Load model ─────────────────────────────────────────────────────────────────
if USE_GRAPHS:
from faster_qwen3_tts import FasterQwen3TTS
@@ -163,9 +211,39 @@ def speech():
if not text:
abort(400, description="'input' field is required")
info = VOICE_MAP.get(voice, VOICE_MAP[DEFAULT_VOICE])
info = resolve_voice(voice)
log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])
# Handle PCM streaming
if fmt == "pcm" and USE_GRAPHS:
log.info("Starting PCM streaming synthesis")
t0 = time.monotonic()
try:
chunks = 0
def generator():
nonlocal chunks
for audio_chunk in tts.generate_custom_voice_streaming(
text=text,
language=info["language"],
speaker=info["speaker"],
instruct=info["instruct"] or None,
max_new_tokens=max(60, int(len(text) * 2.5))
):
chunks += 1
# Convert float32 numpy array to int16 PCM
pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
yield pcm_chunk
elapsed = time.monotonic() - t0
log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
except Exception as exc:
log.exception("PCM streaming failed")
abort(500, description=str(exc))
elif fmt == "pcm":
log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
# Fall through to regular MP3 path below
try:
t0 = time.monotonic()
wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
@@ -203,4 +281,4 @@ def json_error(e):
if __name__ == "__main__":
    # Listening port comes from PROXY_PORT (default 5000).
    port = int(os.getenv("PROXY_PORT", "5000"))
    log.info("Starting proxy on port %d", port)
    # app.run() blocks until the server stops, so it is called exactly once.
    # threaded=True is passed so concurrent streaming connections can be
    # served while another response is still being generated.
    app.run(host="0.0.0.0", port=port, debug=False, threaded=True)