feat: add PCM streaming + Kokoro voice name support
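- POST /audio/speech with response_format=pcm now streams raw 16-bit
PCM (24kHz mono) via a Flask generator — compatible with the customtts
extension's streaming mode. Streaming is only used when HIP graphs are
active (HIP_GRAPHS=1 and CUDA available); otherwise the request logs a
warning and falls back to the regular MP3 path.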
- resolve_voice() handles:
* Standard OpenAI names (alloy, echo, ...)
* Kokoro blend syntax: 'af_bella+bf_emma+af_nicole' (picks first)
* Kokoro prefix heuristic: af_/bf_/am_/bm_ → Ryan, zf_/zm_ → Vivian
* Explicit Kokoro aliases for common voices (bella, emma, sky, etc.)
* Graceful fallback to alloy for unknown voices
- app.run(threaded=True) to support concurrent streaming connections
This commit is contained in:
@@ -26,7 +26,8 @@ if os.getenv("AOTRITON", "0") == "1":
|
|||||||
|
|
||||||
import io, time, logging, subprocess, tempfile
|
import io, time, logging, subprocess, tempfile
|
||||||
import torch, soundfile as sf
|
import torch, soundfile as sf
|
||||||
from flask import Flask, request, jsonify, abort, send_file
|
import numpy as np
|
||||||
|
from flask import Flask, Response, request, jsonify, abort, send_file, stream_with_context
|
||||||
from flask_cors import CORS
|
from flask_cors import CORS
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||||
@@ -43,15 +44,62 @@ USE_GRAPHS = os.getenv("HIP_GRAPHS", "1") == "1" and torch.cuda.is_available()
|
|||||||
|
|
||||||
# Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct
|
# Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct
|
||||||
VOICE_MAP = {
|
VOICE_MAP = {
|
||||||
|
# ── Standard OpenAI voices ──────────────────────────────────────────────
|
||||||
"alloy": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
"alloy": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
"echo": {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
|
"echo": {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
|
||||||
"fable": {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
|
"fable": {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
|
||||||
"onyx": {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
|
"onyx": {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
|
||||||
"nova": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
|
"nova": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
|
||||||
"shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
|
"shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
|
||||||
|
# ── Kokoro voice aliases (customtts extension) ──────────────────────────
|
||||||
|
# Kokoro names follow: {af|bf|am|bm}_{name} (a/b=American/British, f/m=female/male)
|
||||||
|
# We map all English voices (female and male) → Ryan (only English speaker in 0.6B model),
|
||||||
|
# Chinese voices → Vivian. Individual names get personality instruct where fitting.
|
||||||
|
"af_bella": {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
|
||||||
|
"af_nicole": {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
|
||||||
|
"af_sarah": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
|
"af_sky": {"speaker": "Ryan", "language": "English", "instruct": "Speak gently and softly."},
|
||||||
|
"bf_emma": {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
|
||||||
|
"bf_isabella":{"speaker": "Ryan", "language": "English", "instruct": "Speak gently and softly."},
|
||||||
|
"am_adam": {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
|
||||||
|
"am_michael":{"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
|
"bm_george": {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
|
||||||
|
"bm_lewis": {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
|
||||||
}
|
}
|
||||||
DEFAULT_VOICE = "alloy"
|
DEFAULT_VOICE = "alloy"
|
||||||
|
|
||||||
|
# Kokoro prefix heuristic for voices not explicitly listed above.
|
||||||
|
# af_/bf_ = female English, am_/bm_ = male English, zf_/zm_ = Chinese
|
||||||
|
_KOKORO_PREFIXES = {
|
||||||
|
"af_": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
|
"bf_": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
|
"am_": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
|
"bm_": {"speaker": "Ryan", "language": "English", "instruct": ""},
|
||||||
|
"zf_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
|
||||||
|
"zm_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
|
||||||
|
}
|
||||||
|
|
||||||
|
def resolve_voice(raw: str) -> dict:
    """Resolve a voice string to a Qwen3-TTS speaker config.

    Handles:
    - Standard names: "alloy", "echo", etc.
    - Kokoro blends: "af_bella+bf_emma+af_nicole" (picks the first
      non-empty component)
    - Kokoro singles: "af_bella"
    - Unknown, empty or non-string values: falls back to DEFAULT_VOICE

    Args:
        raw: Voice name as supplied by the client. Matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        The matching entry from VOICE_MAP or _KOKORO_PREFIXES, a dict with
        "speaker", "language" and "instruct" keys. The entry itself is
        returned (not a copy), so callers should treat it as read-only.
    """
    name = ""
    # A client may send a non-string voice (e.g. JSON null); treat it like an
    # unknown voice rather than raising on .split().
    if isinstance(raw, str):
        # Take only the first non-empty voice in a + blend, so inputs such as
        # "+af_bella" or " + af_bella" still resolve to a real component.
        components = (part.strip().lower() for part in raw.split("+"))
        name = next((part for part in components if part), "")

    if name in VOICE_MAP:
        return VOICE_MAP[name]

    # Try Kokoro prefix heuristic for names not listed explicitly.
    for prefix, info in _KOKORO_PREFIXES.items():
        if name.startswith(prefix):
            log.debug("Kokoro prefix match %r → %s", name, info["speaker"])
            return info

    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
    return VOICE_MAP[DEFAULT_VOICE]
|
||||||
|
|
||||||
# ── Load model ─────────────────────────────────────────────────────────────────
|
# ── Load model ─────────────────────────────────────────────────────────────────
|
||||||
if USE_GRAPHS:
|
if USE_GRAPHS:
|
||||||
from faster_qwen3_tts import FasterQwen3TTS
|
from faster_qwen3_tts import FasterQwen3TTS
|
||||||
@@ -163,9 +211,39 @@ def speech():
|
|||||||
if not text:
|
if not text:
|
||||||
abort(400, description="'input' field is required")
|
abort(400, description="'input' field is required")
|
||||||
|
|
||||||
info = VOICE_MAP.get(voice, VOICE_MAP[DEFAULT_VOICE])
|
info = resolve_voice(voice)
|
||||||
log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])
|
log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])
|
||||||
|
|
||||||
|
# Handle PCM streaming
|
||||||
|
if fmt == "pcm" and USE_GRAPHS:
|
||||||
|
log.info("Starting PCM streaming synthesis")
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
chunks = 0
|
||||||
|
def generator():
|
||||||
|
nonlocal chunks
|
||||||
|
for audio_chunk in tts.generate_custom_voice_streaming(
|
||||||
|
text=text,
|
||||||
|
language=info["language"],
|
||||||
|
speaker=info["speaker"],
|
||||||
|
instruct=info["instruct"] or None,
|
||||||
|
max_new_tokens=max(60, int(len(text) * 2.5))
|
||||||
|
):
|
||||||
|
chunks += 1
|
||||||
|
# Convert float32 numpy array to int16 PCM
|
||||||
|
pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
|
||||||
|
yield pcm_chunk
|
||||||
|
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
|
log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
|
||||||
|
return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
|
||||||
|
except Exception as exc:
|
||||||
|
log.exception("PCM streaming failed")
|
||||||
|
abort(500, description=str(exc))
|
||||||
|
elif fmt == "pcm":
|
||||||
|
log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
|
||||||
|
# Fall through to regular MP3 path below
|
||||||
|
|
||||||
try:
|
try:
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
|
wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
|
||||||
@@ -203,4 +281,4 @@ def json_error(e):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
port = int(os.getenv("PROXY_PORT", "5000"))
|
port = int(os.getenv("PROXY_PORT", "5000"))
|
||||||
log.info("Starting proxy on port %d", port)
|
log.info("Starting proxy on port %d", port)
|
||||||
app.run(host="0.0.0.0", port=port, debug=False)
|
app.run(host="0.0.0.0", port=port, debug=False, threaded=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user