feat: add PCM streaming + Kokoro voice name support

- POST /audio/speech with response_format=pcm now streams raw 16-bit PCM
  (24kHz mono) via a Flask generator — compatible with the customtts
  extension's streaming mode
- resolve_voice() handles:
  * Standard OpenAI names (alloy, echo, ...)
  * Kokoro blend syntax: 'af_bella+bf_emma+af_nicole' (picks first)
  * Kokoro prefix heuristic: af_/bf_/am_/bm_ → Ryan, zf_/zm_ → Vivian
  * Explicit Kokoro aliases for common voices (bella, emma, sky, etc.)
  * Graceful fallback to alloy for unknown voices
- app.run(threaded=True) to support concurrent streaming connections
2026-03-25 21:39:56 -07:00
parent d3ca5ab0b2
commit fef6a1b74c
1 changed files with 81 additions and 3 deletions
--- a/qwen3-proxy/app.py
+++ b/qwen3-proxy/app.py
@@ -26,7 +26,8 @@ if os.getenv("AOTRITON", "0") == "1":
 import io, time, logging, subprocess, tempfile
 import torch, soundfile as sf
-from flask import Flask, request, jsonify, abort, send_file
+import numpy as np
 from flask import Flask, request, jsonify, abort, send_file, stream_with_context
 from flask_cors import CORS
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -43,15 +44,62 @@ USE_GRAPHS  = os.getenv("HIP_GRAPHS", "1") == "1" and torch.cuda.is_available()
 # Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct.
 # Keys are lower-case: resolve_voice() lower-cases the requested name before
 # looking it up here.  An empty "instruct" means no style prompt (the PCM
 # streaming path passes it on as None).
 VOICE_MAP = {
    # ── Standard OpenAI voices ──────────────────────────────────────────────
    "alloy":   {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "echo":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
    "fable":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "onyx":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "nova":    {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
    "shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
    # ── Kokoro voice aliases (customtts extension) ──────────────────────────
    # Kokoro names follow: {af|bf|am|bm}_{name}  (a/b=American/British, f/m=female/male)
    # Every English Kokoro voice listed here, female or male, maps to Ryan
    # (only English speaker in 0.6B model).  Individual names get a
    # personality instruct where fitting.  Chinese Kokoro voices (zf_/zm_)
    # have no explicit entries; they are covered by _KOKORO_PREFIXES.
    "af_bella":  {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "af_nicole": {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
    "af_sarah":  {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "af_sky":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak gently and softly."},
    "bf_emma":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "bf_isabella":{"speaker": "Ryan",  "language": "English", "instruct": "Speak gently and softly."},
    "am_adam":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "am_michael":{"speaker": "Ryan",   "language": "English", "instruct": ""},
    "bm_george": {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "bm_lewis":  {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
 }
 # VOICE_MAP key that resolve_voice() falls back to when a requested voice
 # matches neither an explicit entry nor a Kokoro prefix.
 DEFAULT_VOICE = "alloy"
 # Kokoro prefix heuristic for voices not explicitly listed in VOICE_MAP.
 # af_/bf_ = female English, am_/bm_ = male English, zf_/zm_ = Chinese
 # resolve_voice() checks these with str.startswith() only after the exact
 # VOICE_MAP lookup has failed; prefix matches carry no style instruct.
 _KOKORO_PREFIXES = {
    "af_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "bf_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "am_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "bm_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "zf_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
    "zm_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
 }
 def resolve_voice(raw: str) -> dict:
    """Resolve a client-supplied voice string to a Qwen3-TTS speaker config.

    Resolution order:
      1. Exact match in VOICE_MAP:  "alloy", "echo", "af_bella", etc.
      2. Kokoro prefix heuristic:   any name starting with a key of
         _KOKORO_PREFIXES ("af_", "bm_", "zf_", ...).
      3. Unknown:                   falls back to VOICE_MAP[DEFAULT_VOICE]
         and logs a warning.

    Kokoro blends such as "af_bella+bf_emma+af_nicole" are reduced to their
    first component before lookup.  Matching ignores case and surrounding
    whitespace.

    Args:
        raw: Voice name taken from the request.  A non-string value (for
            example None or a number, should the caller pass the request
            field through unvalidated) is treated as an unknown voice
            instead of raising AttributeError.

    Returns:
        The config dict with "speaker", "language" and "instruct" keys.
        This is the shared module-level dict, not a copy, so callers must
        not mutate it.
    """
    if isinstance(raw, str):
        # Take only the first voice in a + blend
        name = raw.partition("+")[0].strip().lower()
    else:
        # No usable name: the empty string matches neither VOICE_MAP nor any
        # prefix, so control reaches the warning + fallback below.
        name = ""
    if name in VOICE_MAP:
        return VOICE_MAP[name]
    # Try Kokoro prefix heuristic
    for prefix, info in _KOKORO_PREFIXES.items():
        if name.startswith(prefix):
            log.debug("Kokoro prefix match %r → %s", name, info["speaker"])
            return info
    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
    return VOICE_MAP[DEFAULT_VOICE]
 # ── Load model ─────────────────────────────────────────────────────────────────
 if USE_GRAPHS:
    from faster_qwen3_tts import FasterQwen3TTS
@@ -163,9 +211,39 @@ def speech():
    if not text:
        abort(400, description="'input' field is required")
-    info = VOICE_MAP.get(voice, VOICE_MAP[DEFAULT_VOICE])
+    info = resolve_voice(voice)
    log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])
    # Handle PCM streaming
    if fmt == "pcm" and USE_GRAPHS:
        log.info("Starting PCM streaming synthesis")
        t0 = time.monotonic()
        try:
            chunks = 0
            def generator():
                nonlocal chunks
                for audio_chunk in tts.generate_custom_voice_streaming(
                    text=text,
                    language=info["language"],
                    speaker=info["speaker"],
                    instruct=info["instruct"] or None,
                    max_new_tokens=max(60, int(len(text) * 2.5))
                ):
                    chunks += 1
                    # Convert float32 numpy array to int16 PCM
                    pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
                    yield pcm_chunk
            elapsed = time.monotonic() - t0
            log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
            return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
        except Exception as exc:
            log.exception("PCM streaming failed")
            abort(500, description=str(exc))
    elif fmt == "pcm":
        log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
        # Fall through to regular MP3 path below
    try:
        t0 = time.monotonic()
        wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
@@ -203,4 +281,4 @@ def json_error(e):
 # Script entry point: start the Flask server when the file is run directly.
 if __name__ == "__main__":
    # Listening port comes from the PROXY_PORT environment variable (default 5000).
    port = int(os.getenv("PROXY_PORT", "5000"))
    log.info("Starting proxy on port %d", port)
    # threaded=True is added so concurrent streaming connections can be served.
-    app.run(host="0.0.0.0", port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False, threaded=True)