feat: add PCM streaming + Kokoro voice name support

- POST /audio/speech with response_format=pcm now streams raw 16-bit PCM
  (24kHz mono) via a Flask generator — compatible with the customtts
  extension's streaming mode
- resolve_voice() handles:
  * Standard OpenAI names (alloy, echo, ...)
  * Kokoro blend syntax: 'af_bella+bf_emma+af_nicole' (picks first)
  * Kokoro prefix heuristic: af_/bf_/am_/bm_ → Ryan, zf_/zm_ → Vivian
  * Explicit Kokoro aliases for common voices (bella, emma, sky, etc.)
  * Graceful fallback to alloy for unknown voices
- app.run(threaded=True) to support concurrent streaming connections
2026-03-25 21:39:56 -07:00
parent d3ca5ab0b2
commit fef6a1b74c
1 changed files with 81 additions and 3 deletions
--- a/qwen3-proxy/app.py
+++ b/qwen3-proxy/app.py
@@ -26,7 +26,8 @@ if os.getenv("AOTRITON", "0") == "1":

 import io, time, logging, subprocess, tempfile
 import torch, soundfile as sf
-from flask import Flask, request, jsonify, abort, send_file
+import numpy as np
+from flask import Flask, request, jsonify, abort, send_file, stream_with_context
 from flask_cors import CORS

 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -43,15 +44,62 @@ USE_GRAPHS  = os.getenv("HIP_GRAPHS", "1") == "1" and torch.cuda.is_available()

 # Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct
 VOICE_MAP = {
+    # ── Standard OpenAI voices ──────────────────────────────────────────────
    "alloy":   {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "echo":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
    "fable":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "onyx":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "nova":    {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
    "shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
+    # ── Kokoro voice aliases (customtts extension) ──────────────────────────
+    # Kokoro names follow: {af|bf|am|bm}_{name}  (a/b=American/British, f/m=female/male)
+    # All English Kokoro voices, female and male, map to Ryan (only English speaker in 0.6B model).
+    # Individual names get personality instruct where fitting; zf_/zm_ (Chinese) names → Vivian via _KOKORO_PREFIXES.
+    "af_bella":  {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
+    "af_nicole": {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
+    "af_sarah":  {"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "af_sky":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak gently and softly."},
+    "bf_emma":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
+    "bf_isabella":{"speaker": "Ryan",  "language": "English", "instruct": "Speak gently and softly."},
+    "am_adam":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
+    "am_michael":{"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "bm_george": {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
+    "bm_lewis":  {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
 }
 DEFAULT_VOICE = "alloy"

+# Kokoro prefix heuristic, consulted by resolve_voice() for names not explicitly listed in VOICE_MAP.
+# af_/bf_ = female English, am_/bm_ = male English (all → Ryan, no instruct), zf_/zm_ = Chinese (→ Vivian)
+_KOKORO_PREFIXES = {
+    "af_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "bf_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "am_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "bm_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "zf_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
+    "zm_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
+}
+
+def resolve_voice(raw: str) -> dict:
+    """Resolve a voice string to a Qwen3-TTS speaker config.
+
+    Handles:
+      - Standard names:  "alloy", "echo", etc.
+      - Kokoro blends:   "af_bella+bf_emma+af_nicole"  (picks first component)
+      - Kokoro singles:  "af_bella"
+      - Unknown:         falls back to DEFAULT_VOICE
+    """
+    # Take only the first voice in a + blend
+    name = raw.split("+")[0].strip().lower()
+    if name in VOICE_MAP:
+        return VOICE_MAP[name]
+    # Try Kokoro prefix heuristic
+    for prefix, info in _KOKORO_PREFIXES.items():
+        if name.startswith(prefix):
+            log.debug("Kokoro prefix match %r → %s", name, info["speaker"])
+            return info
+    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
+    return VOICE_MAP[DEFAULT_VOICE]
+
 # ── Load model ─────────────────────────────────────────────────────────────────
 if USE_GRAPHS:
    from faster_qwen3_tts import FasterQwen3TTS
@@ -163,9 +211,39 @@ def speech():
    if not text:
        abort(400, description="'input' field is required")

-    info = VOICE_MAP.get(voice, VOICE_MAP[DEFAULT_VOICE])
+    info = resolve_voice(voice)
    log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])

+    # Handle PCM streaming
+    if fmt == "pcm" and USE_GRAPHS:
+        log.info("Starting PCM streaming synthesis")
+        t0 = time.monotonic()
+        try:
+            chunks = 0
+            def generator():
+                nonlocal chunks
+                for audio_chunk in tts.generate_custom_voice_streaming(
+                    text=text,
+                    language=info["language"],
+                    speaker=info["speaker"],
+                    instruct=info["instruct"] or None,
+                    max_new_tokens=max(60, int(len(text) * 2.5))
+                ):
+                    chunks += 1
+                    # Convert float32 numpy array to int16 PCM
+                    pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
+                    yield pcm_chunk
+            
+            elapsed = time.monotonic() - t0
+            log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
+            return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
+        except Exception as exc:
+            log.exception("PCM streaming failed")
+            abort(500, description=str(exc))
+    elif fmt == "pcm":
+        log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
+        # Fall through to regular MP3 path below
+
    try:
        t0 = time.monotonic()
        wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
@@ -203,4 +281,4 @@ def json_error(e):
 if __name__ == "__main__":
    port = int(os.getenv("PROXY_PORT", "5000"))
    log.info("Starting proxy on port %d", port)
-    app.run(host="0.0.0.0", port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False, threaded=True)