From fef6a1b74c59eaa3590a68b367176bfbf43eab97 Mon Sep 17 00:00:00 2001
From: pi-bot-01
Date: Wed, 25 Mar 2026 21:39:56 -0700
Subject: [PATCH] feat: add PCM streaming + Kokoro voice name support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- POST /audio/speech with response_format=pcm now streams raw 16-bit PCM
  (24kHz mono) via Flask generator — compatible with customtts extension
  streaming mode
- resolve_voice() handles:
  * Standard OpenAI names (alloy, echo, ...)
  * Kokoro blend syntax: 'af_bella+bf_emma+af_nicole' (picks first)
  * Kokoro prefix heuristic: af_/bf_/am_/bm_ → Ryan, zf_/zm_ → Vivian
  * Explicit Kokoro aliases for common voices (bella, emma, sky, etc.)
  * Graceful fallback to alloy for unknown voices
- app.run(threaded=True) to support concurrent streaming connections
---
 qwen3-proxy/app.py | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 3 deletions(-)

diff --git a/qwen3-proxy/app.py b/qwen3-proxy/app.py
index f13af45..2114ef9 100644
--- a/qwen3-proxy/app.py
+++ b/qwen3-proxy/app.py
@@ -26,7 +26,8 @@ if os.getenv("AOTRITON", "0") == "1":
 
 import io, time, logging, subprocess, tempfile
 import torch, soundfile as sf
-from flask import Flask, request, jsonify, abort, send_file
+import numpy as np
+from flask import Flask, Response, request, jsonify, abort, send_file, stream_with_context
 from flask_cors import CORS
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -43,15 +44,62 @@ USE_GRAPHS = os.getenv("HIP_GRAPHS", "1") == "1" and torch.cuda.is_available()
 
 # Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct
 VOICE_MAP = {
+    # ── Standard OpenAI voices ──────────────────────────────────────────────
     "alloy":   {"speaker": "Ryan", "language": "English", "instruct": ""},
     "echo":    {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
     "fable":   {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
     "onyx":    {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
     "nova":    {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
     "shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
+    # ── Kokoro voice aliases (customtts extension) ──────────────────────────
+    # Kokoro names follow: {af|bf|am|bm}_{name} (a/b=American/British, f/m=female/male)
+    # We map all English voices → Ryan (only English speaker in 0.6B model),
+    # Chinese voices → Vivian. Individual names get personality instruct where fitting.
+    "af_bella":  {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
+    "af_nicole": {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
+    "af_sarah":  {"speaker": "Ryan", "language": "English", "instruct": ""},
+    "af_sky":    {"speaker": "Ryan", "language": "English", "instruct": "Speak gently and softly."},
+    "bf_emma":   {"speaker": "Ryan", "language": "English", "instruct": "Speak warmly and expressively."},
+    "bf_isabella":{"speaker": "Ryan", "language": "English", "instruct": "Speak gently and softly."},
+    "am_adam":   {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
+    "am_michael":{"speaker": "Ryan", "language": "English", "instruct": ""},
+    "bm_george": {"speaker": "Ryan", "language": "English", "instruct": "Speak with a deep, authoritative voice."},
+    "bm_lewis":  {"speaker": "Ryan", "language": "English", "instruct": "Speak in a calm, measured tone."},
 }
 DEFAULT_VOICE = "alloy"
 
+# Kokoro prefix heuristic for voices not explicitly listed above.
+# af_/bf_ = female English, am_/bm_ = male English, zf_/zm_ = Chinese
+_KOKORO_PREFIXES = {
+    "af_": {"speaker": "Ryan", "language": "English", "instruct": ""},
+    "bf_": {"speaker": "Ryan", "language": "English", "instruct": ""},
+    "am_": {"speaker": "Ryan", "language": "English", "instruct": ""},
+    "bm_": {"speaker": "Ryan", "language": "English", "instruct": ""},
+    "zf_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
+    "zm_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
+}
+
+def resolve_voice(raw: str) -> dict:
+    """Resolve a voice string to a Qwen3-TTS speaker config.
+
+    Handles:
+      - Standard names: "alloy", "echo", etc.
+      - Kokoro blends: "af_bella+bf_emma+af_nicole" (picks first component)
+      - Kokoro singles: "af_bella"
+      - Unknown: falls back to DEFAULT_VOICE
+    """
+    # Take only the first voice in a + blend; non-strings (e.g. JSON null) fall back
+    name = raw.split("+")[0].strip().lower() if isinstance(raw, str) else ""
+    if name in VOICE_MAP:
+        return VOICE_MAP[name]
+    # Try Kokoro prefix heuristic
+    for prefix, info in _KOKORO_PREFIXES.items():
+        if name.startswith(prefix):
+            log.debug("Kokoro prefix match %r → %s", name, info["speaker"])
+            return info
+    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
+    return VOICE_MAP[DEFAULT_VOICE]
+
 # ── Load model ─────────────────────────────────────────────────────────────────
 if USE_GRAPHS:
     from faster_qwen3_tts import FasterQwen3TTS
@@ -163,9 +211,46 @@ def speech():
     if not text:
         abort(400, description="'input' field is required")
 
-    info = VOICE_MAP.get(voice, VOICE_MAP[DEFAULT_VOICE])
+    info = resolve_voice(voice)
     log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])
 
+    # Handle PCM streaming
+    if fmt == "pcm" and USE_GRAPHS:
+        log.info("Starting PCM streaming synthesis")
+        t0 = time.monotonic()
+
+        def generator():
+            # Runs lazily while Flask sends the response, so timing, chunk
+            # counting and error logging must live here, not in speech().
+            chunks = 0
+            try:
+                for audio_chunk in tts.generate_custom_voice_streaming(
+                    text=text,
+                    language=info["language"],
+                    speaker=info["speaker"],
+                    instruct=info["instruct"] or None,
+                    max_new_tokens=max(60, int(len(text) * 2.5)),
+                ):
+                    # NOTE(review): the streaming API may yield
+                    # (audio, sample_rate, timing) tuples — confirm; unwrap if so.
+                    if isinstance(audio_chunk, tuple):
+                        audio_chunk = audio_chunk[0]
+                    chunks += 1
+                    # Convert float32 numpy array to int16 PCM
+                    yield (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
+            except Exception:
+                # Headers are already sent, so a 500 is no longer possible;
+                # log and re-raise so the failure is not silent.
+                log.exception("PCM streaming failed")
+                raise
+            elapsed = time.monotonic() - t0
+            log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
+
+        return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
+    elif fmt == "pcm":
+        log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
+        # Fall through to regular MP3 path below
+
     try:
         t0 = time.monotonic()
         wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
@@ -203,4 +288,4 @@ def json_error(e):
 if __name__ == "__main__":
     port = int(os.getenv("PROXY_PORT", "5000"))
     log.info("Starting proxy on port %d", port)
-    app.run(host="0.0.0.0", port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False, threaded=True)