Files
qwen3-tts-ra/qwen3-proxy/app.py
pi-bot-01 fef6a1b74c feat: add PCM streaming + Kokoro voice name support
- POST /audio/speech with response_format=pcm now streams raw 16-bit
  PCM (24kHz mono) via Flask generator — compatible with customtts
  extension streaming mode
- resolve_voice() handles:
    * Standard OpenAI names (alloy, echo, ...)
    * Kokoro blend syntax: 'af_bella+bf_emma+af_nicole' (picks first)
    * Kokoro prefix heuristic: af_/bf_/am_/bm_ → Ryan, zf_/zm_ → Vivian
    * Explicit Kokoro aliases for common voices (bella, emma, sky, etc.)
    * Graceful fallback to alloy for unknown voices
- app.run(threaded=True) to support concurrent streaming connections
2026-03-25 21:39:56 -07:00

285 lines
14 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""OpenAI-compatible TTS proxy backed by Qwen3-TTS.
Implements the two endpoints that Read-Aloud's OpenAI engine uses:
GET /models — connection test
POST /audio/speech — synthesise text → mp3 (default), wav, or raw 16-bit PCM (response_format=pcm)
Set env vars to override defaults:
QWEN_MODEL — HuggingFace model id or local path
PROXY_PORT — listening port (default 5000)
DEVICE — torch device (default: cuda:0 if available, else cpu)
AOTRITON — "1" to enable AOTriton flash attention on gfx1100.
Faster for long text (>~80 chars, e.g. novel chapters).
Slower for short sentences (e.g. read-aloud). Default: 0.
HIP_GRAPHS — "1" to use faster-qwen3-tts (HIP/CUDA graph acceleration).
Eliminates Python overhead per autoregressive token — 3-4x
faster than the standard path. Requires GPU. Default: 1.
"""
import os
# Opt-in AOTriton flash attention: AOTRITON=1 exports the torch switch below.
# torch reads the variable lazily (at the first SDPA call, not at import), so it
# only has to be in the environment before the first synthesis runs.
if os.environ.get("AOTRITON", "0") == "1":
    os.environ.update({"TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"})
import io, time, logging, subprocess, tempfile
import torch, soundfile as sf
import numpy as np
from flask import Flask, request, jsonify, abort, send_file, stream_with_context, Response
from flask_cors import CORS
# Root logging: INFO and above, each record prefixed with timestamp and level.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
log = logging.getLogger(__name__)

# The Flask application object. CORS is enabled for every route so that
# browser extensions (chrome-extension:// origins etc.) may call the proxy.
app = Flask(__name__)
CORS(app)
# ── Configuration ──────────────────────────────────────────────────────────────
# All values are read once, at import time, from the environment.
MODEL_PATH = os.getenv("QWEN_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Probe CUDA/ROCm availability once and reuse the answer below.
_CUDA_AVAILABLE = torch.cuda.is_available()
DEVICE = os.getenv("DEVICE", "cuda:0" if _CUDA_AVAILABLE else "cpu")

# GPU-only settings must follow the device that is actually selected: an
# explicit DEVICE=cpu on a GPU host must not pick bfloat16 or the graph path.
# Any non-"cpu" DEVICE value keeps the previous behaviour.
_ON_GPU = _CUDA_AVAILABLE and not DEVICE.strip().lower().startswith("cpu")
DTYPE = torch.bfloat16 if _ON_GPU else torch.float32
USE_GRAPHS = os.getenv("HIP_GRAPHS", "1") == "1" and _ON_GPU
# Voice table: requested voice name → Qwen3-TTS speaker, language and an
# optional free-text style instruction ("" means no instruction).
_CALM = "Speak in a calm, measured tone."
_WARM = "Speak warmly and expressively."
_DEEP = "Speak with a deep, authoritative voice."
_SOFT = "Speak gently and softly."


def _ryan(instruct=""):
    """Build an English entry voiced by the "Ryan" speaker."""
    return {"speaker": "Ryan", "language": "English", "instruct": instruct}


def _vivian(instruct=""):
    """Build a Chinese entry voiced by the "Vivian" speaker."""
    return {"speaker": "Vivian", "language": "Chinese", "instruct": instruct}


VOICE_MAP = {
    # ── Standard OpenAI voices ──────────────────────────────────────────────
    "alloy": _ryan(),
    "echo": _ryan(_CALM),
    "fable": _ryan(_WARM),
    "onyx": _ryan(_DEEP),
    "nova": _vivian(),
    "shimmer": _vivian(_SOFT),
    # ── Kokoro voice aliases (customtts extension) ──────────────────────────
    # Kokoro names follow {af|bf|am|bm}_{name}: a/b = American/British,
    # f/m = female/male. Every English alias below is voiced by Ryan (noted by
    # the original author as the only English speaker in the 0.6B model);
    # individual names get a personality instruction where fitting.
    "af_bella": _ryan(_WARM),
    "af_nicole": _ryan(_CALM),
    "af_sarah": _ryan(),
    "af_sky": _ryan(_SOFT),
    "bf_emma": _ryan(_WARM),
    "bf_isabella": _ryan(_SOFT),
    "am_adam": _ryan(_DEEP),
    "am_michael": _ryan(),
    "bm_george": _ryan(_DEEP),
    "bm_lewis": _ryan(_CALM),
}

# Voice used when a request omits the voice or names an unknown one.
DEFAULT_VOICE = "alloy"
# Fallback for Kokoro-style names that have no explicit voice-table entry:
# the prefix alone picks the speaker and language, with no style instruction.
# af_/bf_ = female English, am_/bm_ = male English, zf_/zm_ = Chinese.
_KOKORO_PREFIXES = {
    **{
        prefix: {"speaker": "Ryan", "language": "English", "instruct": ""}
        for prefix in ("af_", "bf_", "am_", "bm_")
    },
    **{
        prefix: {"speaker": "Vivian", "language": "Chinese", "instruct": ""}
        for prefix in ("zf_", "zm_")
    },
}
def resolve_voice(raw: str) -> dict:
    """Resolve a requested voice string to a Qwen3-TTS speaker config.

    Resolution order:
      1. Exact VOICE_MAP entry ("alloy", "af_bella", ...), matched after
         trimming whitespace and lower-casing.
      2. Kokoro prefix heuristic ("af_", "zm_", ...) via _KOKORO_PREFIXES.
      3. DEFAULT_VOICE, with a warning.

    For a Kokoro blend such as "af_bella+bf_emma+af_nicole" only the first
    component is considered.

    ``raw`` comes straight from the request JSON, so it may be ``None`` or a
    non-string; such values fall back to DEFAULT_VOICE instead of raising.

    Returns the matching dict with keys "speaker", "language", "instruct".
    The dict is the shared table entry, not a copy — callers must not mutate it.
    """
    name = ""
    if isinstance(raw, str):
        # Take only the first voice in a + blend.
        name = raw.split("+")[0].strip().lower()

    if name in VOICE_MAP:
        return VOICE_MAP[name]

    # Try the Kokoro prefix heuristic for names not listed explicitly.
    for prefix, info in _KOKORO_PREFIXES.items():
        if name.startswith(prefix):
            log.debug("Kokoro prefix match %r → %s", name, info["speaker"])
            return info

    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
    return VOICE_MAP[DEFAULT_VOICE]
# ── Load model ─────────────────────────────────────────────────────────────────
# Exactly one backend is loaded at import time. Both branches bind the same
# three module-level names: tts, _synthesise and _synthesise_greedy.
if USE_GRAPHS:
    from faster_qwen3_tts import FasterQwen3TTS

    log.info("Loading FasterQwen3TTS (HIP graph mode) %s on %s", MODEL_PATH, DEVICE)
    tts = FasterQwen3TTS.from_pretrained(MODEL_PATH, device=DEVICE, dtype=DTYPE)

    def _token_budget(text):
        """Return the max_new_tokens cap for ``text``.

        The cap scales with input length: at the 12Hz token rate ~2.5 tokens
        per character is a generous ceiling, with a floor of 60 tokens. It
        stops stochastic generation from producing absurdly long audio (e.g.
        "Hello world." generating 16s with the default max_new_tokens=2048).
        """
        return max(60, int(len(text) * 2.5))

    def _synthesise(text, language, speaker, instruct):
        """Generate speech for ``text``; returns ``(wavs, sample_rate)``.

        An empty ``instruct`` is passed to the model as ``None``.
        """
        audio, rate = tts.generate_custom_voice(
            text=text,
            language=language,
            speaker=speaker,
            instruct=instruct or None,
            max_new_tokens=_token_budget(text),
        )
        return audio, rate

    def _synthesise_greedy(text, language, speaker):
        """Deterministic synthesis for warmup — uses tight token budget."""
        audio, rate = tts.generate_custom_voice(
            text=text,
            language=language,
            speaker=speaker,
            instruct=None,
            do_sample=False,
            max_new_tokens=_token_budget(text),
        )
        return audio, rate

else:
    from qwen_tts import Qwen3TTSModel

    log.info("Loading Qwen3TTSModel (standard mode) %s on %s", MODEL_PATH, DEVICE)
    tts = Qwen3TTSModel.from_pretrained(
        MODEL_PATH,
        device_map=DEVICE,
        dtype=DTYPE,
        attn_implementation="sdpa",
    )

    def _synthesise(text, language, speaker, instruct):
        """Generate speech for ``text``; returns ``(wavs, sample_rate)``.

        Unlike the graph path, ``instruct`` is forwarded unchanged and no
        max_new_tokens cap is supplied.
        """
        audio, rate = tts.generate_custom_voice(
            text=text,
            language=language,
            speaker=speaker,
            instruct=instruct,
        )
        return audio, rate

    def _synthesise_greedy(text, language, speaker):
        """Warm-up entry point for the standard path.

        Simply calls _synthesise with an empty instruction; it does not pass
        do_sample=False, so sampling settings are whatever the model defaults to.
        """
        return _synthesise(text, language, speaker, "")
# ── Patch: run the speech tokenizer decoder on CPU ────────────────────────────
# The 12Hz decoder is pure Conv1d/ConvTranspose1d. On AMD ROCm, MIOpen's solver
# for these ops falls back to ConvDirectNaiveConvFwd (named "naive" for a reason),
# causing 4-40s of GPU decode time per request.
#
# Moving to CPU sidesteps MIOpen entirely. The Ryzen's AVX2 path handles these
# small 1D convolutions in <100ms, giving end-to-end RTF > 1.0x on typical text.
def _move_decoder_to_cpu(model_obj):
try:
st = model_obj.model.model.speech_tokenizer # FasterQwen3TTS path
except AttributeError:
st = model_obj.model.speech_tokenizer # Qwen3TTSModel path
st.model.to("cpu")
st.device = torch.device("cpu")
log.info("Speech tokenizer decoder moved to CPU (bypasses MIOpen)")
# Apply the CPU-decoder patch to the freshly loaded model before any synthesis.
_move_decoder_to_cpu(tts)
# Warm-up: run one short synthesis at import time so that one-off start-up work
# happens here rather than on the first client request. The elapsed time is
# logged together with the active mode.
# _synthesise_greedy is used for this; in HIP-graph mode it passes
# do_sample=False (deterministic decoding).
# NOTE(review): earlier wording here described three warm-up texts and MIOpen
# shape compilation; only the single "Hello." call below remains.
# NOTE(review): the first message mentions HIP graph capture even when
# USE_GRAPHS is false (standard mode) — confirm whether that is intended.
log.info("Warming up — HIP graph capture …")
_t = time.monotonic()
# One synthesis call captures both HIP graphs (talker + predictor).
# No MIOpen warmup needed — decoder runs on CPU now.
_synthesise_greedy("Hello.", "English", "Ryan")
log.info("Warm-up done in %.1fs — proxy ready. mode=%s",
time.monotonic() - _t, "HIP-graphs" if USE_GRAPHS else "standard-sdpa")
# ── Helpers ────────────────────────────────────────────────────────────────────
def wav_to_mp3(wav_bytes: bytes) -> bytes:
    """Transcode WAV data to MP3 by shelling out to ``ffmpeg``.

    The input is written to a file in a private temporary directory, ffmpeg
    encodes it with libmp3lame at VBR quality 4 (``-q:a 4``), and the encoded
    file is read back. The directory and both files are removed on every exit
    path, including failures.

    Args:
        wav_bytes: a complete WAV file as bytes.

    Returns:
        The MP3-encoded audio as bytes.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status
            (its captured stderr is available on the exception).
        OSError: ffmpeg could not be started or a temp file could not be used.
    """
    # A dedicated directory gives fixed, collision-free file names, so the
    # output path is never derived by string substitution on the input path.
    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, "input.wav")
        mp3_path = os.path.join(workdir, "output.mp3")
        with open(wav_path, "wb") as wav_file:
            wav_file.write(wav_bytes)
        subprocess.run(
            ["ffmpeg", "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-q:a", "4", mp3_path],
            check=True, capture_output=True,
        )
        with open(mp3_path, "rb") as mp3_file:
            return mp3_file.read()
# ── Endpoints ──────────────────────────────────────────────────────────────────
@app.route("/models", methods=["GET"])
def models():
    """Connection-test endpoint: return a fixed one-entry model list as JSON."""
    catalogue = [{"id": "tts-1", "object": "model"}]
    return jsonify({"object": "list", "data": catalogue})
def _pcm16_bytes(audio) -> bytes:
    """Convert float samples to raw signed 16-bit PCM bytes.

    Samples are clipped to [-1.0, 1.0] and scaled by 32767 before the cast.
    """
    return (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()


def _stream_pcm(text, info):
    """Yield raw 16-bit PCM chunks for ``text`` as the model produces them.

    ``info`` is a voice config dict with "language", "speaker", "instruct".
    The generator body only runs while the response is being sent, so a failure
    here can no longer become a clean HTTP 500; it is logged and re-raised.
    """
    started = time.monotonic()
    chunks = 0
    try:
        for item in tts.generate_custom_voice_streaming(
            text=text,
            language=info["language"],
            speaker=info["speaker"],
            instruct=info["instruct"] or None,
            max_new_tokens=max(60, int(len(text) * 2.5)),
        ):
            # NOTE(review): faster-qwen3-tts streaming generators may yield
            # (audio, sample_rate, timing) tuples rather than bare arrays —
            # confirm against the installed version. Both shapes are accepted.
            audio_chunk = item[0] if isinstance(item, tuple) else item
            chunks += 1
            yield _pcm16_bytes(audio_chunk)
    except Exception:
        log.exception("PCM streaming failed after %d chunks", chunks)
        raise
    log.info("PCM streaming completed in %.1fs with %d chunks",
             time.monotonic() - started, chunks)


@app.route("/audio/speech", methods=["POST"])
def speech():
    """Synthesise the request's ``input`` text and return it as audio.

    JSON body fields:
      input            text to speak (required, non-empty string)
      voice            voice name, resolved by resolve_voice (default DEFAULT_VOICE)
      response_format  "mp3" (default), "pcm", or anything else for WAV

    Responses:
      "pcm" with USE_GRAPHS     streamed raw 16-bit PCM, mimetype audio/pcm
      "pcm" without USE_GRAPHS  the same PCM format, sent in one piece after
                                synthesis completes
      "mp3"                     audio/mpeg, transcoded by wav_to_mp3
      any other value           audio/wav
      400                       ``input`` missing, empty or not a string
      500                       synthesis raised an exception

    NOTE(review): the module-level model is shared by all request threads;
    whether concurrent synthesis is safe is not established here.
    """
    payload = request.get_json(force=True, silent=True)
    # A body that is not a JSON object (list, number, ...) is treated as empty.
    data = payload if isinstance(payload, dict) else {}

    raw_text = data.get("input", "")
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    voice = data.get("voice", DEFAULT_VOICE)
    if not isinstance(voice, str):
        # e.g. JSON null — fall back rather than fail while resolving the voice.
        voice = DEFAULT_VOICE
    fmt = data.get("response_format", "mp3")
    if not text:
        abort(400, description="'input' field is required")

    info = resolve_voice(voice)
    log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])

    pcm_headers = {"Cache-Control": "no-cache"}

    # Streaming PCM is only available on the graph-accelerated backend.
    if fmt == "pcm" and USE_GRAPHS:
        log.info("Starting PCM streaming synthesis")
        return Response(
            stream_with_context(_stream_pcm(text, info)),
            mimetype="audio/pcm",
            headers=pcm_headers,
        )
    if fmt == "pcm":
        log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. "
                    "Returning non-streamed PCM instead.")

    try:
        t0 = time.monotonic()
        wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
        elapsed = time.monotonic() - t0
        audio_s = len(wavs[0]) / sr
        log.info("Synthesis done in %.1fs audio=%.1fs RTF=%.2fx",
                 elapsed, audio_s, audio_s / elapsed)
    except Exception as exc:
        log.exception("TTS generation failed")
        abort(500, description=str(exc))

    if fmt == "pcm":
        # Honour the requested format even without streaming: same sample
        # encoding as the streamed path, delivered as a single body.
        return Response(_pcm16_bytes(wavs[0]), mimetype="audio/pcm", headers=pcm_headers)

    wav_buf = io.BytesIO()
    sf.write(wav_buf, wavs[0], sr, format="WAV")
    wav_bytes = wav_buf.getvalue()
    if fmt == "mp3":
        audio_bytes = wav_to_mp3(wav_bytes)
        mimetype = "audio/mpeg"
    else:
        audio_bytes = wav_bytes
        mimetype = "audio/wav"
    return send_file(io.BytesIO(audio_bytes), mimetype=mimetype)
# ── Error handlers ─────────────────────────────────────────────────────────────
def json_error(e):
    """Render an HTTP error as a JSON body, keeping the error's status code.

    The body has the shape {"error": {"message": <str(e)>, "type": "proxy_error"}}.
    """
    payload = {"error": {"message": str(e), "type": "proxy_error"}}
    return jsonify(payload), e.code


# One shared handler for every status code the proxy reports as JSON.
for _status in (400, 404, 500, 502):
    app.register_error_handler(_status, json_error)
if __name__ == "__main__":
    # Script entry point: listen on all interfaces at PROXY_PORT (default 5000).
    # threaded=True lets the development server handle requests concurrently.
    port = int(os.environ.get("PROXY_PORT", "5000"))
    log.info("Starting proxy on port %d", port)
    app.run(debug=False, threaded=True, host="0.0.0.0", port=port)