qwen3-tts-ra/qwen3-proxy/app.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""OpenAI-compatible TTS proxy backed by Qwen3-TTS.

Implements the two endpoints that Read-Aloud's OpenAI engine uses:
  GET  /models          — connection test
  POST /audio/speech    — synthesise text → mp3

Set env vars to override defaults:
  QWEN_MODEL   — HuggingFace model id or local path
  PROXY_PORT   — listening port (default 5000)
  DEVICE       — torch device (default: cuda:0 if available, else cpu)
  AOTRITON     — "1" to enable AOTriton flash attention on gfx1100.
                 Faster for long text (>~80 chars, e.g. novel chapters).
                 Slower for short sentences (e.g. read-aloud). Default: 0.
  HIP_GRAPHS   — "1" to use faster-qwen3-tts (HIP/CUDA graph acceleration).
                 Eliminates Python overhead per autoregressive token — 3-4x
                 faster than the standard path. Requires GPU. Default: 1.
"""

import os

# Must be set before the first torch SDPA call (checked lazily, not at import).
if os.getenv("AOTRITON", "0") == "1":
    os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

import io, time, logging, subprocess, tempfile
import torch, soundfile as sf
import numpy as np
from flask import Flask, request, jsonify, abort, send_file, stream_with_context
from flask_cors import CORS

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

app = Flask(__name__)
CORS(app)  # allow requests from browser extensions (chrome-extension:// etc.)

# ── Configuration ──────────────────────────────────────────────────────────────
MODEL_PATH  = os.getenv("QWEN_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
DEVICE      = os.getenv("DEVICE", "cuda:0" if torch.cuda.is_available() else "cpu")
DTYPE       = torch.bfloat16 if torch.cuda.is_available() else torch.float32
USE_GRAPHS  = os.getenv("HIP_GRAPHS", "1") == "1" and torch.cuda.is_available()

# Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct
VOICE_MAP = {
    # ── Standard OpenAI voices ──────────────────────────────────────────────
    "alloy":   {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "echo":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
    "fable":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "onyx":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "nova":    {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
    "shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
    # ── Kokoro voice aliases (customtts extension) ──────────────────────────
    # Kokoro names follow: {af|bf|am|bm}_{name}  (a/b=American/British, f/m=female/male)
    # We map female English → Ryan (only English speaker in 0.6B model),
    # Chinese voices → Vivian.  Individual names get personality instruct where fitting.
    "af_bella":  {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "af_nicole": {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
    "af_sarah":  {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "af_sky":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak gently and softly."},
    "bf_emma":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
    "bf_isabella":{"speaker": "Ryan",  "language": "English", "instruct": "Speak gently and softly."},
    "am_adam":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "am_michael":{"speaker": "Ryan",   "language": "English", "instruct": ""},
    "bm_george": {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
    "bm_lewis":  {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
}
DEFAULT_VOICE = "alloy"

# Kokoro prefix heuristic for voices not explicitly listed above.
# af_/bf_ = female English, am_/bm_ = male English, zf_/zm_ = Chinese
_KOKORO_PREFIXES = {
    "af_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "bf_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "am_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "bm_": {"speaker": "Ryan",   "language": "English", "instruct": ""},
    "zf_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
    "zm_": {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
}

def resolve_voice(raw: str) -> dict:
    """Resolve a voice string to a Qwen3-TTS speaker config.

    Handles:
      - Standard names:  "alloy", "echo", etc.
      - Kokoro blends:   "af_bella+bf_emma+af_nicole"  (picks first component)
      - Kokoro singles:  "af_bella"
      - Unknown:         falls back to DEFAULT_VOICE
    """
    # Take only the first voice in a + blend
    name = raw.split("+")[0].strip().lower()
    if name in VOICE_MAP:
        return VOICE_MAP[name]
    # Try Kokoro prefix heuristic
    for prefix, info in _KOKORO_PREFIXES.items():
        if name.startswith(prefix):
            log.debug("Kokoro prefix match %r → %s", name, info["speaker"])
            return info
    log.warning("Unknown voice %r, falling back to %s", raw, DEFAULT_VOICE)
    return VOICE_MAP[DEFAULT_VOICE]

# ── Load model ─────────────────────────────────────────────────────────────────
if USE_GRAPHS:
    from faster_qwen3_tts import FasterQwen3TTS
    log.info("Loading FasterQwen3TTS (HIP graph mode) %s on %s …", MODEL_PATH, DEVICE)
    tts = FasterQwen3TTS.from_pretrained(MODEL_PATH, device=DEVICE, dtype=DTYPE)

    def _synthesise(text, language, speaker, instruct):
        # Cap audio length proportional to input text length.
        # At 12Hz token rate, ~2.5 tokens per character is a generous ceiling.
        # This prevents stochastic generation from producing absurdly long audio
        # (e.g. "Hello world." generating 16s of audio with default max_new_tokens=2048).
        max_new_tokens = max(60, int(len(text) * 2.5))
        wavs, sr = tts.generate_custom_voice(
            text=text, language=language, speaker=speaker,
            instruct=instruct or None,
            max_new_tokens=max_new_tokens,
        )
        return wavs, sr

    def _synthesise_greedy(text, language, speaker):
        """Deterministic synthesis for warmup — uses tight token budget."""
        max_new_tokens = max(60, int(len(text) * 2.5))
        wavs, sr = tts.generate_custom_voice(
            text=text, language=language, speaker=speaker,
            instruct=None, do_sample=False,
            max_new_tokens=max_new_tokens,
        )
        return wavs, sr

else:
    from qwen_tts import Qwen3TTSModel
    log.info("Loading Qwen3TTSModel (standard mode) %s on %s …", MODEL_PATH, DEVICE)
    tts = Qwen3TTSModel.from_pretrained(
        MODEL_PATH, device_map=DEVICE, dtype=DTYPE, attn_implementation="sdpa",
    )

    def _synthesise(text, language, speaker, instruct):
        wavs, sr = tts.generate_custom_voice(
            text=text, language=language, speaker=speaker, instruct=instruct,
        )
        return wavs, sr

    def _synthesise_greedy(text, language, speaker):
        return _synthesise(text, language, speaker, "")

# ── Patch: run the speech tokenizer decoder on CPU ────────────────────────────
# The 12Hz decoder is pure Conv1d/ConvTranspose1d.  On AMD ROCm, MIOpen's solver
# for these ops falls back to ConvDirectNaiveConvFwd (named "naive" for a reason),
# causing 4-40s of GPU decode time per request.
#
# Moving to CPU sidesteps MIOpen entirely.  The Ryzen's AVX2 path handles these
# small 1D convolutions in <100ms, giving end-to-end RTF > 1.0x on typical text.

def _move_decoder_to_cpu(model_obj):
    try:
        st = model_obj.model.model.speech_tokenizer   # FasterQwen3TTS path
    except AttributeError:
        st = model_obj.model.speech_tokenizer          # Qwen3TTSModel path
    st.model.to("cpu")
    st.device = torch.device("cpu")
    log.info("Speech tokenizer decoder moved to CPU (bypasses MIOpen)")

_move_decoder_to_cpu(tts)
# Use greedy (deterministic) decoding so warmup produces consistent audio lengths
# and MIOpen compiles the exact shapes that common inputs will hit at runtime.
# The 3 texts below produce ~1s, ~4s, and ~6s of audio deterministically.
log.info("Warming up — HIP graph capture …")
_t = time.monotonic()

# One synthesis call captures both HIP graphs (talker + predictor).
# No MIOpen warmup needed — decoder runs on CPU now.
_synthesise_greedy("Hello.", "English", "Ryan")
log.info("Warm-up done in %.1fs — proxy ready.  mode=%s",
         time.monotonic() - _t, "HIP-graphs" if USE_GRAPHS else "standard-sdpa")


# ── Helpers ────────────────────────────────────────────────────────────────────
def wav_to_mp3(wav_bytes: bytes) -> bytes:
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
        tmp_in.write(wav_bytes)
        tmp_in_path = tmp_in.name
    tmp_out_path = tmp_in_path.replace(".wav", ".mp3")
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", tmp_in_path, "-codec:a", "libmp3lame", "-q:a", "4", tmp_out_path],
            check=True, capture_output=True,
        )
        with open(tmp_out_path, "rb") as f:
            return f.read()
    finally:
        os.unlink(tmp_in_path)
        if os.path.exists(tmp_out_path):
            os.unlink(tmp_out_path)


# ── Endpoints ──────────────────────────────────────────────────────────────────
@app.route("/models", methods=["GET"])
def models():
    return jsonify({"object": "list", "data": [{"id": "tts-1", "object": "model"}]})


@app.route("/audio/speech", methods=["POST"])
def speech():
    data  = request.get_json(force=True, silent=True) or {}
    text  = data.get("input", "").strip()
    voice = data.get("voice", DEFAULT_VOICE)
    fmt   = data.get("response_format", "mp3")

    if not text:
        abort(400, description="'input' field is required")

    info = resolve_voice(voice)
    log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])

    # Handle PCM streaming
    if fmt == "pcm" and USE_GRAPHS:
        log.info("Starting PCM streaming synthesis")
        t0 = time.monotonic()
        try:
            chunks = 0
            def generator():
                nonlocal chunks
                for audio_chunk in tts.generate_custom_voice_streaming(
                    text=text,
                    language=info["language"],
                    speaker=info["speaker"],
                    instruct=info["instruct"] or None,
                    max_new_tokens=max(60, int(len(text) * 2.5))
                ):
                    chunks += 1
                    # Convert float32 numpy array to int16 PCM
                    pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
                    yield pcm_chunk

            elapsed = time.monotonic() - t0
            log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
            return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
        except Exception as exc:
            log.exception("PCM streaming failed")
            abort(500, description=str(exc))
    elif fmt == "pcm":
        log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
        # Fall through to regular MP3 path below

    try:
        t0 = time.monotonic()
        wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
        elapsed = time.monotonic() - t0
        audio_s = len(wavs[0]) / sr
        log.info("Synthesis done in %.1fs  audio=%.1fs  RTF=%.2fx",
                 elapsed, audio_s, audio_s / elapsed)
    except Exception as exc:
        log.exception("TTS generation failed")
        abort(500, description=str(exc))

    wav_buf = io.BytesIO()
    sf.write(wav_buf, wavs[0], sr, format="WAV")
    wav_bytes = wav_buf.getvalue()

    if fmt == "mp3":
        audio_bytes = wav_to_mp3(wav_bytes)
        mimetype = "audio/mpeg"
    else:
        audio_bytes = wav_bytes
        mimetype = "audio/wav"

    return send_file(io.BytesIO(audio_bytes), mimetype=mimetype)


# ── Error handlers ─────────────────────────────────────────────────────────────
@app.errorhandler(400)
@app.errorhandler(404)
@app.errorhandler(500)
@app.errorhandler(502)
def json_error(e):
    return jsonify({"error": {"message": str(e), "type": "proxy_error"}}), e.code


if __name__ == "__main__":
    port = int(os.getenv("PROXY_PORT", "5000"))
    log.info("Starting proxy on port %d", port)
    app.run(host="0.0.0.0", port=port, debug=False, threaded=True)