feat: Qwen3-TTS proxy with HIP graph + CPU decoder optimisations

- OpenAI-compatible Flask proxy (POST /audio/speech, GET /models)
- faster-qwen3-tts HIP graph acceleration: GPU LLM at 1.78x RTF
- CPU speech tokenizer decoder: bypasses MIOpen ConvDirectNaiveConvFwd, eliminates 4-40s per-request decode overhead
- attn_implementation=sdpa for transformer attention
- AOTRITON env var toggle (off=short sentences, on=long-form/novel chapters)
- HIP_GRAPHS env var toggle (default on)
- Startup warmup with HIP graph capture (~5s)
- CORS support for browser extension requests
- RTF: 0.9-1.5x on AMD RX 7900 XTX (gfx1100, ROCm 6.3)

Performance vs baseline (CPU-only, ~3 min/sentence):
12c: 3.2s | 44c: 2.7s | 115c: 6.6s
2026-03-25 21:18:42 -07:00
commit d3ca5ab0b2
5 changed files with 627 additions and 0 deletions
--- a/setup_qwen3_readaloud.sh
+++ b/setup_qwen3_readaloud.sh
@@ -0,0 +1,288 @@
+#!/usr/bin/env bash
+set -o errexit -o nounset -o pipefail
+
+# -----------------------------------------------------------------
+# Settings – the defaults suit most installs; change only if needed
+# -----------------------------------------------------------------
+HOME_DIR=${HOME:-/home/oc}
+# Interpreter version tried first when building the virtual‑env
+PYTHON_VERSION=3.12
+# Version tried next when the preferred one is not installed
+FALLBACK_PYTHON_VERSION=3.10
+# Full path of the virtual‑env directory (placed directly under HOME_DIR)
+VENV_DIR=${HOME_DIR}/qwen3tts-venv
+# Model to serve – the 0.6B CustomVoice model is quick to download
+QWEN_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
+DEMO_PORT=8000   # port of the Gradio demo
+PROXY_PORT=5000  # port of the Flask proxy
+PROJECT_ROOT=$(pwd)   # run from the directory that contains Qwen3-TTS and read-aloud
+PROXY_DIR=${PROJECT_ROOT}/qwen3-proxy
+SYSTEMD_USER_DIR=${HOME_DIR}/.config/systemd/user
+
+# -----------------------------------------------------------------
+# Log helpers. printf prints backslashes in a message verbatim
+# -----------------------------------------------------------------
+info(){ printf '\e[32m[INFO]\e[0m %s\n' "$*"; }            # green tag, written to stdout
+error(){ printf '\e[31m[ERROR]\e[0m %s\n' "$*" >&2; }       # red tag, written to stderr
+warning(){ printf '\e[33m[WARNING]\e[0m %s\n' "$*" >&2; }   # yellow tag, written to stderr
+
+# -----------------------------------------------------------------
+# 0️⃣ Helper: ensure we have a recent Python interpreter
+# ---------------------------------------------------------
+detect_python() {
+    # Resolution order: preferred version, fallback version, generic python3.
+    # Only the chosen interpreter name goes to stdout; notices go to stderr.
+    local preferred="python${PYTHON_VERSION}" fallback="python${FALLBACK_PYTHON_VERSION}"
+    if command -v "${preferred}" >/dev/null 2>&1; then
+        echo "${preferred}"; return 0
+    fi
+    if command -v "${fallback}" >/dev/null 2>&1; then
+        warning "Python ${PYTHON_VERSION} not found, using ${FALLBACK_PYTHON_VERSION} as fallback"
+        echo "${fallback}"; return 0
+    fi
+    if command -v python3 >/dev/null 2>&1; then
+        warning "No specific Python version found, using python3 (may not be compatible)"
+        echo "python3"; return 0
+    fi
+    error "No Python interpreter found. Please install Python 3.10 or higher."
+    exit 1
+}
+
+PYTHON_BIN=$(detect_python)
+
+# -----------------------------------------------------------------
+# 1️⃣ Create (or reuse) a virtual‑env and install the Python deps
+# -----------------------------------------------------------------
+if [[ -d "${VENV_DIR}" ]]; then
+    info "Virtual‑env already exists – reusing."
+else
+    info "Creating virtual‑env at ${VENV_DIR}…"
+    if ! "${PYTHON_BIN}" -m venv "${VENV_DIR}"; then
+        error "Failed to create virtual environment. Check Python installation and permissions."
+        exit 1
+    fi
+fi
+
+# Activate the env for the remainder of the script
+source "${VENV_DIR}/bin/activate"
+
+# Upgrade pip (helps with binary wheels)
+info "Upgrading pip…"
+if ! pip install -U pip setuptools wheel; then
+    error "Failed to upgrade pip"
+    exit 1
+fi
+
+# `pip install -U` installs a missing package and upgrades a present one, so
+# `pip show` only picks the progress message; both paths share one error check.
+if pip show qwen-tts >/dev/null 2>&1; then
+    info "qwen-tts already installed, upgrading"
+else
+    info "Installing qwen-tts (Python wrapper)…"
+fi
+if ! pip install -U qwen-tts; then
+    error "Failed to install/upgrade qwen-tts"
+    exit 1
+fi
+
+# -----------------------------------------------------------------
+# 2️⃣ Prepare the Flask proxy source tree
+# ---------------------------------------------------------
+# The proxy directory must exist before any file is written into it
+mkdir -p "${PROXY_DIR}"
+# requirements.txt for the proxy: one dependency per line
+printf '%s\n' \
+    flask \
+    requests > "${PROXY_DIR}/requirements.txt"
+
+# Create app.py for the proxy
+cat > "${PROXY_DIR}/app.py" <<'PY'
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Flask proxy for the Read‑Aloud extension.
+It implements the two endpoints that Read‑Aloud expects and forwards the
+actual synthesis request to a locally‑running Qwen3‑TTS Gradio demo.
+"""
+import os, io, base64
+from flask import Flask, request, jsonify, abort, send_file
+import requests
+
+app = Flask(__name__)
+
+# --------------------------------------------------------------
+# Configuration via environment variables (defaults shown)
+# --------------------------------------------------------------
+GRADIO_URL = os.getenv("GRADIO_URL", "http://127.0.0.1:8000")
+
+# Map the voice name shown in the extension to the internal speaker token
+# that the Gradio demo expects.  Extend this dict if you want more voices.
+SPEAKERS = {
+    "Vivian": {"voice_name": "Qwen3 Vivian", "lang": "zh-CN"},
+    "Ryan":   {"voice_name": "Qwen3 Ryan",   "lang": "en-US"},
+    # Add other speakers from the Qwen3‑TTS README if desired
+}
+
+@app.route("/read-aloud/list-voices/premium")
+def list_voices():
+    # Return a JSON array: [{"voice_name":…, "lang":…}, …]
+    return jsonify(list(SPEAKERS.values()))
+
+@app.route("/read-aloud/speak/<lang>/<voice_name>")
+def speak(lang, voice_name):
+    text = request.args.get("q", "")
+    if not text:
+        abort(400, "missing query parameter 'q'")
+
+    # Find the internal speaker token (case‑insensitive match)
+    internal = None
+    for key, val in SPEAKERS.items():
+        if val["voice_name"].lower() == voice_name.lower():
+            internal = key
+            break
+    if internal is None:
+        abort(404, f"voice '{voice_name}' not known to proxy")
+
+    # Build the payload for the Gradio API – the demo expects:
+    #   [text, language, speaker, instruct]
+    payload = {"data": [text, lang, internal, ""]}
+    try:
+        r = requests.post(f"{GRADIO_URL}/api/predict", json=payload, timeout=120)
+    except Exception as exc:
+        abort(502, f"cannot reach Gradio server: {exc}")
+    if r.status_code != 200:
+        abort(r.status_code, f"Gradio error: {r.text}")
+
+    try:
+        # Gradio returns something like [{"name": "audio.wav", "data": "data:audio/wav;base64,…"}]
+        data = r.json()["data"][0]["data"]
+    except Exception:
+        abort(500, "unexpected Gradio response format")
+
+    # Strip possible data‑URL prefix
+    if data.startswith("data:"):
+        b64 = data.split(",", 1)[1]
+    else:
+        b64 = data
+    wav_bytes = base64.b64decode(b64)
+    return send_file(io.BytesIO(wav_bytes), mimetype="audio/wav", as_attachment=False, download_name="speech.wav")
+
+if __name__ == "__main__":
+    # Flask's built‑in dev server is fine for a local user service
+    app.run(host="0.0.0.0", port=int(os.getenv("PROXY_PORT", "5000")), debug=False)
+PY
+
+# Install the proxy deps inside the venv
+info "Installing Flask proxy dependencies…"
+if ! pip install -r "${PROXY_DIR}/requirements.txt"; then
+    error "Failed to install Flask proxy dependencies"
+    exit 1
+fi
+
+# -----------------------------------------------------------------
+# 3️⃣ Write the systemd user unit files (they will activate the venv)
+# ---------------------------------------------------------
+mkdir -p "${SYSTEMD_USER_DIR}"
+
+# ---- qwen3-tts-demo.service ---------------------------------------
+cat > "${SYSTEMD_USER_DIR}/qwen3-tts-demo.service" <<'EOF'
+[Unit]
+Description=Qwen3‑TTS Gradio demo (CustomVoice model)
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+# Activate the virtual‑env created by the install script
+Environment=VENV_DIR=${HOME_DIR}/qwen3tts-venv
+ExecStart=/bin/bash -c '\
+  source "${VENV_DIR}/bin/activate" && \
+  qwen-tts-demo "${QWEN_MODEL}" \
+    --ip 0.0.0.0 \
+    --port ${DEMO_PORT} \
+    --no-ssl-verify \
+    --share false'
+
+ExecStop=/usr/bin/pkill -f "qwen-tts-demo"
+Restart=on-failure
+RestartSec=5
+StartLimitBurst=5
+StartLimitIntervalSec=60
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
+EOF
+
+# ---- qwen3-tts-proxy.service ---------------------------------------
+cat > "${SYSTEMD_USER_DIR}/qwen3-tts-proxy.service" <<'EOF'
+[Unit]
+Description=Flask proxy translating Read‑Aloud API → Qwen3‑TTS Gradio demo
+After=qwen3-tts-demo.service
+Requires=qwen3-tts-demo.service
+
+# Wait up to ~1 minute for the Gradio demo to become reachable before starting.
+ExecStartPre=/usr/bin/bash -c '\
+  for i in {1..30}; do \
+    if curl -s http://127.0.0.1:${DEMO_PORT}/ >/dev/null 2>&1; then exit 0; fi; \
+    echo "Waiting for Qwen3‑TTS demo … ($i)"; sleep 2; \
+  done; \
+  echo "Qwen3‑TTS demo never became reachable – aborting proxy start." >&2; exit 1'
+
+[Service]
+Environment=VENV_DIR=${HOME_DIR}/qwen3tts-venv
+Environment=PROXY_DIR=${PROJECT_ROOT}/qwen3-proxy
+Environment=PROXY_PORT=${PROXY_PORT}
+Environment=GRADIO_URL=http://127.0.0.1:${DEMO_PORT}
+
+ExecStart=/bin/bash -c '\
+  source "${VENV_DIR}/bin/activate" && \
+  cd "${PROXY_DIR}" && \
+  python app.py --host 0.0.0.0 --port "${PROXY_PORT}"'
+
+ExecStop=/usr/bin/pkill -f "python.*app.py"
+Restart=on-failure
+RestartSec=5
+StartLimitBurst=5
+StartLimitIntervalSec=60
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
+EOF
+
+# -----------------------------------------------------------------
+# 4️⃣ Reload systemd, enable and start the services
+# ---------------------------------------------------------
+# start_unit UNIT START_MSG FAIL_MSG: enable and start one user unit, aborting on failure
+start_unit() {
+    info "$2"
+    systemctl --user enable --now "$1" && return 0
+    error "$3"
+    exit 1
+}
+
+if ! command -v systemctl >/dev/null 2>&1; then
+    warning "systemctl not found. Services not enabled/started automatically. Please enable manually:"
+    warning "  systemctl --user enable --now qwen3-tts-demo.service"
+    warning "  systemctl --user enable --now qwen3-tts-proxy.service"
+else
+    info "Reloading user systemd daemon…"
+    systemctl --user daemon-reload || warning "Failed to reload systemd daemon"
+    start_unit qwen3-tts-demo.service \
+        "Enabling & starting the Qwen3‑TTS demo service…" "Failed to enable/start Qwen3-TTS demo service"
+    start_unit qwen3-tts-proxy.service \
+        "Enabling & starting the Flask proxy service…" "Failed to enable/start Flask proxy service"
+fi
+
+# -----------------------------------------------------------------
+# 5️⃣ Final status report & next steps for the extension
+# ---------------------------------------------------------
+info "Both services should now be active. Verify with:"
+info "  systemctl --user status qwen3-tts-demo.service"
+info "  systemctl --user status qwen3-tts-proxy.service"
+
+info "When configuring the Read‑Aloud extension, set the service URL to:"
+info "  http://127.0.0.1:${PROXY_PORT}"
+
+info "Setup finished. Enjoy Qwen3‑TTS in Read‑Aloud!"