From d3ca5ab0b29dcb30a53f8e2406208a1de0711192 Mon Sep 17 00:00:00 2001
From: pi-bot-01 <pi-bot-01@dominat.us>
Date: Wed, 25 Mar 2026 21:18:42 -0700
Subject: [PATCH] feat: Qwen3-TTS proxy with HIP graph + CPU decoder
 optimisations

- OpenAI-compatible Flask proxy (POST /audio/speech, GET /models)
- faster-qwen3-tts HIP graph acceleration: GPU LLM at 1.78x RTF
- CPU speech tokenizer decoder: bypasses MIOpen ConvDirectNaiveConvFwd,
  eliminates 4-40s per-request decode overhead
- attn_implementation=sdpa for transformer attention
- AOTRITON env var toggle (off=short sentences, on=long-form/novel chapters)
- HIP_GRAPHS env var toggle (default on)
- Startup warmup with HIP graph capture (~5s)
- CORS support for browser extension requests
- RTF: 0.9-1.5x on AMD RX 7900 XTX (gfx1100, ROCm 6.3)

Performance vs baseline (CPU-only, ~3 min/sentence):
  12c: 3.2s | 44c: 2.7s | 115c: 6.6s
---
 .gitignore                   |  49 ++++++
 README.md                    |  82 ++++++++++
 qwen3-proxy/app.py           | 206 +++++++++++++++++++++++++
 qwen3-proxy/requirements.txt |   2 +
 setup_qwen3_readaloud.sh     | 288 +++++++++++++++++++++++++++++++++++
 5 files changed, 627 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 qwen3-proxy/app.py
 create mode 100644 qwen3-proxy/requirements.txt
 create mode 100755 setup_qwen3_readaloud.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4b19d32
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,49 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+.Python
+*.egg-info/
+dist/
+build/
+*.egg
+.eggs/
+
+# Virtual envs
+venv/
+.venv/
+env/
+*.venv
+
+# Model weights / audio output
+*.wav
+*.mp3
+*.bin
+*.safetensors
+*.pt
+*.pth
+
+# HuggingFace cache
+.cache/
+
+# Test artifacts
+test_output.*
+test_simple.py
+
+# OS
+.DS_Store
+Thumbs.db
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Submodule source trees (large, checked out separately)
+Qwen3-TTS/
+read-aloud/
+
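+# Systemd units are user-specific, generated by setup script. NOTE: gitignore does not expand shell variables, so the pattern below only matches a directory literally named ${HOME_DIR}.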
+${HOME_DIR}/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b96e23f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,82 @@
+# qwen3-tts-ra
+
+Qwen3-TTS with Read-Aloud browser extension integration.
+
+## Components
+
+- `qwen3-proxy/` — OpenAI-compatible TTS proxy (`POST /audio/speech`)
+- `Qwen3-TTS/` — Qwen3-TTS library (submodule / clone)
+- `read-aloud/` — Read-Aloud browser extension (submodule / clone)
+- `setup_qwen3_readaloud.sh` — Initial environment setup script
+
+## Architecture
+
+```
+Read-Aloud extension
+  → POST http://localhost:5000/audio/speech
+    → qwen3-proxy/app.py (Flask, OpenAI-compatible API)
+      → faster-qwen3-tts (HIP graph acceleration, AMD gfx1100)
+        → GPU: LLM token generation at ~1.78x RTF
+        → CPU: speech tokenizer decode (bypasses MIOpen)
+```
+
+## Performance (AMD Radeon RX 7900 XTX, gfx1100)
+
+| Input | Audio | Time | RTF |
+|-------|-------|------|-----|
+| 12c "Hello world." | ~2s | ~3s | ~0.9x |
+| 44c sentence | ~4s | ~3s | **1.5x** |
+| 115c paragraph | ~10s | ~7s | **1.5x** |
+
+RTF > 1.0 = generates faster than real-time.
+
+## Key optimisations
+
+1. **HIP Graphs** (`faster-qwen3-tts`) — captures autoregressive decode loop as a static GPU program, eliminating Python overhead per token
+2. **CPU speech decoder** — moves `speech_tokenizer.model` to CPU, bypassing MIOpen's slow `ConvDirectNaiveConvFwd` fallback entirely
+3. **`attn_implementation=sdpa`** — PyTorch native SDPA for transformer attention
+4. **`MIOPEN_USER_DB_PATH`** — persistent MIOpen find-DB for LLM-side convolutions
+
+## Setup
+
+```bash
+# Install Python venv + deps
+./setup_qwen3_readaloud.sh
+
+# Start the proxy service
+systemctl --user start qwen3-tts-proxy.service
+
+# Watch logs
+journalctl --user -u qwen3-tts-proxy.service -f
+```
+
+## Read-Aloud Extension Settings
+
+In Read-Aloud → Settings → OpenAI:
+
+| Field | Value |
+|-------|-------|
+| URL | `http://127.0.0.1:5000` |
+| API Key | *(leave blank)* |
+| Voice list | see below |
+
+```json
+[
+  {"voice": "alloy",   "lang": "en-US", "model": "tts-1"},
+  {"voice": "echo",    "lang": "en-US", "model": "tts-1"},
+  {"voice": "fable",   "lang": "en-US", "model": "tts-1"},
+  {"voice": "onyx",    "lang": "en-US", "model": "tts-1"},
+  {"voice": "nova",    "lang": "zh-CN", "model": "tts-1"},
+  {"voice": "shimmer", "lang": "zh-CN", "model": "tts-1"}
+]
+```
+
+## Env vars (systemd service)
+
+| Variable | Default | Notes |
+|----------|---------|-------|
+| `QWEN_MODEL` | `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` | HF model id or local path |
+| `DEVICE` | `cuda:0` if a GPU is available, else `cpu` | Torch device |
+| `HIP_GRAPHS` | `1` | Enable faster-qwen3-tts HIP graphs (ignored when no GPU is available) |
+| `AOTRITON` | `0` | AOTriton flash attention — faster for long text (>80 chars), slower for short sentences |
+| `PROXY_PORT` | `5000` | Listening port |
diff --git a/qwen3-proxy/app.py b/qwen3-proxy/app.py
new file mode 100644
index 0000000..f13af45
--- /dev/null
+++ b/qwen3-proxy/app.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""OpenAI-compatible TTS proxy backed by Qwen3-TTS.
+
+Implements the two endpoints that Read-Aloud's OpenAI engine uses:
+  GET  /models          — connection test
+  POST /audio/speech    — synthesise text → mp3
+
+Set env vars to override defaults:
+  QWEN_MODEL   — HuggingFace model id or local path
+  PROXY_PORT   — listening port (default 5000)
+  DEVICE       — torch device (default: cuda:0 if available, else cpu)
+  AOTRITON     — "1" to enable AOTriton flash attention on gfx1100.
+                 Faster for long text (>~80 chars, e.g. novel chapters).
+                 Slower for short sentences (e.g. read-aloud). Default: 0.
+  HIP_GRAPHS   — "1" to use faster-qwen3-tts (HIP/CUDA graph acceleration).
+                 Eliminates Python overhead per autoregressive token — 3-4x
+                 faster than the standard path. Requires GPU. Default: 1.
+"""
+
+import os
+
+# Must be set before the first torch SDPA call (checked lazily, not at import).
+if os.getenv("AOTRITON", "0") == "1":
+    os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"
+
+import io, time, logging, subprocess, tempfile
+import torch, soundfile as sf
+from flask import Flask, request, jsonify, abort, send_file
+from flask_cors import CORS
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+app = Flask(__name__)
+CORS(app)  # allow requests from browser extensions (chrome-extension:// etc.)
+
+# ── Configuration ──────────────────────────────────────────────────────────────
+MODEL_PATH  = os.getenv("QWEN_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+DEVICE      = os.getenv("DEVICE", "cuda:0" if torch.cuda.is_available() else "cpu")
+_ON_GPU     = torch.device(DEVICE).type == "cuda" and torch.cuda.is_available()  # honours DEVICE=cpu on a GPU host
+DTYPE       = torch.bfloat16 if _ON_GPU else torch.float32
+USE_GRAPHS  = os.getenv("HIP_GRAPHS", "1") == "1" and _ON_GPU
+
+# Map OpenAI voice names → Qwen3-TTS speaker + language + optional instruct ("" = no instruction); speech() falls back to DEFAULT_VOICE for names not listed here.
+VOICE_MAP = {
+    "alloy":   {"speaker": "Ryan",   "language": "English", "instruct": ""},
+    "echo":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak in a calm, measured tone."},
+    "fable":   {"speaker": "Ryan",   "language": "English", "instruct": "Speak warmly and expressively."},
+    "onyx":    {"speaker": "Ryan",   "language": "English", "instruct": "Speak with a deep, authoritative voice."},
+    "nova":    {"speaker": "Vivian", "language": "Chinese", "instruct": ""},
+    "shimmer": {"speaker": "Vivian", "language": "Chinese", "instruct": "Speak gently and softly."},
+}
+DEFAULT_VOICE = "alloy"
+
+# ── Load model (runs at import time; defines tts, _synthesise, _synthesise_greedy) ──
+if USE_GRAPHS:
+    from faster_qwen3_tts import FasterQwen3TTS
+    log.info("Loading FasterQwen3TTS (HIP graph mode) %s on %s …", MODEL_PATH, DEVICE)
+    tts = FasterQwen3TTS.from_pretrained(MODEL_PATH, device=DEVICE, dtype=DTYPE)
+
+    def _synthesise(text, language, speaker, instruct):
+        # Cap audio length proportional to input text length.
+        # At 12Hz token rate, ~2.5 tokens per character is a generous ceiling.
+        # This prevents stochastic generation from producing absurdly long audio
+        # (e.g. "Hello world." generating 16s of audio with default max_new_tokens=2048).
+        max_new_tokens = max(60, int(len(text) * 2.5))
+        wavs, sr = tts.generate_custom_voice(
+            text=text, language=language, speaker=speaker,
+            instruct=instruct or None,  # empty string is passed as None
+            max_new_tokens=max_new_tokens,
+        )
+        return wavs, sr
+
+    def _synthesise_greedy(text, language, speaker):
+        """Deterministic synthesis for warmup — do_sample=False, no instruct, same token budget as _synthesise."""
+        max_new_tokens = max(60, int(len(text) * 2.5))
+        wavs, sr = tts.generate_custom_voice(
+            text=text, language=language, speaker=speaker,
+            instruct=None, do_sample=False,
+            max_new_tokens=max_new_tokens,
+        )
+        return wavs, sr
+
+else:  # HIP_GRAPHS disabled or no GPU: plain qwen_tts model
+    from qwen_tts import Qwen3TTSModel
+    log.info("Loading Qwen3TTSModel (standard mode) %s on %s …", MODEL_PATH, DEVICE)
+    tts = Qwen3TTSModel.from_pretrained(
+        MODEL_PATH, device_map=DEVICE, dtype=DTYPE, attn_implementation="sdpa",
+    )
+
+    def _synthesise(text, language, speaker, instruct):  # no max_new_tokens cap is passed on this path
+        wavs, sr = tts.generate_custom_voice(
+            text=text, language=language, speaker=speaker, instruct=instruct,
+        )
+        return wavs, sr
+
+    def _synthesise_greedy(text, language, speaker):  # NOTE: not greedy on this path — no do_sample=False is passed
+        return _synthesise(text, language, speaker, "")
+
+# ── Patch: run the speech tokenizer decoder on CPU ────────────────────────────
+# The 12Hz decoder is pure Conv1d/ConvTranspose1d.  On AMD ROCm, MIOpen's solver
+# for these ops falls back to ConvDirectNaiveConvFwd (named "naive" for a reason),
+# causing 4-40s of GPU decode time per request.  Moving to CPU sidesteps MIOpen
+# entirely.  The Ryzen's AVX2 path handles these small 1D convolutions in <100ms,
+# giving end-to-end RTF > 1.0x on typical text.
+
+def _move_decoder_to_cpu(model_obj):
+    """Move the speech tokenizer's model to CPU and set its ``device`` to CPU."""
+    try:
+        tokenizer = model_obj.model.model.speech_tokenizer   # FasterQwen3TTS path
+    except AttributeError:
+        tokenizer = model_obj.model.speech_tokenizer          # Qwen3TTSModel path
+    tokenizer.model.to("cpu")
+    tokenizer.device = torch.device("cpu")
+    log.info("Speech tokenizer decoder moved to CPU (bypasses MIOpen)")
+
+_move_decoder_to_cpu(tts)
+# Warm up once at import time so the first real request does not pay the
+# start-up cost.  In HIP-graph mode the call below uses do_sample=False; in
+# standard mode _synthesise_greedy falls back to the regular sampling path.
+log.info("Warming up — HIP graph capture …")
+_t = time.monotonic()
+
+# One synthesis call captures both HIP graphs (talker + predictor).
+# No MIOpen warmup needed — decoder runs on CPU now.
+_synthesise_greedy("Hello.", "English", "Ryan")
+log.info("Warm-up done in %.1fs — proxy ready.  mode=%s",
+         time.monotonic() - _t, "HIP-graphs" if USE_GRAPHS else "standard-sdpa")
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+def wav_to_mp3(wav_bytes: bytes) -> bytes:
+    """Transcode WAV bytes to MP3 (libmp3lame, -q:a 4) with the ffmpeg CLI; raises if ffmpeg fails."""
+    # A private temp directory gives fixed in/out file names, so no string
+    # surgery on the path is needed (a ".wav" inside a directory name can no
+    # longer be rewritten), and both files are removed even when ffmpeg fails.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_in_path = os.path.join(tmp_dir, "in.wav")
+        tmp_out_path = os.path.join(tmp_dir, "out.mp3")
+        with open(tmp_in_path, "wb") as tmp_in:
+            tmp_in.write(wav_bytes)
+        subprocess.run(
+            ["ffmpeg", "-y", "-i", tmp_in_path, "-codec:a", "libmp3lame", "-q:a", "4", tmp_out_path],
+            check=True, capture_output=True,
+        )
+        with open(tmp_out_path, "rb") as f:
+            return f.read()
+
+
+# ── Endpoints ──────────────────────────────────────────────────────────────────
+@app.route("/models", methods=["GET"])
+def models():  # connection test used by Read-Aloud's OpenAI engine
+    return jsonify(object="list", data=[{"id": "tts-1", "object": "model"}])
+
+
+@app.route("/audio/speech", methods=["POST"])
+def speech():
+    """Synthesise the JSON body's 'input'; reply with MP3 when response_format is 'mp3' (default), else WAV."""
+    data  = request.get_json(force=True, silent=True)
+    data  = data if isinstance(data, dict) else {}   # unparseable or non-object JSON → empty
+    text  = data.get("input")
+    voice = data.get("voice", DEFAULT_VOICE)
+    fmt   = data.get("response_format", "mp3")
+    # Non-string input (null, number, list) is answered with 400 instead of crashing on .strip().
+    text  = text.strip() if isinstance(text, str) else ""
+    if not text:
+        abort(400, description="'input' field is required")
+
+    # Unknown or non-string (possibly unhashable) voices fall back to the default voice.
+    info = VOICE_MAP.get(voice, VOICE_MAP[DEFAULT_VOICE]) if isinstance(voice, str) else VOICE_MAP[DEFAULT_VOICE]
+    log.info("Synthesising %d chars | voice=%s speaker=%s", len(text), voice, info["speaker"])
+
+    try:
+        t0 = time.monotonic()
+        wavs, sr = _synthesise(text, info["language"], info["speaker"], info["instruct"])
+        elapsed = time.monotonic() - t0
+        audio_s = len(wavs[0]) / sr
+        log.info("Synthesis done in %.1fs  audio=%.1fs  RTF=%.2fx",
+                 elapsed, audio_s, audio_s / max(elapsed, 1e-9))
+    except Exception as exc:
+        log.exception("TTS generation failed")
+        abort(500, description=str(exc))
+
+    wav_buf = io.BytesIO()
+    sf.write(wav_buf, wavs[0], sr, format="WAV")
+    wav_bytes = wav_buf.getvalue()
+    if fmt == "mp3":
+        audio_bytes, mimetype = wav_to_mp3(wav_bytes), "audio/mpeg"
+    else:
+        audio_bytes, mimetype = wav_bytes, "audio/wav"
+    return send_file(io.BytesIO(audio_bytes), mimetype=mimetype)
+
+
+# ── Error handlers ─────────────────────────────────────────────────────────────
+@app.errorhandler(400)
+@app.errorhandler(404)
+@app.errorhandler(500)
+@app.errorhandler(502)
+def json_error(e):  # render these HTTP errors as a JSON error envelope
+    return jsonify(error={"message": str(e), "type": "proxy_error"}), e.code
+
+
+if __name__ == "__main__":
+    port = int(os.getenv("PROXY_PORT", "5000"))
+    log.info("Starting proxy on port %d", port)
+    app.run(host=os.getenv("PROXY_HOST", "0.0.0.0"), port=port, debug=False)  # PROXY_HOST=127.0.0.1 → local clients only
diff --git a/qwen3-proxy/requirements.txt b/qwen3-proxy/requirements.txt
new file mode 100644
index 0000000..30692b7
--- /dev/null
+++ b/qwen3-proxy/requirements.txt
@@ -0,0 +1,4 @@
+flask
+flask-cors
+requests
+soundfile
diff --git a/setup_qwen3_readaloud.sh b/setup_qwen3_readaloud.sh
new file mode 100755
index 0000000..41a2e94
--- /dev/null
+++ b/setup_qwen3_readaloud.sh
@@ -0,0 +1,288 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# -----------------------------------------------------------------
+# Configuration – edit only if you need to change defaults
+# -----------------------------------------------------------------
+HOME_DIR="${HOME:-/home/oc}"
+# Preferred Python version for the virtual‑env (must be on the system)
+PYTHON_VERSION="3.12"
+# Fallback Python version if preferred version is not available
+FALLBACK_PYTHON_VERSION="3.10"
+# Name of the virtual‑env directory (will be created under $HOME)
+VENV_DIR="${HOME_DIR}/qwen3tts-venv"
+# Model to serve – the 0.6B CustomVoice model is quick to download
+QWEN_MODEL="Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice"
+DEMO_PORT=8000   # Gradio demo port
+PROXY_PORT=5000  # Flask proxy port
+PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # directory of this script (holds Qwen3-TTS and read-aloud), independent of the caller's cwd
+PROXY_DIR="${PROJECT_ROOT}/qwen3-proxy"
+SYSTEMD_USER_DIR="${HOME_DIR}/.config/systemd/user"
+
+# -----------------------------------------------------------------
+# Helper functions for pretty output
+# -----------------------------------------------------------------
+info()    { printf '\e[32m[INFO]\e[0m %b\n' "$*"; }
+error()   { printf '\e[31m[ERROR]\e[0m %b\n' "$*" >&2; }
+warning() { printf '\e[33m[WARNING]\e[0m %b\n' "$*" >&2; }
+
+# -----------------------------------------------------------------
+# 0️⃣ Helper: ensure we have a recent Python interpreter
+# ---------------------------------------------------------
+detect_python() {
+    # Echo the first interpreter found: preferred version, fallback version,
+    # then plain python3.  Only the name goes to stdout (the caller captures
+    # it); warnings and errors are written to stderr by their helpers.
+    if command -v "python${PYTHON_VERSION}" >/dev/null 2>&1; then
+        echo "python${PYTHON_VERSION}"; return 0
+    fi
+    if command -v "python${FALLBACK_PYTHON_VERSION}" >/dev/null 2>&1; then
+        warning "Python ${PYTHON_VERSION} not found, using ${FALLBACK_PYTHON_VERSION} as fallback"
+        echo "python${FALLBACK_PYTHON_VERSION}"; return 0
+    fi
+    if command -v python3 >/dev/null 2>&1; then
+        warning "No specific Python version found, using python3 (may not be compatible)"
+        echo "python3"; return 0
+    fi
+    error "No Python interpreter found. Please install Python 3.10 or higher."
+    exit 1
+}
+
+PYTHON_BIN=$(detect_python)
+
+# -----------------------------------------------------------------
+# 1️⃣ Create (or reuse) a virtual‑env and install the Python deps
+# -----------------------------------------------------------------
+if [[ -d "${VENV_DIR}" ]]; then
+    info "Virtual‑env already exists – reusing."
+else
+    info "Creating virtual‑env at ${VENV_DIR}…"
+    "${PYTHON_BIN}" -m venv "${VENV_DIR}" || {
+        error "Failed to create virtual environment. Check Python installation and permissions."
+        exit 1
+    }
+fi
+
+# Activate the venv for the rest of the script
+source "${VENV_DIR}/bin/activate"
+
+# Upgrade pip, setuptools and wheel first (helps with binary wheels)
+info "Upgrading pip…"
+pip install -U pip setuptools wheel || {
+    error "Failed to upgrade pip"
+    exit 1
+}
+
+# Upgrade qwen-tts when it is already present, otherwise install it fresh
+if pip show qwen-tts >/dev/null 2>&1; then
+    info "qwen-tts already installed, upgrading"
+    pip install -U qwen-tts
+else
+    info "Installing qwen-tts (Python wrapper)…"
+    pip install qwen-tts || {
+        error "Failed to install qwen-tts"
+        exit 1
+    }
+fi
+
+# -----------------------------------------------------------------
+# 2️⃣ Prepare the Flask proxy source tree
+# ---------------------------------------------------------
+mkdir -p "${PROXY_DIR}"
+# Create requirements.txt for the proxy, but never clobber the one shipped in the repo
+[[ -f "${PROXY_DIR}/requirements.txt" ]] || cat > "${PROXY_DIR}/requirements.txt" <<'EOF'
+flask
+requests
+EOF
+
+# Write the legacy Gradio-forwarding app.py only as a fallback; an existing app.py (the OpenAI-compatible proxy) is kept
+[[ -f "${PROXY_DIR}/app.py" ]] || cat > "${PROXY_DIR}/app.py" <<'PY'
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Flask proxy for the Read‑Aloud extension.
+It implements the two endpoints that Read‑Aloud expects and forwards the
+actual synthesis request to a locally‑running Qwen3‑TTS Gradio demo.
+"""
+import os, io, base64
+from flask import Flask, request, jsonify, abort, send_file
+import requests
+
+app = Flask(__name__)
+
+# --------------------------------------------------------------
+# Configuration via environment variables (defaults shown)
+# --------------------------------------------------------------
+GRADIO_URL = os.getenv("GRADIO_URL", "http://127.0.0.1:8000")
+
+# Map the voice name shown in the extension to the internal speaker token
+# that the Gradio demo expects.  Extend this dict if you want more voices.
+SPEAKERS = {
+    "Vivian": {"voice_name": "Qwen3 Vivian", "lang": "zh-CN"},
+    "Ryan":   {"voice_name": "Qwen3 Ryan",   "lang": "en-US"},
+    # Add other speakers from the Qwen3‑TTS README if desired
+}
+
+@app.route("/read-aloud/list-voices/premium")
+def list_voices():
+    # Return a JSON array: [{"voice_name":…, "lang":…}, …]
+    return jsonify(list(SPEAKERS.values()))
+
+@app.route("/read-aloud/speak/<lang>/<voice_name>")
+def speak(lang, voice_name):
+    text = request.args.get("q", "")
+    if not text:
+        abort(400, "missing query parameter 'q'")
+
+    # Find the internal speaker token (case‑insensitive match)
+    internal = None
+    for key, val in SPEAKERS.items():
+        if val["voice_name"].lower() == voice_name.lower():
+            internal = key
+            break
+    if internal is None:
+        abort(404, f"voice '{voice_name}' not known to proxy")
+
+    # Build the payload for the Gradio API – the demo expects:
+    #   [text, language, speaker, instruct]
+    payload = {"data": [text, lang, internal, ""]}
+    try:
+        r = requests.post(f"{GRADIO_URL}/api/predict", json=payload, timeout=120)
+    except Exception as exc:
+        abort(502, f"cannot reach Gradio server: {exc}")
+    if r.status_code != 200:
+        abort(r.status_code, f"Gradio error: {r.text}")
+
+    try:
+        # Gradio returns something like [{"name": "audio.wav", "data": "data:audio/wav;base64,…"}]
+        data = r.json()["data"][0]["data"]
+    except Exception:
+        abort(500, "unexpected Gradio response format")
+
+    # Strip possible data‑URL prefix
+    if data.startswith("data:"):
+        b64 = data.split(",", 1)[1]
+    else:
+        b64 = data
+    wav_bytes = base64.b64decode(b64)
+    return send_file(io.BytesIO(wav_bytes), mimetype="audio/wav", as_attachment=False, download_name="speech.wav")
+
+if __name__ == "__main__":
+    # Flask's built‑in dev server is fine for a local user service
+    app.run(host="0.0.0.0", port=int(os.getenv("PROXY_PORT", "5000")), debug=False)
+PY
+
+# Install the proxy deps inside the venv (flask-cors and soundfile are imported by the shipped app.py)
+info "Installing Flask proxy dependencies…"
+if ! pip install -r "${PROXY_DIR}/requirements.txt" flask-cors soundfile; then
+    error "Failed to install Flask proxy dependencies"
+    exit 1
+fi
+
+# -----------------------------------------------------------------
+# 3️⃣ Write the systemd user unit files (they will activate the venv)
+# ---------------------------------------------------------
+mkdir -p "${SYSTEMD_USER_DIR}"
+
+# ---- qwen3-tts-demo.service (unquoted heredoc: ${…} is expanded now, when the unit is written; \\ emits a literal backslash) ----
+cat > "${SYSTEMD_USER_DIR}/qwen3-tts-demo.service" <<EOF
+[Unit]
+Description=Qwen3‑TTS Gradio demo (CustomVoice model)
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+# Activate the virtual‑env created by the install script
+Environment=VENV_DIR=${HOME_DIR}/qwen3tts-venv
+ExecStart=/bin/bash -c '\\
+  source "${VENV_DIR}/bin/activate" && \\
+  qwen-tts-demo "${QWEN_MODEL}" \\
+    --ip 0.0.0.0 \\
+    --port ${DEMO_PORT} \\
+    --no-ssl-verify \\
+    --share false'
+
+ExecStop=/usr/bin/pkill -f "qwen-tts-demo"
+Restart=on-failure
+RestartSec=5
+StartLimitBurst=5
+StartLimitIntervalSec=60
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
+EOF
+
+# ---- qwen3-tts-proxy.service (ExecStartPre belongs in [Service]; \$i stays literal for the loop that runs at service start) ----
+cat > "${SYSTEMD_USER_DIR}/qwen3-tts-proxy.service" <<EOF
+[Unit]
+Description=Flask proxy translating Read‑Aloud API → Qwen3‑TTS Gradio demo
+After=qwen3-tts-demo.service
+Requires=qwen3-tts-demo.service
+
+[Service]
+# Wait up to ~1 minute for the Gradio demo to become reachable before starting.
+ExecStartPre=/usr/bin/bash -c '\\
+  for i in {1..30}; do \\
+    if curl -s http://127.0.0.1:${DEMO_PORT}/ >/dev/null 2>&1; then exit 0; fi; \\
+    echo "Waiting for Qwen3‑TTS demo … (\$i)"; sleep 2; \\
+  done; \\
+  echo "Qwen3‑TTS demo never became reachable – aborting proxy start." >&2; exit 1'
+
+Environment=VENV_DIR=${HOME_DIR}/qwen3tts-venv
+Environment=PROXY_DIR=${PROJECT_ROOT}/qwen3-proxy
+Environment=PROXY_PORT=${PROXY_PORT}
+Environment=GRADIO_URL=http://127.0.0.1:${DEMO_PORT}
+
+ExecStart=/bin/bash -c '\\
+  source "${VENV_DIR}/bin/activate" && \\
+  cd "${PROXY_DIR}" && \\
+  python app.py --host 0.0.0.0 --port "${PROXY_PORT}"'
+
+ExecStop=/usr/bin/pkill -f "python.*app.py"
+Restart=on-failure
+RestartSec=5
+StartLimitBurst=5
+StartLimitIntervalSec=60
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
+EOF
+
+# -----------------------------------------------------------------
+# 4️⃣ Reload systemd, enable and start the services
+# ---------------------------------------------------------
+if command -v systemctl >/dev/null 2>&1; then
+    info "Reloading user systemd daemon…"
+    systemctl --user daemon-reload || warning "Failed to reload systemd daemon"
+
+    # A unit that fails to enable/start aborts the script; a failed
+    # daemon-reload above is only reported as a warning.
+    info "Enabling & starting the Qwen3‑TTS demo service…"
+    systemctl --user enable --now qwen3-tts-demo.service || {
+        error "Failed to enable/start Qwen3-TTS demo service"; exit 1
+    }
+
+    info "Enabling & starting the Flask proxy service…"
+    systemctl --user enable --now qwen3-tts-proxy.service || {
+        error "Failed to enable/start Flask proxy service"; exit 1
+    }
+else
+    warning "systemctl not found. Services not enabled/started automatically. Please enable manually:"
+    warning "  systemctl --user enable --now qwen3-tts-demo.service"
+    warning "  systemctl --user enable --now qwen3-tts-proxy.service"
+fi
+
+# -----------------------------------------------------------------
+# 5️⃣ Final status report & next steps for the extension
+# ---------------------------------------------------------
+info "Both services should now be active. Verify with:"
+for unit in qwen3-tts-demo.service qwen3-tts-proxy.service; do
+    info "  systemctl --user status ${unit}"
+done
+info "When configuring the Read‑Aloud extension, set the service URL to:"
+info "  http://127.0.0.1:${PROXY_PORT}"
+
+info "Setup finished. Enjoy Qwen3‑TTS in Read‑Aloud!"
\ No newline at end of file