- OpenAI-compatible Flask proxy (POST /audio/speech, GET /models)
- faster-qwen3-tts HIP graph acceleration: GPU LLM at 1.78x RTF
- CPU speech tokenizer decoder: bypasses MIOpen ConvDirectNaiveConvFwd, eliminates 4-40s per-request decode overhead
- attn_implementation=sdpa for transformer attention
- AOTRITON env var toggle (off = short sentences, on = long-form/novel chapters)
- HIP_GRAPHS env var toggle (default on)
- Startup warmup with HIP graph capture (~5s)
- CORS support for browser extension requests
- RTF: 0.9-1.5x on AMD RX 7900 XTX (gfx1100, ROCm 6.3)

Performance vs baseline (CPU-only, ~3 min/sentence):
12 chars: 3.2s | 44 chars: 2.7s | 115 chars: 6.6s
288 lines
10 KiB
Bash
Executable File
288 lines
10 KiB
Bash
Executable File
#!/usr/bin/env bash
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# -----------------------------------------------------------------
# Configuration – edit only if you need to change defaults
# -----------------------------------------------------------------
# Base directory for the virtual-env and the systemd user units; falls back
# to /home/oc when $HOME is unset or empty.
HOME_DIR="${HOME:-/home/oc}"

# Preferred Python version for the virtual-env (must be on the system)
PYTHON_VERSION="3.12"

# Fallback Python version if the preferred version is not available
FALLBACK_PYTHON_VERSION="3.10"

# Location of the virtual-env directory (created under $HOME_DIR)
VENV_DIR="${HOME_DIR}/qwen3tts-venv"

# Model to serve – the 0.6B CustomVoice model is quick to download
QWEN_MODEL="Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice"

DEMO_PORT=8000   # Gradio demo port
PROXY_PORT=5000  # Flask proxy port

# Must be the directory that contains Qwen3-TTS and read-aloud, i.e. run the
# script from there.
PROJECT_ROOT="$(pwd)"

# Where the generated Flask proxy sources are written
PROXY_DIR="${PROJECT_ROOT}/qwen3-proxy"

# Where the generated systemd user units are written
SYSTEMD_USER_DIR="${HOME_DIR}/.config/systemd/user"

# -----------------------------------------------------------------
# Helper functions for pretty output
# -----------------------------------------------------------------
# Each helper prints all of its arguments, joined by spaces, after a coloured
# tag. printf with a fixed format string is used (instead of `echo -e`) so
# that backslashes inside the message are printed literally rather than being
# interpreted as escape sequences.

# info MESSAGE... – green [INFO] line on stdout.
info() { printf '\033[32m[INFO]\033[0m %s\n' "$*"; }

# error MESSAGE... – red [ERROR] line on stderr.
error() { printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2; }

# warning MESSAGE... – yellow [WARNING] line on stderr.
warning() { printf '\033[33m[WARNING]\033[0m %s\n' "$*" >&2; }

# -----------------------------------------------------------------
# 0️⃣ Helper: ensure we have a recent Python interpreter
# -----------------------------------------------------------------
# detect_python
#   Prints the name of the interpreter to use on stdout, trying in order:
#   python${PYTHON_VERSION}, python${FALLBACK_PYTHON_VERSION}, python3.
#   Diagnostics go to stderr (via warning/error) so that they do not end up
#   in the captured interpreter name.
#   Returns non-zero when no interpreter is found; the caller decides whether
#   to abort, which keeps the function safe to call outside of $(...).
detect_python() {
    if command -v "python${PYTHON_VERSION}" >/dev/null 2>&1; then
        # Preferred version is available.
        echo "python${PYTHON_VERSION}"
    elif command -v "python${FALLBACK_PYTHON_VERSION}" >/dev/null 2>&1; then
        warning "Python ${PYTHON_VERSION} not found, using ${FALLBACK_PYTHON_VERSION} as fallback"
        echo "python${FALLBACK_PYTHON_VERSION}"
    elif command -v python3 >/dev/null 2>&1; then
        warning "No specific Python version found, using python3 (may not be compatible)"
        echo "python3"
    else
        error "No Python interpreter found. Please install Python 3.10 or higher."
        return 1
    fi
}

# Abort explicitly instead of relying on `set -e` to notice the failed
# command substitution.
PYTHON_BIN=$(detect_python) || exit 1

# -----------------------------------------------------------------
# 1️⃣ Create (or reuse) a virtual-env and install the Python deps
# -----------------------------------------------------------------
# Test for the activate script rather than just the directory: a leftover or
# half-created directory without it would otherwise be "reused" and the
# `source` below would fail.
if [[ -f "${VENV_DIR}/bin/activate" ]]; then
    info "Virtual‑env already exists – reusing."
else
    info "Creating virtual‑env at ${VENV_DIR}…"
    if ! "${PYTHON_BIN}" -m venv "${VENV_DIR}"; then
        error "Failed to create virtual environment. Check Python installation and permissions."
        exit 1
    fi
fi

# Activate the env for the remainder of the script
# shellcheck disable=SC1091
source "${VENV_DIR}/bin/activate"

# Upgrade pip (helps with binary wheels)
info "Upgrading pip…"
if ! pip install -U pip setuptools wheel; then
    error "Failed to upgrade pip"
    exit 1
fi

# Install qwen-tts, or upgrade it when it is already present in the venv.
if pip show qwen-tts >/dev/null 2>&1; then
    info "qwen-tts already installed, upgrading"
    if ! pip install -U qwen-tts; then
        error "Failed to upgrade qwen-tts"
        exit 1
    fi
else
    info "Installing qwen-tts (Python wrapper)…"
    if ! pip install qwen-tts; then
        error "Failed to install qwen-tts"
        exit 1
    fi
fi

# -----------------------------------------------------------------
# 2️⃣ Prepare the Flask proxy source tree
# -----------------------------------------------------------------
mkdir -p "${PROXY_DIR}"

# Create requirements.txt for the proxy. The delimiter is quoted so the body
# is written verbatim; it must contain nothing but one requirement per line.
cat > "${PROXY_DIR}/requirements.txt" <<'EOF'
flask
requests
EOF

# Create app.py for the proxy. The delimiter is quoted so that the Python
# source below is written verbatim (no shell expansion inside it).
cat > "${PROXY_DIR}/app.py" <<'PY'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Flask proxy for the Read-Aloud extension.

It implements the two endpoints that Read-Aloud expects and forwards the
actual synthesis request to a locally-running Qwen3-TTS Gradio demo.
"""
import base64
import io
import os

import requests
from flask import Flask, abort, jsonify, request, send_file

app = Flask(__name__)

# --------------------------------------------------------------
# Configuration via environment variables (defaults shown)
# --------------------------------------------------------------
GRADIO_URL = os.getenv("GRADIO_URL", "http://127.0.0.1:8000")

# Map the internal speaker token that the Gradio demo expects to the voice
# entry shown in the extension. Extend this dict if you want more voices.
SPEAKERS = {
    "Vivian": {"voice_name": "Qwen3 Vivian", "lang": "zh-CN"},
    "Ryan": {"voice_name": "Qwen3 Ryan", "lang": "en-US"},
    # Add other speakers from the Qwen3-TTS README if desired
}


def _find_speaker(voice_name):
    """Return the SPEAKERS key whose voice_name matches, ignoring case.

    Returns None when no configured voice matches.
    """
    wanted = voice_name.lower()
    for token, entry in SPEAKERS.items():
        if entry["voice_name"].lower() == wanted:
            return token
    return None


@app.route("/read-aloud/list-voices/premium")
def list_voices():
    """Return the configured voices as a JSON array.

    Shape: [{"voice_name": ..., "lang": ...}, ...]
    """
    return jsonify(list(SPEAKERS.values()))


@app.route("/read-aloud/speak/<lang>/<voice_name>")
def speak(lang, voice_name):
    """Synthesize the text given in query parameter ``q`` and return WAV audio.

    Responds with:
      400 - ``q`` is missing or empty
      404 - ``voice_name`` is not one of the configured voices
      502 - the Gradio server is unreachable or answered with a non-200 status
      500 - the Gradio response does not have the expected structure
    """
    text = request.args.get("q", "")
    if not text:
        abort(400, "missing query parameter 'q'")

    internal = _find_speaker(voice_name)
    if internal is None:
        abort(404, f"voice '{voice_name}' not known to proxy")

    # Build the payload for the Gradio API - the demo expects:
    #   [text, language, speaker, instruct]
    payload = {"data": [text, lang, internal, ""]}
    try:
        r = requests.post(f"{GRADIO_URL}/api/predict", json=payload, timeout=120)
    except requests.RequestException as exc:
        abort(502, f"cannot reach Gradio server: {exc}")
    if r.status_code != 200:
        # Do not forward the upstream status to abort(): it raises LookupError
        # for any code werkzeug has no exception class for. An upstream
        # failure is reported as 502 Bad Gateway instead.
        abort(502, f"Gradio error: {r.text}")

    try:
        # Gradio returns something like
        #   [{"name": "audio.wav", "data": "data:audio/wav;base64,..."}]
        data = r.json()["data"][0]["data"]
        # Strip a possible data-URL prefix before decoding.
        b64 = data.split(",", 1)[1] if data.startswith("data:") else data
        wav_bytes = base64.b64decode(b64)
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # ValueError also covers JSON decoding errors and malformed base64.
        abort(500, "unexpected Gradio response format")

    return send_file(
        io.BytesIO(wav_bytes),
        mimetype="audio/wav",
        as_attachment=False,
        download_name="speech.wav",
    )


if __name__ == "__main__":
    # Flask's built-in dev server is fine for a local user service.
    # The listening port comes from the PROXY_PORT environment variable.
    app.run(host="0.0.0.0", port=int(os.getenv("PROXY_PORT", "5000")), debug=False)
PY

# Install the proxy deps inside the venv (pip resolves to the venv's pip
# because the venv was activated earlier in this script).
info "Installing Flask proxy dependencies…"
if ! pip install -r "${PROXY_DIR}/requirements.txt"; then
    error "Failed to install Flask proxy dependencies"
    exit 1
fi

# -----------------------------------------------------------------
# 3️⃣ Write the systemd user unit files (they will activate the venv)
# -----------------------------------------------------------------
mkdir -p "${SYSTEMD_USER_DIR}"

# ---- qwen3-tts-demo.service ---------------------------------------
# The here-document delimiter is intentionally NOT quoted: VENV_DIR,
# QWEN_MODEL and DEMO_PORT only exist in this installer, so they must be
# expanded now. (With a quoted delimiter the unit would contain the literal
# text "${QWEN_MODEL}", which systemd replaces with an empty string.)
# Consequences inside the body:
#   * "\\" at the end of a line produces the single "\" that systemd treats
#     as a line continuation.
# Other fixes compared with a naive unit:
#   * StartLimitBurst/StartLimitIntervalSec live in [Unit], where systemd
#     documents them.
#   * `exec` makes the demo replace the wrapper shell, so systemd tracks and
#     signals the real process; no `pkill -f` ExecStop is needed (and pattern
#     matching could hit unrelated processes).
# NOTE(review): the demo flags are passed through unchanged – confirm
# `--no-ssl-verify` and `--share false` against `qwen-tts-demo --help`.
cat > "${SYSTEMD_USER_DIR}/qwen3-tts-demo.service" <<EOF
[Unit]
Description=Qwen3‑TTS Gradio demo (CustomVoice model)
After=network-online.target
Wants=network-online.target
StartLimitBurst=5
StartLimitIntervalSec=60

[Service]
# Activate the virtual-env created by the install script, then run the demo
ExecStart=/bin/bash -c '\\
    source "${VENV_DIR}/bin/activate" && \\
    exec qwen-tts-demo "${QWEN_MODEL}" \\
        --ip 0.0.0.0 \\
        --port ${DEMO_PORT} \\
        --no-ssl-verify \\
        --share false'
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=default.target
EOF

# ---- qwen3-tts-proxy.service ---------------------------------------
# The here-document delimiter is intentionally NOT quoted so that the
# installer's VENV_DIR, PROXY_DIR, PROXY_PORT and DEMO_PORT are expanded now.
# Consequences inside the body:
#   * "\\" at the end of a line produces the single "\" that systemd treats
#     as a line continuation.
#   * "\$\$i" produces "$$i" in the unit; systemd turns "$$" into a literal
#     "$", so the wrapper shell finally sees its own loop variable "$i".
# Other fixes compared with a naive unit:
#   * ExecStartPre is placed in [Service]; systemd ignores it in [Unit].
#   * StartLimitBurst/StartLimitIntervalSec live in [Unit].
#   * app.py takes its port from the PROXY_PORT environment variable and does
#     not parse command-line options, so none are passed.
#   * `exec` lets systemd track the Python process directly; no `pkill -f`
#     ExecStop is needed (its pattern could match unrelated processes).
cat > "${SYSTEMD_USER_DIR}/qwen3-tts-proxy.service" <<EOF
[Unit]
Description=Flask proxy translating Read‑Aloud API → Qwen3‑TTS Gradio demo
After=qwen3-tts-demo.service
Requires=qwen3-tts-demo.service
StartLimitBurst=5
StartLimitIntervalSec=60

[Service]
Environment=PROXY_PORT=${PROXY_PORT}
Environment=GRADIO_URL=http://127.0.0.1:${DEMO_PORT}

# Wait up to ~1 minute for the Gradio demo to become reachable before starting.
ExecStartPre=/bin/bash -c '\\
    for i in {1..30}; do \\
        if curl -s http://127.0.0.1:${DEMO_PORT}/ >/dev/null 2>&1; then exit 0; fi; \\
        echo "Waiting for Qwen3‑TTS demo … (\$\$i)"; sleep 2; \\
    done; \\
    echo "Qwen3‑TTS demo never became reachable – aborting proxy start." >&2; exit 1'

ExecStart=/bin/bash -c '\\
    source "${VENV_DIR}/bin/activate" && \\
    cd "${PROXY_DIR}" && \\
    exec python app.py'
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=default.target
EOF

# -----------------------------------------------------------------
# 4️⃣ Reload systemd, enable and start the services
# -----------------------------------------------------------------
if command -v systemctl >/dev/null 2>&1; then
    info "Reloading user systemd daemon…"
    # A failed reload is reported but not fatal; enabling below may still work.
    systemctl --user daemon-reload || warning "Failed to reload systemd daemon"

    info "Enabling & starting the Qwen3‑TTS demo service…"
    if ! systemctl --user enable --now qwen3-tts-demo.service; then
        error "Failed to enable/start Qwen3-TTS demo service"
        exit 1
    fi

    info "Enabling & starting the Flask proxy service…"
    if ! systemctl --user enable --now qwen3-tts-proxy.service; then
        error "Failed to enable/start Flask proxy service"
        exit 1
    fi
else
    # No systemd available: the unit files were still written, so tell the
    # user how to activate them later.
    warning "systemctl not found. Services not enabled/started automatically. Please enable manually:"
    warning " systemctl --user enable --now qwen3-tts-demo.service"
    warning " systemctl --user enable --now qwen3-tts-proxy.service"
fi

# -----------------------------------------------------------------
# 5️⃣ Final status report & next steps for the extension
# -----------------------------------------------------------------
info "Both services should now be active. Verify with:"
info " systemctl --user status qwen3-tts-demo.service"
info " systemctl --user status qwen3-tts-proxy.service"

# The proxy port is the one the extension has to talk to.
info "When configuring the Read‑Aloud extension, set the service URL to:"
info " http://127.0.0.1:${PROXY_PORT}"

info "Setup finished. Enjoy Qwen3‑TTS in Read‑Aloud!"