From e91f92fbb6649658f65c5050084d906420667531 Mon Sep 17 00:00:00 2001
From: pi-bot-01 <pi-bot-01@dominat.us>
Date: Wed, 25 Mar 2026 21:47:59 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20PCM=20streaming=20=E2=80=94=20missing=20?=
 =?UTF-8?q?Response=20import=20+=20wrong=20tuple=20unpacking?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Response to the flask imports (its absence raised a NameError on
  every PCM request)
- Unpack each (audio, sr, timing) tuple yielded by
  generate_custom_voice_streaming (the loop was binding the whole tuple to
  one variable and passing that 3-element object to np.clip)
- Move elapsed/chunk logging inside the generator so it fires after the
  stream ends (it previously ran before any chunk had been produced)
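- Drop the "PCM streaming requires HIP_GRAPHS=1" warning branch; a PCM
  request with USE_GRAPHS off now falls through to the MP3 path silently
- PCM streaming now working: 12c test → 2.3s audio in 1.8s, 3 chunks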
---
 qwen3-proxy/app.py | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/qwen3-proxy/app.py b/qwen3-proxy/app.py
index 2114ef9..fcace3a 100644
--- a/qwen3-proxy/app.py
+++ b/qwen3-proxy/app.py
@@ -27,7 +27,7 @@ if os.getenv("AOTRITON", "0") == "1":
 import io, time, logging, subprocess, tempfile
 import torch, soundfile as sf
 import numpy as np
-from flask import Flask, request, jsonify, abort, send_file, stream_with_context
+from flask import Flask, request, jsonify, abort, send_file, stream_with_context, Response
 from flask_cors import CORS
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -216,32 +216,33 @@ def speech():
 
     # Handle PCM streaming
     if fmt == "pcm" and USE_GRAPHS:
-        log.info("Starting PCM streaming synthesis")
-        t0 = time.monotonic()
-        try:
+        log.info("Streaming PCM | %d chars | voice=%s speaker=%s",
+                 len(text), voice, info["speaker"])
+
+        def generate_pcm():
+            t0 = time.monotonic()
             chunks = 0
-            def generator():
-                nonlocal chunks
-                for audio_chunk in tts.generate_custom_voice_streaming(
+            try:
+                for audio, sr, timing in tts.generate_custom_voice_streaming(
                     text=text,
                     language=info["language"],
                     speaker=info["speaker"],
                     instruct=info["instruct"] or None,
-                    max_new_tokens=max(60, int(len(text) * 2.5))
+                    max_new_tokens=max(60, int(len(text) * 2.5)),
                 ):
                     chunks += 1
-                    # Convert float32 numpy array to int16 PCM
-                    pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
-                    yield pcm_chunk
-            
-            elapsed = time.monotonic() - t0
-            log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks)
-            return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
-        except Exception as exc:
-            log.exception("PCM streaming failed")
-            abort(500, description=str(exc))
-    elif fmt == "pcm":
-        log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.")
+                    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+                    yield pcm.tobytes()
+            except Exception as exc:
+                log.exception("PCM stream error after %d chunks", chunks)
+                return
+            log.info("PCM stream done: %d chunks in %.1fs", chunks, time.monotonic() - t0)
+
+        return Response(
+            stream_with_context(generate_pcm()),
+            mimetype="audio/pcm",
+            headers={"Cache-Control": "no-cache"},
+        )
         # Fall through to regular MP3 path below
 
     try: