From e91f92fbb6649658f65c5050084d906420667531 Mon Sep 17 00:00:00 2001 From: pi-bot-01 Date: Wed, 25 Mar 2026 21:47:59 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20PCM=20streaming=20=E2=80=94=20missing=20?= =?UTF-8?q?Response=20import=20+=20wrong=20tuple=20unpacking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Response to flask imports (caused NameError on every PCM request) - Unpack (audio, sr, timing) tuple correctly from generate_custom_voice_streaming (was iterating the tuple itself, passing a 3-element object to np.clip) - Move elapsed/chunk logging inside the generator so it fires after stream ends - PCM streaming now working: 12c test → 2.3s audio in 1.8s, 3 chunks --- qwen3-proxy/app.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/qwen3-proxy/app.py b/qwen3-proxy/app.py index 2114ef9..fcace3a 100644 --- a/qwen3-proxy/app.py +++ b/qwen3-proxy/app.py @@ -27,7 +27,7 @@ if os.getenv("AOTRITON", "0") == "1": import io, time, logging, subprocess, tempfile import torch, soundfile as sf import numpy as np -from flask import Flask, request, jsonify, abort, send_file, stream_with_context +from flask import Flask, request, jsonify, abort, send_file, stream_with_context, Response from flask_cors import CORS logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") @@ -216,32 +216,33 @@ def speech(): # Handle PCM streaming if fmt == "pcm" and USE_GRAPHS: - log.info("Starting PCM streaming synthesis") - t0 = time.monotonic() - try: + log.info("Streaming PCM | %d chars | voice=%s speaker=%s", + len(text), voice, info["speaker"]) + + def generate_pcm(): + t0 = time.monotonic() chunks = 0 - def generator(): - nonlocal chunks - for audio_chunk in tts.generate_custom_voice_streaming( + try: + for audio, sr, timing in tts.generate_custom_voice_streaming( text=text, language=info["language"], speaker=info["speaker"], instruct=info["instruct"] or None, - max_new_tokens=max(60, int(len(text) * 2.5)) + max_new_tokens=max(60, int(len(text) * 2.5)), ): chunks += 1 - # Convert float32 numpy array to int16 PCM - pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes() - yield pcm_chunk - - elapsed = time.monotonic() - t0 - log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks) - return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"}) - except Exception as exc: - log.exception("PCM streaming failed") - abort(500, description=str(exc)) - elif fmt == "pcm": - log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.") + pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16) + yield pcm.tobytes() + except Exception as exc: + log.exception("PCM stream error after %d chunks", chunks) + return + log.info("PCM stream done: %d chunks in %.1fs", chunks, time.monotonic() - t0) + + return Response( + stream_with_context(generate_pcm()), + mimetype="audio/pcm", + headers={"Cache-Control": "no-cache"}, + ) # Fall through to regular MP3 path below try: