fix: PCM streaming — missing Response import + wrong tuple unpacking

- Add Response to flask imports (caused NameError on every PCM request)
- Unpack (audio, sr, timing) tuple correctly from generate_custom_voice_streaming
  (was binding each yielded 3-tuple to a single loop variable and passing that whole tuple to np.clip)
- Move elapsed/chunk logging inside the generator so it fires after stream ends
- PCM streaming now working: 12-character test → 2.3s audio in 1.8s, 3 chunks
This commit is contained in:
2026-03-25 21:47:59 -07:00
parent fef6a1b74c
commit e91f92fbb6

View File

@@ -27,7 +27,7 @@ if os.getenv("AOTRITON", "0") == "1":
import io, time, logging, subprocess, tempfile import io, time, logging, subprocess, tempfile
import torch, soundfile as sf import torch, soundfile as sf
import numpy as np import numpy as np
from flask import Flask, request, jsonify, abort, send_file, stream_with_context from flask import Flask, request, jsonify, abort, send_file, stream_with_context, Response
from flask_cors import CORS from flask_cors import CORS
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -216,32 +216,33 @@ def speech():
# Handle PCM streaming # Handle PCM streaming
if fmt == "pcm" and USE_GRAPHS: if fmt == "pcm" and USE_GRAPHS:
log.info("Starting PCM streaming synthesis") log.info("Streaming PCM | %d chars | voice=%s speaker=%s",
t0 = time.monotonic() len(text), voice, info["speaker"])
try:
def generate_pcm():
t0 = time.monotonic()
chunks = 0 chunks = 0
def generator(): try:
nonlocal chunks for audio, sr, timing in tts.generate_custom_voice_streaming(
for audio_chunk in tts.generate_custom_voice_streaming(
text=text, text=text,
language=info["language"], language=info["language"],
speaker=info["speaker"], speaker=info["speaker"],
instruct=info["instruct"] or None, instruct=info["instruct"] or None,
max_new_tokens=max(60, int(len(text) * 2.5)) max_new_tokens=max(60, int(len(text) * 2.5)),
): ):
chunks += 1 chunks += 1
# Convert float32 numpy array to int16 PCM pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
pcm_chunk = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype(np.int16).tobytes() yield pcm.tobytes()
yield pcm_chunk except Exception as exc:
log.exception("PCM stream error after %d chunks", chunks)
elapsed = time.monotonic() - t0 return
log.info("PCM streaming completed in %.1fs with %d chunks", elapsed, chunks) log.info("PCM stream done: %d chunks in %.1fs", chunks, time.monotonic() - t0)
return Response(stream_with_context(generator()), mimetype="audio/pcm", headers={"Cache-Control": "no-cache"})
except Exception as exc: return Response(
log.exception("PCM streaming failed") stream_with_context(generate_pcm()),
abort(500, description=str(exc)) mimetype="audio/pcm",
elif fmt == "pcm": headers={"Cache-Control": "no-cache"},
log.warning("PCM streaming requires HIP_GRAPHS=1 to be enabled. Returning MP3 instead.") )
# Fall through to regular MP3 path below # Fall through to regular MP3 path below
try: try: