9router/open-sse/handlers/ttsProviders/genericFormats.js

// Generic config-driven TTS handlers — dispatched by ttsConfig.format.
// Each handler accepts { baseUrl, apiKey, text, modelId, voiceId } and returns { base64, format }.
import { responseToBase64, throwUpstreamError } from "./_base.js";

// Hyperbolic: POST { text } → { audio: base64 }
async function hyperbolic({ baseUrl, apiKey, text }) {
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
    body: JSON.stringify({ text }),
  });
  if (!res.ok) await throwUpstreamError(res);
  const data = await res.json();
  return { base64: data.audio, format: "mp3" };
}

// Deepgram: model via query, Token auth, returns binary
async function deepgram({ baseUrl, apiKey, text, modelId }) {
  const url = new URL(baseUrl);
  url.searchParams.set("model", modelId || "aura-asteria-en");
  const res = await fetch(url.toString(), {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Token ${apiKey}` },
    body: JSON.stringify({ text }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "mp3");
}

// Nvidia NIM: POST { input: { text }, voice, model } → binary
async function nvidia({ baseUrl, apiKey, text, modelId, voiceId }) {
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
    body: JSON.stringify({ input: { text }, voice: voiceId || "default", model: modelId }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "wav");
}

// HuggingFace: POST {baseUrl}/{modelId} { inputs: text } → binary
async function huggingface({ baseUrl, apiKey, text, modelId }) {
  if (!modelId || modelId.includes("..")) throw new Error("Invalid HuggingFace model ID");
  const res = await fetch(`${baseUrl}/${modelId}`, {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
    body: JSON.stringify({ inputs: text }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "wav");
}

// Inworld: Basic auth, JSON { audioContent }
async function inworld({ baseUrl, apiKey, text, modelId, voiceId }) {
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Basic ${apiKey}` },
    body: JSON.stringify({
      text,
      voiceId: voiceId || "Alex",
      modelId: modelId || "inworld-tts-1.5-mini",
      audioConfig: { audioEncoding: "MP3" },
    }),
  });
  if (!res.ok) await throwUpstreamError(res);
  const data = await res.json();
  if (!data.audioContent) throw new Error("Inworld TTS returned no audio");
  return { base64: data.audioContent, format: "mp3" };
}

// Cartesia: X-API-Key header
async function cartesia({ baseUrl, apiKey, text, modelId, voiceId }) {
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-API-Key": apiKey,
      "Cartesia-Version": "2024-06-10",
    },
    body: JSON.stringify({
      model_id: modelId || "sonic-2",
      transcript: text,
      ...(voiceId ? { voice: { mode: "id", id: voiceId } } : {}),
      output_format: { container: "mp3", bit_rate: 128000, sample_rate: 44100 },
    }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "mp3");
}

// PlayHT: token format "userId:apiKey", voice = s3 URL
async function playht({ baseUrl, apiKey, text, modelId, voiceId }) {
  const [userId, key] = (apiKey || ":").split(":");
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "Accept": "audio/mpeg",
      "X-USER-ID": userId || "",
      "Authorization": `Bearer ${key || apiKey}`,
    },
    body: JSON.stringify({
      text,
      voice: voiceId || "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
      voice_engine: modelId || "PlayDialog",
      output_format: "mp3",
      speed: 1,
    }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "mp3");
}

// Coqui (local, noAuth): POST { text, speaker_id } → WAV
async function coqui({ baseUrl, text, voiceId }) {
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ text, ...(voiceId ? { speaker_id: voiceId } : {}) }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "wav");
}

// Tortoise (local, noAuth)
async function tortoise({ baseUrl, text, voiceId }) {
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ text, voice: voiceId || "random" }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "wav");
}

// OpenAI-compatible upstream (qwen3-tts, etc.)
async function openaiCompat({ baseUrl, apiKey, text, modelId, voiceId }) {
  const headers = { "Content-Type": "application/json" };
  if (apiKey) headers["Authorization"] = `Bearer ${apiKey}`;
  const res = await fetch(baseUrl, {
    method: "POST",
    headers,
    body: JSON.stringify({
      model: modelId,
      input: text,
      voice: voiceId || "alloy",
      response_format: "mp3",
      speed: 1.0,
    }),
  });
  if (!res.ok) await throwUpstreamError(res);
  return responseToBase64(res, "mp3");
}

// format → handler dispatcher
export const FORMAT_HANDLERS = {
  hyperbolic,
  deepgram,
  "nvidia-tts": nvidia,
  "huggingface-tts": huggingface,
  inworld,
  cartesia,
  playht,
  coqui,
  tortoise,
  openai: openaiCompat,
};