Feat: Skills

This commit is contained in:
decolua
2026-05-04 11:29:02 +07:00
parent f08fa5f78d
commit 9c6be62a54
52 changed files with 2666 additions and 1581 deletions

View File

@@ -556,6 +556,40 @@ export const PROVIDER_MODELS = {
{ id: "black-forest-labs/FLUX.1-schnell", name: "FLUX.1 Schnell", type: "image", params: [] },
{ id: "stabilityai/stable-diffusion-xl-base-1.0", name: "SDXL Base 1.0", type: "image", params: [] },
],
"fal-ai": [
{ id: "fal-ai/flux/schnell", name: "FLUX Schnell", type: "image", params: ["n", "size"] },
{ id: "fal-ai/flux/dev", name: "FLUX Dev", type: "image", params: ["n", "size"] },
{ id: "fal-ai/flux-pro/v1.1", name: "FLUX Pro v1.1", type: "image", params: ["n", "size"] },
{ id: "fal-ai/flux-pro/v1.1-ultra", name: "FLUX Pro v1.1 Ultra", type: "image", params: ["n", "size"] },
{ id: "fal-ai/recraft-v3", name: "Recraft V3", type: "image", params: ["n", "size", "style"] },
{ id: "fal-ai/ideogram/v2", name: "Ideogram V2", type: "image", params: ["n", "size", "style"] },
{ id: "fal-ai/stable-diffusion-v35-large", name: "SD 3.5 Large", type: "image", params: ["n", "size"] },
],
"stability-ai": [
{ id: "stable-image-ultra", name: "Stable Image Ultra", type: "image", params: ["size"] },
{ id: "stable-image-core", name: "Stable Image Core", type: "image", params: ["size", "style"] },
{ id: "sd3.5-large", name: "Stable Diffusion 3.5 Large", type: "image", params: ["size"] },
{ id: "sd3.5-large-turbo", name: "Stable Diffusion 3.5 Large Turbo", type: "image", params: ["size"] },
{ id: "sd3.5-medium", name: "Stable Diffusion 3.5 Medium", type: "image", params: ["size"] },
],
"black-forest-labs": [
{ id: "flux-pro-1.1", name: "FLUX Pro 1.1", type: "image", params: ["n", "size"] },
{ id: "flux-pro-1.1-ultra", name: "FLUX Pro 1.1 Ultra", type: "image", params: ["size"] },
{ id: "flux-pro", name: "FLUX Pro", type: "image", params: ["n", "size"] },
{ id: "flux-dev", name: "FLUX Dev", type: "image", params: ["n", "size"] },
{ id: "flux-kontext-pro", name: "FLUX Kontext Pro (Edit)", type: "image", params: ["size"], capabilities: ["edit"] },
{ id: "flux-kontext-max", name: "FLUX Kontext Max (Edit)", type: "image", params: ["size"], capabilities: ["edit"] },
],
recraft: [
{ id: "recraftv3", name: "Recraft V3", type: "image", params: ["n", "size", "style"] },
{ id: "recraftv2", name: "Recraft V2", type: "image", params: ["n", "size", "style"] },
],
runwayml: [
{ id: "gen4_image", name: "Gen-4 Image", type: "image", params: ["size"] },
{ id: "gen4_image_turbo", name: "Gen-4 Image Turbo", type: "image", params: ["size"] },
{ id: "gen4_turbo", name: "Gen-4 Turbo", type: "video", params: [] },
{ id: "gen3a_turbo", name: "Gen-3 Alpha Turbo", type: "video", params: [] },
],
};
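The params array on each model entry lists the optional request fields that model accepts. A minimal sketch of gating user options against it (pickParams is a hypothetical helper, not part of this commit):

// Hypothetical: keep only the options a model declares in `params`.
function pickParams(modelEntry, options) {
  const allowed = new Set(modelEntry.params);
  return Object.fromEntries(
    Object.entries(options).filter(([key]) => allowed.has(key)),
  );
}
// Recraft V3 declares ["n", "size", "style"], so an unsupported `quality` is dropped:
pickParams({ id: "recraftv3", params: ["n", "size", "style"] }, { n: 2, size: "1024x1024", quality: "hd" });
// → { n: 2, size: "1024x1024" }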
// Helper functions

View File

@@ -0,0 +1,4 @@
// Shared embedding helpers
export function bearerAuth(creds) {
return { "Authorization": `Bearer ${creds.apiKey || creds.accessToken}` };
}
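Every adapter in this directory implements the same four methods that the core handler calls; sketched below as a JSDoc typedef for orientation (the typedef itself is not in the commit):

/**
 * @typedef {object} EmbeddingAdapter
 * @property {(model, creds, ctx) => string} buildUrl      // ctx carries { input } for Gemini's single/batch split
 * @property {(creds, ctx) => object} buildHeaders
 * @property {(model, { input, encoding_format, dimensions }) => object} buildBody
 * @property {(responseBody, model) => object} normalize   // returns the OpenAI "list" shape
 */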

View File

@@ -0,0 +1,42 @@
// Google Gemini embeddings — embedContent / batchEmbedContents
const BASE = "https://generativelanguage.googleapis.com/v1beta";
function modelPath(model) {
return model.startsWith("models/") ? model : `models/${model}`;
}
export default {
buildUrl: (model, creds, { input } = {}) => {
const apiKey = creds.apiKey || creds.accessToken;
const path = modelPath(model);
const op = Array.isArray(input) ? "batchEmbedContents" : "embedContent";
return `${BASE}/${path}:${op}?key=${encodeURIComponent(apiKey)}`;
},
buildHeaders: () => ({ "Content-Type": "application/json" }),
buildBody: (model, { input }) => {
const m = modelPath(model);
if (Array.isArray(input)) {
return { requests: input.map((text) => ({ model: m, content: { parts: [{ text: String(text) }] } })) };
}
return { model: m, content: { parts: [{ text: String(input) }] } };
},
normalize: (responseBody, model) => {
if (responseBody.object === "list" && Array.isArray(responseBody.data)) return responseBody;
let items = [];
if (Array.isArray(responseBody.embeddings)) {
items = responseBody.embeddings.map((emb, idx) => ({
object: "embedding",
index: idx,
embedding: emb.values || [],
}));
} else if (responseBody.embedding?.values) {
items = [{ object: "embedding", index: 0, embedding: responseBody.embedding.values }];
}
return {
object: "list",
data: items,
model,
usage: { prompt_tokens: 0, total_tokens: 0 },
};
},
};
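For illustration, what normalize produces for a Gemini batch response (sample values invented):

import gemini from "./gemini.js";

const upstream = { embeddings: [{ values: [0.1, 0.2] }, { values: [0.3, 0.4] }] };
gemini.normalize(upstream, "text-embedding-004");
// → {
//   object: "list",
//   data: [
//     { object: "embedding", index: 0, embedding: [0.1, 0.2] },
//     { object: "embedding", index: 1, embedding: [0.3, 0.4] },
//   ],
//   model: "text-embedding-004",
//   usage: { prompt_tokens: 0, total_tokens: 0 }, // Gemini reports no token usage
// }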

View File

@@ -0,0 +1,23 @@
// Embeddings provider adapter registry
import createOpenAIEmbeddingAdapter from "./openai.js";
import gemini from "./gemini.js";
import openaiCompatNode from "./openaiCompatNode.js";
const OPENAI_COMPAT_PROVIDERS = [
"openai", "openrouter", "mistral", "voyage-ai", "fireworks",
"together", "nebius", "github", "nvidia", "jina-ai",
];
const ADAPTERS = {
...Object.fromEntries(OPENAI_COMPAT_PROVIDERS.map((id) => [id, createOpenAIEmbeddingAdapter(id)])),
gemini,
google_ai_studio: gemini,
};
export function getEmbeddingAdapter(provider) {
if (ADAPTERS[provider]) return ADAPTERS[provider];
if (provider?.startsWith?.("openai-compatible-") || provider?.startsWith?.("custom-embedding-")) {
return openaiCompatNode;
}
return null;
}
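A minimal lookup-and-build sketch, assuming an apiKey credential:

import { getEmbeddingAdapter } from "./index.js";

const adapter = getEmbeddingAdapter("mistral"); // OpenAI-compatible adapter
const creds = { apiKey: "sk-placeholder" };
adapter.buildUrl("mistral-embed", creds, { input: "hello" });
// → "https://api.mistral.ai/v1/embeddings"
adapter.buildBody("mistral-embed", { input: "hello" });
// → { model: "mistral-embed", input: "hello" }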

View File

@@ -0,0 +1,39 @@
// OpenAI-compatible embeddings adapter (most providers)
import { bearerAuth } from "./_base.js";
const ENDPOINTS = {
openai: "https://api.openai.com/v1/embeddings",
openrouter: "https://openrouter.ai/api/v1/embeddings",
mistral: "https://api.mistral.ai/v1/embeddings",
"voyage-ai": "https://api.voyageai.com/v1/embeddings",
fireworks: "https://api.fireworks.ai/inference/v1/embeddings",
together: "https://api.together.xyz/v1/embeddings",
nebius: "https://api.tokenfactory.nebius.com/v1/embeddings",
github: "https://models.github.ai/inference/embeddings",
nvidia: "https://integrate.api.nvidia.com/v1/embeddings",
"jina-ai": "https://api.jina.ai/v1/embeddings",
};
export default function createOpenAIEmbeddingAdapter(providerId) {
return {
buildUrl: () => ENDPOINTS[providerId],
buildHeaders: (creds) => {
const headers = { "Content-Type": "application/json", ...bearerAuth(creds) };
if (providerId === "openrouter") {
headers["HTTP-Referer"] = "https://endpoint-proxy.local";
headers["X-Title"] = "Endpoint Proxy";
}
return headers;
},
buildBody: (model, { input, encoding_format, dimensions }) => {
const body = { model, input };
if (encoding_format) body.encoding_format = encoding_format;
if (dimensions != null && dimensions !== "") {
const dim = Number(dimensions);
if (Number.isFinite(dim) && dim > 0) body.dimensions = dim;
}
return body;
},
normalize: (responseBody) => responseBody,
};
}
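For example, buildBody coerces a string dimensions value into a positive number and silently drops anything invalid:

import createOpenAIEmbeddingAdapter from "./openai.js";

const adapter = createOpenAIEmbeddingAdapter("openai");
adapter.buildBody("text-embedding-3-small", { input: ["a", "b"], encoding_format: "float", dimensions: "256" });
// → { model: "text-embedding-3-small", input: ["a", "b"], encoding_format: "float", dimensions: 256 }
adapter.buildBody("text-embedding-3-small", { input: "x", dimensions: "-1" });
// → { model: "text-embedding-3-small", input: "x" }   (non-positive dimensions dropped)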

View File

@@ -0,0 +1,13 @@
// Custom node providers (openai-compatible-* / custom-embedding-*) — baseUrl from credentials
import createOpenAIEmbeddingAdapter from "./openai.js";
const baseAdapter = createOpenAIEmbeddingAdapter("openai");
export default {
...baseAdapter,
buildUrl: (_model, creds) => {
const rawBaseUrl = creds?.providerSpecificData?.baseUrl || "https://api.openai.com/v1";
const baseUrl = rawBaseUrl.replace(/\/$/, "").replace(/\/embeddings$/, "");
return `${baseUrl}/embeddings`;
},
};
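The two defensive replaces mean any stored baseUrl variant resolves to the same endpoint:

import adapter from "./openaiCompatNode.js";

for (const baseUrl of [
  "https://example.com/v1",            // bare
  "https://example.com/v1/",           // trailing slash
  "https://example.com/v1/embeddings", // already includes the path
]) {
  adapter.buildUrl("any-model", { providerSpecificData: { baseUrl } });
  // → "https://example.com/v1/embeddings" in every case
}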

View File

@@ -1,196 +1,13 @@
import { getModelTargetFormat, PROVIDER_ID_TO_ALIAS } from "../config/providerModels.js";
import { createErrorResult, parseUpstreamError, formatProviderError } from "../utils/error.js";
import { HTTP_STATUS } from "../config/runtimeConfig.js";
import { getExecutor } from "../executors/index.js";
import { refreshWithRetry } from "../services/tokenRefresh.js";
// Google AI (Gemini) provider aliases / identifiers
const GEMINI_PROVIDERS = new Set(["gemini", "google_ai_studio"]);
// Static map: provider id → embeddings endpoint (OpenAI-compatible body format)
const EMBEDDING_URLS = {
openai: "https://api.openai.com/v1/embeddings",
openrouter: "https://openrouter.ai/api/v1/embeddings",
mistral: "https://api.mistral.ai/v1/embeddings",
"voyage-ai": "https://api.voyageai.com/v1/embeddings",
fireworks: "https://api.fireworks.ai/inference/v1/embeddings",
together: "https://api.together.xyz/v1/embeddings",
nebius: "https://api.tokenfactory.nebius.com/v1/embeddings",
github: "https://models.github.ai/inference/embeddings",
nvidia: "https://integrate.api.nvidia.com/v1/embeddings",
};
import { getEmbeddingAdapter } from "./embeddingProviders/index.js";
/**
* Check whether a provider targets the Google AI (Gemini) embeddings API.
* @param {string} provider
*/
function isGeminiProvider(provider) {
return GEMINI_PROVIDERS.has(provider);
}
/**
* Build the embeddings request body for the target provider.
* Core embeddings handler — orchestrator only. Provider-specific URL/headers/body/normalize
* live in `./embeddingProviders/{id}.js`.
*
* - OpenAI / openai-compatible / openrouter: standard { model, input } format.
* - Google AI (Gemini): different format per API spec.
* - Single input → embedContent body: { model, content: { parts: [{ text }] } }
* - Batch input → batchEmbedContents body: { requests: [{ model, content: { parts: [{ text }] } }] }
*/
function buildEmbeddingsBody(provider, model, input, encodingFormat, dimensions) {
if (isGeminiProvider(provider)) {
// Normalize model name: Gemini API expects "models/<model>" prefix
const geminiModel = model.startsWith("models/") ? model : `models/${model}`;
if (Array.isArray(input)) {
// Batch request
return {
requests: input.map((text) => ({
model: geminiModel,
content: { parts: [{ text: String(text) }] }
}))
};
} else {
// Single request
return {
model: geminiModel,
content: { parts: [{ text: String(input) }] }
};
}
}
// Default: OpenAI format
const body = { model, input };
if (encodingFormat) {
body.encoding_format = encodingFormat;
}
if (dimensions != null && dimensions !== "") {
const dim = Number(dimensions);
if (Number.isFinite(dim) && dim > 0) body.dimensions = dim;
}
return body;
}
/**
* Build the URL for the embeddings endpoint based on the provider.
* @param {string} provider
* @param {string} model
* @param {object} credentials
* @param {string|string[]} input - used to select single vs batch endpoint for Gemini
*/
function buildEmbeddingsUrl(provider, model, credentials, input) {
if (isGeminiProvider(provider)) {
const apiKey = credentials.apiKey || credentials.accessToken;
// Normalize model name for URL path
const modelPath = model.startsWith("models/") ? model : `models/${model}`;
if (Array.isArray(input)) {
// batchEmbedContents for array input (keeps response format consistent even for length=1)
return `https://generativelanguage.googleapis.com/v1beta/${modelPath}:batchEmbedContents?key=${encodeURIComponent(apiKey)}`;
}
return `https://generativelanguage.googleapis.com/v1beta/${modelPath}:embedContent?key=${encodeURIComponent(apiKey)}`;
}
if (EMBEDDING_URLS[provider]) return EMBEDDING_URLS[provider];
// openai-compatible & custom-embedding providers: use their baseUrl + /embeddings
if (provider?.startsWith?.("openai-compatible-") || provider?.startsWith?.("custom-embedding-")) {
const rawBaseUrl = credentials?.providerSpecificData?.baseUrl || "https://api.openai.com/v1";
// Defensive: strip trailing slash and accidental /embeddings to avoid double-append
const baseUrl = rawBaseUrl.replace(/\/$/, "").replace(/\/embeddings$/, "");
return `${baseUrl}/embeddings`;
}
return null;
}
/**
* Build headers for the embeddings request.
*/
function buildEmbeddingsHeaders(provider, credentials) {
const headers = { "Content-Type": "application/json" };
if (isGeminiProvider(provider)) {
// Gemini API uses API key as query param — no Authorization header needed
return headers;
}
switch (provider) {
case "openai":
case "openrouter":
headers["Authorization"] = `Bearer ${credentials.apiKey || credentials.accessToken}`;
if (provider === "openrouter") {
headers["HTTP-Referer"] = "https://endpoint-proxy.local";
headers["X-Title"] = "Endpoint Proxy";
}
break;
default:
headers["Authorization"] = `Bearer ${credentials.apiKey || credentials.accessToken}`;
}
return headers;
}
/**
* Normalize the embeddings response to OpenAI format.
*
* Gemini single response:
* { embedding: { values: [0.1, 0.2, ...] } }
*
* Gemini batch response:
* { embeddings: [{ values: [...] }, ...] }
*
* Target OpenAI format:
* { object: "list", data: [{ object: "embedding", index: 0, embedding: [...] }], model, usage: {...} }
*/
function normalizeEmbeddingsResponse(responseBody, model, provider) {
// Already in OpenAI format
if (responseBody.object === "list" && Array.isArray(responseBody.data)) {
return responseBody;
}
if (isGeminiProvider(provider)) {
let embeddingItems = [];
if (Array.isArray(responseBody.embeddings)) {
// Batch response
embeddingItems = responseBody.embeddings.map((emb, idx) => ({
object: "embedding",
index: idx,
embedding: emb.values || []
}));
} else if (responseBody.embedding?.values) {
// Single response
embeddingItems = [{
object: "embedding",
index: 0,
embedding: responseBody.embedding.values
}];
}
return {
object: "list",
data: embeddingItems,
model,
usage: {
prompt_tokens: 0,
total_tokens: 0
}
};
}
// Try to handle alternate formats gracefully
return responseBody;
}
/**
* Core embeddings handler — shared between Worker and SSE server.
*
* @param {object} options
* @param {object} options.body - Parsed request body { model, input, encoding_format }
* @param {object} options.modelInfo - { provider, model }
* @param {object} options.credentials - Provider credentials
* @param {object} [options.log] - Logger
* @param {function} [options.onCredentialsRefreshed] - Called when creds are refreshed
* @param {function} [options.onRequestSuccess] - Called on success (clear error state)
* @returns {Promise<{ success: boolean, response: Response, status?: number, error?: string }>}
*/
export async function handleEmbeddingsCore({
@@ -199,7 +16,7 @@ export async function handleEmbeddingsCore({
credentials,
log,
onCredentialsRefreshed,
onRequestSuccess
onRequestSuccess,
}) {
const { provider, model } = modelInfo;
@@ -212,19 +29,22 @@ export async function handleEmbeddingsCore({
return createErrorResult(HTTP_STATUS.BAD_REQUEST, "input must be a string or array of strings");
}
const encodingFormat = body.encoding_format || "float";
// Determine embeddings URL
const url = buildEmbeddingsUrl(provider, model, credentials, input);
if (!url) {
const adapter = getEmbeddingAdapter(provider);
if (!adapter) {
return createErrorResult(
HTTP_STATUS.BAD_REQUEST,
`Provider '${provider}' does not support embeddings. Use openai, openrouter, gemini, or an openai-compatible provider.`
`Provider '${provider}' does not support embeddings.`
);
}
const headers = buildEmbeddingsHeaders(provider, credentials);
const requestBody = buildEmbeddingsBody(provider, model, input, encodingFormat, body.dimensions);
const ctx = { input };
const url = adapter.buildUrl(model, credentials, ctx);
const headers = adapter.buildHeaders(credentials, ctx);
const requestBody = adapter.buildBody(model, {
input,
encoding_format: body.encoding_format || "float",
dimensions: body.dimensions,
});
log?.debug?.("EMBEDDINGS", `${provider.toUpperCase()} | ${model} | input_type=${Array.isArray(input) ? `array[${input.length}]` : "string"}`);
@@ -233,7 +53,7 @@ export async function handleEmbeddingsCore({
providerResponse = await fetch(url, {
method: "POST",
headers,
body: JSON.stringify(requestBody)
body: JSON.stringify(requestBody),
});
} catch (error) {
const errMsg = formatProviderError(error, provider, model, HTTP_STATUS.BAD_GATEWAY);
@@ -244,9 +64,9 @@ export async function handleEmbeddingsCore({
// Handle 401/403 — try token refresh (skip for noAuth providers)
const executor = getExecutor(provider);
if (
!executor.noAuth &&
!executor?.noAuth &&
(providerResponse.status === HTTP_STATUS.UNAUTHORIZED ||
providerResponse.status === HTTP_STATUS.FORBIDDEN)
providerResponse.status === HTTP_STATUS.FORBIDDEN)
) {
const newCredentials = await refreshWithRetry(
() => executor.refreshCredentials(credentials, log),
@@ -257,24 +77,17 @@ export async function handleEmbeddingsCore({
if (newCredentials?.accessToken || newCredentials?.apiKey) {
log?.info?.("TOKEN", `${provider.toUpperCase()} | refreshed for embeddings`);
Object.assign(credentials, newCredentials);
if (onCredentialsRefreshed && newCredentials) {
await onCredentialsRefreshed(newCredentials);
}
if (onCredentialsRefreshed) await onCredentialsRefreshed(newCredentials);
// Retry with refreshed credentials
try {
const retryHeaders = buildEmbeddingsHeaders(provider, credentials);
// Rebuild URL for Gemini since API key is embedded in query param
const retryUrl = isGeminiProvider(provider)
? buildEmbeddingsUrl(provider, model, credentials, input)
: url;
const retryHeaders = adapter.buildHeaders(credentials, ctx);
const retryUrl = adapter.buildUrl(model, credentials, ctx);
providerResponse = await fetch(retryUrl, {
method: "POST",
headers: retryHeaders,
body: JSON.stringify(requestBody)
body: JSON.stringify(requestBody),
});
} catch (retryError) {
} catch {
log?.warn?.("TOKEN", `${provider.toUpperCase()} | retry after refresh failed`);
}
} else {
@@ -292,16 +105,13 @@ export async function handleEmbeddingsCore({
let responseBody;
try {
responseBody = await providerResponse.json();
} catch (parseError) {
} catch {
return createErrorResult(HTTP_STATUS.BAD_GATEWAY, `Invalid JSON response from ${provider}`);
}
if (onRequestSuccess) {
await onRequestSuccess();
}
const normalized = normalizeEmbeddingsResponse(responseBody, model, provider);
if (onRequestSuccess) await onRequestSuccess();
const normalized = adapter.normalize(responseBody, model);
log?.debug?.("EMBEDDINGS", `Success | usage=${JSON.stringify(normalized.usage || {})}`);
return {
@@ -309,8 +119,8 @@ export async function handleEmbeddingsCore({
response: new Response(JSON.stringify(normalized), {
headers: {
"Content-Type": "application/json",
"Access-Control-Allow-Origin": "*"
}
})
"Access-Control-Allow-Origin": "*",
},
}),
};
}
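A usage sketch of the refactored handler (import path assumed; modelInfo resolution lives elsewhere in the repo):

import { handleEmbeddingsCore } from "./embeddings.js"; // path assumed

const result = await handleEmbeddingsCore({
  body: { model: "text-embedding-3-small", input: ["hello", "world"] },
  modelInfo: { provider: "openai", model: "text-embedding-3-small" },
  credentials: { apiKey: process.env.OPENAI_API_KEY },
  log: console,
});
if (result.success) {
  const json = await result.response.json(); // OpenAI-shaped { object: "list", data: [...] }
}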

View File

@@ -1,406 +1,23 @@
import { randomUUID } from "node:crypto";
import { createErrorResult, parseUpstreamError, formatProviderError } from "../utils/error.js";
import { HTTP_STATUS } from "../config/runtimeConfig.js";
import { refreshWithRetry } from "../services/tokenRefresh.js";
import { getExecutor } from "../executors/index.js";
const CODEX_RESPONSES_URL = "https://chatgpt.com/backend-api/codex/responses";
const CODEX_USER_AGENT = "codex-imagen/0.2.6";
const CODEX_VERSION = "0.122.0";
const CODEX_ORIGINATOR = "codex_cli_rs";
const CODEX_MODEL_SUFFIX = "-image";
const CODEX_REF_DETAIL = "high";
// Image provider configurations
const IMAGE_PROVIDERS = {
openai: {
baseUrl: "https://api.openai.com/v1/images/generations",
format: "openai",
},
gemini: {
baseUrl: "https://generativelanguage.googleapis.com/v1beta/models",
format: "gemini",
},
minimax: {
baseUrl: "https://api.minimaxi.com/v1/images/generations",
format: "openai",
},
openrouter: {
baseUrl: "https://openrouter.ai/api/v1/images/generations",
format: "openai",
},
nanobanana: {
baseUrl: "https://api.nanobananaapi.ai/api/v1/nanobanana/generate",
format: "nanobanana",
},
sdwebui: {
baseUrl: "http://localhost:7860/sdapi/v1/txt2img",
format: "sdwebui",
},
comfyui: {
baseUrl: "http://localhost:8188",
format: "comfyui",
},
huggingface: {
baseUrl: "https://api-inference.huggingface.co/models",
format: "huggingface",
},
codex: {
baseUrl: CODEX_RESPONSES_URL,
format: "codex",
stream: true,
},
};
// Decode codex chatgpt account id from idToken if not stored
function decodeCodexAccountId(idToken) {
try {
const parts = String(idToken || "").split(".");
if (parts.length !== 3) return null;
const b64 = parts[1].replace(/-/g, "+").replace(/_/g, "/");
const pad = (4 - (b64.length % 4)) % 4;
const payload = JSON.parse(Buffer.from(b64 + "=".repeat(pad), "base64").toString("utf8"));
return payload?.["https://api.openai.com/auth"]?.chatgpt_account_id || null;
} catch {
return null;
}
}
// Strip "-image" suffix to get the underlying chat model
function stripCodexImageModel(model) {
return model.endsWith(CODEX_MODEL_SUFFIX)
? model.slice(0, -CODEX_MODEL_SUFFIX.length)
: model;
}
// Normalize a single ref image input to a data URL
function toCodexDataUrl(input) {
if (!input) return null;
if (typeof input !== "string") return null;
if (/^data:image\//i.test(input) || /^https?:\/\//i.test(input)) return input;
// assume raw base64 PNG
return `data:image/png;base64,${input}`;
}
// Build content array with optional reference images, mirroring codex-imagen tagging
function buildCodexContent(prompt, refs, detail = CODEX_REF_DETAIL) {
const content = [];
refs.forEach((url, index) => {
content.push({ type: "input_text", text: `<image name=image${index + 1}>` });
content.push({ type: "input_image", image_url: url, detail });
content.push({ type: "input_text", text: "</image>" });
});
content.push({ type: "input_text", text: prompt });
return content;
}
// Parse Codex SSE stream, log progress, return final base64 image.
// Optional callbacks let caller forward events to client (SSE pipe).
async function parseCodexImageStream(response, log, callbacks = {}) {
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
let imageB64 = null;
let lastEvent = null;
let bytesReceived = 0;
let lastProgressLogMs = 0;
while (true) {
const { done, value } = await reader.read();
if (done) break;
bytesReceived += value?.byteLength || 0;
buffer += decoder.decode(value, { stream: true });
// SSE events separated by blank line
let sepIdx;
while ((sepIdx = buffer.indexOf("\n\n")) !== -1) {
const block = buffer.slice(0, sepIdx);
buffer = buffer.slice(sepIdx + 2);
const lines = block.split("\n");
let eventName = null;
let dataStr = "";
for (const line of lines) {
if (line.startsWith("event:")) eventName = line.slice(6).trim();
else if (line.startsWith("data:")) dataStr += line.slice(5).trim();
}
if (!eventName) continue;
if (eventName !== lastEvent) {
log?.info?.("IMAGE", `codex progress: ${eventName}`);
lastEvent = eventName;
}
// Notify caller about progress (throttled to ~5/s to avoid flooding)
const now = Date.now();
if (callbacks.onProgress && now - lastProgressLogMs > 200) {
lastProgressLogMs = now;
callbacks.onProgress({ stage: eventName, bytesReceived });
}
if (eventName === "response.image_generation_call.partial_image" && dataStr) {
try {
const data = JSON.parse(dataStr);
if (callbacks.onPartialImage && data?.partial_image_b64) {
callbacks.onPartialImage({ b64_json: data.partial_image_b64, index: data.partial_image_index });
}
} catch {}
}
if (eventName === "response.output_item.done" && dataStr) {
try {
const data = JSON.parse(dataStr);
const item = data?.item;
if (item?.type === "image_generation_call" && item.result) {
imageB64 = item.result;
}
} catch {}
}
}
}
return imageB64;
}
// Build SSE Response that pipes codex progress + partial + done events to client
function buildCodexSseResponse(providerResponse, log, onSuccess) {
const stream = new ReadableStream({
async start(controller) {
const enc = new TextEncoder();
const send = (event, data) => {
controller.enqueue(enc.encode(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`));
};
try {
const b64 = await parseCodexImageStream(providerResponse, log, {
onProgress: (info) => send("progress", info),
onPartialImage: (info) => send("partial_image", info),
});
if (!b64) {
send("error", { message: "Codex did not return an image. Account may not be entitled (Plus/Pro required)." });
} else {
if (onSuccess) await onSuccess();
send("done", {
created: Math.floor(Date.now() / 1000),
data: [{ b64_json: b64 }],
});
}
} catch (err) {
send("error", { message: err?.message || "Stream failed" });
} finally {
controller.close();
}
},
});
return new Response(stream, {
headers: {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache, no-transform",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
"Access-Control-Allow-Origin": "*",
},
});
}
import { getImageAdapter } from "./imageProviders/index.js";
import { urlToBase64 } from "./imageProviders/_base.js";
/**
* Build image generation URL
*/
function buildImageUrl(provider, model, credentials) {
const config = IMAGE_PROVIDERS[provider];
if (!config) return null;
switch (provider) {
case "gemini": {
const apiKey = credentials?.apiKey || credentials?.accessToken;
const modelId = model.replace(/^models\//, "");
return `${config.baseUrl}/${modelId}:generateContent?key=${encodeURIComponent(apiKey)}`;
}
case "huggingface":
return `${config.baseUrl}/${model}`;
case "codex":
return CODEX_RESPONSES_URL;
default:
return config.baseUrl;
}
}
/**
* Build request headers
*/
function buildImageHeaders(provider, credentials) {
const headers = { "Content-Type": "application/json" };
if (provider === "gemini") {
return headers;
}
if (provider === "codex") {
const accountId =
credentials?.providerSpecificData?.chatgptAccountId ||
decodeCodexAccountId(credentials?.idToken);
return {
"accept": "text/event-stream, application/json",
"authorization": `Bearer ${credentials?.accessToken || ""}`,
"chatgpt-account-id": accountId || "",
"content-type": "application/json",
"originator": CODEX_ORIGINATOR,
"session_id": randomUUID(),
"user-agent": CODEX_USER_AGENT,
"version": CODEX_VERSION,
"x-client-request-id": randomUUID(),
};
}
if (provider === "openrouter") {
headers["Authorization"] = `Bearer ${credentials?.apiKey || credentials?.accessToken}`;
headers["HTTP-Referer"] = "https://endpoint-proxy.local";
headers["X-Title"] = "Endpoint Proxy";
return headers;
}
if (provider === "huggingface") {
headers["Authorization"] = `Bearer ${credentials?.apiKey || credentials?.accessToken}`;
return headers;
}
if (credentials?.apiKey || credentials?.accessToken) {
headers["Authorization"] = `Bearer ${credentials.apiKey || credentials.accessToken}`;
}
return headers;
}
/**
* Build request body based on provider format
*/
function buildImageBody(provider, model, body) {
const { prompt, n = 1, size = "1024x1024", quality, style, response_format, image, images } = body;
switch (provider) {
case "codex": {
const refs = [];
if (Array.isArray(images)) images.forEach((i) => { const u = toCodexDataUrl(i); if (u) refs.push(u); });
const single = toCodexDataUrl(image);
if (single) refs.push(single);
const detail = body.image_detail || CODEX_REF_DETAIL;
const imgTool = { type: "image_generation", output_format: (body.output_format || "png").toLowerCase() };
if (body.size && body.size !== "") imgTool.size = body.size;
if (body.quality && body.quality !== "") imgTool.quality = body.quality;
if (body.background && body.background !== "") imgTool.background = body.background;
return {
model: stripCodexImageModel(model),
instructions: "",
input: [{ type: "message", role: "user", content: buildCodexContent(prompt, refs, detail) }],
tools: [imgTool],
tool_choice: "auto",
parallel_tool_calls: false,
prompt_cache_key: randomUUID(),
stream: true,
store: false,
reasoning: null,
};
}
case "gemini":
return {
contents: [{ parts: [{ text: prompt }] }],
generationConfig: {
responseModalities: ["TEXT", "IMAGE"],
},
};
case "sdwebui": {
const [width, height] = size.split("x").map(Number);
return {
prompt,
width: width || 512,
height: height || 512,
steps: 20,
batch_size: n,
};
}
case "nanobanana": {
const sizeMap = {
"1024x1024": "1:1",
"1024x1792": "9:16",
"1792x1024": "16:9",
};
return {
prompt,
type: "TEXTTOIAMGE",
numImages: n,
image_size: sizeMap[size] || "1:1",
};
}
default:
// OpenAI-compatible format
const requestBody = { model, prompt, n, size };
if (quality) requestBody.quality = quality;
if (style) requestBody.style = style;
if (response_format) requestBody.response_format = response_format;
return requestBody;
}
}
/**
* Normalize response to OpenAI format
*/
function normalizeImageResponse(responseBody, provider, prompt) {
// Already in OpenAI format
if (responseBody.created && Array.isArray(responseBody.data)) {
return responseBody;
}
const timestamp = Math.floor(Date.now() / 1000);
switch (provider) {
case "gemini": {
const parts = responseBody.candidates?.[0]?.content?.parts || [];
const images = parts
.filter((p) => p.inlineData?.data)
.map((p) => ({ b64_json: p.inlineData.data }));
return {
created: timestamp,
data: images.length > 0 ? images : [{ b64_json: "", revised_prompt: prompt }],
};
}
case "sdwebui": {
const images = Array.isArray(responseBody.images)
? responseBody.images.map((img) => ({ b64_json: img }))
: [];
return { created: timestamp, data: images };
}
case "nanobanana": {
if (responseBody.image) {
return {
created: timestamp,
data: [{ b64_json: responseBody.image, revised_prompt: prompt }],
};
}
return { created: timestamp, data: [] };
}
case "huggingface": {
// HuggingFace returns binary image data
return responseBody;
}
default:
return responseBody;
}
}
/**
* Core image generation handler
* Core image generation handler — orchestrator only.
* Provider-specific URL/headers/body/parse/normalize live in `./imageProviders/{id}.js`.
*
* @param {object} options
* @param {object} options.body - Request body { model, prompt, n, size, ... }
* @param {object} options.modelInfo - { provider, model }
* @param {object} options.credentials - Provider credentials
* @param {object} [options.log] - Logger
* @param {function} [options.onCredentialsRefreshed] - Called when creds are refreshed
* @param {function} [options.onRequestSuccess] - Called on success
* @param {boolean} [options.streamToClient] - Pipe SSE to client (codex)
* @param {boolean} [options.binaryOutput] - Return raw image bytes
* @param {function} [options.onCredentialsRefreshed]
* @param {function} [options.onRequestSuccess]
* @returns {Promise<{ success: boolean, response: Response, status?: number, error?: string }>}
*/
export async function handleImageGenerationCore({
@@ -419,16 +36,17 @@ export async function handleImageGenerationCore({
return createErrorResult(HTTP_STATUS.BAD_REQUEST, "Missing required field: prompt");
}
const url = buildImageUrl(provider, model, credentials);
if (!url) {
const adapter = getImageAdapter(provider);
if (!adapter) {
return createErrorResult(
HTTP_STATUS.BAD_REQUEST,
`Provider '${provider}' does not support image generation`
);
}
const headers = buildImageHeaders(provider, credentials);
const requestBody = buildImageBody(provider, model, body);
const url = adapter.buildUrl(model, credentials);
const headers = adapter.buildHeaders(credentials);
const requestBody = adapter.buildBody(model, body);
log?.debug?.("IMAGE", `${provider.toUpperCase()} | ${model} | prompt="${body.prompt.slice(0, 50)}..."`);
@@ -445,10 +63,11 @@ export async function handleImageGenerationCore({
return createErrorResult(HTTP_STATUS.BAD_GATEWAY, errMsg);
}
// Handle 401/403 — try token refresh
// Handle 401/403 — try token refresh (skipped for noAuth providers)
const executor = getExecutor(provider);
if (
!executor?.noAuth &&
!adapter.noAuth &&
(providerResponse.status === HTTP_STATUS.UNAUTHORIZED ||
providerResponse.status === HTTP_STATUS.FORBIDDEN)
) {
@@ -461,20 +80,17 @@ export async function handleImageGenerationCore({
if (newCredentials?.accessToken || newCredentials?.apiKey) {
log?.info?.("TOKEN", `${provider.toUpperCase()} | refreshed for image generation`);
Object.assign(credentials, newCredentials);
if (onCredentialsRefreshed && newCredentials) {
await onCredentialsRefreshed(newCredentials);
}
if (onCredentialsRefreshed) await onCredentialsRefreshed(newCredentials);
try {
const retryHeaders = buildImageHeaders(provider, credentials);
const retryUrl = provider === "gemini" ? buildImageUrl(provider, model, credentials) : url;
const retryHeaders = adapter.buildHeaders(credentials);
const retryUrl = adapter.buildUrl(model, credentials);
providerResponse = await fetch(retryUrl, {
method: "POST",
headers: retryHeaders,
body: JSON.stringify(requestBody),
});
} catch (retryError) {
} catch {
log?.warn?.("TOKEN", `${provider.toUpperCase()} | retry after refresh failed`);
}
} else {
@@ -489,51 +105,42 @@ export async function handleImageGenerationCore({
return createErrorResult(statusCode, errMsg);
}
let responseBody;
// Parse provider response — adapter may override (codex SSE / async polling / binary)
let parsed;
try {
if (provider === "huggingface") {
const buffer = await providerResponse.arrayBuffer();
const base64 = Buffer.from(buffer).toString("base64");
responseBody = {
created: Math.floor(Date.now() / 1000),
data: [{ b64_json: base64 }],
};
} else if (provider === "codex") {
// SSE pipe to client (progress + partial_image + done)
if (streamToClient) {
return {
success: true,
response: buildCodexSseResponse(providerResponse, log, onRequestSuccess),
};
if (adapter.parseResponse) {
parsed = await adapter.parseResponse(providerResponse, {
headers,
log,
streamToClient,
onRequestSuccess,
});
// Codex streaming case: returns an SSE Response directly
if (parsed?.sseResponse) {
return { success: true, response: parsed.sseResponse };
}
const b64 = await parseCodexImageStream(providerResponse, log);
if (!b64) {
return createErrorResult(
HTTP_STATUS.BAD_GATEWAY,
"Codex did not return an image. Account may not be entitled (Plus/Pro required)."
);
}
responseBody = {
created: Math.floor(Date.now() / 1000),
data: [{ b64_json: b64 }],
};
} else {
responseBody = await providerResponse.json();
parsed = await providerResponse.json();
}
} catch (parseError) {
return createErrorResult(HTTP_STATUS.BAD_GATEWAY, `Invalid response from ${provider}`);
return createErrorResult(HTTP_STATUS.BAD_GATEWAY, parseError.message || `Invalid response from ${provider}`);
}
if (onRequestSuccess) {
await onRequestSuccess();
}
if (onRequestSuccess) await onRequestSuccess();
const normalized = normalizeImageResponse(responseBody, provider, body.prompt);
// Normalize → OpenAI-compatible shape
const normalized = adapter.normalize(parsed, body.prompt);
// Binary output: decode first b64_json into raw bytes
// Already in OpenAI shape? skip re-normalize
const finalBody = (normalized.created && Array.isArray(normalized.data)) ? normalized : parsed;
// Binary output: decode first b64_json (or fetch url) into raw bytes
if (binaryOutput) {
const first = normalized.data?.[0];
const b64 = first?.b64_json;
const first = finalBody.data?.[0];
let b64 = first?.b64_json;
if (!b64 && first?.url) {
try { b64 = await urlToBase64(first.url); } catch {}
}
if (b64) {
const buf = Buffer.from(b64, "base64");
const fmt = (body.output_format || "png").toLowerCase();
@@ -553,7 +160,7 @@ export async function handleImageGenerationCore({
return {
success: true,
response: new Response(JSON.stringify(normalized), {
response: new Response(JSON.stringify(finalBody), {
headers: {
"Content-Type": "application/json",
"Access-Control-Allow-Origin": "*",

View File

@@ -0,0 +1,31 @@
// Shared helpers for image provider adapters
export const POLL_INTERVAL_MS = 1500;
export const POLL_TIMEOUT_MS = 120000;
export const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
// Map OpenAI size to provider-specific aspect ratio
export function sizeToAspectRatio(size) {
if (!size || typeof size !== "string") return "1:1";
const map = {
"1024x1024": "1:1",
"1024x1792": "9:16",
"1792x1024": "16:9",
"1024x1536": "2:3",
"1536x1024": "3:2",
};
return map[size] || "1:1";
}
// Fetch URL → base64 (for providers returning image URLs)
export async function urlToBase64(url) {
const res = await fetch(url);
if (!res.ok) throw new Error(`Failed to fetch image: ${res.status}`);
const buf = await res.arrayBuffer();
return Buffer.from(buf).toString("base64");
}
export function nowSec() {
return Math.floor(Date.now() / 1000);
}
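Example mappings; anything outside the table falls back to "1:1":

import { sizeToAspectRatio } from "./_base.js";

sizeToAspectRatio("1792x1024"); // → "16:9"
sizeToAspectRatio("1024x1536"); // → "2:3"
sizeToAspectRatio("800x600");   // → "1:1" (unmapped size)
sizeToAspectRatio(undefined);   // → "1:1"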

View File

@@ -0,0 +1,43 @@
// Black Forest Labs (FLUX) — async submit + polling_url
import { sleep, nowSec, POLL_INTERVAL_MS, POLL_TIMEOUT_MS } from "./_base.js";
const BASE_URL = "https://api.bfl.ai/v1";
export default {
async: true,
buildUrl: (model) => `${BASE_URL}/${model}`,
buildHeaders: (creds) => {
const key = creds?.apiKey || creds?.accessToken;
return { "Content-Type": "application/json", "x-key": key };
},
buildBody: (_model, body) => {
const req = { prompt: body.prompt };
if (body.size) {
const [w, h] = body.size.split("x").map(Number);
if (w) req.width = w;
if (h) req.height = h;
}
if (body.image) req.image_prompt = body.image;
return req;
},
async parseResponse(response, { headers }) {
const data = await response.json();
const pollingUrl = data.polling_url;
if (!pollingUrl) throw new Error("BFL: no polling_url returned");
const deadline = Date.now() + POLL_TIMEOUT_MS;
while (Date.now() < deadline) {
await sleep(POLL_INTERVAL_MS);
const r = await fetch(pollingUrl, { headers: { "x-key": headers["x-key"], "Accept": "application/json" } });
if (!r.ok) throw new Error(`BFL status ${r.status}`);
const s = await r.json();
if (s.status === "Ready") return s;
if (s.status === "Error" || s.status === "Failed") throw new Error(s.error || "BFL generation failed");
}
throw new Error("BFL polling timeout");
},
normalize: (responseBody) => {
const sample = responseBody.result?.sample;
if (sample) return { created: nowSec(), data: [{ url: sample }] };
return { created: nowSec(), data: [] };
},
};
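BFL, Fal, NanoBanana, and Runway all follow the same submit-then-poll shape; a condensed sketch of how the orchestrator drives such an adapter (flow simplified, refresh and error handling omitted):

// Simplified driver for an async adapter:
async function runAsyncAdapter(adapter, model, creds, body) {
  const headers = adapter.buildHeaders(creds);
  const submitRes = await fetch(adapter.buildUrl(model, creds), {
    method: "POST",
    headers,
    body: JSON.stringify(adapter.buildBody(model, body)),
  });
  // parseResponse owns the polling loop and resolves with the final payload
  const parsed = await adapter.parseResponse(submitRes, { headers });
  return adapter.normalize(parsed, body.prompt); // → { created, data: [...] }
}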

View File

@@ -0,0 +1,198 @@
// Codex (ChatGPT Plus/Pro) image generation via Responses API + SSE
import { randomUUID } from "node:crypto";
import { nowSec } from "./_base.js";
const CODEX_RESPONSES_URL = "https://chatgpt.com/backend-api/codex/responses";
const CODEX_USER_AGENT = "codex-imagen/0.2.6";
const CODEX_VERSION = "0.122.0";
const CODEX_ORIGINATOR = "codex_cli_rs";
const CODEX_MODEL_SUFFIX = "-image";
const CODEX_REF_DETAIL = "high";
function decodeAccountId(idToken) {
try {
const parts = String(idToken || "").split(".");
if (parts.length !== 3) return null;
const b64 = parts[1].replace(/-/g, "+").replace(/_/g, "/");
const pad = (4 - (b64.length % 4)) % 4;
const payload = JSON.parse(Buffer.from(b64 + "=".repeat(pad), "base64").toString("utf8"));
return payload?.["https://api.openai.com/auth"]?.chatgpt_account_id || null;
} catch {
return null;
}
}
function stripImageSuffix(model) {
return model.endsWith(CODEX_MODEL_SUFFIX) ? model.slice(0, -CODEX_MODEL_SUFFIX.length) : model;
}
function toDataUrl(input) {
if (!input || typeof input !== "string") return null;
if (/^data:image\//i.test(input) || /^https?:\/\//i.test(input)) return input;
return `data:image/png;base64,${input}`;
}
function buildContent(prompt, refs, detail = CODEX_REF_DETAIL) {
const content = [];
refs.forEach((url, index) => {
content.push({ type: "input_text", text: `<image name=image${index + 1}>` });
content.push({ type: "input_image", image_url: url, detail });
content.push({ type: "input_text", text: "</image>" });
});
content.push({ type: "input_text", text: prompt });
return content;
}
// Parse Codex SSE stream → final base64 image. Optional callbacks for client streaming.
async function parseStream(response, log, callbacks = {}) {
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
let imageB64 = null;
let lastEvent = null;
let bytesReceived = 0;
let lastProgressLogMs = 0;
while (true) {
const { done, value } = await reader.read();
if (done) break;
bytesReceived += value?.byteLength || 0;
buffer += decoder.decode(value, { stream: true });
let sepIdx;
while ((sepIdx = buffer.indexOf("\n\n")) !== -1) {
const block = buffer.slice(0, sepIdx);
buffer = buffer.slice(sepIdx + 2);
const lines = block.split("\n");
let eventName = null;
let dataStr = "";
for (const line of lines) {
if (line.startsWith("event:")) eventName = line.slice(6).trim();
else if (line.startsWith("data:")) dataStr += line.slice(5).trim();
}
if (!eventName) continue;
if (eventName !== lastEvent) {
log?.info?.("IMAGE", `codex progress: ${eventName}`);
lastEvent = eventName;
}
const now = Date.now();
if (callbacks.onProgress && now - lastProgressLogMs > 200) {
lastProgressLogMs = now;
callbacks.onProgress({ stage: eventName, bytesReceived });
}
if (eventName === "response.image_generation_call.partial_image" && dataStr) {
try {
const data = JSON.parse(dataStr);
if (callbacks.onPartialImage && data?.partial_image_b64) {
callbacks.onPartialImage({ b64_json: data.partial_image_b64, index: data.partial_image_index });
}
} catch {}
}
if (eventName === "response.output_item.done" && dataStr) {
try {
const data = JSON.parse(dataStr);
const item = data?.item;
if (item?.type === "image_generation_call" && item.result) {
imageB64 = item.result;
}
} catch {}
}
}
}
return imageB64;
}
// SSE Response that pipes codex progress + partial + done events to client
function buildSseResponse(providerResponse, log, onSuccess) {
const stream = new ReadableStream({
async start(controller) {
const enc = new TextEncoder();
const send = (event, data) => {
controller.enqueue(enc.encode(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`));
};
try {
const b64 = await parseStream(providerResponse, log, {
onProgress: (info) => send("progress", info),
onPartialImage: (info) => send("partial_image", info),
});
if (!b64) {
send("error", { message: "Codex did not return an image. Account may not be entitled (Plus/Pro required)." });
} else {
if (onSuccess) await onSuccess();
send("done", { created: nowSec(), data: [{ b64_json: b64 }] });
}
} catch (err) {
send("error", { message: err?.message || "Stream failed" });
} finally {
controller.close();
}
},
});
return new Response(stream, {
headers: {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache, no-transform",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
"Access-Control-Allow-Origin": "*",
},
});
}
export default {
stream: true,
buildUrl: () => CODEX_RESPONSES_URL,
buildHeaders: (creds) => {
const accountId = creds?.providerSpecificData?.chatgptAccountId || decodeAccountId(creds?.idToken);
return {
"accept": "text/event-stream, application/json",
"authorization": `Bearer ${creds?.accessToken || ""}`,
"chatgpt-account-id": accountId || "",
"content-type": "application/json",
"originator": CODEX_ORIGINATOR,
"session_id": randomUUID(),
"user-agent": CODEX_USER_AGENT,
"version": CODEX_VERSION,
"x-client-request-id": randomUUID(),
};
},
buildBody: (model, body) => {
const refs = [];
if (Array.isArray(body.images)) body.images.forEach((i) => { const u = toDataUrl(i); if (u) refs.push(u); });
const single = toDataUrl(body.image);
if (single) refs.push(single);
const detail = body.image_detail || CODEX_REF_DETAIL;
const imgTool = { type: "image_generation", output_format: (body.output_format || "png").toLowerCase() };
if (body.size && body.size !== "") imgTool.size = body.size;
if (body.quality && body.quality !== "") imgTool.quality = body.quality;
if (body.background && body.background !== "") imgTool.background = body.background;
return {
model: stripImageSuffix(model),
instructions: "",
input: [{ type: "message", role: "user", content: buildContent(body.prompt, refs, detail) }],
tools: [imgTool],
tool_choice: "auto",
parallel_tool_calls: false,
prompt_cache_key: randomUUID(),
stream: true,
store: false,
reasoning: null,
};
},
// Custom: codex parses SSE → either pipe to client or collect b64
async parseResponse(response, { log, streamToClient, onRequestSuccess }) {
if (streamToClient) {
return { sseResponse: buildSseResponse(response, log, onRequestSuccess) };
}
const b64 = await parseStream(response, log);
if (!b64) {
throw new Error("Codex did not return an image. Account may not be entitled (Plus/Pro required).");
}
return { created: nowSec(), data: [{ b64_json: b64 }] };
},
normalize: (responseBody) => responseBody,
};
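When streamToClient is set, the piped stream carries four event types (progress, partial_image, done, error). A minimal consumer sketch; the endpoint path and model id here are illustrative, not defined by this commit:

const res = await fetch("/v1/images/generations", { // assumed route
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ model: "gpt-5-image", prompt: "a red fox", stream: true }), // model id hypothetical
});
const reader = res.body.getReader();
const decoder = new TextDecoder();
let buf = "";
while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  buf += decoder.decode(value, { stream: true });
  let i;
  while ((i = buf.indexOf("\n\n")) !== -1) {
    const block = buf.slice(0, i);
    buf = buf.slice(i + 2);
    // block holds "event: <name>\ndata: <json>"; dispatch on progress / partial_image / done / error
  }
}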

View File

@@ -0,0 +1,8 @@
// ComfyUI — local, noAuth (placeholder; full graph workflow not implemented)
export default {
noAuth: true,
buildUrl: () => "http://localhost:8188",
buildHeaders: () => ({ "Content-Type": "application/json" }),
buildBody: (_model, body) => ({ prompt: body.prompt }),
normalize: (responseBody) => responseBody,
};

View File

@@ -0,0 +1,41 @@
// Fal.ai — async submit + queue polling
import { sleep, nowSec, sizeToAspectRatio, POLL_INTERVAL_MS, POLL_TIMEOUT_MS } from "./_base.js";
const BASE_URL = "https://queue.fal.run";
export default {
async: true,
buildUrl: (model) => `${BASE_URL}/${model}`,
buildHeaders: (creds) => {
const key = creds?.apiKey || creds?.accessToken;
return { "Content-Type": "application/json", "Authorization": `Key ${key}` };
},
buildBody: (_model, body) => {
const req = { prompt: body.prompt, num_images: body.n || 1 };
if (body.size) req.image_size = sizeToAspectRatio(body.size);
if (body.image) req.image_url = body.image;
return req;
},
async parseResponse(response, { headers }) {
const { status_url, response_url } = await response.json();
const deadline = Date.now() + POLL_TIMEOUT_MS;
while (Date.now() < deadline) {
await sleep(POLL_INTERVAL_MS);
const r = await fetch(status_url, { headers });
if (!r.ok) throw new Error(`Fal status ${r.status}`);
const s = await r.json();
if (s.status === "COMPLETED") {
const fr = await fetch(response_url, { headers });
return await fr.json();
}
if (s.status === "FAILED") throw new Error(s.error || "Fal generation failed");
}
throw new Error("Fal polling timeout");
},
normalize: (responseBody) => {
const images = Array.isArray(responseBody.images)
? responseBody.images
: (responseBody.image ? [responseBody.image] : []);
return { created: nowSec(), data: images.map((img) => ({ url: img.url || img })) };
},
};

View File

@@ -0,0 +1,25 @@
// Google Gemini adapter (Nano Banana models)
import { nowSec } from "./_base.js";
const BASE_URL = "https://generativelanguage.googleapis.com/v1beta/models";
export default {
buildUrl: (model, creds) => {
const apiKey = creds?.apiKey || creds?.accessToken;
const modelId = model.replace(/^models\//, "");
return `${BASE_URL}/${modelId}:generateContent?key=${encodeURIComponent(apiKey)}`;
},
buildHeaders: () => ({ "Content-Type": "application/json" }),
buildBody: (_model, body) => ({
contents: [{ parts: [{ text: body.prompt }] }],
generationConfig: { responseModalities: ["TEXT", "IMAGE"] },
}),
normalize: (responseBody, prompt) => {
const parts = responseBody.candidates?.[0]?.content?.parts || [];
const images = parts.filter((p) => p.inlineData?.data).map((p) => ({ b64_json: p.inlineData.data }));
return {
created: nowSec(),
data: images.length > 0 ? images : [{ b64_json: "", revised_prompt: prompt }],
};
},
};

View File

@@ -0,0 +1,22 @@
// HuggingFace Inference API — returns binary image
import { nowSec } from "./_base.js";
const BASE_URL = "https://api-inference.huggingface.co/models";
export default {
buildUrl: (model) => `${BASE_URL}/${model}`,
buildHeaders: (creds) => {
const headers = { "Content-Type": "application/json" };
const key = creds?.apiKey || creds?.accessToken;
if (key) headers["Authorization"] = `Bearer ${key}`;
return headers;
},
buildBody: (_model, body) => ({ inputs: body.prompt }),
// HF returns raw image bytes — convert to b64_json
async parseResponse(response) {
const buf = await response.arrayBuffer();
const base64 = Buffer.from(buf).toString("base64");
return { created: nowSec(), data: [{ b64_json: base64 }] };
},
normalize: (responseBody) => responseBody,
};

View File

@@ -0,0 +1,37 @@
// Image provider adapter registry
import createOpenAIAdapter from "./openai.js";
import gemini from "./gemini.js";
import codex from "./codex.js";
import sdwebui from "./sdwebui.js";
import comfyui from "./comfyui.js";
import huggingface from "./huggingface.js";
import nanobanana from "./nanobanana.js";
import falAi from "./falAi.js";
import stabilityAi from "./stabilityAi.js";
import blackForestLabs from "./blackForestLabs.js";
import runwayml from "./runwayml.js";
const ADAPTERS = {
openai: createOpenAIAdapter("openai"),
minimax: createOpenAIAdapter("minimax"),
openrouter: createOpenAIAdapter("openrouter"),
recraft: createOpenAIAdapter("recraft"),
gemini,
codex,
sdwebui,
comfyui,
huggingface,
nanobanana,
"fal-ai": falAi,
"stability-ai": stabilityAi,
"black-forest-labs": blackForestLabs,
runwayml,
};
export function getImageAdapter(provider) {
return ADAPTERS[provider] || null;
}
export function isImageProvider(provider) {
return provider in ADAPTERS;
}
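Routing sketch:

import { getImageAdapter, isImageProvider } from "./index.js";

isImageProvider("fal-ai");    // → true
isImageProvider("anthropic"); // → false
getImageAdapter("stability-ai").buildUrl("sd3.5-large");
// → "https://api.stability.ai/v2beta/stable-image/generate/sd3"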

View File

@@ -0,0 +1,58 @@
// NanoBanana API — async submit + poll record-info
import { sleep, nowSec, sizeToAspectRatio, POLL_INTERVAL_MS, POLL_TIMEOUT_MS } from "./_base.js";
const SUBMIT_URL = "https://api.nanobananaapi.ai/api/v1/nanobanana/generate";
const POLL_BASE = "https://api.nanobananaapi.ai/api/v1/nanobanana/record-info";
export default {
async: true,
buildUrl: () => SUBMIT_URL,
buildHeaders: (creds) => {
const headers = { "Content-Type": "application/json" };
const key = creds?.apiKey || creds?.accessToken;
if (key) headers["Authorization"] = `Bearer ${key}`;
return headers;
},
buildBody: (_model, body) => {
const ratio = sizeToAspectRatio(body.size);
const isEdit = !!(body.image || (Array.isArray(body.images) && body.images.length));
const req = {
prompt: body.prompt,
type: isEdit ? "IMAGETOIAMGE" : "TEXTTOIAMGE", // (sic) spellings kept as-is; they appear to be the literal enum values this API expects
numImages: body.n || 1,
image_size: ratio,
// API requires callBackUrl; we poll instead so a dummy URL is fine.
callBackUrl: "https://localhost/callback",
};
if (isEdit) {
const urls = Array.isArray(body.images) ? body.images.filter(Boolean) : [];
if (body.image) urls.push(body.image);
req.imageUrls = urls;
}
return req;
},
// Async: parse submit → poll until SUCCESS, return raw poll data
async parseResponse(response, { headers }) {
const submitData = await response.json();
if (submitData.code !== 200) throw new Error(submitData.msg || "NanoBanana submit failed");
const taskId = submitData.data?.taskId;
if (!taskId) throw new Error("NanoBanana: no taskId returned");
const pollUrl = `${POLL_BASE}?taskId=${encodeURIComponent(taskId)}`;
const deadline = Date.now() + POLL_TIMEOUT_MS;
while (Date.now() < deadline) {
await sleep(POLL_INTERVAL_MS);
const r = await fetch(pollUrl, { headers });
if (!r.ok) throw new Error(`NanoBanana status ${r.status}`);
const s = await r.json();
const flag = s.data?.successFlag;
if (flag === 1) return s.data;
if (flag === 2 || flag === 3) throw new Error(s.data?.errorMessage || "NanoBanana generation failed");
}
throw new Error("NanoBanana polling timeout");
},
normalize: (responseBody, prompt) => {
const url = responseBody.response?.resultImageUrl || responseBody.response?.originImageUrl;
if (url) return { created: nowSec(), data: [{ url, revised_prompt: prompt }] };
return { created: nowSec(), data: [] };
},
};

View File

@@ -0,0 +1,33 @@
// OpenAI-compatible adapter (used by openai, minimax, openrouter, recraft)
const ENDPOINTS = {
openai: "https://api.openai.com/v1/images/generations",
minimax: "https://api.minimaxi.com/v1/images/generations",
openrouter: "https://openrouter.ai/api/v1/images/generations",
recraft: "https://external.api.recraft.ai/v1/images/generations",
};
export default function createOpenAIAdapter(providerId) {
return {
buildUrl: () => ENDPOINTS[providerId],
buildHeaders: (creds) => {
const headers = { "Content-Type": "application/json" };
const key = creds?.apiKey || creds?.accessToken;
if (key) headers["Authorization"] = `Bearer ${key}`;
if (providerId === "openrouter") {
headers["HTTP-Referer"] = "https://endpoint-proxy.local";
headers["X-Title"] = "Endpoint Proxy";
}
return headers;
},
buildBody: (model, body) => {
const { prompt, n = 1, size = "1024x1024", quality, style, response_format } = body;
const req = { model, prompt, n, size };
if (quality) req.quality = quality;
if (style) req.style = style;
if (response_format) req.response_format = response_format;
return req;
},
normalize: (responseBody) => responseBody,
};
}

View File

@@ -0,0 +1,47 @@
// Runway ML — async submit + /tasks/{id} polling
import { sleep, nowSec, sizeToAspectRatio, POLL_INTERVAL_MS, POLL_TIMEOUT_MS } from "./_base.js";
const BASE_URL = "https://api.dev.runwayml.com/v1";
export default {
async: true,
buildUrl: (model) => {
// Image models (gen4_image*) → text_to_image; video models → image_to_video
return `${BASE_URL}/${model.includes("image") ? "text_to_image" : "image_to_video"}`;
},
buildHeaders: (creds) => {
const key = creds?.apiKey || creds?.accessToken;
return {
"Content-Type": "application/json",
"Authorization": `Bearer ${key}`,
"X-Runway-Version": "2024-11-06",
};
},
buildBody: (model, body) => {
const isVideo = !model.includes("image");
const ratio = sizeToAspectRatio(body.size);
if (isVideo) {
return { promptText: body.prompt, model, ratio, duration: 5, ...(body.image ? { promptImage: body.image } : {}) };
}
return { promptText: body.prompt, model, ratio, ...(body.image ? { referenceImages: [{ uri: body.image }] } : {}) };
},
async parseResponse(response, { headers }) {
const { id } = await response.json();
if (!id) throw new Error("Runway: no task id returned");
const taskUrl = `${BASE_URL}/tasks/${id}`;
const deadline = Date.now() + POLL_TIMEOUT_MS;
while (Date.now() < deadline) {
await sleep(POLL_INTERVAL_MS);
const r = await fetch(taskUrl, { headers });
if (!r.ok) throw new Error(`Runway status ${r.status}`);
const s = await r.json();
if (s.status === "SUCCEEDED") return s;
if (s.status === "FAILED" || s.status === "CANCELLED") throw new Error(s.failure || "Runway task failed");
}
throw new Error("Runway polling timeout");
},
normalize: (responseBody) => {
const outputs = Array.isArray(responseBody.output) ? responseBody.output : [];
return { created: nowSec(), data: outputs.map((url) => ({ url })) };
},
};
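Endpoint and body selection by model id, for example:

import runway from "./runwayml.js";

runway.buildUrl("gen4_image"); // → "https://api.dev.runwayml.com/v1/text_to_image"
runway.buildUrl("gen4_turbo"); // → "https://api.dev.runwayml.com/v1/image_to_video"
runway.buildBody("gen4_turbo", { prompt: "waves", size: "1792x1024", image: "https://example.com/ref.png" });
// → { promptText: "waves", model: "gen4_turbo", ratio: "16:9", duration: 5, promptImage: "https://example.com/ref.png" }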

View File

@@ -0,0 +1,17 @@
// SD WebUI (AUTOMATIC1111) — local, noAuth
import { nowSec } from "./_base.js";
export default {
noAuth: true,
buildUrl: () => "http://localhost:7860/sdapi/v1/txt2img",
buildHeaders: () => ({ "Content-Type": "application/json" }),
buildBody: (_model, body) => {
const { prompt, n = 1, size = "1024x1024" } = body;
const [width, height] = size.split("x").map(Number);
return { prompt, width: width || 512, height: height || 512, steps: 20, batch_size: n };
},
normalize: (responseBody) => {
const images = Array.isArray(responseBody.images) ? responseBody.images.map((img) => ({ b64_json: img })) : [];
return { created: nowSec(), data: images };
},
};

View File

@@ -0,0 +1,34 @@
// Stability AI v2 — sync, returns { image: "<b64>" }
import { nowSec, sizeToAspectRatio } from "./_base.js";
const BASE_URL = "https://api.stability.ai/v2beta/stable-image/generate";
// Map model id → endpoint segment
function modelToEndpoint(model) {
if (model.includes("ultra")) return "ultra";
if (model.includes("sd3")) return "sd3";
return "core";
}
export default {
buildUrl: (model) => `${BASE_URL}/${modelToEndpoint(model)}`,
buildHeaders: (creds) => {
const key = creds?.apiKey || creds?.accessToken;
return {
"Content-Type": "application/json",
"Authorization": `Bearer ${key}`,
"Accept": "application/json",
};
},
buildBody: (model, body) => {
const req = { prompt: body.prompt, output_format: (body.output_format || "png").toLowerCase() };
if (body.size) req.aspect_ratio = sizeToAspectRatio(body.size);
if (body.style) req.style_preset = body.style;
if (model.includes("sd3")) req.model = model;
return req;
},
normalize: (responseBody) => {
if (responseBody.image) return { created: nowSec(), data: [{ b64_json: responseBody.image }] };
return { created: nowSec(), data: [] };
},
};
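The private modelToEndpoint helper maps, for example:

// modelToEndpoint("stable-image-ultra") → "ultra"
// modelToEndpoint("sd3.5-large-turbo")  → "sd3"   (matches "sd3")
// modelToEndpoint("stable-image-core")  → "core"  (fallback)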

View File

@@ -1,12 +1,15 @@
import { Buffer } from "node:buffer";
import { createErrorResult } from "../utils/error.js";
import { HTTP_STATUS } from "../config/runtimeConfig.js";
import { execFile } from "child_process";
import { promisify } from "util";
import { mkdtemp, readFile, rm } from "fs/promises";
import { tmpdir } from "os";
import { join } from "path";
import { getTtsAdapter, synthesizeViaConfig } from "./ttsProviders/index.js";
const execFileAsync = promisify(execFile);
// Re-export voice fetchers + voices APIs for backward compat with existing routes
export {
VOICE_FETCHERS,
fetchEdgeTtsVoices,
fetchLocalDeviceVoices,
fetchElevenLabsVoices,
} from "./ttsProviders/index.js";
// ── Response Formatter (DRY) ───────────────────────────────────
function createTtsResponse(base64Audio, format, responseFormat) {
@@ -25,7 +28,7 @@ function createTtsResponse(base64Audio, format, responseFormat) {
};
}
// Binary format (default): return raw MP3
// Binary format (default): return raw audio
return {
success: true,
response: new Response(audioBuffer, {
@@ -38,691 +41,11 @@ function createTtsResponse(base64Audio, format, responseFormat) {
};
}
// ── Token cache per engine ─────────────────────────────────────
const cache = {
google: { token: null, tokenTime: 0 },
bing: { token: null, tokenTime: 0 },
};
const GOOGLE_REFRESH = 11 * 60 * 1000;
const BING_REFRESH = 5 * 60 * 1000; // conservative: token TTL is 1h but refresh early
const UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36";
const SEC_CH_HEADERS = {
"sec-ch-ua": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
"sec-ch-ua-arch": '"arm"',
"sec-ch-ua-bitness": '"64"',
"sec-ch-ua-full-version": '"146.0.7680.178"',
"sec-ch-ua-full-version-list": '"Chromium";v="146.0.7680.178", "Not-A.Brand";v="24.0.0.0", "Google Chrome";v="146.0.7680.178"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-model": '""',
"sec-ch-ua-platform": '"macOS"',
"sec-ch-ua-platform-version": '"15.1.0"',
};
// ── Google TTS ─────────────────────────────────────────────────
async function getGoogleToken() {
const now = Date.now();
if (cache.google.token && now - cache.google.tokenTime < GOOGLE_REFRESH) {
return cache.google.token;
}
const res = await fetch("https://translate.google.com/", {
headers: { "User-Agent": UA },
});
if (!res.ok) throw new Error(`Google translate fetch failed: ${res.status}`);
const html = await res.text();
const fSid = html.match(/"FdrFJe":"(.*?)"/)?.[1];
const bl = html.match(/"cfb2h":"(.*?)"/)?.[1];
if (!fSid || !bl) throw new Error("Failed to parse Google token");
cache.google.token = { "f.sid": fSid, bl };
cache.google.tokenTime = now;
return cache.google.token;
}
let _googleIdx = 0;
async function googleTts(text, lang) {
const token = await getGoogleToken();
const cleanText = text.replace(/[@^*()\\/\-_+=><"'\u201c\u201d\u3010\u3011]/g, " ").replaceAll(", ", ". ");
const rpcId = "jQ1olc";
const reqId = (++_googleIdx * 100000) + Math.floor(1000 + Math.random() * 9000);
const query = new URLSearchParams({
rpcids: rpcId,
"f.sid": token["f.sid"],
bl: token.bl,
hl: lang,
"soc-app": 1, "soc-platform": 1, "soc-device": 1,
_reqid: reqId,
rt: "c",
});
const payload = [cleanText, lang, null, "undefined", [0]];
const body = new URLSearchParams();
body.append("f.req", JSON.stringify([[[rpcId, JSON.stringify(payload), null, "generic"]]]));
const res = await fetch(`https://translate.google.com/_/TranslateWebserverUi/data/batchexecute?${query}`, {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded", "Referer": "https://translate.google.com/" },
body: body.toString(),
});
if (!res.ok) throw new Error(`Google TTS failed: ${res.status}`);
const data = await res.text();
const split = JSON.parse(data.split("\n")[3]);
const base64 = JSON.parse(split[0][2])[0];
if (!base64 || base64.length < 100) throw new Error("Google TTS returned empty audio");
return base64; // base64 MP3
}
// ── Bing TTS ───────────────────────────────────────────────────
async function getBingToken() {
const now = Date.now();
if (cache.bing.token && now - cache.bing.tokenTime < BING_REFRESH) {
return cache.bing.token;
}
const res = await fetch("https://www.bing.com/translator", {
headers: { "User-Agent": UA, "Accept-Language": "vi,en-US;q=0.9,en;q=0.8" },
});
if (!res.ok) throw new Error(`Bing translator fetch failed: ${res.status}`);
const rawCookies = res.headers.getSetCookie?.() || [];
const cookie = rawCookies.map((c) => c.split(";")[0]).join("; ");
const html = await res.text();
const match = html.match(/params_AbusePreventionHelper\s*=\s*\[([^,]+),([^,]+),/);
if (!match) throw new Error("Failed to parse Bing token");
cache.bing.token = { key: match[1], token: match[2].replace(/"/g, ""), cookie };
cache.bing.tokenTime = now;
return cache.bing.token;
}
async function bingTtsRequest(text, voiceId, token) {
const parts = voiceId.split("-");
const xmlLang = parts.slice(0, 2).join("-");
// "female" contains "male" as a substring, so rule it out before matching
const lower = voiceId.toLowerCase();
const gender = lower.includes("male") && !lower.includes("female") ? "Male" : "Female";
const ssml = `<speak version='1.0' xml:lang='${xmlLang}'><voice xml:lang='${xmlLang}' xml:gender='${gender}' name='${voiceId}'><prosody rate='0.00%'>${text}</prosody></voice></speak>`;
const body = new URLSearchParams();
body.append("ssml", ssml);
body.append("token", token.token);
body.append("key", token.key);
return fetch("https://www.bing.com/tfettts?isVertical=1&&IG=1&IID=translator.5023&SFX=1", {
method: "POST",
body: body.toString(),
headers: {
"Content-Type": "application/x-www-form-urlencoded",
"Accept": "*/*",
"Origin": "https://www.bing.com",
"Referer": "https://www.bing.com/translator",
"User-Agent": UA,
...(token.cookie ? { "Cookie": token.cookie } : {}),
},
});
}
async function bingTts(text, voiceId) {
let token = await getBingToken();
let res = await bingTtsRequest(text, voiceId, token);
// On 429/captcha: invalidate cache and retry once with fresh token
if (res.status === 429 || res.status === 403) {
cache.bing.token = null;
cache.bing.tokenTime = 0;
token = await getBingToken();
res = await bingTtsRequest(text, voiceId, token);
}
if (!res.ok) {
const body = await res.text().catch(() => "");
throw new Error(`Bing TTS failed: ${res.status}${body ? " - " + body : ""}`);
}
const buf = await res.arrayBuffer();
if (buf.byteLength < 1024) throw new Error("Bing TTS returned empty audio");
return Buffer.from(buf).toString("base64"); // base64 MP3
}
// ── Local Device TTS (macOS `say` + Windows SAPI + ffmpeg) ──────
let _localVoicesCache = null;
async function fetchLocalDeviceVoicesMac() {
const { stdout } = await execFileAsync("say", ["-v", "?"]);
const voices = [];
for (const line of stdout.split("\n")) {
// Format: "Name locale # sample"
const m = line.match(/^([^\s].*?)\s{2,}([a-z]{2}_[A-Z]{2})/);
if (!m) continue;
const name = m[1].trim();
const locale = m[2].trim(); // e.g. en_US
const lang = locale.split("_")[0];
const country = locale.split("_")[1];
voices.push({ id: name, name, locale, lang, country, gender: "" });
}
return voices;
}
async function fetchLocalDeviceVoicesWin() {
// Use -WindowStyle Hidden to suppress PowerShell popup window
const script = [
"Add-Type -AssemblyName System.Speech;",
"$s = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
"$s.GetInstalledVoices() | ForEach-Object { $v = $_.VoiceInfo;",
"[PSCustomObject]@{ Name=$v.Name; Culture=$v.Culture.Name; Gender=$v.Gender } }",
"| ConvertTo-Json -Compress",
].join(" ");
const { stdout } = await execFileAsync(
"powershell.exe",
["-NoProfile", "-NonInteractive", "-WindowStyle", "Hidden", "-Command", script],
{ windowsHide: true }
);
const raw = JSON.parse(stdout.trim() || "[]");
// Normalize: single object → array
const list = Array.isArray(raw) ? raw : [raw];
return list.map((v) => {
const culture = v.Culture || "en-US";
const [lang, country = ""] = culture.split("-");
// Gender: 0=NotSet, 1=Male, 2=Female (SAPI enum)
const genderMap = { 1: "Male", 2: "Female", Male: "Male", Female: "Female" };
return {
id: v.Name,
name: v.Name,
locale: culture.replace("-", "_"),
lang,
country,
gender: genderMap[v.Gender] || "",
};
});
}
export async function fetchLocalDeviceVoices() {
if (_localVoicesCache) return _localVoicesCache;
try {
const voices = process.platform === "win32"
? await fetchLocalDeviceVoicesWin()
: await fetchLocalDeviceVoicesMac();
_localVoicesCache = voices;
return voices;
} catch {
return [];
}
}
async function localDeviceTts(text, voiceId) {
const dir = await mkdtemp(join(tmpdir(), "tts-"));
const aiffPath = join(dir, "out.aiff");
const mp3Path = join(dir, "out.mp3");
try {
const args = voiceId ? ["-v", voiceId, "-o", aiffPath, text] : ["-o", aiffPath, text];
await execFileAsync("say", args);
await execFileAsync("ffmpeg", ["-y", "-i", aiffPath, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3Path]);
const buf = await readFile(mp3Path);
return buf.toString("base64");
} finally {
await rm(dir, { recursive: true, force: true });
}
}
// ── Voices list (Edge TTS public endpoint) ─────────────────────
let _voicesCache = null;
let _voicesCacheTime = 0;
const VOICES_TTL = 24 * 60 * 60 * 1000;
export async function fetchEdgeTtsVoices() {
const now = Date.now();
if (_voicesCache && now - _voicesCacheTime < VOICES_TTL) return _voicesCache;
const res = await fetch(
"https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4",
{ headers: { "User-Agent": UA } }
);
if (!res.ok) throw new Error(`Edge TTS voices fetch failed: ${res.status}`);
const voices = await res.json();
_voicesCache = voices;
_voicesCacheTime = now;
return voices;
}
// ── ElevenLabs TTS ─────────────────────────────────────────────
const _elevenlabsVoicesCache = new Map(); // Cache by API key
export async function fetchElevenLabsVoices(apiKey) {
if (!apiKey) throw new Error("ElevenLabs API key required");
const now = Date.now();
const cached = _elevenlabsVoicesCache.get(apiKey);
if (cached && now - cached.time < VOICES_TTL) {
return cached.voices;
}
const res = await fetch("https://api.elevenlabs.io/v1/voices", {
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
},
});
if (!res.ok) throw new Error(`ElevenLabs voices fetch failed: ${res.status}`);
const data = await res.json();
// Normalize: add lang from labels.language for grouping
const voices = (data.voices || []).map((v) => ({
...v,
lang: v.labels?.language || "en",
}));
_elevenlabsVoicesCache.set(apiKey, { voices, time: now });
return voices;
}
async function elevenlabsTts(text, voiceId, apiKey, modelId = "eleven_flash_v2_5") {
const res = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, {
method: "POST",
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
},
body: JSON.stringify({
text,
model_id: modelId,
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
},
}),
});
if (!res.ok) {
const err = await res.json().catch(() => ({}));
throw new Error(err?.detail?.message || `ElevenLabs TTS failed: ${res.status}`);
}
const buf = await res.arrayBuffer();
if (buf.byteLength < 1024) throw new Error("ElevenLabs TTS returned empty audio");
return Buffer.from(buf).toString("base64");
}
// ── Voice Fetcher Registry (DRY) ───────────────────────────────
export const VOICE_FETCHERS = {
"edge-tts": fetchEdgeTtsVoices,
"local-device": fetchLocalDeviceVoices,
"elevenlabs": fetchElevenLabsVoices,
// google-tts: uses hardcoded language codes
// openai: uses hardcoded voices from providerModels.js
};
// ── OpenRouter TTS (via chat completions + audio modality) ───────────────────
async function handleOpenRouterTts({ model, input, credentials, responseFormat = "mp3" }) {
if (!credentials?.apiKey) {
return createErrorResult(HTTP_STATUS.UNAUTHORIZED, "No OpenRouter API key configured");
}
// model format: "tts-model/voice" e.g. "openai/gpt-4o-mini-tts/alloy"
let ttsModel = "openai/gpt-4o-mini-tts";
let voice = "alloy";
if (model && model.includes("/")) {
const lastSlash = model.lastIndexOf("/");
const maybeVoice = model.slice(lastSlash + 1);
const maybeModel = model.slice(0, lastSlash);
// voice names are simple lowercase words, model names contain "/"
if (maybeModel.includes("/")) {
ttsModel = maybeModel;
voice = maybeVoice;
} else {
voice = model;
}
} else if (model) {
voice = model;
}
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${credentials.apiKey}`,
"HTTP-Referer": "https://endpoint-proxy.local",
"X-Title": "Endpoint Proxy",
},
body: JSON.stringify({
model: ttsModel,
modalities: ["text", "audio"],
audio: { voice, format: "wav" },
stream: true,
messages: [{ role: "user", content: input }],
}),
});
if (!res.ok) {
const err = await res.json().catch(() => ({}));
return createErrorResult(res.status, err?.error?.message || `OpenRouter TTS failed: ${res.status}`);
}
// Parse SSE stream, accumulate base64 audio chunks
const chunks = [];
const reader = res.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop();
for (const line of lines) {
if (!line.startsWith("data: ") || line === "data: [DONE]") continue;
try {
const json = JSON.parse(line.slice(6));
const audioData = json.choices?.[0]?.delta?.audio?.data;
if (audioData) chunks.push(audioData);
} catch {}
}
}
if (chunks.length === 0) {
return createErrorResult(HTTP_STATUS.BAD_GATEWAY, "OpenRouter TTS returned no audio data");
}
const base64Audio = chunks.join("");
return createTtsResponse(base64Audio, "wav", responseFormat);
}
// ── OpenAI TTS ───────────────────────────────────────────────────────────────
async function handleOpenAiTts({ model, input, credentials, responseFormat = "mp3" }) {
if (!credentials?.apiKey) {
return createErrorResult(HTTP_STATUS.UNAUTHORIZED, "No OpenAI API key configured");
}
// model format: "tts-model/voice" e.g. "tts-1/alloy" or "gpt-4o-mini-tts/nova"
let ttsModel = "gpt-4o-mini-tts";
let voice = "alloy";
if (model && model.includes("/")) {
const parts = model.split("/");
if (parts.length === 2) {
[ttsModel, voice] = parts;
}
} else if (model) {
voice = model;
}
const baseUrl = (credentials.baseUrl || "https://api.openai.com").replace(/\/+$/, "");
const res = await fetch(`${baseUrl}/v1/audio/speech`, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${credentials.apiKey}`,
},
body: JSON.stringify({ model: ttsModel, voice, input }),
});
if (!res.ok) {
const err = await res.json().catch(() => ({}));
return createErrorResult(res.status, err?.error?.message || `OpenAI TTS failed: ${res.status}`);
}
const buf = await res.arrayBuffer();
const base64 = Buffer.from(buf).toString("base64");
return createTtsResponse(base64, "mp3", responseFormat);
}
// ── Generic TTS Format Handlers (config-driven via ttsConfig.format) ──────
// Parse `model` string as "modelId/voiceId" or "modelId" (modelId may contain slashes — match against known list)
function parseModelVoice(model, defaultModel = "", defaultVoice = "", knownModels = []) {
if (!model) return { modelId: defaultModel, voiceId: defaultVoice };
// Find longest known model id that prefixes `model`
const known = knownModels.map((m) => m.id || m).filter(Boolean).sort((a, b) => b.length - a.length);
for (const id of known) {
if (model === id) return { modelId: id, voiceId: defaultVoice };
if (model.startsWith(`${id}/`)) return { modelId: id, voiceId: model.slice(id.length + 1) };
}
// Fallback: split on last "/" so "vendor/model/voice" → model="vendor/model", voice="voice"
const idx = model.lastIndexOf("/");
if (idx > 0) return { modelId: model.slice(0, idx), voiceId: model.slice(idx + 1) };
return { modelId: defaultModel || model, voiceId: defaultVoice || model };
}
// Convert upstream Response (binary audio) to { base64, format }
async function responseToBase64(res, defaultFormat = "mp3") {
const buf = await res.arrayBuffer();
if (buf.byteLength < 100) throw new Error("Upstream returned empty audio");
const ctype = res.headers.get("content-type") || "";
let format = defaultFormat;
if (ctype.includes("wav")) format = "wav";
else if (ctype.includes("mpeg") || ctype.includes("mp3")) format = "mp3";
else if (ctype.includes("ogg")) format = "ogg";
return { base64: Buffer.from(buf).toString("base64"), format };
}
async function throwUpstreamError(res) {
const text = await res.text().catch(() => "");
let msg = `Upstream error (${res.status})`;
try {
const parsed = JSON.parse(text);
msg = parsed?.error?.message || parsed?.message || parsed?.detail?.message || (typeof parsed?.detail === "string" ? parsed.detail : null) || text || msg;
} catch { msg = text || msg; }
throw new Error(msg);
}
// Hyperbolic: POST { text } → { audio: base64 }
async function ttsHyperbolic({ baseUrl, apiKey, text }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
body: JSON.stringify({ text }),
});
if (!res.ok) await throwUpstreamError(res);
const data = await res.json();
return { base64: data.audio, format: "mp3" };
}
// Deepgram: model via query, Token auth, returns binary
async function ttsDeepgram({ baseUrl, apiKey, text, modelId }) {
const url = new URL(baseUrl);
url.searchParams.set("model", modelId || "aura-asteria-en");
const res = await fetch(url.toString(), {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Token ${apiKey}` },
body: JSON.stringify({ text }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// Nvidia NIM: POST { input: { text }, voice, model } → binary
async function ttsNvidia({ baseUrl, apiKey, text, modelId, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
body: JSON.stringify({ input: { text }, voice: voiceId || "default", model: modelId }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// HuggingFace: POST {baseUrl}/{modelId} { inputs: text } → binary
async function ttsHuggingFace({ baseUrl, apiKey, text, modelId }) {
if (!modelId || modelId.includes("..")) throw new Error("Invalid HuggingFace model ID");
const res = await fetch(`${baseUrl}/${modelId}`, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
body: JSON.stringify({ inputs: text }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// Inworld: POST { text, voiceId, modelId, audioConfig } → JSON { audioContent }
async function ttsInworld({ baseUrl, apiKey, text, modelId, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Basic ${apiKey}` },
body: JSON.stringify({
text,
voiceId: voiceId || "Alex",
modelId: modelId || "inworld-tts-1.5-mini",
audioConfig: { audioEncoding: "MP3" },
}),
});
if (!res.ok) await throwUpstreamError(res);
const data = await res.json();
if (!data.audioContent) throw new Error("Inworld TTS returned no audio");
return { base64: data.audioContent, format: "mp3" };
}
// Cartesia: POST { model_id, transcript, voice, output_format } → binary
async function ttsCartesia({ baseUrl, apiKey, text, modelId, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: {
"Content-Type": "application/json",
"X-API-Key": apiKey,
"Cartesia-Version": "2024-06-10",
},
body: JSON.stringify({
model_id: modelId || "sonic-2",
transcript: text,
...(voiceId ? { voice: { mode: "id", id: voiceId } } : {}),
output_format: { container: "mp3", bit_rate: 128000, sample_rate: 44100 },
}),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// PlayHT: token format "userId:apiKey", voice = s3 URL
async function ttsPlayHt({ baseUrl, apiKey, text, modelId, voiceId }) {
const [userId, key] = (apiKey || ":").split(":");
const res = await fetch(baseUrl, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Accept": "audio/mpeg",
"X-USER-ID": userId || "",
"Authorization": `Bearer ${key || apiKey}`,
},
body: JSON.stringify({
text,
voice: voiceId || "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
voice_engine: modelId || "PlayDialog",
output_format: "mp3",
speed: 1,
}),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// Coqui (local, noAuth): POST { text, speaker_id } → WAV
async function ttsCoqui({ baseUrl, text, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ text, ...(voiceId ? { speaker_id: voiceId } : {}) }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// Tortoise (local, noAuth): POST { text, voice } → binary
async function ttsTortoise({ baseUrl, text, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ text, voice: voiceId || "random" }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// OpenAI-compatible (qwen3-tts, openai-compat): POST { model, input, voice } → binary
async function ttsOpenAiCompat({ baseUrl, apiKey, text, modelId, voiceId }) {
const headers = { "Content-Type": "application/json" };
if (apiKey) headers["Authorization"] = `Bearer ${apiKey}`;
const res = await fetch(baseUrl, {
method: "POST",
headers,
body: JSON.stringify({
model: modelId,
input: text,
voice: voiceId || "alloy",
response_format: "mp3",
speed: 1.0,
}),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// Format → handler dispatcher (DRY)
const FORMAT_HANDLERS = {
hyperbolic: ttsHyperbolic,
deepgram: ttsDeepgram,
"nvidia-tts": ttsNvidia,
"huggingface-tts": ttsHuggingFace,
inworld: ttsInworld,
cartesia: ttsCartesia,
playht: ttsPlayHt,
coqui: ttsCoqui,
tortoise: ttsTortoise,
openai: ttsOpenAiCompat,
};
// ── TTS Provider Registry (legacy noAuth + special providers) ──────────
const TTS_PROVIDERS = {
"google-tts": {
synthesize: async (text, model) => {
const base64 = await googleTts(text, model || "en");
return { base64, format: "mp3" };
},
requiresCredentials: false,
},
"edge-tts": {
synthesize: async (text, model) => {
const base64 = await bingTts(text, model || "vi-VN-HoaiMyNeural");
return { base64, format: "mp3" };
},
requiresCredentials: false,
},
"local-device": {
synthesize: async (text, model) => {
const base64 = await localDeviceTts(text, model);
return { base64, format: "mp3" };
},
requiresCredentials: false,
},
"elevenlabs": {
synthesize: async (text, model, credentials) => {
if (!credentials?.apiKey) throw new Error("ElevenLabs API key required");
let modelId = "eleven_flash_v2_5";
let voiceId = model;
if (model && model.includes("/")) [modelId, voiceId] = model.split("/");
const base64 = await elevenlabsTts(text, voiceId, credentials.apiKey, modelId);
return { base64, format: "mp3" };
},
requiresCredentials: true,
},
"openai": {
synthesize: async (text, model, credentials, responseFormat) => {
return await handleOpenAiTts({ model, input: text, credentials, responseFormat });
},
requiresCredentials: true,
},
"openrouter": {
synthesize: async (text, model, credentials, responseFormat) => {
return await handleOpenRouterTts({ model, input: text, credentials, responseFormat });
},
requiresCredentials: true,
},
};
// ── Generic dispatcher: providers with ttsConfig.format ────────────────
// Resolves to TTS_PROVIDERS first; falls back to ttsConfig.format dispatch.
async function synthesizeViaConfig(provider, text, model, credentials) {
const { AI_PROVIDERS } = await import("@/shared/constants/providers");
const cfg = AI_PROVIDERS[provider]?.ttsConfig;
if (!cfg) return null;
const handler = FORMAT_HANDLERS[cfg.format];
if (!handler) return null;
const apiKey = credentials?.apiKey;
if (cfg.authType !== "none" && !apiKey) throw new Error(`${provider} API key required`);
const defaultModel = cfg.models?.[0]?.id || "";
const { modelId, voiceId } = parseModelVoice(model, defaultModel, "", cfg.models || []);
return handler({ baseUrl: cfg.baseUrl, apiKey, text, modelId, voiceId });
}
// ── Core handler ───────────────────────────────────────────────
/**
* Synthesize text to audio.
* Synthesize text to audio. Provider logic lives in `./ttsProviders/{id}.js`
* or is dispatched generically via `ttsConfig.format`.
*
* @returns {Promise<{success, response, status?, error?}>}
*/
export async function handleTtsCore({ provider, model, input, credentials, responseFormat = "mp3" }) {
@@ -730,17 +53,17 @@ export async function handleTtsCore({ provider, model, input, credentials, respo
return createErrorResult(HTTP_STATUS.BAD_REQUEST, "Missing required field: input");
}
const ttsProvider = TTS_PROVIDERS[provider];
try {
// Legacy/special providers (google-tts, edge-tts, local-device, elevenlabs, openai, openrouter)
if (ttsProvider) {
const result = await ttsProvider.synthesize(input.trim(), model, credentials, responseFormat);
// Special-case adapters (google-tts, edge-tts, local-device, elevenlabs, openai, openrouter)
const adapter = getTtsAdapter(provider);
if (adapter) {
const result = await adapter.synthesize(input.trim(), model, credentials, responseFormat);
// Adapter may return a full {success, response} (legacy) or {base64, format}
if (result.success !== undefined) return result;
return createTtsResponse(result.base64, result.format, responseFormat);
}
// Generic config-driven dispatcher (hyperbolic, deepgram, nvidia, huggingface, inworld, cartesia, playht, coqui, tortoise, qwen, ...)
// Generic config-driven (hyperbolic, deepgram, nvidia, huggingface, inworld, cartesia, playht, coqui, tortoise, qwen, ...)
const result = await synthesizeViaConfig(provider, input.trim(), model, credentials);
if (result) return createTtsResponse(result.base64, result.format, responseFormat);

View File

@@ -0,0 +1,39 @@
// Shared TTS helpers
import { Buffer } from "node:buffer";
export const UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36";
// Convert upstream Response (binary audio) to { base64, format }
export async function responseToBase64(res, defaultFormat = "mp3") {
const buf = await res.arrayBuffer();
if (buf.byteLength < 100) throw new Error("Upstream returned empty audio");
const ctype = res.headers.get("content-type") || "";
let format = defaultFormat;
if (ctype.includes("wav")) format = "wav";
else if (ctype.includes("mpeg") || ctype.includes("mp3")) format = "mp3";
else if (ctype.includes("ogg")) format = "ogg";
return { base64: Buffer.from(buf).toString("base64"), format };
}
export async function throwUpstreamError(res) {
const text = await res.text().catch(() => "");
let msg = `Upstream error (${res.status})`;
try {
const parsed = JSON.parse(text);
msg = parsed?.error?.message || parsed?.message || parsed?.detail?.message || (typeof parsed?.detail === "string" ? parsed.detail : null) || text || msg;
} catch { msg = text || msg; }
throw new Error(msg);
}
// Parse `model` string as "modelId/voiceId" — match against known model list (longest prefix wins)
export function parseModelVoice(model, defaultModel = "", defaultVoice = "", knownModels = []) {
if (!model) return { modelId: defaultModel, voiceId: defaultVoice };
const known = knownModels.map((m) => m.id || m).filter(Boolean).sort((a, b) => b.length - a.length);
for (const id of known) {
if (model === id) return { modelId: id, voiceId: defaultVoice };
if (model.startsWith(`${id}/`)) return { modelId: id, voiceId: model.slice(id.length + 1) };
}
const idx = model.lastIndexOf("/");
if (idx > 0) return { modelId: model.slice(0, idx), voiceId: model.slice(idx + 1) };
return { modelId: defaultModel || model, voiceId: defaultVoice || model };
}
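// Editor's sketch (illustrative): longest-prefix matching lets model ids that
// themselves contain "/" coexist with plain "model/voice" strings.
const _demo = parseModelVoice(
  "openai/gpt-4o-mini-tts/alloy",
  "", "",
  [{ id: "openai/gpt-4o-mini-tts" }]
);
// _demo → { modelId: "openai/gpt-4o-mini-tts", voiceId: "alloy" }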

View File

@@ -0,0 +1,89 @@
// Microsoft Edge / Bing TTS (no auth) — via Bing translator endpoint
import { Buffer } from "node:buffer";
import { UA } from "./_base.js";
const REFRESH_MS = 5 * 60 * 1000; // token TTL ~1h, refresh early
const VOICES_TTL = 24 * 60 * 60 * 1000;
const cache = { token: null, tokenTime: 0 };
let _voicesCache = null;
let _voicesCacheTime = 0;
async function getToken() {
const now = Date.now();
if (cache.token && now - cache.tokenTime < REFRESH_MS) return cache.token;
const res = await fetch("https://www.bing.com/translator", {
headers: { "User-Agent": UA, "Accept-Language": "vi,en-US;q=0.9,en;q=0.8" },
});
if (!res.ok) throw new Error(`Bing translator fetch failed: ${res.status}`);
const rawCookies = res.headers.getSetCookie?.() || [];
const cookie = rawCookies.map((c) => c.split(";")[0]).join("; ");
const html = await res.text();
const match = html.match(/params_AbusePreventionHelper\s*=\s*\[([^,]+),([^,]+),/);
if (!match) throw new Error("Failed to parse Bing token");
cache.token = { key: match[1], token: match[2].replace(/"/g, ""), cookie };
cache.tokenTime = now;
return cache.token;
}
async function ttsRequest(text, voiceId, token) {
const parts = voiceId.split("-");
const xmlLang = parts.slice(0, 2).join("-");
// "female" contains "male" as a substring, so rule it out before matching
const lower = voiceId.toLowerCase();
const gender = lower.includes("male") && !lower.includes("female") ? "Male" : "Female";
const ssml = `<speak version='1.0' xml:lang='${xmlLang}'><voice xml:lang='${xmlLang}' xml:gender='${gender}' name='${voiceId}'><prosody rate='0.00%'>${text}</prosody></voice></speak>`;
const body = new URLSearchParams();
body.append("ssml", ssml);
body.append("token", token.token);
body.append("key", token.key);
return fetch("https://www.bing.com/tfettts?isVertical=1&&IG=1&IID=translator.5023&SFX=1", {
method: "POST",
body: body.toString(),
headers: {
"Content-Type": "application/x-www-form-urlencoded",
"Accept": "*/*",
"Origin": "https://www.bing.com",
"Referer": "https://www.bing.com/translator",
"User-Agent": UA,
...(token.cookie ? { "Cookie": token.cookie } : {}),
},
});
}
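// Note (editor): `text` is interpolated into the SSML unescaped, so callers
// are assumed to pass plain text without XML metacharacters (& < >).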
export async function fetchEdgeTtsVoices() {
const now = Date.now();
if (_voicesCache && now - _voicesCacheTime < VOICES_TTL) return _voicesCache;
const res = await fetch(
"https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4",
{ headers: { "User-Agent": UA } }
);
if (!res.ok) throw new Error(`Edge TTS voices fetch failed: ${res.status}`);
const voices = await res.json();
_voicesCache = voices;
_voicesCacheTime = now;
return voices;
}
export default {
noAuth: true,
async synthesize(text, model) {
const voiceId = model || "vi-VN-HoaiMyNeural";
let token = await getToken();
let res = await ttsRequest(text, voiceId, token);
// 429/403: invalidate cache and retry once
if (res.status === 429 || res.status === 403) {
cache.token = null;
cache.tokenTime = 0;
token = await getToken();
res = await ttsRequest(text, voiceId, token);
}
if (!res.ok) {
const body = await res.text().catch(() => "");
throw new Error(`Bing TTS failed: ${res.status}${body ? " - " + body : ""}`);
}
const buf = await res.arrayBuffer();
if (buf.byteLength < 1024) throw new Error("Bing TTS returned empty audio");
return { base64: Buffer.from(buf).toString("base64"), format: "mp3" };
},
};

View File

@@ -0,0 +1,48 @@
// ElevenLabs TTS — voice id with optional model_id prefix
import { Buffer } from "node:buffer";
const VOICES_TTL = 24 * 60 * 60 * 1000;
const _voicesCache = new Map(); // by API key
export async function fetchElevenLabsVoices(apiKey) {
if (!apiKey) throw new Error("ElevenLabs API key required");
const now = Date.now();
const cached = _voicesCache.get(apiKey);
if (cached && now - cached.time < VOICES_TTL) return cached.voices;
const res = await fetch("https://api.elevenlabs.io/v1/voices", {
headers: { "xi-api-key": apiKey, "Content-Type": "application/json" },
});
if (!res.ok) throw new Error(`ElevenLabs voices fetch failed: ${res.status}`);
const data = await res.json();
// Normalize: derive lang from labels for grouping
const voices = (data.voices || []).map((v) => ({ ...v, lang: v.labels?.language || "en" }));
_voicesCache.set(apiKey, { voices, time: now });
return voices;
}
export default {
async synthesize(text, model, credentials) {
if (!credentials?.apiKey) throw new Error("ElevenLabs API key required");
let modelId = "eleven_flash_v2_5";
let voiceId = model;
if (model && model.includes("/")) [modelId, voiceId] = model.split("/");
const res = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, {
method: "POST",
headers: { "xi-api-key": credentials.apiKey, "Content-Type": "application/json" },
body: JSON.stringify({
text,
model_id: modelId,
voice_settings: { stability: 0.5, similarity_boost: 0.75 },
}),
});
if (!res.ok) {
const err = await res.json().catch(() => ({}));
throw new Error(err?.detail?.message || `ElevenLabs TTS failed: ${res.status}`);
}
const buf = await res.arrayBuffer();
if (buf.byteLength < 1024) throw new Error("ElevenLabs TTS returned empty audio");
return { base64: Buffer.from(buf).toString("base64"), format: "mp3" };
},
};
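// Editor's note (illustrative ids): the model argument is either a bare voice
// id ("21m00Tcm4TlvDq8ikWAM") or "model_id/voice_id"
// ("eleven_multilingual_v2/21m00Tcm4TlvDq8ikWAM"); with no slash, the default
// eleven_flash_v2_5 model is used.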

View File

@@ -0,0 +1,167 @@
// Generic config-driven TTS handlers — dispatched by ttsConfig.format.
// Each handler accepts { baseUrl, apiKey, text, modelId, voiceId } and returns { base64, format }.
import { responseToBase64, throwUpstreamError } from "./_base.js";
// Hyperbolic: POST { text } → { audio: base64 }
async function hyperbolic({ baseUrl, apiKey, text }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
body: JSON.stringify({ text }),
});
if (!res.ok) await throwUpstreamError(res);
const data = await res.json();
return { base64: data.audio, format: "mp3" };
}
// Deepgram: model via query, Token auth, returns binary
async function deepgram({ baseUrl, apiKey, text, modelId }) {
const url = new URL(baseUrl);
url.searchParams.set("model", modelId || "aura-asteria-en");
const res = await fetch(url.toString(), {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Token ${apiKey}` },
body: JSON.stringify({ text }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// Nvidia NIM: POST { input: { text }, voice, model } → binary
async function nvidia({ baseUrl, apiKey, text, modelId, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
body: JSON.stringify({ input: { text }, voice: voiceId || "default", model: modelId }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// HuggingFace: POST {baseUrl}/{modelId} { inputs: text } → binary
async function huggingface({ baseUrl, apiKey, text, modelId }) {
if (!modelId || modelId.includes("..")) throw new Error("Invalid HuggingFace model ID");
const res = await fetch(`${baseUrl}/${modelId}`, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` },
body: JSON.stringify({ inputs: text }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// Inworld: Basic auth, JSON { audioContent }
async function inworld({ baseUrl, apiKey, text, modelId, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Basic ${apiKey}` },
body: JSON.stringify({
text,
voiceId: voiceId || "Alex",
modelId: modelId || "inworld-tts-1.5-mini",
audioConfig: { audioEncoding: "MP3" },
}),
});
if (!res.ok) await throwUpstreamError(res);
const data = await res.json();
if (!data.audioContent) throw new Error("Inworld TTS returned no audio");
return { base64: data.audioContent, format: "mp3" };
}
// Cartesia: X-API-Key header
async function cartesia({ baseUrl, apiKey, text, modelId, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: {
"Content-Type": "application/json",
"X-API-Key": apiKey,
"Cartesia-Version": "2024-06-10",
},
body: JSON.stringify({
model_id: modelId || "sonic-2",
transcript: text,
...(voiceId ? { voice: { mode: "id", id: voiceId } } : {}),
output_format: { container: "mp3", bit_rate: 128000, sample_rate: 44100 },
}),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// PlayHT: token format "userId:apiKey", voice = s3 URL
async function playht({ baseUrl, apiKey, text, modelId, voiceId }) {
const [userId, key] = (apiKey || ":").split(":");
const res = await fetch(baseUrl, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Accept": "audio/mpeg",
"X-USER-ID": userId || "",
"Authorization": `Bearer ${key || apiKey}`,
},
body: JSON.stringify({
text,
voice: voiceId || "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
voice_engine: modelId || "PlayDialog",
output_format: "mp3",
speed: 1,
}),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// Coqui (local, noAuth): POST { text, speaker_id } → WAV
async function coqui({ baseUrl, text, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ text, ...(voiceId ? { speaker_id: voiceId } : {}) }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// Tortoise (local, noAuth)
async function tortoise({ baseUrl, text, voiceId }) {
const res = await fetch(baseUrl, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ text, voice: voiceId || "random" }),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "wav");
}
// OpenAI-compatible upstream (qwen3-tts, etc.)
async function openaiCompat({ baseUrl, apiKey, text, modelId, voiceId }) {
const headers = { "Content-Type": "application/json" };
if (apiKey) headers["Authorization"] = `Bearer ${apiKey}`;
const res = await fetch(baseUrl, {
method: "POST",
headers,
body: JSON.stringify({
model: modelId,
input: text,
voice: voiceId || "alloy",
response_format: "mp3",
speed: 1.0,
}),
});
if (!res.ok) await throwUpstreamError(res);
return responseToBase64(res, "mp3");
}
// format → handler dispatcher
export const FORMAT_HANDLERS = {
hyperbolic,
deepgram,
"nvidia-tts": nvidia,
"huggingface-tts": huggingface,
inworld,
cartesia,
playht,
coqui,
tortoise,
openai: openaiCompat,
};
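// Editor's sketch (illustrative; the cfg literal is hypothetical): providers
// declare ttsConfig.format and the dispatcher picks the matching handler.
async function demoDispatch(apiKey) {
  const cfg = { format: "deepgram", baseUrl: "https://api.deepgram.com/v1/speak" };
  const handler = FORMAT_HANDLERS[cfg.format];
  return handler({ baseUrl: cfg.baseUrl, apiKey, text: "hello", modelId: "aura-asteria-en" });
}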

View File

@@ -0,0 +1,54 @@
// Google Translate TTS (no auth) — scrape token + batchexecute RPC
import { UA } from "./_base.js";
const REFRESH_MS = 11 * 60 * 1000;
const cache = { token: null, tokenTime: 0 };
let _idx = 0;
async function getToken() {
const now = Date.now();
if (cache.token && now - cache.tokenTime < REFRESH_MS) return cache.token;
const res = await fetch("https://translate.google.com/", { headers: { "User-Agent": UA } });
if (!res.ok) throw new Error(`Google translate fetch failed: ${res.status}`);
const html = await res.text();
const fSid = html.match(/"FdrFJe":"(.*?)"/)?.[1];
const bl = html.match(/"cfb2h":"(.*?)"/)?.[1];
if (!fSid || !bl) throw new Error("Failed to parse Google token");
cache.token = { "f.sid": fSid, bl };
cache.tokenTime = now;
return cache.token;
}
export default {
noAuth: true,
async synthesize(text, model) {
const lang = model || "en";
const token = await getToken();
const cleanText = text.replace(/[@^*()\\/\-_+=><"'\u201c\u201d\u3010\u3011]/g, " ").replaceAll(", ", ". ");
const rpcId = "jQ1olc";
const reqId = (++_idx * 100000) + Math.floor(1000 + Math.random() * 9000);
const query = new URLSearchParams({
rpcids: rpcId,
"f.sid": token["f.sid"],
bl: token.bl,
hl: lang,
"soc-app": 1, "soc-platform": 1, "soc-device": 1,
_reqid: reqId,
rt: "c",
});
const payload = [cleanText, lang, null, "undefined", [0]];
const body = new URLSearchParams();
body.append("f.req", JSON.stringify([[[rpcId, JSON.stringify(payload), null, "generic"]]]));
const res = await fetch(`https://translate.google.com/_/TranslateWebserverUi/data/batchexecute?${query}`, {
method: "POST",
headers: { "Content-Type": "application/x-www-form-urlencoded", "Referer": "https://translate.google.com/" },
body: body.toString(),
});
if (!res.ok) throw new Error(`Google TTS failed: ${res.status}`);
const data = await res.text();
const split = JSON.parse(data.split("\n")[3]);
const base64 = JSON.parse(split[0][2])[0];
if (!base64 || base64.length < 100) throw new Error("Google TTS returned empty audio");
return { base64, format: "mp3" };
},
};
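// Note (editor): the batchexecute envelope is parsed positionally (fourth
// newline-delimited chunk), which is fragile if Google changes the framing;
// the base64.length check above is the only guard against a silently empty
// payload.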

View File

@@ -0,0 +1,47 @@
// TTS provider registry
import googleTts from "./googleTts.js";
import edgeTts, { fetchEdgeTtsVoices } from "./edgeTts.js";
import localDevice, { fetchLocalDeviceVoices } from "./localDevice.js";
import elevenlabs, { fetchElevenLabsVoices } from "./elevenlabs.js";
import openai from "./openai.js";
import openrouter from "./openrouter.js";
import { FORMAT_HANDLERS } from "./genericFormats.js";
import { parseModelVoice } from "./_base.js";
// Special providers with custom synthesize() logic
const SPECIAL_ADAPTERS = {
"google-tts": googleTts,
"edge-tts": edgeTts,
"local-device": localDevice,
elevenlabs,
openai,
openrouter,
};
export function getTtsAdapter(provider) {
return SPECIAL_ADAPTERS[provider] || null;
}
// Generic config-driven dispatcher (uses ttsConfig.format)
export async function synthesizeViaConfig(provider, text, model, credentials) {
const { AI_PROVIDERS } = await import("@/shared/constants/providers");
const cfg = AI_PROVIDERS[provider]?.ttsConfig;
if (!cfg) return null;
const handler = FORMAT_HANDLERS[cfg.format];
if (!handler) return null;
const apiKey = credentials?.apiKey;
if (cfg.authType !== "none" && !apiKey) throw new Error(`${provider} API key required`);
const defaultModel = cfg.models?.[0]?.id || "";
const { modelId, voiceId } = parseModelVoice(model, defaultModel, "", cfg.models || []);
return handler({ baseUrl: cfg.baseUrl, apiKey, text, modelId, voiceId });
}
// Voice fetchers (used by /api/media-providers/tts/voices route)
export const VOICE_FETCHERS = {
"edge-tts": fetchEdgeTtsVoices,
"local-device": fetchLocalDeviceVoices,
elevenlabs: fetchElevenLabsVoices,
};
// Re-export for backward compat
export { fetchEdgeTtsVoices, fetchLocalDeviceVoices, fetchElevenLabsVoices };

View File

@@ -0,0 +1,87 @@
// Local device TTS — macOS `say` + Windows SAPI + ffmpeg
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { mkdtemp, readFile, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
const execFileAsync = promisify(execFile);
let _voicesCache = null;
async function fetchVoicesMac() {
const { stdout } = await execFileAsync("say", ["-v", "?"]);
const voices = [];
for (const line of stdout.split("\n")) {
const m = line.match(/^([^\s].*?)\s{2,}([a-z]{2}_[A-Z]{2})/);
if (!m) continue;
const name = m[1].trim();
const locale = m[2].trim();
const lang = locale.split("_")[0];
const country = locale.split("_")[1];
voices.push({ id: name, name, locale, lang, country, gender: "" });
}
return voices;
}
async function fetchVoicesWin() {
const script = [
"Add-Type -AssemblyName System.Speech;",
"$s = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
"$s.GetInstalledVoices() | ForEach-Object { $v = $_.VoiceInfo;",
"[PSCustomObject]@{ Name=$v.Name; Culture=$v.Culture.Name; Gender=$v.Gender } }",
"| ConvertTo-Json -Compress",
].join(" ");
const { stdout } = await execFileAsync(
"powershell.exe",
["-NoProfile", "-NonInteractive", "-WindowStyle", "Hidden", "-Command", script],
{ windowsHide: true }
);
const raw = JSON.parse(stdout.trim() || "[]");
const list = Array.isArray(raw) ? raw : [raw];
return list.map((v) => {
const culture = v.Culture || "en-US";
const [lang, country = ""] = culture.split("-");
const genderMap = { 1: "Male", 2: "Female", Male: "Male", Female: "Female" };
return {
id: v.Name, name: v.Name,
locale: culture.replace("-", "_"),
lang, country,
gender: genderMap[v.Gender] || "",
};
});
}
export async function fetchLocalDeviceVoices() {
if (_voicesCache) return _voicesCache;
try {
const voices = process.platform === "win32" ? await fetchVoicesWin() : await fetchVoicesMac();
_voicesCache = voices;
return voices;
} catch {
return [];
}
}
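// Note (editor): synthesis below shells out to macOS `say` and transcodes with
// an ffmpeg binary expected on PATH; despite the name, there is no Windows
// SAPI synthesis path, only the win32 voice listing above.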
async function synthesizeMacOrWin(text, voiceId) {
const dir = await mkdtemp(join(tmpdir(), "tts-"));
const aiffPath = join(dir, "out.aiff");
const mp3Path = join(dir, "out.mp3");
try {
const args = voiceId ? ["-v", voiceId, "-o", aiffPath, text] : ["-o", aiffPath, text];
await execFileAsync("say", args);
await execFileAsync("ffmpeg", ["-y", "-i", aiffPath, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3Path]);
const buf = await readFile(mp3Path);
return buf.toString("base64");
} finally {
await rm(dir, { recursive: true, force: true });
}
}
export default {
noAuth: true,
async synthesize(text, model) {
const base64 = await synthesizeMacOrWin(text, model);
return { base64, format: "mp3" };
},
};

View File

@@ -0,0 +1,30 @@
// OpenAI TTS — model format: "tts-model/voice"
import { Buffer } from "node:buffer";
export default {
async synthesize(text, model, credentials) {
if (!credentials?.apiKey) throw new Error("No OpenAI API key configured");
let ttsModel = "gpt-4o-mini-tts";
let voice = "alloy";
if (model && model.includes("/")) {
const parts = model.split("/");
if (parts.length === 2) [ttsModel, voice] = parts;
} else if (model) {
voice = model;
}
const baseUrl = (credentials.baseUrl || "https://api.openai.com").replace(/\/+$/, "");
const res = await fetch(`${baseUrl}/v1/audio/speech`, {
method: "POST",
headers: { "Content-Type": "application/json", "Authorization": `Bearer ${credentials.apiKey}` },
body: JSON.stringify({ model: ttsModel, voice, input: text }),
});
if (!res.ok) {
const err = await res.json().catch(() => ({}));
throw new Error(err?.error?.message || `OpenAI TTS failed: ${res.status}`);
}
const buf = await res.arrayBuffer();
return { base64: Buffer.from(buf).toString("base64"), format: "mp3" };
},
};

View File

@@ -0,0 +1,70 @@
// OpenRouter TTS — via chat completions + audio modality (SSE stream)
export default {
async synthesize(text, model, credentials) {
if (!credentials?.apiKey) throw new Error("No OpenRouter API key configured");
// model format: "tts-model/voice" e.g. "openai/gpt-4o-mini-tts/alloy"
let ttsModel = "openai/gpt-4o-mini-tts";
let voice = "alloy";
if (model && model.includes("/")) {
const lastSlash = model.lastIndexOf("/");
const maybeVoice = model.slice(lastSlash + 1);
const maybeModel = model.slice(0, lastSlash);
if (maybeModel.includes("/")) {
ttsModel = maybeModel;
voice = maybeVoice;
} else {
voice = model;
}
} else if (model) {
voice = model;
}
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${credentials.apiKey}`,
"HTTP-Referer": "https://endpoint-proxy.local",
"X-Title": "Endpoint Proxy",
},
body: JSON.stringify({
model: ttsModel,
modalities: ["text", "audio"],
audio: { voice, format: "wav" },
stream: true,
messages: [{ role: "user", content: text }],
}),
});
if (!res.ok) {
const err = await res.json().catch(() => ({}));
throw new Error(err?.error?.message || `OpenRouter TTS failed: ${res.status}`);
}
// Parse SSE stream, accumulate base64 audio chunks
const chunks = [];
const reader = res.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop();
for (const line of lines) {
if (!line.startsWith("data: ") || line === "data: [DONE]") continue;
try {
const json = JSON.parse(line.slice(6));
const audioData = json.choices?.[0]?.delta?.audio?.data;
if (audioData) chunks.push(audioData);
} catch {}
}
}
if (chunks.length === 0) throw new Error("OpenRouter TTS returned no audio data");
return { base64: chunks.join(""), format: "wav" };
},
};
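// Note (editor, assumption): joining streamed base64 deltas by plain string
// concatenation is valid only if every non-final delta encodes a whole number
// of 3-byte groups (no mid-stream "=" padding), which is how the upstream
// appears to chunk audio.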

View File

@@ -67,6 +67,23 @@ const ALIAS_TO_PROVIDER_ID = {
"xiaomi-mimo": "xiaomi-mimo",
cf: "cloudflare-ai",
"cloudflare-ai": "cloudflare-ai",
// Image/video providers
fal: "fal-ai",
"fal-ai": "fal-ai",
stability: "stability-ai",
"stability-ai": "stability-ai",
bfl: "black-forest-labs",
"black-forest-labs": "black-forest-labs",
recraft: "recraft",
topaz: "topaz",
runway: "runwayml",
runwayml: "runwayml",
// Embedding/rerank
jina: "jina-ai",
"jina-ai": "jina-ai",
// TTS
polly: "aws-polly",
"aws-polly": "aws-polly",
};
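// Editor's note (illustrative): each alias resolves to a canonical provider id
// and each canonical id maps to itself (e.g. "bfl" and "black-forest-labs"
// both → "black-forest-labs"), so lookups accept either form.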
/**