mirror of
https://github.com/decolua/9router.git
synced 2026-05-08 12:01:28 +00:00
- Speech-to-Text: full pipeline with sttCore handler, /v1/audio/transcriptions endpoint, sttConfig for OpenAI, Gemini, Groq, Deepgram, AssemblyAI, HuggingFace, NVIDIA Parakeet; new 9router-stt skill - Gemini TTS: add gemini provider with 30 prebuilt voices and TTS_PROVIDER_CONFIG - Usage: implement GLM (intl/cn) and MiniMax (intl/cn) quota fetchers; refactor Gemini CLI usage to use retrieveUserQuota with per-model buckets - Disabled models: lowdb-backed disabledModelsDb + /api/models/disabled route - Header search: reusable Zustand store (headerSearchStore) wired into Header - CLI tools: add Claude Cowork tool card and cowork-settings API - Providers: introduce mediaPriority sorting in getProvidersByKind, add Kimi K2.6, reorder hermes, drop qwen STT kind - UI: expand media-providers/[kind]/[id] page (+314), enhance OAuthModal, ModelSelectModal, ProviderTopology, ProxyPools, ProviderLimits - Assets: refresh provider PNGs (alicode, byteplus, cloudflare-ai, nvidia, ollama, vertex, volcengine-ark) and add aws-polly, fal-ai, jina-ai, recraft, runwayml, stability-ai, topaz, black-forest-labs
195 lines
8.4 KiB
JavaScript
195 lines
8.4 KiB
JavaScript
import { Buffer } from "node:buffer";
|
|
import { createErrorResult } from "../utils/error.js";
|
|
import { HTTP_STATUS } from "../config/runtimeConfig.js";
|
|
import { AI_PROVIDERS } from "../../src/shared/constants/providers.js";
|
|
|
|
// Build auth headers from sttConfig + token.
// Returns {} when no token is available; unknown schemes fall back to Bearer.
function buildAuthHeaders(cfg, token) {
  if (!token) return {};
  const schemes = {
    "bearer": { "Authorization": `Bearer ${token}` },
    "token": { "Authorization": `Token ${token}` },
    "x-api-key": { "x-api-key": token },
    "key": { "Authorization": `Key ${token}` },
  };
  return schemes[cfg.authHeader] ?? { "Authorization": `Bearer ${token}` };
}
|
|
|
|
// Map browser file MIME / ext → audio MIME for binary formats (deepgram/HF).
// Trusts a declared audio/* MIME; otherwise derives from the file extension,
// defaulting to application/octet-stream when neither is usable.
function resolveAudioContentType(file) {
  const declaredType = (file.type || "").toLowerCase();
  if (declaredType.startsWith("audio/")) return declaredType;

  const fileName = typeof file.name === "string" ? file.name.toLowerCase() : "";
  const dotIndex = fileName.lastIndexOf(".");
  const ext = dotIndex >= 0 ? fileName.slice(dotIndex + 1) : "";

  const extToMime = {
    mp3: "audio/mpeg",
    mp4: "audio/mp4",
    m4a: "audio/mp4",
    wav: "audio/wav",
    ogg: "audio/ogg",
    flac: "audio/flac",
    webm: "audio/webm",
    aac: "audio/aac",
    opus: "audio/opus",
  };
  return extToMime[ext] || "application/octet-stream";
}
|
|
|
|
// Convert a failed upstream fetch Response into a createErrorResult.
// Prefers a structured error message when the body parses as JSON;
// falls back to the raw body text, then to a generic status message.
async function upstreamError(res) {
  let body = "";
  try {
    body = await res.text();
  } catch {
    // body unreadable — keep it empty
  }

  let message = body || `Upstream error (${res.status})`;
  try {
    const parsed = JSON.parse(body);
    message = parsed?.error?.message || parsed?.error || parsed?.message || message;
  } catch {
    // not JSON — keep the raw text
  }

  const normalized = typeof message === "string" ? message : JSON.stringify(message);
  return createErrorResult(res.status, normalized);
}
|
|
|
|
// Deepgram: raw binary POST + model query param.
// Explicit `language` from the form wins; otherwise Deepgram auto-detects.
async function transcribeDeepgram(cfg, file, model, token, formData) {
  const endpoint = new URL(cfg.baseUrl);
  endpoint.searchParams.set("model", model);
  endpoint.searchParams.set("smart_format", "true");
  endpoint.searchParams.set("punctuate", "true");

  const language = formData.get("language");
  if (typeof language === "string" && language.trim()) {
    endpoint.searchParams.set("language", language.trim());
  } else {
    endpoint.searchParams.set("detect_language", "true");
  }

  const audioBytes = await file.arrayBuffer();
  const res = await fetch(endpoint, {
    method: "POST",
    headers: {
      ...buildAuthHeaders(cfg, token),
      "Content-Type": resolveAudioContentType(file),
    },
    body: audioBytes,
  });
  if (!res.ok) return upstreamError(res);

  // Deepgram nests the transcript under the first channel/alternative.
  const payload = await res.json();
  const transcript = payload.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? "";
  return jsonResponse({ text: transcript });
}
|
|
|
|
// AssemblyAI: upload → submit → poll (max 120s).
async function transcribeAssemblyAI(cfg, file, model, token) {
  const authHeaders = buildAuthHeaders(cfg, token);

  // Step 1: upload the raw audio; AssemblyAI returns a temporary audio URL.
  const audioBytes = await file.arrayBuffer();
  const uploadRes = await fetch("https://api.assemblyai.com/v2/upload", {
    method: "POST",
    headers: { ...authHeaders, "Content-Type": "application/octet-stream" },
    body: audioBytes,
  });
  if (!uploadRes.ok) return upstreamError(uploadRes);
  const { upload_url } = await uploadRes.json();

  // Step 2: submit the transcription job against the uploaded audio.
  const submitRes = await fetch(cfg.baseUrl, {
    method: "POST",
    headers: { ...authHeaders, "Content-Type": "application/json" },
    body: JSON.stringify({ audio_url: upload_url, speech_models: [model], language_detection: true }),
  });
  if (!submitRes.ok) return upstreamError(submitRes);
  const { id } = await submitRes.json();

  // Step 3: poll every 2s until the job completes or errors; give up at 120s.
  const deadline = Date.now() + 120_000;
  while (Date.now() < deadline) {
    await new Promise((resolve) => setTimeout(resolve, 2000));
    const pollRes = await fetch(`${cfg.baseUrl}/${id}`, { headers: authHeaders });
    if (!pollRes.ok) continue; // transient poll failure — retry
    const job = await pollRes.json();
    if (job.status === "completed") return jsonResponse({ text: job.text || "" });
    if (job.status === "error") return createErrorResult(500, job.error || "AssemblyAI failed");
  }
  return createErrorResult(504, "AssemblyAI timeout after 120s");
}
|
|
|
|
// Nvidia NIM: multipart upload, then normalize the response to { text }.
async function transcribeNvidia(cfg, file, model, token) {
  const form = new FormData();
  form.append("file", file, file.name || "audio.wav");
  form.append("model", model);

  const res = await fetch(cfg.baseUrl, {
    method: "POST",
    headers: buildAuthHeaders(cfg, token),
    body: form,
  });
  if (!res.ok) return upstreamError(res);

  // NIM responses may use `text` or `transcript`; normalize to `text`.
  const payload = await res.json();
  return jsonResponse({ text: payload.text || payload.transcript || "" });
}
|
|
|
|
// Gemini: generateContent with inline_data audio + transcription prompt.
// The audio is base64-inlined into the request body; an optional `language`
// and `prompt` from the form shape the transcription instruction.
async function transcribeGemini(cfg, file, model, token, formData) {
  const buf = await file.arrayBuffer();
  const b64 = Buffer.from(buf).toString("base64");
  const mime = resolveAudioContentType(file);

  const lang = formData.get("language");
  const userPrompt = formData.get("prompt");
  let promptText = userPrompt && typeof userPrompt === "string" && userPrompt.trim()
    ? userPrompt.trim()
    : "Generate a transcript of the speech. Return only the transcribed text, no commentary.";
  if (typeof lang === "string" && lang.trim()) promptText += ` Language: ${lang.trim()}.`;

  // Send the API key via the x-goog-api-key header rather than a `?key=`
  // query param so the secret never lands in URLs (access logs, proxies,
  // traces). Also avoids emitting a literal "key=null" when authType is none.
  const url = `${cfg.baseUrl}/${model}:generateContent`;
  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      ...(token ? { "x-goog-api-key": token } : {}),
    },
    body: JSON.stringify({
      contents: [{ parts: [{ text: promptText }, { inline_data: { mime_type: mime, data: b64 } }] }],
    }),
  });
  if (!res.ok) return upstreamError(res);

  // Concatenate every text part of the first candidate into one transcript.
  const data = await res.json();
  const text = data?.candidates?.[0]?.content?.parts?.map((p) => p.text).filter(Boolean).join("") || "";
  return jsonResponse({ text });
}
|
|
|
|
// HuggingFace: POST raw binary to {baseUrl}/{model_id}.
// Model IDs contain a `/` (org/model), so the slash is preserved while each
// path segment is percent-encoded to keep `?`, `#`, spaces, etc. from
// altering the request URL. Plain IDs (letters, digits, `-`, `_`, `.`) are
// unchanged by the encoding, so this is backward compatible.
async function transcribeHuggingFace(cfg, file, model, token) {
  // Reject path traversal / empty-segment tricks outright.
  if (model.includes("..") || model.includes("//")) return createErrorResult(400, "Invalid model ID");
  const modelPath = model.split("/").map(encodeURIComponent).join("/");
  const url = `${cfg.baseUrl.replace(/\/+$/, "")}/${modelPath}`;

  const buf = await file.arrayBuffer();
  const res = await fetch(url, {
    method: "POST",
    headers: { ...buildAuthHeaders(cfg, token), "Content-Type": resolveAudioContentType(file) },
    body: buf,
  });
  if (!res.ok) return upstreamError(res);

  const data = await res.json();
  return jsonResponse({ text: data.text || "" });
}
|
|
|
|
// Default: OpenAI/Groq/Whisper-compatible multipart transcription.
// The upstream body is passed through untouched since response_format may
// request non-JSON output (text, SRT, VTT, …).
async function transcribeOpenAICompatible(cfg, file, model, token, formData) {
  const form = new FormData();
  form.append("file", file, file.name || "audio.wav");
  form.append("model", model);

  // Forward optional Whisper params only when the client supplied them.
  const passthroughKeys = ["language", "prompt", "response_format", "temperature"];
  for (const key of passthroughKeys) {
    const value = formData.get(key);
    if (value !== null && value !== undefined && value !== "") form.append(key, value);
  }

  const res = await fetch(cfg.baseUrl, {
    method: "POST",
    headers: buildAuthHeaders(cfg, token),
    body: form,
  });
  if (!res.ok) return upstreamError(res);

  const contentType = res.headers.get("content-type") || "application/json";
  const body = await res.text();
  return {
    success: true,
    response: new Response(body, {
      status: 200,
      headers: { "Content-Type": contentType, "Access-Control-Allow-Origin": "*" },
    }),
  };
}
|
|
|
|
// Wrap a plain object as a successful handler result carrying a 200 JSON
// Response with permissive CORS.
function jsonResponse(obj) {
  const headers = {
    "Content-Type": "application/json",
    "Access-Control-Allow-Origin": "*",
  };
  const response = new Response(JSON.stringify(obj), { status: 200, headers });
  return { success: true, response };
}
|
|
|
|
/**
 * STT core handler — dispatch by sttConfig.format.
 * Validates the uploaded file and provider config, resolves credentials,
 * then routes to the provider-specific transcription function. Any format
 * without a dedicated handler is treated as OpenAI/Whisper-compatible.
 * @returns {Promise<{success, response, status?, error?}>}
 */
export async function handleSttCore({ provider, model, formData, credentials }) {
  const file = formData.get("file");
  if (!file) return createErrorResult(HTTP_STATUS.BAD_REQUEST, "Missing required field: file");

  const cfg = AI_PROVIDERS[provider]?.sttConfig;
  if (!cfg) return createErrorResult(HTTP_STATUS.BAD_REQUEST, `Provider '${provider}' does not support STT`);

  const token = cfg.authType === "none" ? null : (credentials?.apiKey || credentials?.accessToken);
  if (cfg.authType !== "none" && !token) {
    return createErrorResult(HTTP_STATUS.UNAUTHORIZED, `No credentials for STT provider: ${provider}`);
  }

  // Per-format dispatch table; Object.hasOwn guards against inherited keys
  // so only explicitly listed formats bypass the OpenAI-compatible default.
  const dispatch = {
    "deepgram": () => transcribeDeepgram(cfg, file, model, token, formData),
    "assemblyai": () => transcribeAssemblyAI(cfg, file, model, token),
    "nvidia-asr": () => transcribeNvidia(cfg, file, model, token),
    "huggingface-asr": () => transcribeHuggingFace(cfg, file, model, token),
    "gemini-stt": () => transcribeGemini(cfg, file, model, token, formData),
  };

  try {
    const handler = Object.hasOwn(dispatch, cfg.format)
      ? dispatch[cfg.format]
      : () => transcribeOpenAICompatible(cfg, file, model, token, formData);
    return await handler();
  } catch (err) {
    return createErrorResult(HTTP_STATUS.BAD_GATEWAY, err.message || "STT request failed");
  }
}
|