From 001f04a9ee797030c1c428f3f861222d0a6d0b78 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 10 Oct 2025 21:17:51 +0800 Subject: [PATCH] fix: subtitle --- src/apis/index.js | 1 + src/apis/trans.js | 6 ++- src/config/api.js | 56 +++++++++++++++++++------- src/hooks/Api.js | 3 +- src/subtitle/YouTubeCaptionProvider.js | 46 ++++++++++++++++++--- src/subtitle/vtt.js | 44 ++++++++++++++++++++ 6 files changed, 133 insertions(+), 23 deletions(-) create mode 100644 src/subtitle/vtt.js diff --git a/src/apis/index.js b/src/apis/index.js index 26ff63f..5e1191f 100644 --- a/src/apis/index.js +++ b/src/apis/index.js @@ -510,6 +510,7 @@ export const apiSubtitle = async ({ apiSetting, }) => { const cacheOpts = { + apiSlug: apiSetting.apiSlug, videoId, fromLang, toLang, diff --git a/src/apis/trans.js b/src/apis/trans.js index 3d63364..0ade766 100644 --- a/src/apis/trans.js +++ b/src/apis/trans.js @@ -34,6 +34,7 @@ import { parseJsonObj, extractJson } from "../libs/utils"; import { kissLog } from "../libs/log"; import { fetchData } from "../libs/fetch"; import { getMsgHistory } from "./history"; +import { parseBilingualVtt } from "../subtitle/vtt"; const keyMap = new Map(); const urlMap = new Map(); @@ -118,8 +119,9 @@ const parseSTRes = (raw) => { } try { - const jsonString = extractJson(raw); - const data = JSON.parse(jsonString); + // const jsonString = extractJson(raw); + // const data = JSON.parse(jsonString); + const data = parseBilingualVtt(raw); if (Array.isArray(data)) { return data; } diff --git a/src/config/api.js b/src/config/api.js index a388d7f..726af32 100644 --- a/src/config/api.js +++ b/src/config/api.js @@ -354,23 +354,49 @@ Output: {"translations":[{"id":1,"text":"一个React组件","sourceLangua Fail-safe: On any error, return {"translations":[]}.`; -const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array. +// const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array. -Output (valid JSON array, output ONLY this array): -[{ - "text": "string", // Full sentence with correct punctuation - "translation": "string", // Translation in ${INPUT_PLACE_TO} - "start": int, // Start time (ms) - "end": int, // End time (ms) - "duration": int // end - start -}] +// Output (valid JSON array, output ONLY this array): +// [{ +// "text": "string", // Full sentence with correct punctuation +// "translation": "string", // Translation in ${INPUT_PLACE_TO} +// "start": int, // Start time (ms) +// "end": int, // End time (ms) +// }] -Guidelines: -1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically. -2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing. -3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'. -4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]'). -`; +// Guidelines: +// 1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically. +// 2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing. +// 3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'. +// 4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]'). +// `; + +const defaultSubtitlePrompt = `You are an expert AI for subtitle generation. Convert a JSON array of word-level timestamps into a bilingual VTT file. + +**Workflow:** +1. Merge \`text\` fields into complete sentences; ignore empty text. +2. Split long sentences into smaller, manageable subtitle cues (one sentence per cue). +3. Translate each cue into ${INPUT_PLACE_TO}. +4. Format as VTT: + - Start with \`WEBVTT\`. + - Each cue: timestamps (\`start --> end\` in milliseconds), original text, translated text. + - Keep non-speech text (e.g., \`[Music]\`) untranslated. + - Separate cues with a blank line. + +**Output:** Only the pure VTT content. + +**Example:** +\`\`\`vtt +WEBVTT + +1000 --> 3500 +Hello world! +你好,世界! + +4000 --> 6000 +Good morning. +早上好。 +\`\`\``; const defaultRequestHook = `async (args, { url, body, headers, userMsg, method } = {}) => { console.log("request hook args:", args); diff --git a/src/hooks/Api.js b/src/hooks/Api.js index fffebfc..fd0f279 100644 --- a/src/hooks/Api.js +++ b/src/hooks/Api.js @@ -44,7 +44,7 @@ export function useApiList() { ); const aiEnabledApis = useMemo( - () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiSlug)), + () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiType)), [enabledApis] ); @@ -124,6 +124,7 @@ export function useApiItem(apiSlug) { apiSlug: item.apiSlug, apiName: item.apiName, apiType: item.apiType, + key: item.key, }; } return item; diff --git a/src/subtitle/YouTubeCaptionProvider.js b/src/subtitle/YouTubeCaptionProvider.js index 863c28d..3942e1d 100644 --- a/src/subtitle/YouTubeCaptionProvider.js +++ b/src/subtitle/YouTubeCaptionProvider.js @@ -111,6 +111,11 @@ class YouTubeCaptionProvider { kissControls.appendChild(toggleButton); toggleButton.onclick = () => { + if (this.#isBusy) { + logger.info(`Youtube Provider: It's budy now...`); + return; + } + if (!this.#enabled) { logger.info(`Youtube Provider: Feature toggled ON.`); this.#startManager(); @@ -283,9 +288,10 @@ class YouTubeCaptionProvider { OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) || "auto"; if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) { + // todo: 切分多次发送接受以适应接口处理能力 subtitles = await this.#aiSegment({ videoId, - events, + events: this.#flatEvents(events), fromLang, toLang, segApiSetting, @@ -408,10 +414,7 @@ class YouTubeCaptionProvider { lines = this.#processSubtitles({ events, usePause: true }); } - return lines.map((item) => ({ - ...item, - duration: Math.max(0, item.end - item.start), - })); + return lines; } #isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) { @@ -580,6 +583,39 @@ class YouTubeCaptionProvider { return sentences; } + + #flatEvents(events = []) { + const segments = []; + let buffer = null; + + events.forEach(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => { + segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => { + const text = utf8.trim().replace(/\s+/g, " "); + const start = tStartMs + tOffsetMs; + + if (buffer) { + if (!buffer.end || buffer.end > start) { + buffer.end = start; + } + segments.push(buffer); + buffer = null; + } + + buffer = { + text, + start, + }; + + if (j === segs.length - 1) { + buffer.end = tStartMs + dDurationMs; + } + }); + }); + + segments.push(buffer); + + return segments.filter((item) => item.text); + } } export const YouTubeInitializer = (() => { diff --git a/src/subtitle/vtt.js b/src/subtitle/vtt.js new file mode 100644 index 0000000..1a20162 --- /dev/null +++ b/src/subtitle/vtt.js @@ -0,0 +1,44 @@ +function millisecondsStringToNumber(msString) { + const cleanString = msString.trim(); + const milliseconds = parseInt(cleanString, 10); + + if (isNaN(milliseconds)) { + return 0; + } + + return milliseconds; +} + +export function parseBilingualVtt(vttText) { + const cleanText = vttText.replace(/^\uFEFF/, "").trim(); + const cues = cleanText.split(/\n\n+/); + + const result = []; + + for (const cue of cues) { + if (!cue.includes("-->")) continue; + + const lines = cue.split("\n"); + + const timestampLineIndex = lines.findIndex((line) => line.includes("-->")); + if (timestampLineIndex === -1) continue; + + const [startTimeString, endTimeString] = + lines[timestampLineIndex].split(" --> "); + const textLines = lines.slice(timestampLineIndex + 1); + + if (startTimeString && endTimeString && textLines.length > 0) { + const originalText = textLines[0].trim(); + const translatedText = (textLines[1] || "").trim(); + + result.push({ + start: millisecondsStringToNumber(startTimeString), + end: millisecondsStringToNumber(endTimeString), + text: originalText, + translation: translatedText, + }); + } + } + + return result; +}