fix: subtitle

2025-10-10 21:17:51 +08:00
parent 3844d2eb75
commit 001f04a9ee
6 changed files with 133 additions and 23 deletions
--- a/src/apis/index.js
+++ b/src/apis/index.js
@@ -510,6 +510,7 @@ export const apiSubtitle = async ({
  apiSetting,
 }) => {
  const cacheOpts = {
    apiSlug: apiSetting.apiSlug,
    videoId,
    fromLang,
    toLang,
--- a/src/apis/trans.js
+++ b/src/apis/trans.js
@@ -34,6 +34,7 @@ import { parseJsonObj, extractJson } from "../libs/utils";
 import { kissLog } from "../libs/log";
 import { fetchData } from "../libs/fetch";
 import { getMsgHistory } from "./history";
 import { parseBilingualVtt } from "../subtitle/vtt";
 const keyMap = new Map();
 const urlMap = new Map();
@@ -118,8 +119,9 @@ const parseSTRes = (raw) => {
  }
  try {
-    const jsonString = extractJson(raw);
+    // const jsonString = extractJson(raw);
-    const data = JSON.parse(jsonString);
+    // const data = JSON.parse(jsonString);
    const data = parseBilingualVtt(raw);
    if (Array.isArray(data)) {
      return data;
    }
--- a/src/config/api.js
+++ b/src/config/api.js
@@ -354,23 +354,49 @@ Output: {"translations":[{"id":1,"text":"一个<b>React</b>组件","sourceLangua
 Fail-safe: On any error, return {"translations":[]}.`;
-const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
+// const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
-Output (valid JSON array, output ONLY this array):
+// Output (valid JSON array, output ONLY this array):
-[{
+// [{
-  "text": "string",        // Full sentence with correct punctuation
+//   "text": "string",        // Full sentence with correct punctuation
-  "translation": "string", // Translation in ${INPUT_PLACE_TO}
+//   "translation": "string", // Translation in ${INPUT_PLACE_TO}
-  "start": int,            // Start time (ms)
+//   "start": int,            // Start time (ms)
-  "end": int,              // End time (ms)
+//   "end": int,              // End time (ms)
-  "duration": int          // end - start
+// }]
 }]
-Guidelines:
+// Guidelines:
-1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
+// 1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
-2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
+// 2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
-3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
+// 3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
-4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
+// 4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
-`;
+// `;
 // Default prompt for AI subtitle segmentation + translation.
 // It asks the model to merge a JSON array of word-level timestamps into
 // sentence cues, translate each cue into ${INPUT_PLACE_TO}, and reply with a
 // bilingual VTT document: a `WEBVTT` header, then cues made of a
 // `start --> end` line (plain milliseconds), the original line, and the
 // translated line, separated by blank lines.
 // This cue layout is the same one parseBilingualVtt (src/subtitle/vtt.js) reads.
 // NOTE(review): INPUT_PLACE_TO is defined elsewhere; presumably a placeholder
 // that is later replaced with the target language — confirm against its usage.
 const defaultSubtitlePrompt = `You are an expert AI for subtitle generation. Convert a JSON array of word-level timestamps into a bilingual VTT file.
 **Workflow:**
 1. Merge \`text\` fields into complete sentences; ignore empty text.
 2. Split long sentences into smaller, manageable subtitle cues (one sentence per cue).
 3. Translate each cue into ${INPUT_PLACE_TO}.
 4. Format as VTT:
   - Start with \`WEBVTT\`.
   - Each cue: timestamps (\`start --> end\` in milliseconds), original text, translated text.
   - Keep non-speech text (e.g., \`[Music]\`) untranslated.
   - Separate cues with a blank line.
 **Output:** Only the pure VTT content.
 **Example:**
 \`\`\`vtt
 WEBVTT
 1000 --> 3500
 Hello world!
 你好，世界！
 4000 --> 6000
 Good morning.
 早上好。
 \`\`\``;
 const defaultRequestHook = `async (args, { url, body, headers, userMsg, method } = {}) => {
  console.log("request hook args:", args);
--- a/src/hooks/Api.js
+++ b/src/hooks/Api.js
@@ -44,7 +44,7 @@ export function useApiList() {
  );
  const aiEnabledApis = useMemo(
-    () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiSlug)),
+    () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiType)),
    [enabledApis]
  );
@@ -124,6 +124,7 @@ export function useApiItem(apiSlug) {
            apiSlug: item.apiSlug,
            apiName: item.apiName,
            apiType: item.apiType,
            key: item.key,
          };
        }
        return item;
--- a/src/subtitle/YouTubeCaptionProvider.js
+++ b/src/subtitle/YouTubeCaptionProvider.js
@@ -111,6 +111,11 @@ class YouTubeCaptionProvider {
    kissControls.appendChild(toggleButton);
    toggleButton.onclick = () => {
      if (this.#isBusy) {
        logger.info(`Youtube Provider: It's budy now...`);
        return;
      }
      if (!this.#enabled) {
        logger.info(`Youtube Provider: Feature toggled ON.`);
        this.#startManager();
@@ -283,9 +288,10 @@ class YouTubeCaptionProvider {
        OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) ||
        "auto";
      if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) {
        // todo: 切分多次发送接受以适应接口处理能力
        subtitles = await this.#aiSegment({
          videoId,
-          events,
+          events: this.#flatEvents(events),
          fromLang,
          toLang,
          segApiSetting,
@@ -408,10 +414,7 @@ class YouTubeCaptionProvider {
      lines = this.#processSubtitles({ events, usePause: true });
    }
-    return lines.map((item) => ({
+    return lines;
      ...item,
      duration: Math.max(0, item.end - item.start),
    }));
  }
  #isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) {
@@ -580,6 +583,39 @@ class YouTubeCaptionProvider {
    return sentences;
  }
  #flatEvents(events = []) {
    const segments = [];
    let buffer = null;
    events.forEach(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => {
      segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => {
        const text = utf8.trim().replace(/\s+/g, " ");
        const start = tStartMs + tOffsetMs;
        if (buffer) {
          if (!buffer.end || buffer.end > start) {
            buffer.end = start;
          }
          segments.push(buffer);
          buffer = null;
        }
        buffer = {
          text,
          start,
        };
        if (j === segs.length - 1) {
          buffer.end = tStartMs + dDurationMs;
        }
      });
    });
    segments.push(buffer);
    return segments.filter((item) => item.text);
  }
 }
 export const YouTubeInitializer = (() => {
--- a/src/subtitle/vtt.js
+++ b/src/subtitle/vtt.js
@@ -0,0 +1,44 @@
 /**
  * Converts a cue timestamp string into a number of milliseconds.
  *
  * Two forms are accepted:
  * - a plain integer millisecond count such as `"3500"` (parsed with
  *   `parseInt`, so trailing non-digits are ignored);
  * - a WebVTT-style clock timestamp such as `"00:00:03.500"` or `"00:03.500"`
  *   (hours optional, `.` or `,` before the fraction). Without this, a clock
  *   timestamp would be read by `parseInt` as just its leading field.
  *
  * @param {string} msString - Timestamp text; surrounding whitespace is ignored.
  * @returns {number} Milliseconds, or 0 when the text cannot be parsed.
  */
 function millisecondsStringToNumber(msString) {
  const cleanString = String(msString ?? "").trim();

  // Clock form: [hours:]minutes:seconds[.fraction]; anything after is ignored.
  const clock = /^(?:(\d+):)?(\d+):(\d+)(?:[.,](\d{1,3}))?/.exec(cleanString);
  if (clock) {
    const [, hours = "0", minutes, seconds, fraction = ""] = clock;
    const wholeSeconds =
      (Number(hours) * 60 + Number(minutes)) * 60 + Number(seconds);
    // Right-pad the fraction so ".5" means 500 ms, not 5 ms.
    return wholeSeconds * 1000 + Number(fraction.padEnd(3, "0"));
  }

  const milliseconds = Number.parseInt(cleanString, 10);
  return Number.isNaN(milliseconds) ? 0 : milliseconds;
 }
 /**
  * Parses a bilingual VTT document into subtitle entries.
  *
  * Each cue is expected to be a timestamp line (`start --> end`) followed by
  * the original text on the next line and, optionally, its translation on the
  * line after that. Any further lines of the cue are ignored.
  *
  * The parser works line by line, so it tolerates:
  * - a leading BOM and `\r\n` / `\r` line endings;
  * - cues that are not separated by blank lines (a new timestamp line always
  *   starts a new cue);
  * - missing spaces around `-->`;
  * - Markdown code-fence lines (```), which are never treated as cue text.
  *
  * Cues without both timestamps or without any text line are skipped.
  *
  * @param {string} vttText - Raw VTT text.
  * @returns {Array<{start: number, end: number, text: string, translation: string}>}
  *   Parsed cues in document order; `translation` is `""` when absent.
  */
 export function parseBilingualVtt(vttText) {
  const lines = vttText
    .replace(/^\uFEFF/, "")
    .replace(/\r\n?/g, "\n")
    .split("\n")
    .map((line) => line.trim());

  const result = [];
  let index = 0;

  while (index < lines.length) {
    const line = lines[index];
    index += 1;

    // Everything outside a cue (header, cue ids, fences) is skipped.
    if (!line.includes("-->")) continue;

    const [startTimeString, endTimeString] = line.split(/\s*-->\s*/);

    // Collect the cue payload: up to the next blank line or timestamp line.
    const textLines = [];
    while (
      index < lines.length &&
      lines[index] !== "" &&
      !lines[index].includes("-->")
    ) {
      if (!lines[index].startsWith("```")) {
        textLines.push(lines[index]);
      }
      index += 1;
    }

    if (startTimeString && endTimeString && textLines.length > 0) {
      result.push({
        start: millisecondsStringToNumber(startTimeString),
        end: millisecondsStringToNumber(endTimeString),
        text: textLines[0],
        translation: textLines[1] || "",
      });
    }
  }

  return result;
 }