feat: support subtitle chunks for AI

This commit is contained in:
Gabe
2025-10-11 21:06:38 +08:00
parent d9b4399c57
commit 1afe976777
6 changed files with 304 additions and 84 deletions

View File

@@ -517,6 +517,7 @@ export const apiTranslate = async ({
// 字幕处理/翻译 // 字幕处理/翻译
export const apiSubtitle = async ({ export const apiSubtitle = async ({
videoId, videoId,
chunkSign,
fromLang = "auto", fromLang = "auto",
toLang, toLang,
events = [], events = [],
@@ -525,6 +526,7 @@ export const apiSubtitle = async ({
const cacheOpts = { const cacheOpts = {
apiSlug: apiSetting.apiSlug, apiSlug: apiSetting.apiSlug,
videoId, videoId,
chunkSign,
fromLang, fromLang,
toLang, toLang,
}; };

View File

@@ -1554,9 +1554,9 @@ export const I18N = {
zh_TW: `啟用字幕翻譯`, zh_TW: `啟用字幕翻譯`,
}, },
is_bilingual_view: { is_bilingual_view: {
zh: `启用双语显示`, zh: `双语显示`,
en: `DEnable bilingual display`, en: `Enable bilingual display`,
zh_TW: `啟用雙語顯示`, zh_TW: `雙語顯示`,
}, },
background_styles: { background_styles: {
zh: `背景样式`, zh: `背景样式`,
@@ -1578,6 +1578,11 @@ export const I18N = {
en: `AI intelligent punctuation`, en: `AI intelligent punctuation`,
zh_TW: `AI智慧斷句`, zh_TW: `AI智慧斷句`,
}, },
ai_chunk_length: {
zh: `AI处理切割长度`,
en: `AI processing chunk length`,
zh_TW: `AI处理切割长度`,
},
subtitle_helper_1: { subtitle_helper_1: {
zh: `1、目前仅支持Youtube且仅支持浏览器扩展。`, zh: `1、目前仅支持Youtube且仅支持浏览器扩展。`,
en: `1. Currently only supports Youtube and browser extensions.`, en: `1. Currently only supports Youtube and browser extensions.`,

View File

@@ -112,6 +112,7 @@ export const DEFAULT_SUBTITLE_SETTING = {
enabled: true, // 是否开启 enabled: true, // 是否开启
apiSlug: OPT_TRANS_MICROSOFT, apiSlug: OPT_TRANS_MICROSOFT,
segSlug: "-", // AI智能断句 segSlug: "-", // AI智能断句
chunkLength: 1000, // AI处理切割长度
// fromLang: "en", // fromLang: "en",
toLang: "zh-CN", toLang: "zh-CN",
isBilingual: true, // 是否双语显示 isBilingual: true, // 是否双语显示

View File

@@ -230,4 +230,23 @@ export class BilingualSubtitleManager {
} }
} }
} }
/**
* 追加新的字幕
* @param {Array<object>} newSubtitlesChunk - 新的、要追加的字幕数据块。
*/
appendSubtitles(newSubtitlesChunk) {
if (!newSubtitlesChunk || newSubtitlesChunk.length === 0) {
return;
}
logger.info(
`Bilingual Subtitle Manager: Appending ${newSubtitlesChunk.length} new subtitles...`
);
this.#formattedSubtitles.push(...newSubtitlesChunk);
this.#formattedSubtitles.sort((a, b) => a.start - b.start);
this.#currentSubtitleIndex = -1;
this.onTimeUpdate();
}
} }

View File

@@ -218,10 +218,14 @@ class YouTubeCaptionProvider {
return docUrl.searchParams.get("v"); return docUrl.searchParams.get("v");
} }
async #aiSegment({ videoId, toLang, events, segApiSetting }) { async #aiSegment({ videoId, fromLang, toLang, chunkEvents, segApiSetting }) {
try { try {
const events = chunkEvents.filter((item) => item.text);
const chunkSign = `${events[0].start} --> ${events[events.length - 1].end}`;
const subtitles = await apiSubtitle({ const subtitles = await apiSubtitle({
videoId, videoId,
chunkSign,
fromLang,
toLang, toLang,
events, events,
apiSetting: segApiSetting, apiSetting: segApiSetting,
@@ -279,7 +283,8 @@ class YouTubeCaptionProvider {
return; return;
} }
let subtitles = []; const flatEvents = this.#flatEvents(events);
if (!flatEvents.length) return;
const { segApiSetting, toLang } = this.#setting; const { segApiSetting, toLang } = this.#setting;
const lang = potUrl.searchParams.get("lang"); const lang = potUrl.searchParams.get("lang");
@@ -287,26 +292,77 @@ class YouTubeCaptionProvider {
OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang) || OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang) ||
OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) || OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) ||
"auto"; "auto";
if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) { if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) {
// todo: 切分多次发送接受以适应接口处理能力 logger.info("Youtube Provider: Starting AI ...");
subtitles = await this.#aiSegment({
const eventChunks = this.#splitEventsIntoChunks(
flatEvents,
segApiSetting.chunkLength
);
const subtitlesFallback = () =>
this.#formatSubtitles(flatEvents, fromLang);
if (eventChunks.length === 0) {
this.#onCaptionsReady({
videoId,
subtitles: subtitlesFallback(),
fromLang,
isInitialLoad: true,
});
return;
}
const firstChunkEvents = eventChunks[0];
const firstBatchSubtitles = await this.#aiSegment({
videoId, videoId,
events: this.#flatEvents(events), chunkEvents: firstChunkEvents,
fromLang, fromLang,
toLang, toLang,
segApiSetting, segApiSetting,
}); });
}
if (!subtitles?.length) { if (!firstBatchSubtitles?.length) {
subtitles = this.#formatSubtitles(events, fromLang); this.#onCaptionsReady({
} videoId,
if (!subtitles?.length) { subtitles: subtitlesFallback(),
logger.info("Youtube Provider: No subtitles after format."); fromLang,
return; isInitialLoad: true,
} });
return;
}
this.#onCaptionsReady({ videoId, subtitles, fromLang }); this.#onCaptionsReady({
videoId,
subtitles: firstBatchSubtitles,
fromLang,
isInitialLoad: true,
});
if (eventChunks.length > 1) {
const remainingChunks = eventChunks.slice(1);
this.#processRemainingChunksAsync({
chunks: remainingChunks,
videoId,
fromLang,
toLang,
segApiSetting,
});
}
} else {
const subtitles = this.#formatSubtitles(flatEvents, fromLang);
if (!subtitles?.length) {
logger.info("Youtube Provider: No subtitles after format.");
return;
}
this.#onCaptionsReady({
videoId,
subtitles,
fromLang,
isInitialLoad: true,
});
}
} catch (error) { } catch (error) {
logger.warn("Youtube Provider: unknow error", error); logger.warn("Youtube Provider: unknow error", error);
} finally { } finally {
@@ -382,8 +438,8 @@ class YouTubeCaptionProvider {
} }
} }
#formatSubtitles(events, lang) { #formatSubtitles(flatEvents, lang) {
if (!events?.length) return []; if (!flatEvents?.length) return [];
const noSpaceLanguages = [ const noSpaceLanguages = [
"zh", // 中文 "zh", // 中文
@@ -396,25 +452,49 @@ class YouTubeCaptionProvider {
]; ];
if (noSpaceLanguages.some((l) => lang?.startsWith(l))) { if (noSpaceLanguages.some((l) => lang?.startsWith(l))) {
return events const subtitles = [];
.map(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => ({ let currentLine = null;
text: segs const MAX_LENGTH = 100;
.map(({ utf8 = "" }) => utf8)
.join("") for (const segment of flatEvents) {
?.trim(), if (segment.text) {
start: tStartMs, if (!currentLine) {
end: tStartMs + dDurationMs, currentLine = {
})) text: segment.text,
.filter((item) => item.text); start: segment.start,
end: segment.end,
};
} else {
currentLine.text += segment.text;
currentLine.end = segment.end;
}
if (currentLine.text.length >= MAX_LENGTH) {
subtitles.push(currentLine);
currentLine = null;
}
} else {
if (currentLine) {
subtitles.push(currentLine);
currentLine = null;
}
}
}
if (currentLine) {
subtitles.push(currentLine);
}
return subtitles;
} }
let lines = this.#processSubtitles({ events }); let subtitles = this.#processSubtitles({ flatEvents });
const isPoor = this.#isQualityPoor(lines); const isPoor = this.#isQualityPoor(subtitles);
if (isPoor) { if (isPoor) {
lines = this.#processSubtitles({ events, usePause: true }); subtitles = this.#processSubtitles({ flatEvents, usePause: true });
} }
return lines; return subtitles;
} }
#isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) { #isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) {
@@ -426,9 +506,9 @@ class YouTubeCaptionProvider {
} }
#processSubtitles({ #processSubtitles({
events, flatEvents,
usePause = false, usePause = false,
timeout = 1500, timeout = 1000,
maxWords = 15, maxWords = 15,
} = {}) { } = {}) {
const groupedPauseWords = { const groupedPauseWords = {
@@ -516,67 +596,54 @@ class YouTubeCaptionProvider {
let currentBuffer = []; let currentBuffer = [];
let bufferWordCount = 0; let bufferWordCount = 0;
const joinSegs = (segs) => ({
text: segs
.map((s) => s.text)
.join(" ")
.trim(),
start: segs[0].start,
end: segs[segs.length - 1].end,
});
const flushBuffer = () => { const flushBuffer = () => {
if (currentBuffer.length > 0) { if (currentBuffer.length > 0) {
sentences.push(joinSegs(currentBuffer)); sentences.push({
text: currentBuffer
.map((s) => s.text)
.join(" ")
.trim(),
start: currentBuffer[0].start,
end: currentBuffer[currentBuffer.length - 1].end,
});
} }
currentBuffer = []; currentBuffer = [];
bufferWordCount = 0; bufferWordCount = 0;
}; };
events.forEach(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => { flatEvents.forEach((segment) => {
segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => { if (!segment.text) return;
const text = utf8?.trim().replace(/\s+/g, " ") || "";
if (!text) return;
const start = tStartMs + tOffsetMs; const lastSegment = currentBuffer[currentBuffer.length - 1];
const lastSegment = currentBuffer[currentBuffer.length - 1];
if (lastSegment) { if (lastSegment) {
if (!lastSegment.end || lastSegment.end > start) { const isEndOfSentence = /[.?!…\])]$/.test(lastSegment.text);
lastSegment.end = start; const isPauseOfSentence = /[,]$/.test(lastSegment.text);
} const isTimeout = segment.start - lastSegment.end > timeout;
const isWordLimitExceeded =
(usePause || isPauseOfSentence) && bufferWordCount >= maxWords;
const isEndOfSentence = /[.?!…\])]$/.test(lastSegment.text); const startsWithSign = /^[[(♪]/.test(segment.text);
const isPauseOfSentence = /[,]$/.test(lastSegment.text); const startsWithPauseWord =
const isTimeout = start - lastSegment.end > timeout; usePause &&
const isWordLimitExceeded = groupedPauseWords["1"].has(
(usePause || isPauseOfSentence) && bufferWordCount >= maxWords; segment.text.toLowerCase().split(" ")[0]
) &&
currentBuffer.length > 1;
const startsWithSign = /^[[(♪]/.test(text); if (
const startsWithPauseWord = isEndOfSentence ||
usePause && isTimeout ||
groupedPauseWords["1"].has(text.toLowerCase().split(" ")[0]) && // todo: 考虑连词开头 isWordLimitExceeded ||
currentBuffer.length > 1; startsWithSign ||
startsWithPauseWord
if ( ) {
isEndOfSentence || flushBuffer();
isTimeout ||
isWordLimitExceeded ||
startsWithSign ||
startsWithPauseWord
) {
flushBuffer();
}
} }
}
const currentSegment = { text, start }; currentBuffer.push(segment);
if (j === segs.length - 1) { bufferWordCount += segment.text.split(/\s+/).length;
currentSegment.end = tStartMs + dDurationMs;
}
currentBuffer.push(currentSegment);
bufferWordCount += text.split(/\s+/).length;
});
}); });
flushBuffer(); flushBuffer();
@@ -614,7 +681,114 @@ class YouTubeCaptionProvider {
segments.push(buffer); segments.push(buffer);
return segments.filter((item) => item.text); return segments;
}
#splitEventsIntoChunks(flatEvents, chunkLength = 1000) {
if (!flatEvents || flatEvents.length === 0) {
return [];
}
const eventChunks = [];
let currentChunk = [];
let currentChunkTextLength = 0;
const MAX_CHUNK_LENGTH = chunkLength + 500;
const PAUSE_THRESHOLD_MS = 1000;
for (let i = 0; i < flatEvents.length; i++) {
const event = flatEvents[i];
currentChunk.push(event);
currentChunkTextLength += event.text.length;
const isLastEvent = i === flatEvents.length - 1;
if (isLastEvent) {
continue;
}
let shouldSplit = false;
if (currentChunkTextLength >= MAX_CHUNK_LENGTH) {
shouldSplit = true;
} else if (currentChunkTextLength >= chunkLength) {
const isEndOfSentence = /[.?!…\])]$/.test(event.text);
const nextEvent = flatEvents[i + 1];
const pauseDuration = nextEvent.start - event.end;
if (isEndOfSentence || pauseDuration > PAUSE_THRESHOLD_MS) {
shouldSplit = true;
}
}
if (shouldSplit) {
eventChunks.push(currentChunk);
currentChunk = [];
currentChunkTextLength = 0;
}
}
if (currentChunk.length > 0) {
eventChunks.push(currentChunk);
}
return eventChunks;
}
async #processRemainingChunksAsync({
chunks,
videoId,
fromLang,
toLang,
segApiSetting,
}) {
logger.info(`Youtube Provider: Starting for ${chunks.length} chunks.`);
for (let i = 0; i < chunks.length; i++) {
const chunkEvents = chunks[i];
const chunkNum = i + 2;
logger.info(
`Youtube Provider: Processing subtitle chunk ${chunkNum}/${chunks.length + 1}...`
);
let subtitlesForThisChunk = [];
try {
const aiSubtitles = await this.#aiSegment({
videoId,
chunkEvents,
fromLang,
toLang,
segApiSetting,
});
if (aiSubtitles?.length > 0) {
subtitlesForThisChunk = aiSubtitles;
} else {
logger.info(
`Youtube Provider: AI segmentation for chunk ${chunkNum} returned no data.`
);
subtitlesForThisChunk = this.#formatSubtitles(chunkEvents, fromLang);
}
} catch (chunkError) {
subtitlesForThisChunk = this.#formatSubtitles(chunkEvents, fromLang);
}
if (this.#videoId !== videoId) {
logger.info("Youtube Provider: videoId changed!");
break;
}
if (subtitlesForThisChunk.length > 0 && this.#managerInstance) {
logger.info(
`Youtube Provider: Appending ${subtitlesForThisChunk.length} subtitles from chunk ${chunkNum}.`
);
this.#managerInstance.appendSubtitles(subtitlesForThisChunk);
} else {
logger.info(`Youtube Provider: Chunk ${chunkNum} no subtitles.`);
}
await sleep(randomBetween(500, 1000));
}
logger.info("Youtube Provider: All subtitle chunks processed.");
} }
} }

View File

@@ -10,6 +10,7 @@ import Alert from "@mui/material/Alert";
import Switch from "@mui/material/Switch"; import Switch from "@mui/material/Switch";
import { useSubtitle } from "../../hooks/Subtitle"; import { useSubtitle } from "../../hooks/Subtitle";
import { useApiList } from "../../hooks/Api"; import { useApiList } from "../../hooks/Api";
import { limitNumber } from "../../libs/utils";
export default function SubtitleSetting() { export default function SubtitleSetting() {
const i18n = useI18n(); const i18n = useI18n();
@@ -19,6 +20,12 @@ export default function SubtitleSetting() {
const handleChange = (e) => { const handleChange = (e) => {
e.preventDefault(); e.preventDefault();
let { name, value } = e.target; let { name, value } = e.target;
switch (name) {
case "chunkLength":
value = limitNumber(value, 200, 20000);
break;
default:
}
updateSubtitle({ updateSubtitle({
[name]: value, [name]: value,
}); });
@@ -28,6 +35,7 @@ export default function SubtitleSetting() {
enabled, enabled,
apiSlug, apiSlug,
segSlug, segSlug,
chunkLength,
toLang, toLang,
isBilingual, isBilingual,
windowStyle, windowStyle,
@@ -96,6 +104,17 @@ export default function SubtitleSetting() {
))} ))}
</TextField> </TextField>
</Grid> </Grid>
<Grid item xs={12} sm={12} md={6} lg={3}>
<TextField
fullWidth
size="small"
label={i18n("ai_chunk_length")}
type="number"
name="chunkLength"
value={chunkLength}
onChange={handleChange}
/>
</Grid>
<Grid item xs={12} sm={12} md={6} lg={3}> <Grid item xs={12} sm={12} md={6} lg={3}>
<TextField <TextField
fullWidth fullWidth