#!/usr/bin/env python # -*- coding: UTF-8 -*- """ Subtitle text utilities. This module provides a shared, cross-platform way to read and normalize subtitle content. Both Short Drama Editing (混剪) and Short Drama Narration (解说) should consume subtitle content through this module to avoid platform-specific parsing issues (e.g. Windows UTF-16 SRT, timestamp separators, etc.). """ from __future__ import annotations import os import re from dataclasses import dataclass from typing import Iterable, Optional _SRT_TIME_RE = re.compile( r"\b\d{2}:\d{2}:\d{2}(?:[,.]\d{3})?\s*-->\s*\d{2}:\d{2}:\d{2}(?:[,.]\d{3})?\b" ) _SRT_MS_DOT_RE = re.compile(r"(\b\d{2}:\d{2}:\d{2})\.(\d{3}\b)") @dataclass(frozen=True) class DecodedSubtitle: text: str encoding: str def has_timecodes(text: str) -> bool: """Return True if the subtitle text contains at least one SRT timecode.""" if not text: return False return _SRT_TIME_RE.search(text) is not None def normalize_subtitle_text(text: str) -> str: """ Normalize subtitle text to improve cross-platform reliability. - Unifies line endings to LF - Removes BOM and NUL bytes - Normalizes millisecond separators from '.' to ',' in timecodes """ if text is None: return "" normalized = str(text) # Strip BOM. if normalized.startswith("\ufeff"): normalized = normalized.lstrip("\ufeff") # Remove NUL bytes (common when UTF-16 is mis-decoded elsewhere). normalized = normalized.replace("\x00", "") # Normalize newlines. normalized = normalized.replace("\r\n", "\n").replace("\r", "\n") # Normalize timestamp millisecond separator: 00:00:01.000 -> 00:00:01,000 normalized = _SRT_MS_DOT_RE.sub(r"\1,\2", normalized) return normalized.strip() def decode_subtitle_bytes( data: bytes, *, encodings: Optional[Iterable[str]] = None, ) -> DecodedSubtitle: """ Decode subtitle bytes using a small set of common encodings. Preference is given to decodings that yield detectable SRT timecodes. """ if data is None: return DecodedSubtitle(text="", encoding="utf-8") candidates = list(encodings) if encodings else [ "utf-8", "utf-8-sig", "utf-16", "utf-16-le", "utf-16-be", "gbk", "gb2312", ] decoded_results: list[DecodedSubtitle] = [] for encoding in candidates: try: decoded_text = data.decode(encoding) except UnicodeDecodeError: continue decoded_results.append( DecodedSubtitle(text=normalize_subtitle_text(decoded_text), encoding=encoding) ) # Fast path: if we already see timecodes, keep the first such decode. if has_timecodes(decoded_results[-1].text): return decoded_results[-1] if decoded_results: # Fall back to the first successful decoding. return decoded_results[0] # Last resort: replace undecodable bytes. return DecodedSubtitle(text=normalize_subtitle_text(data.decode("utf-8", errors="replace")), encoding="utf-8") def read_subtitle_text(file_path: str) -> DecodedSubtitle: """Read subtitle file from disk, decode and normalize its text.""" if not file_path or not str(file_path).strip(): return DecodedSubtitle(text="", encoding="utf-8") normalized_path = os.path.abspath(str(file_path)) with open(normalized_path, "rb") as f: data = f.read() return decode_subtitle_bytes(data)