NarratoAI/app/services/subtitle_merger.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

'''
@Project: NarratoAI
@File   : subtitle_merger
@Author : viccy
@Date   : 2025/5/6 下午4:00
'''

import re
import os
from datetime import datetime, timedelta


def parse_time(time_str):
    """解析时间字符串为timedelta对象"""
    hours, minutes, seconds_ms = time_str.split(':')
    seconds, milliseconds = seconds_ms.split(',')

    td = timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(milliseconds)
    )
    return td


def format_time(td):
    """将timedelta对象格式化为SRT时间字符串"""
    total_seconds = int(td.total_seconds())
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    milliseconds = td.microseconds // 1000

    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def extract_time_range_from_filename(filename):
    """从文件名中提取时间范围"""
    pattern = r'subtitle_(\d{2})_(\d{2})_(\d{2})-(\d{2})_(\d{2})_(\d{2})'
    match = re.search(pattern, filename)

    if not match:
        return None, None

    start_h, start_m, start_s, end_h, end_m, end_s = map(int, match.groups())

    start_time = timedelta(hours=start_h, minutes=start_m, seconds=start_s)
    end_time = timedelta(hours=end_h, minutes=end_m, seconds=end_s)

    return start_time, end_time


def merge_subtitle_files(subtitle_files, output_file=None):
    """
    合并多个SRT字幕文件

    参数:
        subtitle_files: 包含SRT文件路径的列表
        output_file: 输出文件的路径，如果为None则自动生成

    返回:
        合并后的字幕文件路径
    """
    # 按文件名中的开始时间排序
    sorted_files = sorted(subtitle_files,
                          key=lambda x: extract_time_range_from_filename(x)[0])

    merged_subtitles = []
    subtitle_index = 1

    for file_path in sorted_files:
        # 从文件名获取起始时间偏移
        offset_time, _ = extract_time_range_from_filename(file_path)

        if offset_time is None:
            print(f"警告: 无法从文件名 {os.path.basename(file_path)} 中提取时间范围，跳过该文件")
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # 解析字幕文件
        subtitle_blocks = re.split(r'\n\s*\n', content.strip())

        for block in subtitle_blocks:
            lines = block.strip().split('\n')
            if len(lines) < 3:  # 确保块有足够的行数
                continue

            # 解析时间轴行
            time_line = lines[1]
            time_parts = time_line.split(' --> ')
            if len(time_parts) != 2:
                continue

            start_time = parse_time(time_parts[0])
            end_time = parse_time(time_parts[1])

            # 应用时间偏移
            adjusted_start_time = start_time + offset_time
            adjusted_end_time = end_time + offset_time

            # 重建字幕块
            adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
            text_lines = lines[2:]

            new_block = [
                str(subtitle_index),
                adjusted_time_line,
                *text_lines
            ]

            merged_subtitles.append('\n'.join(new_block))
            subtitle_index += 1

    # 合并所有字幕块
    merged_content = '\n\n'.join(merged_subtitles)

    # 确定输出文件路径
    if output_file is None:
        # 自动生成输出文件名
        first_file_path = sorted_files[0]
        last_file_path = sorted_files[-1]
        _, first_end = extract_time_range_from_filename(first_file_path)
        _, last_end = extract_time_range_from_filename(last_file_path)

        dir_path = os.path.dirname(first_file_path)
        first_start_str = os.path.basename(first_file_path).split('-')[0].replace('subtitle_', '')
        last_end_h, last_end_m, last_end_s = int(last_end.seconds // 3600), int((last_end.seconds % 3600) // 60), int(last_end.seconds % 60)
        last_end_str = f"{last_end_h:02d}_{last_end_m:02d}_{last_end_s:02d}"

        output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt")

    # 写入合并后的内容
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(merged_content)

    return output_file


if __name__ == '__main__':
    subtitle_files = [
        "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt",
        "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt",
        "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt",
        "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt",
    ]

    output_file = merge_subtitle_files(subtitle_files)
    print(f"字幕文件已合并至: {output_file}")