feat(audio): 统一音量配置并修复原声音量默认值问题

引入AudioVolumeDefaults类集中管理音量配置,确保全局一致性
修复原声音量默认值为0.7以解决短剧解说模式问题
添加音量验证和详细日志便于调试
This commit is contained in:
linyq 2025-07-02 17:54:00 +08:00
parent 7a8de5e791
commit 1792311ef4
6 changed files with 94 additions and 37 deletions

3
.gitignore vendored
View File

@ -34,3 +34,6 @@ resource/srt/*.srt
app/models/faster-whisper-large-v2/* app/models/faster-whisper-large-v2/*
app/models/faster-whisper-large-v3/* app/models/faster-whisper-large-v3/*
app/models/bert/* app/models/bert/*
bug清单.md
task.md

View File

@ -1,6 +1,6 @@
import warnings import warnings
from enum import Enum from enum import Enum
from typing import Any, List, Optional from typing import Any, List, Optional, Union
import pydantic import pydantic
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -13,6 +13,24 @@ warnings.filterwarnings(
) )
class AudioVolumeDefaults:
    """Constants holding the default audio volume settings.

    Collected in a single class so the same values can be referenced
    everywhere, with the aim of keeping volume configuration consistent.
    """

    # Allowed volume range: lower and upper bound.
    MIN_VOLUME, MAX_VOLUME = 0.0, 1.0

    # Default volume for the voice track and for TTS output.
    VOICE_VOLUME = TTS_VOLUME = 1.0

    # Default volume for the original audio of the source video.
    # The original author marks this value as the key part of the bug fix.
    ORIGINAL_VOLUME = 0.7

    # Default volume for background music.
    BGM_VOLUME = 0.3
class VideoConcatMode(str, Enum): class VideoConcatMode(str, Enum):
random = "random" random = "random"
sequential = "sequential" sequential = "sequential"
@ -101,7 +119,7 @@ class VideoParams(BaseModel):
video_subject: str video_subject: str
video_script: str = "" # 用于生成视频的脚本 video_script: str = "" # 用于生成视频的脚本
video_terms: Optional[str | list] = None # 用于生成视频的关键词 video_terms: Optional[Union[str, list]] = None # 用于生成视频的关键词
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
video_clip_duration: Optional[int] = 5 video_clip_duration: Optional[int] = 5
@ -113,11 +131,11 @@ class VideoParams(BaseModel):
video_language: Optional[str] = "" # auto detect video_language: Optional[str] = "" # auto detect
voice_name: Optional[str] = "" voice_name: Optional[str] = ""
voice_volume: Optional[float] = 1.0 voice_volume: Optional[float] = AudioVolumeDefaults.VOICE_VOLUME
voice_rate: Optional[float] = 1.0 voice_rate: Optional[float] = 1.0
bgm_type: Optional[str] = "random" bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = "" bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2 bgm_volume: Optional[float] = AudioVolumeDefaults.BGM_VOLUME
subtitle_enabled: Optional[bool] = True subtitle_enabled: Optional[bool] = True
subtitle_position: Optional[str] = "bottom" # top, bottom, center subtitle_position: Optional[str] = "bottom" # top, bottom, center
@ -157,11 +175,11 @@ class AudioRequest(BaseModel):
video_script: str video_script: str
video_language: Optional[str] = "" video_language: Optional[str] = ""
voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female" voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
voice_volume: Optional[float] = 1.0 voice_volume: Optional[float] = AudioVolumeDefaults.VOICE_VOLUME
voice_rate: Optional[float] = 1.2 voice_rate: Optional[float] = 1.2
bgm_type: Optional[str] = "random" bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = "" bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2 bgm_volume: Optional[float] = AudioVolumeDefaults.BGM_VOLUME
video_source: Optional[str] = "local" video_source: Optional[str] = "local"
@ -347,7 +365,7 @@ class VideoClipParams(BaseModel):
# video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value # video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称") voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
voice_volume: Optional[float] = Field(default=1.0, description="解说语音音量") voice_volume: Optional[float] = Field(default=AudioVolumeDefaults.VOICE_VOLUME, description="解说语音音量")
voice_rate: Optional[float] = Field(default=1.0, description="语速") voice_rate: Optional[float] = Field(default=1.0, description="语速")
voice_pitch: Optional[float] = Field(default=1.0, description="语调") voice_pitch: Optional[float] = Field(default=1.0, description="语调")
@ -367,9 +385,9 @@ class VideoClipParams(BaseModel):
n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度 n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度
tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量(后处理)") tts_volume: Optional[float] = Field(default=AudioVolumeDefaults.TTS_VOLUME, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=1.0, description="视频原声音量") original_volume: Optional[float] = Field(default=AudioVolumeDefaults.ORIGINAL_VOLUME, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=0.3, description="背景音乐音量") bgm_volume: Optional[float] = Field(default=AudioVolumeDefaults.BGM_VOLUME, description="背景音乐音量")
class VideoTranscriptionRequest(BaseModel): class VideoTranscriptionRequest(BaseModel):

View File

@ -24,6 +24,7 @@ from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont from PIL import ImageFont
from app.utils import utils from app.utils import utils
from app.models.schema import AudioVolumeDefaults
def merge_materials( def merge_materials(
@ -66,11 +67,12 @@ def merge_materials(
if options is None: if options is None:
options = {} options = {}
# 设置默认参数值 # 设置默认参数值 - 使用统一的音量配置
voice_volume = options.get('voice_volume', 1.0) voice_volume = options.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME)
bgm_volume = options.get('bgm_volume', 0.3) bgm_volume = options.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME)
original_audio_volume = options.get('original_audio_volume', 0.0) # 默认为0即不保留原声 # 修复bug: 将原声音量默认值从0.0改为0.7,确保短剧解说模式下原片音量正常
keep_original_audio = options.get('keep_original_audio', False) # 是否保留原声 original_audio_volume = options.get('original_audio_volume', AudioVolumeDefaults.ORIGINAL_VOLUME)
keep_original_audio = options.get('keep_original_audio', True) # 默认保留原声
subtitle_font = options.get('subtitle_font', '') subtitle_font = options.get('subtitle_font', '')
subtitle_font_size = options.get('subtitle_font_size', 40) subtitle_font_size = options.get('subtitle_font_size', 40)
subtitle_color = options.get('subtitle_color', '#FFFFFF') subtitle_color = options.get('subtitle_color', '#FFFFFF')
@ -82,6 +84,24 @@ def merge_materials(
threads = options.get('threads', 2) threads = options.get('threads', 2)
fps = options.get('fps', 30) fps = options.get('fps', 30)
# 音量配置日志 - 便于调试音量问题
logger.info(f"音量配置详情:")
logger.info(f" - 配音音量: {voice_volume}")
logger.info(f" - 背景音乐音量: {bgm_volume}")
logger.info(f" - 原声音量: {original_audio_volume}")
logger.info(f" - 是否保留原声: {keep_original_audio}")
# 音量参数验证
def validate_volume(volume, name):
    """Limit a volume value to the supported range.

    Args:
        volume: The volume value to check.
        name: Label for this volume; used as the prefix of the warning text.

    Returns:
        ``volume`` unchanged when it lies between
        ``AudioVolumeDefaults.MIN_VOLUME`` and
        ``AudioVolumeDefaults.MAX_VOLUME`` (inclusive); otherwise the value
        limited to that range, after a warning has been logged.
    """
    lower_bound = AudioVolumeDefaults.MIN_VOLUME
    upper_bound = AudioVolumeDefaults.MAX_VOLUME

    # Guard clause: a value already inside the range is returned as-is.
    if lower_bound <= volume <= upper_bound:
        return volume

    logger.warning(f"{name}音量 {volume} 超出有效范围 [{AudioVolumeDefaults.MIN_VOLUME}, {AudioVolumeDefaults.MAX_VOLUME}],将被限制")
    capped_above = min(volume, upper_bound)
    return max(lower_bound, capped_above)
voice_volume = validate_volume(voice_volume, "配音")
bgm_volume = validate_volume(bgm_volume, "背景音乐")
original_audio_volume = validate_volume(original_audio_volume, "原声")
# 处理透明背景色问题 - MoviePy 2.1.1不支持'transparent'值 # 处理透明背景色问题 - MoviePy 2.1.1不支持'transparent'值
if subtitle_bg_color == 'transparent': if subtitle_bg_color == 'transparent':
subtitle_bg_color = None # None在新版MoviePy中表示透明背景 subtitle_bg_color = None # None在新版MoviePy中表示透明背景

View File

@ -314,24 +314,35 @@ def generate_video_v3(
audio_clips = [] audio_clips = []
# 添加原声(设置音量) # 添加原声(设置音量)
logger.debug(f"音量配置: {volume_config}") logger.info(f"音量配置详情: {volume_config}")
if video.audio is not None: if video.audio is not None:
original_audio = video.audio.volumex(volume_config['original']) original_volume = volume_config['original']
logger.info(f"应用原声音量: {original_volume}")
original_audio = video.audio.volumex(original_volume)
audio_clips.append(original_audio) audio_clips.append(original_audio)
logger.info("原声音频已添加到合成列表")
else:
logger.warning("视频没有音轨,无法添加原声")
# 添加BGM如果提供 # 添加BGM如果提供
if bgm_path: if bgm_path:
logger.info(f"添加背景音乐: {bgm_path}")
bgm = AudioFileClip(bgm_path) bgm = AudioFileClip(bgm_path)
if bgm.duration < video.duration: if bgm.duration < video.duration:
bgm = loop_audio_clip(bgm, video.duration) bgm = loop_audio_clip(bgm, video.duration)
else: else:
bgm = bgm.subclip(0, video.duration) bgm = bgm.subclip(0, video.duration)
bgm = bgm.volumex(volume_config['bgm']) bgm_volume = volume_config['bgm']
logger.info(f"应用BGM音量: {bgm_volume}")
bgm = bgm.volumex(bgm_volume)
audio_clips.append(bgm) audio_clips.append(bgm)
# 添加解说音频(如果提供) # 添加解说音频(如果提供)
if narration_path: if narration_path:
narration = AudioFileClip(narration_path).volumex(volume_config['narration']) logger.info(f"添加解说音频: {narration_path}")
narration_volume = volume_config['narration']
logger.info(f"应用解说音量: {narration_volume}")
narration = AudioFileClip(narration_path).volumex(narration_volume)
audio_clips.append(narration) audio_clips.append(narration)
# 合成最终视频(包含字幕) # 合成最终视频(包含字幕)
@ -342,8 +353,12 @@ def generate_video_v3(
final_video = video final_video = video
if audio_clips: if audio_clips:
logger.info(f"合成音频轨道,共 {len(audio_clips)} 个音频片段")
final_audio = CompositeAudioClip(audio_clips) final_audio = CompositeAudioClip(audio_clips)
final_video = final_video.set_audio(final_audio) final_video = final_video.set_audio(final_audio)
logger.info("音频合成完成")
else:
logger.warning("没有音频轨道需要合成")
# 导出视频 # 导出视频
logger.info("开始导出视频...") # 调试信息 logger.info("开始导出视频...") # 调试信息

View File

@ -3,6 +3,7 @@ import os
from uuid import uuid4 from uuid import uuid4
from app.config import config from app.config import config
from app.services import voice from app.services import voice
from app.models.schema import AudioVolumeDefaults
from app.utils import utils from app.utils import utils
from webui.utils.cache import get_songs_cache from webui.utils.cache import get_songs_cache
@ -94,12 +95,12 @@ def render_azure_v2_settings(tr):
def render_voice_parameters(tr): def render_voice_parameters(tr):
"""渲染语音参数设置""" """渲染语音参数设置"""
# 音量 # 音量 - 使用统一的默认值
voice_volume = st.slider( voice_volume = st.slider(
tr("Speech Volume"), tr("Speech Volume"),
min_value=0.0, min_value=AudioVolumeDefaults.MIN_VOLUME,
max_value=1.0, max_value=AudioVolumeDefaults.MAX_VOLUME,
value=1.0, value=AudioVolumeDefaults.VOICE_VOLUME,
step=0.01, step=0.01,
help=tr("Adjust the volume of the original audio") help=tr("Adjust the volume of the original audio")
) )
@ -187,12 +188,12 @@ def render_bgm_settings(tr):
if custom_bgm_file and os.path.exists(custom_bgm_file): if custom_bgm_file and os.path.exists(custom_bgm_file):
st.session_state['bgm_file'] = custom_bgm_file st.session_state['bgm_file'] = custom_bgm_file
# 背景音乐音量 # 背景音乐音量 - 使用统一的默认值
bgm_volume = st.slider( bgm_volume = st.slider(
tr("Background Music Volume"), tr("Background Music Volume"),
min_value=0.0, min_value=AudioVolumeDefaults.MIN_VOLUME,
max_value=1.0, max_value=AudioVolumeDefaults.MAX_VOLUME,
value=0.3, value=AudioVolumeDefaults.BGM_VOLUME,
step=0.01, step=0.01,
help=tr("Adjust the volume of the original audio") help=tr("Adjust the volume of the original audio")
) )
@ -203,10 +204,10 @@ def get_audio_params():
"""获取音频参数""" """获取音频参数"""
return { return {
'voice_name': config.ui.get("voice_name", ""), 'voice_name': config.ui.get("voice_name", ""),
'voice_volume': st.session_state.get('voice_volume', 1.0), 'voice_volume': st.session_state.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME),
'voice_rate': st.session_state.get('voice_rate', 1.0), 'voice_rate': st.session_state.get('voice_rate', 1.0),
'voice_pitch': st.session_state.get('voice_pitch', 1.0), 'voice_pitch': st.session_state.get('voice_pitch', 1.0),
'bgm_type': st.session_state.get('bgm_type', 'random'), 'bgm_type': st.session_state.get('bgm_type', 'random'),
'bgm_file': st.session_state.get('bgm_file', ''), 'bgm_file': st.session_state.get('bgm_file', ''),
'bgm_volume': st.session_state.get('bgm_volume', 0.3), 'bgm_volume': st.session_state.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME),
} }

View File

@ -1,5 +1,5 @@
import streamlit as st import streamlit as st
from app.models.schema import VideoClipParams, VideoAspect from app.models.schema import VideoClipParams, VideoAspect, AudioVolumeDefaults
def render_video_panel(tr): def render_video_panel(tr):
@ -41,12 +41,12 @@ def render_video_config(tr, params):
) )
st.session_state['video_quality'] = video_qualities[quality_index][1] st.session_state['video_quality'] = video_qualities[quality_index][1]
# 原声音量 # 原声音量 - 使用统一的默认值
params.original_volume = st.slider( params.original_volume = st.slider(
tr("Original Volume"), tr("Original Volume"),
min_value=0.0, min_value=AudioVolumeDefaults.MIN_VOLUME,
max_value=1.0, max_value=AudioVolumeDefaults.MAX_VOLUME,
value=0.7, value=AudioVolumeDefaults.ORIGINAL_VOLUME,
step=0.01, step=0.01,
help=tr("Adjust the volume of the original audio") help=tr("Adjust the volume of the original audio")
) )
@ -58,5 +58,5 @@ def get_video_params():
return { return {
'video_aspect': st.session_state.get('video_aspect', VideoAspect.portrait.value), 'video_aspect': st.session_state.get('video_aspect', VideoAspect.portrait.value),
'video_quality': st.session_state.get('video_quality', '1080p'), 'video_quality': st.session_state.get('video_quality', '1080p'),
'original_volume': st.session_state.get('original_volume', 0.7) 'original_volume': st.session_state.get('original_volume', AudioVolumeDefaults.ORIGINAL_VOLUME)
} }