优化 TTS 新增 proxy 配置，新增语调配置

2026-02-04 04:08:14 +00:00 · 2024-11-05 18:33:46 +08:00 · 2024-11-05 18:33:46 +08:00 · e926e8676a
commit e926e8676a
parent 1d9b27bf46
5 changed files with 72 additions and 65 deletions
--- a/app/models/schema.py
+++ b/app/models/schema.py
@ -347,6 +347,7 @@ class VideoClipParams(BaseModel):
    voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
    voice_volume: Optional[float] = Field(default=1.0, description="语音音量")
    voice_rate: Optional[float] = Field(default=1.0, description="语速")
    voice_pitch: Optional[float] = Field(default=1.0, description="语调")
    bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
    bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -1032,11 +1032,11 @@ def is_azure_v2_voice(voice_name: str):
 def tts(
-    text: str, voice_name: str, voice_rate: float, voice_file: str
+    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> [SubMaker, None]:
    # if is_azure_v2_voice(voice_name):
    #     return azure_tts_v2(text, voice_name, voice_file)
-    return azure_tts_v1(text, voice_name, voice_rate, voice_file)
+    return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
 def convert_rate_to_percent(rate: float) -> str:
@ -1049,18 +1049,29 @@ def convert_rate_to_percent(rate: float) -> str:
        return f"{percent}%"
 def convert_pitch_to_percent(rate: float) -> str:
    """Convert a pitch multiplier into a signed Hz offset string.

    A multiplier of 1.0 means "no change"; each 0.01 above or below 1.0
    becomes +1Hz / -1Hz (e.g. 1.2 -> "+20Hz", 0.8 -> "-20Hz"). Despite the
    function name, the returned unit is Hz, not percent.

    Args:
        rate: Pitch multiplier relative to the default pitch (1.0).

    Returns:
        The offset rounded to a whole number, always carrying an explicit
        sign and the "Hz" suffix.
    """
    offset = round((rate - 1.0) * 100)
    # Always emit the sign, including for zero. Previously a multiplier that
    # rounded to zero without being exactly 1.0 (e.g. 1.001) produced the
    # unsigned "0Hz", unlike every other return path; edge-tts expects a
    # signed value such as "+0Hz".
    return f"{offset:+d}Hz"
 def azure_tts_v1(
-    text: str, voice_name: str, voice_rate: float, voice_file: str
+    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> [SubMaker, None]:
    voice_name = parse_voice_name(voice_name)
    text = text.strip()
    rate_str = convert_rate_to_percent(voice_rate)
    pitch_str = convert_pitch_to_percent(voice_pitch)
    for i in range(3):
        try:
            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
            async def _do() -> SubMaker:
-                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, proxy="http://127.0.0.1:7890")
+                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
                sub_maker = edge_tts.SubMaker()
                with open(voice_file, "wb") as file:
                    async for chunk in communicate.stream():
--- a/config.example.toml
+++ b/config.example.toml
@ -1,11 +1,12 @@
 [app]
-    project_version="0.2.0"
+    project_version="0.2.2"
-    # 如果你没有 OPENAI API Key，可以使用 g4f 代替，或者使用国内的 Moonshot API
+    # 支持视频理解的大模型提供商
-    # If you don't have an OPENAI API Key, you can use g4f instead
+    #   gemini
    #   qwen2-vl (待增加)
    video_llm_provider="gemini"
-    # 支持的提供商 (Supported providers):
+    # 用于生成文案的大模型支持的提供商 (Supported providers):
-    #   openai
+    #   openai (默认)
    #   moonshot (月之暗面)
    #   oneapi
    #   g4f
@ -13,8 +14,6 @@
    #   qwen (通义千问)
    #   gemini
    llm_provider="openai"
    # 支持多模态视频理解能力的大模型
    ########## Ollama Settings
    # No need to set it unless you want to use your own proxy
    ollama_base_url = ""
@ -27,7 +26,7 @@
    # No need to set it unless you want to use your own proxy
    openai_base_url = ""
    # Check your available models at https://platform.openai.com/account/limits
-    openai_model_name = "gpt-4-turbo"
+    openai_model_name = "gpt-4o"
    ########## Moonshot API Key
    # Visit https://platform.moonshot.cn/console/api-keys to get your API key.
@ -56,7 +55,7 @@
    ########## Gemini API Key
    gemini_api_key=""
-    gemini_model_name = "gemini-1.5-flash"
+    gemini_model_name = "gemini-1.5-pro"
    ########## Qwen API Key
    # Visit https://dashscope.console.aliyun.com/apiKey to get your API key
@ -66,29 +65,23 @@
    qwen_api_key = ""
    qwen_model_name = "qwen-max"
    ########## DeepSeek API Key
    # Visit https://platform.deepseek.com/api_keys to get your API key
    deepseek_api_key = ""
    deepseek_base_url = "https://api.deepseek.com"
    deepseek_model_name = "deepseek-chat"
-    # Subtitle Provider, "whisper"
+    # 字幕提供商（可选），支持 whisper 和 faster-whisper-large-v2
-    # If empty, the subtitle will not be generated
+    # 默认为 faster-whisper-large-v2 模型地址：https://huggingface.co/guillaumekln/faster-whisper-large-v2
    subtitle_provider = "faster-whisper-large-v2"
    subtitle_enabled = true
    #
    # ImageMagick
-    #
+    # 安装后，将自动检测到 ImageMagick，Windows 除外！
-    # Once you have installed it, ImageMagick will be automatically detected, except on Windows!
+    # 例如，在 Windows 上 "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
-    # On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
+    # 下载位置 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
    # Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
    # imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
    #
    # FFMPEG
    #
    # 通常情况下，ffmpeg 会被自动下载，并且会被自动检测到。
@ -97,12 +90,6 @@
    #   Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
    # 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path，下载地址：https://www.gyan.dev/ffmpeg/builds/
    # Under normal circumstances, ffmpeg is downloaded automatically and detected automatically.
    # However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error:
    #   RuntimeError: No ffmpeg exe could be found.
    #   Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
    # In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/
    # ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
    #########################################################################################
@ -132,7 +119,7 @@
    material_directory = ""
-    # Used for state management of the task
+    # 用于任务的状态管理
    enable_redis = false
    redis_host = "localhost"
    redis_port = 6379
@ -143,7 +130,6 @@
    max_concurrent_tasks = 5
    # webui界面是否显示配置项
    # webui hide basic config panel
    hide_config = false
@ -161,7 +147,7 @@
    # recommended model_size: "large-v3"
    model_size="faster-whisper-large-v2"
-    # if you want to use GPU, set device="cuda"
+    # 如果要使用 GPU，请设置 device="cuda"
    device="CPU"
    compute_type="int8"
--- a/webui.py
+++ b/webui.py
@ -549,37 +549,6 @@ with middle_panel:
        params.voice_name = voice_name
        config.ui["voice_name"] = voice_name
        # 试听语言合成
        if st.button(tr("Play Voice")):
            play_content = "感谢关注 NarratoAI，有任何问题或建议，可以关注微信公众号，求助或讨论"
            if not play_content:
                play_content = params.video_script
            if not play_content:
                play_content = tr("Voice Example")
            with st.spinner(tr("Synthesizing Voice")):
                temp_dir = utils.storage_dir("temp", create=True)
                audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
                sub_maker = voice.tts(
                    text=play_content,
                    voice_name=voice_name,
                    voice_rate=params.voice_rate,
                    voice_file=audio_file,
                )
                # 如果语音文件生成失败，请使用默认内容重试。
                if not sub_maker:
                    play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
                    sub_maker = voice.tts(
                        text=play_content,
                        voice_name=voice_name,
                        voice_rate=params.voice_rate,
                        voice_file=audio_file,
                    )
                if sub_maker and os.path.exists(audio_file):
                    st.audio(audio_file, format="audio/mp3")
                    if os.path.exists(audio_file):
                        os.remove(audio_file)
        if voice.is_azure_v2_voice(voice_name):
            saved_azure_speech_region = config.azure.get("speech_region", "")
            saved_azure_speech_key = config.azure.get("speech_key", "")
@ -604,6 +573,45 @@ with middle_panel:
            index=2,
        )
        # Pitch selector: the chosen multiplier is stored on params.voice_pitch.
        # index=2 makes 1.0 the preselected option.
        params.voice_pitch = st.selectbox(
            tr("Speech Pitch"),
            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
            index=2,
        )
        # Preview the synthesized voice with the currently selected settings.
        if st.button(tr("Play Voice")):
            # Fixed sample sentence used for the preview.
            play_content = "感谢关注 NarratoAI，有任何问题或建议，可以关注微信公众号，求助或讨论"
            # NOTE(review): play_content is a non-empty literal at this point,
            # so neither of the two fallbacks below can trigger as written.
            if not play_content:
                play_content = params.video_script
            if not play_content:
                play_content = tr("Voice Example")
            with st.spinner(tr("Synthesizing Voice")):
                # Write the preview to a uniquely named mp3 in the "temp" storage dir.
                temp_dir = utils.storage_dir("temp", create=True)
                audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
                sub_maker = voice.tts(
                    text=play_content,
                    voice_name=voice_name,
                    voice_rate=params.voice_rate,
                    voice_pitch=params.voice_pitch,
                    voice_file=audio_file,
                )
                # If synthesis returned nothing, retry once with a default English sentence.
                if not sub_maker:
                    play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
                    sub_maker = voice.tts(
                        text=play_content,
                        voice_name=voice_name,
                        voice_rate=params.voice_rate,
                        voice_pitch=params.voice_pitch,
                        voice_file=audio_file,
                    )
                # Play the result only when synthesis succeeded and the file exists,
                # then delete the temporary file.
                if sub_maker and os.path.exists(audio_file):
                    st.audio(audio_file, format="audio/mp3")
                    # NOTE(review): this existence check repeats the one just above.
                    if os.path.exists(audio_file):
                        os.remove(audio_file)
        bgm_options = [
            (tr("No Background Music"), ""),
            (tr("Random Background Music"), "random"),
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@ -91,6 +91,7 @@
    "Picture description": "图片描述",
    "Narration": "视频文案",
    "Rebuild": "重新生成",
-    "Video Script Load": "加载视频脚本"
+    "Video Script Load": "加载视频脚本",
    "Speech Pitch": "语调"
  }
 }