diff --git a/app/models/schema.py b/app/models/schema.py
index 64e0cb6..9d0c5d4 100644
--- a/app/models/schema.py
+++ b/app/models/schema.py
@@ -347,6 +347,7 @@ class VideoClipParams(BaseModel):
     voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
     voice_volume: Optional[float] = Field(default=1.0, description="语音音量")
     voice_rate: Optional[float] = Field(default=1.0, description="语速")
+    voice_pitch: Optional[float] = Field(default=1.0, description="语调")
 
     bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
     bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
diff --git a/app/services/voice.py b/app/services/voice.py
index cf14578..aebff2d 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -1032,11 +1032,11 @@ def is_azure_v2_voice(voice_name: str):
 
 
 def tts(
-    text: str, voice_name: str, voice_rate: float, voice_file: str
+    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> [SubMaker, None]:
     # if is_azure_v2_voice(voice_name):
     #     return azure_tts_v2(text, voice_name, voice_file)
-    return azure_tts_v1(text, voice_name, voice_rate, voice_file)
+    return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
 
 
 def convert_rate_to_percent(rate: float) -> str:
@@ -1049,18 +1049,29 @@ def convert_rate_to_percent(rate: float) -> str:
     return f"{percent}%"
 
 
+def convert_pitch_to_hz(pitch: float) -> str:
+    if pitch == 1.0:
+        return "+0Hz"
+    offset = round((pitch - 1.0) * 100)
+    if offset > 0:
+        return f"+{offset}Hz"
+    else:
+        return f"{offset}Hz"
+
+
 def azure_tts_v1(
-    text: str, voice_name: str, voice_rate: float, voice_file: str
+    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> [SubMaker, None]:
     voice_name = parse_voice_name(voice_name)
     text = text.strip()
     rate_str = convert_rate_to_percent(voice_rate)
+    pitch_str = convert_pitch_to_hz(voice_pitch)
     for i in range(3):
         try:
             logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
 
             async def _do() -> SubMaker:
-                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, proxy="http://127.0.0.1:7890")
+                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
                 sub_maker = edge_tts.SubMaker()
                 with open(voice_file, "wb") as file:
                     async for chunk in communicate.stream():
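A note on the helper added above: the UI passes pitch as a multiplier (0.8–2.0, mirroring `voice_rate`), but edge-tts expects `pitch` as a signed Hz-offset string such as "+20Hz", so this linear mapping raises a UI value of 1.2 by a flat 20 Hz, not by 20 percent. A minimal standalone sketch of the mapping, runnable as a quick sanity check:

```python
def convert_pitch_to_hz(pitch: float) -> str:
    """Map the UI's pitch multiplier to the signed Hz-offset string edge-tts expects."""
    if pitch == 1.0:
        return "+0Hz"
    offset = round((pitch - 1.0) * 100)
    return f"+{offset}Hz" if offset > 0 else f"{offset}Hz"


# The values below are among the selectbox options added in webui.py.
assert convert_pitch_to_hz(1.0) == "+0Hz"
assert convert_pitch_to_hz(1.2) == "+20Hz"
assert convert_pitch_to_hz(0.8) == "-20Hz"
assert convert_pitch_to_hz(2.0) == "+100Hz"
```

If a perceptually scaled shift is ever wanted, only this mapping needs to change; the call site already consumes an arbitrary Hz string.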
diff --git a/config.example.toml b/config.example.toml
index 1557101..c8cfaa6 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -1,11 +1,12 @@
 [app]
-    project_version="0.2.0"
-    # 如果你没有 OPENAI API Key,可以使用 g4f 代替,或者使用国内的 Moonshot API
-    # If you don't have an OPENAI API Key, you can use g4f instead
+    project_version="0.2.2"
+    # 支持视频理解的大模型提供商
+    # gemini
+    # qwen2-vl (待增加)
     video_llm_provider="gemini"
 
-    # 支持的提供商 (Supported providers):
-    # openai
+    # 用于生成文案的大模型支持的提供商 (Supported providers):
+    # openai (默认)
     # moonshot (月之暗面)
     # oneapi
     # g4f
@@ -13,8 +14,6 @@
     # qwen (通义千问)
     # gemini
     llm_provider="openai"
-    # 支持多模态视频理解能力的大模型
-
     ########## Ollama Settings
     # No need to set it unless you want to use your own proxy
     ollama_base_url = ""
@@ -27,7 +26,7 @@
     # No need to set it unless you want to use your own proxy
     openai_base_url = ""
     # Check your available models at https://platform.openai.com/account/limits
-    openai_model_name = "gpt-4-turbo"
+    openai_model_name = "gpt-4o"
 
     ########## Moonshot API Key
     # Visit https://platform.moonshot.cn/console/api-keys to get your API key.
@@ -56,7 +55,7 @@
 
     ########## Gemini API Key
     gemini_api_key=""
-    gemini_model_name = "gemini-1.5-flash"
+    gemini_model_name = "gemini-1.5-pro"
 
    ########## Qwen API Key
     # Visit https://dashscope.console.aliyun.com/apiKey to get your API key
@@ -66,29 +65,23 @@
     qwen_api_key = ""
     qwen_model_name = "qwen-max"
-
     ########## DeepSeek API Key
     # Visit https://platform.deepseek.com/api_keys to get your API key
     deepseek_api_key = ""
     deepseek_base_url = "https://api.deepseek.com"
     deepseek_model_name = "deepseek-chat"
 
-    # Subtitle Provider, "whisper"
-    # If empty, the subtitle will not be generated
+    # 字幕提供商,可选,支持 "whisper" 和 "faster-whisper-large-v2"
+    # 默认为 "faster-whisper-large-v2",模型地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2
     subtitle_provider = "faster-whisper-large-v2"
     subtitle_enabled = true
 
-    #
-    # ImageMagick
-    #
-    # Once you have installed it, ImageMagick will be automatically detected, except on Windows!
-    # On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
-    # Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
-
+    # 安装后,将自动检测到 ImageMagick,Windows 除外!
+    # 例如,在 Windows 上 "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
+    # 下载位置 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
     # imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
-
-    #
     # FFMPEG
     #
     # 通常情况下,ffmpeg 会被自动下载,并且会被自动检测到。
@@ -97,12 +90,6 @@
     # Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
     # 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path,下载地址:https://www.gyan.dev/ffmpeg/builds/
 
-    # Under normal circumstances, ffmpeg is downloaded automatically and detected automatically.
-    # However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error:
-    # RuntimeError: No ffmpeg exe could be found.
-    # Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
-    # In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/
-
     # ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
 
 #########################################################################################
@@ -132,7 +119,7 @@
     material_directory = ""
 
-    # Used for state management of the task
+    # 用于任务的状态管理
     enable_redis = false
     redis_host = "localhost"
     redis_port = 6379
@@ -143,7 +130,6 @@
     max_concurrent_tasks = 5
 
     # webui界面是否显示配置项
-    # webui hide baisc config panel
     hide_config = false
@@ -161,7 +147,7 @@
     # recommended model_size: "large-v3"
     model_size="faster-whisper-large-v2"
-    # if you want to use GPU, set device="cuda"
+    # 如果要使用 GPU,请设置 device="cuda"
     device="CPU"
     compute_type="int8"
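Related to the voice.py hunk earlier: the hardcoded `proxy="http://127.0.0.1:7890"` is gone, and edge-tts now reads the proxy from the app config. A minimal sketch of the assumed behavior; the `[proxy]` table itself is not part of this diff, and `config.proxy` being exposed as a dict is an assumption based on how the rest of the config is surfaced:

```python
import edge_tts

from app.config import config  # assumption: exposes the optional [proxy] table as a dict

# With no [proxy] table in config.toml, .get("http") returns None and
# edge-tts connects directly, instead of failing on a dead local proxy.
communicate = edge_tts.Communicate(
    "感谢关注 NarratoAI",       # text to synthesize
    "zh-CN-YunjianNeural",      # voice name
    rate="+0%",                 # signed percent string, as built by convert_rate_to_percent
    pitch="+0Hz",               # signed Hz string, as built by convert_pitch_to_hz
    proxy=config.proxy.get("http"),  # e.g. "http://127.0.0.1:7890", or None
)
```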
diff --git a/webui.py b/webui.py
index 7784649..faae899 100644
--- a/webui.py
+++ b/webui.py
@@ -549,37 +549,6 @@ with middle_panel:
         params.voice_name = voice_name
         config.ui["voice_name"] = voice_name
 
-        # 试听语言合成
-        if st.button(tr("Play Voice")):
-            play_content = "感谢关注 NarratoAI,有任何问题或建议,可以关注微信公众号,求助或讨论"
-            if not play_content:
-                play_content = params.video_script
-            if not play_content:
-                play_content = tr("Voice Example")
-            with st.spinner(tr("Synthesizing Voice")):
-                temp_dir = utils.storage_dir("temp", create=True)
-                audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
-                sub_maker = voice.tts(
-                    text=play_content,
-                    voice_name=voice_name,
-                    voice_rate=params.voice_rate,
-                    voice_file=audio_file,
-                )
-                # 如果语音文件生成失败,请使用默认内容重试。
-                if not sub_maker:
-                    play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
-                    sub_maker = voice.tts(
-                        text=play_content,
-                        voice_name=voice_name,
-                        voice_rate=params.voice_rate,
-                        voice_file=audio_file,
-                    )
-
-                if sub_maker and os.path.exists(audio_file):
-                    st.audio(audio_file, format="audio/mp3")
-                    if os.path.exists(audio_file):
-                        os.remove(audio_file)
-
         if voice.is_azure_v2_voice(voice_name):
             saved_azure_speech_region = config.azure.get("speech_region", "")
             saved_azure_speech_key = config.azure.get("speech_key", "")
@@ -604,6 +573,45 @@ with middle_panel:
             index=2,
         )
 
+        params.voice_pitch = st.selectbox(
+            tr("Speech Pitch"),
+            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+            index=2,
+        )
+
+        # 试听语音合成
+        if st.button(tr("Play Voice")):
+            play_content = "感谢关注 NarratoAI,有任何问题或建议,可以关注微信公众号,求助或讨论"
+            if not play_content:
+                play_content = params.video_script
+            if not play_content:
+                play_content = tr("Voice Example")
+            with st.spinner(tr("Synthesizing Voice")):
+                temp_dir = utils.storage_dir("temp", create=True)
+                audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
+                sub_maker = voice.tts(
+                    text=play_content,
+                    voice_name=voice_name,
+                    voice_rate=params.voice_rate,
+                    voice_pitch=params.voice_pitch,
+                    voice_file=audio_file,
+                )
+                # 如果语音文件生成失败,请使用默认内容重试。
+                if not sub_maker:
+                    play_content = "This is an example voice. If you hear this, the voice synthesis failed with the original content."
+                    sub_maker = voice.tts(
+                        text=play_content,
+                        voice_name=voice_name,
+                        voice_rate=params.voice_rate,
+                        voice_pitch=params.voice_pitch,
+                        voice_file=audio_file,
+                    )
+
+                if sub_maker and os.path.exists(audio_file):
+                    st.audio(audio_file, format="audio/mp3")
+                    if os.path.exists(audio_file):
+                        os.remove(audio_file)
+
         bgm_options = [
             (tr("No Background Music"), ""),
             (tr("Random Background Music"), "random"),
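Because `voice_pitch` was inserted before `voice_file` rather than appended, any `voice.tts` caller outside this diff (for example in the task pipeline, if one exists) must be updated as well; keyword arguments make the shift safe. A hypothetical call site for illustration only, assuming a `params` / `audio_file` context like the one above:

```python
# Hypothetical caller: positional callers of voice.tts silently break after
# this change, since the fourth positional argument is now voice_pitch, not
# voice_file. Keyword arguments sidestep the positional shift.
sub_maker = voice.tts(
    text=params.video_script,
    voice_name=params.voice_name,
    voice_rate=params.voice_rate,
    voice_pitch=params.voice_pitch,  # new; VideoClipParams defaults it to 1.0
    voice_file=audio_file,
)
```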
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
index cbad21b..f1bc6b2 100644
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@@ -91,6 +91,7 @@
     "Picture description": "图片描述",
     "Narration": "视频文案",
     "Rebuild": "重新生成",
-    "Video Script Load": "加载视频脚本"
+    "Video Script Load": "加载视频脚本",
+    "Speech Pitch": "语调"
   }
 }
\ No newline at end of file