优化 TTS 新增 proxy 配置,新增语调配置

This commit is contained in:
linyq 2024-11-05 18:33:46 +08:00
parent 1d9b27bf46
commit e926e8676a
5 changed files with 72 additions and 65 deletions

View File

@ -347,6 +347,7 @@ class VideoClipParams(BaseModel):
voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
voice_volume: Optional[float] = Field(default=1.0, description="语音音量")
voice_rate: Optional[float] = Field(default=1.0, description="语速")
voice_pitch: Optional[float] = Field(default=1.0, description="语调")
bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")

View File

@ -1032,11 +1032,11 @@ def is_azure_v2_voice(voice_name: str):
def tts(
text: str, voice_name: str, voice_rate: float, voice_file: str
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
# if is_azure_v2_voice(voice_name):
# return azure_tts_v2(text, voice_name, voice_file)
return azure_tts_v1(text, voice_name, voice_rate, voice_file)
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
def convert_rate_to_percent(rate: float) -> str:
@ -1049,18 +1049,29 @@ def convert_rate_to_percent(rate: float) -> str:
return f"{percent}%"
def convert_pitch_to_percent(rate: float) -> str:
    """Convert a pitch multiplier into a signed ``Hz`` offset string.

    The multiplier is mapped linearly around 1.0: every 0.01 above or below
    1.0 becomes 1 Hz, rounded to the nearest integer.  Despite the function
    name, the returned unit is ``Hz`` (not percent).

    Args:
        rate: Pitch multiplier, where 1.0 means "unchanged".

    Returns:
        A string such as ``"+20Hz"``, ``"-20Hz"`` or ``"+0Hz"``.  The sign is
        always present.

    >>> convert_pitch_to_percent(1.2)
    '+20Hz'
    >>> convert_pitch_to_percent(0.8)
    '-20Hz'
    >>> convert_pitch_to_percent(1.004)
    '+0Hz'
    """
    offset = round((rate - 1.0) * 100)
    # Always emit an explicit sign.  Previously a multiplier that was not
    # exactly 1.0 but rounded to an offset of 0 (e.g. 1.004) produced the
    # unsigned "0Hz", inconsistent with the "+0Hz" returned for exactly 1.0
    # and not a signed pitch value as the TTS call expects.
    return f"{offset:+d}Hz"
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_file: str
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
pitch_str = convert_pitch_to_percent(voice_pitch)
for i in range(3):
try:
logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
async def _do() -> SubMaker:
communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, proxy="http://127.0.0.1:7890")
communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
sub_maker = edge_tts.SubMaker()
with open(voice_file, "wb") as file:
async for chunk in communicate.stream():

View File

@ -1,11 +1,12 @@
[app]
project_version="0.2.0"
# 如果你没有 OPENAI API Key可以使用 g4f 代替,或者使用国内的 Moonshot API
# If you don't have an OPENAI API Key, you can use g4f instead
project_version="0.2.2"
# 支持视频理解的大模型提供商
# gemini
# qwen2-vl (待增加)
video_llm_provider="gemini"
# 支持的提供商 (Supported providers):
# openai
# 用于生成文案的大模型支持的提供商 (Supported providers):
# openai (默认)
# moonshot (月之暗面)
# oneapi
# g4f
@ -13,8 +14,6 @@
# qwen (通义千问)
# gemini
llm_provider="openai"
# 支持多模态视频理解能力的大模型
########## Ollama Settings
# No need to set it unless you want to use your own proxy
ollama_base_url = ""
@ -27,7 +26,7 @@
# No need to set it unless you want to use your own proxy
openai_base_url = ""
# Check your available models at https://platform.openai.com/account/limits
openai_model_name = "gpt-4-turbo"
openai_model_name = "gpt-4o"
########## Moonshot API Key
# Visit https://platform.moonshot.cn/console/api-keys to get your API key.
@ -56,7 +55,7 @@
########## Gemini API Key
gemini_api_key=""
gemini_model_name = "gemini-1.5-flash"
gemini_model_name = "gemini-1.5-pro"
########## Qwen API Key
# Visit https://dashscope.console.aliyun.com/apiKey to get your API key
@ -66,29 +65,23 @@
qwen_api_key = ""
qwen_model_name = "qwen-max"
########## DeepSeek API Key
# Visit https://platform.deepseek.com/api_keys to get your API key
deepseek_api_key = ""
deepseek_base_url = "https://api.deepseek.com"
deepseek_model_name = "deepseek-chat"
# Subtitle Provider, "whisper"
# If empty, the subtitle will not be generated
# 字幕提供商(可选),支持 "whisper" 和 "faster-whisper-large-v2"
# 默认为 faster-whisper-large-v2,模型地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2
subtitle_provider = "faster-whisper-large-v2"
subtitle_enabled = true
#
# ImageMagick
#
# Once you have installed it, ImageMagick will be automatically detected, except on Windows!
# On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# 安装后,将自动检测到 ImageMagick,Windows 除外!
# 例如,在 Windows 上 "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# 下载位置 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
#
# FFMPEG
#
# 通常情况下ffmpeg 会被自动下载,并且会被自动检测到。
@ -97,12 +90,6 @@
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path,下载地址:https://www.gyan.dev/ffmpeg/builds/
# Under normal circumstances, ffmpeg is downloaded automatically and detected automatically.
# However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/
# ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
#########################################################################################
@ -132,7 +119,7 @@
material_directory = ""
# Used for state management of the task
# 用于任务的状态管理
enable_redis = false
redis_host = "localhost"
redis_port = 6379
@ -143,7 +130,6 @@
max_concurrent_tasks = 5
# webui界面是否显示配置项
# webui hide baisc config panel
hide_config = false
@ -161,7 +147,7 @@
# recommended model_size: "large-v3"
model_size="faster-whisper-large-v2"
# if you want to use GPU, set device="cuda"
# 如果要使用 GPU,请设置 device="cuda"
device="CPU"
compute_type="int8"

View File

@ -549,37 +549,6 @@ with middle_panel:
params.voice_name = voice_name
config.ui["voice_name"] = voice_name
# 试听语言合成
if st.button(tr("Play Voice")):
play_content = "感谢关注 NarratoAI有任何问题或建议可以关注微信公众号求助或讨论"
if not play_content:
play_content = params.video_script
if not play_content:
play_content = tr("Voice Example")
with st.spinner(tr("Synthesizing Voice")):
temp_dir = utils.storage_dir("temp", create=True)
audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_file=audio_file,
)
# 如果语音文件生成失败,请使用默认内容重试。
if not sub_maker:
play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker and os.path.exists(audio_file):
st.audio(audio_file, format="audio/mp3")
if os.path.exists(audio_file):
os.remove(audio_file)
if voice.is_azure_v2_voice(voice_name):
saved_azure_speech_region = config.azure.get("speech_region", "")
saved_azure_speech_key = config.azure.get("speech_key", "")
@ -604,6 +573,45 @@ with middle_panel:
index=2,
)
params.voice_pitch = st.selectbox(
tr("Speech Pitch"),
options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
index=2,
)
# 试听语言合成
if st.button(tr("Play Voice")):
play_content = "感谢关注 NarratoAI有任何问题或建议可以关注微信公众号求助或讨论"
if not play_content:
play_content = params.video_script
if not play_content:
play_content = tr("Voice Example")
with st.spinner(tr("Synthesizing Voice")):
temp_dir = utils.storage_dir("temp", create=True)
audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
voice_file=audio_file,
)
# 如果语音文件生成失败,请使用默认内容重试。
if not sub_maker:
play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
voice_file=audio_file,
)
if sub_maker and os.path.exists(audio_file):
st.audio(audio_file, format="audio/mp3")
if os.path.exists(audio_file):
os.remove(audio_file)
bgm_options = [
(tr("No Background Music"), ""),
(tr("Random Background Music"), "random"),

View File

@ -91,6 +91,7 @@
"Picture description": "图片描述",
"Narration": "视频文案",
"Rebuild": "重新生成",
"Video Script Load": "加载视频脚本"
"Video Script Load": "加载视频脚本",
"Speech Pitch": "语调"
}
}