From b15f5807c18a413777183c4f4055cb0db7419461 Mon Sep 17 00:00:00 2001
From: zhifu gao
Date: Tue, 30 Jun 2026 06:15:18 +0800
Subject: [PATCH] Support FunASR OpenAI transcription endpoint

---
Note: this copy was recovered from whitespace-collapsed text. Line breaks,
blank lines and Python indentation were rebuilt to match the hunk line
counts. The indentation of the `yield result_json`, TOML and JSON context
lines is assumed, and the author e-mail address was not recoverable.
Verify against the upstream commit before applying.

 app/services/fun_asr_subtitle.py              | 120 +++++++++++++++---
 .../test_fun_asr_subtitle_unittest.py         |  77 +++++++++++
 config.example.toml                           |   1 +
 webui/i18n/en.json                            |   2 +-
 webui/i18n/zh.json                            |   2 +-
 5 files changed, 182 insertions(+), 20 deletions(-)

diff --git a/app/services/fun_asr_subtitle.py b/app/services/fun_asr_subtitle.py
index f6042d4..4458f81 100644
--- a/app/services/fun_asr_subtitle.py
+++ b/app/services/fun_asr_subtitle.py
@@ -23,6 +23,7 @@ UPLOAD_POLICY_URL = f"{DASHSCOPE_BASE_URL}/api/v1/uploads"
 TRANSCRIPTION_URL = f"{DASHSCOPE_BASE_URL}/api/v1/services/audio/asr/transcription"
 TASK_URL_TEMPLATE = f"{DASHSCOPE_BASE_URL}/api/v1/tasks/{{task_id}}"
 MODEL_NAME = "fun-asr"
+LOCAL_FUN_ASR_OPENAI_MODEL = "sensevoice"
 LOCAL_FUN_ASR_API_URL = "http://127.0.0.1:7860"
 LOCAL_FIRERED_ASR_API_URL = "http://127.0.0.1:7867"
 TERMINAL_FAILED_STATUSES = {"FAILED", "CANCELED", "UNKNOWN"}
@@ -111,18 +112,42 @@ def _local_base_url(api_url: str = "") -> str:
     api_url = _normalize_local_api_url(api_url)
     parsed = urlparse(api_url)
     path = parsed.path.rstrip("/")
-    if path.endswith("/asr"):
-        path = path[:-4].rstrip("/")
+    for suffix in ("/v1/audio/transcriptions", "/v1", "/asr"):
+        if path.endswith(suffix):
+            path = path[: -len(suffix)].rstrip("/")
+            break
     return urlunparse(parsed._replace(path=path, params="", query="", fragment="")).rstrip("/")
 
 
 def _local_asr_url(api_url: str = "") -> str:
     api_url = _normalize_local_api_url(api_url)
-    if urlparse(api_url).path.rstrip("/").endswith("/asr"):
+    path = urlparse(api_url).path.rstrip("/")
+    if path.endswith("/asr"):
         return api_url
+    if path.endswith("/v1") or path.endswith("/v1/audio/transcriptions"):
+        return f"{_local_base_url(api_url)}/asr"
     return f"{api_url}/asr"
 
 
+def _local_openai_transcriptions_url(api_url: str = "") -> str:
+    api_url = _normalize_local_api_url(api_url)
+    path = urlparse(api_url).path.rstrip("/")
+    if path.endswith("/v1/audio/transcriptions"):
+        return api_url
+    if path.endswith("/v1"):
+        return f"{api_url}/audio/transcriptions"
+    return f"{_local_base_url(api_url)}/v1/audio/transcriptions"
+
+
+def _local_fun_asr_prefers_openai(api_url: str = "") -> bool:
+    path = urlparse(_normalize_local_api_url(api_url)).path.rstrip("/")
+    return path.endswith("/v1") or path.endswith("/v1/audio/transcriptions")
+
+
+def _is_not_found_response(response: requests.Response) -> bool:
+    return getattr(response, "status_code", 200) == 404
+
+
 def _absolute_local_download_url(api_url: str, download_url: str) -> str:
     download_url = (download_url or "").strip()
     if not download_url:
@@ -547,27 +572,52 @@ def request_local_fun_asr(
     api_url: str = LOCAL_FUN_ASR_API_URL,
     hotword: str = "",
     enable_spk: Optional[bool] = None,
+    model: str = LOCAL_FUN_ASR_OPENAI_MODEL,
     timeout: float = 600.0,
     session=requests,
 ) -> dict[str, Any]:
-    """Call the local FunASR-Pack `/asr` API and return its JSON result."""
+    """Call the local FunASR-Pack API and return its JSON result."""
     _require_local_file(local_file)
-    data: dict[str, str] = {}
+    rest_data: dict[str, str] = {}
     if hotword.strip():
-        data["hotword"] = hotword.strip()
+        rest_data["hotword"] = hotword.strip()
     if enable_spk is not None:
-        data["enable_spk"] = "true" if enable_spk else "false"
+        rest_data["enable_spk"] = "true" if enable_spk else "false"
 
-    with open(local_file, "rb") as file_obj:
-        files = {"file": (_safe_upload_name(local_file), file_obj)}
-        response = _session_post(
-            session,
-            _local_asr_url(api_url),
-            data=data,
-            files=files,
-            timeout=timeout,
-        )
-    return _local_json(response, "调用本地 FunASR-Pack ASR API")
+    openai_data: dict[str, str] = {
+        "model": (model or LOCAL_FUN_ASR_OPENAI_MODEL).strip() or LOCAL_FUN_ASR_OPENAI_MODEL,
+        "response_format": "verbose_json",
+    }
+    if enable_spk is not None:
+        openai_data["spk"] = "true" if enable_spk else "false"
+
+    rest_url = _local_asr_url(api_url)
+    openai_url = _local_openai_transcriptions_url(api_url)
+    attempts = [
+        (openai_url, openai_data),
+        (rest_url, rest_data),
+    ] if _local_fun_asr_prefers_openai(api_url) else [
+        (rest_url, rest_data),
+        (openai_url, openai_data),
+    ]
+
+    last_response = None
+    for index, (url, data) in enumerate(attempts):
+        with open(local_file, "rb") as file_obj:
+            files = {"file": (_safe_upload_name(local_file), file_obj)}
+            response = _session_post(
+                session,
+                url,
+                data=data,
+                files=files,
+                timeout=timeout,
+            )
+        if index == 0 and _is_not_found_response(response):
+            last_response = response
+            continue
+        return _local_json(response, "调用本地 FunASR-Pack ASR API")
+
+    return _local_json(last_response, "调用本地 FunASR-Pack ASR API")
 
 
 def request_local_firered_asr(
@@ -640,6 +690,40 @@ def _local_result_items(result_json: dict[str, Any]):
     yield result_json
 
 
+def _openai_segment_ms(value: Any, field_name: str) -> float:
+    return _timestamp_ms(value, field_name) * 1000
+
+
+def _blocks_from_openai_segments(result_json: dict[str, Any], max_chars: int) -> list[dict[str, Any]]:
+    segments = result_json.get("segments") or []
+    if not isinstance(segments, list):
+        return []
+
+    blocks: list[dict[str, Any]] = []
+    for segment in segments:
+        if not isinstance(segment, dict):
+            continue
+        text = str(segment.get("text") or "").strip()
+        if not text:
+            continue
+        start = segment.get("start", segment.get("start_time", 0))
+        end = segment.get("end", segment.get("end_time"))
+        start_ms = _openai_segment_ms(start, "openai.segment.start")
+        end_ms = _openai_segment_ms(end, "openai.segment.end") if end is not None else start_ms + 500
+        blocks.extend(
+            _blocks_from_sentence(
+                {
+                    "begin_time": start_ms,
+                    "end_time": end_ms,
+                    "text": text,
+                    "speaker_id": segment.get("speaker"),
+                },
+                max_chars=max_chars,
+            )
+        )
+    return blocks
+
+
 def _blocks_from_local_timestamp(item: dict[str, Any], max_chars: int, max_duration: float) -> list[dict[str, Any]]:
     text = str(item.get("text") or "").strip()
     timestamps = item.get("timestamp") or []
@@ -702,7 +786,7 @@ def local_fun_asr_result_to_srt(
     max_duration: float = 3.5,
 ) -> str:
     """Convert a FunASR-Pack JSON response into SRT when the API SRT is unavailable."""
-    blocks: list[dict[str, Any]] = []
+    blocks = _blocks_from_openai_segments(result_json, max_chars=max_chars)
     for item in _local_result_items(result_json):
         item_blocks = _blocks_from_local_timestamp(item, max_chars, max_duration)
         if not item_blocks:
diff --git a/app/services/test_fun_asr_subtitle_unittest.py b/app/services/test_fun_asr_subtitle_unittest.py
index d5a3ccd..928aad1 100644
--- a/app/services/test_fun_asr_subtitle_unittest.py
+++ b/app/services/test_fun_asr_subtitle_unittest.py
@@ -408,6 +408,69 @@ class LocalFunAsrServiceTests(unittest.TestCase):
         self.assertEqual(123, session.calls[0][2]["timeout"])
         self.assertIn("file", session.calls[0][2]["files"])
 
+    def test_request_local_fun_asr_falls_back_to_openai_transcriptions_on_404(self):
+        class LocalSession:
+            def __init__(self):
+                self.calls = []
+
+            def post(self, url, **kwargs):
+                self.calls.append(("POST", url, kwargs))
+                if url.endswith("/asr"):
+                    return FakeResponse({"detail": "Not Found"}, status_code=404)
+                return FakeResponse(
+                    {
+                        "text": "你好",
+                        "segments": [{"start": 0.0, "end": 1.2, "text": "你好"}],
+                    }
+                )
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            local_file = Path(tmp_dir) / "audio.wav"
+            local_file.write_bytes(b"audio")
+            session = LocalSession()
+
+            result = fasr.request_local_fun_asr(
+                str(local_file),
+                api_url="http://127.0.0.1:7860",
+                enable_spk=True,
+                session=session,
+            )
+
+        self.assertEqual("你好", result["text"])
+        self.assertEqual("http://127.0.0.1:7860/asr", session.calls[0][1])
+        self.assertEqual("http://127.0.0.1:7860/v1/audio/transcriptions", session.calls[1][1])
+        self.assertEqual(
+            {"model": "sensevoice", "response_format": "verbose_json", "spk": "true"},
+            session.calls[1][2]["data"],
+        )
+
+    def test_request_local_fun_asr_prefers_explicit_openai_base_url(self):
+        class LocalSession:
+            def __init__(self):
+                self.calls = []
+
+            def post(self, url, **kwargs):
+                self.calls.append(("POST", url, kwargs))
+                return FakeResponse({"text": "你好"})
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            local_file = Path(tmp_dir) / "audio.wav"
+            local_file.write_bytes(b"audio")
+            session = LocalSession()
+
+            fasr.request_local_fun_asr(
+                str(local_file),
+                api_url="http://127.0.0.1:8000/v1",
+                session=session,
+            )
+
+        self.assertEqual(1, len(session.calls))
+        self.assertEqual("http://127.0.0.1:8000/v1/audio/transcriptions", session.calls[0][1])
+        self.assertEqual(
+            {"model": "sensevoice", "response_format": "verbose_json"},
+            session.calls[0][2]["data"],
+        )
+
     def test_create_with_local_fun_asr_copies_pack_srt_file(self):
         class LocalSession:
             def __init__(self, srt_file):
@@ -480,6 +543,20 @@ class LocalFunAsrServiceTests(unittest.TestCase):
         self.assertIn("00:00:00,000 --> 00:00:00,600\n你好,", srt)
         self.assertIn("世界。", srt)
 
+    def test_local_fun_asr_result_to_srt_uses_openai_segments(self):
+        result = {
+            "text": "你好世界",
+            "segments": [
+                {"start": 1.2, "end": 2.4, "text": "你好"},
+                {"start": 2.4, "end": 3.6, "text": "世界"},
+            ],
+        }
+
+        srt = fasr.local_fun_asr_result_to_srt(result, max_chars=20)
+
+        self.assertIn("00:00:01,200 --> 00:00:02,400\n你好", srt)
+        self.assertIn("00:00:02,400 --> 00:00:03,600\n世界", srt)
+
 
 class LocalFireRedAsrServiceTests(unittest.TestCase):
     def test_request_local_firered_asr_posts_file_and_options(self):
diff --git a/config.example.toml b/config.example.toml
index 5774c1f..c80a3dc 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -129,6 +129,7 @@
     # backend = "local" 使用本地 FunASR-Pack API;backend = "firered" 使用本地 FireRedASR2-AED-Pack API;backend = "bailian" 使用阿里百炼在线 fun-asr
     auto_transcribe_enabled = false
     backend = "local"
+    # 支持填写服务根地址、完整 /asr 地址,或 OpenAI-compatible /v1 地址
     api_url = "http://127.0.0.1:7860"
     firered_api_url = "http://127.0.0.1:7867"
     hotword = ""
diff --git a/webui/i18n/en.json b/webui/i18n/en.json
index dedb3f3..00dedb5 100644
--- a/webui/i18n/en.json
+++ b/webui/i18n/en.json
@@ -483,7 +483,7 @@
     "Auto Transcription FireRed Caption": "After the final video is merged, it will be converted to SRT subtitles through the locally running FireRedASR2-AED-Pack API.",
     "Auto Transcription Online Caption": "After the final video is merged, it will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.",
     "Local FunASR-Pack API URL": "Local FunASR-Pack API URL",
-    "Local FunASR-Pack API URL Help": "For example, http://127.0.0.1:7860. A full /asr endpoint URL is also supported.",
+    "Local FunASR-Pack API URL Help": "For example, http://127.0.0.1:7860. A full /asr, /v1, or /v1/audio/transcriptions URL is also supported.",
     "Local FireRedASR API URL": "Local ASR API URL",
     "Local FireRedASR API URL Help": "For example, http://127.0.0.1:7867. A full /asr endpoint URL is also supported.",
     "Fun-ASR Hotword": "Hotword",
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
index 7ef2f2c..b711fc0 100644
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@@ -422,7 +422,7 @@
     "Auto Transcription FireRed Caption": "将在最终视频合并完成后,通过本机运行的 FireRedASR2-AED-Pack API 生成 SRT 字幕。",
     "Auto Transcription Online Caption": "将在最终视频合并完成后,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。",
     "Local FunASR-Pack API URL": "本地 FunASR-Pack API 地址",
-    "Local FunASR-Pack API URL Help": "例如 http://127.0.0.1:7860;也可以直接填到 /asr 的完整地址。",
+    "Local FunASR-Pack API URL Help": "例如 http://127.0.0.1:7860;也可以直接填写 /asr、/v1 或 /v1/audio/transcriptions 的完整地址。",
     "Local FireRedASR API URL": "本地ASR API 地址",
     "Local FireRedASR API URL Help": "例如 http://127.0.0.1:7867;也可以直接填到 /asr 的完整地址。",
     "Fun-ASR Hotword": "热词",