diff --git a/backend/docs/CONFIGURATION.md b/backend/docs/CONFIGURATION.md index 8cf7df5d7..6d65bf83e 100644 --- a/backend/docs/CONFIGURATION.md +++ b/backend/docs/CONFIGURATION.md @@ -113,7 +113,7 @@ models: base_url: https://api.minimax.io/v1 max_tokens: 4096 temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] - supports_vision: true + supports_vision: false # M2.7 is text-only; M3 supports vision - name: minimax-m2.7-highspeed display_name: MiniMax M2.7 Highspeed @@ -123,7 +123,7 @@ models: base_url: https://api.minimax.io/v1 max_tokens: 4096 temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] - supports_vision: true + supports_vision: false # M2.7 is text-only; M3 supports vision - name: openrouter-gemini-2.5-flash display_name: Gemini 2.5 Flash (OpenRouter) use: langchain_openai:ChatOpenAI diff --git a/backend/packages/harness/deerflow/agents/middlewares/view_image_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/view_image_middleware.py index 37432cd9a..7aa1e6a0f 100644 --- a/backend/packages/harness/deerflow/agents/middlewares/view_image_middleware.py +++ b/backend/packages/harness/deerflow/agents/middlewares/view_image_middleware.py @@ -179,8 +179,10 @@ class ViewImageMiddleware(AgentMiddleware[ViewImageMiddlewareState]): # Create the image details message with text and image content image_content = self._create_image_details_message(state) - # Create a new human message with mixed content (text + images) - human_msg = HumanMessage(content=image_content) + # Create a new human message with mixed content (text + images). This is + # internal context for the model only, so hide it from the chat UI and IM + # channels (matches the other middleware-injected context messages). 
+ human_msg = HumanMessage(content=image_content, additional_kwargs={"hide_from_ui": True}) logger.debug("Injecting image details message with images before LLM call") diff --git a/backend/packages/harness/deerflow/models/patched_minimax.py b/backend/packages/harness/deerflow/models/patched_minimax.py index 44934e2d5..7a7297bc3 100644 --- a/backend/packages/harness/deerflow/models/patched_minimax.py +++ b/backend/packages/harness/deerflow/models/patched_minimax.py @@ -114,8 +114,27 @@ class PatchedChatMiniMax(ChatOpenAI): } else: payload["extra_body"] = {"reasoning_split": True} + self._strip_user_message_names(payload) return payload + @staticmethod + def _strip_user_message_names(payload: dict) -> None: + """Drop the per-message ``name`` field from user-role messages. + + DeerFlow middlewares tag user messages with internal provenance names + (``user-input``, ``summary``, ``loop_warning``, ...). ``langchain_openai`` + serializes those into the OpenAI-compatible request, but MiniMax requires + every user-role ``name`` to be identical and otherwise rejects the request + with ``invalid params, user name must be consistent (2013)``. MiniMax does + not use the per-message author name, so strip it. 
+ """ + messages = payload.get("messages") + if not isinstance(messages, list): + return + for message in messages: + if isinstance(message, dict) and message.get("role") == "user": + message.pop("name", None) + def _convert_chunk_to_generation_chunk( self, chunk: dict, diff --git a/backend/tests/test_model_factory.py b/backend/tests/test_model_factory.py index 562c8874c..87118d7da 100644 --- a/backend/tests/test_model_factory.py +++ b/backend/tests/test_model_factory.py @@ -715,7 +715,7 @@ def test_openai_compatible_provider_multiple_models(monkeypatch): base_url="https://api.minimax.io/v1", api_key="test-key", temperature=1.0, - supports_vision=True, + supports_vision=False, # M2.7 is text-only; M3 supports vision supports_thinking=False, ) cfg = _make_app_config([m1, m2]) diff --git a/backend/tests/test_patched_minimax.py b/backend/tests/test_patched_minimax.py index 3423f3179..3b617fdbd 100644 --- a/backend/tests/test_patched_minimax.py +++ b/backend/tests/test_patched_minimax.py @@ -1,4 +1,4 @@ -from langchain_core.messages import AIMessageChunk, HumanMessage +from langchain_core.messages import AIMessage, AIMessageChunk, HumanMessage, SystemMessage from deerflow.models.patched_minimax import PatchedChatMiniMax @@ -21,6 +21,30 @@ def test_get_request_payload_preserves_thinking_and_forces_reasoning_split(): assert payload["extra_body"]["reasoning_split"] is True +def test_get_request_payload_strips_inconsistent_user_message_names(): + """MiniMax rejects user messages whose `name` fields differ (error 2013). + + DeerFlow middlewares tag user messages with internal provenance names + (e.g. "summary", "user-input", "loop_warning"). langchain serializes those + into the OpenAI-compatible payload, and MiniMax requires every user-role + name to be consistent. Strip them so the request is accepted. 
+ """ + model = _make_model() + + payload = model._get_request_payload( + [ + SystemMessage(content="system"), + HumanMessage(content="older summary", name="summary"), + AIMessage(content="ok"), + HumanMessage(content="latest question", name="user-input"), + ] + ) + + user_messages = [m for m in payload["messages"] if m["role"] == "user"] + assert len(user_messages) == 2 + assert all(m.get("name") is None for m in user_messages) + + def test_create_chat_result_maps_reasoning_details_to_reasoning_content(): model = _make_model() response = { diff --git a/backend/tests/test_setup_wizard.py b/backend/tests/test_setup_wizard.py index 3538289a3..9eecb2eae 100644 --- a/backend/tests/test_setup_wizard.py +++ b/backend/tests/test_setup_wizard.py @@ -54,6 +54,29 @@ class TestProviders: assert providers["deepseek"].use == "deerflow.models.patched_deepseek:PatchedChatDeepSeek" assert providers["volcengine"].extra_config["api_base"] == "https://ark.cn-beijing.volces.com/api/v3" + def test_minimax_vision_is_per_model(self): + """M3 supports vision; M2.7 variants are text-only. + + The provider-level extra_config carries the default (M3) capability, but + extra_config_for() must drop vision when an M2.7 model is selected. + """ + providers = {provider.name: provider for provider in LLM_PROVIDERS} + + for name in ("minimax", "minimax_cn"): + provider = providers[name] + assert provider.extra_config["supports_vision"] is True + assert provider.extra_config_for("MiniMax-M3")["supports_vision"] is True + assert provider.extra_config_for("MiniMax-M2.7")["supports_vision"] is False + assert provider.extra_config_for("MiniMax-M2.7-highspeed")["supports_vision"] is False + # Override must not mutate the shared provider-level config. 
+ assert provider.extra_config["supports_vision"] is True + + def test_extra_config_for_returns_provider_config_without_override(self): + """Providers without per-model overrides return their config unchanged.""" + providers = {provider.name: provider for provider in LLM_PROVIDERS} + openai = providers["openai"] + assert openai.extra_config_for("gpt-5") == openai.extra_config + def test_llm_providers_have_required_fields(self): for p in LLM_PROVIDERS: assert p.name diff --git a/backend/tests/test_view_image_middleware.py b/backend/tests/test_view_image_middleware.py index 280b34e8b..f899dd5ec 100644 --- a/backend/tests/test_view_image_middleware.py +++ b/backend/tests/test_view_image_middleware.py @@ -356,6 +356,9 @@ class TestInjectImageMessage: # Mixed-content payload: list of text + image_url blocks assert isinstance(injected.content, list) assert any(isinstance(b, dict) and b.get("type") == "image_url" for b in injected.content) + # Internal injection: must be hidden from the chat UI (and IM channels), + # like the other middleware-injected context messages. + assert injected.additional_kwargs.get("hide_from_ui") is True class TestBeforeModel: diff --git a/config.example.yaml b/config.example.yaml index 99752cf5e..1c7d2a115 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -279,7 +279,7 @@ models: # Docs: https://platform.minimax.io/docs/api-reference/text-openai-api # - name: minimax-m3 # display_name: MiniMax M3 - # use: langchain_openai:ChatOpenAI + # use: deerflow.models.patched_minimax:PatchedChatMiniMax # model: MiniMax-M3 # api_key: $MINIMAX_API_KEY # base_url: https://api.minimax.io/v1 @@ -289,11 +289,14 @@ models: # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] # supports_vision: true # supports_thinking: true - # # MiniMax inlines its chain-of-thought into `content` as ... - # # (reasoning_split defaults to false), not in a separate reasoning_content - # # field. 
Declare the thinking toggle so non-thinking paths (flash mode, - # # follow-up suggestions, title/memory generation) truly disable reasoning - # # instead of wasting tokens on — and parsing around — inline <think> blocks. + # # PatchedChatMiniMax is the MiniMax adapter: it enables reasoning_split and + # # maps MiniMax's structured reasoning into reasoning_content (the field + # # DeerFlow understands), and it strips the per-message `name` field that + # # DeerFlow middlewares attach — MiniMax rejects requests whose user-message + # # names differ with "user name must be consistent (2013)". Declare the + # # thinking toggle so non-thinking paths (flash mode, follow-up suggestions, + # # title/memory generation) truly disable reasoning instead of spending + # # tokens on it. # when_thinking_enabled: # extra_body: # thinking: @@ -306,9 +309,12 @@ models: # NOTE: M2.x models always think — passing thinking:{type:disabled} has no # effect (per MiniMax docs), so the toggle above is omitted for M2.7. The # follow-up-suggestions endpoint strips inline <think> defensively regardless. + # Still use the PatchedChatMiniMax adapter: it strips the per-message `name` + # field DeerFlow middlewares attach, which MiniMax otherwise rejects with + # "user name must be consistent (2013)". 
# - name: minimax-m2.7 # display_name: MiniMax M2.7 - # use: langchain_openai:ChatOpenAI + # use: deerflow.models.patched_minimax:PatchedChatMiniMax # model: MiniMax-M2.7 # api_key: $MINIMAX_API_KEY # base_url: https://api.minimax.io/v1 @@ -316,12 +322,12 @@ models: # max_retries: 2 # max_tokens: 4096 # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] - # supports_vision: true + # supports_vision: false # M2.7 is text-only; M3 supports vision # supports_thinking: true # - name: minimax-m2.7-highspeed # display_name: MiniMax M2.7 Highspeed - # use: langchain_openai:ChatOpenAI + # use: deerflow.models.patched_minimax:PatchedChatMiniMax # model: MiniMax-M2.7-highspeed # api_key: $MINIMAX_API_KEY # base_url: https://api.minimax.io/v1 @@ -329,7 +335,7 @@ models: # max_retries: 2 # max_tokens: 4096 # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] - # supports_vision: true + # supports_vision: false # M2.7 is text-only; M3 supports vision # supports_thinking: true # Example: MiniMax (OpenAI-compatible) - CN 中国区用户 @@ -337,7 +343,7 @@ models: # Docs: https://platform.minimaxi.com/docs/api-reference/text-openai-api # - name: minimax-m3 # display_name: MiniMax M3 - # use: langchain_openai:ChatOpenAI + # use: deerflow.models.patched_minimax:PatchedChatMiniMax # model: MiniMax-M3 # api_key: $MINIMAX_API_KEY # base_url: https://api.minimaxi.com/v1 @@ -347,11 +353,14 @@ models: # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] # supports_vision: true # supports_thinking: true - # # MiniMax inlines its chain-of-thought into `content` as <think>...</think> - # # (reasoning_split defaults to false), not in a separate reasoning_content - # # field. Declare the thinking toggle so non-thinking paths (flash mode, - # # follow-up suggestions, title/memory generation) truly disable reasoning - # # instead of wasting tokens on — and parsing around — inline <think> blocks. 
+ # # PatchedChatMiniMax is the MiniMax adapter: it enables reasoning_split and + # # maps MiniMax's structured reasoning into reasoning_content (the field + # # DeerFlow understands), and it strips the per-message `name` field that + # # DeerFlow middlewares attach — MiniMax rejects requests whose user-message + # # names differ with "user name must be consistent (2013)". Declare the + # # thinking toggle so non-thinking paths (flash mode, follow-up suggestions, + # # title/memory generation) truly disable reasoning instead of spending + # # tokens on it. # when_thinking_enabled: # extra_body: # thinking: @@ -364,9 +373,12 @@ models: # NOTE: M2.x models always think — passing thinking:{type:disabled} has no # effect (per MiniMax docs), so the toggle above is omitted for M2.7. The # follow-up-suggestions endpoint strips inline <think> defensively regardless. + # Still use the PatchedChatMiniMax adapter: it strips the per-message `name` + # field DeerFlow middlewares attach, which MiniMax otherwise rejects with + # "user name must be consistent (2013)". 
# - name: minimax-m2.7 # display_name: MiniMax M2.7 - # use: langchain_openai:ChatOpenAI + # use: deerflow.models.patched_minimax:PatchedChatMiniMax # model: MiniMax-M2.7 # api_key: $MINIMAX_API_KEY # base_url: https://api.minimaxi.com/v1 @@ -374,12 +386,12 @@ models: # max_retries: 2 # max_tokens: 4096 # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] - # supports_vision: true + # supports_vision: false # M2.7 is text-only; M3 supports vision # supports_thinking: true # - name: minimax-m2.7-highspeed # display_name: MiniMax M2.7 Highspeed - # use: langchain_openai:ChatOpenAI + # use: deerflow.models.patched_minimax:PatchedChatMiniMax # model: MiniMax-M2.7-highspeed # api_key: $MINIMAX_API_KEY # base_url: https://api.minimaxi.com/v1 @@ -387,7 +399,7 @@ models: # max_retries: 2 # max_tokens: 4096 # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] - # supports_vision: true + # supports_vision: false # M2.7 is text-only; M3 supports vision # supports_thinking: true # Example: OpenRouter (OpenAI-compatible) diff --git a/docs/superpowers/plans/2026-06-08-minimax-generation-providers.md b/docs/superpowers/plans/2026-06-08-minimax-generation-providers.md new file mode 100644 index 000000000..c5a047645 --- /dev/null +++ b/docs/superpowers/plans/2026-06-08-minimax-generation-providers.md @@ -0,0 +1,1546 @@ +# MiniMax 接入生成类 Skill 实施计划 + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** 在 image/video/podcast 三个现有 skill 中按环境变量自动接入 MiniMax 作为可选 provider,并用 skill-creator 新建一个 MiniMax 音乐生成 skill。 + +**Architecture:** 每个 skill 是 `skills/public/<skill>/` 下的自包含脚本(`SKILL.md` + `scripts/generate.py`,纯 `requests`)。沙箱内目录隔离,故 MiniMax 代码在每个脚本内各自内联。`generate.py` 顶层用 `_resolve_provider()` 选 provider:`<SKILL>_PROVIDER` 覆盖 > 现有 provider 凭证存在 > `MINIMAX_API_KEY` 回退。测试放仓库根 `tests/skills/`,用 `importlib` 按路径加载脚本并 mock `requests`,不打真实 API。 + +**Tech Stack:** Python 3 + `requests`;测试用 pytest(通过 `uv run --no-project --with pytest --with requests --with Pillow` 运行);新 skill 用 `skills/public/skill-creator/scripts/init_skill.py` 脚手架。 + +**测试运行命令(全程统一用这条):** +```bash +uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/ -v +``` + +**关键事实(来自 MiniMax 官方文档,已核实):** +- Base URL `https://api.minimaxi.com`,Header `Authorization: Bearer $MINIMAX_API_KEY` + `Content-Type: application/json`。 +- 错误判定:响应体 `base_resp.status_code != 0` 即失败。 +- 图像 `POST /v1/image_generation` 同步,`response_format:"base64"` → `data.image_base64[0]`(base64)。参考图放 `subject_reference:[{type:"character",image_file:"data:image/jpeg;base64,..."}]`。 +- 视频三步:`POST /v1/video_generation`→`task_id`;`GET /v1/query/video_generation?task_id`→`status`(`Success`/`Fail`/...)+`file_id`;`GET /v1/files/retrieve?file_id`→`file.download_url`;下载 mp4(download_url 无需鉴权)。参考图放 `first_frame_image`(data URL)。 +- 语音 `POST /v1/t2a_v2` 同步 → `data.audio` 是 **hex** → `bytes.fromhex`。 +- 音乐 `POST /v1/music_generation` 同步 → `data.audio` 是 **hex** → mp3。无歌词非纯音乐时 `lyrics_optimizer:true`;纯音乐 `is_instrumental:true`。 +- 已核实可用 voice_id:`male-qn-qingse`、`female-tianmei`(官方 t2a 文档示例中出现)。 + +--- + +## File Structure + +**新建:** +- `tests/skills/skill_loader.py` — 按路径加载某 skill 的 `generate.py` 为模块。 +- `tests/skills/test_image_generation.py` +- `tests/skills/test_video_generation.py` +- `tests/skills/test_podcast_generation.py` +- `tests/skills/test_music_generation.py` +- `skills/public/music-generation/SKILL.md`(脚手架后替换) +- 
`skills/public/music-generation/scripts/generate.py`(脚手架后替换) + +**修改:** +- `skills/public/image-generation/scripts/generate.py`(整文件替换) +- `skills/public/image-generation/SKILL.md`(追加 MiniMax 说明段) +- `skills/public/video-generation/scripts/generate.py`(整文件替换) +- `skills/public/video-generation/SKILL.md`(追加 MiniMax 说明段) +- `skills/public/podcast-generation/scripts/generate.py`(整文件替换) +- `skills/public/podcast-generation/SKILL.md`(追加 MiniMax 说明段) +- `frontend/src/app/mock/api/skills/route.ts`(新增 music-generation 条目) + +--- + +## Task 0: 测试加载器 + +**Files:** +- Create: `tests/skills/skill_loader.py` + +- [ ] **Step 1: 写加载器** + +`tests/skills/skill_loader.py`: +```python +"""Load a skill's scripts/generate.py as an importable module, by file path. + +Skills live in skills/public//scripts/generate.py and are NOT a package, +so tests load them via importlib. Tests then mock the module's `requests`. +""" +import importlib.util +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load(skill_name: str): + """Return the generate.py module for skills/public/.""" + path = REPO_ROOT / "skills" / "public" / skill_name / "scripts" / "generate.py" + mod_name = skill_name.replace("-", "_") + "_generate" + spec = importlib.util.spec_from_file_location(mod_name, path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class FakeResp: + """Minimal stand-in for requests.Response.""" + + def __init__(self, json_data=None, content=b"", status_code=200): + self._json = json_data if json_data is not None else {} + self.content = content + self.status_code = status_code + + def raise_for_status(self): + if self.status_code >= 400: + raise Exception(f"HTTP {self.status_code}") + + def json(self): + return self._json +``` + +- [ ] **Step 2: 冒烟验证加载器可加载现有脚本** + +Run: +```bash +uv run --no-project --with pytest --with requests --with Pillow python -c "import sys; sys.path.insert(0,'tests/skills'); from 
skill_loader import load; m=load('image-generation'); print('loaded', hasattr(m,'generate_image'))" +``` +Expected: 输出 `loaded True`(注意:此步要求 Task 1 尚未执行也能加载——当前 image generate.py 顶层 `from PIL import Image` 需 Pillow,已在命令里 `--with Pillow`)。 + +- [ ] **Step 3: Commit** + +```bash +git add tests/skills/skill_loader.py +git commit -m "test(skills): add importlib loader + FakeResp for skill tests" +``` + +--- + +## Task 1: image-generation 接入 MiniMax + +**Files:** +- Modify: `skills/public/image-generation/scripts/generate.py`(整文件替换) +- Modify: `skills/public/image-generation/SKILL.md` +- Test: `tests/skills/test_image_generation.py` + +- [ ] **Step 1: 写失败测试** + +`tests/skills/test_image_generation.py`: +```python +import base64 +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +img = load("image-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["GEMINI_API_KEY", "MINIMAX_API_KEY", "IMAGE_GENERATION_PROVIDER", + "MINIMAX_API_HOST", "MINIMAX_IMAGE_MODEL"]: + monkeypatch.delenv(k, raising=False) + + +def test_resolve_prefers_gemini(monkeypatch): + monkeypatch.setenv("GEMINI_API_KEY", "g") + monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", + bool(__import__("os").getenv("GEMINI_API_KEY"))) == "gemini" + + +def test_resolve_falls_back_to_minimax(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", False) == "minimax" + + +def test_resolve_override_wins(monkeypatch): + monkeypatch.setenv("GEMINI_API_KEY", "g") + monkeypatch.setenv("IMAGE_GENERATION_PROVIDER", "MiniMax") + assert img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", True) == "minimax" + + +def test_resolve_errors_when_none(monkeypatch): + with pytest.raises(ValueError): + 
img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", False) + + +def test_minimax_builds_payload_and_writes(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + raw = b"PNGBYTES" + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + return FakeResp({"data": {"image_base64": [base64.b64encode(raw).decode()]}, + "base_resp": {"status_code": 0, "status_msg": "success"}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + out = tmp_path / "o.jpg" + prompt_file = tmp_path / "p.json" + prompt_file.write_text("a red apple", encoding="utf-8") + msg = img.generate_image(str(prompt_file), [], str(out), "16:9") + + assert out.read_bytes() == raw + assert captured["url"].endswith("/v1/image_generation") + assert captured["headers"]["Authorization"] == "Bearer m" + assert captured["json"]["model"] == "image-01" + assert captured["json"]["response_format"] == "base64" + assert captured["json"]["aspect_ratio"] == "16:9" + assert "Successfully generated image" in msg + + +def test_minimax_reference_image_as_data_url(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["json"] = json + return FakeResp({"data": {"image_base64": [base64.b64encode(b"x").decode()]}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + ref = tmp_path / "ref.jpg" + ref.write_bytes(b"\xff\xd8refbytes") + prompt_file = tmp_path / "p.json" + prompt_file.write_text("scene", encoding="utf-8") + img.generate_image(str(prompt_file), [str(ref)], str(tmp_path / "o.jpg"), "1:1") + + subj = captured["json"]["subject_reference"] + assert subj[0]["type"] == "character" + assert subj[0]["image_file"].startswith("data:image/jpeg;base64,") + + +def test_minimax_raises_on_base_resp_error(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", 
"m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"base_resp": {"status_code": 1004, "status_msg": "auth failed"}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + prompt_file = tmp_path / "p.json" + prompt_file.write_text("x", encoding="utf-8") + with pytest.raises(Exception) as e: + img.generate_image(str(prompt_file), [], str(tmp_path / "o.jpg"), "1:1") + assert "1004" in str(e.value) +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_image_generation.py -v` +Expected: FAIL(`_resolve_provider` / minimax 行为尚不存在)。 + +- [ ] **Step 3: 整文件替换 generate.py** + +`skills/public/image-generation/scripts/generate.py`: +```python +import base64 +import os + +import requests + +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" + + +def validate_image(image_path: str) -> bool: + """Validate if an image file can be opened and is not corrupted.""" + from PIL import Image # lazy import: keeps module importable without Pillow + + try: + with Image.open(image_path) as image: + image.verify() + with Image.open(image_path) as image: + image.load() + return True + except Exception as exc: + print(f"Warning: Image '{image_path}' is invalid or corrupted: {exc}") + return False + + +def _resolve_provider(override_env: str, existing_provider: str, has_existing_creds: bool) -> str: + """Pick the generation provider. + + 1. Explicit _PROVIDER override wins. + 2. Otherwise prefer the existing provider when its credentials are present. + 3. Otherwise fall back to MiniMax when MINIMAX_API_KEY is set. + """ + override = os.getenv(override_env) + if override: + return override.strip().lower() + if has_existing_creds: + return existing_provider + if os.getenv("MINIMAX_API_KEY"): + return "minimax" + raise ValueError( + f"No credentials found. Set GEMINI_API_KEY for {existing_provider}, " + f"or MINIMAX_API_KEY for minimax (optionally force with {override_env})." 
+ ) + + +def _minimax_host() -> str: + return os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + + +def _check_base_resp(payload: dict) -> None: + base = payload.get("base_resp") or {} + if base.get("status_code", 0) != 0: + raise Exception( + f"MiniMax error {base.get('status_code')}: {base.get('status_msg')}" + ) + + +def _to_data_url(image_path: str) -> str: + with open(image_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:image/jpeg;base64,{b64}" + + +def _generate_image_minimax( + prompt: str, reference_images: list[str], output_file: str, aspect_ratio: str +) -> str: + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + return "MINIMAX_API_KEY is not set" + body = { + "model": os.getenv("MINIMAX_IMAGE_MODEL", "image-01"), + "prompt": prompt, + "aspect_ratio": aspect_ratio, + "response_format": "base64", + "n": 1, + "prompt_optimizer": True, + } + if reference_images: + body["subject_reference"] = [ + {"type": "character", "image_file": _to_data_url(p)} for p in reference_images + ] + response = requests.post( + f"{_minimax_host()}/v1/image_generation", + headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, + json=body, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + images = (payload.get("data") or {}).get("image_base64") or [] + if not images: + raise Exception("MiniMax returned no image data") + with open(output_file, "wb") as f: + f.write(base64.b64decode(images[0])) + return f"Successfully generated image to {output_file}" + + +def _generate_image_gemini( + prompt: str, reference_images: list[str], output_file: str, aspect_ratio: str +) -> str: + parts = [] + valid_reference_images = [] + for ref_img in reference_images: + if validate_image(ref_img): + valid_reference_images.append(ref_img) + else: + print(f"Skipping invalid reference image: {ref_img}") + if len(valid_reference_images) < len(reference_images): + skipped = 
len(reference_images) - len(valid_reference_images) + print(f"Note: {skipped} reference image(s) were skipped due to validation failure.") + + for reference_image in valid_reference_images: + with open(reference_image, "rb") as f: + image_b64 = base64.b64encode(f.read()).decode("utf-8") + parts.append({"inlineData": {"mimeType": "image/jpeg", "data": image_b64}}) + + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + return "GEMINI_API_KEY is not set" + response = requests.post( + "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent", + headers={"x-goog-api-key": api_key, "Content-Type": "application/json"}, + json={ + "generationConfig": {"imageConfig": {"aspectRatio": aspect_ratio}}, + "contents": [{"parts": [*parts, {"text": prompt}]}], + }, + ) + response.raise_for_status() + data = response.json() + response_parts: list[dict] = data["candidates"][0]["content"]["parts"] + image_parts = [part for part in response_parts if part.get("inlineData", False)] + if len(image_parts) == 1: + base64_image = image_parts[0]["inlineData"]["data"] + with open(output_file, "wb") as f: + f.write(base64.b64decode(base64_image)) + return f"Successfully generated image to {output_file}" + raise Exception("Failed to generate image") + + +def generate_image( + prompt_file: str, + reference_images: list[str], + output_file: str, + aspect_ratio: str = "16:9", +) -> str: + with open(prompt_file, "r", encoding="utf-8") as f: + prompt = f.read() + provider = _resolve_provider( + "IMAGE_GENERATION_PROVIDER", "gemini", bool(os.getenv("GEMINI_API_KEY")) + ) + if provider == "minimax": + return _generate_image_minimax(prompt, reference_images, output_file, aspect_ratio) + return _generate_image_gemini(prompt, reference_images, output_file, aspect_ratio) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Generate images using Gemini or MiniMax API") + parser.add_argument("--prompt-file", 
required=True, help="Absolute path to JSON prompt file") + parser.add_argument("--reference-images", nargs="*", default=[], + help="Absolute paths to reference images (space-separated)") + parser.add_argument("--output-file", required=True, help="Output path for generated image") + parser.add_argument("--aspect-ratio", required=False, default="16:9", + help="Aspect ratio of the generated image") + args = parser.parse_args() + + try: + print(generate_image(args.prompt_file, args.reference_images, + args.output_file, args.aspect_ratio)) + except Exception as e: + print(f"Error while generating image: {e}") +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_image_generation.py -v` +Expected: PASS(7 个用例全过)。 + +- [ ] **Step 5: 更新 SKILL.md(追加 provider 说明)** + +在 `skills/public/image-generation/SKILL.md` 的 `## Notes` 段之前插入新段落: +```markdown +## Providers (Gemini / MiniMax) + +This skill auto-selects the provider by environment variables (no CLI change): + +- `GEMINI_API_KEY` set → use Gemini (default, unchanged). +- Only `MINIMAX_API_KEY` set → use MiniMax (`/v1/image_generation`, model `image-01`). +- Force one explicitly with `IMAGE_GENERATION_PROVIDER=gemini|minimax`. + +MiniMax optional overrides: `MINIMAX_API_HOST` (default `https://api.minimaxi.com`), +`MINIMAX_IMAGE_MODEL` (default `image-01`). Reference images are sent as the MiniMax +`subject_reference` character image. The CLI and `--prompt-file` / `--reference-images` +/ `--output-file` / `--aspect-ratio` arguments are identical for both providers. 
+``` + +- [ ] **Step 6: Commit** + +```bash +git add skills/public/image-generation/scripts/generate.py skills/public/image-generation/SKILL.md tests/skills/test_image_generation.py +git commit -m "feat(image-generation): add MiniMax provider with env auto-detect" +``` + +--- + +## Task 2: video-generation 接入 MiniMax + +**Files:** +- Modify: `skills/public/video-generation/scripts/generate.py`(整文件替换) +- Modify: `skills/public/video-generation/SKILL.md` +- Test: `tests/skills/test_video_generation.py` + +- [ ] **Step 1: 写失败测试** + +`tests/skills/test_video_generation.py`: +```python +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +vid = load("video-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["GEMINI_API_KEY", "MINIMAX_API_KEY", "VIDEO_GENERATION_PROVIDER", + "MINIMAX_API_HOST", "MINIMAX_VIDEO_MODEL"]: + monkeypatch.delenv(k, raising=False) + monkeypatch.setattr(vid.time, "sleep", lambda *_: None) + + +def test_resolve_prefers_gemini(): + assert vid._resolve_provider("VIDEO_GENERATION_PROVIDER", "gemini", True) == "gemini" + + +def test_resolve_falls_back_to_minimax(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert vid._resolve_provider("VIDEO_GENERATION_PROVIDER", "gemini", False) == "minimax" + + +def test_resolve_override(monkeypatch): + monkeypatch.setenv("VIDEO_GENERATION_PROVIDER", "minimax") + assert vid._resolve_provider("VIDEO_GENERATION_PROVIDER", "gemini", True) == "minimax" + + +def test_minimax_full_flow(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + posts = {} + + def fake_post(url, headers=None, json=None, **kw): + posts["url"] = url + posts["json"] = json + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + if url.endswith("/v1/query/video_generation"): + assert 
params["task_id"] == "T1" + return FakeResp({"status": "Success", "file_id": "F1", + "base_resp": {"status_code": 0}}) + if url.endswith("/v1/files/retrieve"): + assert params["file_id"] == "F1" + return FakeResp({"file": {"download_url": "https://dl/v.mp4"}, + "base_resp": {"status_code": 0}}) + return FakeResp(content=b"MP4DATA") # the actual download + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + + out = tmp_path / "v.mp4" + pf = tmp_path / "p.json" + pf.write_text("a cat runs", encoding="utf-8") + msg = vid.generate_video(str(pf), [], str(out), "16:9") + + assert out.read_bytes() == b"MP4DATA" + assert posts["url"].endswith("/v1/video_generation") + assert posts["json"]["model"] == "MiniMax-Hailuo-2.3" + assert "successfully" in msg.lower() + + +def test_minimax_reference_first_frame(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + posts = {} + + def fake_post(url, headers=None, json=None, **kw): + posts["json"] = json + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + if url.endswith("/v1/query/video_generation"): + return FakeResp({"status": "Success", "file_id": "F1", "base_resp": {"status_code": 0}}) + if url.endswith("/v1/files/retrieve"): + return FakeResp({"file": {"download_url": "https://dl/v.mp4"}, "base_resp": {"status_code": 0}}) + return FakeResp(content=b"X") + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + ref = tmp_path / "f.jpg" + ref.write_bytes(b"\xff\xd8img") + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + vid.generate_video(str(pf), [str(ref)], str(tmp_path / "v.mp4"), "16:9") + assert posts["json"]["first_frame_image"].startswith("data:image/jpeg;base64,") + + +def test_minimax_task_fail(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, 
json=None, **kw): + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + return FakeResp({"status": "Fail", "base_resp": {"status_code": 1027, "status_msg": "blocked"}}) + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + with pytest.raises(Exception): + vid.generate_video(str(pf), [], str(tmp_path / "v.mp4"), "16:9") +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_video_generation.py -v` +Expected: FAIL。 + +- [ ] **Step 3: 整文件替换 generate.py** + +`skills/public/video-generation/scripts/generate.py`: +```python +import base64 +import os +import time + +import requests + +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" + + +def _resolve_provider(override_env: str, existing_provider: str, has_existing_creds: bool) -> str: + """Pick the provider: _PROVIDER override > existing creds > MiniMax fallback.""" + override = os.getenv(override_env) + if override: + return override.strip().lower() + if has_existing_creds: + return existing_provider + if os.getenv("MINIMAX_API_KEY"): + return "minimax" + raise ValueError( + f"No credentials found. Set GEMINI_API_KEY for {existing_provider}, " + f"or MINIMAX_API_KEY for minimax (optionally force with {override_env})." 
+ ) + + +def _minimax_host() -> str: + return os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + + +def _check_base_resp(payload: dict) -> None: + base = payload.get("base_resp") or {} + if base.get("status_code", 0) != 0: + raise Exception(f"MiniMax error {base.get('status_code')}: {base.get('status_msg')}") + + +def _to_data_url(image_path: str) -> str: + with open(image_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:image/jpeg;base64,{b64}" + + +def _poll_video_task(host: str, auth: str, task_id: str, + max_attempts: int = 120, interval: int = 3) -> str: + for _ in range(max_attempts): + response = requests.get( + f"{host}/v1/query/video_generation", + headers={"Authorization": auth}, + params={"task_id": task_id}, + ) + response.raise_for_status() + payload = response.json() + status = payload.get("status") + if status == "Success": + return payload["file_id"] + if status == "Fail": + base = payload.get("base_resp") or {} + raise Exception( + f"MiniMax video task {task_id} failed: " + f"{base.get('status_code')} {base.get('status_msg')}" + ) + time.sleep(interval) + raise Exception(f"MiniMax video task {task_id} timed out after {max_attempts} polls") + + +def _retrieve_file_url(host: str, auth: str, file_id: str) -> str: + response = requests.get( + f"{host}/v1/files/retrieve", + headers={"Authorization": auth}, + params={"file_id": file_id}, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + return payload["file"]["download_url"] + + +def _download(url: str, output_file: str) -> None: + response = requests.get(url) + response.raise_for_status() + with open(output_file, "wb") as f: + f.write(response.content) + + +def _generate_video_minimax( + prompt: str, reference_images: list[str], output_file: str +) -> str: + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + return "MINIMAX_API_KEY is not set" + host = _minimax_host() + auth = f"Bearer {api_key}" + body 
= {"model": os.getenv("MINIMAX_VIDEO_MODEL", "MiniMax-Hailuo-2.3"), "prompt": prompt} + if reference_images: + body["first_frame_image"] = _to_data_url(reference_images[0]) + response = requests.post( + f"{host}/v1/video_generation", + headers={"Authorization": auth, "Content-Type": "application/json"}, + json=body, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + task_id = payload["task_id"] + file_id = _poll_video_task(host, auth, task_id) + download_url = _retrieve_file_url(host, auth, file_id) + _download(download_url, output_file) + return f"The video has been generated successfully to {output_file}" + + +def download(url: str, output_file: str): + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + return "GEMINI_API_KEY is not set" + response = requests.get(url, headers={"x-goog-api-key": api_key}) + with open(output_file, "wb") as f: + f.write(response.content) + + +def _generate_video_gemini( + prompt: str, reference_images: list[str], output_file: str +) -> str: + reference_payload = [] + request_json = {"instances": [{"prompt": prompt}]} + for reference_image in reference_images: + with open(reference_image, "rb") as f: + image_b64 = base64.b64encode(f.read()).decode("utf-8") + reference_payload.append( + {"image": {"mimeType": "image/jpeg", "bytesBase64Encoded": image_b64}, + "referenceType": "asset"} + ) + if reference_payload: + request_json["instances"][0]["referenceImages"] = reference_payload + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + return "GEMINI_API_KEY is not set" + response = requests.post( + "https://generativelanguage.googleapis.com/v1beta/models/veo-3.1-generate-preview:predictLongRunning", + headers={"x-goog-api-key": api_key, "Content-Type": "application/json"}, + json=request_json, + ) + data = response.json() + operation_name = data["name"] + while True: + response = requests.get( + f"https://generativelanguage.googleapis.com/v1beta/{operation_name}", + 
headers={"x-goog-api-key": api_key}, + ) + data = response.json() + if data.get("done", False): + sample = data["response"]["generateVideoResponse"]["generatedSamples"][0] + download(sample["video"]["uri"], output_file) + break + time.sleep(3) + return f"The video has been generated successfully to {output_file}" + + +def generate_video( + prompt_file: str, + reference_images: list[str], + output_file: str, + aspect_ratio: str = "16:9", +) -> str: + with open(prompt_file, "r", encoding="utf-8") as f: + prompt = f.read() + provider = _resolve_provider( + "VIDEO_GENERATION_PROVIDER", "gemini", bool(os.getenv("GEMINI_API_KEY")) + ) + if provider == "minimax": + # MiniMax video uses resolution/duration, not aspect_ratio; aspect_ratio ignored. + return _generate_video_minimax(prompt, reference_images, output_file) + return _generate_video_gemini(prompt, reference_images, output_file) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Generate videos using Gemini or MiniMax API") + parser.add_argument("--prompt-file", required=True, help="Absolute path to JSON prompt file") + parser.add_argument("--reference-images", nargs="*", default=[], + help="Absolute paths to reference images (space-separated)") + parser.add_argument("--output-file", required=True, help="Output path for generated video") + parser.add_argument("--aspect-ratio", required=False, default="16:9", + help="Aspect ratio of the generated video (Gemini only)") + args = parser.parse_args() + + try: + print(generate_video(args.prompt_file, args.reference_images, + args.output_file, args.aspect_ratio)) + except Exception as e: + print(f"Error while generating video: {e}") +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_video_generation.py -v` +Expected: PASS(6 个用例全过)。 + +- [ ] **Step 5: 更新 SKILL.md** + +在 `skills/public/video-generation/SKILL.md` 末尾追加: +```markdown +## Providers 
(Gemini / MiniMax) + +Auto-selected by environment variables (CLI unchanged): + +- `GEMINI_API_KEY` set → Gemini Veo (default, unchanged). +- Only `MINIMAX_API_KEY` set → MiniMax video (`/v1/video_generation`, async 3-step poll/download). +- Force with `VIDEO_GENERATION_PROVIDER=gemini|minimax`. + +MiniMax overrides: `MINIMAX_API_HOST` (default `https://api.minimaxi.com`), +`MINIMAX_VIDEO_MODEL` (default `MiniMax-Hailuo-2.3`). The first reference image is used +as MiniMax `first_frame_image`. MiniMax ignores `--aspect-ratio` (it uses resolution/duration). +``` + +- [ ] **Step 6: Commit** + +```bash +git add skills/public/video-generation/scripts/generate.py skills/public/video-generation/SKILL.md tests/skills/test_video_generation.py +git commit -m "feat(video-generation): add MiniMax provider with async poll/download" +``` + +--- + +## Task 3: podcast-generation 接入 MiniMax + +**Files:** +- Modify: `skills/public/podcast-generation/scripts/generate.py`(整文件替换) +- Modify: `skills/public/podcast-generation/SKILL.md` +- Test: `tests/skills/test_podcast_generation.py` + +- [ ] **Step 1: 写失败测试** + +`tests/skills/test_podcast_generation.py`: +```python +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +pod = load("podcast-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["VOLCENGINE_TTS_APPID", "VOLCENGINE_TTS_ACCESS_TOKEN", "VOLCENGINE_TTS_CLUSTER", + "MINIMAX_API_KEY", "PODCAST_GENERATION_PROVIDER", "MINIMAX_API_HOST", + "MINIMAX_TTS_MODEL", "MINIMAX_TTS_VOICE_MALE", "MINIMAX_TTS_VOICE_FEMALE"]: + monkeypatch.delenv(k, raising=False) + + +def test_resolve_prefers_volcengine(monkeypatch): + monkeypatch.setenv("VOLCENGINE_TTS_APPID", "a") + monkeypatch.setenv("VOLCENGINE_TTS_ACCESS_TOKEN", "t") + assert pod._resolve_tts_provider() == "volcengine" + + +def test_resolve_falls_back_to_minimax(monkeypatch): + 
monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert pod._resolve_tts_provider() == "minimax" + + +def test_resolve_override(monkeypatch): + monkeypatch.setenv("VOLCENGINE_TTS_APPID", "a") + monkeypatch.setenv("VOLCENGINE_TTS_ACCESS_TOKEN", "t") + monkeypatch.setenv("PODCAST_GENERATION_PROVIDER", "minimax") + assert pod._resolve_tts_provider() == "minimax" + + +def test_minimax_tts_decodes_hex(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["url"] = url + captured["json"] = json + return FakeResp({"data": {"audio": b"audiobytes".hex(), "status": 2}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_minimax("hello", "male-qn-qingse") + assert out == b"audiobytes" + assert captured["url"].endswith("/v1/t2a_v2") + assert captured["json"]["voice_setting"]["voice_id"] == "male-qn-qingse" + assert captured["json"]["output_format"] == "hex" + + +def test_process_line_minimax_voice_mapping(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + seen = {} + + def fake_tts(text, voice_id): + seen["voice_id"] = voice_id + return b"x" + + monkeypatch.setattr(pod, "text_to_speech_minimax", fake_tts) + line = pod.ScriptLine(speaker="female", paragraph="hi") + idx, audio = pod._process_line((0, line, 1, "minimax")) + assert audio == b"x" + assert seen["voice_id"] == "female-tianmei" + + +def test_generate_podcast_minimax_end_to_end(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"data": {"audio": b"chunk".hex(), "status": 2}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(pod.requests, "post", fake_post) + script = tmp_path / "s.json" + script.write_text( + '{"title":"T","locale":"en","lines":[{"speaker":"male","paragraph":"a"},' + '{"speaker":"female","paragraph":"b"}]}', + encoding="utf-8", + ) + out = 
tmp_path / "o.mp3" + msg = pod.generate_podcast(str(script), str(out), None) + assert out.read_bytes() == b"chunkchunk" + assert "Successfully generated podcast" in msg +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_podcast_generation.py -v` +Expected: FAIL。 + +- [ ] **Step 3: 整文件替换 generate.py** + +`skills/public/podcast-generation/scripts/generate.py`: +```python +import argparse +import base64 +import json +import logging +import os +import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Literal, Optional + +import requests + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" + + +class ScriptLine: + def __init__(self, speaker: Literal["male", "female"] = "male", paragraph: str = ""): + self.speaker = speaker + self.paragraph = paragraph + + +class Script: + def __init__(self, locale: Literal["en", "zh"] = "en", lines: Optional[list[ScriptLine]] = None): + self.locale = locale + self.lines = lines or [] + + @classmethod + def from_dict(cls, data: dict) -> "Script": + script = cls(locale=data.get("locale", "en")) + for line in data.get("lines", []): + script.lines.append( + ScriptLine(speaker=line.get("speaker", "male"), + paragraph=line.get("paragraph", "")) + ) + return script + + +def _resolve_provider(override_env: str, existing_provider: str, has_existing_creds: bool) -> str: + override = os.getenv(override_env) + if override: + return override.strip().lower() + if has_existing_creds: + return existing_provider + if os.getenv("MINIMAX_API_KEY"): + return "minimax" + raise ValueError( + f"No credentials found. Set VOLCENGINE_TTS_APPID + VOLCENGINE_TTS_ACCESS_TOKEN " + f"for {existing_provider}, or MINIMAX_API_KEY for minimax " + f"(optionally force with {override_env})." 
+ ) + + +def _resolve_tts_provider() -> str: + has_volc = bool( + os.getenv("VOLCENGINE_TTS_APPID") and os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN") + ) + return _resolve_provider("PODCAST_GENERATION_PROVIDER", "volcengine", has_volc) + + +def text_to_speech_volcengine(text: str, voice_type: str) -> Optional[bytes]: + """Convert text to speech using Volcengine TTS (returns base64-decoded mp3 bytes).""" + app_id = os.getenv("VOLCENGINE_TTS_APPID") + access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN") + cluster = os.getenv("VOLCENGINE_TTS_CLUSTER", "volcano_tts") + url = "https://openspeech.bytedance.com/api/v1/tts" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer;{access_token}"} + payload = { + "app": {"appid": app_id, "token": "access_token", "cluster": cluster}, + "user": {"uid": "podcast-generator"}, + "audio": {"voice_type": voice_type, "encoding": "mp3", "speed_ratio": 1.2}, + "request": {"reqid": str(uuid.uuid4()), "text": text, + "text_type": "plain", "operation": "query"}, + } + try: + response = requests.post(url, json=payload, headers=headers) + if response.status_code != 200: + logger.error(f"TTS API error: {response.status_code} - {response.text}") + return None + result = response.json() + if result.get("code") != 3000: + logger.error(f"TTS error: {result.get('message')} (code: {result.get('code')})") + return None + audio_data = result.get("data") + if audio_data: + return base64.b64decode(audio_data) + except Exception as e: + logger.error(f"TTS error: {str(e)}") + return None + + +def text_to_speech_minimax(text: str, voice_id: str) -> Optional[bytes]: + """Convert text to speech using MiniMax t2a_v2 (returns hex-decoded mp3 bytes).""" + api_key = os.getenv("MINIMAX_API_KEY") + host = os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + payload = { + "model": os.getenv("MINIMAX_TTS_MODEL", "speech-2.6-hd"), + "text": text, + "voice_setting": {"voice_id": voice_id, "speed": 1.0, "vol": 1.0, "pitch": 0}, + 
"audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "mp3", "channel": 1}, + "output_format": "hex", + } + try: + response = requests.post( + f"{host}/v1/t2a_v2", + headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, + json=payload, + ) + if response.status_code != 200: + logger.error(f"MiniMax TTS error: {response.status_code} - {response.text}") + return None + result = response.json() + if (result.get("base_resp") or {}).get("status_code", 0) != 0: + base = result.get("base_resp") or {} + logger.error(f"MiniMax TTS error {base.get('status_code')}: {base.get('status_msg')}") + return None + audio_hex = (result.get("data") or {}).get("audio") + if audio_hex: + return bytes.fromhex(audio_hex) + except Exception as e: + logger.error(f"MiniMax TTS error: {str(e)}") + return None + + +def _process_line(args: tuple[int, ScriptLine, int, str]) -> tuple[int, Optional[bytes]]: + """Process a single script line for TTS. Returns (index, audio_bytes).""" + i, line, total, provider = args + logger.info(f"Processing line {i + 1}/{total} ({line.speaker}) via {provider}") + if provider == "minimax": + if line.speaker == "male": + voice = os.getenv("MINIMAX_TTS_VOICE_MALE", "male-qn-qingse") + else: + voice = os.getenv("MINIMAX_TTS_VOICE_FEMALE", "female-tianmei") + audio = text_to_speech_minimax(line.paragraph, voice) + else: + if line.speaker == "male": + voice = "zh_male_yangguangqingnian_moon_bigtts" + else: + voice = "zh_female_sajiaonvyou_moon_bigtts" + audio = text_to_speech_volcengine(line.paragraph, voice) + if not audio: + logger.warning(f"Failed to generate audio for line {i + 1}") + return (i, audio) + + +def tts_node(script: Script, max_workers: int = 4) -> list[bytes]: + """Convert script lines to audio chunks using TTS with multi-threading.""" + total = len(script.lines) + if total == 0: + raise ValueError("Script contains no lines to process") + + provider = _resolve_tts_provider() + logger.info(f"Converting script 
to audio using {max_workers} workers (provider={provider})...") + tasks = [(i, line, total, provider) for i, line in enumerate(script.lines)] + + results: dict[int, Optional[bytes]] = {} + failed_indices: list[int] = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_process_line, task): task[0] for task in tasks} + for future in as_completed(futures): + idx, audio = future.result() + results[idx] = audio + if not audio: + failed_indices.append(idx) + + if failed_indices: + logger.warning( + f"Failed to generate audio for {len(failed_indices)}/{total} lines: " + f"line numbers {sorted(i + 1 for i in failed_indices)}" + ) + + audio_chunks = [] + for i in range(total): + audio = results.get(i) + if audio: + audio_chunks.append(audio) + + logger.info(f"Generated {len(audio_chunks)}/{total} audio chunks successfully") + if not audio_chunks: + raise ValueError(f"TTS generation failed for all {total} lines.") + return audio_chunks + + +def mix_audio(audio_chunks: list[bytes]) -> bytes: + """Combine audio chunks into a single audio file.""" + if not audio_chunks: + raise ValueError("No audio chunks to mix - TTS generation may have failed") + output = b"".join(audio_chunks) + if len(output) == 0: + raise ValueError("Mixed audio is empty - TTS generation may have failed") + logger.info(f"Audio mixing complete: {len(output)} bytes") + return output + + +def generate_markdown(script: Script, title: str = "Podcast Script") -> str: + lines = [f"# {title}", ""] + for line in script.lines: + speaker_name = "**Host (Male)**" if line.speaker == "male" else "**Host (Female)**" + lines.append(f"{speaker_name}: {line.paragraph}") + lines.append("") + return "\n".join(lines) + + +def generate_podcast(script_file: str, output_file: str, + transcript_file: Optional[str] = None) -> str: + with open(script_file, "r", encoding="utf-8") as f: + script_json = json.load(f) + if "lines" not in script_json: + raise ValueError( + f"Invalid script 
format: missing 'lines' key. Got keys: {list(script_json.keys())}" + ) + script = Script.from_dict(script_json) + logger.info(f"Loaded script with {len(script.lines)} lines") + + if transcript_file: + title = script_json.get("title", "Podcast Script") + markdown_content = generate_markdown(script, title) + transcript_dir = os.path.dirname(transcript_file) + if transcript_dir: + os.makedirs(transcript_dir, exist_ok=True) + with open(transcript_file, "w", encoding="utf-8") as f: + f.write(markdown_content) + logger.info(f"Generated transcript to {transcript_file}") + + audio_chunks = tts_node(script) + if not audio_chunks: + raise Exception("Failed to generate any audio") + output_audio = mix_audio(audio_chunks) + + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + with open(output_file, "wb") as f: + f.write(output_audio) + + result = f"Successfully generated podcast to {output_file}" + if transcript_file: + result += f" and transcript to {transcript_file}" + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate podcast from script JSON file") + parser.add_argument("--script-file", required=True, help="Absolute path to script JSON file") + parser.add_argument("--output-file", required=True, help="Output path for generated podcast MP3") + parser.add_argument("--transcript-file", required=False, + help="Output path for transcript markdown file (optional)") + args = parser.parse_args() + + try: + result = generate_podcast(args.script_file, args.output_file, args.transcript_file) + print(result) + except Exception as e: + import traceback + print(f"Error generating podcast: {e}") + traceback.print_exc() +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_podcast_generation.py -v` +Expected: PASS(6 个用例全过)。 + +- [ ] **Step 5: 更新 SKILL.md** + +在 `skills/public/podcast-generation/SKILL.md` 末尾追加: 
+```markdown +## Providers (Volcengine / MiniMax) + +Auto-selected by environment variables (CLI unchanged): + +- `VOLCENGINE_TTS_APPID` + `VOLCENGINE_TTS_ACCESS_TOKEN` set → Volcengine TTS (default). +- Only `MINIMAX_API_KEY` set → MiniMax TTS (`/v1/t2a_v2`). +- Force with `PODCAST_GENERATION_PROVIDER=volcengine|minimax`. + +MiniMax overrides: `MINIMAX_API_HOST` (default `https://api.minimaxi.com`), +`MINIMAX_TTS_MODEL` (default `speech-2.6-hd`), `MINIMAX_TTS_VOICE_MALE` +(default `male-qn-qingse`), `MINIMAX_TTS_VOICE_FEMALE` (default `female-tianmei`). +``` + +- [ ] **Step 6: Commit** + +```bash +git add skills/public/podcast-generation/scripts/generate.py skills/public/podcast-generation/SKILL.md tests/skills/test_podcast_generation.py +git commit -m "feat(podcast-generation): add MiniMax t2a_v2 provider with env auto-detect" +``` + +--- + +## Task 4: 新建 music-generation skill(用 skill-creator) + +**Files:** +- Create: `skills/public/music-generation/SKILL.md` +- Create: `skills/public/music-generation/scripts/generate.py` +- Modify: `frontend/src/app/mock/api/skills/route.ts` +- Test: `tests/skills/test_music_generation.py` + +- [ ] **Step 1: 用 skill-creator 脚手架生成骨架** + +Run: +```bash +uv run --no-project --with pytest python skills/public/skill-creator/scripts/init_skill.py music-generation --path skills/public +``` +Expected: 生成 `skills/public/music-generation/`(含 `SKILL.md` 占位 + `scripts/` + `references/` + `assets/`)。随后删除不需要的目录: +```bash +rm -rf skills/public/music-generation/references skills/public/music-generation/assets +rm -f skills/public/music-generation/scripts/example_script.py +``` +(若脚手架生成的示例脚本名不同,删除 `scripts/` 下除将创建的 `generate.py` 外的占位文件。) + +- [ ] **Step 2: 写失败测试** + +`tests/skills/test_music_generation.py`: +```python +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +mus = load("music-generation") + + 
+@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["MINIMAX_API_KEY", "MINIMAX_API_HOST", "MINIMAX_MUSIC_MODEL"]: + monkeypatch.delenv(k, raising=False) + + +def _post_ok(captured): + def fake_post(url, headers=None, json=None, **kw): + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + return FakeResp({"data": {"audio": b"songbytes".hex(), "status": 2}, + "base_resp": {"status_code": 0}}) + return fake_post + + +def test_with_lyrics_payload_and_writes(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"title":"X","prompt":"pop, happy","lyrics":"[verse]\\nla la"}', + encoding="utf-8") + out = tmp_path / "o.mp3" + msg = mus.generate_music(str(spec), str(out)) + assert out.read_bytes() == b"songbytes" + assert captured["url"].endswith("/v1/music_generation") + assert captured["headers"]["Authorization"] == "Bearer m" + assert captured["json"]["model"] == "music-2.6-free" + assert captured["json"]["lyrics"] == "[verse]\nla la" + assert captured["json"]["output_format"] == "hex" + assert "Successfully generated music" in msg + + +def test_instrumental_sets_flag(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"lofi beats","is_instrumental":true}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["is_instrumental"] is True + assert "lyrics" not in captured["json"] + assert "lyrics_optimizer" not in captured["json"] + + +def test_no_lyrics_uses_optimizer(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"sad 
ballad"}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["lyrics_optimizer"] is True + assert "lyrics" not in captured["json"] + + +def test_model_override(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + monkeypatch.setenv("MINIMAX_MUSIC_MODEL", "music-2.6") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"jazz","lyrics":"[verse]\\nhi"}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["model"] == "music-2.6" + + +def test_raises_on_base_resp_error(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"base_resp": {"status_code": 1008, "status_msg": "no balance"}}) + + monkeypatch.setattr(mus.requests, "post", fake_post) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"x","lyrics":"[verse]\\ny"}', encoding="utf-8") + with pytest.raises(Exception) as e: + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert "1008" in str(e.value) + + +def test_missing_api_key_returns_message(monkeypatch, tmp_path): + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"x"}', encoding="utf-8") + msg = mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert "MINIMAX_API_KEY" in msg +``` + +- [ ] **Step 3: 运行测试确认失败** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_music_generation.py -v` +Expected: FAIL(`generate_music` 不存在)。 + +- [ ] **Step 4: 写实现 generate.py** + +`skills/public/music-generation/scripts/generate.py`: +```python +import argparse +import json +import os + +import requests + +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" + + +def _check_base_resp(payload: dict) -> None: + base = payload.get("base_resp") or {} + if base.get("status_code", 0) != 0: + raise Exception(f"MiniMax error 
{base.get('status_code')}: {base.get('status_msg')}") + + +def generate_music(prompt_file: str, output_file: str) -> str: + """Generate a song from a JSON spec via MiniMax /v1/music_generation. + + Spec JSON: {"title": str, "prompt": str, "lyrics"?: str, "is_instrumental"?: bool} + - lyrics given -> use them (supports [Verse]/[Chorus] structure tags, \\n lines) + - is_instrumental true -> pure music, no lyrics needed + - otherwise -> lyrics_optimizer auto-writes lyrics from prompt + """ + with open(prompt_file, "r", encoding="utf-8") as f: + spec = json.load(f) + + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + return "MINIMAX_API_KEY is not set" + + prompt = spec.get("prompt", "") + lyrics = spec.get("lyrics") + is_instrumental = bool(spec.get("is_instrumental", False)) + + body = { + "model": os.getenv("MINIMAX_MUSIC_MODEL", "music-2.6-free"), + "prompt": prompt, + "output_format": "hex", + "audio_setting": {"sample_rate": 44100, "bitrate": 256000, "format": "mp3"}, + } + if lyrics: + body["lyrics"] = lyrics + elif is_instrumental: + body["is_instrumental"] = True + else: + body["lyrics_optimizer"] = True + + host = os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + response = requests.post( + f"{host}/v1/music_generation", + headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, + json=body, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + audio_hex = (payload.get("data") or {}).get("audio") + if not audio_hex: + raise Exception("MiniMax returned no audio data") + + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + with open(output_file, "wb") as f: + f.write(bytes.fromhex(audio_hex)) + return f"Successfully generated music to {output_file}" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate music using MiniMax API") + parser.add_argument("--prompt-file", required=True, + 
help="Absolute path to JSON spec file {title, prompt, lyrics?, is_instrumental?}") + parser.add_argument("--output-file", required=True, help="Output path for generated MP3") + args = parser.parse_args() + + try: + print(generate_music(args.prompt_file, args.output_file)) + except Exception as e: + print(f"Error while generating music: {e}") +``` + +- [ ] **Step 5: 运行测试确认通过** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/test_music_generation.py -v` +Expected: PASS(6 个用例全过)。 + +- [ ] **Step 6: 写 SKILL.md** + +整文件替换 `skills/public/music-generation/SKILL.md`: +```markdown +--- +name: music-generation +description: Use this skill when the user requests to generate, create, compose, or produce music or songs — background music, theme songs, jingles, or instrumental tracks. Generates a song from a style/mood prompt and optional lyrics via the MiniMax music API. +--- + +# Music Generation Skill + +## Overview + +This skill generates songs (vocal or instrumental) from a structured JSON spec using the +MiniMax music generation API (`/v1/music_generation`). You describe the style/mood/scene in +`prompt`, optionally provide `lyrics`, and the script returns an MP3. + +## Workflow + +### Step 1: Understand Requirements + +Identify the desired style, mood, scene, language, and whether the user wants vocals or a +pure instrumental track. Decide whether to supply lyrics or let the model write them. + +### Step 2: Create the Spec JSON + +Write a JSON file in `/mnt/user-data/workspace/` named `{descriptive-name}.json`: + +```json +{ + "title": "Rainy Night Cafe", + "prompt": "indie folk, melancholic, introspective, walking alone, cafe", + "lyrics": "[verse]\nStreetlights glow the night wind sighs\n[chorus]\nPush the wooden door warm air inside" +} +``` + +Fields: +- `title` (optional): a human-readable name. +- `prompt` (required): style, mood, and scene. Drives the musical character. +- `lyrics` (optional): song lyrics. 
Use `\n` between lines and structure tags such as + `[Intro]`, `[Verse]`, `[Pre Chorus]`, `[Chorus]`, `[Bridge]`, `[Outro]`. +- `is_instrumental` (optional, bool): set `true` for a pure instrumental track (no lyrics needed). + +Behavior: +- `lyrics` provided → those lyrics are sung. +- `is_instrumental: true` → instrumental, no vocals. +- neither → the model auto-writes lyrics from `prompt` (`lyrics_optimizer`). + +### Step 3: Execute Generation + +```bash +python /mnt/skills/public/music-generation/scripts/generate.py \ + --prompt-file /mnt/user-data/workspace/rainy-night-cafe.json \ + --output-file /mnt/user-data/outputs/rainy-night-cafe.mp3 +``` + +Parameters: +- `--prompt-file`: Absolute path to the JSON spec (required). +- `--output-file`: Absolute path for the output MP3 (required). + +[!NOTE] +Do NOT read the python file, just call it with the parameters. + +## Environment + +- `MINIMAX_API_KEY` (required): your MiniMax interface key. +- `MINIMAX_API_HOST` (optional): default `https://api.minimaxi.com`. +- `MINIMAX_MUSIC_MODEL` (optional): default `music-2.6-free` (works for all API-key users); + paid/Token-Plan users can set `music-2.6` for higher limits. + +## Output Handling + +- Music is saved as MP3 (typically in `/mnt/user-data/outputs/`). +- Share the generated file with the user using the present_files tool. +- Offer to iterate on style or lyrics if adjustments are needed. + +## Notes + +- Keep `prompt` focused on style/mood/scene; put the actual sung words in `lyrics`. +- For non-English songs, write `lyrics` in the target language. +``` + +- [ ] **Step 7: 在前端 mock skills 列表注册 music-generation** + +修改 `frontend/src/app/mock/api/skills/route.ts`,在 `image-generation` 条目之后、`podcast-generation` 条目之前插入(保持字母序): +```typescript + { + name: "music-generation", + description: + "Use this skill when the user requests to generate, create, compose, or produce music or songs — background music, theme songs, jingles, or instrumental tracks. 
Generates a song from a style/mood prompt and optional lyrics via the MiniMax music API.", + license: null, + category: "public", + enabled: true, + }, +``` + +- [ ] **Step 8: 前端类型检查(确认 route.ts 无误)** + +Run: `cd frontend && pnpm typecheck` +Expected: PASS(无新增类型错误)。若 `frontend` 依赖未安装,先 `pnpm install` 再 typecheck。 + +- [ ] **Step 9: Commit** + +```bash +git add skills/public/music-generation frontend/src/app/mock/api/skills/route.ts tests/skills/test_music_generation.py +git commit -m "feat(music-generation): new MiniMax music skill via skill-creator" +``` + +--- + +## Task 5: 全量回归 + spec 覆盖核对 + +- [ ] **Step 1: 跑全部 skill 测试** + +Run: `uv run --no-project --with pytest --with requests --with Pillow pytest tests/skills/ -v` +Expected: 全部 PASS(image 7 + video 6 + podcast 6 + music 6 = 25 用例)。 + +- [ ] **Step 2: 核对四个 skill 目录结构** + +Run: +```bash +ls skills/public/music-generation skills/public/music-generation/scripts +git status --short +``` +Expected: `music-generation/SKILL.md` + `scripts/generate.py` 存在;无意外残留的脚手架占位文件(references/assets 已删)。 + +- [ ] **Step 3: spec 覆盖自查(对照设计文档)** + +逐条确认:image/video/podcast 三个 provider 自动判断 + 覆盖 ✔;music 新 skill ✔;hex 解码(podcast+music)✔;base64(image)✔;video 三步轮询 ✔;参考图 data URL(image subject_reference / video first_frame_image)✔;前端注册 ✔;环境变量齐全 ✔。如发现遗漏,补任务。 + +- [ ] **Step 4: 最终提交(如有零散改动)** + +```bash +git add -A +git commit -m "test(skills): full MiniMax generation regression green" || echo "nothing to commit" +``` diff --git a/docs/superpowers/specs/2026-06-08-minimax-generation-providers-design.md b/docs/superpowers/specs/2026-06-08-minimax-generation-providers-design.md new file mode 100644 index 000000000..5979b6613 --- /dev/null +++ b/docs/superpowers/specs/2026-06-08-minimax-generation-providers-design.md @@ -0,0 +1,175 @@ +# MiniMax 接入生成类 Skill — 设计文档 + +- 日期:2026-06-08 +- 分支:`worktree-feat-minimax-generation` +- 参考:MiniMax 开放平台 API(https://platform.minimaxi.com/docs/api-reference) + +## 1. 目标 + +1. 
在现有 `image-generation`、`video-generation`、`podcast-generation` 三个 skill 中接入 MiniMax 作为可选 provider(与现有 Gemini / Volcengine 并存)。 +2. 用项目自带的 `skill-creator` skill 新建一个 `music-generation` skill,对接 MiniMax 音乐生成 API。 + +## 2. 背景与现状 + +三个生成 skill 均位于 `skills/public/<skill-name>/`,是**自包含目录**: + +- `SKILL.md`(frontmatter:`name`、`description` + 给 agent 的使用说明,运行时路径为 `/mnt/skills/public/<skill-name>/...`、产物写到 `/mnt/user-data/...`) +- `scripts/generate.py`(纯 `requests` 调用外部 API 的 CLI,`argparse`) +- 可选 `templates/` + +现状 provider: + +| Skill | 现 provider | 端点 | 凭证 | +|---|---|---|---| +| image-generation | Gemini | `generativelanguage.googleapis.com/.../gemini-3-pro-image-preview:generateContent` | `GEMINI_API_KEY` | +| video-generation | Gemini Veo | `.../veo-3.1-generate-preview:predictLongRunning`(长任务轮询) | `GEMINI_API_KEY` | +| podcast-generation | Volcengine TTS | `openspeech.bytedance.com/api/v1/tts`(逐行多线程,base64 音频拼接) | `VOLCENGINE_TTS_APPID` + `VOLCENGINE_TTS_ACCESS_TOKEN`(+ 可选 `VOLCENGINE_TTS_CLUSTER`) | + +MiniMax 已作为 **LLM chat provider** 接入(`config.example.yaml` + `patched_minimax.py`),但**未用于**图像/视频/音频生成。仓库中**无** music 生成功能。 + +沙箱中各 skill 目录隔离、互不 import → MiniMax 代码在每个 skill 内**各自内联**,不做跨 skill 共享模块(少量重复可接受)。 + +`skill-creator` 是仓库内真实公共 skill(`skills/public/skill-creator/`,含 `scripts/init_skill.py` 脚手架)。前端 `frontend/src/app/mock/api/skills/route.ts` 维护着 UI 展示用的 skill 列表(mock)。 + +## 3. Provider 选择机制(已和用户确认) + +每个被改造的脚本新增 `_resolve_provider()`,判定顺序: + +1. **显式覆盖**:若环境变量 `<SKILL>_PROVIDER` 已设(如 `IMAGE_GENERATION_PROVIDER`、`VIDEO_GENERATION_PROVIDER`、`PODCAST_GENERATION_PROVIDER`,取值 `gemini`/`volcengine`/`minimax`),直接采用,覆盖自动判断。 +2. **现有 provider 优先**:现 provider 凭证齐全 → 用现有 provider(保持完全向后兼容)。 +3. **回退 MiniMax**:否则若 `MINIMAX_API_KEY` 已设 → 用 MiniMax。 +4. 都不满足 → 抛出清晰错误,提示两套环境变量该如何配置。 + +> 设计含义:默认行为不变(已有用户配了 Gemini/Volcengine 的不受影响);只配了 MiniMax 的用户自动走 MiniMax;两者都配又想用 MiniMax 的用户用 `<SKILL>_PROVIDER` 强制。 + +## 4. 
MiniMax 接口对接细节 + +通用: + +- Base URL 默认 `https://api.minimaxi.com`,可用 `MINIMAX_API_HOST` 覆盖(备用 `https://api-bj.minimaxi.com`)。 +- Header:`Authorization: Bearer $MINIMAX_API_KEY`、`Content-Type: application/json`。 +- 统一错误处理:响应体 `base_resp.status_code != 0` → 抛带 `status_msg` 的异常。 + +### 4.1 图像 `POST /v1/image_generation`(同步) + +请求体: +```json +{ + "model": "image-01", + "prompt": "<文本>", + "aspect_ratio": "16:9", + "response_format": "base64", + "n": 1, + "prompt_optimizer": true +} +``` +- 参考图:转成 Data URL(`data:image/jpeg;base64,...`),放入 + `subject_reference: [{"type": "character", "image_file": "<data_url>"}]`(仅 `image-01` 支持;用现有 `--reference-images` 的图片)。 +- 响应:`data.image_base64[0]` → `base64.b64decode` 写出文件;`response_format:url` 时取 `data.image_urls[0]` 下载(实现选 base64,少一次下载)。 +- 模型可用 `MINIMAX_IMAGE_MODEL` 覆盖(默认 `image-01`)。 + +### 4.2 视频(异步三步) + +1. `POST /v1/video_generation`: + ```json + { "model": "MiniMax-Hailuo-2.3", "prompt": "<文本>", "first_frame_image": "<data_url>" } + ``` + → `{ "task_id": "...", "base_resp": {...} }` +2. 轮询 `GET /v1/query/video_generation?task_id=<task_id>` → `status ∈ {Preparing,Queueing,Processing,Success,Fail}`;`Success` 时返回 `file_id`。 +3. 
`GET /v1/files/retrieve?file_id=<file_id>` → `file.download_url`;下载 mp4 写出。 +- 参考图:第一张转 Data URL 作 `first_frame_image`。 +- 视频无 `aspect_ratio` 概念(用 resolution/duration),MiniMax 路径忽略 `--aspect-ratio`,用默认 resolution。 +- 轮询间隔 3s,设最大次数上限(如 120 次≈6 分钟)防止无限循环;`Fail`/超时报错。 +- 模型可用 `MINIMAX_VIDEO_MODEL` 覆盖(默认 `MiniMax-Hailuo-2.3`)。 + +### 4.3 播客 TTS `POST /v1/t2a_v2`(同步) + +沿用现有"逐行 + `ThreadPoolExecutor` 多线程 + 拼接"结构,仅替换单行合成函数: +```json +{ + "model": "speech-2.6-hd", + "text": "<单行文本>", + "voice_setting": { "voice_id": "<voice_id>", "speed": 1.0, "vol": 1.0, "pitch": 0 }, + "audio_setting": { "sample_rate": 32000, "bitrate": 128000, "format": "mp3", "channel": 1 }, + "output_format": "hex" +} +``` +- 响应 `data.audio` 为 **hex 编码** → `bytes.fromhex(audio)`(区别于 Volcengine 的 base64)。 +- 角色映射:`male`/`female` → MiniMax voice_id 预设,默认值可用 `MINIMAX_TTS_VOICE_MALE` / `MINIMAX_TTS_VOICE_FEMALE` 覆盖。 +- 模型可用 `MINIMAX_TTS_MODEL` 覆盖(默认 `speech-2.6-hd`)。 + +### 4.4 音乐 `POST /v1/music_generation`(同步,新 skill) + +请求体: +```json +{ + "model": "music-2.6-free", + "prompt": "<风格/情绪/场景>", + "lyrics": "[verse]\n...\n[chorus]\n...", + "output_format": "hex", + "audio_setting": { "sample_rate": 44100, "bitrate": 256000, "format": "mp3" } +} +``` +- 响应 `data.audio` 为 **hex** → `bytes.fromhex` 写 mp3。 +- 歌词规则: + - 提供 `lyrics`:直接用(含 `[Verse]`/`[Chorus]` 等结构标签,`\n` 分行)。 + - 未提供且 `is_instrumental` 为真:`is_instrumental:true`(不需要 lyrics)。 + - 未提供且非纯音乐:`lyrics_optimizer:true`(系统据 `prompt` 自动写词)。 +- 仅用 `MINIMAX_API_KEY`(音乐只有 MiniMax 提供,无 provider 判断);模型可用 `MINIMAX_MUSIC_MODEL` 覆盖(默认 `music-2.6-free`,付费用户可设 `music-2.6`)。 + +## 5. 
各组件改动清单 + +### 5.1 `skills/public/image-generation/scripts/generate.py` +- 抽出现有 Gemini 逻辑为 `_generate_image_gemini(...)`。 +- 新增 `_generate_image_minimax(...)`、`_resolve_provider("image_generation", ...)`、`_to_data_url(path)`。 +- `generate_image(...)` 顶层按 provider 路由;保留 CLI 与签名不变。 +- `SKILL.md`:在说明里补充 MiniMax provider 与所需环境变量(不改变调用方式)。 + +### 5.2 `skills/public/video-generation/scripts/generate.py` +- 同上模式:`_generate_video_gemini`、`_generate_video_minimax`(三步轮询)、`_resolve_provider("video_generation", ...)`。 +- `SKILL.md` 补充 MiniMax provider 说明。 + +### 5.3 `skills/public/podcast-generation/scripts/generate.py` +- `text_to_speech_volcengine`(现有改名)+ `text_to_speech_minimax`;`_process_line`/`tts_node` 内按 `_resolve_provider("podcast_generation", ...)` 选择合成函数与 voice 映射。 +- 环境变量校验同时支持两套;`SKILL.md` 补充说明。 + +### 5.4 新增 `skills/public/music-generation/`(用 skill-creator) +- 用 `skill-creator/scripts/init_skill.py` 脚手架生成目录骨架,再填充: + - `SKILL.md`:frontmatter `name: music-generation` + description;说明输入 JSON 结构、调用方式、环境变量、示例(按现有生成 skill 的风格与运行时路径 `/mnt/skills/public/music-generation/...`)。 + - `scripts/generate.py`:CLI `--prompt-file <spec.json> --output-file <output.mp3>`;读 JSON `{title, prompt, lyrics?, is_instrumental?}`;调 `/v1/music_generation`;hex→mp3。 +- `frontend/src/app/mock/api/skills/route.ts`:新增 `music-generation` 条目(按字母序,`category:"public"`、`enabled:true`),使其出现在 UI skill 列表。 + +## 6. 测试(TDD) + +- 框架:pytest。测试目录:仓库根 `tests/skills/`(**不放进会部署到沙箱的 skill 目录**)。 +- 用 `importlib.util.spec_from_file_location` 按路径加载各 `generate.py`。 +- `requests.post` / `requests.get` 全部用 `unittest.mock` 打桩,**不打真实 API**。 +- 覆盖点: + - `_resolve_provider`:各环境变量组合(仅现有 key / 仅 MiniMax key / 两者 / 都无 / `_PROVIDER` 覆盖)→ 正确 provider 或正确报错。 + - 请求体构造:image/video/podcast/music 各自 payload 字段、模型默认与 env 覆盖、参考图 Data URL 转换。 + - 响应解析:image base64 解码写文件、music/podcast hex 解码、video 三步流转(mock task_id→Success→download_url→内容写出)。 + - 错误:`base_resp.status_code != 0` 抛异常;video `Fail`/超时分支。 +- 先写失败测试,再实现到通过。 + +## 7. 
向后兼容性 + +- 现有 CLI 参数与默认行为完全不变;仅当现 provider 凭证缺失(或显式 `<SKILL>_PROVIDER`)时才走 MiniMax。 +- 不改 LLM 侧已有的 MiniMax 接入。 + +## 8. 新增环境变量汇总 + +| 变量 | 用途 | 默认 | +|---|---|---| +| `MINIMAX_API_KEY` | 复用现有 LLM 同名 key | 必填(走 MiniMax 时) | +| `MINIMAX_API_HOST` | MiniMax base url | `https://api.minimaxi.com` | +| `IMAGE_GENERATION_PROVIDER` / `VIDEO_GENERATION_PROVIDER` / `PODCAST_GENERATION_PROVIDER` | 强制 provider | 不设(自动判断) | +| `MINIMAX_IMAGE_MODEL` | 图像模型 | `image-01` | +| `MINIMAX_VIDEO_MODEL` | 视频模型 | `MiniMax-Hailuo-2.3` | +| `MINIMAX_TTS_MODEL` | TTS 模型 | `speech-2.6-hd` | +| `MINIMAX_TTS_VOICE_MALE` / `MINIMAX_TTS_VOICE_FEMALE` | 播客音色 | 选定的男/女系统音色 | +| `MINIMAX_MUSIC_MODEL` | 音乐模型 | `music-2.6-free` | + +## 9. 非目标(YAGNI) + +- 不做翻唱(`music-cover` / `music_cover_preprocess`)、独立歌词生成接口(`lyrics_generation`,音乐内置 `lyrics_optimizer` 已覆盖"自动写词")、音色复刻/设计、视频模板 Agent、流式合成。 +- 不为各 skill 抽象统一 "GenerationProvider" 框架(沙箱隔离 + YAGNI)。 diff --git a/frontend/src/app/mock/api/skills/route.ts b/frontend/src/app/mock/api/skills/route.ts index 78ae3c783..abd4a126c 100644 --- a/frontend/src/app/mock/api/skills/route.ts +++ b/frontend/src/app/mock/api/skills/route.ts @@ -33,6 +33,14 @@ export function GET() { category: "public", enabled: true, }, + { + name: "music-generation", + description: + "Use this skill when the user requests to generate, create, compose, or produce music or songs — background music, theme songs, jingles, or instrumental tracks. 
Generates a song from a style/mood prompt and optional lyrics via the MiniMax music API.", + license: null, + category: "public", + enabled: true, + }, { name: "podcast-generation", description: diff --git a/scripts/serve.sh b/scripts/serve.sh index 485c9b5fe..3eb2ac833 100755 --- a/scripts/serve.sh +++ b/scripts/serve.sh @@ -62,9 +62,56 @@ done # ── Stop helper ────────────────────────────────────────────────────────────── -_is_repo_pid() { - local pid=$1 - lsof -p "$pid" 2>/dev/null | grep -F "$REPO_ROOT" >/dev/null +# Every deer-flow worktree (the main checkout + each linked worktree) hardcodes +# the same dev ports (8001/3000/2026), so a service started from ANY of them +# must be reclaimable from here — otherwise `make stop`/`make dev` in this +# worktree can neither kill nor take over a port held by a sibling worktree. +# DEERFLOW_ROOTS is that set of roots; processes living outside all of them +# (e.g. an unrelated project on port 3000) are still never touched. +# Sorted most-specific-first (longest path first): a linked worktree lives +# under the main checkout, so both roots are substrings of its files — checking +# the deeper root first attributes a reclaimed port to the right worktree. +DEERFLOW_ROOTS="$( + { + printf '%s\n' "$REPO_ROOT" + git -C "$REPO_ROOT" worktree list --porcelain 2>/dev/null | + awk '/^worktree /{print $2}' + } | awk 'NF && !seen[$0]++ {print length($0)"\t"$0}' | sort -rn | sed 's/^[0-9]*\t//' +)" + +# True if PID has an open file/cwd under any deer-flow worktree root. The +# trailing slash keeps a sibling dir like ".../deer-flow-notes" from matching +# the ".../deer-flow" root. 
+_is_deerflow_pid() { + local pid=$1 files root + files=$(lsof -p "$pid" 2>/dev/null) || return 1 + while IFS= read -r root; do + [ -n "$root" ] || continue + case "$files" in + *"$root"/*) return 0 ;; + esac + done <<< "$DEERFLOW_ROOTS" + return 1 +} + +# Report ports about to be reclaimed from a *different* worktree, so stopping +# (or starting, which stops first) isn't silently killing someone else's run. +_report_reclaimed_ports() { + local port pid files root owner + for port in 8001 3000 2026; do + for pid in $(lsof -nP -iTCP:"$port" -sTCP:LISTEN -t 2>/dev/null); do + _is_deerflow_pid "$pid" || continue + files=$(lsof -p "$pid" 2>/dev/null) + case "$files" in *"$REPO_ROOT"/*) continue ;; esac # this worktree — normal + owner="" + while IFS= read -r root; do + [ -n "$root" ] || continue + case "$files" in *"$root"/*) owner="$root"; break ;; esac + done <<< "$DEERFLOW_ROOTS" + echo " ↻ Reclaiming port $port from another worktree: ${owner:-?}" + break + done + done } _kill_repo_processes() { @@ -73,7 +120,7 @@ _kill_repo_processes() { local pids="" while IFS= read -r pid; do - if [ -n "$pid" ] && _is_repo_pid "$pid"; then + if [ -n "$pid" ] && _is_deerflow_pid "$pid"; then case " $pids " in *" $pid "*) ;; *) pids="$pids $pid" ;; @@ -92,7 +139,7 @@ _kill_repo_port() { local pids="" while IFS= read -r pid; do - if [ -n "$pid" ] && _is_repo_pid "$pid"; then + if [ -n "$pid" ] && _is_deerflow_pid "$pid"; then case " $pids " in *" $pid "*) ;; *) pids="$pids $pid" ;; @@ -141,11 +188,15 @@ _is_repo_nginx_pid() { esac args=$(ps -p "$pid" -o args= 2>/dev/null) || return 1 - case "$args" in - *"$REPO_ROOT/docker/nginx/nginx.local.conf"*|*"$REPO_ROOT"*) return 0 ;; - esac + local root + while IFS= read -r root; do + [ -n "$root" ] || continue + case "$args" in + *"$root"/docker/nginx/nginx.local.conf*|*"$root"/*) return 0 ;; + esac + done <<< "$DEERFLOW_ROOTS" - _is_repo_pid "$pid" + _is_deerflow_pid "$pid" } _kill_repo_nginx() { @@ -175,6 +226,7 @@ _kill_repo_nginx() { 
stop_all() { echo "Stopping all services..." + _report_reclaimed_ports _kill_repo_processes "uvicorn app.gateway.app:app" _kill_repo_processes "next dev" _kill_repo_processes "next start" @@ -182,9 +234,13 @@ stop_all() { nginx -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" -s quit 2>/dev/null || true sleep 1 _kill_repo_nginx - # Force-kill any survivors still holding the service ports + # Force-kill any survivors still holding the service ports. 2026 is included + # so a lingering nginx (or any deer-flow process) that _kill_repo_nginx did + # not match by name still gets reclaimed — otherwise `make dev` fails its + # nginx port preflight. _kill_repo_port 8001 _kill_repo_port 3000 + _kill_repo_port 2026 ./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true echo "✓ All services stopped" } diff --git a/scripts/setup_wizard.py b/scripts/setup_wizard.py index 647d90a04..c3a7baf55 100644 --- a/scripts/setup_wizard.py +++ b/scripts/setup_wizard.py @@ -85,7 +85,7 @@ def main() -> int: display_name=f"{llm.provider.display_name} / {llm.model_name}", api_key_field=llm.provider.api_key_field, env_var=llm.provider.env_var, - extra_model_config=llm.provider.extra_config or None, + extra_model_config=llm.provider.extra_config_for(llm.model_name) or None, base_url=llm.base_url, search_use=search_provider.use if search_provider else None, search_tool_name=search_provider.tool_name if search_provider else "web_search", diff --git a/scripts/wizard/providers.py b/scripts/wizard/providers.py index f45057cd0..013fbd83b 100644 --- a/scripts/wizard/providers.py +++ b/scripts/wizard/providers.py @@ -19,10 +19,24 @@ class LLMProvider: api_key_field: str = "api_key" # Extra config fields beyond the common ones (merged into YAML) extra_config: dict = field(default_factory=dict) + # Per-model supports_vision overrides for providers whose models differ in + # capability (e.g. MiniMax M3 supports vision but M2.7 is text-only). 
The + # provider-level extra_config holds the default (default_model) capability. + model_vision_overrides: dict[str, bool] = field(default_factory=dict) auth_hint: str | None = None base_url_prompt: str | None = None model_prompt: str | None = None + def extra_config_for(self, model_name: str) -> dict: + """Return extra_config for a selected model, applying per-model overrides. + + Does not mutate the shared provider-level ``extra_config``. + """ + config = dict(self.extra_config) + if model_name in self.model_vision_overrides: + config["supports_vision"] = self.model_vision_overrides[model_name] + return config + @dataclass class WebProvider: @@ -313,6 +327,10 @@ LLM_PROVIDERS: list[LLMProvider] = [ "supports_vision": True, "supports_thinking": True, }, + model_vision_overrides={ + "MiniMax-M2.7": False, + "MiniMax-M2.7-highspeed": False, + }, ), LLMProvider( name="minimax_cn", @@ -332,6 +350,10 @@ LLM_PROVIDERS: list[LLMProvider] = [ "supports_vision": True, "supports_thinking": True, }, + model_vision_overrides={ + "MiniMax-M2.7": False, + "MiniMax-M2.7-highspeed": False, + }, ), LLMProvider( name="openrouter", diff --git a/skills/public/image-generation/SKILL.md b/skills/public/image-generation/SKILL.md index d15cb63e2..e894dffe5 100644 --- a/skills/public/image-generation/SKILL.md +++ b/skills/public/image-generation/SKILL.md @@ -178,6 +178,27 @@ For scenarios where visual accuracy is critical, **use the `image_search` tool f This approach significantly improves generation quality by providing the model with concrete visual guidance rather than relying solely on text descriptions. +## Providers (Gemini / MiniMax) + +This skill auto-selects the provider by environment variables (no CLI change): + +- `GEMINI_API_KEY` set → use Gemini (default, unchanged). +- Only `MINIMAX_API_KEY` set → use MiniMax (`/v1/image_generation`, model `image-01`). +- Force one explicitly with `IMAGE_GENERATION_PROVIDER=gemini|minimax`. 
+ +MiniMax optional overrides: `MINIMAX_API_HOST` (default `https://api.minimaxi.com`), +`MINIMAX_IMAGE_MODEL` (default `image-01`). Reference images are sent as the MiniMax +`subject_reference` character image. The CLI and `--prompt-file` / `--reference-images` +/ `--output-file` / `--aspect-ratio` arguments are identical for both providers. + +**MiniMax prompt handling (provider-internal).** Authoring is provider-agnostic — write +the same structured JSON regardless of which provider is active. MiniMax `image-01` +consumes a single text string, so the MiniMax path itself sends only the JSON `prompt` +field (the other fields such as `style` / `composition` / `negative_prompt` apply to the +Gemini path) and enables `prompt_optimizer` so MiniMax expands it server-side. MiniMax +caps that prompt at 1500 characters; if the `prompt` field is longer, the script returns +an error instead of calling the API. The Gemini path receives the full structured JSON. + ## Notes - Always use English for prompts regardless of user's language diff --git a/skills/public/image-generation/scripts/generate.py b/skills/public/image-generation/scripts/generate.py index 7670176bb..3bdbb0bd2 100644 --- a/skills/public/image-generation/scripts/generate.py +++ b/skills/public/image-generation/scripts/generate.py @@ -1,32 +1,196 @@ import base64 +import json import os import requests -from PIL import Image + +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" +# MiniMax image-01 caps the prompt at 1500 characters and rejects longer requests +# with a generic "invalid params" error, so validate before calling the API. +MINIMAX_PROMPT_MAX_CHARS = 1500 def validate_image(image_path: str) -> bool: - """ - Validate if an image file can be opened and is not corrupted. 
- - Args: - image_path: Path to the image file - - Returns: - True if the image is valid and can be opened, False otherwise - """ + """Validate if an image file can be opened and is not corrupted.""" + from PIL import Image # lazy import: keeps module importable without Pillow + try: - with Image.open(image_path) as img: - img.verify() # Verify that it's a valid image - # Re-open to check if it can be fully loaded (verify() may not catch all issues) - with Image.open(image_path) as img: - img.load() # Force load the image data + with Image.open(image_path) as image: + image.verify() + with Image.open(image_path) as image: + image.load() return True - except Exception as e: - print(f"Warning: Image '{image_path}' is invalid or corrupted: {e}") + except Exception as exc: + print(f"Warning: Image '{image_path}' is invalid or corrupted: {exc}") return False +def _resolve_provider(override_env: str, existing_provider: str, has_existing_creds: bool) -> str: + """Pick the generation provider. + + 1. Explicit _PROVIDER override wins. + 2. Otherwise prefer the existing provider when its credentials are present. + 3. Otherwise fall back to MiniMax when MINIMAX_API_KEY is set. + """ + override = os.getenv(override_env) + if override: + return override.strip().lower() + if has_existing_creds: + return existing_provider + if os.getenv("MINIMAX_API_KEY"): + return "minimax" + raise ValueError( + f"No credentials found. Set GEMINI_API_KEY for {existing_provider}, " + f"or MINIMAX_API_KEY for minimax (optionally force with {override_env})." 
+ ) + + +def _minimax_host() -> str: + return os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + + +def _check_base_resp(payload: dict) -> None: + base = payload.get("base_resp") or {} + if base.get("status_code", 0) != 0: + raise Exception( + f"MiniMax error {base.get('status_code')}: {base.get('status_msg')}" + ) + + +def _guess_mime(image_path: str) -> str: + ext = os.path.splitext(image_path)[1].lower() + return { + ".png": "image/png", + ".webp": "image/webp", + ".gif": "image/gif", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + }.get(ext, "image/jpeg") + + +def _to_data_url(image_path: str) -> str: + with open(image_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{_guess_mime(image_path)};base64,{b64}" + + +def _ensure_output_dir(output_file: str) -> None: + """Create the output file's parent directory so nested paths don't fail.""" + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + +def _minimax_prompt(raw: str) -> str: + """Extract the single text prompt MiniMax image-01 expects. + + The shared prompt file is structured JSON (a consolidated ``prompt`` plus + Gemini-oriented fields like ``style`` / ``composition`` / ``negative_prompt``), + but MiniMax consumes one string and expands it via ``prompt_optimizer``. The + provider adapts the input itself — the caller never needs to know MiniMax is + active. Use the JSON ``prompt`` field; fall back to the raw text for plain-text + prompt files or JSON without a ``prompt`` field. 
+ """ + text = raw.strip() + try: + data = json.loads(text) + except (ValueError, json.JSONDecodeError): + return text + if isinstance(data, dict): + core = data.get("prompt") + if isinstance(core, str) and core.strip(): + return core.strip() + return text + + +def _generate_image_minimax( + prompt: str, reference_images: list[str], output_file: str, aspect_ratio: str +) -> str: + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + return "MINIMAX_API_KEY is not set" + prompt = _minimax_prompt(prompt) + if len(prompt) > MINIMAX_PROMPT_MAX_CHARS: + return ( + f"Prompt is {len(prompt)} characters but MiniMax image-01 accepts at most " + f"{MINIMAX_PROMPT_MAX_CHARS}. Shorten the prompt to stay within the limit; " + f"reference images plus a tighter description usually recover the detail." + ) + body = { + "model": os.getenv("MINIMAX_IMAGE_MODEL", "image-01"), + "prompt": prompt, + "aspect_ratio": aspect_ratio, + "response_format": "base64", + "n": 1, + "prompt_optimizer": True, + } + if reference_images: + # Reference images are passed as character subjects as-is; unlike the Gemini + # path we do not pre-validate them — invalid files surface as a MiniMax API error. 
+ body["subject_reference"] = [ + {"type": "character", "image_file": _to_data_url(p)} for p in reference_images + ] + response = requests.post( + f"{_minimax_host()}/v1/image_generation", + headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, + json=body, + timeout=60, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + images = (payload.get("data") or {}).get("image_base64") or [] + if not images: + raise Exception("MiniMax returned no image data") + _ensure_output_dir(output_file) + with open(output_file, "wb") as f: + f.write(base64.b64decode(images[0])) + return f"Successfully generated image to {output_file}" + + +def _generate_image_gemini( + prompt: str, reference_images: list[str], output_file: str, aspect_ratio: str +) -> str: + parts = [] + valid_reference_images = [] + for ref_img in reference_images: + if validate_image(ref_img): + valid_reference_images.append(ref_img) + else: + print(f"Skipping invalid reference image: {ref_img}") + if len(valid_reference_images) < len(reference_images): + skipped = len(reference_images) - len(valid_reference_images) + print(f"Note: {skipped} reference image(s) were skipped due to validation failure.") + + for reference_image in valid_reference_images: + with open(reference_image, "rb") as f: + image_b64 = base64.b64encode(f.read()).decode("utf-8") + parts.append({"inlineData": {"mimeType": "image/jpeg", "data": image_b64}}) + + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + return "GEMINI_API_KEY is not set" + response = requests.post( + "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent", + headers={"x-goog-api-key": api_key, "Content-Type": "application/json"}, + json={ + "generationConfig": {"imageConfig": {"aspectRatio": aspect_ratio}}, + "contents": [{"parts": [*parts, {"text": prompt}]}], + }, + ) + response.raise_for_status() + data = response.json() + response_parts: list[dict] = 
data["candidates"][0]["content"]["parts"] + image_parts = [part for part in response_parts if part.get("inlineData", False)] + if len(image_parts) == 1: + base64_image = image_parts[0]["inlineData"]["data"] + _ensure_output_dir(output_file) + with open(output_file, "wb") as f: + f.write(base64.b64decode(base64_image)) + return f"Successfully generated image to {output_file}" + raise Exception("Failed to generate image") + + def generate_image( prompt_file: str, reference_images: list[str], @@ -35,98 +199,30 @@ def generate_image( ) -> str: with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() - parts = [] - i = 0 - - # Filter out invalid reference images - valid_reference_images = [] - for ref_img in reference_images: - if validate_image(ref_img): - valid_reference_images.append(ref_img) - else: - print(f"Skipping invalid reference image: {ref_img}") - - if len(valid_reference_images) < len(reference_images): - print(f"Note: {len(reference_images) - len(valid_reference_images)} reference image(s) were skipped due to validation failure.") - - for reference_image in valid_reference_images: - i += 1 - with open(reference_image, "rb") as f: - image_b64 = base64.b64encode(f.read()).decode("utf-8") - parts.append( - { - "inlineData": { - "mimeType": "image/jpeg", - "data": image_b64, - } - } - ) - - api_key = os.getenv("GEMINI_API_KEY") - if not api_key: - return "GEMINI_API_KEY is not set" - response = requests.post( - "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent", - headers={ - "x-goog-api-key": api_key, - "Content-Type": "application/json", - }, - json={ - "generationConfig": {"imageConfig": {"aspectRatio": aspect_ratio}}, - "contents": [{"parts": [*parts, {"text": prompt}]}], - }, + provider = _resolve_provider( + "IMAGE_GENERATION_PROVIDER", "gemini", bool(os.getenv("GEMINI_API_KEY")) ) - response.raise_for_status() - json = response.json() - parts: list[dict] = 
json["candidates"][0]["content"]["parts"] - image_parts = [part for part in parts if part.get("inlineData", False)] - if len(image_parts) == 1: - base64_image = image_parts[0]["inlineData"]["data"] - # Save the image to a file - with open(output_file, "wb") as f: - f.write(base64.b64decode(base64_image)) - return f"Successfully generated image to {output_file}" - else: - raise Exception("Failed to generate image") + if provider == "minimax": + return _generate_image_minimax(prompt, reference_images, output_file, aspect_ratio) + if provider in ("gemini", "google"): + return _generate_image_gemini(prompt, reference_images, output_file, aspect_ratio) + raise ValueError(f"Unknown image provider: {provider!r} (use 'gemini' or 'minimax')") if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Generate images using Gemini API") - parser.add_argument( - "--prompt-file", - required=True, - help="Absolute path to JSON prompt file", - ) - parser.add_argument( - "--reference-images", - nargs="*", - default=[], - help="Absolute paths to reference images (space-separated)", - ) - parser.add_argument( - "--output-file", - required=True, - help="Output path for generated image", - ) - parser.add_argument( - "--aspect-ratio", - required=False, - default="16:9", - help="Aspect ratio of the generated image", - ) - + parser = argparse.ArgumentParser(description="Generate images using Gemini or MiniMax API") + parser.add_argument("--prompt-file", required=True, help="Absolute path to JSON prompt file") + parser.add_argument("--reference-images", nargs="*", default=[], + help="Absolute paths to reference images (space-separated)") + parser.add_argument("--output-file", required=True, help="Output path for generated image") + parser.add_argument("--aspect-ratio", required=False, default="16:9", + help="Aspect ratio of the generated image") args = parser.parse_args() try: - print( - generate_image( - args.prompt_file, - args.reference_images, - 
args.output_file, - args.aspect_ratio, - ) - ) + print(generate_image(args.prompt_file, args.reference_images, + args.output_file, args.aspect_ratio)) except Exception as e: print(f"Error while generating image: {e}") diff --git a/skills/public/music-generation/SKILL.md b/skills/public/music-generation/SKILL.md new file mode 100644 index 000000000..131019631 --- /dev/null +++ b/skills/public/music-generation/SKILL.md @@ -0,0 +1,76 @@ +--- +name: music-generation +description: Use this skill when the user requests to generate, create, compose, or produce music or songs — background music, theme songs, jingles, or instrumental tracks. Generates a song from a style/mood prompt and optional lyrics via the MiniMax music API. +--- + +# Music Generation Skill + +## Overview + +This skill generates songs (vocal or instrumental) from a structured JSON spec using the +MiniMax music generation API (`/v1/music_generation`). You describe the style/mood/scene in +`prompt`, optionally provide `lyrics`, and the script returns an MP3. + +## Workflow + +### Step 1: Understand Requirements + +Identify the desired style, mood, scene, language, and whether the user wants vocals or a +pure instrumental track. Decide whether to supply lyrics or let the model write them. + +### Step 2: Create the Spec JSON + +Write a JSON file in `/mnt/user-data/workspace/` named `{descriptive-name}.json`: + +```json +{ + "title": "Rainy Night Cafe", + "prompt": "indie folk, melancholic, introspective, walking alone, cafe", + "lyrics": "[verse]\nStreetlights glow the night wind sighs\n[chorus]\nPush the wooden door warm air inside" +} +``` + +Fields: +- `title` (optional): a human-readable name. +- `prompt` (required): style, mood, and scene. Drives the musical character. +- `lyrics` (optional): song lyrics. Use `\n` between lines and structure tags such as + `[Intro]`, `[Verse]`, `[Pre Chorus]`, `[Chorus]`, `[Bridge]`, `[Outro]`. 
+- `is_instrumental` (optional, bool): set `true` for a pure instrumental track (no lyrics needed). + +Behavior: +- `lyrics` provided → those lyrics are sung. +- `is_instrumental: true` → instrumental, no vocals. +- neither → the model auto-writes lyrics from `prompt` (`lyrics_optimizer`). + +### Step 3: Execute Generation + +```bash +python /mnt/skills/public/music-generation/scripts/generate.py \ + --prompt-file /mnt/user-data/workspace/rainy-night-cafe.json \ + --output-file /mnt/user-data/outputs/rainy-night-cafe.mp3 +``` + +Parameters: +- `--prompt-file`: Absolute path to the JSON spec (required). +- `--output-file`: Absolute path for the output MP3 (required). + +[!NOTE] +Do NOT read the python file, just call it with the parameters. + +## Environment + +- `MINIMAX_API_KEY` (required): your MiniMax API key. +- `MINIMAX_API_HOST` (optional): default `https://api.minimaxi.com`. +- `MINIMAX_MUSIC_MODEL` (optional): default `music-2.6-free` (works for all API-key users); + paid/Token-Plan users can set `music-2.6` for higher limits. + +## Output Handling + +- Music is saved as MP3 (typically in `/mnt/user-data/outputs/`). +- Share the generated file with the user using the present_files tool. +- Offer to iterate on style or lyrics if adjustments are needed. + +## Notes + +- Keep `prompt` focused on style/mood/scene; put the actual sung words in `lyrics`. +- For non-English songs, write `lyrics` in the target language. 
diff --git a/skills/public/music-generation/scripts/generate.py b/skills/public/music-generation/scripts/generate.py new file mode 100644 index 000000000..fab5586d4 --- /dev/null +++ b/skills/public/music-generation/scripts/generate.py @@ -0,0 +1,82 @@ +import argparse +import json +import os + +import requests + +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" + + +def _check_base_resp(payload: dict) -> None: + base = payload.get("base_resp") or {} + if base.get("status_code", 0) != 0: + raise Exception(f"MiniMax error {base.get('status_code')}: {base.get('status_msg')}") + + +def generate_music(prompt_file: str, output_file: str) -> str: + """Generate a song from a JSON spec via MiniMax /v1/music_generation. + + Spec JSON: {"title": str, "prompt": str, "lyrics"?: str, "is_instrumental"?: bool} + - lyrics given -> use them (supports [Verse]/[Chorus] structure tags, \\n lines) + - is_instrumental true -> pure music, no lyrics needed + - otherwise -> lyrics_optimizer auto-writes lyrics from prompt + """ + with open(prompt_file, "r", encoding="utf-8") as f: + spec = json.load(f) + + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + return "MINIMAX_API_KEY is not set" + + prompt = (spec.get("prompt") or "").strip() + if not prompt: + raise ValueError("`prompt` is required in the music spec") + lyrics = spec.get("lyrics") or None # treat empty string the same as absent + is_instrumental = bool(spec.get("is_instrumental", False)) + + body = { + "model": os.getenv("MINIMAX_MUSIC_MODEL", "music-2.6-free"), + "prompt": prompt, + "output_format": "hex", + "audio_setting": {"sample_rate": 44100, "bitrate": 256000, "format": "mp3"}, + } + if lyrics: + body["lyrics"] = lyrics + elif is_instrumental: + body["is_instrumental"] = True + else: + body["lyrics_optimizer"] = True + + host = os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + response = requests.post( + f"{host}/v1/music_generation", + headers={"Authorization": f"Bearer {api_key}", 
"Content-Type": "application/json"}, + json=body, + timeout=300, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + audio_hex = (payload.get("data") or {}).get("audio") + if not audio_hex: + raise Exception("MiniMax returned no audio data") + + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + with open(output_file, "wb") as f: + f.write(bytes.fromhex(audio_hex)) + return f"Successfully generated music to {output_file}" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate music using MiniMax API") + parser.add_argument("--prompt-file", required=True, + help="Absolute path to JSON spec file {title, prompt, lyrics?, is_instrumental?}") + parser.add_argument("--output-file", required=True, help="Output path for generated MP3") + args = parser.parse_args() + + try: + print(generate_music(args.prompt_file, args.output_file)) + except Exception as e: + print(f"Error while generating music: {e}") diff --git a/skills/public/podcast-generation/SKILL.md b/skills/public/podcast-generation/SKILL.md index b78b8dd7e..896a6e936 100644 --- a/skills/public/podcast-generation/SKILL.md +++ b/skills/public/podcast-generation/SKILL.md @@ -64,6 +64,7 @@ Parameters: > - The script handles all TTS API calls and audio generation internally. > - Do NOT read the Python file, just call it with the parameters. > - Always include `--transcript-file` to generate a readable transcript for the user. +> - The TTS provider and its concurrency are selected automatically from environment variables — you do not choose or tune them. 
## Script JSON Format @@ -172,8 +173,8 @@ After generation: ## Requirements The following environment variables must be set: -- `VOLCENGINE_TTS_APPID`: Volcengine TTS application ID -- `VOLCENGINE_TTS_ACCESS_TOKEN`: Volcengine TTS access token +- For Volcengine: `VOLCENGINE_TTS_APPID` and `VOLCENGINE_TTS_ACCESS_TOKEN` +- For MiniMax: `MINIMAX_API_KEY` - `VOLCENGINE_TTS_CLUSTER`: Volcengine TTS cluster (optional, defaults to "volcano_tts") ## Notes @@ -183,3 +184,20 @@ The following environment variables must be set: - Technical content should be simplified for audio accessibility in the script - Complex notations (formulas, code) should be translated to plain language in the script - Long content may result in longer podcasts + +## Providers (Volcengine / MiniMax) + +Auto-selected by environment variables: + +- `VOLCENGINE_TTS_APPID` + `VOLCENGINE_TTS_ACCESS_TOKEN` set → Volcengine TTS (default). +- Only `MINIMAX_API_KEY` set → MiniMax TTS (`/v1/t2a_v2`). +- Force with `PODCAST_GENERATION_PROVIDER=volcengine|minimax`. + +MiniMax overrides: `MINIMAX_API_HOST` (default `https://api.minimaxi.com`), +`MINIMAX_TTS_MODEL` (default `speech-2.6-hd`), `MINIMAX_TTS_VOICE_MALE` +(default `male-qn-qingse`), `MINIMAX_TTS_VOICE_FEMALE` (default `female-tianmei`). + +Concurrency is owned by each provider internally — MiniMax runs single-threaded +to reduce rate-limit failures, Volcengine uses 4 workers. There is no +caller-facing concurrency knob; transient rate limits are handled by automatic +retry with backoff. 
diff --git a/skills/public/podcast-generation/scripts/generate.py b/skills/public/podcast-generation/scripts/generate.py index 7e56cb562..0e65e9afd 100644 --- a/skills/public/podcast-generation/scripts/generate.py +++ b/skills/public/podcast-generation/scripts/generate.py @@ -3,6 +3,8 @@ import base64 import json import logging import os +import random +import time import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Literal, Optional @@ -12,8 +14,14 @@ import requests logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" +# MiniMax base_resp codes worth retrying: unknown, timeout, RPM limit, TPM limit. +MINIMAX_RETRYABLE_CODES = {1000, 1001, 1002, 1039} +DEFAULT_TTS_MAX_RETRIES = 4 +DEFAULT_MAX_WORKERS = 4 +DEFAULT_MINIMAX_MAX_WORKERS = 1 + -# Types class ScriptLine: def __init__(self, speaker: Literal["male", "female"] = "male", paragraph: str = ""): self.speaker = speaker @@ -30,113 +38,243 @@ class Script: script = cls(locale=data.get("locale", "en")) for line in data.get("lines", []): script.lines.append( - ScriptLine( - speaker=line.get("speaker", "male"), - paragraph=line.get("paragraph", ""), - ) + ScriptLine(speaker=line.get("speaker", "male"), + paragraph=line.get("paragraph", "")) ) return script -def text_to_speech(text: str, voice_type: str) -> Optional[bytes]: - """Convert text to speech using Volcengine TTS.""" +def _resolve_provider(override_env: str, existing_provider: str, has_existing_creds: bool) -> str: + override = os.getenv(override_env) + if override: + return override.strip().lower() + if has_existing_creds: + return existing_provider + if os.getenv("MINIMAX_API_KEY"): + return "minimax" + raise ValueError( + f"No credentials found. Set VOLCENGINE_TTS_APPID + VOLCENGINE_TTS_ACCESS_TOKEN " + f"for {existing_provider}, or MINIMAX_API_KEY for minimax " + f"(optionally force with {override_env})." 
+ ) + + +def _resolve_tts_provider() -> str: + has_volc = bool( + os.getenv("VOLCENGINE_TTS_APPID") and os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN") + ) + provider = _resolve_provider("PODCAST_GENERATION_PROVIDER", "volcengine", has_volc) + if provider not in ("volcengine", "minimax"): + raise ValueError( + f"Unknown podcast provider: {provider!r} (use 'volcengine' or 'minimax')" + ) + return provider + + +def _default_max_retries() -> int: + try: + return int(os.getenv("MINIMAX_TTS_MAX_RETRIES", str(DEFAULT_TTS_MAX_RETRIES))) + except ValueError: + return DEFAULT_TTS_MAX_RETRIES + + +def _default_max_workers(provider: str) -> int: + """Each provider owns its own concurrency: MiniMax stays low to avoid rate + limits, Volcengine keeps the historical default. Not user-tunable by design. + """ + if provider == "minimax": + return DEFAULT_MINIMAX_MAX_WORKERS + return DEFAULT_MAX_WORKERS + + +def _parse_retry_after(response) -> Optional[float]: + """Return the server-provided Retry-After (seconds), if any.""" + headers = getattr(response, "headers", None) or {} + value = headers.get("Retry-After") + try: + return float(value) if value else None + except (TypeError, ValueError): + return None + + +def _backoff_sleep(attempt: int, retry_after: Optional[float]) -> None: + """Sleep with exponential backoff + jitter, honoring Retry-After when present. + + Jitter de-synchronizes concurrent workers that all got rate-limited at once, + avoiding a thundering-herd retry storm. + """ + base = retry_after if retry_after else min(2 ** attempt, 30) + time.sleep(base + random.uniform(0, 1)) + + +def text_to_speech_volcengine( + text: str, voice_type: str, max_retries: Optional[int] = None +) -> Optional[bytes]: + """Convert text to speech using Volcengine TTS (returns base64-decoded mp3 bytes). + + Retries with exponential backoff on transient HTTP errors (429 / 5xx). 
+ """ app_id = os.getenv("VOLCENGINE_TTS_APPID") access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN") cluster = os.getenv("VOLCENGINE_TTS_CLUSTER", "volcano_tts") - - if not app_id or not access_token: - raise ValueError( - "VOLCENGINE_TTS_APPID and VOLCENGINE_TTS_ACCESS_TOKEN environment variables must be set" - ) - + if max_retries is None: + max_retries = _default_max_retries() url = "https://openspeech.bytedance.com/api/v1/tts" - - # Authentication: Bearer token with semicolon separator - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer;{access_token}", - } - + headers = {"Content-Type": "application/json", "Authorization": f"Bearer;{access_token}"} payload = { - "app": { - "appid": app_id, - "token": "access_token", # literal string, not the actual token - "cluster": cluster, - }, + "app": {"appid": app_id, "token": "access_token", "cluster": cluster}, "user": {"uid": "podcast-generator"}, - "audio": { - "voice_type": voice_type, - "encoding": "mp3", - "speed_ratio": 1.2, - }, - "request": { - "reqid": str(uuid.uuid4()), # must be unique UUID - "text": text, - "text_type": "plain", - "operation": "query", - }, + "audio": {"voice_type": voice_type, "encoding": "mp3", "speed_ratio": 1.2}, + "request": {"reqid": str(uuid.uuid4()), "text": text, + "text_type": "plain", "operation": "query"}, } - - try: - response = requests.post(url, json=payload, headers=headers) - + for attempt in range(max_retries + 1): + try: + response = requests.post(url, json=payload, headers=headers, timeout=60) + except Exception as e: + logger.error(f"TTS error: {e}") + if attempt < max_retries: + _backoff_sleep(attempt, None) + continue + return None + if response.status_code == 429 or response.status_code >= 500: + logger.warning( + f"Volcengine TTS transient HTTP {response.status_code} " + f"(attempt {attempt + 1}/{max_retries + 1})" + ) + if attempt < max_retries: + _backoff_sleep(attempt, _parse_retry_after(response)) + continue + return None if 
response.status_code != 200: logger.error(f"TTS API error: {response.status_code} - {response.text}") return None - result = response.json() if result.get("code") != 3000: logger.error(f"TTS error: {result.get('message')} (code: {result.get('code')})") return None - audio_data = result.get("data") if audio_data: return base64.b64decode(audio_data) - - except Exception as e: - logger.error(f"TTS error: {str(e)}") - + return None return None -def _process_line(args: tuple[int, ScriptLine, int]) -> tuple[int, Optional[bytes]]: +def text_to_speech_minimax( + text: str, voice_id: str, max_retries: Optional[int] = None +) -> Optional[bytes]: + """Convert text to speech using MiniMax t2a_v2 (returns hex-decoded mp3 bytes). + + Retries with exponential backoff on HTTP 429/5xx and on retryable base_resp + codes (rate/TPM limits, timeouts). Permanent errors (auth, balance, bad input) + are not retried. + """ + api_key = os.getenv("MINIMAX_API_KEY") + host = os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + if max_retries is None: + max_retries = _default_max_retries() + payload = { + "model": os.getenv("MINIMAX_TTS_MODEL", "speech-2.6-hd"), + "text": text, + "voice_setting": {"voice_id": voice_id, "speed": 1.0, "vol": 1.0, "pitch": 0}, + "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "mp3", "channel": 1}, + "output_format": "hex", + } + for attempt in range(max_retries + 1): + try: + response = requests.post( + f"{host}/v1/t2a_v2", + headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, + json=payload, + timeout=60, + ) + except Exception as e: + logger.error(f"MiniMax TTS error: {e}") + if attempt < max_retries: + _backoff_sleep(attempt, None) + continue + return None + if response.status_code == 429 or response.status_code >= 500: + logger.warning( + f"MiniMax TTS rate-limited HTTP {response.status_code} " + f"(attempt {attempt + 1}/{max_retries + 1})" + ) + if attempt < max_retries: + 
_backoff_sleep(attempt, _parse_retry_after(response)) + continue + return None + if response.status_code != 200: + logger.error(f"MiniMax TTS error: {response.status_code} - {response.text}") + return None + result = response.json() + base = result.get("base_resp") or {} + code = base.get("status_code", 0) + if code in MINIMAX_RETRYABLE_CODES: + logger.warning( + f"MiniMax TTS retryable error {code}: {base.get('status_msg')} " + f"(attempt {attempt + 1}/{max_retries + 1})" + ) + if attempt < max_retries: + _backoff_sleep(attempt, None) + continue + return None + if code != 0: + logger.error(f"MiniMax TTS error {code}: {base.get('status_msg')}") + return None + audio_hex = (result.get("data") or {}).get("audio") + if audio_hex: + return bytes.fromhex(audio_hex) + return None + return None + + +def _process_line(args: tuple[int, ScriptLine, int, str]) -> tuple[int, Optional[bytes]]: """Process a single script line for TTS. Returns (index, audio_bytes).""" - i, line, total = args - - # Select voice based on speaker gender - if line.speaker == "male": - voice_type = "zh_male_yangguangqingnian_moon_bigtts" # Male voice + i, line, total, provider = args + logger.info(f"Processing line {i + 1}/{total} ({line.speaker}) via {provider}") + if provider == "minimax": + if line.speaker == "male": + voice = os.getenv("MINIMAX_TTS_VOICE_MALE", "male-qn-qingse") + else: + voice = os.getenv("MINIMAX_TTS_VOICE_FEMALE", "female-tianmei") + audio = text_to_speech_minimax(line.paragraph, voice) else: - voice_type = "zh_female_sajiaonvyou_moon_bigtts" # Female voice - - logger.info(f"Processing line {i + 1}/{total} ({line.speaker})") - audio = text_to_speech(line.paragraph, voice_type) - + if line.speaker == "male": + voice = "zh_male_yangguangqingnian_moon_bigtts" + else: + voice = "zh_female_sajiaonvyou_moon_bigtts" + audio = text_to_speech_volcengine(line.paragraph, voice) if not audio: logger.warning(f"Failed to generate audio for line {i + 1}") - return (i, audio) -def 
tts_node(script: Script, max_workers: int = 4) -> list[bytes]: - """Convert script lines to audio chunks using TTS with multi-threading.""" - logger.info(f"Converting script to audio using {max_workers} workers...") +def tts_node(script: Script) -> list[bytes]: + """Convert script lines to audio chunks using TTS with multi-threading. + Concurrency is owned by the resolved provider (see _default_max_workers); + there is no caller-facing knob. Fails loudly: if any line cannot be + synthesized (even after retries), raise rather than silently emitting an + incomplete podcast. + """ total = len(script.lines) - - # Handle empty script case if total == 0: raise ValueError("Script contains no lines to process") - # Validate required environment variables before starting TTS - if not os.getenv("VOLCENGINE_TTS_APPID") or not os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN"): + provider = _resolve_tts_provider() + max_workers = _default_max_workers(provider) + if provider == "volcengine" and not ( + os.getenv("VOLCENGINE_TTS_APPID") and os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN") + ): raise ValueError( - "Missing required environment variables: VOLCENGINE_TTS_APPID and VOLCENGINE_TTS_ACCESS_TOKEN must be set" + "Volcengine TTS selected but VOLCENGINE_TTS_APPID / " + "VOLCENGINE_TTS_ACCESS_TOKEN are not set" ) + if provider == "minimax" and not os.getenv("MINIMAX_API_KEY"): + raise ValueError("MiniMax TTS selected but MINIMAX_API_KEY is not set") + logger.info(f"Converting script to audio using {max_workers} workers (provider={provider})...") + tasks = [(i, line, total, provider) for i, line in enumerate(script.lines)] - tasks = [(i, line, total) for i, line in enumerate(script.lines)] - - # Use ThreadPoolExecutor for parallel TTS generation results: dict[int, Optional[bytes]] = {} failed_indices: list[int] = [] with ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -144,81 +282,52 @@ def tts_node(script: Script, max_workers: int = 4) -> list[bytes]: for future in 
as_completed(futures): idx, audio = future.result() results[idx] = audio - # Use `not audio` to catch both None and empty bytes if not audio: failed_indices.append(idx) - # Log failed lines with 1-based indices for user-friendly output if failed_indices: - logger.warning( - f"Failed to generate audio for {len(failed_indices)}/{total} lines: " - f"line numbers {sorted(i + 1 for i in failed_indices)}" - ) - - # Collect results in order, skipping failed ones - audio_chunks = [] - for i in range(total): - audio = results.get(i) - if audio: - audio_chunks.append(audio) - - logger.info(f"Generated {len(audio_chunks)}/{total} audio chunks successfully") - - if not audio_chunks: raise ValueError( - f"TTS generation failed for all {total} lines. " - "Please check VOLCENGINE_TTS_APPID and VOLCENGINE_TTS_ACCESS_TOKEN environment variables." + f"TTS failed for {len(failed_indices)}/{total} lines after retries: " + f"line numbers {sorted(i + 1 for i in failed_indices)}. " + f"This is usually transient API rate limiting — wait a moment and retry." 
) - + + audio_chunks = [results[i] for i in range(total)] + logger.info(f"Generated {len(audio_chunks)}/{total} audio chunks successfully") return audio_chunks def mix_audio(audio_chunks: list[bytes]) -> bytes: """Combine audio chunks into a single audio file.""" - logger.info("Mixing audio chunks...") - if not audio_chunks: raise ValueError("No audio chunks to mix - TTS generation may have failed") - output = b"".join(audio_chunks) - if len(output) == 0: raise ValueError("Mixed audio is empty - TTS generation may have failed") - logger.info(f"Audio mixing complete: {len(output)} bytes") return output def generate_markdown(script: Script, title: str = "Podcast Script") -> str: - """Generate a markdown script from the podcast script.""" lines = [f"# {title}", ""] - for line in script.lines: speaker_name = "**Host (Male)**" if line.speaker == "male" else "**Host (Female)**" lines.append(f"{speaker_name}: {line.paragraph}") lines.append("") - return "\n".join(lines) -def generate_podcast( - script_file: str, - output_file: str, - transcript_file: Optional[str] = None, -) -> str: - """Generate a podcast from a script JSON file.""" - - # Read script JSON +def generate_podcast(script_file: str, output_file: str, + transcript_file: Optional[str] = None) -> str: with open(script_file, "r", encoding="utf-8") as f: script_json = json.load(f) - if "lines" not in script_json: - raise ValueError(f"Invalid script format: missing 'lines' key. Got keys: {list(script_json.keys())}") - + raise ValueError( + f"Invalid script format: missing 'lines' key. 
Got keys: {list(script_json.keys())}" + ) script = Script.from_dict(script_json) logger.info(f"Loaded script with {len(script.lines)} lines") - # Generate transcript markdown if requested if transcript_file: title = script_json.get("title", "Podcast Script") markdown_content = generate_markdown(script, title) @@ -229,16 +338,11 @@ def generate_podcast( f.write(markdown_content) logger.info(f"Generated transcript to {transcript_file}") - # Convert to audio audio_chunks = tts_node(script) - if not audio_chunks: raise Exception("Failed to generate any audio") - - # Mix audio output_audio = mix_audio(audio_chunks) - # Save output output_dir = os.path.dirname(output_file) if output_dir: os.makedirs(output_dir, exist_ok=True) @@ -253,30 +357,15 @@ def generate_podcast( if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate podcast from script JSON file") - parser.add_argument( - "--script-file", - required=True, - help="Absolute path to script JSON file", - ) - parser.add_argument( - "--output-file", - required=True, - help="Output path for generated podcast MP3", - ) - parser.add_argument( - "--transcript-file", - required=False, - help="Output path for transcript markdown file (optional)", - ) - + parser.add_argument("--script-file", required=True, help="Absolute path to script JSON file") + parser.add_argument("--output-file", required=True, help="Output path for generated podcast MP3") + parser.add_argument("--transcript-file", required=False, + help="Output path for transcript markdown file (optional)") args = parser.parse_args() try: - result = generate_podcast( - args.script_file, - args.output_file, - args.transcript_file, - ) + result = generate_podcast(args.script_file, args.output_file, + args.transcript_file) print(result) except Exception as e: import traceback diff --git a/skills/public/video-generation/SKILL.md b/skills/public/video-generation/SKILL.md index e0c55b36f..7d8d55b24 100644 --- 
a/skills/public/video-generation/SKILL.md +++ b/skills/public/video-generation/SKILL.md @@ -137,3 +137,15 @@ After generation: - JSON format ensures structured, parsable prompts - Reference image enhance generation quality significantly - Iterative refinement is normal for optimal results + +## Providers (Gemini / MiniMax) + +Auto-selected by environment variables (CLI unchanged): + +- `GEMINI_API_KEY` set → Gemini Veo (default, unchanged). +- Only `MINIMAX_API_KEY` set → MiniMax video (`/v1/video_generation`, async 3-step poll/download). +- Force with `VIDEO_GENERATION_PROVIDER=gemini|minimax`. + +MiniMax overrides: `MINIMAX_API_HOST` (default `https://api.minimaxi.com`), +`MINIMAX_VIDEO_MODEL` (default `MiniMax-Hailuo-2.3`). The first reference image is used +as MiniMax `first_frame_image`. MiniMax ignores `--aspect-ratio` (it uses resolution/duration). diff --git a/skills/public/video-generation/scripts/generate.py b/skills/public/video-generation/scripts/generate.py index 6f28f57e4..94e26e34e 100644 --- a/skills/public/video-generation/scripts/generate.py +++ b/skills/public/video-generation/scripts/generate.py @@ -4,6 +4,185 @@ import time import requests +MINIMAX_DEFAULT_HOST = "https://api.minimaxi.com" + + +def _resolve_provider(override_env: str, existing_provider: str, has_existing_creds: bool) -> str: + """Pick the provider: _PROVIDER override > existing creds > MiniMax fallback.""" + override = os.getenv(override_env) + if override: + return override.strip().lower() + if has_existing_creds: + return existing_provider + if os.getenv("MINIMAX_API_KEY"): + return "minimax" + raise ValueError( + f"No credentials found. Set GEMINI_API_KEY for {existing_provider}, " + f"or MINIMAX_API_KEY for minimax (optionally force with {override_env})." 
+ ) + + +def _minimax_host() -> str: + return os.getenv("MINIMAX_API_HOST", MINIMAX_DEFAULT_HOST).rstrip("/") + + +def _ensure_output_dir(output_file: str) -> None: + """Create the output file's parent directory so nested paths don't fail.""" + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + +def _check_base_resp(payload: dict) -> None: + base = payload.get("base_resp") or {} + if base.get("status_code", 0) != 0: + raise Exception(f"MiniMax error {base.get('status_code')}: {base.get('status_msg')}") + + +def _guess_mime(image_path: str) -> str: + ext = os.path.splitext(image_path)[1].lower() + return { + ".png": "image/png", + ".webp": "image/webp", + ".gif": "image/gif", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + }.get(ext, "image/jpeg") + + +def _to_data_url(image_path: str) -> str: + with open(image_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{_guess_mime(image_path)};base64,{b64}" + + +def _poll_video_task(host: str, auth: str, task_id: str, + max_attempts: int = 120, interval: int = 3) -> str: + for _ in range(max_attempts): + response = requests.get( + f"{host}/v1/query/video_generation", + headers={"Authorization": auth}, + params={"task_id": task_id}, + timeout=30, + ) + response.raise_for_status() + payload = response.json() + status = payload.get("status") + if status == "Success": + return payload["file_id"] + if status == "Fail": + base = payload.get("base_resp") or {} + raise Exception( + f"MiniMax video task {task_id} failed: " + f"{base.get('status_code')} {base.get('status_msg')}" + ) + # Surface query-level errors (bad task_id, auth) that arrive as a non-zero + # base_resp without a terminal status, then keep polling. 
+ _check_base_resp(payload) + time.sleep(interval) + raise Exception(f"MiniMax video task {task_id} timed out after {max_attempts} polls") + + +def _retrieve_file_url(host: str, auth: str, file_id: str) -> str: + response = requests.get( + f"{host}/v1/files/retrieve", + headers={"Authorization": auth}, + params={"file_id": file_id}, + timeout=30, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + return payload["file"]["download_url"] + + +def _download(url: str, output_file: str) -> None: + response = requests.get(url, timeout=300) + response.raise_for_status() + _ensure_output_dir(output_file) + with open(output_file, "wb") as f: + f.write(response.content) + + +def _generate_video_minimax( + prompt: str, reference_images: list[str], output_file: str +) -> str: + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + return "MINIMAX_API_KEY is not set" + host = _minimax_host() + auth = f"Bearer {api_key}" + body = {"model": os.getenv("MINIMAX_VIDEO_MODEL", "MiniMax-Hailuo-2.3"), "prompt": prompt} + if reference_images: + body["first_frame_image"] = _to_data_url(reference_images[0]) + response = requests.post( + f"{host}/v1/video_generation", + headers={"Authorization": auth, "Content-Type": "application/json"}, + json=body, + timeout=60, + ) + response.raise_for_status() + payload = response.json() + _check_base_resp(payload) + task_id = payload["task_id"] + file_id = _poll_video_task(host, auth, task_id) + download_url = _retrieve_file_url(host, auth, file_id) + _download(download_url, output_file) + return f"The video has been generated successfully to {output_file}" + + +def download(url: str, output_file: str) -> None: + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + raise ValueError("GEMINI_API_KEY is not set") + response = requests.get(url, headers={"x-goog-api-key": api_key}, timeout=300) + response.raise_for_status() + _ensure_output_dir(output_file) + with open(output_file, "wb") as f: + 
f.write(response.content) + + +def _generate_video_gemini( + prompt: str, reference_images: list[str], output_file: str +) -> str: + reference_payload = [] + request_json = {"instances": [{"prompt": prompt}]} + for reference_image in reference_images: + with open(reference_image, "rb") as f: + image_b64 = base64.b64encode(f.read()).decode("utf-8") + reference_payload.append( + {"image": {"mimeType": "image/jpeg", "bytesBase64Encoded": image_b64}, + "referenceType": "asset"} + ) + if reference_payload: + request_json["instances"][0]["referenceImages"] = reference_payload + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + return "GEMINI_API_KEY is not set" + response = requests.post( + "https://generativelanguage.googleapis.com/v1beta/models/veo-3.1-generate-preview:predictLongRunning", + headers={"x-goog-api-key": api_key, "Content-Type": "application/json"}, + json=request_json, + timeout=60, + ) + response.raise_for_status() + data = response.json() + operation_name = data["name"] + while True: + response = requests.get( + f"https://generativelanguage.googleapis.com/v1beta/{operation_name}", + headers={"x-goog-api-key": api_key}, + timeout=30, + ) + response.raise_for_status() + data = response.json() + if data.get("done", False): + sample = data["response"]["generateVideoResponse"]["generatedSamples"][0] + download(sample["video"]["uri"], output_file) + break + time.sleep(3) + return f"The video has been generated successfully to {output_file}" + def generate_video( prompt_file: str, @@ -13,104 +192,31 @@ def generate_video( ) -> str: with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() - referenceImages = [] - i = 0 - json = { - "instances": [{"prompt": prompt}], - } - for reference_image in reference_images: - i += 1 - with open(reference_image, "rb") as f: - image_b64 = base64.b64encode(f.read()).decode("utf-8") - referenceImages.append( - { - "image": {"mimeType": "image/jpeg", "bytesBase64Encoded": image_b64}, - "referenceType": 
"asset", - } - ) - if i > 0: - json["instances"][0]["referenceImages"] = referenceImages - api_key = os.getenv("GEMINI_API_KEY") - if not api_key: - return "GEMINI_API_KEY is not set" - response = requests.post( - "https://generativelanguage.googleapis.com/v1beta/models/veo-3.1-generate-preview:predictLongRunning", - headers={ - "x-goog-api-key": api_key, - "Content-Type": "application/json", - }, - json=json, + provider = _resolve_provider( + "VIDEO_GENERATION_PROVIDER", "gemini", bool(os.getenv("GEMINI_API_KEY")) ) - json = response.json() - operation_name = json["name"] - while True: - response = requests.get( - f"https://generativelanguage.googleapis.com/v1beta/{operation_name}", - headers={ - "x-goog-api-key": api_key, - }, - ) - json = response.json() - if json.get("done", False): - sample = json["response"]["generateVideoResponse"]["generatedSamples"][0] - url = sample["video"]["uri"] - download(url, output_file) - break - time.sleep(3) - return f"The video has been generated successfully to {output_file}" - - -def download(url: str, output_file: str): - api_key = os.getenv("GEMINI_API_KEY") - if not api_key: - return "GEMINI_API_KEY is not set" - response = requests.get( - url, - headers={ - "x-goog-api-key": api_key, - }, - ) - with open(output_file, "wb") as f: - f.write(response.content) + if provider == "minimax": + # MiniMax video uses resolution/duration, not aspect_ratio; aspect_ratio ignored. 
+ return _generate_video_minimax(prompt, reference_images, output_file) + if provider in ("gemini", "google"): + return _generate_video_gemini(prompt, reference_images, output_file) + raise ValueError(f"Unknown video provider: {provider!r} (use 'gemini' or 'minimax')") if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Generate videos using Gemini API") - parser.add_argument( - "--prompt-file", - required=True, - help="Absolute path to JSON prompt file", - ) - parser.add_argument( - "--reference-images", - nargs="*", - default=[], - help="Absolute paths to reference images (space-separated)", - ) - parser.add_argument( - "--output-file", - required=True, - help="Output path for generated image", - ) - parser.add_argument( - "--aspect-ratio", - required=False, - default="16:9", - help="Aspect ratio of the generated image", - ) - + parser = argparse.ArgumentParser(description="Generate videos using Gemini or MiniMax API") + parser.add_argument("--prompt-file", required=True, help="Absolute path to JSON prompt file") + parser.add_argument("--reference-images", nargs="*", default=[], + help="Absolute paths to reference images (space-separated)") + parser.add_argument("--output-file", required=True, help="Output path for generated video") + parser.add_argument("--aspect-ratio", required=False, default="16:9", + help="Aspect ratio of the generated video (Gemini only)") args = parser.parse_args() try: - print( - generate_video( - args.prompt_file, - args.reference_images, - args.output_file, - args.aspect_ratio, - ) - ) + print(generate_video(args.prompt_file, args.reference_images, + args.output_file, args.aspect_ratio)) except Exception as e: print(f"Error while generating video: {e}") diff --git a/tests/skills/skill_loader.py b/tests/skills/skill_loader.py new file mode 100644 index 000000000..8ef5c2385 --- /dev/null +++ b/tests/skills/skill_loader.py @@ -0,0 +1,39 @@ +"""Load a skill's scripts/generate.py as an importable 
module, by file path. + +Skills live in skills/public/{skill_name}/scripts/generate.py and are NOT a package, +so tests load them via importlib. Tests then mock the module's `requests`. +""" +import importlib.util +import sys +from pathlib import Path + +import requests + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load(skill_name: str): + """Return the generate.py module for skills/public/{skill_name}.""" + path = REPO_ROOT / "skills" / "public" / skill_name / "scripts" / "generate.py" + mod_name = skill_name.replace("-", "_") + "_generate" + spec = importlib.util.spec_from_file_location(mod_name, path) + module = importlib.util.module_from_spec(spec) + sys.modules[mod_name] = module # standard pattern; lets the module resolve itself + spec.loader.exec_module(module) + return module + + +class FakeResp: + """Minimal stand-in for requests.Response.""" + + def __init__(self, json_data=None, content=b"", status_code=200): + self._json = json_data if json_data is not None else {} + self.content = content + self.status_code = status_code + + def raise_for_status(self): + if self.status_code >= 400: + raise requests.HTTPError(f"HTTP {self.status_code}") + + def json(self): + return self._json diff --git a/tests/skills/test_image_generation.py b/tests/skills/test_image_generation.py new file mode 100644 index 000000000..0992f24f0 --- /dev/null +++ b/tests/skills/test_image_generation.py @@ -0,0 +1,195 @@ +import base64 +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +img = load("image-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["GEMINI_API_KEY", "MINIMAX_API_KEY", "IMAGE_GENERATION_PROVIDER", + "MINIMAX_API_HOST", "MINIMAX_IMAGE_MODEL"]: + monkeypatch.delenv(k, raising=False) + + +def test_resolve_prefers_gemini(monkeypatch): + monkeypatch.setenv("GEMINI_API_KEY", "g") + monkeypatch.setenv("MINIMAX_API_KEY", "m") +
assert img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", True) == "gemini" + + +def test_resolve_falls_back_to_minimax(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", False) == "minimax" + + +def test_resolve_override_wins(monkeypatch): + monkeypatch.setenv("GEMINI_API_KEY", "g") + monkeypatch.setenv("IMAGE_GENERATION_PROVIDER", "MiniMax") + assert img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", True) == "minimax" + + +def test_resolve_errors_when_none(monkeypatch): + with pytest.raises(ValueError): + img._resolve_provider("IMAGE_GENERATION_PROVIDER", "gemini", False) + + +def test_minimax_builds_payload_and_writes(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + raw = b"PNGBYTES" + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + return FakeResp({"data": {"image_base64": [base64.b64encode(raw).decode()]}, + "base_resp": {"status_code": 0, "status_msg": "success"}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + out = tmp_path / "o.jpg" + prompt_file = tmp_path / "p.json" + prompt_file.write_text("a red apple", encoding="utf-8") + msg = img.generate_image(str(prompt_file), [], str(out), "16:9") + + assert out.read_bytes() == raw + assert captured["url"].endswith("/v1/image_generation") + assert captured["headers"]["Authorization"] == "Bearer m" + assert captured["json"]["model"] == "image-01" + assert captured["json"]["response_format"] == "base64" + assert captured["json"]["aspect_ratio"] == "16:9" + assert captured["json"]["n"] == 1 + assert captured["json"]["prompt_optimizer"] is True + assert "Successfully generated image" in msg + + +def test_minimax_reference_image_as_data_url(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + 
captured["json"] = json + return FakeResp({"data": {"image_base64": [base64.b64encode(b"x").decode()]}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + ref = tmp_path / "ref.jpg" + ref.write_bytes(b"\xff\xd8refbytes") + prompt_file = tmp_path / "p.json" + prompt_file.write_text("scene", encoding="utf-8") + img.generate_image(str(prompt_file), [str(ref)], str(tmp_path / "o.jpg"), "1:1") + + subj = captured["json"]["subject_reference"] + assert subj[0]["type"] == "character" + assert subj[0]["image_file"].startswith("data:image/jpeg;base64,") + import base64 as _b64 + encoded = subj[0]["image_file"].split(",", 1)[1] + assert _b64.b64decode(encoded) == b"\xff\xd8refbytes" + + +def test_minimax_raises_on_base_resp_error(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"base_resp": {"status_code": 1004, "status_msg": "auth failed"}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + prompt_file = tmp_path / "p.json" + prompt_file.write_text("x", encoding="utf-8") + with pytest.raises(Exception) as e: + img.generate_image(str(prompt_file), [], str(tmp_path / "o.jpg"), "1:1") + assert "1004" in str(e.value) + + +def test_minimax_extracts_json_prompt_field(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["json"] = json + return FakeResp({"data": {"image_base64": [base64.b64encode(b"x").decode()]}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + prompt_file = tmp_path / "p.json" + prompt_file.write_text( + '{"prompt": "a red barn at dawn", "style": "watercolor", ' + '"composition": "rule of thirds", "negative_prompt": "blurry"}', + encoding="utf-8", + ) + img.generate_image(str(prompt_file), [], str(tmp_path / "o.jpg"), "16:9") + + # Only the JSON `prompt` field reaches MiniMax — 
no other fields, no JSON syntax. + assert captured["json"]["prompt"] == "a red barn at dawn" + assert captured["json"]["prompt_optimizer"] is True + + +def test_minimax_plaintext_prompt_passes_through(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["json"] = json + return FakeResp({"data": {"image_base64": [base64.b64encode(b"x").decode()]}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + prompt_file = tmp_path / "p.txt" + prompt_file.write_text("a red apple on a table", encoding="utf-8") + img.generate_image(str(prompt_file), [], str(tmp_path / "o.jpg"), "1:1") + + assert captured["json"]["prompt"] == "a red apple on a table" + + +def test_minimax_rejects_overlong_prompt_without_calling_api(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): # pragma: no cover + raise AssertionError("must not call the API when the prompt is over the limit") + + monkeypatch.setattr(img.requests, "post", fake_post) + prompt_file = tmp_path / "p.json" + prompt_file.write_text('{"prompt": "' + "x" * 1600 + '"}', encoding="utf-8") + out = tmp_path / "o.jpg" + msg = img.generate_image(str(prompt_file), [], str(out), "16:9") + + assert "1500" in msg + assert "character" in msg.lower() + assert not out.exists() + + +def test_minimax_creates_nested_output_dir(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"data": {"image_base64": [base64.b64encode(b"img").decode()]}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(img.requests, "post", fake_post) + prompt_file = tmp_path / "p.txt" + prompt_file.write_text("a cat", encoding="utf-8") + out = tmp_path / "nested" / "dir" / "o.jpg" + img.generate_image(str(prompt_file), [], str(out), "1:1") + + assert out.read_bytes() == 
b"img" + + +def test_unknown_provider_raises(monkeypatch, tmp_path): + monkeypatch.setenv("IMAGE_GENERATION_PROVIDER", "openai") + monkeypatch.setenv("GEMINI_API_KEY", "g") + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + with pytest.raises(ValueError): + img.generate_image(str(pf), [], str(tmp_path / "o.jpg"), "1:1") + + +def test_guess_mime_by_extension(): + assert img._guess_mime("/a/b.png") == "image/png" + assert img._guess_mime("/a/b.webp") == "image/webp" + assert img._guess_mime("/a/b.jpg") == "image/jpeg" + assert img._guess_mime("/a/b.unknown") == "image/jpeg" diff --git a/tests/skills/test_music_generation.py b/tests/skills/test_music_generation.py new file mode 100644 index 000000000..5cce4c126 --- /dev/null +++ b/tests/skills/test_music_generation.py @@ -0,0 +1,135 @@ +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +mus = load("music-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["MINIMAX_API_KEY", "MINIMAX_API_HOST", "MINIMAX_MUSIC_MODEL"]: + monkeypatch.delenv(k, raising=False) + + +def _post_ok(captured): + def fake_post(url, headers=None, json=None, **kw): + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + return FakeResp({"data": {"audio": b"songbytes".hex(), "status": 2}, + "base_resp": {"status_code": 0}}) + return fake_post + + +def test_with_lyrics_payload_and_writes(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"title":"X","prompt":"pop, happy","lyrics":"[verse]\\nla la"}', + encoding="utf-8") + out = tmp_path / "o.mp3" + msg = mus.generate_music(str(spec), str(out)) + assert out.read_bytes() == b"songbytes" + assert captured["url"].endswith("/v1/music_generation") + assert 
captured["headers"]["Authorization"] == "Bearer m" + assert captured["json"]["model"] == "music-2.6-free" + assert captured["json"]["lyrics"] == "[verse]\nla la" + assert captured["json"]["output_format"] == "hex" + assert "Successfully generated music" in msg + + +def test_instrumental_sets_flag(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"lofi beats","is_instrumental":true}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["is_instrumental"] is True + assert "lyrics" not in captured["json"] + assert "lyrics_optimizer" not in captured["json"] + + +def test_no_lyrics_uses_optimizer(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"sad ballad"}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["lyrics_optimizer"] is True + assert "lyrics" not in captured["json"] + + +def test_model_override(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + monkeypatch.setenv("MINIMAX_MUSIC_MODEL", "music-2.6") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"jazz","lyrics":"[verse]\\nhi"}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["model"] == "music-2.6" + + +def test_raises_on_base_resp_error(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"base_resp": {"status_code": 1008, "status_msg": "no balance"}}) + + monkeypatch.setattr(mus.requests, "post", fake_post) + spec = tmp_path / "s.json" + 
spec.write_text('{"prompt":"x","lyrics":"[verse]\\ny"}', encoding="utf-8") + with pytest.raises(Exception) as e: + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert "1008" in str(e.value) + + +def test_missing_api_key_returns_message(monkeypatch, tmp_path): + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"x"}', encoding="utf-8") + msg = mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert "MINIMAX_API_KEY" in msg + + +def test_raises_on_missing_audio_data(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"base_resp": {"status_code": 0}}) # no "data" key + + monkeypatch.setattr(mus.requests, "post", fake_post) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"x"}', encoding="utf-8") + with pytest.raises(Exception, match="no audio data"): + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + + +def test_empty_prompt_raises(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): # pragma: no cover + raise AssertionError("must not call the API when prompt is missing") + + monkeypatch.setattr(mus.requests, "post", fake_post) + spec = tmp_path / "s.json" + spec.write_text('{"title":"X","lyrics":"[verse]\\nhi"}', encoding="utf-8") # no prompt + with pytest.raises(ValueError, match="prompt"): + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + + +def test_empty_lyrics_falls_back_to_optimizer(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + monkeypatch.setattr(mus.requests, "post", _post_ok(captured)) + spec = tmp_path / "s.json" + spec.write_text('{"prompt":"x","lyrics":""}', encoding="utf-8") + mus.generate_music(str(spec), str(tmp_path / "o.mp3")) + assert captured["json"]["lyrics_optimizer"] is True + assert "lyrics" not in captured["json"] diff --git a/tests/skills/test_podcast_generation.py 
b/tests/skills/test_podcast_generation.py new file mode 100644 index 000000000..222a9f51e --- /dev/null +++ b/tests/skills/test_podcast_generation.py @@ -0,0 +1,253 @@ +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +pod = load("podcast-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["VOLCENGINE_TTS_APPID", "VOLCENGINE_TTS_ACCESS_TOKEN", "VOLCENGINE_TTS_CLUSTER", + "MINIMAX_API_KEY", "PODCAST_GENERATION_PROVIDER", "MINIMAX_API_HOST", + "MINIMAX_TTS_MODEL", "MINIMAX_TTS_VOICE_MALE", "MINIMAX_TTS_VOICE_FEMALE", + "MINIMAX_TTS_MAX_RETRIES"]: + monkeypatch.delenv(k, raising=False) + # never actually sleep during backoff in tests + monkeypatch.setattr(pod.time, "sleep", lambda *_: None) + + +def test_resolve_prefers_volcengine(monkeypatch): + monkeypatch.setenv("VOLCENGINE_TTS_APPID", "a") + monkeypatch.setenv("VOLCENGINE_TTS_ACCESS_TOKEN", "t") + assert pod._resolve_tts_provider() == "volcengine" + + +def test_resolve_falls_back_to_minimax(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert pod._resolve_tts_provider() == "minimax" + + +def test_resolve_override(monkeypatch): + monkeypatch.setenv("VOLCENGINE_TTS_APPID", "a") + monkeypatch.setenv("VOLCENGINE_TTS_ACCESS_TOKEN", "t") + monkeypatch.setenv("PODCAST_GENERATION_PROVIDER", "minimax") + assert pod._resolve_tts_provider() == "minimax" + + +def test_resolve_unknown_raises(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + monkeypatch.setenv("PODCAST_GENERATION_PROVIDER", "openai") + with pytest.raises(ValueError): + pod._resolve_tts_provider() + + +def test_minimax_tts_decodes_hex(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + + def fake_post(url, headers=None, json=None, **kw): + captured["url"] = url + captured["json"] = json + return FakeResp({"data": {"audio": b"audiobytes".hex(), "status": 2}, 
+ "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_minimax("hello", "male-qn-qingse") + assert out == b"audiobytes" + assert captured["url"].endswith("/v1/t2a_v2") + assert captured["json"]["voice_setting"]["voice_id"] == "male-qn-qingse" + assert captured["json"]["output_format"] == "hex" + + +def test_process_line_minimax_voice_mapping(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + seen = {} + + def fake_tts(text, voice_id): + seen["voice_id"] = voice_id + return b"x" + + monkeypatch.setattr(pod, "text_to_speech_minimax", fake_tts) + line = pod.ScriptLine(speaker="female", paragraph="hi") + idx, audio = pod._process_line((0, line, 1, "minimax")) + assert audio == b"x" + assert seen["voice_id"] == "female-tianmei" + + +def test_generate_podcast_minimax_end_to_end(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"data": {"audio": b"chunk".hex(), "status": 2}, + "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(pod.requests, "post", fake_post) + script = tmp_path / "s.json" + script.write_text( + '{"title":"T","locale":"en","lines":[{"speaker":"male","paragraph":"a"},' + '{"speaker":"female","paragraph":"b"}]}', + encoding="utf-8", + ) + out = tmp_path / "o.mp3" + msg = pod.generate_podcast(str(script), str(out), None) + assert out.read_bytes() == b"chunkchunk" + assert "Successfully generated podcast" in msg + + +def test_volcengine_tts_decodes_base64(monkeypatch): + import base64 + monkeypatch.setenv("VOLCENGINE_TTS_APPID", "a") + monkeypatch.setenv("VOLCENGINE_TTS_ACCESS_TOKEN", "t") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"code": 3000, "data": base64.b64encode(b"volcbytes").decode()}) + + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_volcengine("hi", "zh_male_yangguangqingnian_moon_bigtts") + assert out == 
b"volcbytes" + + +def test_volcengine_without_creds_raises(monkeypatch): + monkeypatch.setenv("PODCAST_GENERATION_PROVIDER", "volcengine") + script = pod.Script(lines=[pod.ScriptLine("male", "a")]) + with pytest.raises(ValueError): + pod.tts_node(script) + + +def test_process_line_minimax_male_and_override(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + seen = [] + + def fake_tts(text, voice_id): + seen.append(voice_id) + return b"x" + + monkeypatch.setattr(pod, "text_to_speech_minimax", fake_tts) + male = pod.ScriptLine(speaker="male", paragraph="hi") + pod._process_line((0, male, 1, "minimax")) + assert seen[-1] == "male-qn-qingse" + monkeypatch.setenv("MINIMAX_TTS_VOICE_MALE", "custom-male") + pod._process_line((0, male, 1, "minimax")) + assert seen[-1] == "custom-male" + + +def _seq_post(responses): + """Return a fake requests.post that yields the given responses in order.""" + calls = {"n": 0} + + def fake_post(*a, **k): + resp = responses[min(calls["n"], len(responses) - 1)] + calls["n"] += 1 + return resp + + return fake_post, calls + + +def test_minimax_retries_on_rate_limit_code(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + fake_post, calls = _seq_post([ + FakeResp({"base_resp": {"status_code": 1002, "status_msg": "rate limit"}}), + FakeResp({"base_resp": {"status_code": 1039, "status_msg": "tpm limit"}}), + FakeResp({"data": {"audio": b"ok".hex()}, "base_resp": {"status_code": 0}}), + ]) + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_minimax("hi", "male-qn-qingse", max_retries=3) + assert out == b"ok" + assert calls["n"] == 3 # two retries then success + + +def test_minimax_retries_on_http_429(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + fake_post, calls = _seq_post([ + FakeResp({}, status_code=429), + FakeResp({"data": {"audio": b"ok".hex()}, "base_resp": {"status_code": 0}}), + ]) + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_minimax("hi", 
"male-qn-qingse", max_retries=3) + assert out == b"ok" + assert calls["n"] == 2 + + +def test_minimax_no_retry_on_auth_error(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + fake_post, calls = _seq_post([ + FakeResp({"base_resp": {"status_code": 1004, "status_msg": "auth failed"}}), + FakeResp({"data": {"audio": b"never".hex()}, "base_resp": {"status_code": 0}}), + ]) + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_minimax("hi", "male-qn-qingse", max_retries=3) + assert out is None + assert calls["n"] == 1 # permanent error: no retry + + +def test_minimax_gives_up_after_max_retries(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + fake_post, calls = _seq_post([ + FakeResp({"base_resp": {"status_code": 1002, "status_msg": "rate limit"}}), + ]) + monkeypatch.setattr(pod.requests, "post", fake_post) + out = pod.text_to_speech_minimax("hi", "male-qn-qingse", max_retries=2) + assert out is None + assert calls["n"] == 3 # initial attempt + 2 retries + + +def test_tts_node_raises_on_partial_failure(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + calls = {"n": 0} + + def fake_tts(text, voice_id, **kw): + calls["n"] += 1 + return b"x" if calls["n"] == 1 else None + + monkeypatch.setattr(pod, "text_to_speech_minimax", fake_tts) + script = pod.Script(lines=[pod.ScriptLine("male", "a"), pod.ScriptLine("female", "b")]) + with pytest.raises(ValueError) as e: + pod.tts_node(script) + assert "2" in str(e.value) # mentions failed line number 2 + + +def test_tts_node_defaults_to_one_worker_for_minimax(monkeypatch): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + captured = {} + real_executor = pod.ThreadPoolExecutor + + class CapturingExecutor(real_executor): + def __init__(self, *args, **kwargs): + captured["max_workers"] = kwargs.get("max_workers", args[0] if args else None) + super().__init__(*args, **kwargs) + + def fake_tts(text, voice_id): + return b"x" + + monkeypatch.setattr(pod, "ThreadPoolExecutor", 
CapturingExecutor) + monkeypatch.setattr(pod, "text_to_speech_minimax", fake_tts) + script = pod.Script(lines=[pod.ScriptLine("male", "a"), pod.ScriptLine("female", "b")]) + + assert pod.tts_node(script) == [b"x", b"x"] + assert captured["max_workers"] == 1 + + +def test_tts_node_keeps_four_worker_default_for_volcengine(monkeypatch): + monkeypatch.setenv("VOLCENGINE_TTS_APPID", "a") + monkeypatch.setenv("VOLCENGINE_TTS_ACCESS_TOKEN", "t") + captured = {} + real_executor = pod.ThreadPoolExecutor + + class CapturingExecutor(real_executor): + def __init__(self, *args, **kwargs): + captured["max_workers"] = kwargs.get("max_workers", args[0] if args else None) + super().__init__(*args, **kwargs) + + def fake_tts(text, voice_type): + return b"x" + + monkeypatch.setattr(pod, "ThreadPoolExecutor", CapturingExecutor) + monkeypatch.setattr(pod, "text_to_speech_volcengine", fake_tts) + script = pod.Script(lines=[pod.ScriptLine("male", "a"), pod.ScriptLine("female", "b")]) + + assert pod.tts_node(script) == [b"x", b"x"] + assert captured["max_workers"] == 4 diff --git a/tests/skills/test_video_generation.py b/tests/skills/test_video_generation.py new file mode 100644 index 000000000..07e2894e7 --- /dev/null +++ b/tests/skills/test_video_generation.py @@ -0,0 +1,187 @@ +import sys +from pathlib import Path + +import pytest +import requests + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from skill_loader import FakeResp, load # noqa: E402 + +vid = load("video-generation") + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for k in ["GEMINI_API_KEY", "MINIMAX_API_KEY", "VIDEO_GENERATION_PROVIDER", + "MINIMAX_API_HOST", "MINIMAX_VIDEO_MODEL"]: + monkeypatch.delenv(k, raising=False) + monkeypatch.setattr(vid.time, "sleep", lambda *_: None) + + +def test_resolve_prefers_gemini(): + assert vid._resolve_provider("VIDEO_GENERATION_PROVIDER", "gemini", True) == "gemini" + + +def test_resolve_falls_back_to_minimax(monkeypatch): + 
monkeypatch.setenv("MINIMAX_API_KEY", "m") + assert vid._resolve_provider("VIDEO_GENERATION_PROVIDER", "gemini", False) == "minimax" + + +def test_resolve_override(monkeypatch): + monkeypatch.setenv("VIDEO_GENERATION_PROVIDER", "minimax") + assert vid._resolve_provider("VIDEO_GENERATION_PROVIDER", "gemini", True) == "minimax" + + +def test_unknown_provider_raises(monkeypatch, tmp_path): + monkeypatch.setenv("VIDEO_GENERATION_PROVIDER", "openai") + monkeypatch.setenv("GEMINI_API_KEY", "g") + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + with pytest.raises(ValueError): + vid.generate_video(str(pf), [], str(tmp_path / "v.mp4"), "16:9") + + +def test_minimax_full_flow(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + posts = {} + + def fake_post(url, headers=None, json=None, **kw): + posts["url"] = url + posts["json"] = json + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + if url.endswith("/v1/query/video_generation"): + assert params["task_id"] == "T1" + return FakeResp({"status": "Success", "file_id": "F1", + "base_resp": {"status_code": 0}}) + if url.endswith("/v1/files/retrieve"): + assert params["file_id"] == "F1" + return FakeResp({"file": {"download_url": "https://dl/v.mp4"}, + "base_resp": {"status_code": 0}}) + return FakeResp(content=b"MP4DATA") # the actual download + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + + out = tmp_path / "v.mp4" + pf = tmp_path / "p.json" + pf.write_text("a cat runs", encoding="utf-8") + msg = vid.generate_video(str(pf), [], str(out), "16:9") + + assert out.read_bytes() == b"MP4DATA" + assert posts["url"].endswith("/v1/video_generation") + assert posts["json"]["model"] == "MiniMax-Hailuo-2.3" + assert "successfully" in msg.lower() + + +def test_minimax_reference_first_frame(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + posts 
= {} + + def fake_post(url, headers=None, json=None, **kw): + posts["json"] = json + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + if url.endswith("/v1/query/video_generation"): + return FakeResp({"status": "Success", "file_id": "F1", "base_resp": {"status_code": 0}}) + if url.endswith("/v1/files/retrieve"): + return FakeResp({"file": {"download_url": "https://dl/v.mp4"}, "base_resp": {"status_code": 0}}) + return FakeResp(content=b"X") + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + ref = tmp_path / "f.jpg" + ref.write_bytes(b"\xff\xd8img") + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + vid.generate_video(str(pf), [str(ref)], str(tmp_path / "v.mp4"), "16:9") + assert posts["json"]["first_frame_image"].startswith("data:image/jpeg;base64,") + + +def test_minimax_task_fail(monkeypatch, tmp_path): + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + return FakeResp({"status": "Fail", "base_resp": {"status_code": 1027, "status_msg": "blocked"}}) + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + with pytest.raises(Exception): + vid.generate_video(str(pf), [], str(tmp_path / "v.mp4"), "16:9") + + +def test_minimax_poll_timeout(monkeypatch): + def fake_get(url, headers=None, params=None, **kw): + return FakeResp({"status": "Processing", "base_resp": {"status_code": 0}}) + + monkeypatch.setattr(vid.requests, "get", fake_get) + with pytest.raises(Exception) as e: + vid._poll_video_task("https://h", "Bearer m", "T1", max_attempts=3, interval=0) + assert "timed out" in str(e.value) + + +def 
test_minimax_task_fail_keeps_task_context(monkeypatch, tmp_path): + # A Fail status takes priority over the generic base_resp check, so the + # error keeps the task_id and the task-level failure message. + monkeypatch.setenv("MINIMAX_API_KEY", "m") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp({"task_id": "T1", "base_resp": {"status_code": 0}}) + + def fake_get(url, headers=None, params=None, **kw): + return FakeResp({"status": "Fail", "base_resp": {"status_code": 1027, "status_msg": "blocked"}}) + + monkeypatch.setattr(vid.requests, "post", fake_post) + monkeypatch.setattr(vid.requests, "get", fake_get) + pf = tmp_path / "p.json" + pf.write_text("x", encoding="utf-8") + with pytest.raises(Exception, match="task T1 failed"): + vid.generate_video(str(pf), [], str(tmp_path / "v.mp4"), "16:9") + + +def test_gemini_download_raises_on_http_error(monkeypatch, tmp_path): + monkeypatch.setenv("GEMINI_API_KEY", "g") + calls = {} + + def fake_get(url, headers=None, **kw): + calls["timeout"] = kw.get("timeout") + return FakeResp(content=b"error page", status_code=500) + + monkeypatch.setattr(vid.requests, "get", fake_get) + out = tmp_path / "sub" / "v.mp4" + with pytest.raises(requests.HTTPError): + vid.download("https://dl/v.mp4", str(out)) + assert calls["timeout"] # a timeout is now passed + assert not out.exists() + + +def test_gemini_download_writes_nested_dir(monkeypatch, tmp_path): + monkeypatch.setenv("GEMINI_API_KEY", "g") + + def fake_get(url, headers=None, **kw): + return FakeResp(content=b"VIDEO") + + monkeypatch.setattr(vid.requests, "get", fake_get) + out = tmp_path / "nested" / "dir" / "v.mp4" + vid.download("https://dl/v.mp4", str(out)) + assert out.read_bytes() == b"VIDEO" + + +def test_gemini_post_raises_on_http_error(monkeypatch, tmp_path): + monkeypatch.setenv("GEMINI_API_KEY", "g") + + def fake_post(url, headers=None, json=None, **kw): + return FakeResp(status_code=503) + + monkeypatch.setattr(vid.requests, "post", fake_post) + 
pf = tmp_path / "p.json" + pf.write_text("a cat", encoding="utf-8") + with pytest.raises(requests.HTTPError): + vid.generate_video(str(pf), [], str(tmp_path / "v.mp4"), "16:9")