diff --git a/backend/packages/harness/deerflow/skills/parser.py b/backend/packages/harness/deerflow/skills/parser.py index 54de8cbac..7571bd9fc 100644 --- a/backend/packages/harness/deerflow/skills/parser.py +++ b/backend/packages/harness/deerflow/skills/parser.py @@ -9,6 +9,37 @@ from .types import SKILL_MD_FILE, Skill, SkillCategory logger = logging.getLogger(__name__) +def _format_yaml_error(skill_file: Path, exc: yaml.YAMLError, source: str) -> str: + """Render a developer-friendly explanation of a YAML front-matter error.""" + + lines = [f"Invalid YAML front-matter in {skill_file}: {exc}"] + + mark = getattr(exc, "problem_mark", None) + source_lines = source.splitlines() + if mark is not None and 0 <= mark.line < len(source_lines): + offending = source_lines[mark.line] + + # mark.line is 0-based within the front-matter body; +1 makes it + # 1-based, +1 more accounts for the leading `---` fence that the + # front-matter regex strips before yaml.safe_load sees it. The + # result matches the line number an author sees in their editor. + file_line_number = mark.line + 2 + lines.append(f" line {file_line_number}: {offending}") + + # Targeted hint for the most common authoring mistake: an unquoted + # scalar value whose body contains ``: ``. We only surface the hint + # when we are confident it applies, to avoid misleading authors who + # hit unrelated YAML errors. + if getattr(exc, "problem", "") == "mapping values are not allowed here" and ":" in offending: + key, _, value = offending.partition(":") + value = value.strip() + if value and value[0] not in {'"', "'", "|", ">", "[", "{"}: + escaped = value.replace("\\", "\\\\").replace('"', '\\"') + lines.append(f' hint: values containing ":" must be quoted, e.g. {key}: "{escaped}"') + + return "\n".join(lines) + + def parse_allowed_tools(raw: object, skill_file: Path) -> list[str] | None: """Parse the optional allowed-tools frontmatter field. @@ -60,7 +91,7 @@ def parse_skill_file(skill_file: Path, category: SkillCategory, relative_path: P try: metadata = yaml.safe_load(front_matter_text) except yaml.YAMLError as exc: - logger.error("Invalid YAML front-matter in %s: %s", skill_file, exc) + logger.error("%s", _format_yaml_error(skill_file, exc, front_matter_text)) return None if not isinstance(metadata, dict): diff --git a/backend/tests/test_skills_parser.py b/backend/tests/test_skills_parser.py index 43cec34c9..52172125f 100644 --- a/backend/tests/test_skills_parser.py +++ b/backend/tests/test_skills_parser.py @@ -10,6 +10,7 @@ The parser now uses ``yaml.safe_load`` consistently with ``validation.py``. from __future__ import annotations +import logging from pathlib import Path from deerflow.skills.parser import parse_skill_file @@ -156,3 +157,156 @@ def test_parse_nonexistent_file_returns_none(tmp_path): """Non-existent files are handled gracefully.""" skill = parse_skill_file(tmp_path / "ghost" / "SKILL.md", category="custom") assert skill is None + + +# --------------------------------------------------------------------------- +# Friendly YAML error reporting +# --------------------------------------------------------------------------- + + +def test_parse_unquoted_colon_value_logs_line_and_hint(tmp_path, caplog): + """Unquoted value with ': ' produces a log that exposes the full offending line + (PyYAML truncates long lines with `...`) and a copy-pasteable quoting hint. + + Regression for issue #3333: SKILL.md authored by an LLM frequently + contains ``description: foo: bar`` which PyYAML rejects with + ``mapping values are not allowed here``. The skill is correctly skipped + (the file is not silently accepted). Before this change the only + diagnostic was PyYAML's own message, which (a) numbers lines within + the front-matter body rather than the file and (b) truncates long + values with '...'. The new behaviour pins: + * the line number an author sees in their editor (file-line, not + front-matter-line), + * the *full* offending line (no '...' truncation), and + * a copy-pasteable `key: "value"` hint. + """ + + # The description value is intentionally long enough to trigger + # PyYAML's own '...' truncation in the rendered str(exc); our hint + # must echo the *full* value regardless. + long_value = "StarRun collector: progress, errors, tables out, plus assorted diagnostic notes" + front_matter = f"name: collect-startrun\ndescription: {long_value}" + skill_file = _write_skill(tmp_path, front_matter) + + with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"): + skill = parse_skill_file(skill_file, category="custom") + + assert skill is None + combined = "\n".join(rec.getMessage() for rec in caplog.records) + assert "Invalid YAML front-matter" in combined + + # 1. File-line, not front-matter-line. `description` is the 2nd line + # of the front-matter body, which is line 3 of the file (line 1 + # is the leading `---` fence). Before this PR the log said + # `line 2`, which sent authors to the wrong row. + assert f"line 3: description: {long_value}" in combined + + # 2. The full value is preserved -- PyYAML's own message truncates + # long values with '...', so the presence of the un-truncated tail + # proves we are reading the source line ourselves, not echoing + # PyYAML's snippet. + assert "plus assorted diagnostic notes" in combined + assert "..." not in [line for line in combined.splitlines() if line.startswith(" line ")][0] + + # 3. The copy-pasteable quoting hint is the actually-new diagnostic. + assert f'hint: values containing ":" must be quoted, e.g. description: "{long_value}"' in combined + + +def test_parse_unquoted_colon_value_preserves_nested_key_indent(tmp_path, caplog): + """Nested keys must keep their leading indentation in the quoting hint. + + Regression guard for CR feedback on PR #3335: an earlier version of + the hint called ``key.strip()``, which turned `` author: foo: bar`` + into ``author: "foo: bar"``. Pasting that back under a parent mapping + silently moved the field to the top level. The hint must preserve + the original indentation so authors can copy-paste-fix in place. + """ + + # A two-space-indented nested key triggers the same scanner error, + # but its hint must keep the indentation. + front_matter = "name: nested-skill\nmetadata:\n author: Jane: Doe" + skill_file = _write_skill(tmp_path, front_matter) + + with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"): + skill = parse_skill_file(skill_file, category="custom") + + assert skill is None + combined = "\n".join(rec.getMessage() for rec in caplog.records) + # Two leading spaces in front of `author` are preserved. + assert 'hint: values containing ":" must be quoted, e.g. author: "Jane: Doe"' in combined + + +def test_parse_unrelated_yaml_error_omits_quoting_hint(tmp_path, caplog): + """Errors other than 'mapping values are not allowed' must NOT carry the quoting hint.""" + + # Unclosed flow sequence is a scanner error of a different shape; the + # quoting hint would be misleading and must be suppressed. + skill_file = _write_skill(tmp_path, "name: [unclosed\ndescription: x") + + with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"): + skill = parse_skill_file(skill_file, category="custom") + + assert skill is None + combined = "\n".join(rec.getMessage() for rec in caplog.records) + assert "Invalid YAML front-matter" in combined + assert "hint:" not in combined + + +def test_parse_valid_skill_emits_no_error_log(tmp_path, caplog): + """Sanity check: a valid SKILL.md must not produce any error logs.""" + + skill_file = _write_skill(tmp_path, 'name: ok-skill\ndescription: "Foo: bar"') + + with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"): + skill = parse_skill_file(skill_file, category="custom") + + assert skill is not None + assert skill.description == "Foo: bar" + assert not caplog.records, "valid SKILL.md must not log errors" + + +def test_parse_unquoted_colon_value_escapes_backslashes_in_hint(tmp_path, caplog): + """Backslashes in the offending value must be doubled in the hint. + + Regression guard for CR feedback on PR #3335: an earlier version of + the hint only escaped ``"`` but left ``\\`` untouched. Pasting the + suggested ``key: "..."`` back into the file would then be reparsed + as an escape sequence by PyYAML's double-quoted scalar rules and + either fail to load or silently change meaning (e.g. ``C:\\Temp`` + becoming ``C:emp``). The hint must double the backslash so the + suggested scalar is valid YAML when pasted back. + """ + + # The second ``: `` (after ``path``) is what trips PyYAML's + # "mapping values are not allowed here"; the ``C:\Temp`` segment + # carries the backslash that the hint must escape. + front_matter = "name: path-skill\ndescription: Windows path: C:\\Temp" + skill_file = _write_skill(tmp_path, front_matter) + + with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"): + skill = parse_skill_file(skill_file, category="custom") + + assert skill is None + combined = "\n".join(rec.getMessage() for rec in caplog.records) + assert r'description: "Windows path: C:\\Temp"' in combined + + +def test_parse_unquoted_colon_value_escapes_regex_in_hint(tmp_path, caplog): + """Regex-style ``\\d`` must also be escaped in the hint. + + Same root cause as the Windows-path guard above, but with a + regex-style escape that is even more likely to appear in + LLM-authored skills (e.g. a ``description`` that quotes a regex). + PyYAML rejects ``\\d`` in double-quoted scalars, so the hint must + emit ``\\\\d`` to remain valid. + """ + + front_matter = "name: regex-skill\ndescription: match: \\d+ digits" + skill_file = _write_skill(tmp_path, front_matter) + + with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"): + skill = parse_skill_file(skill_file, category="custom") + + assert skill is None + combined = "\n".join(rec.getMessage() for rec in caplog.records) + assert r'description: "match: \\d+ digits"' in combined