fix(skills): surface offending line and quoting hint on SKILL.md YAML… (#3335)

* fix(skills): surface offending line and quoting hint on SKILL.md YAML errors When a SKILL.md front-matter fails to parse, the existing log only echoes PyYAML's raw message, leaving authors to grep the file for the offending line. This is especially painful for the very common LLM-authored mistake of an unquoted scalar containing ': ' (e.g. 'description: foo: bar'), which fails with 'mapping values are not allowed here' and silently drops the skill. Enrich the error log with: - the source line PyYAML pointed at via problem_mark - a targeted, copy-pasteable quoting hint when (and only when) the error is the well-known 'mapping values are not allowed' scanner error on an unquoted value The skill is still rejected (no semantics are guessed or rewritten); only the diagnostic is improved. Fixes #3333 * improve(skills): address CR feedback on SKILL.md YAML error diagnostics Per review on #3335: - Log the file line number (mark.line + 2) instead of the front-matter-internal line number, so authors land on the right row in their editor. - Use exc.problem == "mapping values are not allowed here" for a tighter match than substring-scanning str(exc). - Preserve the offending key's leading whitespace in the quoting hint so nested mappings stay nested when authors paste the fix back. - Rewrite the regression test to actually exercise the new behaviour: PyYAML's own message already echoes the offending line (and truncates it with "..."), so the old assertion passed on main. New assertions pin (a) the file-line number, (b) the full untruncated line, and (c) the copy-pasteable hint. - Add a guard test for nested-key indentation so the partition()/strip() shape cannot regress silently. Refs #3333, #3335 * fix(skills): escape backslashes in YAML quoting hint The hint emitted by _format_yaml_error previously escaped only double quotes, so values containing backslashes (e.g. Windows paths like C:\Temp or regex escapes like \d) produced a suggested scalar that was either invalid YAML or silently re-interpreted by PyYAML's double-quoted escape rules when pasted back. Escape order matters: backslashes first, then double quotes. Adds two regression tests covering Windows-path and regex-style backslashes. Address Copilot CR feedback on PR #3335.
2026-07-29 01:15:59 +00:00 · 2026-06-03 21:53:52 +08:00 · 2026-06-03 21:53:52 +08:00 · 89ae74d4f4
commit 89ae74d4f4
parent 9a53f9dfbb
2 changed files with 186 additions and 1 deletions
--- a/backend/packages/harness/deerflow/skills/parser.py
+++ b/backend/packages/harness/deerflow/skills/parser.py
@ -9,6 +9,37 @@ from .types import SKILL_MD_FILE, Skill, SkillCategory
 logger = logging.getLogger(__name__)


+def _format_yaml_error(skill_file: Path, exc: yaml.YAMLError, source: str) -> str:
+    """Render a developer-friendly explanation of a YAML front-matter error."""
+
+    lines = [f"Invalid YAML front-matter in {skill_file}: {exc}"]
+
+    mark = getattr(exc, "problem_mark", None)
+    source_lines = source.splitlines()
+    if mark is not None and 0 <= mark.line < len(source_lines):
+        offending = source_lines[mark.line]
+
+        # mark.line is 0-based within the front-matter body; +1 makes it
+        # 1-based, +1 more accounts for the leading `---` fence that the
+        # front-matter regex strips before yaml.safe_load sees it. The
+        # result matches the line number an author sees in their editor.
+        file_line_number = mark.line + 2
+        lines.append(f"  line {file_line_number}: {offending}")
+
+        # Targeted hint for the most common authoring mistake: an unquoted
+        # scalar value whose body contains ``: ``. We only surface the hint
+        # when we are confident it applies, to avoid misleading authors who
+        # hit unrelated YAML errors.
+        if getattr(exc, "problem", "") == "mapping values are not allowed here" and ":" in offending:
+            key, _, value = offending.partition(":")
+            value = value.strip()
+            if value and value[0] not in {'"', "'", "|", ">", "[", "{"}:
+                escaped = value.replace("\\", "\\\\").replace('"', '\\"')
+                lines.append(f'  hint: values containing ":" must be quoted, e.g. {key}: "{escaped}"')
+
+    return "\n".join(lines)
+
+
 def parse_allowed_tools(raw: object, skill_file: Path) -> list[str] | None:
    """Parse the optional allowed-tools frontmatter field.

@ -60,7 +91,7 @@ def parse_skill_file(skill_file: Path, category: SkillCategory, relative_path: P
        try:
            metadata = yaml.safe_load(front_matter_text)
        except yaml.YAMLError as exc:
-            logger.error("Invalid YAML front-matter in %s: %s", skill_file, exc)
+            logger.error("%s", _format_yaml_error(skill_file, exc, front_matter_text))
            return None

        if not isinstance(metadata, dict):
--- a/backend/tests/test_skills_parser.py
+++ b/backend/tests/test_skills_parser.py
@ -10,6 +10,7 @@ The parser now uses ``yaml.safe_load`` consistently with ``validation.py``.

 from __future__ import annotations

+import logging
 from pathlib import Path

 from deerflow.skills.parser import parse_skill_file
@ -156,3 +157,156 @@ def test_parse_nonexistent_file_returns_none(tmp_path):
    """Non-existent files are handled gracefully."""
    skill = parse_skill_file(tmp_path / "ghost" / "SKILL.md", category="custom")
    assert skill is None
+
+
+# ---------------------------------------------------------------------------
+# Friendly YAML error reporting
+# ---------------------------------------------------------------------------
+
+
+def test_parse_unquoted_colon_value_logs_line_and_hint(tmp_path, caplog):
+    """Unquoted value with ': ' produces a log that exposes the full offending line
+    (PyYAML truncates long lines with `...`) and a copy-pasteable quoting hint.
+
+    Regression for issue #3333: SKILL.md authored by an LLM frequently
+    contains ``description: foo: bar`` which PyYAML rejects with
+    ``mapping values are not allowed here``. The skill is correctly skipped
+    (the file is not silently accepted). Before this change the only
+    diagnostic was PyYAML's own message, which (a) numbers lines within
+    the front-matter body rather than the file and (b) truncates long
+    values with '...'. The new behaviour pins:
+      * the line number an author sees in their editor (file-line, not
+        front-matter-line),
+      * the *full* offending line (no '...' truncation), and
+      * a copy-pasteable `key: "value"` hint.
+    """
+
+    # The description value is intentionally long enough to trigger
+    # PyYAML's own '...' truncation in the rendered str(exc); our hint
+    # must echo the *full* value regardless.
+    long_value = "StarRun collector: progress, errors, tables out, plus assorted diagnostic notes"
+    front_matter = f"name: collect-startrun\ndescription: {long_value}"
+    skill_file = _write_skill(tmp_path, front_matter)
+
+    with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
+        skill = parse_skill_file(skill_file, category="custom")
+
+    assert skill is None
+    combined = "\n".join(rec.getMessage() for rec in caplog.records)
+    assert "Invalid YAML front-matter" in combined
+
+    # 1. File-line, not front-matter-line. `description` is the 2nd line
+    #    of the front-matter body, which is line 3 of the file (line 1
+    #    is the leading `---` fence). Before this PR the log said
+    #    `line 2`, which sent authors to the wrong row.
+    assert f"line 3: description: {long_value}" in combined
+
+    # 2. The full value is preserved -- PyYAML's own message truncates
+    #    long values with '...', so the presence of the un-truncated tail
+    #    proves we are reading the source line ourselves, not echoing
+    #    PyYAML's snippet.
+    assert "plus assorted diagnostic notes" in combined
+    assert "..." not in [line for line in combined.splitlines() if line.startswith("  line ")][0]
+
+    # 3. The copy-pasteable quoting hint is the actually-new diagnostic.
+    assert f'hint: values containing ":" must be quoted, e.g. description: "{long_value}"' in combined
+
+
+def test_parse_unquoted_colon_value_preserves_nested_key_indent(tmp_path, caplog):
+    """Nested keys must keep their leading indentation in the quoting hint.
+
+    Regression guard for CR feedback on PR #3335: an earlier version of
+    the hint called ``key.strip()``, which turned ``  author: foo: bar``
+    into ``author: "foo: bar"``. Pasting that back under a parent mapping
+    silently moved the field to the top level. The hint must preserve
+    the original indentation so authors can copy-paste-fix in place.
+    """
+
+    # A two-space-indented nested key triggers the same scanner error,
+    # but its hint must keep the indentation.
+    front_matter = "name: nested-skill\nmetadata:\n  author: Jane: Doe"
+    skill_file = _write_skill(tmp_path, front_matter)
+
+    with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
+        skill = parse_skill_file(skill_file, category="custom")
+
+    assert skill is None
+    combined = "\n".join(rec.getMessage() for rec in caplog.records)
+    # Two leading spaces in front of `author` are preserved.
+    assert 'hint: values containing ":" must be quoted, e.g.   author: "Jane: Doe"' in combined
+
+
+def test_parse_unrelated_yaml_error_omits_quoting_hint(tmp_path, caplog):
+    """Errors other than 'mapping values are not allowed' must NOT carry the quoting hint."""
+
+    # Unclosed flow sequence is a scanner error of a different shape; the
+    # quoting hint would be misleading and must be suppressed.
+    skill_file = _write_skill(tmp_path, "name: [unclosed\ndescription: x")
+
+    with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
+        skill = parse_skill_file(skill_file, category="custom")
+
+    assert skill is None
+    combined = "\n".join(rec.getMessage() for rec in caplog.records)
+    assert "Invalid YAML front-matter" in combined
+    assert "hint:" not in combined
+
+
+def test_parse_valid_skill_emits_no_error_log(tmp_path, caplog):
+    """Sanity check: a valid SKILL.md must not produce any error logs."""
+
+    skill_file = _write_skill(tmp_path, 'name: ok-skill\ndescription: "Foo: bar"')
+
+    with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
+        skill = parse_skill_file(skill_file, category="custom")
+
+    assert skill is not None
+    assert skill.description == "Foo: bar"
+    assert not caplog.records, "valid SKILL.md must not log errors"
+
+
+def test_parse_unquoted_colon_value_escapes_backslashes_in_hint(tmp_path, caplog):
+    """Backslashes in the offending value must be doubled in the hint.
+
+    Regression guard for CR feedback on PR #3335: an earlier version of
+    the hint only escaped ``"`` but left ``\\`` untouched. Pasting the
+    suggested ``key: "..."`` back into the file would then be reparsed
+    as an escape sequence by PyYAML's double-quoted scalar rules and
+    either fail to load or silently change meaning (e.g. ``C:\\Temp``
+    becoming ``C:<TAB>emp``). The hint must double the backslash so the
+    suggested scalar is valid YAML when pasted back.
+    """
+
+    # The second ``: `` (after ``path``) is what trips PyYAML's
+    # "mapping values are not allowed here"; the ``C:\Temp`` segment
+    # carries the backslash that the hint must escape.
+    front_matter = "name: path-skill\ndescription: Windows path: C:\\Temp"
+    skill_file = _write_skill(tmp_path, front_matter)
+
+    with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
+        skill = parse_skill_file(skill_file, category="custom")
+
+    assert skill is None
+    combined = "\n".join(rec.getMessage() for rec in caplog.records)
+    assert r'description: "Windows path: C:\\Temp"' in combined
+
+
+def test_parse_unquoted_colon_value_escapes_regex_in_hint(tmp_path, caplog):
+    """Regex-style ``\\d`` must also be escaped in the hint.
+
+    Same root cause as the Windows-path guard above, but with a
+    regex-style escape that is even more likely to appear in
+    LLM-authored skills (e.g. a ``description`` that quotes a regex).
+    PyYAML rejects ``\\d`` in double-quoted scalars, so the hint must
+    emit ``\\\\d`` to remain valid.
+    """
+
+    front_matter = "name: regex-skill\ndescription: match: \\d+ digits"
+    skill_file = _write_skill(tmp_path, front_matter)
+
+    with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
+        skill = parse_skill_file(skill_file, category="custom")
+
+    assert skill is None
+    combined = "\n".join(rec.getMessage() for rec in caplog.records)
+    assert r'description: "match: \\d+ digits"' in combined