diff --git a/skills/public/systematic-literature-review/SKILL.md b/skills/public/systematic-literature-review/SKILL.md index 977614ddc..b81dbe75a 100644 --- a/skills/public/systematic-literature-review/SKILL.md +++ b/skills/public/systematic-literature-review/SKILL.md @@ -1,6 +1,6 @@ --- name: systematic-literature-review -description: Use this skill whenever the user wants to survey, synthesize, or do a systematic literature review (SLR) across multiple academic papers on a topic. Triggers on queries like "review the literature on X", "survey recent papers about Y", "do an SLR on Z", "what does the literature say about W", "summarize recent research in A", or "compare findings across papers on B". Make sure to use this skill even when the user does not say the word "systematic" — the defining signal is that they want a synthesis across MANY papers rather than a deep read of a single one. Distinct from `academic-paper-review`, which does single-paper peer review. This skill searches arXiv, extracts structured metadata from each paper in parallel via subagents, synthesizes themes across the set, and emits a report in APA, IEEE, or BibTeX citation format. +description: Use this skill when the user wants a systematic literature review, survey, or synthesis across multiple academic papers on a topic. Also covers annotated bibliographies and cross-paper comparisons. Searches arXiv and outputs reports in APA, IEEE, or BibTeX format. Not for single-paper tasks — use academic-paper-review for reviewing one paper. 
--- # Systematic Literature Review Skill diff --git a/skills/public/systematic-literature-review/evals/evals.json b/skills/public/systematic-literature-review/evals/evals.json new file mode 100644 index 000000000..5653a1988 --- /dev/null +++ b/skills/public/systematic-literature-review/evals/evals.json @@ -0,0 +1,79 @@ +{ + "skill_name": "systematic-literature-review", + "evals": [ + { + "id": 1, + "prompt": "Do a systematic literature review on diffusion models in computer vision. 10 papers, last 2 years, category cs.CV, APA format. Save to default output location.", + "expected_output": "A structured SLR report saved to /mnt/user-data/outputs/ with APA citations, thematic synthesis across 10 papers, and per-paper annotations.", + "expectations": [ + "The skill read SKILL.md for systematic-literature-review", + "The arxiv_search.py script was called with a short keyword query (2-3 words), not the full topic description", + "The search used --category cs.CV", + "The search used --sort-by relevance, not submittedDate", + "The search was executed only once without retries", + "Metadata extraction was delegated via the task tool to subagents, not done inline or via python -c", + "The APA template file (templates/apa.md) was read", + "The final report was saved to /mnt/user-data/outputs/ with a filename matching slr-<topic>-<date>.md", + "The present_files tool was called to make the report visible to the user", + "The report contains an Executive Summary section", + "The report identifies at least 3 themes with cross-paper analysis", + "The report contains a Convergences and Disagreements section", + "The report contains a Gaps and Open Questions section", + "The report contains per-paper annotations for each of the 10 papers", + "The references section uses APA 7th format with arXiv URLs" + ] + }, + { + "id": 2, + "prompt": "Survey recent papers on graph neural networks for drug discovery. 
5 papers, BibTeX format.", + "expected_output": "A structured SLR report with BibTeX citations using @misc entries for arXiv preprints.", + "expectations": [ + "The skill read SKILL.md for systematic-literature-review", + "The arxiv_search.py script was called with a short keyword query", + "Metadata extraction was delegated via the task tool to subagents", + "The BibTeX template file (templates/bibtex.md) was read, not apa.md or ieee.md", + "The final report was saved to /mnt/user-data/outputs/", + "The present_files tool was called", + "The report contains BibTeX entries using @misc, not @article", + "Each BibTeX entry includes eprint and primaryClass fields", + "The report contains thematic synthesis, not just a list of papers" + ] + }, + { + "id": 3, + "prompt": "Review the literature on retrieval-augmented generation — key findings, limitations, and open questions. 15 papers, IEEE format.", + "expected_output": "A structured SLR report with IEEE numeric citations and 15 papers extracted in parallel batches.", + "expectations": [ + "The skill read SKILL.md for systematic-literature-review", + "The arxiv_search.py script was called with --max-results 15 or higher", + "Metadata extraction used the task tool with multiple subagent batches (15 papers requires 3 batches of 5)", + "The IEEE template file (templates/ieee.md) was read", + "The report uses IEEE numeric citations [1], [2], etc. in the text", + "The references section uses IEEE format with numbered entries", + "The report contains per-paper annotations for all papers", + "The report identifies themes across the papers" + ] + }, + { + "id": 4, + "prompt": "Review this paper: https://arxiv.org/abs/2310.06825", + "expected_output": "The SLR skill should NOT be triggered. 
The request should route to academic-paper-review instead.", + "expectations": [ + "The systematic-literature-review skill was NOT triggered", + "The agent did not call arxiv_search.py", + "The agent recognized this as a single-paper review request" + ] + }, + { + "id": 5, + "prompt": "What does the literature say about RLHF?", + "expected_output": "The SLR skill should be triggered despite no explicit 'systematic' or 'survey' keyword, because 'the literature' implies multi-paper synthesis.", + "expectations": [ + "The skill read SKILL.md for systematic-literature-review", + "The arxiv_search.py script was called", + "The agent asked a clarification question about scope (paper count, format) or used reasonable defaults", + "The final output is a multi-paper synthesis, not a single factual answer" + ] + } + ] +} diff --git a/skills/public/systematic-literature-review/evals/trigger_eval_set.json b/skills/public/systematic-literature-review/evals/trigger_eval_set.json new file mode 100644 index 000000000..1a629bf63 --- /dev/null +++ b/skills/public/systematic-literature-review/evals/trigger_eval_set.json @@ -0,0 +1,102 @@ +[ + { + "query": "Survey transformer attention variants published in the last 2 years on arXiv cs.CL", + "should_trigger": true, + "rationale": "Explicit survey request with scope and category" + }, + { + "query": "What methods do recent papers use for few-shot learning in vision-and-language? 
Give me 15 papers in BibTeX.", + "should_trigger": true, + "rationale": "Multi-paper synthesis with count and format spec" + }, + { + "query": "Review the literature on retrieval-augmented generation — key findings, limitations, and open questions", + "should_trigger": true, + "rationale": "Classic SLR phrasing with explicit synthesis structure" + }, + { + "query": "Compare evaluation frameworks used across LLM hallucination detection papers", + "should_trigger": true, + "rationale": "Cross-paper comparison implies multi-paper synthesis" + }, + { + "query": "Summarize recent work on Monte Carlo methods for mortgage risk — last 3 years", + "should_trigger": true, + "rationale": "Domain-specific SLR with time window" + }, + { + "query": "Annotated bibliography on agentic tool use, 20 papers, IEEE format", + "should_trigger": true, + "rationale": "Annotated bibliography is an SLR variant" + }, + { + "query": "What does the literature say about RLHF?", + "should_trigger": true, + "rationale": "No 'systematic' keyword but 'the literature' clearly implies multi-paper synthesis" + }, + { + "query": "Give me an overview of diffusion model papers since 2022", + "should_trigger": true, + "rationale": "Time range + 'papers' implies breadth-first survey" + }, + { + "query": "Are there papers comparing RAG and fine-tuning?", + "should_trigger": true, + "rationale": "Comparison query across papers implies synthesis" + }, + { + "query": "Do a systematic literature review on graph neural networks for drug discovery, APA format", + "should_trigger": true, + "rationale": "Explicit SLR request with format" + }, + { + "query": "Review this paper: https://arxiv.org/abs/2310.06825", + "should_trigger": false, + "rationale": "Single paper URL -> should route to academic-paper-review" + }, + { + "query": "What is attention in transformers?", + "should_trigger": false, + "rationale": "Factual question, no multi-paper synthesis needed" + }, + { + "query": "Search for news about AI 
regulation", + "should_trigger": false, + "rationale": "General web search, not academic literature review" + }, + { + "query": "Summarize this PDF [attached]", + "should_trigger": false, + "rationale": "Single document summary, not literature review" + }, + { + "query": "Write me a Python function to parse BibTeX files", + "should_trigger": false, + "rationale": "Coding task, not research" + }, + { + "query": "What is the capital of France?", + "should_trigger": false, + "rationale": "Factual question, no research needed" + }, + { + "query": "Help me debug this error in my React app", + "should_trigger": false, + "rationale": "Debugging task, not literature review" + }, + { + "query": "Translate this paragraph to Chinese", + "should_trigger": false, + "rationale": "Translation task" + }, + { + "query": "Explain the difference between CNN and RNN", + "should_trigger": false, + "rationale": "Conceptual explanation, not multi-paper synthesis" + }, + { + "query": "Find me the best paper on reinforcement learning", + "should_trigger": false, + "rationale": "Singular 'best paper' implies one result, not a survey across many" + } +]