Merged
6 changes: 3 additions & 3 deletions flake.lock


25 changes: 17 additions & 8 deletions src/deepwork/mcp/quality_gate.py
@@ -263,6 +263,7 @@ async def _build_payload(
self,
outputs: dict[str, str | list[str]],
project_root: Path,
+ notes: str | None = None,
) -> str:
"""Build the user prompt payload with output file contents.

@@ -272,6 +273,7 @@ async def _build_payload
Args:
outputs: Map of output names to file path(s)
project_root: Project root path for reading files
+ notes: Optional notes from the agent about work done

Returns:
Formatted payload with output file contents or path listing
@@ -297,6 +299,11 @@ async def _build_payload
parts.extend(output_sections)
parts.append(f"{SECTION_SEPARATOR} END OUTPUTS {SECTION_SEPARATOR}")

+ if notes:
+ parts.append(f"{SECTION_SEPARATOR} AUTHOR NOTES {SECTION_SEPARATOR}")
+ parts.append(notes)
+ parts.append(f"{SECTION_SEPARATOR} END AUTHOR NOTES {SECTION_SEPARATOR}")

if not parts:
return "[No files provided]"

@@ -370,6 +377,8 @@ async def build_review_instructions_file(
parts.append("")

# Build outputs listing (uses self.max_inline_files to decide inline vs path-only)
+ # Notes are handled separately below in the "Author Notes" section,
+ # so we don't pass them to _build_payload here.
payload = await self._build_payload(outputs, project_root)
parts.append(payload)
parts.append("")
@@ -445,20 +454,20 @@ async def build_review_instructions_file(
def compute_timeout(file_count: int) -> int:
"""Compute dynamic timeout based on number of files.

- Base timeout is 120 seconds. For every file beyond the first 5,
- add 30 seconds. Examples:
- - 3 files -> 120s
- - 5 files -> 120s
- - 10 files -> 120 + 30*5 = 270s (4.5 min)
- - 20 files -> 120 + 30*15 = 570s (9.5 min)
+ Base timeout is 240 seconds (4 minutes). For every file beyond
+ the first 5, add 30 seconds. Examples:
+ - 3 files -> 240s
+ - 5 files -> 240s
+ - 10 files -> 240 + 30*5 = 390s (6.5 min)
+ - 20 files -> 240 + 30*15 = 690s (11.5 min)

Args:
file_count: Total number of files being reviewed

Returns:
Timeout in seconds
"""
- return 120 + 30 * max(0, file_count - 5)
+ return 240 + 30 * max(0, file_count - 5)
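The updated formula is easy to sanity-check in isolation. A standalone sketch, reproduced here without its surrounding class context:

```python
# Standalone copy of the updated timeout formula from this diff:
# 240s base, plus 30s for every file beyond the first 5.
def compute_timeout(file_count: int) -> int:
    return 240 + 30 * max(0, file_count - 5)

# Spot-checking the docstring examples:
# compute_timeout(3)  -> 240
# compute_timeout(10) -> 390
# compute_timeout(20) -> 690
```

The `max(0, ...)` clamp keeps small reviews at the flat 240-second base rather than discounting them below it.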

async def evaluate(
self,
@@ -502,7 +511,7 @@ async def evaluate
notes=notes,
additional_review_guidance=additional_review_guidance,
)
- payload = await self._build_payload(outputs, project_root)
+ payload = await self._build_payload(outputs, project_root, notes=notes)

# Dynamic timeout: more files = more time for the reviewer
file_count = len(self._flatten_output_paths(outputs))
87 changes: 41 additions & 46 deletions src/deepwork/standard_jobs/deepwork_jobs/job.yml
@@ -54,9 +54,9 @@ steps:
reviews:
- run_each: job.yml
quality_criteria:
- "Intermediate Deliverables": "Does the job break out across the logical steps such that there are reviewable intermediate deliverables?"
+ "Intermediate Deliverables": "The job breaks out across logical steps with reviewable intermediate deliverables."
"Reviews": |
- Are there reviews defined for each step? Do particularly critical documents have their own reviews?
+ Reviews are defined for each step. Particularly critical documents have their own reviews.
Note that the reviewers do not have transcript access, so if the criteria are about the conversation,
then add a `.deepwork/tmp/[step_summary].md` step output file so the agent has a communication channel to the reviewer.

@@ -78,13 +78,13 @@ steps:
- run_each: step_instruction_files
additional_review_guidance: "Read the job.yml file in the same job directory for context on how this instruction file fits into the larger workflow."
quality_criteria:
- "Complete Instructions": "Is the instruction file complete (no stubs or placeholders)?"
- "Specific & Actionable": "Are instructions tailored to the step's purpose, not generic?"
- "Output Examples": "Does the instruction file show what good output looks like? This can be either template examples, or negative examples of what not to do. Only required if the step has ouputs"
- "Quality Criteria": "Does the instruction file define quality criteria for its outputs?"
- "Ask Structured Questions": "If this step gathers user input, do instructions explicitly use the phrase 'ask structured questions'? If the step has no user inputs, this criterion passes automatically."
- "Prompt Engineering": "Does the instruction file follow Anthropic's best practices for prompt engineering?"
- "No Redundant Info": "Does the instruction file avoid duplicating information that belongs in the job.yml's common_job_info_provided_to_all_steps_at_runtime section? Shared context (project background, terminology, conventions) should be in common_job_info, not repeated in each step."
+ "Complete Instructions": "The instruction file is complete (no stubs or placeholders)."
+ "Specific & Actionable": "Instructions are tailored to the step's purpose, not generic."
+ "Output Examples": "The instruction file shows what good output looks like. This can be either template examples, or negative examples of what not to do. Only required if the step has outputs."
+ "Quality Criteria": "The instruction file defines quality criteria for its outputs."
+ "Ask Structured Questions": "If this step gathers user input, instructions explicitly use the phrase 'ask structured questions'. If the step has no user inputs, this criterion passes automatically."
+ "Prompt Engineering": "The instruction file follows Anthropic's best practices for prompt engineering."
+ "No Redundant Info": "The instruction file avoids duplicating information that belongs in the job.yml's common_job_info_provided_to_all_steps_at_runtime section. Shared context (project background, terminology, conventions) is in common_job_info, not repeated in each step."

- id: test
name: "Test the New Workflow"
@@ -106,11 +106,11 @@ steps:
reviews:
- run_each: step
quality_criteria:
- "Workflow Invoked": "Was the new workflow actually run on the user's test case via MCP?"
- "Output Critiqued": "Did the agent identify up to 3 top issues with the output?"
- "User Feedback Gathered": "Did the agent ask the user about each issue and gather additional feedback?"
- "Corrections Made": "Were all requested corrections applied to the output?"
- "User Satisfied": "Did the user confirm the output meets their needs?"
+ "Workflow Invoked": "The new workflow was actually run on the user's test case via MCP."
+ "Output Critiqued": "The agent identified up to 3 top issues with the output."
+ "User Feedback Gathered": "The agent asked the user about each issue and gathered additional feedback."
+ "Corrections Made": "All requested corrections were applied to the output."
+ "User Satisfied": "The user confirmed the output meets their needs."

- id: iterate
name: "Iterate on Workflow Design"
@@ -170,14 +170,14 @@ steps:
reviews:
- run_each: step
quality_criteria:
- "Conversation Analyzed": "Did the agent review the conversation for DeepWork job executions?"
- "Confusion Identified": "Did the agent identify points of confusion, errors, or inefficiencies?"
- "Instructions Improved": "Were job instructions updated to address identified issues?"
- "Instructions Concise": "Are instructions free of redundancy and unnecessary verbosity?"
- "Shared Content Extracted": "Is lengthy/duplicated content extracted into referenced files?"
- "Bespoke Learnings Captured": "Were run-specific learnings added to AGENTS.md?"
- "File References Used": "Do AGENTS.md entries reference other files where appropriate?"
- "Working Folder Correct": "Is AGENTS.md in the correct working folder for the job?"
+ "Conversation Analyzed": "The agent reviewed the conversation for DeepWork job executions."
+ "Confusion Identified": "The agent identified points of confusion, errors, or inefficiencies."
+ "Instructions Improved": "Job instructions were updated to address identified issues."
+ "Instructions Concise": "Instructions are free of redundancy and unnecessary verbosity."
+ "Shared Content Extracted": "Lengthy/duplicated content is extracted into referenced files."
+ "Bespoke Learnings Captured": "Run-specific learnings were added to AGENTS.md."
+ "File References Used": "AGENTS.md entries reference other files where appropriate."
+ "Working Folder Correct": "AGENTS.md is in the correct working folder for the job."

- id: fix_settings
name: "Fix Settings Files"
@@ -193,15 +193,14 @@ steps:
reviews:
- run_each: step
quality_criteria:
- "DeepWork Skills Removed": "Are `Skill(...)` entries matching jobs in `.deepwork/jobs/` removed?"
- "Non-DeepWork Skills Preserved": "Are skills NOT matching DeepWork jobs left intact?"
- "Stale make_new_job.sh Removed": "Are stale `Bash(...)` permissions referencing `.deepwork/jobs/deepwork_jobs/make_new_job.sh` removed?"
- "Rules Hooks Removed": "Are all DeepWork Rules hooks and permissions removed?"
- "Duplicate Hooks Removed": "Are duplicate hook entries consolidated or removed?"
- "Hardcoded Paths Removed": "Are user-specific hardcoded paths (like `/Users/*/...`) removed?"
- "Deprecated Commands Removed": "Are deprecated commands like `deepwork hook *` removed?"
- "Valid JSON": "Is settings.json still valid JSON after modifications?"
- "Backup Created": "Was a backup of the original settings created before modifications?"
+ "DeepWork Skills Removed": "`Skill(...)` entries matching jobs in `.deepwork/jobs/` are removed."
+ "Non-DeepWork Skills Preserved": "Skills NOT matching DeepWork jobs are left intact."
+ "Stale make_new_job.sh Removed": "Stale `Bash(...)` permissions referencing `.deepwork/jobs/deepwork_jobs/make_new_job.sh` are removed."
+ "Rules Hooks Removed": "All DeepWork Rules hooks and permissions are removed."
+ "Duplicate Hooks Removed": "Duplicate hook entries are consolidated or removed."
+ "Hardcoded Paths Removed": "User-specific hardcoded paths (like `/Users/*/...`) are removed."
+ "Deprecated Commands Removed": "Deprecated commands like `deepwork hook *` are removed."
+ "Backup Created": "A backup of the original settings was created before modifications."

- id: fix_jobs
name: "Fix Job Definitions"
@@ -225,12 +224,12 @@ steps:
- run_each: step
additional_review_guidance: "Read the .claude/settings.json file for context on what settings were cleaned up in the prior step."
quality_criteria:
- "Exposed Field Addressed": "Are `exposed: true` fields removed or noted as deprecated?"
- "Stop Hooks Migrated": "Are `stop_hooks` migrated to `hooks.after_agent` format?"
- "Removed Steps Cleaned": "Are references to removed steps (like `review_job_spec`) updated?"
- "Orphaned Steps Fixed": "For jobs with no workflows, is there a single workflow (named after the job) containing all steps? For jobs with existing workflows, does each orphan get its own workflow (named after the step)?"
- "Promise Lines Removed": "Are deprecated `<promise>Quality Criteria Met</promise>` lines removed from step instruction .md files?"
- "Valid YAML": "Are all job.yml files valid YAML?"
+ "Exposed Field Addressed": "`exposed: true` fields are removed or noted as deprecated."
+ "Stop Hooks Migrated": "`stop_hooks` are migrated to `hooks.after_agent` format."
+ "Removed Steps Cleaned": "References to removed steps (like `review_job_spec`) are updated."
+ "Orphaned Steps Fixed": "For jobs with no workflows, there is a single workflow (named after the job) containing all steps. For jobs with existing workflows, each orphan gets its own workflow (named after the step)."
+ "Promise Lines Removed": "Step instructions do not include anything about `<promise>Quality Criteria Met</promise>`."
+ "job.ymls are readable": "Calling `get_workflows` from the Deepwork tool shows all expected jobs. If any are missing, its YML is likely bad."

- id: errata
name: "Clean Up Errata"
@@ -245,13 +244,9 @@ steps:
- fix_jobs
reviews:
- run_each: step
- additional_review_guidance: "Check the .deepwork/jobs/ directory and .claude/skills/ directory to verify the cleanup was done correctly."
+ additional_review_guidance: "You should do this in a small number of turns - tee up every data request you need in your first call. Do not invoke sub-agents."
quality_criteria:
- "Legacy Job Skills Removed": "Are legacy skill folders for each job removed from `.claude/skills/` and `.gemini/skills/`?"
- "Deepwork Skill Preserved": "Does the `deepwork` skill folder still exist in `.claude/skills/deepwork/`?"
- "Temp Files Cleaned": "Are `.deepwork/tmp/` contents cleaned appropriately?"
- "Rules Folder Removed": "Is `.deepwork/rules/` folder backed up and removed (fully deprecated)?"
- "Rules Job Removed": "Is `.deepwork/jobs/deepwork_rules/` removed if present?"
- "Config Version Updated": "Is `.deepwork/config.yml` using current version format?"
- "DeepWork Re-installed": "Was `deepwork install` run after cleanup, and does it complete without errors?"
- "Git Status Clean": "Are changes ready to be committed (no untracked garbage files)?"
+ "Legacy Job Skills Removed": "Legacy skill folders for each job are removed from `.claude/skills/` and `.gemini/skills/`."
+ "Deepwork Skill Preserved": "The `deepwork` skill folder still exists in `.claude/skills/deepwork/`."
+ "Rules Folder Removed": "`.deepwork/rules/` folder is gone."
+ "Rules Job Removed": "`.deepwork/jobs/deepwork_rules/` is gone."
@@ -150,16 +150,16 @@ reviews:
# Content review - is the analysis sound?
- run_each: final_report.md
quality_criteria:
- "Claims Cited": "Is every factual claim backed by a specific source or query from the dataroom?"
- "Questions Answered": "Are all research questions from the scoping document addressed?"
- "Depth": "Does the analysis go beyond surface-level observations to root causes or actionable insights?"
+ "Claims Cited": "Every factual claim is backed by a specific source or query from the dataroom."
+ "Questions Answered": "All research questions from the scoping document are addressed."
+ "Depth": "The analysis goes beyond surface-level observations to root causes or actionable insights."

# Presentation review - is the output polished?
- run_each: final_report.md
quality_criteria:
- "Readable Flow": "Does the document flow logically for someone reading it without prior context?"
- "Audience Fit": "Is the language and detail level appropriate for the intended audience?"
- "Visual Quality": "Do all charts, tables, and figures render correctly and add value?"
+ "Readable Flow": "The document flows logically for someone reading it without prior context."
+ "Audience Fit": "The language and detail level are appropriate for the intended audience."
+ "Visual Quality": "All charts, tables, and figures render correctly and add value."
```

### Capability Considerations
14 changes: 7 additions & 7 deletions src/deepwork/standard_jobs/deepwork_jobs/steps/define.md
@@ -203,18 +203,18 @@ For final outputs, reviews let you make sure the output meets the user's expecta

**Reviews format:**

- Each review specifies `run_each` (what to review) and `quality_criteria` (a map of criterion name to question):
+ Each review specifies `run_each` (what to review) and `quality_criteria` (a map of criterion name to a statement describing the expected state after the step completes — NOT a question):

```yaml
reviews:
- run_each: step # Review all outputs together
quality_criteria:
- "Consistent Style": "Do all files follow the same structure?"
- "Complete Coverage": "Are all required topics covered?"
+ "Consistent Style": "All files follow the same structure."
+ "Complete Coverage": "All required topics are covered."
- run_each: report_files # Review each file in a 'files'-type output individually
quality_criteria:
- "Well Written": "Is the content clear and well-organized?"
- "Data-Backed": "Are claims supported by data?"
+ "Well Written": "Content is clear and well-organized."
+ "Data-Backed": "Claims are supported by data."
```

**`run_each` options:**
Expand All @@ -229,11 +229,11 @@ reviews:
- run_each: report_files
additional_review_guidance: "Read the comparison_matrix.md file for context on whether claims in the report are supported by the analysis data."
quality_criteria:
- "Data-Backed": "Are recommendations supported by the competitive analysis data?"
+ "Data-Backed": "Recommendations are supported by the competitive analysis data."
- run_each: step_instruction_files
additional_review_guidance: "Read the job.yml file in the same job directory for context on how this instruction file fits into the larger workflow."
quality_criteria:
- "Complete Instructions": "Is the instruction file complete?"
+ "Complete Instructions": "The instruction file is complete."
```

**When to use `additional_review_guidance`:**
12 changes: 6 additions & 6 deletions src/deepwork/standard_jobs/deepwork_jobs/steps/fix_jobs.md
@@ -224,15 +224,15 @@ steps:

### Step 7: Migrate `quality_criteria` to `reviews`

- The flat `quality_criteria` field on steps has been replaced by the `reviews` array. Each review specifies `run_each` (what to review) and `quality_criteria` as a map of criterion name to question.
+ The flat `quality_criteria` field on steps has been replaced by the `reviews` array. Each review specifies `run_each` (what to review) and `quality_criteria` as a map of criterion name to a statement describing the expected state (not a question).

**Before (deprecated):**
```yaml
steps:
- id: my_step
quality_criteria:
- - "**Complete**: Is the output complete?"
- - "**Accurate**: Is the data accurate?"
+ - "**Complete**: The output is complete."
+ - "**Accurate**: The data is accurate."
```

**After (current format):**
@@ -242,13 +242,13 @@ steps:
reviews:
- run_each: step
quality_criteria:
- "Complete": "Is the output complete?"
- "Accurate": "Is the data accurate?"
+ "Complete": "The output is complete."
+ "Accurate": "The data is accurate."
```

**Migration rules:**

- 1. **Parse the old format**: Each string typically follows `**Name**: Question` format. Extract the name (bold text) as the map key and the question as the value.
+ 1. **Parse the old format**: Each string typically follows `**Name**: Question/Statement` format. Extract the name (bold text) as the map key and convert the value to a statement of expected state (not a question).
2. **Choose `run_each`**: Default to `step` (reviews all outputs together). If the step has a single primary output, consider using that output name instead.
3. **For steps with no quality_criteria**: Use `reviews: []`
4. **Remove the old field**: Delete the `quality_criteria` array entirely after migration.
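Migration rule 1 can be sketched with a small helper. This is a hypothetical illustration, not part of DeepWork:

```python
import re


def parse_criterion(entry: str) -> tuple[str, str]:
    """Split an old-format string like '**Complete**: The output is complete.'
    into a (name, statement) pair for the new quality_criteria map.
    Hypothetical helper for illustration only."""
    match = re.match(r"\*\*(.+?)\*\*:\s*(.+)", entry)
    if not match:
        raise ValueError(f"Unrecognized criterion format: {entry!r}")
    return match.group(1), match.group(2)
```

A migration pass would then fold the resulting pairs into a `quality_criteria` map under a single `run_each: step` review (rule 2), leaving question-to-statement rewording as a manual editing step.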