diff --git a/.egg-state/agent-outputs/897-architect-output.json b/.egg-state/agent-outputs/897-architect-output.json new file mode 100644 index 000000000..3434e7ce6 --- /dev/null +++ b/.egg-state/agent-outputs/897-architect-output.json @@ -0,0 +1,496 @@ +{ + "issue": 897, + "phase": "plan", + "agent": "architect", + "revision": 1, + "revision_reason": null, + "title": "Architecture analysis: Deepen SDLC review quality via verdict schema expansion and prompt alignment", + + "summary": "SDLC pipeline reviewers produce shallow analysis compared to PR reviewers due to three structural root causes: (1) the verdict JSON schema instructs reviewers to write empty feedback when approving, (2) the SDLC prompt lacks the thoroughness framing and review conventions that PR prompts include, and (3) there is no approve-with-suggestions concept. The recommended approach expands the ReviewVerdict model with always-populated `analysis` and `suggestions` fields, aligns the SDLC review prompt builder with PR reviewer standards, and updates aggregation to surface non-blocking suggestions from approved verdicts.", + + "problem_statement": { + "description": "SDLC pipeline reviewers write ~1-3 sentence verdicts with empty feedback even when real issues are present, while PR reviewers on the same code produce detailed multi-section reviews with specific findings. Evidence from PR #895 (issue #871) shows SDLC reviewers approving with empty feedback fields despite scope creep observations, while PR reviewers found 5 specific code issues driving a follow-up commit.", + "root_causes": [ + { + "id": "RC-1", + "title": "Verdict schema constrains depth", + "description": "The verdict format instruction at pipelines.py:1562 says 'Detailed feedback if needs_revision, empty if approved'. This explicitly tells reviewers not to provide feedback when approving. 
There is no field for detailed analysis or non-blocking suggestions.", + "location": "orchestrator/routes/pipelines.py:1554-1570" + }, + { + "id": "RC-2", + "title": "SDLC prompt is less detailed than PR prompt", + "description": "PR code reviewers get 'critical infrastructure — last line of defense' framing, 6 detailed procedural steps, and review conventions (comprehensive, specific, direct, suggest fixes). SDLC code reviewers get briefer steps and no review conventions. Non-code SDLC reviewers (refine, plan, contract) get only 4 generic steps with no thoroughness guidance.", + "locations": [ + "orchestrator/routes/pipelines.py:1509-1526 (SDLC prompt steps)", + "orchestrator/routes/pipelines.py:1004-1044 (scope preambles)", + "action/build-review-prompt.sh:140-163 (PR prompt)", + "action/review-conventions.md (PR conventions, not loaded by SDLC)" + ] + }, + { + "id": "RC-3", + "title": "No approve-with-suggestions concept", + "description": "PR reviewers use the marker to trigger the feedback-addressing workflow for non-blocking suggestions on approved PRs. SDLC reviewers have binary approved/needs_revision — observations from an approving reviewer are structurally lost because _aggregate_review_verdicts() only collects feedback from needs_revision verdicts.", + "locations": [ + "action/review-conventions.md:34 (has-suggestions marker)", + "orchestrator/routes/pipelines.py:1712-1738 (aggregation ignores approved)" + ] + } + ] + }, + + "current_architecture": { + "review_flow": "Reviewers are spawned in parallel per phase. Each writes a verdict JSON to .egg-state/reviews/. _read_review_verdict() parses the JSON into ReviewVerdict. _aggregate_review_verdicts() combines all verdicts: any needs_revision = overall needs_revision. 
Combined feedback is passed as prior_feedback to the next review cycle's prompt.", + "key_components": [ + { + "name": "ReviewVerdict model", + "file": "orchestrator/models.py", + "lines": "97-103", + "description": "Pydantic model with 4 fields: verdict, summary, feedback, timestamp. All string fields default to empty string except verdict (required)." + }, + { + "name": "_build_review_prompt()", + "file": "orchestrator/routes/pipelines.py", + "lines": "1452-1586", + "description": "Constructs the full reviewer prompt including scope preamble, task steps, criteria, delta review instructions, prior feedback, verdict format, and phase restrictions." + }, + { + "name": "_get_reviewer_scope_preamble()", + "file": "orchestrator/routes/pipelines.py", + "lines": "1004-1044", + "description": "Returns reviewer-type-specific scope text. Code reviewer gets 'Be thorough' and 'Find ALL issues'. Others get scope boundaries without thoroughness framing." + }, + { + "name": "_aggregate_review_verdicts()", + "file": "orchestrator/routes/pipelines.py", + "lines": "1712-1738", + "description": "Combines verdicts: overall = needs_revision if any is needs_revision. Only collects feedback/summary from needs_revision verdicts. Returns tuple[str, str] (verdict, combined_feedback)." + }, + { + "name": "_read_review_verdict()", + "file": "orchestrator/routes/pipelines.py", + "lines": "1589-1632", + "description": "Reads JSON from verdict file path, parses into ReviewVerdict. Returns None if file missing or malformed (treated as approved for graceful degradation). Uses Pydantic model validation via ReviewVerdict(**data)." + } + ], + "callers_of_aggregate": [ + { + "location": "orchestrator/routes/pipelines.py:3447", + "context": "Tier 3 per-phase review loop in _run_tier3_implement(). Uses combined_feedback as prior_feedback for retry or appended to tester_gap_summary." 
+ }, + { + "location": "orchestrator/routes/pipelines.py:5396", + "context": "Main phase execution loop in _run_pipeline_phases(). Uses combined_feedback as review_feedback stored for next cycle." + } + ], + "test_coverage": [ + { + "file": "tests/workflows/test_multi_reviewer.py", + "description": "Tests aggregation logic (all_approved, any_needs_revision, etc.). Uses simplified string verdicts, not ReviewVerdict objects — may need updating." + }, + { + "file": "orchestrator/tests/test_tier3_execute.py", + "description": "Integration tests using mock ReviewVerdict objects. Tests verdict reading, aggregation in tier 3 flow. 15+ references to ReviewVerdict." + }, + { + "file": "integration_tests/sdlc/test_refine_plan_review_cycles.py", + "description": "Integration tests for review verdict file parsing in refine/plan phases." + } + ] + }, + + "approaches": [ + { + "id": "A", + "title": "Expand Verdict Schema + Align Prompts (Recommended)", + "description": "Add `analysis` and `suggestions` fields to ReviewVerdict that are always populated regardless of verdict. Update the prompt builder to include review conventions and thoroughness instructions. Update aggregation to surface suggestions from approved verdicts.", + "schema_changes": { + "model": "ReviewVerdict in orchestrator/models.py:97-103", + "new_fields": [ + { + "name": "analysis", + "type": "str", + "default": "", + "description": "Detailed file-by-file or section-by-section analysis. Always populated regardless of verdict. Free-form markdown." + }, + { + "name": "suggestions", + "type": "str", + "default": "", + "description": "Non-blocking suggestions for improvement. Populated even on approved verdicts. Serves the role of has-suggestions from PR reviews." + } + ], + "backward_compatibility": "New fields have empty-string defaults. Old JSON files without these fields will parse into ReviewVerdict with empty analysis and suggestions. No migration needed." 
+ }, + "prompt_changes": { + "verdict_format_section": { + "location": "orchestrator/routes/pipelines.py:1554-1570", + "change": "Update JSON template to include analysis and suggestions fields. Replace 'empty if approved' instruction with 'Always provide detailed analysis regardless of verdict. Include non-blocking suggestions even when approving.'" + }, + "thoroughness_framing": { + "location": "orchestrator/routes/pipelines.py:1480-1527", + "change": "Add review conventions content after the criteria section. For code reviewers, add 'critical infrastructure' and 'last line of defense' framing. For all reviewers, add the 5 comment quality standards from review-conventions.md (comprehensive, specific, direct, suggest fixes, provide context)." + }, + "scope_preambles": { + "location": "orchestrator/routes/pipelines.py:1004-1044", + "change": "Add analysis depth expectations to each reviewer type's preamble. For code: file-by-file analysis expected. For contract: criterion-by-criterion verification table. For refine/plan: section-by-section evaluation. For agent-design: exempt from detailed analysis (approve briefly when no concerns)." + }, + "non_code_reviewer_steps": { + "location": "orchestrator/routes/pipelines.py:1523-1526", + "change": "Expand 4-step instructions for draft-based reviewers (refine, plan, contract) to include more procedural guidance: read thoroughly, cross-reference with criteria, provide specific file/section references in analysis." + } + }, + "aggregation_changes": { + "location": "orchestrator/routes/pipelines.py:1712-1738", + "change": "Collect analysis from all verdicts (not just needs_revision). Collect suggestions from all verdicts. 
Return an expanded result — either change return type to tuple[str, str, str] (verdict, blocking_feedback, advisory_content) or return a typed NamedTuple/dataclass.", + "downstream_callers": [ + "pipelines.py:3447 — Tier 3 loop: must handle new return value", + "pipelines.py:5396 — Main phase loop: must handle new return value" + ] + }, + "pros": [ + "Directly addresses all three root causes", + "Clear field semantics: analysis (always), suggestions (non-blocking), feedback (blocking)", + "Backward compatible — Pydantic defaults handle old files", + "Aligns SDLC reviewer behavior with proven PR reviewer standard", + "Structured separation makes it easy for downstream code to distinguish blocking vs advisory" + ], + "cons": [ + "Larger scope: model + prompt builder + aggregation + 2 callers + tests", + "Increases token consumption per review (always-populate-analysis)", + "Adding fields doesn't guarantee quality — prompt framing is the real driver" + ] + }, + { + "id": "B", + "title": "Prompt-Only Fix (Minimal Schema Change)", + "description": "Keep the verdict schema mostly as-is. Rewrite prompt instructions to demand thoroughness. Change 'empty if approved' to require detailed content in existing summary and feedback fields. Load review conventions into SDLC prompts.", + "schema_changes": { + "model": "No changes to ReviewVerdict fields", + "note": "Relies on existing summary and feedback fields carrying richer content via better prompting" + }, + "prompt_changes": { + "verdict_format_section": "Remove 'empty if approved'. Replace with 'Always provide detailed analysis in the summary field. 
Provide non-blocking suggestions in the feedback field even when approving.'", + "thoroughness_framing": "Same as Approach A — add review conventions and critical infrastructure framing", + "scope_preambles": "Same as Approach A — add depth expectations" + }, + "aggregation_changes": { + "change": "Update to collect summary and feedback from approved verdicts (currently only needs_revision)" + }, + "pros": [ + "Smallest scope — no model changes", + "Prompt instructions are the primary driver of behavior, so this may be sufficient", + "No risk of breaking existing verdict parsing" + ], + "cons": [ + "Overloads existing fields — summary becomes analysis, feedback becomes suggestions. Field semantics become ambiguous.", + "Downstream code cannot distinguish blocking vs non-blocking feedback", + "The model field description ('Detailed feedback if needs_revision') stays in code, creating disconnect with prompt instructions", + "If the field semantics are unclear, future agents writing/reading verdicts will be confused" + ] + }, + { + "id": "C", + "title": "Hybrid: New Analysis Field + Third Verdict State", + "description": "Add an `analysis` field (always populated) and an `approved_with_suggestions` verdict value. When a reviewer approves but has suggestions, they use this third verdict state. No separate `suggestions` field — suggestions go in `feedback` when verdict is `approved_with_suggestions`.", + "schema_changes": { + "model": "Add analysis field. Add approved_with_suggestions as valid verdict value.", + "note": "Three-way verdict: approved, approved_with_suggestions, needs_revision" + }, + "aggregation_changes": { + "change": "approved_with_suggestions treated as approved for overall verdict. Feedback from approved_with_suggestions verdicts is collected and surfaced." 
+ }, + "pros": [ + "Mirrors PR reviewer's has-suggestions concept directly", + "Clear semantic intent per verdict value", + "Analysis field addresses the depth problem" + ], + "cons": [ + "Three-way verdict adds complexity to all verdict-handling code paths", + "Every `if verdict == 'approved'` check must be reviewed and potentially updated", + "Doesn't address the case of clean approval with analysis but no suggestions — still no data if the reviewer finds nothing", + "More error-prone: agents must choose the right verdict value from 3 options instead of always populating suggestions" + ] + } + ], + + "recommended_approach": { + "id": "A", + "title": "Expand Verdict Schema + Align Prompts", + "justification": "The refine analysis correctly identified that the verdict format structurally discourages thorough analysis. Approach A changes the structure to match the desired behavior by giving reviewers dedicated fields (analysis, suggestions) that are always expected to be populated. This is more robust than Approach B (which overloads existing fields with ambiguous semantics) and simpler than Approach C (which adds a third verdict state affecting all conditional logic). The Pydantic model's default empty strings provide effortless backward compatibility with old verdict files.", + "key_insight": "The primary driver of behavior is the prompt, not the schema. But the schema reinforces the prompt. When the prompt says 'always provide detailed analysis' and there's a dedicated `analysis` field for it, the agent has a clear target. When the prompt says 'use the summary field for analysis' but the field is named 'summary' and described as 'Brief summary', the instruction fights the structure." + }, + + "architecture_decisions": [ + { + "id": "AD-1", + "decision": "Use free-form markdown for the analysis field, not structured sub-format", + "rationale": "PR reviewers produce effective reviews in free-form markdown. 
A structured format (per-file JSON objects) would constrain expressiveness and add complexity to the schema. The prompt should suggest structure (e.g., 'provide file-by-file analysis for code reviews') but the field itself should accept free-form markdown." + }, + { + "id": "AD-2", + "decision": "Exempt agent-design reviewer from always-populate-analysis requirement", + "rationale": "The agent-design reviewer is intentionally brief when there are no concerns. Its PR counterpart (build-agent-mode-design-review-prompt.sh:126-127) explicitly says 'approve with a brief note'. The prompt should say 'provide analysis of any agent-mode design concerns found, or state briefly that none were identified' rather than requiring detailed analysis. The refine analysis and agent-design review verdict both flagged this as the right approach." + }, + { + "id": "AD-3", + "decision": "Return a typed NamedTuple from _aggregate_review_verdicts instead of expanding the tuple", + "rationale": "The current return type is tuple[str, str]. Adding a third string makes callers harder to read (what is result[2]?). A NamedTuple with named fields (verdict, blocking_feedback, advisory_content) is clearer and allows future extension without changing the return signature. The 2 callers (pipelines.py:3447, 5396) are straightforward to update." + }, + { + "id": "AD-4", + "decision": "Load review conventions from shared/prompts/ directory, not from action/review-conventions.md", + "rationale": "The action/ directory is for GitHub Actions. The orchestrator should not depend on action/ files. Instead, extract the relevant review conventions into a shared file (e.g., shared/prompts/review-conventions.md) that both systems can consume, or inline the key points (comprehensive, specific, direct, suggest fixes, provide context) into the prompt builder. Since the conventions are short (5 bullet points), inlining in the prompt builder is simpler and avoids file-loading complexity. 
Keep the shared criteria file approach for the substantive review criteria, which are already shared." + }, + { + "id": "AD-5", + "decision": "Do not surface approved-verdict suggestions to the implementing agent's retry prompt", + "rationale": "When a reviewer approves with suggestions, those suggestions should NOT be injected into the next cycle's prior_feedback (which is for blocking issues). Instead, they should be: (a) included in the phase-completion comment posted to the issue so the human sees them, and (b) available for the integrator to reference. Injecting non-blocking suggestions into the retry loop would cause agents to treat them as blocking requirements, potentially triggering unnecessary revision cycles." + }, + { + "id": "AD-6", + "decision": "Do not migrate existing verdict files", + "rationale": "Old verdict files in .egg-state/reviews/ lack analysis and suggestions fields. The Pydantic model defaults handle this gracefully — missing fields get empty strings. Migration would touch many files for no functional benefit since old verdicts are historical records, not actively consumed." 
+ } + ], + + "implementation_plan": { + "phases": [ + { + "id": "phase-1", + "title": "Expand ReviewVerdict model", + "tasks": [ + { + "id": "task-1", + "description": "Add `analysis` and `suggestions` fields to ReviewVerdict in orchestrator/models.py with empty string defaults and descriptive Field() annotations", + "acceptance_criteria": [ + "ReviewVerdict has analysis: str field with default ''", + "ReviewVerdict has suggestions: str field with default ''", + "Field descriptions clarify always-populate semantics for analysis and non-blocking semantics for suggestions", + "Existing verdict JSON files without new fields parse correctly (backward compat)" + ] + } + ] + }, + { + "id": "phase-2", + "title": "Create AggregatedReviewResult NamedTuple and update aggregation", + "tasks": [ + { + "id": "task-2", + "description": "Define AggregatedReviewResult NamedTuple in orchestrator/models.py with fields: verdict, blocking_feedback, advisory_content. Update _aggregate_review_verdicts() to return this type and collect analysis+suggestions from all verdicts.", + "acceptance_criteria": [ + "AggregatedReviewResult NamedTuple defined with verdict, blocking_feedback, advisory_content fields", + "_aggregate_review_verdicts() returns AggregatedReviewResult", + "blocking_feedback collects feedback from needs_revision verdicts (existing behavior)", + "advisory_content collects analysis and suggestions from ALL verdicts including approved", + "Return value is backward compatible at callers (named fields accessible)" + ] + }, + { + "id": "task-3", + "description": "Update the 2 callers of _aggregate_review_verdicts() at pipelines.py:3447 and pipelines.py:5396 to destructure AggregatedReviewResult. Only pass blocking_feedback to prior_feedback for retry loops. 
Log or store advisory_content separately.", + "acceptance_criteria": [ + "Tier 3 caller (pipelines.py:3447) uses result.blocking_feedback for prior_feedback", + "Main loop caller (pipelines.py:5396) uses result.blocking_feedback for review_feedback", + "Advisory content from approved verdicts is NOT injected into retry prompt", + "Advisory content is logged for observability" + ] + } + ] + }, + { + "id": "phase-3", + "title": "Update prompt builder for thoroughness and new verdict format", + "tasks": [ + { + "id": "task-4", + "description": "Update the verdict format section in _build_review_prompt() (pipelines.py:1554-1570) to include analysis and suggestions fields. Replace 'empty if approved' with instructions to always populate analysis and provide non-blocking suggestions where applicable.", + "acceptance_criteria": [ + "Verdict JSON template includes analysis and suggestions fields", + "Instructions say 'Always provide detailed analysis regardless of verdict'", + "Instructions say 'Include non-blocking suggestions for improvement even when approving'", + "The 'empty if approved' language for the feedback field is removed", + "feedback field instruction clarifies it is for blocking issues only (needs_revision)" + ] + }, + { + "id": "task-5", + "description": "Add review conventions (quality standards) to _build_review_prompt(). Inline the 5 comment quality standards from review-conventions.md: comprehensive, specific, direct, suggest fixes, provide context. 
Add 'critical infrastructure' framing for code reviewers.", + "acceptance_criteria": [ + "A Review Conventions section is added to the SDLC review prompt", + "5 quality standards are included: comprehensive, specific, direct, suggest fixes, provide context", + "Code reviewer prompt includes 'critical infrastructure' and 'last line of defense' framing", + "Conventions are inlined in the prompt builder, not loaded from action/review-conventions.md" + ] + }, + { + "id": "task-6", + "description": "Update _get_reviewer_scope_preamble() to add analysis depth expectations per reviewer type. Code: file-by-file analysis. Contract: criterion-by-criterion verification. Refine/plan: section-by-section evaluation. Agent-design: exempt from detailed analysis.", + "acceptance_criteria": [ + "Code reviewer preamble mentions file-by-file analysis expectation", + "Contract reviewer preamble mentions criterion-by-criterion verification expectation", + "Refine reviewer preamble mentions section-by-section evaluation expectation", + "Plan reviewer preamble mentions section-by-section evaluation expectation", + "Agent-design reviewer preamble explicitly states brief approval is acceptable when no concerns found" + ] + }, + { + "id": "task-7", + "description": "Expand the procedural steps for draft-based (non-code) reviewers from 4 steps to include more thorough review guidance. 
Add instructions to cross-reference with criteria sections, provide specific references, and evaluate depth.", + "acceptance_criteria": [ + "Draft-based reviewer steps include instruction to read the draft thoroughly (not skim)", + "Steps include instruction to cross-reference each criteria section with the draft content", + "Steps include instruction to cite specific sections/evidence in the analysis field", + "Steps are expanded from 4 to 6-7 without being overly prescriptive" + ] + } + ] + }, + { + "id": "phase-4", + "title": "Update tests", + "tasks": [ + { + "id": "task-8", + "description": "Update tests/workflows/test_multi_reviewer.py to test aggregation with new ReviewVerdict fields (analysis, suggestions) and AggregatedReviewResult return type.", + "acceptance_criteria": [ + "Tests verify analysis is collected from approved verdicts", + "Tests verify suggestions is collected from approved verdicts", + "Tests verify AggregatedReviewResult fields are correctly populated", + "Tests verify backward compat: verdicts without analysis/suggestions fields still work" + ] + }, + { + "id": "task-9", + "description": "Update orchestrator/tests/test_tier3_execute.py mock ReviewVerdict objects to include new fields where appropriate. 
Ensure existing tests still pass.", + "acceptance_criteria": [ + "Existing mock ReviewVerdict() calls still work (new fields have defaults)", + "At least one test exercises ReviewVerdict with analysis and suggestions populated", + "TestReadReviewVerdict tests backward compatibility with old-format JSON" + ] + }, + { + "id": "task-10", + "description": "Add or update prompt builder tests to verify the new verdict format section, review conventions inclusion, and scope preamble changes.", + "acceptance_criteria": [ + "Test verifies verdict format JSON in prompt includes analysis and suggestions fields", + "Test verifies review conventions text appears in generated prompt", + "Test verifies 'empty if approved' language is NOT present in generated prompt", + "Test verifies agent-design reviewer preamble does NOT require detailed analysis" + ] + } + ] + } + ] + }, + + "files_modified": [ + { + "file": "orchestrator/models.py", + "lines": "97-103", + "change": "Add analysis and suggestions fields to ReviewVerdict. 
Add AggregatedReviewResult NamedTuple.", + "risk": "low" + }, + { + "file": "orchestrator/routes/pipelines.py", + "lines": "1004-1044", + "change": "Update _get_reviewer_scope_preamble() with analysis depth expectations per reviewer type.", + "risk": "low" + }, + { + "file": "orchestrator/routes/pipelines.py", + "lines": "1452-1586", + "change": "Update _build_review_prompt() verdict format section, add review conventions, expand draft reviewer steps.", + "risk": "low" + }, + { + "file": "orchestrator/routes/pipelines.py", + "lines": "1712-1738", + "change": "Update _aggregate_review_verdicts() to return AggregatedReviewResult and collect from all verdicts.", + "risk": "medium — return type change affects 2 callers" + }, + { + "file": "orchestrator/routes/pipelines.py", + "lines": "3447", + "change": "Update Tier 3 caller to destructure AggregatedReviewResult.", + "risk": "low" + }, + { + "file": "orchestrator/routes/pipelines.py", + "lines": "5396", + "change": "Update main phase loop caller to destructure AggregatedReviewResult.", + "risk": "low" + }, + { + "file": "tests/workflows/test_multi_reviewer.py", + "change": "Update aggregation tests for new return type and new fields.", + "risk": "low" + }, + { + "file": "orchestrator/tests/test_tier3_execute.py", + "change": "Update mock ReviewVerdict objects and add backward compat test.", + "risk": "low" + } + ], + + "risks": [ + { + "id": "R-1", + "title": "Token consumption increase", + "likelihood": "certain", + "impact": "low-medium", + "description": "Requiring always-populated analysis will increase token usage per review. With 3 reviewers x up to 3 cycles, this could increase pipeline review cost by 30-50%. However, the whole point is deeper analysis, so increased tokens are the intended outcome, not a side effect.", + "mitigation": "The prompt should include length guidance (e.g., 'thorough but concise analysis, typically 200-500 words') to prevent unbounded growth. 
The agent-design reviewer exemption limits waste on reviews with no findings." + }, + { + "id": "R-2", + "title": "Aggregation return type change breaks callers", + "likelihood": "low", + "impact": "high", + "description": "Changing _aggregate_review_verdicts() return from tuple[str, str] to AggregatedReviewResult requires updating 2 callers. If a caller is missed, it will fail at runtime.", + "mitigation": "NamedTuple is iterable, so existing `verdict, feedback = _aggregate_review_verdicts(...)` destructuring will still work if the NamedTuple has exactly those as its first 2 fields. However, this is fragile. Better to update all callers explicitly and add a type annotation to catch issues at lint time." + }, + { + "id": "R-3", + "title": "Prompt changes don't actually improve review quality", + "likelihood": "low-medium", + "impact": "medium", + "description": "The hypothesis is that schema + prompt changes will produce deeper reviews. If the underlying issue is model capability or context length rather than instructions, the changes may not help.", + "mitigation": "The PR reviewer evidence shows the same model (Opus) producing thorough reviews with better prompting, so the hypothesis is well-supported. After implementation, compare a few SDLC review verdicts pre/post to validate improvement." + }, + { + "id": "R-4", + "title": "Test file uses simplified verdict format", + "likelihood": "medium", + "impact": "low", + "description": "tests/workflows/test_multi_reviewer.py uses plain string verdicts in dicts rather than ReviewVerdict objects. The test may need to be rewritten to use ReviewVerdict objects and test the actual _aggregate_review_verdicts function.", + "mitigation": "Review the test during implementation and determine if it tests a local helper or the actual production function. Update accordingly." 
+ } + ], + + "backward_compatibility": { + "verdict_files": "Old verdict JSON files (without analysis/suggestions) parse correctly due to Pydantic Field defaults. No migration needed.", + "aggregation_callers": "The AggregatedReviewResult NamedTuple change requires updating 2 explicit callers. No external API is affected.", + "prompt_output": "Reviewers will write richer JSON with analysis and suggestions fields. Old consumers that only read verdict/summary/feedback will ignore the new fields (standard JSON behavior).", + "model_validation": "_read_review_verdict() uses ReviewVerdict(**data) — extra fields in JSON are handled by Pydantic's default behavior (ignored unless model_config forbids extras)." + }, + + "acceptance_criteria": [ + "ReviewVerdict model has analysis (str, default='') and suggestions (str, default='') fields", + "AggregatedReviewResult NamedTuple defined with verdict, blocking_feedback, advisory_content fields", + "_aggregate_review_verdicts() returns AggregatedReviewResult collecting analysis/suggestions from ALL verdicts", + "Advisory content from approved verdicts is NOT injected into retry prompt prior_feedback", + "Verdict format in _build_review_prompt() includes analysis and suggestions fields with always-populate instructions", + "'empty if approved' language is removed from the prompt", + "Review conventions (5 quality standards) are included in SDLC review prompts", + "Code reviewer prompt includes 'critical infrastructure' framing", + "Scope preambles include analysis depth expectations per reviewer type", + "Agent-design reviewer is exempt from detailed analysis requirement", + "Draft-based reviewer (refine, plan, contract) steps expanded from 4 to 6-7", + "Old verdict JSON files without new fields parse correctly (backward compatibility)", + "All existing tests pass after changes", + "New tests cover aggregation with new fields and backward compatibility" + ] +} diff --git a/.egg-state/agent-outputs/897-integrator-output.json 
b/.egg-state/agent-outputs/897-integrator-output.json new file mode 100644 index 000000000..b2d3da377 --- /dev/null +++ b/.egg-state/agent-outputs/897-integrator-output.json @@ -0,0 +1,67 @@ +{ + "issue": 897, + "pipeline_id": "issue-897", + "phase": "implement", + "agent": "integrator", + "status": "pass", + "summary": "Integration verified. Resolved 2 merge conflicts in test files, fixed 2 lint issues, all 252 relevant tests pass, full suite passes (6765 pass, 2 pre-existing Docker env failures unrelated to this issue).", + "integration_checks": { + "merge_conflicts_resolved": { + "status": "fixed", + "details": "Found and resolved merge conflict markers in orchestrator/tests/test_pipeline_prompts.py and tests/workflows/test_multi_reviewer.py. Both conflicts were caused by tester WIP auto-commit colliding with coder's commit; resolution kept all tester tests (HEAD side) and removed empty counterparts." + }, + "lint_issues_fixed": { + "status": "fixed", + "details": "Fixed B011 (assert False) in test_pipeline_prompts.py by replacing with pytest.raises(). Removed unused import (types.ModuleType) in test_multi_reviewer.py. Ran ruff format on 2 files." + }, + "test_suite": { + "status": "pass", + "total_tests": 6852, + "passed": 6765, + "failed": 2, + "skipped": 85, + "pre_existing_failures": [ + "orchestrator/tests/test_docker_client.py::TestDockerClientConnection::test_is_connected_false", + "orchestrator/tests/test_docker_client.py::TestContainerOperations::test_start_container_not_found" + ], + "pre_existing_failure_reason": "Docker client tests fail in sandbox environment (Docker is connected when test expects disconnected; mock type mismatch). Not related to issue #897 changes." 
+ }, + "relevant_tests": { + "status": "pass", + "count": 252, + "files": [ + "orchestrator/tests/test_pipeline_prompts.py", + "orchestrator/tests/test_tier3_execute.py", + "tests/workflows/test_multi_reviewer.py" + ] + }, + "lint": { + "status": "pass", + "details": "All ruff check and format rules pass. All custom lint checks pass." + }, + "backward_compatibility": { + "status": "verified", + "details": "ReviewVerdict model uses default empty strings for new fields (analysis, suggestions), so old JSON without these fields parses correctly. AggregatedReviewResult is a NamedTuple supporting both named and positional access. Tests explicitly verify both backward-compat scenarios." + } + }, + "changes_summary": { + "coder_changes": { + "orchestrator/models.py": "Added analysis and suggestions fields to ReviewVerdict. Added AggregatedReviewResult NamedTuple replacing the old (str, str) return type.", + "orchestrator/routes/pipelines.py": "Updated _get_reviewer_scope_preamble with analysis format instructions for each reviewer type. Added expanded procedural steps for draft-based reviewers. Added Review Conventions section to all prompts. Updated verdict JSON template with analysis/suggestions fields and guidelines. Refactored _aggregate_review_verdicts to return AggregatedReviewResult with advisory_content. Updated both call sites in _run_tier3_implement and _run_pipeline. Added advisory content logging.", + "docs/guides/sdlc-pipeline.md": "Updated verdict schema documentation with new fields, field guidelines, and review conventions.", + "orchestrator/README.md": "Updated models.py description to mention ReviewVerdict and AggregatedReviewResult." 
+ }, + "tester_changes": { + "orchestrator/tests/test_pipeline_prompts.py": "24 new tests in TestBuildReviewPrompt class covering verdict format, review conventions, preambles, field guidelines, procedural steps, delta review, prior feedback, phase restrictions, and error handling.", + "orchestrator/tests/test_tier3_execute.py": "5 new tests: backward compat for old JSON, full-field parsing, ReviewVerdict minimal construction, and feedback-only construction.", + "tests/workflows/test_multi_reviewer.py": "16 new tests in TestProductionAggregateReviewVerdicts class covering all approved, needs_revision, None verdicts, advisory content collection, blocking feedback, backward compat, positional access, empty dicts, feedback fallback to summary, multiple needs_revision, mixed verdicts, and suggestions prefix formatting." + }, + "integrator_fixes": { + "orchestrator/tests/test_pipeline_prompts.py": "Resolved merge conflict markers, added pytest import, replaced assert False with pytest.raises.", + "tests/workflows/test_multi_reviewer.py": "Resolved merge conflict markers, removed unused ModuleType import, ran ruff format." 
+ } + }, + "commit": "224a06c29", + "regressions": "none", + "risk_assessment": "low — changes are additive (new fields with defaults), all existing behavior preserved, comprehensive test coverage added" +} diff --git a/.egg-state/agent-outputs/897-risk_analyst-output.json b/.egg-state/agent-outputs/897-risk_analyst-output.json new file mode 100644 index 000000000..27f7bac83 --- /dev/null +++ b/.egg-state/agent-outputs/897-risk_analyst-output.json @@ -0,0 +1,311 @@ +{ + "issue": 897, + "phase": "plan", + "agent": "risk_analyst", + "revision": 1, + "revision_reason": null, + "title": "Risk assessment: Verdict schema expansion and prompt alignment for SDLC reviewers", + + "summary": "The architect's recommended approach (Option A: expand ReviewVerdict model + align prompts) is architecturally sound and addresses all three root causes identified in issue #897. The overall risk profile is LOW-MEDIUM. The most significant risks are: (1) the NamedTuple return type change requires atomic updates to all callers — the architect's mitigation claim about NamedTuple iterable unpacking is incorrect and this will cause runtime errors if any caller is missed, (2) the test suite has a critical gap — test_multi_reviewer.py tests a local stub function, not the production _aggregate_review_verdicts(), so aggregation behavior changes may pass tests while being broken, and (3) advisory content from approved verdicts has no defined consumer in the implementation plan, creating dead code. No risks warrant blocking the approach; all are mitigable with the strategies below.", + + "architect_approach_assessment": { + "recommended_approach": "A — Expand Verdict Schema + Align Prompts", + "assessment": "Sound. The approach correctly identifies that both the schema structure and prompt framing must change together to produce the desired behavioral shift. 
Adding dedicated `analysis` and `suggestions` fields with always-populate semantics is better than overloading existing fields (Option B) or adding a third verdict state (Option C). The Pydantic default mechanism provides clean backward compatibility.", + "agreement_with_architect": true, + "disagreements": [ + { + "topic": "NamedTuple unpacking backward compatibility", + "architect_claim": "R-2 mitigation says 'NamedTuple is iterable, so existing verdict, feedback = _aggregate_review_verdicts(...) destructuring will still work if the NamedTuple has exactly those as its first 2 fields'", + "correction": "This is incorrect. Python raises ValueError('too many values to unpack') when unpacking a 3-element iterable into 2 variables. A 3-field NamedTuple CANNOT be destructured into 2 variables. Both callers MUST be updated atomically with the return type change. This is not a graceful degradation scenario — it is a hard failure.", + "severity": "medium", + "recommendation": "Update the implementation plan to emphasize that the aggregation return type change and both caller updates (tasks 2 and 3) MUST be in the same commit. Add a test that explicitly validates the return type." + }, + { + "topic": "Advisory content has no consumer", + "architect_claim": "AD-5 says advisory content should be included in phase-completion comments and available for the integrator", + "correction": "The implementation plan (phases 1-4) has no task for surfacing advisory_content to phase-completion comments or the integrator. The aggregation will collect advisory content into advisory_content field, but no code will read or display it. Task-3 only says 'Log or store advisory_content separately' with no specific implementation. This creates dead code that produces the illusion of capturing suggestions without actually surfacing them.", + "severity": "medium", + "recommendation": "Add a task to surface advisory_content in the phase-completion issue comment. 
Without this, the approved-verdict suggestions are still structurally lost — just collected into a variable that goes nowhere instead of never being collected at all." + } + ] + }, + + "risks": [ + { + "id": "R-1", + "category": "correctness", + "title": "NamedTuple return type change breaks callers if not updated atomically", + "description": "Changing _aggregate_review_verdicts() from tuple[str, str] to a 3-field AggregatedReviewResult NamedTuple will cause ValueError at both call sites (pipelines.py:3447 and pipelines.py:5396) if they still use `a, b = _aggregate_review_verdicts(...)` destructuring. Python does not allow unpacking a 3-element iterable into 2 variables.", + "likelihood": "certain_if_not_mitigated", + "impact": "high", + "impact_detail": "Runtime crash in the review loop for all pipeline phases. Every pipeline run would fail at the verdict aggregation step. This is a P0 breakage since it affects all SDLC pipelines.", + "affected_files": [ + "orchestrator/routes/pipelines.py:3447 (Tier 3 caller)", + "orchestrator/routes/pipelines.py:5396 (main phase loop caller)" + ], + "mitigation": "Ensure tasks 2 and 3 from the architect's plan are implemented in the same commit. Add a unit test for _aggregate_review_verdicts() that validates the return type and field access patterns. Consider using a 2-field NamedTuple (verdict, feedback) plus a separate function to get advisory content, if atomic update proves difficult to guarantee.", + "rollback": "Revert the single commit. No data migration needed since old verdict files work with new or old model.", + "human_review_needed": false + }, + { + "id": "R-2", + "category": "test_coverage", + "title": "Production aggregation function has no direct unit tests", + "description": "tests/workflows/test_multi_reviewer.py defines a local `aggregate_verdicts(verdicts: dict) -> str` function (line 239) that takes plain string verdicts. 
It does NOT test the production `_aggregate_review_verdicts()` which takes `dict[str, ReviewVerdict | None]`. This means: (a) the production function's current behavior is unverified by unit tests, (b) changes to the production function will not be caught by this test file, and (c) the new AggregatedReviewResult return type will have no coverage from the tests the architect plans to update (task-8).", + "likelihood": "certain", + "impact": "medium", + "impact_detail": "Tests may pass while the production aggregation function is broken. The only coverage for the production function comes from integration mocks in test_tier3_execute.py, which mock the function rather than testing it directly.", + "affected_files": [ + "tests/workflows/test_multi_reviewer.py (tests wrong function)", + "orchestrator/tests/test_tier3_execute.py (mocks the function, doesn't test it)" + ], + "mitigation": "Task-8 should be revised to either: (a) replace the local `aggregate_verdicts()` stub with an import of the production `_aggregate_review_verdicts()`, or (b) add new tests that directly call the production function with ReviewVerdict objects. The existing local function can remain for testing the simplified aggregation logic, but new tests must cover the production function.", + "rollback": "N/A — this is a pre-existing gap that should be fixed as part of this change.", + "human_review_needed": false + }, + { + "id": "R-3", + "category": "behavioral", + "title": "Advisory content collected but never surfaced (dead code path)", + "description": "The architect's plan collects advisory_content in AggregatedReviewResult but the implementation plan has no task to surface it. AD-5 says it should appear in phase-completion comments and be available to the integrator, but no implementation task covers this. 
The advisory_content field will be populated, logged, and then discarded — meaning approved-verdict suggestions are still effectively lost.", + "likelihood": "certain", + "impact": "medium", + "impact_detail": "The core promise of issue #897 is that reviewer observations on approved work are actionable. If advisory content is collected but not surfaced, the user-visible behavior for 'approve with suggestions' is unchanged. The schema and prompt improvements will still produce deeper analysis in the verdict JSON files (readable by humans), but the pipeline automation doesn't act on them.", + "affected_files": [ + "orchestrator/routes/pipelines.py (phase-completion comment posting, location TBD)" + ], + "mitigation": "Add a task to the implementation plan that surfaces advisory_content in the phase-completion GitHub issue comment. This is where the human reviews pipeline results. Even a simple 'Advisory suggestions from reviewers:' section appended to the comment would close the loop. The integrator can read raw verdict files, so no additional integration work is needed there.", + "rollback": "N/A — gap in plan, not a regression.", + "human_review_needed": true, + "human_review_reason": "The scope of how/where advisory content is surfaced is a product decision. Options: (a) append to phase-completion comment, (b) write a separate summary file, (c) defer to a follow-up issue. Human should decide which approach to take." + }, + { + "id": "R-4", + "category": "cost", + "title": "Token consumption increase per review cycle", + "description": "Requiring always-populated analysis and encouraging non-blocking suggestions will increase output token usage per review verdict. Rough estimate: each reviewer produces ~300-500 additional words of analysis (~400-700 tokens). With 3 reviewers x up to 3 cycles = 9 reviewer invocations per pipeline, this adds ~3,600-6,300 output tokens per pipeline run. 
At Opus output pricing (~$75/M tokens), this is ~$0.27-$0.47 per pipeline run.", + "likelihood": "certain", + "impact": "low", + "impact_detail": "The increased cost is the intended outcome — deeper analysis requires more tokens. The per-pipeline increase (~$0.50 worst case) is small relative to total pipeline cost (typically $5-15 for a full SDLC run with multiple agents). The prompt input token increase from adding review conventions (~200 tokens) is negligible.", + "affected_files": [], + "mitigation": "Include length guidance in the prompt (e.g., 'thorough but concise analysis, typically 200-500 words') as the architect suggests. Monitor actual token usage in the first few pipeline runs after deployment. The agent-design reviewer exemption (AD-2) avoids wasted tokens on reviews with no findings.", + "rollback": "Revert prompt changes to restore previous token consumption levels.", + "human_review_needed": false + }, + { + "id": "R-5", + "category": "behavioral", + "title": "Prompt changes may cause over-reporting: fabricated or low-value findings", + "description": "Aggressive thoroughness framing ('critical infrastructure', 'last line of defense', 'find ALL issues', 'always provide detailed analysis') may cause reviewers to over-report. When an agent is pressured to always produce analysis, it may fabricate issues or flag trivial style concerns as substantive findings to fill the analysis field. This is particularly risky for the contract and refine/plan reviewers where clean work genuinely has little to critique.", + "likelihood": "low-medium", + "impact": "medium", + "impact_detail": "If reviewers start producing false positives in their analysis, it degrades trust in the review system. If false positives appear in the feedback field (even non-blocking suggestions), they could trigger unnecessary revision cycles or confuse implementing agents. 
This counterproductive behavior was observed in LLM reviewer research where forcing detailed output led to hallucinated concerns.", + "affected_files": [ + "orchestrator/routes/pipelines.py:1004-1044 (scope preambles)", + "orchestrator/routes/pipelines.py:1452-1586 (review prompt builder)" + ], + "mitigation": "The prompt should explicitly state: 'If the work genuinely has no issues in your review scope, say so briefly in the analysis field rather than inventing concerns. A short analysis confirming quality is better than a long analysis fabricating problems.' The agent-design reviewer exemption (AD-2) already handles this for that reviewer type. Apply similar guidance to contract and refine/plan reviewers: 'If all criteria are met, confirm each criterion is met with a brief evidence citation rather than searching for problems that don't exist.'", + "rollback": "Revert prompt changes to restore current behavior.", + "human_review_needed": false + }, + { + "id": "R-6", + "category": "maintenance", + "title": "Review conventions drift between PR and SDLC systems", + "description": "AD-4 chooses to inline review conventions into the prompt builder rather than loading from action/review-conventions.md. This creates two copies of the review quality standards: one in action/review-conventions.md (for PR reviewers) and one hard-coded in orchestrator/routes/pipelines.py (for SDLC reviewers). Future updates to conventions will need to be applied in both locations.", + "likelihood": "medium", + "impact": "low", + "impact_detail": "If conventions drift, PR and SDLC reviewers will apply different quality standards. This is the exact class of problem issue #897 is trying to fix — divergent behavior between the two reviewer systems. 
The drift would be gradual and hard to detect.", + "affected_files": [ + "action/review-conventions.md", + "orchestrator/routes/pipelines.py (_build_review_prompt)" + ], + "mitigation": "Two options: (a) Accept the inlining approach but add a code comment in both locations referencing the other, so editors know to update both. (b) Instead of inlining, create a shared/prompts/review-conventions.md file that both systems load. The architect rejected (b) due to cross-boundary dependency, but shared/prompts/ already contains shared criteria files (code-review-criteria.md, etc.), so this directory is already a shared dependency. Adding review-conventions.md there would be consistent with existing patterns.", + "rollback": "N/A — architectural choice, not a regression.", + "human_review_needed": true, + "human_review_reason": "Should review conventions be shared via shared/prompts/ (consistent with existing criteria sharing pattern) or inlined (as the architect recommends)? This is a maintainability tradeoff the human should decide." + }, + { + "id": "R-7", + "category": "compatibility", + "title": "Pydantic extra field handling is implicitly relied upon", + "description": "The verdict JSON template instructs agents to write a 'reviewer' field, but ReviewVerdict has no 'reviewer' field. This works because Pydantic v2 defaults to ignoring extra fields. The model has no explicit model_config. If someone later adds model_config with extra='forbid' (a common Pydantic best practice), _read_review_verdict() will start failing on all verdict files. Adding 2 new fields (analysis, suggestions) to the model increases the coupling to this implicit behavior — old verdict files will still have the 'reviewer' extra field.", + "likelihood": "low", + "impact": "medium", + "impact_detail": "Not an immediate risk. However, the implicit reliance on Pydantic's default extra-field handling is fragile. 
A future refactoring that adds extra='forbid' would break verdict parsing for all existing and new verdict files.", + "affected_files": [ + "orchestrator/models.py:97-103 (ReviewVerdict)", + "orchestrator/routes/pipelines.py:1621-1624 (_read_review_verdict parsing)" + ], + "mitigation": "Either: (a) add `reviewer` as an optional field to ReviewVerdict (since the prompt template includes it), or (b) add explicit `model_config = ConfigDict(extra='ignore')` to ReviewVerdict to document the intentional behavior and prevent accidental breakage from a future extra='forbid' change. Option (b) is lower risk and more explicit.", + "rollback": "N/A — pre-existing fragility.", + "human_review_needed": false + }, + { + "id": "R-8", + "category": "correctness", + "title": "Tier 3 test mocks may mask integration issues", + "description": "test_tier3_execute.py uses mock_read_verdict to return pre-built ReviewVerdict objects, bypassing the actual JSON parsing in _read_review_verdict(). With the new fields, mocks will use ReviewVerdict defaults (empty strings) which is correct, but the tests won't validate that agent-written JSON with the new fields actually parses correctly end-to-end.", + "likelihood": "low", + "impact": "low", + "impact_detail": "If an agent writes malformed JSON for the new fields (e.g., a list instead of a string for analysis), the mock-based tests won't catch it. Only real pipeline runs would reveal the issue.", + "affected_files": [ + "orchestrator/tests/test_tier3_execute.py" + ], + "mitigation": "Add a specific test in TestReadReviewVerdict (test_tier3_execute.py:682) that reads a JSON file containing the new fields and validates they parse into ReviewVerdict correctly. 
Also test with the 'reviewer' extra field present to confirm Pydantic ignores it.", + "rollback": "N/A — test improvement.", + "human_review_needed": false + }, + { + "id": "R-9", + "category": "deployment", + "title": "Active pipelines during deployment may see mixed behavior", + "description": "If the orchestrator is restarted to deploy the new code while a pipeline is in a review cycle, the following can happen: (a) reviews already written with old schema will be read by new aggregation code — this is safe due to Pydantic defaults. (b) New prompt will be served to reviewers mid-cycle — this is safe since each reviewer session is independent. (c) The review cycle may have some verdicts in old format and some in new format — the aggregation handles this via defaults.", + "likelihood": "low", + "impact": "low", + "impact_detail": "No data corruption or pipeline failure. Worst case: one review cycle produces mixed-depth verdicts (some old-style shallow, some new-style deep). The overall verdict logic is unaffected.", + "affected_files": [], + "mitigation": "No special deployment sequencing needed. The Pydantic defaults provide natural backward compatibility. For extra safety, deploy during low-pipeline-activity windows.", + "rollback": "Standard orchestrator restart with previous version.", + "human_review_needed": false + }, + { + "id": "R-10", + "category": "security", + "title": "No new attack surface introduced", + "description": "The changes modify internal prompt construction, JSON schema fields, and aggregation logic. No new external inputs, API endpoints, or authentication changes are introduced. 
The verdict JSON is written by sandboxed agents and read by the orchestrator — the trust boundary is unchanged.", + "likelihood": "negligible", + "impact": "negligible", + "impact_detail": "No security risk identified.", + "affected_files": [], + "mitigation": "None required.", + "rollback": "N/A", + "human_review_needed": false + } + ], + + "risk_matrix": { + "high_priority": [ + { + "risk_id": "R-1", + "title": "NamedTuple return type breaks callers", + "action": "Ensure atomic commit for aggregation + caller updates. Add unit test for return type." + } + ], + "medium_priority": [ + { + "risk_id": "R-2", + "title": "Production aggregation untested", + "action": "Add direct unit tests for _aggregate_review_verdicts() with ReviewVerdict objects." + }, + { + "risk_id": "R-3", + "title": "Advisory content has no consumer", + "action": "Add task to surface advisory_content in phase-completion comments. Human decision needed on approach." + }, + { + "risk_id": "R-5", + "title": "Over-reporting risk from aggressive prompting", + "action": "Add anti-hallucination guidance to prompts for all reviewer types." + } + ], + "low_priority": [ + { + "risk_id": "R-4", + "title": "Token cost increase", + "action": "Include length guidance in prompts. Monitor post-deployment." + }, + { + "risk_id": "R-6", + "title": "Review conventions drift", + "action": "Human decision: inline vs shared file. Either way, add cross-references." + }, + { + "risk_id": "R-7", + "title": "Pydantic extra field handling implicit", + "action": "Add explicit model_config or reviewer field to ReviewVerdict." + }, + { + "risk_id": "R-8", + "title": "Mock-based tests miss JSON parsing issues", + "action": "Add JSON round-trip test in TestReadReviewVerdict." + }, + { + "risk_id": "R-9", + "title": "Mixed behavior during deployment", + "action": "No action needed — Pydantic defaults provide compatibility." 
+ } + ] + }, + + "areas_requiring_human_review": [ + { + "area": "Advisory content surfacing strategy", + "reason": "The implementation plan collects advisory_content but has no task to surface it. How and where should approved-verdict suggestions be shown to humans? Options: (a) append to phase-completion issue comment, (b) write separate summary file, (c) defer to follow-up issue.", + "risk_ids": ["R-3"], + "recommendation": "Option (a) — append to phase-completion comment. This is the natural place where humans review pipeline results and requires minimal additional code." + }, + { + "area": "Review conventions sharing strategy", + "reason": "The architect chose to inline conventions (AD-4) rather than share via shared/prompts/. This creates a maintenance burden since the same conventions would exist in two places. The shared/prompts/ directory already contains shared criteria files, so adding review-conventions.md there would be consistent.", + "risk_ids": ["R-6"], + "recommendation": "Use shared/prompts/review-conventions.md for consistency with existing shared criteria pattern. The cross-boundary dependency concern is already accepted for criteria files." + } + ], + + "rollback_plan": { + "strategy": "Single-commit revert", + "description": "All changes should be in a single PR (possibly multi-commit but squash-mergeable). Rollback is a git revert of the merge commit. No data migration is needed in either direction because: (a) old verdict files parse correctly with new model (Pydantic defaults), (b) new verdict files with extra fields parse correctly with old model (Pydantic ignores extras by default), (c) the aggregation function return type change is code-only with no persistent state.", + "steps": [ + "1. Revert the merge commit on main", + "2. Deploy the orchestrator with reverted code", + "3. No data cleanup needed — old and new verdict files are intercompatible", + "4. 
In-flight pipelines will continue normally with old behavior" + ], + "data_migration_needed": false, + "downtime_required": false + }, + + "implementation_plan_amendments": [ + { + "amendment": "Make tasks 2 and 3 atomic", + "reason": "The AggregatedReviewResult NamedTuple with 3 fields cannot be destructured into 2 variables. Both callers MUST be updated in the same commit as the return type change.", + "risk_id": "R-1", + "priority": "required" + }, + { + "amendment": "Add task: surface advisory_content in phase-completion comments", + "reason": "Without this, the advisory content collected by the new aggregation is dead code. The approved-verdict suggestions would still be lost at the pipeline level.", + "risk_id": "R-3", + "priority": "required" + }, + { + "amendment": "Revise task-8 to test the production _aggregate_review_verdicts function", + "reason": "test_multi_reviewer.py tests a local stub function, not the production function. New tests should import and test the actual production function.", + "risk_id": "R-2", + "priority": "required" + }, + { + "amendment": "Add anti-hallucination prompt guidance", + "reason": "Prevent reviewers from fabricating issues to fill the always-required analysis field.", + "risk_id": "R-5", + "priority": "recommended" + }, + { + "amendment": "Add explicit model_config = ConfigDict(extra='ignore') to ReviewVerdict", + "reason": "Document the intentional Pydantic behavior of ignoring extra fields like 'reviewer' in verdict JSON.", + "risk_id": "R-7", + "priority": "recommended" + }, + { + "amendment": "Add JSON round-trip test in TestReadReviewVerdict for new fields", + "reason": "Ensure agent-written JSON with analysis and suggestions fields parses correctly end-to-end.", + "risk_id": "R-8", + "priority": "recommended" + } + ], + + "overall_risk_level": "LOW-MEDIUM", + "overall_assessment": "The proposed approach is sound and the risks are manageable. 
The most critical finding is that the architect's R-2 mitigation regarding NamedTuple unpacking is factually incorrect — this will cause a hard runtime failure if callers are not updated atomically. The second finding is a plan gap: advisory content is collected but has no consumer, which means the 'approve with suggestions' feature is incomplete without an additional task. The third finding is a test coverage gap that predates this issue but should be fixed alongside the changes. All risks have straightforward mitigations. No risk warrants blocking the approach or choosing an alternative. The rollback path is clean due to Pydantic's backward-compatible field defaults.", + "blocking_risks": false +} diff --git a/.egg-state/checks/897-implement-results.json b/.egg-state/checks/897-implement-results.json new file mode 100644 index 000000000..2f76049a5 --- /dev/null +++ b/.egg-state/checks/897-implement-results.json @@ -0,0 +1,20 @@ +{ + "all_passed": true, + "checks": [ + { + "name": "lint", + "passed": true, + "output": "==> Ruff check...\nAll checks passed!\n==> Ruff format check...\n431 files already formatted\n==> Mypy...\nSuccess: no issues found in 118 source files\n==> Shellcheck...\n==> Yamllint...\nSKIP: yamllint not installed\n==> Hadolint...\nSKIP: hadolint not installed\n==> Actionlint...\nSKIP: actionlint not installed\n==> Custom checks...\n bin-symlinks...\nOK: All 2 bin/ symlinks are valid\n claude-imports...\nWarning: host-services directory not found\n container-host-boundary...\nOK: No forbidden host-services imports found in sandbox\n container-paths...\nOK: No problematic sys.path patterns found\n docker-and-claude-invocations...\nOK: No docker/claude invocation violations found\n gh-cli-usage...\nWarning: host-services directory not found\n hardcoded-ports...\nOK: No hardcoded port numbers found\n llm-api-calls...\nOK: No direct LLM API usage found outside sandbox\n model-versions...\nOK: No non-alias Claude model references found\n 
reviewer-job-names...\nOK: All reviewer jobs use the required naming prefix\n workflow-secrets...\nOK: No untrusted script execution with secrets found in workflows" + }, + { + "name": "test", + "passed": true, + "output": "6767 passed, 85 skipped, 4 warnings in 73.51s (0:01:13)" + }, + { + "name": "security", + "passed": true, + "output": "==> Running security scan...\nSKIP: bandit not installed" + } + ] +} diff --git a/.egg-state/contracts/897.json b/.egg-state/contracts/897.json new file mode 100644 index 000000000..4145db9a6 --- /dev/null +++ b/.egg-state/contracts/897.json @@ -0,0 +1,103 @@ +{ + "schemaVersion": "1.0", + "issue": { + "number": 897, + "title": "Issue #897", + "url": "https://github.com/jwbron/egg/issues/897" + }, + "pipeline_id": null, + "current_phase": "refine", + "acceptance_criteria": [], + "phases": [], + "decisions": [], + "workflow_owner": null, + "audit_log": [], + "refine_review_cycles": 0, + "refine_review_feedback": "", + "plan_review_cycles": 0, + "plan_review_feedback": "", + "pr": null, + "feedback": null, + "phase_configs": null, + "agent_executions": [ + { + "role": "coder", + "phase_id": null, + "status": "complete", + "started_at": "2026-02-24T07:54:30.454529Z", + "completed_at": "2026-02-24T08:06:09.760873Z", + "commit": null, + "checkpoint_id": null, + "outputs": {}, + "error": null, + "retry_count": 0, + "conflicts": [] + }, + { + "role": "tester", + "phase_id": null, + "status": "complete", + "started_at": "2026-02-24T08:06:09.765457Z", + "completed_at": "2026-02-24T08:12:24.333816Z", + "commit": null, + "checkpoint_id": null, + "outputs": {}, + "error": null, + "retry_count": 0, + "conflicts": [] + }, + { + "role": "documenter", + "phase_id": null, + "status": "complete", + "started_at": "2026-02-24T08:06:09.766204Z", + "completed_at": "2026-02-24T08:14:28.086147Z", + "commit": null, + "checkpoint_id": null, + "outputs": {}, + "error": null, + "retry_count": 0, + "conflicts": [] + }, + { + "role": "integrator", + 
"phase_id": null, + "status": "complete", + "started_at": "2026-02-24T08:14:28.090646Z", + "completed_at": "2026-02-24T08:22:37.308664Z", + "commit": null, + "checkpoint_id": null, + "outputs": {}, + "error": null, + "retry_count": 0, + "conflicts": [] + }, + { + "role": "reviewer_contract", + "phase_id": null, + "status": "complete", + "started_at": null, + "completed_at": "2026-02-24T08:35:28.827301Z", + "commit": null, + "checkpoint_id": null, + "outputs": {}, + "error": null, + "retry_count": 0, + "conflicts": [] + }, + { + "role": "reviewer_code", + "phase_id": null, + "status": "complete", + "started_at": null, + "completed_at": "2026-02-24T08:37:24.767396Z", + "commit": null, + "checkpoint_id": null, + "outputs": {}, + "error": null, + "retry_count": 0, + "conflicts": [] + } + ], + "multi_agent_config": null +} diff --git a/.egg-state/drafts/897-analysis.md b/.egg-state/drafts/897-analysis.md new file mode 100644 index 000000000..b2d2c3689 --- /dev/null +++ b/.egg-state/drafts/897-analysis.md @@ -0,0 +1,203 @@ +# Analysis: SDLC Pipeline Reviewers Produce Shallow Analysis Compared to PR Reviewers + +> Issue: #897 | Phase: refine + +## Problem Statement + +The SDLC pipeline's internal review agents (code, contract, agent-design, refine, plan) produce significantly shallower analysis than the automated PR reviewers running on GitHub Actions, despite both systems consuming the same shared criteria files (`shared/prompts/code-review-criteria.md`, `shared/prompts/contract-review-criteria.md`, `shared/prompts/agent-design-criteria.md`). + +Evidence from PR #895 (issue #871) shows SDLC reviewers writing ~1-3 sentence verdicts with `"feedback": ""` even when scope creep and real issues were present, while PR reviewers on the same code produced 20-row task tables, identified 5 specific code issues (driving a follow-up commit), and gave substantive multi-round reviews. 
+ +The desired outcome is SDLC reviewers producing at minimum the same depth of analysis as PR reviewers — with detailed file-by-file analysis, advisory suggestions on approved work, and actionable output rather than a pass/fail stamp. + +## Current Behavior + +### Verdict Schema Constrains Depth + +The `ReviewVerdict` model (`orchestrator/models.py:97-103`) has four fields: + +```python +class ReviewVerdict(BaseModel): + verdict: str # "approved" or "needs_revision" + summary: str # "Brief summary of review findings" + feedback: str # "Detailed feedback if needs_revision" + timestamp: str # ISO 8601 timestamp +``` + +The prompt at `orchestrator/routes/pipelines.py:1554-1570` instructs reviewers: + +```json +"feedback": "Detailed feedback if needs_revision, empty if approved" +``` + +This **explicitly tells reviewers to write empty feedback when approving**. There is no field for advisory suggestions, non-blocking observations, or detailed analysis that should always be populated. + +### Aggregation Ignores Approved Verdicts + +`_aggregate_review_verdicts()` at `orchestrator/routes/pipelines.py:1712-1738` only collects feedback from `needs_revision` verdicts. If a reviewer approves with observations in `summary`, those observations are silently discarded — they are never surfaced to the implementing agent or the human. 
+ +### SDLC Prompt is Less Detailed Than PR Prompt + +**PR code reviewer** (`action/build-review-prompt.sh`) gets: +- "Comprehensive, thorough code review" framing +- "Critical infrastructure — last line of defense before production" +- 6 detailed procedural steps +- Review conventions from `action/review-conventions.md` (comprehensive, specific, direct, suggest fixes, provide context) +- `` marker for approvals with advisory feedback + +**SDLC code reviewer** (`orchestrator/routes/pipelines.py:1509-1526`) for non-draft reviews gets: +- 9 procedural steps (briefer than PR counterpart) +- Scope preamble with "Be thorough" and "Find ALL issues" (`pipelines.py:1014-1022`) +- **No review conventions** — `review-conventions.md` is not loaded +- No equivalent to `has-suggestions` — binary approved/needs_revision only + +**SDLC non-code reviewers** (refine, plan, contract, agent-design) for draft-based reviews get: +- 4 steps: read draft, evaluate against criteria, write verdict JSON, commit (`pipelines.py:1523-1526`) +- No thoroughness framing, no review conventions, no procedural guidance +- The criteria themselves are substantive (refine has 6 sections, plan has 7), but the prompt wrapping them gives no instructions on how deep the analysis should be + +### No "Approve with Suggestions" Concept in SDLC + +PR reviewers have the `` marker (`action/review-conventions.md:34`) which promotes an `approve` to `approve-with-suggestions`, triggering the `on-review-feedback.yml` workflow so suggestions get acted on. The SDLC pipeline has no equivalent — observations from an approving reviewer are structurally lost. + +## Constraints + +- **Backward compatibility**: The `ReviewVerdict` model is consumed by `_read_review_verdict()` (`pipelines.py:1589-1632`), `_aggregate_review_verdicts()` (`pipelines.py:1712-1738`), and all existing verdict JSON files stored in `.egg-state/reviews/`. Any model changes must handle both old and new format gracefully. 
+- **JSON output format**: SDLC reviewers write JSON to files that are programmatically parsed, unlike PR reviewers who write free-form markdown to PR comments. The JSON format creates a ceiling on expressiveness unless new fields are added. +- **Token budget**: Making reviewers produce more detailed output increases token consumption per review cycle. This matters because reviews run on Opus, and the SDLC pipeline can run 3 reviewers × up to 3 cycles. +- **Reviewer scope separation**: The SDLC pipeline deliberately separates reviewers by concern (code, contract, agent-design). Adding thoroughness requirements must respect these boundaries — e.g., the agent-design reviewer should not be pressured into writing detailed feedback when there are no agent-design issues. +- **Agent-design reviewer is intentionally brief**: The agent-design criteria explicitly says "Only comment if you find agent-mode design issues. If the PR has no agent-mode concerns, approve with a brief note" (`action/build-agent-mode-design-review-prompt.sh:126-127`). Forcing thoroughness here would be counterproductive. +- **Shared criteria files**: Both systems consume the same criteria files. Changes to shared criteria affect PR reviewers too. +- **Test coverage**: `_aggregate_review_verdicts()` has no direct unit tests — only mock usage in tier 3 tests. The `ReviewVerdict` model is tested in `orchestrator/tests/test_models.py`. Prompt building is tested in `orchestrator/tests/test_pipeline_prompts.py`. + +## Options Considered + +### Option A: Expand Verdict Schema + Align Prompts (Comprehensive Fix) + +**Approach**: Add `analysis` and `suggestions` fields to `ReviewVerdict` that are always populated regardless of verdict. Update the prompt builder to include review conventions and thoroughness instructions matching the PR reviewer. Update aggregation to surface suggestions from approved verdicts. 
+ +**Schema change**: +```python +class ReviewVerdict(BaseModel): + verdict: str + summary: str + analysis: str = "" # NEW: detailed file-by-file or section-by-section analysis (always populated) + suggestions: str = "" # NEW: non-blocking suggestions (populated even on approve) + feedback: str = "" # existing: blocking feedback for needs_revision + timestamp: str = "" +``` + +**Prompt changes**: +- Update verdict format instructions to require `analysis` always +- Explicitly say: "Always provide detailed analysis regardless of verdict" +- Remove "empty if approved" language from `feedback` field +- Include review conventions content (from `review-conventions.md`) in SDLC prompts +- Add the "last line of defense" and "critical infrastructure" framing for code reviewers + +**Aggregation changes**: +- `_aggregate_review_verdicts()` collects `analysis` + `suggestions` from all verdicts, even approved ones +- Return value changes: `(overall_verdict, combined_feedback, combined_analysis)` or similar + +**Pros**: +- Directly addresses all three root causes identified in the issue +- Clear separation between blocking feedback, non-blocking suggestions, and analysis +- Backward compatible — new fields have defaults, old JSON files parse fine +- Aligns SDLC reviewers with the PR reviewer standard + +**Cons**: +- Larger scope of changes across model, prompt builder, and aggregation +- Increases token consumption per review (reviewers must always write analysis) +- May be over-structured — adding fields doesn't guarantee quality; the prompt is what really drives behavior + +### Option B: Prompt-Only Fix (Minimal Schema Change) + +**Approach**: Keep the verdict schema mostly as-is but rewrite the prompt instructions to demand thoroughness. Change "empty if approved" to require detailed feedback always. Load `review-conventions.md` content into SDLC prompts. The existing `summary` and `feedback` fields can carry richer content with better prompting. 
+ +**Prompt changes**: +- Remove "empty if approved" instruction +- Replace with: "Always provide detailed analysis in the summary field. Provide non-blocking suggestions in the feedback field even when approving." +- Add review conventions (comprehensive, specific, direct, suggest fixes) +- Add "critical infrastructure" framing for code reviewers +- For non-code reviewers (refine, plan), add structured analysis expectations keyed to their criteria sections + +**Aggregation changes**: +- Update `_aggregate_review_verdicts()` to also collect `summary` and `feedback` from approved verdicts (not just needs_revision) + +**Pros**: +- Smaller scope — no model changes needed +- Addresses the key issue (prompt instructions drive behavior) +- Lower risk of breaking existing verdict parsing + +**Cons**: +- Overloads existing fields (`summary` for analysis, `feedback` for suggestions) — field semantics become ambiguous +- Harder for downstream code to distinguish blocking vs non-blocking feedback +- The "approved with empty feedback" pattern is reinforced by the field description in the model itself + +### Option C: Add "Approved with Suggestions" Verdict + Prompt Improvements + +**Approach**: Add a third verdict value (`approved_with_suggestions`) alongside the prompt improvements from Option B. When a reviewer approves but has suggestions, they use this verdict. The aggregation logic treats it as `approved` for overall verdict but surfaces the feedback for action. + +**Schema change**: Add `approved_with_suggestions` as a valid verdict value. No new fields. + +**Prompt changes**: Same as Option B, plus instructions about when to use each verdict. + +**Aggregation changes**: `approved_with_suggestions` → overall still `approved`, but feedback is collected and surfaced. 
+ +**Pros**: +- Mirrors the PR reviewer's `has-suggestions` concept +- Clear semantic distinction without adding fields +- Moderate scope + +**Cons**: +- Three-way verdict adds complexity to all verdict-handling code paths +- Doesn't address the shallow analysis problem for clean approvals (an approve with no suggestions still has no analysis requirement) +- Requires updating every `if verdict == "approved"` check throughout the codebase + +## Recommended Approach + +**Option A: Expand Verdict Schema + Align Prompts**. This directly addresses all three root causes: + +1. **Schema constraint** → New `analysis` and `suggestions` fields that are always populated give reviewers structured space for depth. +2. **Prompt gap** → Including review conventions and thoroughness framing aligns SDLC prompts with PR prompts. +3. **No approve-with-suggestions** → The `suggestions` field serves this purpose without needing a third verdict value. + +The key insight from the issue is that **the verdict format structurally discourages thorough analysis**. Option B improves prompting but leaves the structural incentive intact — an agent told to write JSON with a field called `feedback` described as "empty if approved" will naturally minimize effort. Option A changes the structure to match the desired behavior. + +Backward compatibility is straightforward: new fields have empty-string defaults, so old JSON files parse without error. The aggregation function already handles `None` verdicts gracefully, and extending it to collect analysis/suggestions from all verdicts is a natural extension. + +The agent-design reviewer should be exempted from the "always populate analysis" requirement — its criteria explicitly says to approve briefly when there are no concerns. This aligns with its PR counterpart's behavior. + +## Open Questions + +### Design Decisions + +1. 
**Should the `analysis` field use a structured sub-format?** For code reviewers, should analysis be a list of per-file findings (like the PR reviewer's file-by-file approach), or free-form markdown? A structured format increases parsability but constrains the reviewer. The PR reviewer uses free-form markdown successfully. + +2. **How should approved-verdict suggestions be surfaced to the implementing agent?** Options: (a) include them in the next cycle's prompt as "advisory feedback", (b) write them to a separate file agents can optionally read, (c) only surface them to the human in the phase-completion comment. Option (a) risks agents treating non-blocking suggestions as blocking requirements. + +3. **Should the agent-design reviewer be exempt from the always-populate-analysis requirement?** Its PR counterpart explicitly says "approve with a brief note" when there are no agent-mode concerns. Forcing it to write detailed analysis seems counterproductive, but applying different rules per reviewer type adds complexity. + +4. **Should review conventions be loaded from `action/review-conventions.md` or duplicated into the orchestrator?** Loading from the file keeps them in sync with PR reviewers but creates a dependency across the action/orchestrator boundary. Duplicating risks drift. + +5. **Should the `feedback` field semantics change?** Currently "detailed feedback if needs_revision". Should it become "blocking feedback for needs_revision, non-blocking suggestions for approved"? Or should `suggestions` fully replace its role for non-blocking items, leaving `feedback` as blocking-only? + +### Scope Decisions + +6. **Should existing verdict files be migrated?** Old verdicts in `.egg-state/reviews/` lack `analysis` and `suggestions` fields. The default empty strings handle this, but should a one-time migration backfill them for consistency? + +7. 
**Should the aggregation function return analysis/suggestions separately from feedback?** The current return type is `tuple[str, str]` (verdict, feedback). Adding analysis/suggestions changes the return signature and all callers. Should it return a typed object instead? + +8. **How should the non-code reviewers' (refine, plan) prompts be improved?** The code reviewer has a clear model (the PR reviewer prompt) to align with. The refine and plan reviewers have no PR-side counterpart. How much additional prompting should they get? Their criteria are already substantive — is the gap mainly in the verdict format instructions and the "empty if approved" language? + +### Risk/Budget Decisions + +9. **What is the acceptable token budget increase?** Always-populate-analysis reviews will use more tokens per review. With 3 reviewers × potentially 3 cycles, this could meaningfully increase pipeline cost. Is there a per-review or per-cycle token budget to stay within? + +10. **Should there be a maximum length for the `analysis` field?** Without guidance, a reviewer might produce very long analysis. Should the prompt include length guidance (e.g., "provide a thorough but concise analysis, typically 200-500 words")? + +--- + +*Authored-by: egg* + +# metadata +complexity_tier: mid diff --git a/.egg-state/drafts/897-plan.md b/.egg-state/drafts/897-plan.md new file mode 100644 index 000000000..07997241e --- /dev/null +++ b/.egg-state/drafts/897-plan.md @@ -0,0 +1,189 @@ +# Implementation Plan: Deepen SDLC Review Quality + +> Issue: #897 | Phase: plan | Pipeline: issue-897 + +## Summary + +SDLC pipeline reviewers produce shallow verdicts compared to PR reviewers because: +(1) the verdict JSON schema tells reviewers to write empty feedback when approving, +(2) the SDLC prompt lacks the thoroughness framing and review conventions that PR +prompts include, and (3) there is no way to surface non-blocking suggestions from +approved verdicts. 
+ +This plan implements Approach A from the architecture analysis: expand the +`ReviewVerdict` model with `analysis` and `suggestions` fields, align the SDLC +review prompt builder with PR reviewer standards, and update aggregation to surface +non-blocking observations from all verdicts. + +## Approach + +**Single PR with three implementation phases** within one commit stream: + +1. **Model + aggregation** — Expand the data model and update the aggregation + function and its callers. This is the structural foundation. +2. **Prompt builder** — Update the verdict format instructions, add review + conventions, enhance scope preambles, and expand draft-reviewer procedural steps. +3. **Tests** — Update existing tests and add new coverage for the changed code. + +Each phase builds on the prior one. The model change is backward-compatible (new +fields default to empty strings), so existing verdict files continue to parse. + +## Key Design Decisions + +Per the architect's analysis (AD-1 through AD-6): + +- **Free-form markdown** for the `analysis` field (not structured JSON sub-objects). +- **Agent-design reviewer exempt** from always-populate-analysis — its criteria + explicitly allows brief approval when no concerns exist. +- **`AggregatedReviewResult` NamedTuple** replaces the `tuple[str, str]` return type + from `_aggregate_review_verdicts()` for clarity and extensibility. +- **Review conventions inlined** in the prompt builder — not loaded from + `action/review-conventions.md` (avoids cross-boundary dependency). +- **Advisory content NOT injected** into retry prompt `prior_feedback` — only + blocking feedback goes there. Advisory content is logged for observability. +- **No migration** of old verdict files — Pydantic defaults handle missing fields. + +## Risk Mitigation + +| Risk | Mitigation | +|------|------------| +| Token consumption increase | Prompt includes length guidance ("thorough but concise, 200-500 words"). Agent-design reviewer exempted. 
| +| Aggregation return type breaks callers | Only 2 callers exist (verified). Both updated in same phase. NamedTuple fields match old tuple positionally as fallback. | +| Prompt changes don't improve quality | PR reviewer evidence proves same model (Opus) produces thorough reviews with better prompting. Post-deploy comparison validates. | +| test_multi_reviewer.py uses local helpers | Test uses its own `aggregate_verdicts()` function, not production `_aggregate_review_verdicts()`. Update test to also exercise the production function or document the scope gap. | + +## Test Strategy + +1. **Unit tests** (`orchestrator/tests/test_tier3_execute.py`): + - `TestReadReviewVerdict`: Add backward-compat test — old JSON without + `analysis`/`suggestions` fields parses correctly. + - Add test exercising `ReviewVerdict` with all new fields populated. + - Update mock `ReviewVerdict()` calls to verify defaults work. + +2. **Unit tests** (`orchestrator/tests/test_pipeline_prompts.py`): + - Add `TestBuildReviewPrompt` class testing the generated prompt: + - Verdict format JSON includes `analysis` and `suggestions` fields. + - Review conventions text is present in generated prompt. + - `"empty if approved"` language is NOT present. + - Agent-design reviewer preamble does NOT require detailed analysis. + - Code reviewer preamble includes file-by-file analysis expectation. + +3. **Integration tests** (`tests/workflows/test_multi_reviewer.py`): + - Update `TestReviewVerdictAggregation` tests to use `ReviewVerdict` objects + (currently uses plain strings via a local helper). Or add parallel tests + that import and exercise `_aggregate_review_verdicts()` directly. + - Test that approved verdicts with `analysis`/`suggestions` are collected. + - Test `AggregatedReviewResult` fields. + +4. **Existing tests**: Run full test suite to verify no regressions from model and + aggregation changes. 
+ +## File Impact + +| File | Change | Risk | +|------|--------|------| +| `orchestrator/models.py:97-103` | Add `analysis`, `suggestions` fields to `ReviewVerdict`. Add `AggregatedReviewResult` NamedTuple. | Low | +| `orchestrator/routes/pipelines.py:1004-1044` | Update `_get_reviewer_scope_preamble()` with per-type analysis depth expectations. | Low | +| `orchestrator/routes/pipelines.py:1452-1586` | Update `_build_review_prompt()`: verdict format, review conventions, draft-reviewer steps. | Low | +| `orchestrator/routes/pipelines.py:1712-1738` | Update `_aggregate_review_verdicts()`: return `AggregatedReviewResult`, collect from all verdicts. | Medium | +| `orchestrator/routes/pipelines.py:3447` | Update Tier 3 caller to use `AggregatedReviewResult`. | Low | +| `orchestrator/routes/pipelines.py:5396` | Update main phase loop caller to use `AggregatedReviewResult`. | Low | +| `orchestrator/tests/test_tier3_execute.py` | Update `TestReadReviewVerdict`, add backward-compat test, update mocks. | Low | +| `orchestrator/tests/test_pipeline_prompts.py` | Add `TestBuildReviewPrompt` class. | Low | +| `tests/workflows/test_multi_reviewer.py` | Add `_aggregate_review_verdicts()` tests with `ReviewVerdict` objects. | Low | + +--- + +```yaml +# yaml-tasks +pr: + title: "Deepen SDLC review quality with expanded verdict schema and aligned prompts" + description: | + SDLC pipeline reviewers produce shallow ~1-3 sentence verdicts with empty + feedback even when approving code with real issues, while PR reviewers on + the same code produce detailed multi-section reviews. This expands the + ReviewVerdict model with always-populated analysis and suggestions fields, + aligns SDLC review prompts with PR reviewer thoroughness standards, and + updates aggregation to surface non-blocking observations from all verdicts. 
+phases: + - id: 1 + name: Model and aggregation + goal: Expand the ReviewVerdict data model and update aggregation to collect analysis from all verdicts + tasks: + - id: TASK-1-1 + description: Add `analysis` (str, default="") and `suggestions` (str, default="") fields to ReviewVerdict in orchestrator/models.py with descriptive Field annotations + acceptance: ReviewVerdict has both new fields with empty-string defaults. Old verdict JSON without these fields parses correctly via Pydantic defaults. + files: + - orchestrator/models.py + - id: TASK-1-2 + description: Define AggregatedReviewResult NamedTuple in orchestrator/models.py with fields verdict, blocking_feedback, advisory_content + acceptance: AggregatedReviewResult is importable from models.py with the three named fields. + files: + - orchestrator/models.py + - id: TASK-1-3 + description: Update _aggregate_review_verdicts() to return AggregatedReviewResult. Collect feedback from needs_revision verdicts into blocking_feedback (existing behavior). Collect analysis and suggestions from ALL verdicts (including approved) into advisory_content. + acceptance: Function returns AggregatedReviewResult. blocking_feedback contains only needs_revision feedback. advisory_content contains analysis+suggestions from all verdicts. None verdicts are still skipped. + files: + - orchestrator/routes/pipelines.py + - id: TASK-1-4 + description: Update Tier 3 caller at pipelines.py:3447 to destructure AggregatedReviewResult. Use result.blocking_feedback for prior_feedback in retry loop. Log advisory_content. + acceptance: Tier 3 caller uses named fields from AggregatedReviewResult. Only blocking_feedback is passed to prior_feedback. Advisory content is not injected into retry prompts. + files: + - orchestrator/routes/pipelines.py + - id: TASK-1-5 + description: Update main phase loop caller at pipelines.py:5396 to destructure AggregatedReviewResult. Use result.blocking_feedback for review_feedback. Log advisory_content. 
+ acceptance: Main loop caller uses named fields from AggregatedReviewResult. Only blocking_feedback is used for review_feedback. Advisory content is not injected into retry prompts. + files: + - orchestrator/routes/pipelines.py + - id: 2 + name: Prompt builder alignment + goal: Align SDLC review prompts with PR reviewer thoroughness standards + tasks: + - id: TASK-2-1 + description: Update the verdict format section in _build_review_prompt() (pipelines.py:1554-1570) to include analysis and suggestions fields in the JSON template. Replace "empty if approved" with instructions to always populate analysis and provide non-blocking suggestions. Clarify feedback field is for blocking issues only. + acceptance: Verdict JSON template has 7 fields (reviewer, verdict, summary, analysis, suggestions, feedback, timestamp). Instructions say "Always provide detailed analysis regardless of verdict". The string "empty if approved" does not appear. feedback field described as blocking-only. + files: + - orchestrator/routes/pipelines.py + - id: TASK-2-2 + description: Add review conventions (5 comment quality standards) to _build_review_prompt() after the criteria section. Inline the standards from review-conventions.md — comprehensive, specific, direct, suggest fixes, provide context. Add "critical infrastructure" and "last line of defense" framing for code reviewers. + acceptance: Generated prompt contains a Review Conventions section with all 5 quality standards. Code reviewer prompt includes "critical infrastructure" framing. Conventions are inlined in the Python code, not loaded from action/review-conventions.md. + files: + - orchestrator/routes/pipelines.py + - id: TASK-2-3 + description: Update _get_reviewer_scope_preamble() to add analysis depth expectations per reviewer type. Code reviewer — file-by-file analysis. Contract reviewer — criterion-by-criterion verification table. Refine/plan reviewer — section-by-section evaluation. 
Agent-design reviewer — brief approval acceptable when no concerns. + acceptance: Each reviewer type's preamble includes its expected analysis format. Agent-design preamble explicitly states brief approval is acceptable. Code preamble mentions file-by-file. + files: + - orchestrator/routes/pipelines.py + - id: TASK-2-4 + description: Expand procedural steps for draft-based (non-code) reviewers from 4 steps to 6-7 steps. Add instructions to read thoroughly, cross-reference criteria sections, cite specific evidence in analysis, and evaluate completeness. + acceptance: Non-code reviewer steps are expanded from 4 to 6-7 steps. Steps include cross-referencing with criteria and citing specific sections. Steps are not overly prescriptive. + files: + - orchestrator/routes/pipelines.py + - id: 3 + name: Tests + goal: Update existing tests and add coverage for all changed code + tasks: + - id: TASK-3-1 + description: Update orchestrator/tests/test_tier3_execute.py — add backward-compat test for ReviewVerdict parsing old JSON (no analysis/suggestions fields). Add test with all new fields populated. Verify existing mock ReviewVerdict() calls still work with defaults. + acceptance: TestReadReviewVerdict has a test for old-format JSON without analysis/suggestions. At least one test creates ReviewVerdict with analysis and suggestions populated. All existing tests pass. + files: + - orchestrator/tests/test_tier3_execute.py + - id: TASK-3-2 + description: Add TestBuildReviewPrompt class to orchestrator/tests/test_pipeline_prompts.py. Test verdict format JSON includes analysis and suggestions. Test review conventions text appears. Test "empty if approved" is absent. Test agent-design preamble doesn't require detailed analysis. Test code reviewer preamble includes file-by-file expectation. + acceptance: TestBuildReviewPrompt has 4+ test methods covering verdict format, conventions presence, absence of "empty if approved", and per-reviewer preamble content. 
+ files: + - orchestrator/tests/test_pipeline_prompts.py + - id: TASK-3-3 + description: Update tests/workflows/test_multi_reviewer.py to add tests exercising _aggregate_review_verdicts() with ReviewVerdict objects. Test approved verdicts with analysis/suggestions are collected in advisory_content. Test AggregatedReviewResult fields. Test backward compat (verdicts without new fields). + acceptance: New test class or methods import and exercise _aggregate_review_verdicts(). Tests verify advisory_content is populated from approved verdicts. Tests verify AggregatedReviewResult named fields. + files: + - tests/workflows/test_multi_reviewer.py + - id: TASK-3-4 + description: Run the full test suite (make test or equivalent) and verify no regressions from model and aggregation changes. + acceptance: All existing tests pass. No new test failures introduced. + files: [] +``` + +--- + +*Authored-by: egg* diff --git a/.egg-state/reviews/897-implement-code-review.json b/.egg-state/reviews/897-implement-code-review.json new file mode 100644 index 000000000..7dd7a86b5 --- /dev/null +++ b/.egg-state/reviews/897-implement-code-review.json @@ -0,0 +1,9 @@ +{ + "reviewer": "code", + "verdict": "approved", + "summary": "Clean, backward-compatible expansion of review verdict schema with proper aggregation, comprehensive test coverage, and consistent migration across all call sites.", + "analysis": "Reviewed all 8 changed source/test/doc files systematically. The implementation adds `analysis` and `suggestions` fields to ReviewVerdict (Pydantic model with empty-string defaults preserving backward compat), introduces AggregatedReviewResult NamedTuple replacing the previous 2-tuple return, and updates both call sites in _run_tier3_implement and _run_pipeline. The _build_review_prompt function now emits structured field guidelines, review conventions, and reviewer-type-specific analysis format instructions. 
All old tuple-unpacking references (`overall_verdict, combined_feedback`) are fully removed — verified via grep. The conftest.py docker mock fix correctly creates real exception classes instead of MagicMock objects, preventing TypeError in `except NotFound:` clauses. Advisory content is logged (500-char preview) but not persisted, which aligns with the stated observability purpose. Tests cover backward compat (old JSON without new fields), full-field parsing, ReviewVerdict construction defaults, aggregation edge cases (empty dict, all-None, mixed verdicts, feedback-fallback-to-summary), and all prompt generation paths. 252 tests pass across all three test suites.", + "suggestions": "Minor inconsistency: tests/workflows/test_multi_reviewer.py uses bare MagicMock() for docker module mocking while orchestrator/tests/conftest.py creates proper exception classes. Since test_multi_reviewer.py only exercises _aggregate_review_verdicts (which never catches docker exceptions), this is not a practical problem, but aligning the mock strategy would prevent subtle breakage if future tests in that file exercise docker exception paths. Consider extracting the docker mock setup into a shared test utility.", + "feedback": "", + "timestamp": "2026-02-24T08:45:00Z" +} diff --git a/.egg-state/reviews/897-implement-contract-review.json b/.egg-state/reviews/897-implement-contract-review.json new file mode 100644 index 000000000..beec72638 --- /dev/null +++ b/.egg-state/reviews/897-implement-contract-review.json @@ -0,0 +1,9 @@ +{ + "reviewer": "contract", + "verdict": "approved", + "summary": "All 13 tasks from the implementation plan are fully implemented, tests pass (6767 passed), and documentation is updated.", + "analysis": "### Contract Structure Note\n\nThe formal contract (897.json) has empty `acceptance_criteria` and `phases` arrays — the contract was never populated with structured tasks during pipeline setup. 
However, the plan file (897-plan.md) defines 13 tasks across 3 phases with clear acceptance criteria in yaml-tasks format. This review verifies the implementation against those plan-defined criteria.\n\n### Phase 1: Model and Aggregation — All 5 tasks verified\n\n**TASK-1-1** (ReviewVerdict fields): `analysis` and `suggestions` fields added to `orchestrator/models.py:101-108` with `default=\"\"` and descriptive Field annotations. Backward compatibility confirmed — old JSON without these fields parses via Pydantic defaults (tested in `test_backward_compat_old_json_without_new_fields`).\n\n**TASK-1-2** (AggregatedReviewResult): NamedTuple defined at `orchestrator/models.py:113-125` with three fields: `verdict`, `blocking_feedback`, `advisory_content`. Importable from models.py (verified via test imports).\n\n**TASK-1-3** (Aggregation update): `_aggregate_review_verdicts()` at `pipelines.py:1787` returns `AggregatedReviewResult`. Blocking feedback collected only from `needs_revision` verdicts. Advisory content collected from ALL verdicts (including approved). None verdicts skipped.\n\n**TASK-1-4** (Tier 3 caller): Updated at `pipelines.py:3540` to use `agg_result` named fields. Only `blocking_feedback` passed to `prior_feedback`. Advisory content logged with 500-char preview.\n\n**TASK-1-5** (Main loop caller): Updated at `pipelines.py:5499` to use `agg_result` named fields. Only `blocking_feedback` used for `review_feedback`. Advisory content logged with 500-char preview.\n\n### Phase 2: Prompt Builder Alignment — All 4 tasks verified\n\n**TASK-2-1** (Verdict format): JSON template at `pipelines.py:1616-1625` includes all 7 fields (reviewer, verdict, summary, analysis, suggestions, feedback, timestamp). Field guidelines section instructs \"Always provide detailed analysis regardless of verdict\". The string \"empty if approved\" does not appear anywhere in pipelines.py. 
Feedback field described as \"blocking issues only\".\n\n**TASK-2-2** (Review conventions): Review Conventions section added at `pipelines.py:1557-1590` with all 5 quality standards (comprehensive, specific, direct, suggest fixes, provide context). Code reviewer gets \"critical infrastructure\" and \"last line of defense\" framing. Conventions are inlined in Python code, not loaded from `action/review-conventions.md`.\n\n**TASK-2-3** (Scope preambles): Updated at `pipelines.py:1011-1060`. Code reviewer: \"file-by-file analysis\". Contract reviewer: \"criterion-by-criterion verification\". Refine reviewer: \"section-by-section evaluation\". Plan reviewer: \"section-by-section evaluation\". Agent-design reviewer: \"brief approval is acceptable\".\n\n**TASK-2-4** (Draft reviewer steps): Expanded procedural steps added at `pipelines.py:1536-1546` for draft-based reviewers (7 steps). Includes: read thoroughly, cross-reference against criteria, cite specific evidence, evaluate completeness, assess quality.\n\n### Phase 3: Tests — All 4 tasks verified\n\n**TASK-3-1** (test_tier3_execute.py): Backward-compat test for old JSON added (`test_backward_compat_old_json_without_new_fields`). Test with all fields populated added (`test_reads_verdict_with_all_fields`). `TestReviewVerdictDefaults` class added for mock patterns. All existing tests pass.\n\n**TASK-3-2** (test_pipeline_prompts.py): `TestBuildReviewPrompt` class added with 20+ test methods covering: verdict format fields, conventions presence, absence of \"empty if approved\", agent-design brief approval, code file-by-file, contract criterion-by-criterion, refine/plan section-by-section, analysis/suggestions/feedback field guidelines, delta review, prior feedback, code reviewer systematic steps, and phase restrictions.\n\n**TASK-3-3** (test_multi_reviewer.py): `TestProductionAggregateReviewVerdicts` class added with 15 test methods importing and exercising `_aggregate_review_verdicts()` with `ReviewVerdict` objects. 
Tests verify: advisory_content from approved verdicts, AggregatedReviewResult named fields, backward compat, positional access, empty dict, all-None verdicts, multiple needs_revision combination, mixed verdicts, suggestions prefix formatting.\n\n**TASK-3-4** (Full test suite): All checks passed — 6767 tests passed, 85 skipped, 4 warnings. Lint clean (ruff, mypy). No regressions.\n\n### Additional Changes\n\n- `docs/guides/sdlc-pipeline.md`: Updated verdict schema documentation with new fields, field guidelines, and review conventions section.\n- `orchestrator/README.md`: Updated to mention AggregatedReviewResult in directory listing.\n- `orchestrator/tests/conftest.py`: Fixed Docker mock exception classes (proper class hierarchy instead of MagicMock stubs).", + "suggestions": "1. The formal contract file (897.json) has empty acceptance_criteria and phases — this appears to be a pipeline setup issue where the yaml-tasks from the plan were not parsed into the contract structure. Future pipelines should ensure contract population.\n2. The plan reviewer noted a typo in TASK-2-1 acceptance criteria (\"5 fields\" vs 7) — the implementation correctly uses 7 fields, so this was handled correctly despite the typo.\n3. Advisory content is logged but not surfaced in phase-completion issue comments — the plan reviewer noted this as a potential follow-up enhancement.", + "feedback": "", + "timestamp": "2026-02-24T08:30:00Z" +} diff --git a/.egg-state/reviews/897-plan-plan-review.json b/.egg-state/reviews/897-plan-plan-review.json new file mode 100644 index 000000000..c2c5a4d6a --- /dev/null +++ b/.egg-state/reviews/897-plan-plan-review.json @@ -0,0 +1,9 @@ +{ + "reviewer": "plan", + "verdict": "approved", + "summary": "The plan is well-structured, aligns with the analysis's recommended Option A, addresses all design decisions (AD-1 through AD-6), and provides good task granularity with testable acceptance criteria. Tasks are properly scoped and dependency ordering is correct. 
Four observations for the implementer are noted below but none are blocking.",
+  "analysis": "### 1. Alignment with Analysis\n\nThe plan faithfully implements Option A (Expand Verdict Schema + Align Prompts) as recommended by the analysis. All three root causes are addressed: (1) schema constraint via new `analysis` and `suggestions` fields, (2) prompt gap via review conventions and thoroughness framing, (3) missing approve-with-suggestions via the `suggestions` field. The six design decisions from the architecture analysis (AD-1 through AD-6) are all reflected in the Key Design Decisions section.\n\n### 2. Task Breakdown\n\nThe 13 tasks across 3 phases are discrete, well-scoped, and properly ordered. Each task targets specific code locations with clear boundaries. Phase 1 (model + aggregation) provides the structural foundation, Phase 2 (prompt builder) builds on it, Phase 3 (tests) validates everything. No overlapping responsibilities between tasks.\n\n### 3. Acceptance Criteria\n\nAcceptance criteria are specific and testable for all tasks. Minor inconsistency in TASK-2-1: the acceptance text says \"Verdict JSON template has 5 fields\" but then lists 7 fields (reviewer, verdict, summary, analysis, suggestions, feedback, timestamp). This is clearly a typo — the parenthetical \"7 total\" is correct. Not blocking but the implementer should note the actual count is 7.\n\n### 4. Dependency Ordering\n\nCorrect. Within Phase 1, the ordering is sound: model fields first (TASK-1-1, 1-2), aggregation function (TASK-1-3), then callers (TASK-1-4, 1-5). The risk analyst's R-1 concern about atomic updates is addressed by placing all five tasks in the same phase. However, the plan's risk table claims \"NamedTuple fields match old tuple positionally as fallback\" — this is incorrect. A 3-field NamedTuple raises `ValueError: too many values to unpack` when destructured into 2 variables. 
The mitigation works in practice (both callers updated in the same phase), but the stated rationale is wrong. The implementer should ensure TASK-1-3, 1-4, and 1-5 are committed together.\n\n### 5. Risk Assessment\n\nThe plan's risk table covers 4 risks with reasonable mitigations. The token consumption and test_multi_reviewer.py risks are well-handled. Two risks from the risk analyst's assessment that are worth noting:\n\n- **Over-reporting (R-5)**: The plan doesn't include anti-hallucination guidance for reviewers pressured to always produce analysis. The implementer should add a line like \"If the work genuinely has no issues in your scope, confirm that briefly rather than inventing concerns\" to the prompt.\n- **Review conventions drift (R-6)**: Inlining conventions means two copies exist (action/review-conventions.md and the Python code). Adding cross-reference comments in both locations would help future maintainers.\n\n### 6. Test Strategy\n\nComprehensive. The backward-compatibility test for old JSON, the TestBuildReviewPrompt class for prompt validation, and the plan to exercise the production `_aggregate_review_verdicts()` (not just the local stub in test_multi_reviewer.py) are all appropriate. The full regression run in TASK-3-4 is the right final step.\n\n### 7. Completeness\n\nThe plan covers model changes, prompt alignment, aggregation updates, and tests — all necessary for the requested improvement. One gap worth noting: advisory_content is collected and logged (TASK-1-4, 1-5) but not surfaced in phase-completion issue comments. The raw verdict JSON files in `.egg-state/reviews/` will contain the detailed analysis (which is directly readable), so the primary surfacing mechanism works. Aggregated surfacing to phase-completion comments could be a follow-up enhancement but is not required for the core value delivery of this issue.", + "suggestions": "1. 
**Fix the field count typo in TASK-2-1 acceptance criteria**: Change \"5 fields\" to \"7 fields\" for consistency with the parenthetical list.\n2. **Remove the incorrect NamedTuple positional fallback claim** from the risk table. The actual mitigation (updating callers in the same phase) is sound; the fallback claim is not.\n3. **Consider adding anti-hallucination guidance** to TASK-2-2 or TASK-2-3 prompt changes — a single sentence telling reviewers that confirming quality briefly is preferable to fabricating concerns.\n4. **Add cross-reference comments** when inlining review conventions (TASK-2-2): a comment in the Python code noting the conventions also exist in `action/review-conventions.md`, and vice versa.", + "feedback": "", + "timestamp": "2026-02-24T07:55:00Z" +} diff --git a/.egg-state/reviews/897-refine-agent-design-review.json b/.egg-state/reviews/897-refine-agent-design-review.json new file mode 100644 index 000000000..921672329 --- /dev/null +++ b/.egg-state/reviews/897-refine-agent-design-review.json @@ -0,0 +1,7 @@ +{ + "reviewer": "agent-design", + "verdict": "approved", + "summary": "No agent-mode design anti-patterns found. The analysis correctly identifies that the ReviewVerdict JSON format serves a machine consumer (_aggregate_review_verdicts), making structured output appropriate. Proposed schema extensions (analysis, suggestions fields) are modest and backward-compatible. Prompt improvements focus on quality expectations (what) rather than rigid procedures (how). 
The analysis shows good judgment by explicitly exempting the agent-design reviewer from the always-populate-analysis requirement, consistent with its intentionally brief scope.", + "feedback": "", + "timestamp": "2026-02-24T12:00:00Z" +} diff --git a/.egg-state/reviews/897-refine-refine-review.json b/.egg-state/reviews/897-refine-refine-review.json new file mode 100644 index 000000000..d6f50333b --- /dev/null +++ b/.egg-state/reviews/897-refine-refine-review.json @@ -0,0 +1,7 @@ +{ + "reviewer": "refine", + "verdict": "approved", + "summary": "High-quality analysis with thorough, accurate codebase research and a well-justified recommendation. Problem understanding is precise — correctly identifies all three root causes (verdict schema constraining depth, prompt gap vs PR reviewers, no approve-with-suggestions concept) with specific evidence from PR #895. All major code references verified accurate (ReviewVerdict fields, aggregation logic, prompt instructions, line numbers). Three meaningfully different options with clearly articulated trade-offs. Recommendation of Option A (schema expansion + prompt alignment) is well-reasoned — the insight that structural incentives matter more than prompt wording alone is sound. One minor factual inaccuracy: the analysis states 'The ReviewVerdict model is tested in orchestrator/tests/test_models.py' but ReviewVerdict is not actually covered in that test file. This does not affect the recommendation. 
Open questions are mostly actionable, though some (#6 migration, #7 return type, #10 max length) are implementation details better deferred to the plan phase rather than requiring human input at this stage.", + "feedback": "", + "timestamp": "2026-02-24T12:30:00Z" +} diff --git a/.gitignore b/.gitignore index 2db69fb5c..4905c6b47 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__/ .coverage htmlcov/ .mypy_cache/ +.hypothesis/ integration_tests/findings/ # Linting diff --git a/docs/guides/sdlc-pipeline.md b/docs/guides/sdlc-pipeline.md index df3c9986d..346c4b89d 100644 --- a/docs/guides/sdlc-pipeline.md +++ b/docs/guides/sdlc-pipeline.md @@ -320,7 +320,8 @@ The orchestrator runs multiple specialized reviewers in parallel, with phase-spe **Verdict Aggregation:** - All reviewers run in parallel - Any `needs_revision` from any reviewer → aggregate `needs_revision` -- Feedback is combined with per-reviewer section headers +- Blocking feedback (from `needs_revision` verdicts) is combined with per-reviewer section headers and passed to the next revision cycle +- Analysis and suggestions from **all** verdicts (including `approved`) are collected as advisory content and logged for observability - Failed reviewers are tracked separately and trigger escalation **Per-Reviewer State Tracking:** @@ -447,12 +448,31 @@ The refine and plan phases include an automated internal review step before huma { "reviewer": "refine" | "plan" | "agent-design" | "contract" | "code", "verdict": "approved" | "needs_revision", - "summary": "Brief summary of findings", - "feedback": "Detailed feedback (empty if approved)", + "summary": "Brief summary of findings (1-2 sentences)", + "analysis": "Detailed analysis of the reviewed work (always populated)", + "suggestions": "Non-blocking suggestions for improvement (even when approving)", + "feedback": "Blocking issues requiring revision (empty when approving)", "timestamp": "ISO 8601 timestamp" } ``` +**Field guidelines:** +- 
**analysis**: Always populated regardless of verdict. Describes what was reviewed, what was found, and reasoning. Code reviewers provide file-by-file analysis; contract reviewers provide criterion-by-criterion verification; plan/refine reviewers provide section-by-section evaluation. +- **suggestions**: Non-blocking observations and improvement ideas, included even when approving. These are surfaced as advisory content for observability. +- **feedback**: Reserved for blocking issues only — problems that must be fixed before approval. Empty when the verdict is `approved`. + +**Review Conventions:** + +All SDLC reviewers follow quality standards aligned with the PR reviewer workflow: + +1. **Be comprehensive** — Review the entire scope, not just the obvious parts +2. **Be specific** — Reference exact file paths, line numbers, function names, and code snippets +3. **Be direct** — State issues plainly without hedging or softening language +4. **Suggest fixes** — When identifying a problem, include a concrete suggestion for resolution +5. **Provide context** — Explain *why* something is an issue (impact, risk, or principle violated) + +Code reviewers additionally receive "last line of defense" framing and must provide file-by-file analysis of all changed files. Draft-based reviewers (refine, plan) follow expanded procedural steps that require cross-referencing each section against criteria and citing specific evidence. + **Review Criteria for Refine:** - Does the analysis address the issue description? - Are options meaningfully different and well-reasoned? 
diff --git a/orchestrator/README.md b/orchestrator/README.md index ecf355a10..9a72d6d25 100644 --- a/orchestrator/README.md +++ b/orchestrator/README.md @@ -169,7 +169,7 @@ See [Architecture: Deployment Modes](../docs/architecture/orchestrator.md#deploy orchestrator/ ├── api.py # Flask REST API server with blueprint registration ├── cli.py # CLI interface (serve, health, pipelines commands) -├── models.py # Pydantic models (Pipeline, AgentExecution, ContainerInfo, etc.) +├── models.py # Pydantic models (Pipeline, AgentExecution, ReviewVerdict, AggregatedReviewResult, etc.) ├── state_store.py # Git-backed persistent state storage ├── container_spawner.py # Container spawning with gateway session integration ├── container_monitor.py # Container state monitoring and lifecycle tracking diff --git a/orchestrator/models.py b/orchestrator/models.py index 78c8036c2..6db6548bb 100644 --- a/orchestrator/models.py +++ b/orchestrator/models.py @@ -7,7 +7,7 @@ from datetime import datetime from enum import StrEnum -from typing import Any +from typing import Any, NamedTuple from pydantic import BaseModel, Field @@ -99,10 +99,33 @@ class ReviewVerdict(BaseModel): verdict: str = Field(..., description="'approved' or 'needs_revision'") summary: str = Field(default="", description="Brief summary of review findings") - feedback: str = Field(default="", description="Detailed feedback if needs_revision") + analysis: str = Field( + default="", + description="Detailed analysis of the reviewed work, populated regardless of verdict", + ) + suggestions: str = Field( + default="", + description="Non-blocking suggestions for improvement, even when approving", + ) + feedback: str = Field(default="", description="Blocking feedback requiring revision") timestamp: str = Field(default="", description="ISO 8601 timestamp") +class AggregatedReviewResult(NamedTuple): + """Result of aggregating multiple review verdicts. + + Attributes: + verdict: Overall verdict — 'approved' or 'needs_revision'. 
+ blocking_feedback: Combined feedback from needs_revision verdicts only. + advisory_content: Combined analysis and suggestions from ALL verdicts + (including approved), for observability and logging. + """ + + verdict: str + blocking_feedback: str + advisory_content: str + + class ContainerInfo(BaseModel): """Information about a sandbox container.""" diff --git a/orchestrator/routes/pipelines.py b/orchestrator/routes/pipelines.py index 6390ac2c7..96179cccb 100644 --- a/orchestrator/routes/pipelines.py +++ b/orchestrator/routes/pipelines.py @@ -37,6 +37,7 @@ def get_logger(name: str, **kwargs) -> logging.Logger: # type: ignore[misc] from ..docker_client import ContainerNotFoundError, ContainerOperationError, DockerClientError from ..models import ( AgentRole, + AggregatedReviewResult, CycleTiming, Pipeline, PipelinePhase, @@ -62,6 +63,7 @@ def get_logger(name: str, **kwargs) -> logging.Logger: # type: ignore[misc] ) from models import ( # type: ignore AgentRole, + AggregatedReviewResult, ComplexityTier, CycleTiming, Pipeline, @@ -1009,7 +1011,8 @@ def _get_reviewer_scope_preamble(reviewer_type: str, phase: str) -> str: "agent-mode design principles. Do NOT review general code quality, " "security, or correctness — other reviewers handle those.\n\n" "**Only flag issues if you find clear agent-mode design anti-patterns.** " - "If the output has no agent-mode concerns, approve it." + "If the output has no agent-mode concerns, a brief approval is acceptable " + "— you do not need to produce a lengthy analysis when there are no concerns." ) elif reviewer_type == "code": return ( @@ -1018,27 +1021,37 @@ def _get_reviewer_scope_preamble(reviewer_type: str, phase: str) -> str: "**Be direct.** Do not soften feedback. State issues clearly and explain " "why they matter.\n\n" "**Be thorough.** Find ALL issues on the first pass. Do not stop after " - "identifying a few problems." 
+ "identifying a few problems.\n\n" + "**Analysis format:** Provide file-by-file analysis covering each changed " + "file. For each file, note what changed, whether the change is correct, " + "and any issues or observations." ) elif reviewer_type == "contract": return ( "This is a **contract verification review**. Verify that the implementation " "matches the contract and all acceptance criteria are met. Do NOT review " - "general code quality or security — other reviewers handle those." + "general code quality or security — other reviewers handle those.\n\n" + "**Analysis format:** Provide a criterion-by-criterion verification — for each " + "acceptance criterion, state whether it is met and cite the specific evidence." ) elif reviewer_type == "refine": return ( "This is a **refine phase review**. Focus on the quality and completeness " "of the analysis produced during the refine phase. Evaluate problem " "understanding, codebase research, options analysis, and the recommended " - "approach. Agent-mode design alignment is handled by another reviewer." + "approach. Agent-mode design alignment is handled by another reviewer.\n\n" + "**Analysis format:** Provide section-by-section evaluation of the refine " + "output — assess each major section for depth, accuracy, and completeness." ) elif reviewer_type == "plan": return ( "This is a **plan phase review**. Focus on the quality and completeness " "of the implementation plan. Evaluate task breakdown, acceptance criteria, " "dependency ordering, risk assessment, and test strategy. Agent-mode " - "design alignment is handled by another reviewer." + "design alignment is handled by another reviewer.\n\n" + "**Analysis format:** Provide section-by-section evaluation of the plan — " + "assess task decomposition, acceptance criteria quality, dependency ordering, " + "and risk coverage." ) else: raise ValueError(f"Unknown reviewer type: {reviewer_type}") @@ -1520,6 +1533,17 @@ def _build_review_prompt( lines.append("7. 
Evaluate against the criteria below") lines.append(f"8. Write your verdict to `{verdict_path}` as JSON") lines.append("9. Commit the verdict file") + elif draft_path: + # Expanded procedural steps for draft-based (non-code) reviewers + lines.append("2. Read the draft thoroughly — do not skim") + lines.append( + "3. Cross-reference each section of the draft against the review criteria below" + ) + lines.append("4. Cite specific sections, quotes, or omissions as evidence in your analysis") + lines.append("5. Evaluate completeness — identify any criteria not adequately addressed") + lines.append("6. Assess overall quality and coherence of the draft") + lines.append(f"7. Write your verdict to `{verdict_path}` as JSON") + lines.append("8. Commit the verdict file") else: lines.append("2. Evaluate it against the criteria below") lines.append(f"3. Write your verdict to `{verdict_path}` as JSON") @@ -1531,6 +1555,38 @@ def _build_review_prompt( lines.append(_get_review_criteria_for_type(reviewer_type, phase, repo_path=repo_path)) lines.append("") + # Review conventions — quality standards aligned with PR reviewer thoroughness + lines.append("## Review Conventions\n") + if reviewer_type == "code": + lines.append( + "You are a critical part of the engineering infrastructure — the last line " + "of defense before code reaches production. Your review must meet these " + "quality standards:\n" + ) + else: + lines.append("Your review must meet these quality standards:\n") + lines.append( + "1. **Be comprehensive.** Review the entire scope, not just the obvious parts. " + "Do not stop after finding the first few issues." + ) + lines.append( + "2. **Be specific.** Reference exact file paths, line numbers, function names, " + "and code snippets. Vague feedback is not actionable." + ) + lines.append( + "3. **Be direct.** State issues plainly without hedging or softening language. " + '"This will fail when X" not "you might want to consider X".' + ) + lines.append( + "4. 
**Suggest fixes.** When identifying a problem, include a concrete suggestion " + "for how to resolve it." + ) + lines.append( + "5. **Provide context.** Explain *why* something is an issue — the impact, " + "the risk, or the principle being violated." + ) + lines.append("") + # Delta review directive for re-reviews if is_delta_review: lines.append("## Delta Review\n") @@ -1558,15 +1614,31 @@ def _build_review_prompt( lines.append("{") lines.append(f' "reviewer": "{reviewer_type}",') lines.append(' "verdict": "approved" or "needs_revision",') - lines.append(' "summary": "Brief summary of findings",') - lines.append(' "feedback": "Detailed feedback if needs_revision, empty if approved",') + lines.append(' "summary": "Brief summary of findings (1-2 sentences)",') + lines.append(' "analysis": "Detailed analysis of the reviewed work (see below)",') + lines.append(' "suggestions": "Non-blocking suggestions for improvement",') + lines.append(' "feedback": "Blocking issues requiring revision before approval",') lines.append(' "timestamp": "ISO 8601 timestamp"') lines.append("}") lines.append("```\n") + lines.append("**Field guidelines:**\n") + lines.append( + "- **analysis**: Always provide detailed analysis regardless of verdict. " + "Describe what you reviewed, what you found, and your reasoning. " + "Be thorough but concise (200-500 words)." + ) lines.append( - "If the work meets all criteria, set verdict to `approved`. " + "- **suggestions**: Non-blocking observations and improvement ideas. " + "Include these even when approving — they help the team improve over time." + ) + lines.append( + "- **feedback**: Reserved for **blocking issues only** — problems that must " + "be fixed before the work can be approved. Leave empty when approving." + ) + lines.append( + "\nIf the work meets all criteria, set verdict to `approved`. " "If significant issues remain, set verdict to `needs_revision` " - "and provide actionable feedback." 
+ "and provide actionable feedback in the `feedback` field." ) # Phase restrictions for reviewers @@ -1711,20 +1783,26 @@ def _read_tester_gaps( def _aggregate_review_verdicts( verdicts: dict[str, ReviewVerdict | None], -) -> tuple[str, str]: +) -> AggregatedReviewResult: """Aggregate multiple typed review verdicts into an overall result. Returns: - (overall_verdict, combined_feedback) where overall_verdict is - "approved" or "needs_revision". Any needs_revision → overall - needs_revision. Missing/None verdicts are treated as approved. + AggregatedReviewResult with: + - verdict: "approved" or "needs_revision" (any needs_revision → overall needs_revision) + - blocking_feedback: combined feedback from needs_revision verdicts only + - advisory_content: analysis and suggestions from ALL verdicts (including approved) + + Missing/None verdicts are skipped. """ overall = "approved" feedback_sections: list[str] = [] + advisory_sections: list[str] = [] for reviewer_type, verdict in verdicts.items(): if verdict is None: continue + + # Collect blocking feedback from needs_revision verdicts if verdict.verdict == "needs_revision": overall = "needs_revision" section = f"### {reviewer_type} reviewer\n" @@ -1734,8 +1812,24 @@ def _aggregate_review_verdicts( section += verdict.summary feedback_sections.append(section) - combined = "\n\n".join(feedback_sections) if feedback_sections else "" - return overall, combined + # Collect analysis and suggestions from ALL verdicts (including approved) + advisory_parts: list[str] = [] + if verdict.analysis: + advisory_parts.append(verdict.analysis) + if verdict.suggestions: + advisory_parts.append(f"**Suggestions:** {verdict.suggestions}") + if advisory_parts: + advisory_sections.append( + f"### {reviewer_type} reviewer\n" + "\n\n".join(advisory_parts) + ) + + blocking_feedback = "\n\n".join(feedback_sections) if feedback_sections else "" + advisory_content = "\n\n".join(advisory_sections) if advisory_sections else "" + return 
AggregatedReviewResult( + verdict=overall, + blocking_feedback=blocking_feedback, + advisory_content=advisory_content, + ) def _sync_worktree_with_remote( @@ -3444,9 +3538,17 @@ def _spawn_one_reviewer( # type: ignore[no-untyped-def] for rtype in reviewer_types } - overall_verdict, combined_feedback = _aggregate_review_verdicts(all_verdicts) + agg_result = _aggregate_review_verdicts(all_verdicts) - if overall_verdict == "approved": + if agg_result.advisory_content: + logger.info( + "Review advisory content (non-blocking)", + pipeline_id=pipeline_id, + phase_id=phase_id, + advisory_preview=agg_result.advisory_content[:500], + ) + + if agg_result.verdict == "approved": logger.info( "Phase approved by all reviewers", pipeline_id=pipeline_id, @@ -3461,12 +3563,12 @@ def _spawn_one_reviewer( # type: ignore[no-untyped-def] phase_id=phase_id, retry=retry, ) - if tester_gap_summary and combined_feedback: - prior_feedback = f"{combined_feedback}\n\n{tester_gap_summary}" + if tester_gap_summary and agg_result.blocking_feedback: + prior_feedback = f"{agg_result.blocking_feedback}\n\n{tester_gap_summary}" elif tester_gap_summary: prior_feedback = tester_gap_summary else: - prior_feedback = combined_feedback + prior_feedback = agg_result.blocking_feedback last_reviewed_commit = cycle_head continue else: @@ -5393,9 +5495,17 @@ def _spawn_reviewer( # type: ignore[no-untyped-def] pipeline_id=pipeline_id, ) - overall_verdict, combined_feedback = _aggregate_review_verdicts(all_verdicts) + agg_result = _aggregate_review_verdicts(all_verdicts) + + if agg_result.advisory_content: + logger.info( + "Review advisory content (non-blocking)", + pipeline_id=pipeline_id, + phase=current_phase.value, + advisory_preview=agg_result.advisory_content[:500], + ) - if overall_verdict == "approved": + if agg_result.verdict == "approved": logger.info( "All reviewers approved", pipeline_id=pipeline_id, @@ -5430,12 +5540,12 @@ def _spawn_reviewer( # type: ignore[no-untyped-def] # Store feedback and loop 
— merge tester gaps with reviewer # feedback so the coder sees both on the next cycle. - if tester_gap_summary and combined_feedback: - review_feedback = f"{combined_feedback}\n\n{tester_gap_summary}" + if tester_gap_summary and agg_result.blocking_feedback: + review_feedback = f"{agg_result.blocking_feedback}\n\n{tester_gap_summary}" elif tester_gap_summary: review_feedback = tester_gap_summary else: - review_feedback = combined_feedback + review_feedback = agg_result.blocking_feedback with get_pipeline_state_lock(pipeline_id): pipeline = store.load_pipeline(pipeline_id) phase_execution = pipeline.get_phase_execution(current_phase) diff --git a/orchestrator/tests/conftest.py b/orchestrator/tests/conftest.py index 0b90b532d..9b9fadc72 100644 --- a/orchestrator/tests/conftest.py +++ b/orchestrator/tests/conftest.py @@ -6,7 +6,9 @@ """ import sys +import types from pathlib import Path +from unittest.mock import MagicMock # Project root _project_root = Path(__file__).parent.parent.parent @@ -25,4 +27,34 @@ try: import docker # noqa: F401 except ImportError: - pass + # docker SDK is not installed — create a mock module with real + # exception classes so that ``except NotFound`` works correctly. 
+ class _DockerException(Exception): + """Mock DockerException matching docker SDK hierarchy.""" + + class _APIError(_DockerException): + """Mock APIError.""" + + class _NotFound(_APIError): + """Mock NotFound.""" + + class _ImageNotFound(_APIError): + """Mock ImageNotFound.""" + + _errors_mod = types.ModuleType("docker.errors") + _errors_mod.DockerException = _DockerException # type: ignore[attr-defined] + _errors_mod.APIError = _APIError # type: ignore[attr-defined] + _errors_mod.NotFound = _NotFound # type: ignore[attr-defined] + _errors_mod.ImageNotFound = _ImageNotFound # type: ignore[attr-defined] + + _docker_mod = MagicMock() + _docker_mod.errors = _errors_mod + # Also expose on the MagicMock directly so attribute access works + _docker_mod.errors.DockerException = _DockerException + _docker_mod.errors.APIError = _APIError + _docker_mod.errors.NotFound = _NotFound + _docker_mod.errors.ImageNotFound = _ImageNotFound + + sys.modules.setdefault("docker", _docker_mod) + sys.modules.setdefault("docker.errors", _errors_mod) + sys.modules.setdefault("docker.types", MagicMock()) diff --git a/orchestrator/tests/test_pipeline_prompts.py b/orchestrator/tests/test_pipeline_prompts.py index 2c7504a3c..e80d03b37 100644 --- a/orchestrator/tests/test_pipeline_prompts.py +++ b/orchestrator/tests/test_pipeline_prompts.py @@ -8,6 +8,8 @@ from pathlib import Path from unittest.mock import MagicMock, patch +import pytest + # Mock heavy dependencies that pipelines.py imports at module level _docker_mock = MagicMock() sys.modules.setdefault("docker", _docker_mock) @@ -21,11 +23,13 @@ _build_checker_prompt, _build_phase_prompt, _build_phase_scoped_prompt, + _build_review_prompt, _build_role_context, _extract_plan_overview, _get_agent_design_criteria, _get_code_review_criteria, _get_contract_review_criteria, + _get_reviewer_scope_preamble, _read_shared_criteria, _read_tester_gaps, _render_contract_tasks, @@ -2368,3 +2372,269 @@ def test_empty_outputs_produce_no_draft(self, 
tmp_path): draft_path = tmp_path / ".egg-state" / "drafts" / "871-plan.md" # No meaningful content → draft not written assert not draft_path.exists() + + +class TestBuildReviewPrompt: + """Tests for _build_review_prompt verdict format, conventions, and preambles.""" + + def test_verdict_format_includes_analysis_and_suggestions(self): + """Verdict JSON template includes analysis and suggestions fields.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert '"analysis"' in prompt + assert '"suggestions"' in prompt + + def test_verdict_format_no_empty_if_approved(self): + """The old 'empty if approved' language is not present.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "empty if approved" not in prompt + + def test_review_conventions_present(self): + """Generated prompt contains a Review Conventions section with quality standards.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "## Review Conventions" in prompt + assert "Be comprehensive" in prompt + assert "Be specific" in prompt + assert "Be direct" in prompt + assert "Suggest fixes" in prompt + assert "Provide context" in prompt + + def test_code_reviewer_conventions_include_infrastructure_framing(self): + """Code reviewer prompt includes 'critical infrastructure' framing.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "critical" in prompt.lower() + assert "last line of defense" in prompt + + def test_agent_design_preamble_allows_brief_approval(self): + """Agent-design reviewer preamble does not require detailed analysis.""" + preamble = 
_get_reviewer_scope_preamble("agent-design", "implement") + assert "brief approval is acceptable" in preamble + + def test_code_preamble_includes_file_by_file(self): + """Code reviewer preamble includes file-by-file analysis expectation.""" + preamble = _get_reviewer_scope_preamble("code", "implement") + assert "file-by-file" in preamble + + def test_contract_preamble_includes_criterion_verification(self): + """Contract reviewer preamble includes criterion-by-criterion verification.""" + preamble = _get_reviewer_scope_preamble("contract", "implement") + assert "criterion-by-criterion" in preamble + + def test_refine_preamble_includes_section_evaluation(self): + """Refine reviewer preamble includes section-by-section evaluation.""" + preamble = _get_reviewer_scope_preamble("refine", "refine") + assert "section-by-section" in preamble + + def test_plan_preamble_includes_section_evaluation(self): + """Plan reviewer preamble includes section-by-section evaluation.""" + preamble = _get_reviewer_scope_preamble("plan", "plan") + assert "section-by-section" in preamble + + def test_analysis_field_guidelines(self): + """Prompt includes guidance to always provide analysis regardless of verdict.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "Always provide detailed analysis regardless of verdict" in prompt + + def test_draft_reviewer_has_expanded_steps(self): + """Draft-based reviewers get expanded procedural steps (6+).""" + prompt = _build_review_prompt( + phase="refine", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="refine", + issue_number=100, + ) + # Draft-based reviewer should have cross-reference and cite steps + assert "Cross-reference" in prompt + assert "Cite specific" in prompt + assert "completeness" in prompt.lower() + + def test_non_code_reviewer_generic_conventions_framing(self): + """Non-code reviewers get generic 
quality standards framing, not infrastructure framing.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="contract", + issue_number=100, + ) + assert "## Review Conventions" in prompt + assert "Your review must meet these quality standards" in prompt + # Should NOT have the code-specific infrastructure framing + assert "last line of defense" not in prompt + + def test_contract_reviewer_phase_restrictions_include_contract_write(self): + """Contract reviewer gets extra permission to update contract files.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="contract", + issue_number=100, + ) + assert "CAN update the contract" in prompt + + def test_code_reviewer_phase_restrictions_no_contract_write(self): + """Code reviewer does NOT get contract write permission.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "CAN update the contract" not in prompt + + def test_delta_review_directive(self): + """Re-review with last_reviewed_commit includes delta review section.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + review_cycle=2, + last_reviewed_commit="abc123", + ) + assert "## Delta Review" in prompt + assert "git diff abc123..HEAD" in prompt + assert "cycle 2" in prompt + + def test_first_review_no_delta_section(self): + """First review cycle does not include delta review section.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + review_cycle=1, + ) + assert "## Delta Review" not in prompt + + def test_prior_feedback_section(self): + """Re-review with prior feedback includes prior feedback 
section.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + review_cycle=2, + prior_feedback="Fix the SQL injection vulnerability in query.py", + ) + assert "## Prior Review Feedback" in prompt + assert "SQL injection vulnerability in query.py" in prompt + + def test_prior_feedback_not_shown_on_first_cycle(self): + """Prior feedback is not shown on first review cycle even if provided.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + review_cycle=1, + prior_feedback="Fix the bug", + ) + assert "## Prior Review Feedback" not in prompt + + def test_local_pipeline_mode_verdict_path(self): + """Local pipeline mode uses pipeline_id in verdict path instead of issue number.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="local-abc12345", + pipeline_mode="local", + reviewer_type="code", + ) + assert "local-abc12345" in prompt + # Should not contain an issue number reference in the verdict path + assert "None-implement" not in prompt + + def test_non_code_non_draft_reviewer_short_steps(self): + """Non-code reviewer in implement phase (no draft) gets short procedural steps.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="contract", + issue_number=100, + ) + # Contract reviewer in implement phase has no draft_path, so gets the short steps + # It should NOT have code reviewer's extended steps (like "Trace data flow") + assert "Trace data flow" not in prompt + # And not the draft-based expanded steps + assert "Cross-reference" not in prompt + # But should still have basic evaluation steps + assert "Evaluate it against the criteria below" in prompt + + def test_unknown_reviewer_type_raises_error(self): + """Unknown reviewer type raises ValueError in preamble.""" + with 
pytest.raises(ValueError, match="Unknown reviewer type"): + _get_reviewer_scope_preamble("unknown-type", "implement") + + def test_feedback_field_guideline_blocking_only(self): + """Feedback field guideline makes clear it's for blocking issues only.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "blocking issues only" in prompt.lower() + assert "Leave empty when approving" in prompt + + def test_suggestions_field_guideline_even_when_approving(self): + """Suggestions field guideline encourages providing them even when approving.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "even when approving" in prompt.lower() + + def test_code_reviewer_has_systematic_steps(self): + """Code reviewer in implement phase gets detailed systematic review steps.""" + prompt = _build_review_prompt( + phase="implement", + pipeline_id="test-pipe", + pipeline_mode="issue", + reviewer_type="code", + issue_number=100, + ) + assert "review every changed file systematically" in prompt + assert "Trace data flow" in prompt + assert "edge cases" in prompt.lower() + assert "Research when uncertain" in prompt diff --git a/orchestrator/tests/test_tier3_execute.py b/orchestrator/tests/test_tier3_execute.py index 9036acaca..cc26d39d3 100644 --- a/orchestrator/tests/test_tier3_execute.py +++ b/orchestrator/tests/test_tier3_execute.py @@ -713,6 +713,74 @@ def test_reads_valid_verdict(self, tmp_path: Path): if result is not None: assert result.verdict == "approved" + def test_backward_compat_old_json_without_new_fields(self, tmp_path: Path): + """Old verdict JSON without analysis/suggestions fields parses correctly.""" + reviews_dir = tmp_path / ".egg-state" / "reviews" + reviews_dir.mkdir(parents=True, exist_ok=True) + # Old format: no analysis or suggestions fields + 
verdict = { + "verdict": "needs_revision", + "summary": "Found issues", + "feedback": "Fix the bug", + "timestamp": "2024-01-01T00:00:00Z", + } + (reviews_dir / "42-implement-code-review.json").write_text( + json.dumps(verdict), encoding="utf-8" + ) + + result = self._read(tmp_path, "implement", "code", "issue", 42, "test-pipeline") + if result is not None: + assert result.verdict == "needs_revision" + assert result.feedback == "Fix the bug" + # New fields default to empty strings + assert result.analysis == "" + assert result.suggestions == "" + + def test_reads_verdict_with_all_fields(self, tmp_path: Path): + """Verdict JSON with all new fields is parsed correctly.""" + reviews_dir = tmp_path / ".egg-state" / "reviews" + reviews_dir.mkdir(parents=True, exist_ok=True) + verdict = { + "verdict": "approved", + "summary": "Code looks good", + "analysis": "Reviewed all changed files. Logic is sound.", + "suggestions": "Consider adding a docstring to the helper function.", + "feedback": "", + "timestamp": "2024-01-01T00:00:00Z", + } + (reviews_dir / "42-implement-code-review.json").write_text( + json.dumps(verdict), encoding="utf-8" + ) + + result = self._read(tmp_path, "implement", "code", "issue", 42, "test-pipeline") + if result is not None: + assert result.verdict == "approved" + assert result.analysis == "Reviewed all changed files. Logic is sound." + assert result.suggestions == "Consider adding a docstring to the helper function." 
+ assert result.feedback == "" + + +class TestReviewVerdictDefaults: + """Tests that ReviewVerdict defaults work for existing usage patterns.""" + + def test_minimal_construction(self): + """ReviewVerdict with only verdict field uses empty defaults for new fields.""" + v = ReviewVerdict(verdict="approved") + assert v.verdict == "approved" + assert v.summary == "" + assert v.analysis == "" + assert v.suggestions == "" + assert v.feedback == "" + assert v.timestamp == "" + + def test_construction_with_feedback_only(self): + """ReviewVerdict with verdict and feedback (common mock pattern) works.""" + v = ReviewVerdict(verdict="needs_revision", feedback="Fix types") + assert v.verdict == "needs_revision" + assert v.feedback == "Fix types" + assert v.analysis == "" + assert v.suggestions == "" + class TestReadLastReviewFeedback: """Tests for _read_last_review_feedback helper.""" diff --git a/tests/workflows/test_multi_reviewer.py b/tests/workflows/test_multi_reviewer.py index 199651201..93a02ee26 100644 --- a/tests/workflows/test_multi_reviewer.py +++ b/tests/workflows/test_multi_reviewer.py @@ -6,13 +6,24 @@ - Per-reviewer feedback combination - Reviewer failure handling - Phase-based reviewer defaults +- Production _aggregate_review_verdicts() with ReviewVerdict objects """ import sys from pathlib import Path +from unittest.mock import MagicMock -# Add shared to path for import +# Add shared and orchestrator to path for import sys.path.insert(0, str(Path(__file__).parent.parent.parent / "shared")) +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "orchestrator")) + +# Mock heavy dependencies that pipelines.py imports at module level +_docker_mock = MagicMock() +sys.modules.setdefault("docker", _docker_mock) +sys.modules.setdefault("docker.errors", _docker_mock.errors) +sys.modules.setdefault("docker.types", _docker_mock.types) + +import pytest # noqa: E402 class TestReviewVerdictAggregation: @@ -292,3 +303,215 @@ def get_default_reviewers(phase: str) -> 
list: ] else: return [{"name": "code"}] + + +class TestProductionAggregateReviewVerdicts: + """Tests for the production _aggregate_review_verdicts() with ReviewVerdict objects.""" + + @pytest.fixture(autouse=True) + def setup(self): + """Import production functions.""" + try: + from models import AggregatedReviewResult, ReviewVerdict + from routes.pipelines import _aggregate_review_verdicts + + self._aggregate = _aggregate_review_verdicts + self.ReviewVerdict = ReviewVerdict + self.AggregatedReviewResult = AggregatedReviewResult + except ImportError: + pytest.skip("Cannot import orchestrator modules") + + def test_all_approved_returns_approved(self): + """All approved verdicts → overall approved.""" + verdicts = { + "code": self.ReviewVerdict(verdict="approved"), + "contract": self.ReviewVerdict(verdict="approved"), + } + result = self._aggregate(verdicts) + assert result.verdict == "approved" + assert result.blocking_feedback == "" + + def test_any_needs_revision_returns_needs_revision(self): + """Any needs_revision → overall needs_revision.""" + verdicts = { + "code": self.ReviewVerdict(verdict="approved"), + "contract": self.ReviewVerdict(verdict="needs_revision", feedback="Missing tests"), + } + result = self._aggregate(verdicts) + assert result.verdict == "needs_revision" + assert "Missing tests" in result.blocking_feedback + + def test_none_verdicts_are_skipped(self): + """None verdicts are skipped entirely.""" + verdicts = { + "code": self.ReviewVerdict(verdict="approved"), + "agent-design": None, + } + result = self._aggregate(verdicts) + assert result.verdict == "approved" + + def test_approved_verdicts_with_analysis_collected_in_advisory(self): + """Analysis and suggestions from approved verdicts appear in advisory_content.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="approved", + analysis="Reviewed all files. 
Logic is sound.", + suggestions="Consider adding a docstring.", + ), + "contract": self.ReviewVerdict( + verdict="approved", + analysis="All criteria met.", + ), + } + result = self._aggregate(verdicts) + assert result.verdict == "approved" + assert result.blocking_feedback == "" + assert "Reviewed all files" in result.advisory_content + assert "All criteria met" in result.advisory_content + assert "Consider adding a docstring" in result.advisory_content + + def test_needs_revision_with_analysis_in_both_fields(self): + """needs_revision verdicts contribute to both blocking_feedback and advisory_content.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="needs_revision", + feedback="Fix the SQL injection", + analysis="Found a serious security issue in query builder.", + suggestions="Use parameterized queries.", + ), + } + result = self._aggregate(verdicts) + assert result.verdict == "needs_revision" + assert "Fix the SQL injection" in result.blocking_feedback + assert "serious security issue" in result.advisory_content + assert "parameterized queries" in result.advisory_content + + def test_result_is_named_tuple(self): + """Result is an AggregatedReviewResult with named fields.""" + verdicts = {"code": self.ReviewVerdict(verdict="approved")} + result = self._aggregate(verdicts) + assert isinstance(result, self.AggregatedReviewResult) + # Named fields accessible + assert hasattr(result, "verdict") + assert hasattr(result, "blocking_feedback") + assert hasattr(result, "advisory_content") + + def test_backward_compat_verdicts_without_new_fields(self): + """Verdicts without analysis/suggestions (old format) produce empty advisory.""" + verdicts = { + "code": self.ReviewVerdict(verdict="approved"), + "contract": self.ReviewVerdict(verdict="approved", summary="Looks good"), + } + result = self._aggregate(verdicts) + assert result.verdict == "approved" + assert result.advisory_content == "" + + def test_positional_access_backward_compat(self): + 
"""AggregatedReviewResult can be accessed positionally (tuple compat).""" + verdicts = { + "code": self.ReviewVerdict( + verdict="needs_revision", + feedback="Bug found", + ), + } + result = self._aggregate(verdicts) + # Positional: [0]=verdict, [1]=blocking_feedback, [2]=advisory_content + assert result[0] == "needs_revision" + assert "Bug found" in result[1] + + def test_empty_verdicts_dict(self): + """Empty dict returns approved with empty strings.""" + result = self._aggregate({}) + assert result.verdict == "approved" + assert result.blocking_feedback == "" + assert result.advisory_content == "" + + def test_all_none_verdicts(self): + """All None verdicts returns approved with empty strings (all skipped).""" + verdicts = { + "code": None, + "contract": None, + "agent-design": None, + } + result = self._aggregate(verdicts) + assert result.verdict == "approved" + assert result.blocking_feedback == "" + assert result.advisory_content == "" + + def test_needs_revision_feedback_fallback_to_summary(self): + """When needs_revision has no feedback, blocking_feedback uses summary.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="needs_revision", + summary="Type errors in module X", + feedback="", + ), + } + result = self._aggregate(verdicts) + assert result.verdict == "needs_revision" + assert "Type errors in module X" in result.blocking_feedback + + def test_needs_revision_no_feedback_no_summary(self): + """When needs_revision has neither feedback nor summary, section header is still present.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="needs_revision", + ), + } + result = self._aggregate(verdicts) + assert result.verdict == "needs_revision" + # Should have a section header even without content + assert "code reviewer" in result.blocking_feedback + + def test_multiple_needs_revision_combined(self): + """Multiple needs_revision verdicts combine blocking_feedback from all.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="needs_revision", 
+ feedback="SQL injection in query builder", + ), + "contract": self.ReviewVerdict( + verdict="needs_revision", + feedback="Missing acceptance criterion ac-3", + ), + } + result = self._aggregate(verdicts) + assert result.verdict == "needs_revision" + assert "SQL injection" in result.blocking_feedback + assert "acceptance criterion ac-3" in result.blocking_feedback + assert "code reviewer" in result.blocking_feedback + assert "contract reviewer" in result.blocking_feedback + + def test_mixed_approved_and_revision_advisory_from_both(self): + """Advisory content is collected from both approved and needs_revision verdicts.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="needs_revision", + feedback="Fix the bug", + analysis="Found a critical bug in authentication.", + ), + "contract": self.ReviewVerdict( + verdict="approved", + analysis="All criteria met, implementation is solid.", + suggestions="Consider adding integration tests.", + ), + } + result = self._aggregate(verdicts) + assert result.verdict == "needs_revision" + # Advisory should have content from BOTH reviewers + assert "critical bug in authentication" in result.advisory_content + assert "All criteria met" in result.advisory_content + assert "integration tests" in result.advisory_content + + def test_suggestions_prefixed_in_advisory(self): + """Suggestions in advisory_content are prefixed with '**Suggestions:**'.""" + verdicts = { + "code": self.ReviewVerdict( + verdict="approved", + analysis="Code is clean.", + suggestions="Add a type annotation to the return value.", + ), + } + result = self._aggregate(verdicts) + assert "**Suggestions:**" in result.advisory_content + assert "type annotation" in result.advisory_content