From c145c885cd8b38e85fb550af7b8b2bed3f2a3efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Fri, 5 Jun 2026 23:01:39 +0200 Subject: [PATCH 01/13] docs: plan CodeQL sandbox recipe refactor --- .../codeql-sandbox-recipe-refactor-plan.md | 744 ++++++++++++++++++ 1 file changed, 744 insertions(+) create mode 100644 .project/codeql-sandbox-recipe-refactor-plan.md diff --git a/.project/codeql-sandbox-recipe-refactor-plan.md b/.project/codeql-sandbox-recipe-refactor-plan.md new file mode 100644 index 0000000..03f86d4 --- /dev/null +++ b/.project/codeql-sandbox-recipe-refactor-plan.md @@ -0,0 +1,744 @@ +# CodeQL + Sandbox Recipe Refactor Plan + +Status: **WIP plan for review** + +Branch purpose: define the target architecture and implementation plan before changing runtime code. + +## 1. Problem statement + +The current CodeQL integration is fragile because Phase 1 runs CodeQL before CodeCome has built a reliable sandbox/build environment. For compiled languages such as C/C++, Java/Kotlin, Go, C#, and Swift, CodeQL database creation needs to observe or execute the real build. Today the model generates a `codeql-plan.yml` in Phase 1a, then the harness executes CodeQL directly from the host, and only later Phase 1c creates the sandbox. + +This creates several issues: + +1. CodeQL build commands are guessed too early. +2. The harness executes CodeQL in the host environment even when the target's real build belongs inside Docker. +3. Sandbox build discovery and CodeQL build discovery are duplicated. +4. CodeQL can appear successful even when the database is effectively useless. +5. Phase 1b/Phase 2 can consume stale or misleading CodeQL artifacts. +6. The current Phase 1 subphase naming and documentation have already drifted: sandbox is currently implemented as Phase 1c, while some sandbox bootstrap docs still describe it as Phase 1b. + +The refactor should make CodeQL best-effort but trustworthy: failures or empty/invalid databases must be detected and reported as unusable, not silently treated as useful zero-alert analysis. + +## 2. Design goals + +1. Keep CodeQL optional. `CODEQL=0` or an equivalent configuration must let all later phases run without CodeQL dependencies. +2. Keep CodeQL best-effort by default. A failed or unusable CodeQL run should not block Phase 1 under soft policy. +3. Prevent false success. A CodeQL run that returns exit code 0 but produces no usable extraction/SARIF must be classified as unusable. +4. Use the sandbox build environment as the source of truth for compiled-language CodeQL database creation. +5. Preserve multi-unit CodeQL analysis. A target may contain multiple analysis units with different languages/stacks. +6. Avoid forcing multiple sandboxes. Prefer one sandbox per target, with multiple build targets declared in a machine-readable recipe when needed. +7. Keep `codeql-plan.yml` as the CodeQL-specific analysis control document, but reduce its responsibility: it should describe what to analyze, not invent the final build environment. +8. Add `sandbox-recipe.yml` under `itemdb/notes/` as the durable machine-readable contract between sandbox bootstrap and later harness steps. +9. Reorder Phase 1 so sandbox/bootstrap happens before CodeQL and detailed recon happens after CodeQL. +10. Do a full naming/documentation/test sweep so Phase 1a/1b/1c references are consistent. + +## 3. Target architecture + +### 3.1 New Phase 1 order + +Current order: + +```text +Phase 1a: Target Profile + build model + CodeQL plan +CodeQL: host-side execution +Phase 1b: Detailed Reconnaissance +Phase 1c: Sandbox Bootstrap +``` + +Target order: + +```text +Phase 1a: Target Profile + build model + CodeQL intent +Phase 1b: Sandbox Bootstrap + sandbox-recipe.yml +CodeQL: run using codeql-plan.yml + sandbox-recipe.yml +Phase 1c: Detailed Reconnaissance enriched with CodeQL health/signals +``` + +Rationale: + +- The sandbox phase has the best chance of discovering and stabilizing the real build/runtime model. +- CodeQL for compiled languages should run after that environment exists. +- Recon should consume reliable CodeQL health/signals, not stale or misleading outputs. + +### 3.2 Responsibilities by artifact + +#### `itemdb/notes/codeql-plan.yml` + +Purpose: CodeQL analysis intent. + +Should define: + +- analysis units; +- source paths; +- languages and confidence; +- CodeQL profiles/packs; +- exclude patterns; +- whether a unit/language is recommended for CodeQL; +- which sandbox build target should be used, when a build is required. + +Should not be responsible for: + +- installing toolchains; +- deciding Docker compose details; +- generating long shell snippets; +- fully describing sandbox runtime; +- repairing build scripts. + +#### `itemdb/notes/sandbox-plan.md` + +Purpose: human-readable sandbox decisions. + +Already exists conceptually. It should continue to document: + +- detected stack; +- selected sandbox seed; +- runtime model; +- services; +- validation matrix; +- limitations; +- remediation attempts; +- open questions. + +#### `itemdb/notes/sandbox-recipe.yml` + +Purpose: machine-readable recipe for later harness steps. + +It should describe how to use the generated sandbox and how buildable units map to commands/services/workdirs. CodeQL consumes this file. + +## 4. `sandbox-recipe.yml` schema proposal + +Initial schema: + +```yaml +schema_version: 1 +generated_by: phase-1b-sandbox +validation_model: docker # docker | static-only | nested-virt + +sandbox: + path: ./sandbox + managed: true + compose_file: ./sandbox/docker-compose.yml + default_service: app + workspace_root: /workspace + source_root: /workspace/src + +commands: + setup: ./sandbox/scripts/setup.sh + up: ./sandbox/scripts/up.sh + check: ./sandbox/scripts/check.sh + build: ./sandbox/scripts/build.sh + test: ./sandbox/scripts/test.sh + down: ./sandbox/scripts/down.sh + shell: ./sandbox/scripts/shell.sh + logs: ./sandbox/scripts/logs.sh + clean: ./sandbox/scripts/clean.sh + reset: ./sandbox/scripts/reset.sh + +build_targets: + - id: root + description: Default target build + source_path: ./src + service: app + workdir: /workspace/src + build_command: ./sandbox/scripts/build.sh + test_command: ./sandbox/scripts/test.sh + environment: + type: docker-compose + compose_file: ./sandbox/docker-compose.yml + service: app + codeql: + supported: true + preferred_execution_mode: docker-inside + install_strategy: mount-host-bundle + notes: [] + +codeql: + supported: true + default_execution_mode: docker-inside + install_strategy: mount-host-bundle + notes: + - CodeQL is optional and best-effort. + - Compiled-language database creation should use build_targets rather than host guesses. + +limitations: [] +``` + +### 4.1 Multi-unit / multi-target handling + +One sandbox can expose several `build_targets`: + +```yaml +build_targets: + - id: native-lib + source_path: ./src/native + service: app + workdir: /workspace/src/native + build_command: ./sandbox/scripts/build-native-lib.sh + codeql: + supported: true + preferred_execution_mode: docker-inside + + - id: cli + source_path: ./src/cli + service: app + workdir: /workspace/src/cli + build_command: ./sandbox/scripts/build-cli.sh + codeql: + supported: true + preferred_execution_mode: docker-inside +``` + +Rules: + +1. Simple targets may define only `root` or `default`. +2. Multi-component targets should define one build target per materially distinct build component only when it improves reproducibility. +3. `sandbox/scripts/build.sh` may remain the canonical aggregate build hook. +4. Specific scripts such as `build-native-lib.sh` are optional and should be generated only when useful. +5. CodeQL analysis units map to build targets through `codeql-plan.yml`. +6. A missing build target for a compiled-language CodeQL unit should soft-fail that unit honestly under soft policy. + +## 5. `codeql-plan.yml` schema direction + +Keep a CodeQL-specific plan, but update the schema to version 2. + +Example: + +```yaml +schema_version: 2 +generated_by: phase-1a-profile +source_path: ./src +recommended: true + +analysis_units: + - id: native-lib + path: ./src/native + kind: library + primary: true + sandbox_build_target: native-lib + languages: + - id: c-cpp + confidence: HIGH + build_mode: manual + build_provider: sandbox-recipe + profiles: + - official + - github-security-lab + - trailofbits + + - id: scripts + path: ./src/scripts + kind: tooling + primary: false + sandbox_build_target: root + languages: + - id: python + confidence: MEDIUM + build_mode: none + build_provider: none + profiles: + - official + +exclude: + - src/**/tests/** + - src/**/fixtures/** + - src/**/vendor/** + +notes: + - Build commands are resolved through sandbox-recipe.yml after sandbox bootstrap. +``` + +### 5.1 Build rules + +For languages with `build_mode: none`: + +- `sandbox_build_target` is optional. +- CodeQL may run host-side or docker-side depending on runner support. +- No build command is required. + +For compiled languages with `build_mode: manual`: + +- prefer `build_provider: sandbox-recipe`; +- require `sandbox_build_target` unless a legacy/manual host command is explicitly supported; +- resolve final command from `sandbox-recipe.yml`. + +For `build_mode: autobuild`: + +- only use as fallback; +- record why a sandbox build target was not available; +- classify health carefully, because autobuild success can still produce poor extraction. + +## 6. CodeQL execution backends + +Add explicit execution modes: + +```text +host + Run CodeQL on the host. Suitable for no-build languages or legacy fallback. + +docker-inside + Run CodeQL inside the sandbox container/service. Preferred for compiled languages. + +docker-wrapper + Host-side CodeQL invokes a Docker command as build command. Not preferred because extraction may not observe compilation correctly. + +unavailable + CodeQL cannot run for this unit in this environment. +``` + +### 6.1 Preferred compiled-language path + +For compiled languages: + +1. Resolve analysis unit from `codeql-plan.yml`. +2. Resolve `sandbox_build_target` from `sandbox-recipe.yml`. +3. Ensure sandbox has passed setup/check/build validation. +4. Ensure CodeQL CLI is available inside the sandbox environment. +5. Execute `codeql database create` inside the same container/service and workdir used to build the target. +6. Run `codeql database analyze` for selected profiles. +7. Run health checks. +8. Publish normalized signals only if usable. + +### 6.2 CodeQL installation strategy in Docker + +Support multiple strategies: + +```yaml +install_strategy: mount-host-bundle +``` + +Mount the locally installed CodeQL bundle into the sandbox container read-only. + +```yaml +install_strategy: copy-host-bundle +``` + +Copy CodeQL into a temporary location mounted or copied into the container. + +```yaml +install_strategy: image-preinstalled +``` + +Assume the sandbox image already includes CodeQL. + +Initial implementation can support only `mount-host-bundle`, then add others if needed. + +## 7. CodeQL health model + +Create a new health layer, probably `tools/codeql/health.py`. + +### 7.1 Health output + +Every CodeQL run should write a health block to the manifest: + +```yaml +health: + usable: false + classification: extraction-failed + reason: CodeQL database create returned success but extractor_successes was 0 and extractor_failures was 1. + checks: + database_create_exit_zero: true + database_exists: true + analyze_exit_zero: false + official_profile_analyzed: false + sarif_fresh: false + normalized_fresh: false + extractor_successes: 0 + extractor_failures: 1 + trap_files_detected: 0 +``` + +### 7.2 Classifications + +Use stable classifications: + +```text +disabled +skipped +unavailable +failed +soft-failed +extraction-failed +analysis-failed +completed-empty-valid +completed-with-signals +completed-partial +stale-output-detected +``` + +### 7.3 Usability rules + +A run is usable only if: + +1. CodeQL database creation exit code is 0. +2. The database directory exists and passes basic sanity checks. +3. At least the official profile analyze step succeeds, or an explicit profile-equivalent step succeeds. +4. At least one fresh SARIF file exists for the current run. +5. Normalized outputs were generated from fresh SARIF from the current run. +6. For compiled languages, extraction is non-empty. + +A run may have zero alerts and still be usable. Zero alerts is not a failure. + +A run is not usable if: + +1. SARIF is missing. +2. normalized outputs are stale. +3. extractor successes are zero for a compiled language. +4. all query profiles fail. +5. database creation returns success but no analyzable content is present. + +### 7.4 Compiled-language extraction checks + +Prefer robust signals in this order: + +1. CodeQL diagnostics that report extractor successes/failures. +2. TRAP/import counts if available. +3. Database metadata containing source/archive information. +4. Presence of extracted files under expected source roots. + +The plan implementation should include a spike task to inspect real CodeQL DB layout and diagnostic files for C/C++, Java/Kotlin, Go, C#, and Swift. + +## 8. Artifact layout + +No need to maintain backward compatibility, but keep the layout understandable. + +Recommended layout: + +```text +itemdb/codeql/ + runs/ + / + run-manifest.yml + health.yml + selected-query-packs.yml + sarif/ + normalized/ + databases/ + logs/ + codeql-summary.md + last-run-manifest.yml + current-run.txt +``` + +Rules: + +1. Every run gets a unique run id. +2. The runner never normalizes stale SARIF. +3. The runner never reports `Total alerts: 0` unless SARIF was fresh and normalized for the current run. +4. `last-run-manifest.yml` always describes the last attempt, even if unusable. +5. `current-run.txt` points to the latest usable run if one exists, or may be absent if none exists. +6. Recon consumes `last-run-manifest.yml` for health and the current usable run for normalized signals. + +## 9. Phase 1 orchestration changes + +Update `tools/codecome/phase_1.py`: + +1. Rename or relabel subphases: + - `1a`: Target Profile + - `1b`: Sandbox Bootstrap + - `1c`: Detailed Reconnaissance +2. Move CodeQL execution after `check_phase_1b` and before Phase 1c. +3. Run CodeQL only after sandbox recipe validation succeeds. +4. If CodeQL is disabled, record a skipped/disabled manifest and continue. +5. If CodeQL soft-fails or is unusable, continue but make health explicit. +6. Remove or rewrite the current CodeQL repair loop that resumes the model to patch `codeql-plan.yml` after host-side failures. Re-introduce repair only around sandbox recipe/build target problems if it is still useful. +7. Ensure all gates point to the correct renamed phase artifacts. + +## 10. Prompt changes + +### 10.1 `prompts/phase-1a-profile.md` + +Update to: + +- produce `target-profile.md`, `build-model.md`, and schema v2 `codeql-plan.yml`; +- identify analysis units and desired CodeQL coverage; +- avoid concrete build shell snippets unless obvious and stable; +- set `build_provider: sandbox-recipe` for compiled languages when a build is required; +- state that the sandbox phase will resolve the final build recipe. + +### 10.2 New/renamed `prompts/phase-1b-sandbox.md` + +Update from current sandbox prompt: + +- sandbox is now Phase 1b; +- required outputs: + - `itemdb/notes/sandbox-plan.md`; + - `itemdb/notes/sandbox-recipe.yml`; +- require validation of both sandbox and recipe; +- require multi-target recipe entries when the target has materially distinct build components; +- do not force per-unit scripts if a single aggregate build is correct. + +### 10.3 New/renamed `prompts/phase-1c-recon.md` + +Update from current recon prompt: + +- recon is now Phase 1c; +- read Phase 1a and 1b outputs; +- read CodeQL health from `itemdb/codeql/last-run-manifest.yml` when present; +- only consume normalized CodeQL signals when health says usable; +- never infer "no issues" from unusable CodeQL; +- include CodeQL health in `threat-model.md`, `interesting-files.md`, and `file-risk-index.yml` only when relevant. + +## 11. Skill/documentation updates + +Audit and update all references to old Phase 1 ordering. + +Required search terms: + +```text +Phase 1b +Phase 1c +phase-1b +phase-1c +phase_1b +phase_1c +Detailed Reconnaissance +Sandbox Bootstrap +CodeQL analysis between 1a and 1b +between 1a and 1b +``` + +Likely files: + +- `tools/codecome/phase_1.py` +- `tools/phases/completion.py` +- `tools/phases/phase_1_gates.py` +- `tools/phases/artifact_checks.py` +- `prompts/phase-1a-profile.md` +- `prompts/phase-1b-recon.md` +- `prompts/phase-1c-sandbox.md` +- `.opencode/skills/sandbox-bootstrap/SKILL.md` +- `.opencode/skills/sandbox-validation/SKILL.md` +- `.opencode/agents/recon.md` +- `docs/workflow.md` +- `docs/sandbox.md` +- `templates/sandboxes/README.md` +- `README.md` +- tests referencing subphase names, required artifacts, or phase order. + +## 12. Sandbox bootstrap CLI changes + +Update `tools/sandbox-bootstrap.py` to support recipe validation/generation helpers. + +Potential subcommands: + +```text +sandbox-recipe-validate +sandbox-recipe-print +``` + +Or integrate into existing `validate` output. + +Minimum required checks: + +1. `itemdb/notes/sandbox-recipe.yml` exists after Phase 1b. +2. `schema_version` is supported. +3. `validation_model` is valid. +4. `sandbox.path` exists. +5. Declared command paths exist when applicable. +6. `build_targets` is non-empty unless validation model is `static-only` or explicitly buildless. +7. Each build target has unique id. +8. Each build target source path exists. +9. Each build target workdir is absolute inside the sandbox environment. +10. Each build target command is present or explicitly marked not applicable. +11. CodeQL hints use supported values. + +The agent may write the recipe, but the harness must validate it. + +## 13. CodeQL module changes + +### 13.1 `tools/codeql/packs.py` + +Update plan loader to support schema v2. + +Tasks: + +- accept schema v2; +- validate `sandbox_build_target` and `build_provider`; +- preserve schema v1 only if we intentionally support migration during development, otherwise fail with actionable message; +- update tests. + +### 13.2 `tools/codeql/runner.py` + +Refactor to: + +- load `codeql-plan.yml`; +- load and validate `sandbox-recipe.yml`; +- build an effective execution plan; +- execute per unit/language using selected backend; +- write per-run artifacts under `itemdb/codeql/runs//`; +- record per-unit command logs; +- never reuse stale SARIF/normalized outputs; +- call health evaluation before publishing usable results. + +### 13.3 `tools/codeql/pipeline.py` + +Refactor around run directories. + +Tasks: + +- create run id; +- pass run directory to runner; +- normalize only current-run SARIF; +- import file risk only from usable current-run signals; +- write summary from health and normalized results; +- update `last-run-manifest.yml`. + +### 13.4 `tools/codeql/artifacts.py` + +Replace current weak artifact gate. + +Current gate mainly checks manifest status and normalized files. New gate should check health classification and usability. + +Rules: + +- hard policy may block on `health.usable=false` when CodeQL was expected and enabled; +- soft policy never blocks Phase 1, but emits clear warnings; +- disabled/skipped CodeQL is not an error under soft policy; +- stale output is always warning or failure depending on policy. + +### 13.5 New `tools/codeql/health.py` + +Implement health classification. + +Inputs: + +- effective execution plan; +- process exit statuses; +- generated DB paths; +- CodeQL logs; +- SARIF paths; +- normalized paths; +- diagnostics/extractor metadata. + +Outputs: + +- health dict; +- `health.yml`; +- warnings/failures for manifest. + +## 14. Repair/resume model + +The current CodeQL repair loop resumes the model after CodeQL database creation fails. That model is less effective because CodeQL currently runs outside the sandbox/build context. + +Target behavior: + +1. Repair sandbox in Phase 1b if sandbox validation or `sandbox-recipe.yml` validation fails. +2. Run CodeQL after sandbox validation. +3. If CodeQL fails because the recipe is inconsistent, classify the unit and optionally request a targeted sandbox-recipe repair in the same Phase 1b session only if safe. +4. Avoid repeated expensive CodeQL retries by default. +5. Use a small retry budget and only rerun affected units. + +Initial implementation can remove CodeQL model-repair entirely and rely on clear health output. Add repair back only after the deterministic path is solid. + +## 15. Handling user-managed sandboxes + +Current sandbox gate allows user-managed sandboxes with warnings. This should remain possible. + +If sandbox is user-managed: + +1. Phase 1b must still create `itemdb/notes/sandbox-recipe.yml`, either by inspecting the user-managed sandbox or documenting that no recipe can be derived. +2. If no recipe can be derived, CodeQL compiled-language units should become `unavailable` under soft policy. +3. Phase 1 can continue under soft policy. +4. The user should see clear remediation instructions. + +## 16. Handling static-only and nested-virt + +For `static-only`: + +- `sandbox-recipe.yml` may have no build targets or build targets marked `not_applicable`. +- CodeQL no-build languages may still run. +- Compiled-language CodeQL should be `unavailable` unless a build target exists. + +For `nested-virt`: + +- CodeQL probably defaults to `unavailable` unless the recipe explicitly declares support. +- Do not try to force CodeQL into VM-based targets during initial refactor. + +## 17. Tests + +Add or update tests for: + +1. Phase 1 orchestration order. +2. New subphase labels and prompts. +3. Phase completion artifact lists. +4. `sandbox-recipe.yml` validation success/failure. +5. Simple single-target recipe. +6. Multi-target recipe. +7. CodeQL plan schema v2 validation. +8. CodeQL plan mapping to sandbox build targets. +9. Missing build target for compiled language => soft-failed/unavailable. +10. `build_mode: none` language works without build target. +11. Run directory creation. +12. No stale SARIF normalization. +13. Health classification: no SARIF. +14. Health classification: extractor successes = 0. +15. Health classification: zero alerts but fresh SARIF => usable empty valid. +16. Health classification: partial extraction => usable with warnings. +17. Soft policy does not block Phase 1. +18. Hard policy blocks on unusable CodeQL when enabled. +19. Recon prompt uses health rules. +20. Docs/skills no longer contain old contradictory Phase 1b/1c descriptions. + +## 18. Acceptance criteria + +Implementation is complete when: + +1. `make phase-1` runs phases in the new order. +2. Phase 1b creates both `sandbox-plan.md` and `sandbox-recipe.yml`. +3. CodeQL runs after sandbox validation. +4. CodeQL compiled-language database creation uses sandbox recipe by default. +5. A CodeQL run with no real extraction is not classified as successful/usable. +6. A CodeQL run with zero alerts but valid SARIF/extraction is classified as usable empty valid. +7. Stale SARIF/normalized outputs cannot be mistaken for current run results. +8. Phase 1c recon only uses CodeQL signals when health says usable. +9. Phase 2+ does not depend on CodeQL when CodeQL is disabled or unusable. +10. Soft policy continues the audit with clear warnings. +11. Hard policy blocks only according to explicit, documented health rules. +12. All references to old Phase 1b/1c responsibilities are updated. +13. Tests cover single-target and multi-target recipe mapping. +14. Docs explain the new architecture and how to troubleshoot CodeQL health. + +## 19. Implementation sequence + +Recommended sequence for one WIP branch: + +1. Rename/reorder Phase 1 prompts and orchestration. +2. Add `sandbox-recipe.yml` template/schema and validation helper. +3. Update sandbox prompt/skill to require recipe generation. +4. Update completion/gates/artifact checks for new Phase 1 order. +5. Update `codeql-plan.yml` schema to v2. +6. Add effective CodeQL plan resolver combining CodeQL plan + sandbox recipe. +7. Add run directory layout. +8. Add CodeQL health model. +9. Refactor runner/pipeline/artifacts around health and run directories. +10. Wire CodeQL after sandbox in Phase 1. +11. Update recon prompt to consume health/signals correctly. +12. Update docs/README/tests. +13. Run full test suite and fix regressions. +14. Test manually on at least: + - Python/no-build target; + - C/C++ target with Docker build; + - multi-component target with one aggregate build; + - target with CodeQL disabled. + +## 20. Open implementation questions + +1. Which CodeQL-in-Docker install strategy should be implemented first? + - Recommended first: `mount-host-bundle`. +2. Do we want symlinks such as `itemdb/codeql/current` or plain marker files such as `current-run.txt`? + - Recommended first: marker file for portability. +3. Should schema v1 `codeql-plan.yml` be supported during migration? + - Since compatibility is not required, prefer failing with a clear upgrade error. +4. Should CodeQL repair be removed initially? + - Recommended: remove or disable it initially, then reintroduce after deterministic execution/health is solid. +5. Should sandbox recipe validation live in `tools/sandbox-bootstrap.py` or a new module under `tools/sandbox/`? + - Recommended: add a reusable module, expose through existing CLI. + +## 21. Non-goals + +1. Do not special-case Juliet or other benchmark corpora. +2. Do not create separate sandboxes per CodeQL unit. +3. Do not make CodeQL a mandatory sandbox validation tier. +4. Do not infer security absence from zero CodeQL alerts. +5. Do not make the model responsible for interpreting stale CodeQL outputs. +6. Do not preserve old CodeQL artifact layout compatibility unless needed during development. From 244fd18f94c5a43d3217bc11bfed959ce49169ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:14:03 +0200 Subject: [PATCH 02/13] chore: introduce sandbox-recipe schema + validator Add tools/sandbox/recipe.py with load_recipe, validate_recipe, dump_recipe. Add recipe-validate and recipe-print subcommands to sandbox-bootstrap.py. Add templates/sandbox-recipe.yml.example (schema v1). Add tests/test_sandbox_recipe.py (24 tests covering valid/invalid schemas). Update docs/sandbox.md and templates/sandboxes/README.md with recipe docs. Update .project/codeql-sandbox-recipe-refactor-plan.md with locked plan. --- .../codeql-sandbox-recipe-refactor-plan.md | 281 +++++++++++++++--- docs/sandbox.md | 22 ++ templates/sandbox-recipe.yml.example | 53 ++++ templates/sandboxes/README.md | 15 + tests/test_sandbox_recipe.py | 259 ++++++++++++++++ tools/sandbox-bootstrap.py | 91 +++++- tools/sandbox/__init__.py | 3 + tools/sandbox/recipe.py | 200 +++++++++++++ 8 files changed, 878 insertions(+), 46 deletions(-) create mode 100644 templates/sandbox-recipe.yml.example create mode 100644 tests/test_sandbox_recipe.py create mode 100644 tools/sandbox/__init__.py create mode 100644 tools/sandbox/recipe.py diff --git a/.project/codeql-sandbox-recipe-refactor-plan.md b/.project/codeql-sandbox-recipe-refactor-plan.md index 03f86d4..ff739a5 100644 --- a/.project/codeql-sandbox-recipe-refactor-plan.md +++ b/.project/codeql-sandbox-recipe-refactor-plan.md @@ -1,6 +1,6 @@ # CodeQL + Sandbox Recipe Refactor Plan -Status: **WIP plan for review** +Status: **Locked plan — implementation in progress** Branch purpose: define the target architecture and implementation plan before changing runtime code. @@ -698,43 +698,205 @@ Implementation is complete when: 13. Tests cover single-target and multi-target recipe mapping. 14. Docs explain the new architecture and how to troubleshoot CodeQL health. -## 19. Implementation sequence - -Recommended sequence for one WIP branch: - -1. Rename/reorder Phase 1 prompts and orchestration. -2. Add `sandbox-recipe.yml` template/schema and validation helper. -3. Update sandbox prompt/skill to require recipe generation. -4. Update completion/gates/artifact checks for new Phase 1 order. -5. Update `codeql-plan.yml` schema to v2. -6. Add effective CodeQL plan resolver combining CodeQL plan + sandbox recipe. -7. Add run directory layout. -8. Add CodeQL health model. -9. Refactor runner/pipeline/artifacts around health and run directories. -10. Wire CodeQL after sandbox in Phase 1. -11. Update recon prompt to consume health/signals correctly. -12. Update docs/README/tests. -13. Run full test suite and fix regressions. -14. Test manually on at least: - - Python/no-build target; - - C/C++ target with Docker build; - - multi-component target with one aggregate build; - - target with CodeQL disabled. - -## 20. Open implementation questions - -1. Which CodeQL-in-Docker install strategy should be implemented first? - - Recommended first: `mount-host-bundle`. -2. Do we want symlinks such as `itemdb/codeql/current` or plain marker files such as `current-run.txt`? - - Recommended first: marker file for portability. -3. Should schema v1 `codeql-plan.yml` be supported during migration? - - Since compatibility is not required, prefer failing with a clear upgrade error. -4. Should CodeQL repair be removed initially? - - Recommended: remove or disable it initially, then reintroduce after deterministic execution/health is solid. -5. Should sandbox recipe validation live in `tools/sandbox-bootstrap.py` or a new module under `tools/sandbox/`? - - Recommended: add a reusable module, expose through existing CLI. - -## 21. Non-goals +## 19. Implementation sequence (8 incremental commits) + +Each commit must compile (no import errors) and pass `make tests` independently. +Existing `itemdb/` content is ignored — the refactor does not read, migrate, or delete +any data under `itemdb/notes/` or `itemdb/findings/`. + +### Commit 1 — `chore: introduce sandbox-recipe schema + validator` + +- Add `templates/sandbox-recipe.yml.example` (the schema sample from §4). +- Add `tools/sandbox/__init__.py` and `tools/sandbox/recipe.py` with `load_recipe(path)`, + `validate_recipe(recipe)`, and the validation rule list from plan §12. +- Expose `recipe-validate` and `recipe-print` subcommands on `tools/sandbox-bootstrap.py` + (extend, don't fork; CLI flags per `tools/AGENTS.md` rule 1). +- Add `tests/test_sandbox_recipe.py` covering: minimal valid recipe, missing required keys, + duplicate build_target ids, workdir not absolute, validation_model not in allow-list, + build_targets empty under docker model, codeql.install_strategy not in allow-list. +- Update `docs/sandbox.md` and `templates/sandboxes/README.md` to mention the recipe. + +### Commit 2 — `feat(codeql-plan): bump schema to v2 + v1 hard-fail` + +- Add `schema_version: 2` loader path in `tools/codeql/packs.py`; v1 raises `PackResolverError` + with a clear "re-run Phase 1a" message (per decision on no auto-migration). +- Extend `analysis_units[].languages[]` to accept `build_provider: sandbox-recipe | none` + and `sandbox_build_target: ` on the parent analysis unit. +- Edit `templates/codeql-plan.yml` in place to v2 schema. +- Update `tools/phases/phase_1_gates.py` for v2; add a test in `tests/test_phase_1_gates.py` + asserting v1 raises error at gate 1a. +- Update `prompts/phase-1a-profile.md` content to v2 + the no-build-shell-snippet rule (§10.1). +- Update `tests/test_codecome_check_codeql.py` and `tests/test_codeql_packs.py` fixtures. + +### Commit 3 — `feat(sandbox-bootstrap): require recipe output in Phase 1b` + +- Rename `prompts/phase-1c-sandbox.md` → `prompts/phase-1b-sandbox.md` and update internal + references ("Phase 1c" → "Phase 1b", "third and final" → "second", "after Phase 1a/1b" → + "after Phase 1a"). +- Update prompt to require `sandbox-recipe.yml` as a **durable output** alongside + `sandbox-plan.md`, with the per-target structure and CodeQL hints. +- Add `validate_recipe` call to the existing `tools/sandbox-bootstrap.py validate` flow. +- New test in `tests/test_sandbox_bootstrap.py` covering the recipe-required gate. + +### Commit 4 — `feat(codeql): add per-run layout, run-id, and health model` + +- New `tools/codeql/health.py` implementing the schema from plan §7 (`compute_health(plan, + recipe, run_dir, manifest)` returning a `health` dict, the full `health.yml`, and + warnings/failures). +- Update `tools/codeql/pipeline.py`: + - create run_id (UTC timestamp + short hash); + - lay out under `itemdb/codeql/runs//{sarif,normalized,databases,logs, + codeql-summary.md}`; + - copy per-run `run-manifest.yml` and `health.yml` into the run dir; + - write `itemdb/codeql/last-run-manifest.yml`; + - write `itemdb/codeql/current-run.txt` only when `health.usable == true`. +- Update `tools/codeql/runner.py` to consume the new layout and pass a per-run dir. +- Update `tools/codeql/artifacts.py` to gate on `health.usable` for hard policy; + remain warning-only under soft policy (§13.4). +- Add `tests/test_codeql_health.py` with cases for every classification in §7.2. + +### Commit 5 — `feat(codeql): docker-inside execution + host/sandbox platform guard` + +- New `tools/codeql/platform.py` with `host_platform() -> str` and + `container_platform(service_path, compose_path) -> str` (runs `uname -sm` in container + via `docker compose exec`). +- New `tools/codeql/in_docker.py` helper that wraps CodeQL invocation through + `docker compose exec` against the recipe's declared service. +- New `sandbox/scripts/codeql.sh` added to `templates/sandboxes/_shared/` (the wrapper + script that the runner calls; resolves the compose file and service from recipe and + forwards args to the in-container codeql binary). +- Update all `templates/sandboxes//docker-compose.yml` templates to bind-mount the + host CodeQL bundle read-only at `/opt/codeql` and ensure it is on PATH. +- **Host/sandbox platform guard**: before each `docker-inside` invocation, the runner + compares `host_platform()` to `container_platform()`. If they differ (e.g. macOS host + with a Linux container — the exact case in many developer workspaces), the unit is + classified as `unavailable` with reason `"CodeQL bundle is for {host_platform}; + sandbox service runs {container_platform}. install_strategy=mount-host-bundle cannot + cross platforms."`. `health.usable` becomes false; the manifest records the failure + clearly. Future implementation may add `download-in-container` or `image-preinstalled` + to handle the cross-platform case. +- Add `tests/test_codeql_platform.py` and `tests/test_codeql_in_docker.py`. +- Add a "CodeQL install strategy" section to `docs/sandbox.md` documenting the limitation. + +### Commit 6 — `refactor(phase-1): reorder to 1a→1b(sandbox)→CodeQL→1c(recon), stop referencing repair` + +- Rename `prompts/phase-1b-recon.md` → `prompts/phase-1c-recon.md`; update internal copy + ("Phase 1b" → "Phase 1c", "second" → "third and final") and add the health-aware + language from plan §10.3. +- `tools/codecome/phase_1.py`: + - Delete `_run_codeql_repair_if_needed`, `_codeql_repair_needed`, + `_codeql_repair_failure_context`, `_file_digest`, `_validate_codeql_plan_for_repair`, + all `_validate_codeql_build_command*` helpers. Build-command validation moves to the + runner's resolver and to `load_recipe`. + - Delete the `phase-1-codeql-repair` branch in `_subphase_should_validate_codeql_plan`. + - Reorder `run_phase_1` to: `1a → 1b (sandbox) → _run_codeql (post-sandbox) → 1c (recon)`. + - `_check_codeql_artifacts` now reads `health.usable` from `last-run-manifest.yml`. +- `tools/phases/completion.py`: drop `build_codeql_plan_resume_prompt` and + `build_codeql_build_failure_resume_prompt`. Keep `build_artifact_repair_resume_prompt` + (still used for Phase 1b artifact repair). +- `tools/phases/phase_1_gates.py`: rename message strings for the new 1b/1c order. +- `tools/codecome/phase_1.py`: stop calling `prompts/phase-1-codeql-repair.md` + (keep the file in tree until cleanup in commit 8). +- `tools/gate-check.py` and `tools/codecome.py check-phase-artifacts`: update subphase + labels in messages (no logic change). +- Update all affected tests: `test_phase_1_gates.py`, `test_phase_1_mid_turn_forgiveness.py`, + `test_phase_graceful_completion_subphases.py`, `test_phase_1_prompts_threat_model.py`, + `test_phases_completion.py`. + +### Commit 7 — `docs+prompts+skills: full sweep of Phase 1b/1c references` + +- `docs/workflow.md` and `docs/sandbox.md`: rewrite Phase 1 sections per §3.1 new order. +- `Makefile` help text only: "Sandbox bootstrap (Phase 1c)" → "Sandbox bootstrap (Phase 1b)". +- `prompts/phase-1a-profile.md` (line ~97): change "Do not bootstrap the sandbox — that is + Phase 1c" → "The sandbox will be built by Phase 1b. Do not attempt sandbox work here." +- `prompts/phase-1b-sandbox.md`: add explicit "you MUST write `sandbox-recipe.yml`" section. +- `prompts/phase-1c-recon.md`: add explicit "you MUST read `last-run-manifest.yml`" section + and the health-aware reading rules (§10.3). +- `prompts/sweep.md` and `prompts/phase-6-report.md`: add a one-liner that consumers + consult `itemdb/codeql/last-run-manifest.yml` for `health.usable` before importing signals. +- `.opencode/skills/sandbox-validation/SKILL.md` and `.opencode/agents/recon.md`: + spot-check and update any stale subphase-label references. +- Add `tests/test_prompts.py` grep guard that asserts no prompt files contain contradictory + Phase 1b/1c language. +- `templates/sandboxes/README.md`: update if needed. + +### Commit 8 — `chore: delete obsolete repair files, add future-repair section, full test + smoke` + +- Delete `prompts/phase-1-codeql-repair.md` and `tests/test_phase_1_codeql_plan_repair.py`. +- Append the "Future: targeted sandbox-recipe repair" section (§22) to this plan file. +- Run `make tests` (pytest + frontmatter gate) and fix any remaining regressions. +- Manual smoke matrix: + - Python/no-build target; + - C/C++ target with Docker build; + - multi-component target with one aggregate build; + - target with CodeQL disabled. +- **Skip** the macOS-host × Linux-container CodeQL smoke case (known unsupported under + the current `mount-host-bundle` only strategy) and link to the platform-guard test. +- Record results in `runs/smoke-2026-MM-DD.md`. + +## 20. Resolved implementation decisions + +1. **CodeQL-in-Docker install strategy**: `mount-host-bundle` only for initial + implementation. A host-vs-sandbox platform guard classifies cross-platform + cases (e.g. macOS host with Linux container) as `unavailable`. Future + strategies (`download-in-container`, `image-preinstalled`) are documented + for follow-up. +2. **Run tracking**: plain marker file `current-run.txt` for portability (no + symlinks). `last-run-manifest.yml` always describes the last attempt + regardless of usability. +3. **Schema v1 migration**: no auto-migration. v1 raises a clear upgrade error. + The user re-runs Phase 1a to regenerate as v2. +4. **CodeQL repair**: removed in code but the prompt file + (`prompts/phase-1-codeql-repair.md`) stays in tree until the final cleanup + (commit 8). A future-design section (§23) describes how it may be + reintroduced if needed. +5. **Sandbox recipe validation**: lives in new `tools/sandbox/recipe.py` + module; exposed through the existing `tools/sandbox-bootstrap.py` CLI. +6. **Existing itemdb data**: ignored during the refactor. No migration, no + reads, no deletes of content under `itemdb/notes/` or `itemdb/findings/`. +7. **Multi-target invocation**: the runner resolves `Sandbox_build_target` from + the recipe per (analysis unit, language) pair and invokes CodeQL with the + resolved `build_command`. Identical commands across targets are allowed; + CodeQL's extractor is per-database so observing the same build multiple + times is harmless. +8. **Per-target script contract**: the recipe's `build_targets[].build_command` + is a free-form shell command (not a path to a script). The model may write + the same command for all targets or different commands per target. +9. **Recon health trigger**: Phase 1c reads `last-run-manifest.yml`. When + `health.usable == true`, CodeQL signals are imported into + `file-risk-index.yml` and `interesting-files.md`. When `health.usable == + false`, recon skips signal import but records the health summary in + `threat-model.md` under a new `# CodeQL health` heading. +10. **`test_mock_llm_parity.py`**: no update needed — it does not simulate + Phase 1 subphases (its scope is event normalization and generic end-to-end + mock LLM runs). +11. **Files under `.opencode/`**: skills (`*.md`) and agent definitions are + part of the harness and may be read and modified during the refactor. + Root `AGENTS.md` and `codecome.yml` remain untouched. + +## 21. Scope + +**Modify** (allowed write paths per commit plan): +- `tools/` (all packages: codecome, codeql, phases, sandbox) +- `prompts/` (rename, rewrite, delete) +- `templates/` (codeql-plan.yml, sandbox-recipe.yml.example, sandboxes/) +- `docs/` (workflow.md, sandbox.md) +- `Makefile` (help text only, no orchestration changes) +- `.opencode/skills/*/SKILL.md` and `.opencode/agents/*.md` (spot-check + update) +- `tests/` (add new, update existing, delete obsolete) +- `.project/codeql-sandbox-recipe-refactor-plan.md` (this file) + +**Do NOT modify**: +- Root `AGENTS.md` +- `codecome.yml` +- `README.md` +- `LICENSE`, `CONTRIBUTING.md`, `NOTICE` +- `src/` (target source code) +- `itemdb/` (existing audit data) +- `.venv/`, `.cache/` +- `sandbox/` (existing sandbox state — templates under `templates/sandboxes/` are modified, not `sandbox/` itself) + +## 22. Non-goals 1. Do not special-case Juliet or other benchmark corpora. 2. Do not create separate sandboxes per CodeQL unit. @@ -742,3 +904,46 @@ Recommended sequence for one WIP branch: 4. Do not infer security absence from zero CodeQL alerts. 5. Do not make the model responsible for interpreting stale CodeQL outputs. 6. Do not preserve old CodeQL artifact layout compatibility unless needed during development. +7. Do not implement `download-in-container` or `image-preinstalled` CodeQL install + strategies in the initial refactor. The `mount-host-bundle` strategy with the + platform guard is sufficient for same-platform host/container scenarios. +8. Do not update `tests/test_mock_llm_parity.py` (its scope does not include Phase 1 + subphase simulation). + +## 23. Future: targeted sandbox-recipe repair + +*This section describes a design reserved for a follow-up branch. The initial +refactor deliberately does not ship this repair loop. It is included here so +the design is not lost.* + +If post-merge testing shows that the deterministic runner still produces too +many `extraction-failed` or `analysis-failed` results from recipe/build issues, +the following narrow repair flow may be introduced: + +1. **Trigger**: after Phase 1 orchestrator, `health.classification in + {extraction-failed, analysis-failed}` **and** at least one analysis unit + has `health.reason` referencing a recipe problem (e.g. "build target X + missing", "workdir not absolute", "service not running"). + +2. **Scope**: a single sub-phase session (`phase_id: 1-recipe-repair`) that + may rewrite **only** `sandbox-recipe.yml`. No changes to `codeql-plan.yml`, + `sandbox-plan.md`, or helper scripts are permitted. The prompt is a fresh + `prompts/phase-1-recipe-repair.md` (small, target-agnostic, no security + content from the source tree). + +3. **Retry budget**: `CODECOME_RECIPE_REPAIR_RETRIES` env var (default 1). + After budget exhaustion, classify the unit as `recipe-soft-failed` and + continue with explicit warnings. + +4. **Re-run**: only re-run CodeQL for affected `(analysis_unit, language)` + pairs, not the entire matrix. + +5. **Agent**: uses the recon agent. The harness gates it behind the new subphase + label `1-recipe-repair` so it shows up distinctly in `runs/`. + +6. **Acceptance**: at least one previously-failing unit transitions to + `health.usable == true` in the next run, and no previously-usable units + regress. + +7. **Off by default**: this section stays in the plan as a reference. The + implementation is not part of commits 1–8. diff --git a/docs/sandbox.md b/docs/sandbox.md index b48dcea..274a55e 100644 --- a/docs/sandbox.md +++ b/docs/sandbox.md @@ -119,6 +119,28 @@ helper gaps rather than hard gate failures. The validation matrix is appended to `sandbox/CODECOME-GENERATED.md` so each run is auditable. +## Sandbox recipe + +Phase 1b also produces `itemdb/notes/sandbox-recipe.yml`, a machine-readable +contract consumed by later harness steps (CodeQL runner, Phase 1c recon). It +describes: + +- how to invoke sandbox capabilities (setup, up, check, build, test, etc.), +- per-unit build targets with source paths, workdirs, and build/test commands, +- CodeQL execution hints (install strategy, preferred execution mode). + +Validate the recipe: + + make sandbox-validate + +(Validates both the sandbox itself and the recipe file.) + +Print the recipe: + + $(PYTHON) tools/sandbox-bootstrap.py recipe-print + +See `templates/sandbox-recipe.yml.example` for the schema. + ## Phase 2 sandbox gate Before running `make phase-2`, the gate inspects the most recent diff --git a/templates/sandbox-recipe.yml.example b/templates/sandbox-recipe.yml.example new file mode 100644 index 0000000..2b9d016 --- /dev/null +++ b/templates/sandbox-recipe.yml.example @@ -0,0 +1,53 @@ +# Sandbox recipe — machine-readable contract between sandbox bootstrap and harness steps. +# Generated by Phase 1b sandbox. Consumed by CodeQL runner and Phase 1c recon. +schema_version: 1 +generated_by: "phase-1b-sandbox" +validation_model: docker + +sandbox: + path: ./sandbox + managed: true + compose_file: ./sandbox/docker-compose.yml + default_service: app + workspace_root: /workspace + source_root: /workspace/src + +commands: + setup: ./sandbox/scripts/setup.sh + up: ./sandbox/scripts/up.sh + check: ./sandbox/scripts/check.sh + build: ./sandbox/scripts/build.sh + test: ./sandbox/scripts/test.sh + down: ./sandbox/scripts/down.sh + shell: ./sandbox/scripts/shell.sh + logs: ./sandbox/scripts/logs.sh + clean: ./sandbox/scripts/clean.sh + reset: ./sandbox/scripts/reset.sh + +build_targets: + - id: root + description: Default target build + source_path: ./src + service: app + workdir: /workspace/src + build_command: ./sandbox/scripts/build.sh + test_command: ./sandbox/scripts/test.sh + environment: + type: docker-compose + compose_file: ./sandbox/docker-compose.yml + service: app + codeql: + supported: true + preferred_execution_mode: docker-inside + install_strategy: mount-host-bundle + notes: [] + +codeql: + supported: true + default_execution_mode: docker-inside + install_strategy: mount-host-bundle + notes: + - CodeQL is optional and best-effort. + - Compiled-language database creation should use build_targets rather than host guesses. + +limitations: [] diff --git a/templates/sandboxes/README.md b/templates/sandboxes/README.md index bd14baf..46d6e2d 100644 --- a/templates/sandboxes/README.md +++ b/templates/sandboxes/README.md @@ -35,6 +35,21 @@ Each seed contains: See `.opencode/skills/sandbox-bootstrap/SKILL.md` for the full authoring rules. +## Sandbox recipe + +After bootstrap, Phase 1b writes `itemdb/notes/sandbox-recipe.yml`. +This machine-readable file is the contract between the sandbox and +later harness steps. See `templates/sandbox-recipe.yml.example` +for the schema. + +To validate it: + + tools/sandbox-bootstrap.py recipe-validate + +To print it: + + tools/sandbox-bootstrap.py recipe-print + ## License The files in this `templates/sandboxes/` subtree are licensed under diff --git a/tests/test_sandbox_recipe.py b/tests/test_sandbox_recipe.py new file mode 100644 index 0000000..eec22f9 --- /dev/null +++ b/tests/test_sandbox_recipe.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "tools")) + +from sandbox.recipe import load_recipe, validate_recipe, dump_recipe + + +# -- Minimal valid recipe ------------------------------------------------------- + +VALID_RECIPE = { + "schema_version": 1, + "generated_by": "phase-1b-sandbox", + "validation_model": "docker", + "sandbox": { + "path": "./sandbox", + "managed": True, + "compose_file": "./sandbox/docker-compose.yml", + "default_service": "app", + "workspace_root": "/workspace", + "source_root": "/workspace/src", + }, + "commands": { + "setup": "./sandbox/scripts/setup.sh", + "up": "./sandbox/scripts/up.sh", + "check": "./sandbox/scripts/check.sh", + "build": "./sandbox/scripts/build.sh", + "test": "./sandbox/scripts/test.sh", + "down": "./sandbox/scripts/down.sh", + }, + "build_targets": [ + { + "id": "root", + "description": "Default target", + "source_path": "./src", + "service": "app", + "workdir": "/workspace/src", + "build_command": "./sandbox/scripts/build.sh", + "test_command": "./sandbox/scripts/test.sh", + "environment": { + "type": "docker-compose", + "compose_file": "./sandbox/docker-compose.yml", + "service": "app", + }, + "codeql": { + "supported": True, + "preferred_execution_mode": "docker-inside", + "install_strategy": "mount-host-bundle", + "notes": [], + }, + }, + ], + "codeql": { + "supported": True, + "default_execution_mode": "docker-inside", + "install_strategy": "mount-host-bundle", + "notes": [], + }, + "limitations": [], +} + + +def _write_recipe(tmp_path: Path, recipe: dict) -> Path: + path = tmp_path / "sandbox-recipe.yml" + path.write_text(dump_recipe(recipe), encoding="utf-8") + return path + + +class TestLoadRecipe: + def test_load_valid_recipe(self, tmp_path: Path) -> None: + path = _write_recipe(tmp_path, VALID_RECIPE) + data = load_recipe(path) + assert data["schema_version"] == 1 + assert data["validation_model"] == "docker" + + def test_load_missing_file(self, tmp_path: Path) -> None: + path = tmp_path / "nonexistent.yml" + try: + load_recipe(path) + assert False, "Expected ValueError" + except ValueError as exc: + assert "Failed to read" in str(exc) + + def test_load_not_a_mapping(self, tmp_path: Path) -> None: + path = tmp_path / "bad.yml" + path.write_text("- list\n- not\n- a mapping\n", encoding="utf-8") + try: + load_recipe(path) + assert False, "Expected ValueError" + except ValueError as exc: + assert "YAML mapping" in str(exc) + + +class TestValidateRecipe: + def test_valid_recipe_passes(self, tmp_path: Path) -> None: + _setup_fake_paths(tmp_path) + errors = validate_recipe(VALID_RECIPE, root=str(tmp_path)) + assert errors == [], f"Unexpected errors: {errors}" + + def test_unsupported_schema_version(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, schema_version=99) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("unsupported schema_version" in e for e in errors) + + def test_missing_validation_model(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE) + del recipe["validation_model"] + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("missing or empty 'validation_model'" in e for e in errors) + + def test_invalid_validation_model(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, validation_model="bogus") + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("invalid validation_model" in e for e in errors) + + def test_missing_sandbox_section(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE) + del recipe["sandbox"] + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("non-mapping 'sandbox'" in e for e in errors) + + def test_sandbox_path_does_not_exist(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE) + recipe["sandbox"] = dict(recipe["sandbox"], path="./missing-dir") + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("does not exist" in e for e in errors) + + def test_empty_build_targets_for_docker_model(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, build_targets=[]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("requires at least one target" in e for e in errors) + + def test_empty_build_targets_ok_for_static_only(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, validation_model="static-only", build_targets=[]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert errors == [] + + def test_duplicate_build_target_id(self, tmp_path: Path) -> None: + target = VALID_RECIPE["build_targets"][0] + recipe = dict(VALID_RECIPE, build_targets=[target, target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("duplicate build_target id" in e for e in errors) + + def test_build_target_missing_id(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, build_targets=[{"not_id": "root"}]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("missing or empty 'id'" in e for e in errors) + + def test_build_target_missing_source_path(self, tmp_path: Path) -> None: + target = dict(VALID_RECIPE["build_targets"][0]) + del target["source_path"] + recipe = dict(VALID_RECIPE, build_targets=[target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("missing or empty 'source_path'" in e for e in errors) + + def test_build_target_source_path_does_not_exist(self, tmp_path: Path) -> None: + target = dict(VALID_RECIPE["build_targets"][0], source_path="./does-not-exist") + recipe = dict(VALID_RECIPE, build_targets=[target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("does not exist" in e for e in errors) + + def test_workdir_not_absolute(self, tmp_path: Path) -> None: + target = dict(VALID_RECIPE["build_targets"][0], workdir="src") + recipe = dict(VALID_RECIPE, build_targets=[target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("must be absolute" in e for e in errors) + + def test_invalid_install_strategy(self, tmp_path: Path) -> None: + target = dict(VALID_RECIPE["build_targets"][0]) + target["codeql"] = dict(target["codeql"], install_strategy="ftp") + recipe = dict(VALID_RECIPE, build_targets=[target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("install_strategy" in e and "invalid" in e for e in errors) + + def test_invalid_preferred_execution_mode(self, tmp_path: Path) -> None: + target = dict(VALID_RECIPE["build_targets"][0]) + target["codeql"] = dict(target["codeql"], preferred_execution_mode="quantum") + recipe = dict(VALID_RECIPE, build_targets=[target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("preferred_execution_mode" in e and "invalid" in e for e in errors) + + def test_non_list_build_targets(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, build_targets={"not": "a list"}) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("non-list 'build_targets'" in e for e in errors) + + def test_non_mapping_build_target_entry(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, build_targets=["not a mapping"]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("is not a mapping" in e for e in errors) + + def test_bad_commands_section(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, commands="not-a-mapping") + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("must be a mapping" in e for e in errors) + + def test_bad_limitations_section(self, tmp_path: Path) -> None: + recipe = dict(VALID_RECIPE, limitations="not-a-list") + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("must be a list" in e for e in errors) + + def test_missing_workdir(self, tmp_path: Path) -> None: + target = dict(VALID_RECIPE["build_targets"][0]) + del target["workdir"] + recipe = dict(VALID_RECIPE, build_targets=[target]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert any("missing or empty 'workdir'" in e for e in errors) + + def test_valid_recipe_with_multiple_targets(self, tmp_path: Path) -> None: + target2 = { + "id": "cli", + "description": "CLI build target", + "source_path": "./src", + "service": "app", + "workdir": "/workspace/src/cli", + "build_command": "./sandbox/scripts/build-cli.sh", + "test_command": "./sandbox/scripts/test-cli.sh", + "environment": { + "type": "docker-compose", + "compose_file": "./sandbox/docker-compose.yml", + "service": "app", + }, + "codeql": { + "supported": True, + "preferred_execution_mode": "docker-inside", + "install_strategy": "mount-host-bundle", + "notes": [], + }, + } + recipe = dict(VALID_RECIPE, build_targets=[VALID_RECIPE["build_targets"][0], target2]) + _setup_fake_paths(tmp_path) + errors = validate_recipe(recipe, root=str(tmp_path)) + assert errors == [], f"Unexpected errors: {errors}" + + +def _setup_fake_paths(tmp_path: Path) -> None: + (tmp_path / "sandbox").mkdir(exist_ok=True) + (tmp_path / "src").mkdir(exist_ok=True) diff --git a/tools/sandbox-bootstrap.py b/tools/sandbox-bootstrap.py index 87a5365..84d193d 100755 --- a/tools/sandbox-bootstrap.py +++ b/tools/sandbox-bootstrap.py @@ -9,13 +9,15 @@ target-specific sandbox at sandbox/. Subcommands: - list List available sandbox examples. - inspect Print manifest and previews for one example. - detect Scan workspace and propose ranked sandbox candidates. - apply Copy an example into sandbox/. - validate Run validation tiers. - regenerate Re-apply current sandbox example. - status Print sandbox provenance and Phase 2 gate result. + list List available sandbox examples. + inspect Print manifest and previews for one example. + detect Scan workspace and propose ranked sandbox candidates. + apply Copy an example into sandbox/. + validate Run validation tiers. + regenerate Re-apply current sandbox example. + status Print sandbox provenance and Phase 2 gate result. + recipe-validate Validate itemdb/notes/sandbox-recipe.yml. + recipe-print Print the sandbox recipe. Environment variables: CODECOME_ALLOW_NO_SANDBOX Skip Phase 2 sandbox gate. @@ -50,9 +52,10 @@ import _colors as C ROOT = Path(__file__).resolve().parents[1] +SANDBOX_NOTES_PATH = NOTES_ROOT = ROOT / "itemdb" / "notes" +SANDBOX_RECIPE_PATH = NOTES_ROOT / "sandbox-recipe.yml" TEMPLATES_ROOT = ROOT / "templates" / "sandboxes" SANDBOX_ROOT = ROOT / "sandbox" -NOTES_ROOT = ROOT / "itemdb" / "notes" SRC_ROOT = ROOT / "src" PROVENANCE_FILE = SANDBOX_ROOT / "CODECOME-GENERATED.md" @@ -1556,6 +1559,52 @@ def cmd_validate(args: argparse.Namespace) -> int: return 0 if overall_outcome == "passed" else 1 +def cmd_recipe_validate(args: argparse.Namespace) -> int: + path = Path(args.path) if hasattr(args, "path") and args.path else SANDBOX_RECIPE_PATH + if not path.is_file(): + print(C.fail(f"Sandbox recipe not found at {path}"), file=sys.stderr) + return 1 + + try: + from sandbox.recipe import load_recipe, validate_recipe + recipe = load_recipe(path) + except Exception as exc: + print(C.fail(f"Failed to load recipe: {exc}"), file=sys.stderr) + return 1 + + errors = validate_recipe(recipe, root=str(ROOT)) + if errors: + print(C.fail(f"Sandbox recipe at {path} has {len(errors)} validation error(s):"), file=sys.stderr) + for err in errors: + print(f" {C.SYM_BULLET} {err}") + return 1 + + print(C.ok(f"Sandbox recipe at {path} is valid.")) + return 0 + + +def cmd_recipe_print(args: argparse.Namespace) -> int: + path = Path(args.path) if hasattr(args, "path") and args.path else SANDBOX_RECIPE_PATH + if not path.is_file(): + print(C.fail(f"Sandbox recipe not found at {path}"), file=sys.stderr) + return 1 + + try: + from sandbox.recipe import load_recipe + recipe = load_recipe(path) + except Exception as exc: + print(C.fail(f"Failed to load recipe: {exc}"), file=sys.stderr) + return 1 + + if args.format == "json": + _emit(recipe, "json") + else: + from sandbox.recipe import dump_recipe + print(dump_recipe(recipe).rstrip()) + + return 0 + + def cmd_not_implemented(args: argparse.Namespace) -> int: name = getattr(args, "command", "") print( @@ -1694,6 +1743,32 @@ def build_parser() -> argparse.ArgumentParser: ) p_status.set_defaults(func=cmd_status) + p_recipe_validate = sub.add_parser( + "recipe-validate", + parents=[common], + help="Validate itemdb/notes/sandbox-recipe.yml.", + ) + p_recipe_validate.add_argument( + "path", + nargs="?", + default=str(SANDBOX_RECIPE_PATH), + help=f"Path to the recipe file. Defaults to {SANDBOX_RECIPE_PATH}.", + ) + p_recipe_validate.set_defaults(func=cmd_recipe_validate) + + p_recipe_print = sub.add_parser( + "recipe-print", + parents=[common], + help="Print the sandbox recipe.", + ) + p_recipe_print.add_argument( + "path", + nargs="?", + default=str(SANDBOX_RECIPE_PATH), + help=f"Path to the recipe file. Defaults to {SANDBOX_RECIPE_PATH}.", + ) + p_recipe_print.set_defaults(func=cmd_recipe_print) + return parser diff --git a/tools/sandbox/__init__.py b/tools/sandbox/__init__.py new file mode 100644 index 0000000..a6da2cd --- /dev/null +++ b/tools/sandbox/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2025-2026 Pablo Ruiz Garcia +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later +"""Sandbox recipe loading and validation.""" diff --git a/tools/sandbox/recipe.py b/tools/sandbox/recipe.py new file mode 100644 index 0000000..a101f15 --- /dev/null +++ b/tools/sandbox/recipe.py @@ -0,0 +1,200 @@ +# Copyright (C) 2025-2026 Pablo Ruiz Garcia +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +"""Sandbox recipe: load and validate sandbox-recipe.yml.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +try: + import yaml +except ImportError: # pragma: no cover + yaml = None # type: ignore[assignment] + +SUPPORTED_SCHEMA_VERSIONS = frozenset({1}) +VALID_VALIDATION_MODELS = frozenset({"docker", "static-only", "nested-virt"}) +VALID_INSTALL_STRATEGIES = frozenset({"mount-host-bundle", "copy-host-bundle", "image-preinstalled"}) +VALID_EXECUTION_MODES = frozenset({"host", "docker-inside", "docker-wrapper", "unavailable"}) + + +def _require_yaml() -> None: + if yaml is None: + raise RuntimeError("PyYAML is required to load sandbox recipes.") + + +def load_recipe(path: str | Path) -> dict[str, Any]: + """Load a sandbox-recipe.yml file as a mapping. + + Returns the parsed dict. Does not validate (call ``validate_recipe`` separately). + """ + _require_yaml() + path = Path(path) + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc: + raise ValueError(f"Failed to read sandbox recipe at {path}: {exc}") from exc + if not isinstance(data, dict): + raise ValueError(f"Sandbox recipe at {path} must be a YAML mapping") + return data + + +def validate_recipe(recipe: dict[str, Any], *, root: str | Path) -> list[str]: + """Validate a loaded sandbox-recipe dict. + + Returns a list of error strings (empty = valid). ``root`` is the + workspace root path used to resolve relative paths. + """ + errors: list[str] = [] + + # --- schema_version --- + version = recipe.get("schema_version") + if version not in SUPPORTED_SCHEMA_VERSIONS: + supported = ", ".join(str(v) for v in sorted(SUPPORTED_SCHEMA_VERSIONS)) + errors.append( + f"sandbox-recipe.yml: unsupported schema_version {version!r} (supported: {supported})" + ) + + # --- validation_model --- + validation_model = recipe.get("validation_model") + if not isinstance(validation_model, str) or not validation_model.strip(): + errors.append("sandbox-recipe.yml: missing or empty 'validation_model'") + elif validation_model not in VALID_VALIDATION_MODELS: + valid = ", ".join(sorted(VALID_VALIDATION_MODELS)) + errors.append( + f"sandbox-recipe.yml: invalid validation_model {validation_model!r} (allowed: {valid})" + ) + + # --- sandbox block --- + sandbox = recipe.get("sandbox") + if not isinstance(sandbox, dict): + errors.append("sandbox-recipe.yml: missing or non-mapping 'sandbox' section") + else: + sandbox_path_str = sandbox.get("path") + if not isinstance(sandbox_path_str, str) or not sandbox_path_str: + errors.append("sandbox-recipe.yml: sandbox.path is missing or empty") + else: + sandbox_path = Path(root) / sandbox_path_str + if not sandbox_path.exists(): + errors.append( + f"sandbox-recipe.yml: sandbox.path {sandbox_path_str!r} does not exist" + ) + + # --- commands block (optional but warn on missing known keys) --- + commands = recipe.get("commands") + if commands is not None and not isinstance(commands, dict): + errors.append("sandbox-recipe.yml: 'commands' must be a mapping") + + # --- build_targets --- + build_targets = recipe.get("build_targets") + buildless = validation_model in ("static-only",) + + if not isinstance(build_targets, list): + if not buildless: + errors.append("sandbox-recipe.yml: missing or non-list 'build_targets'") + elif len(build_targets) == 0 and not buildless: + errors.append( + "sandbox-recipe.yml: 'build_targets' is empty but validation_model requires at least one target" + ) + elif isinstance(build_targets, list): + errors.extend(_validate_build_targets(build_targets, root)) + + # --- codeql block (optional) --- + codeql = recipe.get("codeql") + if codeql is not None: + if not isinstance(codeql, dict): + errors.append("sandbox-recipe.yml: 'codeql' must be a mapping") + else: + errors.extend(_validate_codeql_hints(codeql, prefix="codeql")) + + # --- limitations (optional) --- + limitations = recipe.get("limitations") + if limitations is not None and not isinstance(limitations, list): + errors.append("sandbox-recipe.yml: 'limitations' must be a list") + + return errors + + +def _validate_build_targets(targets: list[Any], root: str | Path) -> list[str]: + errors: list[str] = [] + seen_ids: set[str] = set() + + for i, target in enumerate(targets): + if not isinstance(target, dict): + errors.append( + f"sandbox-recipe.yml: build_targets[{i}] is not a mapping" + ) + continue + + target_id = target.get("id") + if not isinstance(target_id, str) or not target_id: + errors.append( + f"sandbox-recipe.yml: build_targets[{i}] missing or empty 'id'" + ) + continue + + if target_id in seen_ids: + errors.append( + f"sandbox-recipe.yml: duplicate build_target id {target_id!r}" + ) + seen_ids.add(target_id) + + # source_path + source_path_str = target.get("source_path") + if not isinstance(source_path_str, str) or not source_path_str: + errors.append( + f"sandbox-recipe.yml: build_target {target_id!r} missing or empty 'source_path'" + ) + else: + source_path = Path(root) / source_path_str + if not source_path.exists(): + errors.append( + f"sandbox-recipe.yml: build_target {target_id!r} source_path {source_path_str!r} does not exist" + ) + + # workdir must be absolute inside the sandbox + workdir = target.get("workdir") + if not isinstance(workdir, str) or not workdir: + errors.append( + f"sandbox-recipe.yml: build_target {target_id!r} missing or empty 'workdir'" + ) + elif not workdir.startswith("/"): + errors.append( + f"sandbox-recipe.yml: build_target {target_id!r} workdir {workdir!r} must be absolute (e.g. /workspace/src)" + ) + + # codeql hints + codeql = target.get("codeql") + if isinstance(codeql, dict): + errors.extend(_validate_codeql_hints(codeql, prefix=f"build_targets[{i}].codeql")) + + return errors + + +def _validate_codeql_hints(codeql: dict[str, Any], prefix: str) -> list[str]: + errors: list[str] = [] + + install_strategy = codeql.get("install_strategy") + if install_strategy is not None: + if not isinstance(install_strategy, str) or install_strategy not in VALID_INSTALL_STRATEGIES: + valid = ", ".join(sorted(VALID_INSTALL_STRATEGIES)) + errors.append( + f"sandbox-recipe.yml: {prefix}.install_strategy {install_strategy!r} invalid (allowed: {valid})" + ) + + preferred_mode = codeql.get("preferred_execution_mode") + if preferred_mode is not None: + if not isinstance(preferred_mode, str) or preferred_mode not in VALID_EXECUTION_MODES: + valid = ", ".join(sorted(VALID_EXECUTION_MODES)) + errors.append( + f"sandbox-recipe.yml: {prefix}.preferred_execution_mode {preferred_mode!r} invalid (allowed: {valid})" + ) + + return errors + + +def dump_recipe(recipe: dict[str, Any]) -> str: + """Serialize a recipe dict to YAML string.""" + _require_yaml() + return yaml.safe_dump(recipe, sort_keys=False) From 71296ee6cc9193586df4bf5a3a723b87941a8dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:22:27 +0200 Subject: [PATCH 03/13] feat(codeql-plan): bump schema to v2 + v1 hard-fail tools/codeql/packs.py: load_codeql_plan requires schema_version: 2, v1 raises PackResolverError with clear upgrade guidance. templates/codeql-plan.yml: updated to v2 schema with build_provider, sandbox_build_target, and new commentary. tools/phases/phase_1_gates.py: gate 1a now calls load_codeql_plan() so v1 plans are rejected at the gate with the upgrade error message. prompts/phase-1a-profile.md: updated to v2 rules, including sandbox_build_target, build_provider, and new language about sandbox being built in Phase 1b. Tests: all plan fixtures bumped to v2, new test_schema_v1_rejected_at_gate_1a. --- prompts/phase-1a-profile.md | 13 ++++++--- templates/codeql-plan.yml | 17 +++++++++-- tests/test_codecome_check_codeql.py | 2 +- tests/test_codeql_packs.py | 11 ++++++-- tests/test_codeql_runner.py | 6 ++-- tests/test_phase_1_codeql_plan_repair.py | 8 +++--- tests/test_phase_1_gates.py | 36 ++++++++++++++++++++++-- tools/codeql/packs.py | 17 ++++++++++- tools/phases/phase_1_gates.py | 11 +++----- 9 files changed, 93 insertions(+), 28 deletions(-) diff --git a/prompts/phase-1a-profile.md b/prompts/phase-1a-profile.md index 5b32508..12afc00 100644 --- a/prompts/phase-1a-profile.md +++ b/prompts/phase-1a-profile.md @@ -59,11 +59,17 @@ Create `itemdb/notes/codeql-plan.yml` by filling in the template from `templates Rules: +- Set `schema_version: 2`. The v2 schema adds two new optional fields (see below). - Discover analysis units under `./src`. An analysis unit is a coherent project/component with one source root and one or more languages/stacks, such as an API service, frontend app, native library, CLI, package, firmware tree, or benchmark corpus. - Use stable, lowercase `analysis_units[].id` values such as `api`, `frontend`, `native-lib`, or `root`. These IDs are discovered here; users do not define them in `codecome.yml`. - Set `analysis_units[].path` to the real source path under `./src` for that unit. Do not use CodeQL-generated helper paths such as `_codeql_detected_source_root`. - Use one `analysis_units` entry for a single-project repository and multiple entries for monorepos or mixed stacks. - Only include languages you have detected with **HIGH** or **MEDIUM** confidence. +- For compiled languages (c-cpp, go, csharp, java-kotlin, swift) set `analysis_units[].sandbox_build_target` to the `build_targets[].id` from `sandbox-recipe.yml` that provides the build command for this unit. If the recipe has not been generated yet (this is Phase 1a), pick a sensible id such as `root` — Phase 1b will flesh out the recipe and the id can be updated if needed. +- For each language, set `build_provider`: + - `"sandbox-recipe"` — for compiled languages whose build command should be resolved from `sandbox-recipe.yml` after Phase 1b. Leave `build_command` empty (the runner resolves it from the recipe). + - `"none"` — for no-build languages (python, javascript-typescript, ruby). +- Avoid concrete build shell snippets in `build_command` unless the build is obvious and stable and no recipe is available. Prefer `build_provider: sandbox-recipe` for everything that needs a build. - For each language in each analysis unit, select the appropriate pack profiles: - `official` — always include for languages with CodeQL support. - `github-security-lab` — include for security-focused audits. @@ -92,13 +98,12 @@ Rules: - Do not assume the target is a web application. - Do not modify files under `src/`. - Do not generate vulnerability findings. -- Do not produce full reconnaissance notes (attack-surface, trust-boundaries, etc.) — those are Phase 1b. -- Do not bootstrap the sandbox — that is Phase 1c. -- Do not run CodeQL manually. The harness runs it after this sub-stage. +- Do not produce full reconnaissance notes (attack-surface, trust-boundaries, etc.) — those are Phase 1c. +- The sandbox will be built by Phase 1b. Do not attempt sandbox work here. +- Do not run CodeQL manually. The harness runs it after Phase 1b. - Be explicit about uncertainty. - Prefer useful notes over exhaustive dumps. - Focus on what later sub-stages need. -- Phase 1a does not produce `threat-model.md`. - Phase 1a does not produce attack-surface, trust-boundary, or data-flow notes. - Phase 1a does not bootstrap sandbox. - Non-blocking open questions should go into the run summary file. diff --git a/templates/codeql-plan.yml b/templates/codeql-plan.yml index 7be3fc0..7a21724 100644 --- a/templates/codeql-plan.yml +++ b/templates/codeql-plan.yml @@ -2,7 +2,7 @@ # The model fills in language entries based on source tree analysis. # Consumed by CodeQL run orchestration (tools/codeql/runner.py). -schema_version: 1 +schema_version: 2 generated_by: "phase-1a-profile" source_path: "./src" @@ -16,10 +16,12 @@ analysis_units: [] # path: "./src/api" # kind: "service" # primary: true +# sandbox_build_target: "root" # references a build_target id in sandbox-recipe.yml # languages: # - id: "python" # confidence: "HIGH" # build_mode: "none" +# build_provider: "none" # "sandbox-recipe" or "none" # build_command: null # packs: # - "official" @@ -29,11 +31,13 @@ analysis_units: [] # path: "./src/native" # kind: "library" # primary: false +# sandbox_build_target: "native-lib" # references a build_target id in sandbox-recipe.yml # languages: # - id: "c-cpp" # confidence: "HIGH" # build_mode: "manual" -# build_command: "make -C src/native" +# build_provider: "sandbox-recipe" # resolved from sandbox-recipe.yml after Phase 1b +# build_command: null # derived from recipe; leave empty when build_provider is "sandbox-recipe" # db_create_timeout: 1800 # optional: seconds, model-estimated from source size # analyze_timeout: 900 # optional: seconds, per query-profile run # packs: @@ -42,6 +46,15 @@ analysis_units: [] # - "trailofbits" # - "coding-standards" # +# v2 changes: +# - sandbox_build_target on analysis units links to sandbox-recipe.yml build_targets. +# - build_provider on language entries controls where the build command comes from. +# Use "sandbox-recipe" for compiled languages (the sandbox provides the build command). +# Use "none" for no-build languages (python, javascript-typescript, ruby). +# - build_command should be left empty when build_provider is "sandbox-recipe" +# (the runner resolves it from the recipe). Only provide build_command directly +# when the build cannot be expressed through the recipe. +# # Allowed language IDs: python, javascript-typescript, ruby, c-cpp, go, csharp, java-kotlin, swift # Allowed confidence values: HIGH, MEDIUM, LOW # Allowed build_mode values by language: diff --git a/tests/test_codecome_check_codeql.py b/tests/test_codecome_check_codeql.py index 596e6ba..f0fe638 100644 --- a/tests/test_codecome_check_codeql.py +++ b/tests/test_codecome_check_codeql.py @@ -85,7 +85,7 @@ def test_codeql_check_fails_failed_artifacts(tmp_path: Path, capsys) -> None: notes = tmp_path / "itemdb" / "notes" notes.mkdir(parents=True) (notes / "codeql-plan.yml").write_text( - "schema_version: 1\nanalysis_units:\n - id: root\n path: ./src\n languages:\n - id: python\n packs:\n - official\n", + "schema_version: 2\nanalysis_units:\n - id: root\n path: ./src\n languages:\n - id: python\n packs:\n - official\n", encoding="utf-8", ) manifest_dir = config.abs_output_dir diff --git a/tests/test_codeql_packs.py b/tests/test_codeql_packs.py index c2b0a78..87e6cce 100644 --- a/tests/test_codeql_packs.py +++ b/tests/test_codeql_packs.py @@ -43,7 +43,7 @@ def _write_catalog(path: Path) -> None: def _write_plan(path: Path) -> None: path.write_text( ( - "schema_version: 1\n" + "schema_version: 2\n" "analysis_units:\n" " - id: root\n" " path: ./src\n" @@ -179,7 +179,12 @@ def test_resolve_plan_packs_candidate_policy(tmp_path: Path) -> None: def test_load_codeql_plan_rejects_invalid_language_entry(tmp_path: Path) -> None: plan_path = tmp_path / "bad-plan.yml" - plan_path.write_text("analysis_units:\n - nope\n", encoding="utf-8") + plan_path.write_text( + "schema_version: 2\n" + "analysis_units:\n" + " - nope\n", + encoding="utf-8", + ) try: load_codeql_plan(plan_path) @@ -193,7 +198,7 @@ def test_load_codeql_plan_allows_non_recommended_unit_without_languages(tmp_path plan_path = tmp_path / "plan.yml" plan_path.write_text( ( - "schema_version: 1\n" + "schema_version: 2\n" "analysis_units:\n" " - id: api\n" " path: ./src/api\n" diff --git a/tests/test_codeql_runner.py b/tests/test_codeql_runner.py index 6e6de13..a4eb849 100644 --- a/tests/test_codeql_runner.py +++ b/tests/test_codeql_runner.py @@ -295,7 +295,7 @@ def test_run_codeql_database_failure_honors_soft_policy(tmp_path: Path) -> None: plan_path = tmp_path / "itemdb" / "notes" / "codeql-plan.yml" plan_path.parent.mkdir(parents=True) - plan_path.write_text("schema_version: 1\n", encoding="utf-8") + plan_path.write_text("schema_version: 2\nanalysis_units:\n - id: root\n path: ./src\n languages:\n - id: python\n packs:\n - official\n", encoding="utf-8") catalog = tmp_path / "templates" / "codeql-packs.yml" catalog.parent.mkdir(parents=True) @@ -382,7 +382,7 @@ def test_run_codeql_empty_languages_returns_skipped(tmp_path: Path) -> None: plan_path = tmp_path / "itemdb" / "notes" / "codeql-plan.yml" plan_path.parent.mkdir(parents=True) - plan_path.write_text("schema_version: 1\nanalysis_units: []\n", encoding="utf-8") + plan_path.write_text("schema_version: 2\nanalysis_units: []\n", encoding="utf-8") catalog = tmp_path / "templates" / "codeql-packs.yml" catalog.parent.mkdir(parents=True) @@ -415,7 +415,7 @@ def test_run_codeql_pack_resolver_error_soft_policy(tmp_path: Path) -> None: plan_path = tmp_path / "itemdb" / "notes" / "codeql-plan.yml" plan_path.parent.mkdir(parents=True) - plan_path.write_text("schema_version: 1\n", encoding="utf-8") + plan_path.write_text("schema_version: 2\nanalysis_units:\n - id: root\n path: ./src\n languages:\n - id: python\n packs:\n - official\n", encoding="utf-8") catalog = tmp_path / "templates" / "codeql-packs.yml" catalog.parent.mkdir(parents=True) diff --git a/tests/test_phase_1_codeql_plan_repair.py b/tests/test_phase_1_codeql_plan_repair.py index f34a206..0ae6faf 100644 --- a/tests/test_phase_1_codeql_plan_repair.py +++ b/tests/test_phase_1_codeql_plan_repair.py @@ -19,7 +19,7 @@ def _write_invalid_plan(root: Path) -> None: plan = root / "itemdb" / "notes" / "codeql-plan.yml" plan.parent.mkdir(parents=True, exist_ok=True) plan.write_text( - "schema_version: 1\n" + "schema_version: 2\n" "analysis_units:\n" " - id: native\n" " path: ./src/native\n" @@ -36,7 +36,7 @@ def _write_valid_plan(root: Path) -> None: plan = root / "itemdb" / "notes" / "codeql-plan.yml" plan.parent.mkdir(parents=True, exist_ok=True) plan.write_text( - "schema_version: 1\n" + "schema_version: 2\n" "analysis_units:\n" " - id: native\n" " path: ./src/native\n" @@ -76,7 +76,7 @@ def _write_manual_plan(root: Path, build_command: str) -> None: plan.write_text( yaml.safe_dump( { - "schema_version": 1, + "schema_version": 2, "analysis_units": [ { "id": "native", @@ -371,7 +371,7 @@ def _write_plan_with_build_mode(root: Path, build_mode: str | None, build_comman import yaml as _yaml data: dict = { - "schema_version": 1, + "schema_version": 2, "analysis_units": [ { "id": "native", diff --git a/tests/test_phase_1_gates.py b/tests/test_phase_1_gates.py index 62b4319..795caf1 100644 --- a/tests/test_phase_1_gates.py +++ b/tests/test_phase_1_gates.py @@ -32,7 +32,7 @@ def test_unsupported_language_soft_policy_warns_not_fails(tmp_path: Path, capsys (notes / "target-profile.md").write_text("profile", encoding="utf-8") (notes / "build-model.md").write_text("model", encoding="utf-8") (notes / "codeql-plan.yml").write_text( - "schema_version: 1\n" + "schema_version: 2\n" "recommended: true\n" "analysis_units:\n" " - id: gilroy\n" @@ -65,7 +65,7 @@ def test_unsupported_language_hard_policy_fails(tmp_path: Path, capsys) -> None: (notes / "target-profile.md").write_text("profile", encoding="utf-8") (notes / "build-model.md").write_text("model", encoding="utf-8") (notes / "codeql-plan.yml").write_text( - "schema_version: 1\n" + "schema_version: 2\n" "recommended: true\n" "analysis_units:\n" " - id: gilroy\n" @@ -98,7 +98,7 @@ def test_non_recommended_unit_without_languages_is_skipped(tmp_path: Path, capsy (notes / "target-profile.md").write_text("profile", encoding="utf-8") (notes / "build-model.md").write_text("model", encoding="utf-8") (notes / "codeql-plan.yml").write_text( - "schema_version: 1\n" + "schema_version: 2\n" "recommended: true\n" "analysis_units:\n" " - id: api\n" @@ -129,3 +129,33 @@ def test_non_recommended_unit_without_languages_is_skipped(tmp_path: Path, capsy out = capsys.readouterr().out assert rc == 0 assert "not recommended for CodeQL" in out + + +def test_schema_v1_rejected_at_gate_1a(tmp_path: Path, capsys) -> None: + notes = tmp_path / "itemdb" / "notes" + notes.mkdir(parents=True) + (notes / "target-profile.md").write_text("profile", encoding="utf-8") + (notes / "build-model.md").write_text("model", encoding="utf-8") + (notes / "codeql-plan.yml").write_text( + "schema_version: 1\n" + "recommended: true\n" + "analysis_units:\n" + " - id: api\n" + " path: ./src\n" + " languages:\n" + " - id: python\n" + " packs:\n" + " - official\n", + encoding="utf-8", + ) + + (tmp_path / "src").mkdir() + + from phases.phase_1_gates import check_phase_1a + + with patch("phases.phase_1_gates.ROOT", tmp_path): + rc = check_phase_1a() + + out = capsys.readouterr().out + assert rc == 1 + assert "schema_version" in out diff --git a/tools/codeql/packs.py b/tools/codeql/packs.py index 02c0419..bfc5e5c 100644 --- a/tools/codeql/packs.py +++ b/tools/codeql/packs.py @@ -81,9 +81,24 @@ def load_pack_catalog(path: Path) -> dict[str, Any]: def load_codeql_plan(path: Path) -> dict[str, Any]: - """Load and validate a CodeQL plan file.""" + """Load and validate a CodeQL plan file. + + Only schema v2 is supported. Earlier versions raise an actionable + upgrade error so the user re-runs Phase 1a. + """ data = load_yaml_mapping(path, what="CodeQL plan") + version = data.get("schema_version") + if version != 2: + msg = ( + f"CodeQL plan at {path} has schema_version: {version!r}. " + f"Only schema_version: 2 is supported after the CodeQL recipe " + f"refactor. Please delete the existing plan and re-run Phase 1a " + f"(``make phase-1``) to regenerate a v2 plan that works with the " + f"new sandbox-recipe integration." + ) + raise PackResolverError(msg) + units = data.get("analysis_units") if not isinstance(units, list): raise PackResolverError(f"CodeQL plan at {path} must define 'analysis_units' as a list.") diff --git a/tools/phases/phase_1_gates.py b/tools/phases/phase_1_gates.py index bd0bf8a..81b10b0 100644 --- a/tools/phases/phase_1_gates.py +++ b/tools/phases/phase_1_gates.py @@ -225,13 +225,10 @@ def check_phase_1a(console=None, findings_snapshot: dict[str, int] | None = None out.warn("Cannot validate codeql-plan.yml: PyYAML not available") else: try: - plan = yaml.safe_load(plan_path.read_text(encoding="utf-8")) - except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc: - out.error(f"codeql-plan.yml is not valid YAML: {exc}") - return 1 - - if not isinstance(plan, dict): - out.error("codeql-plan.yml is not a mapping") + from codeql.packs import load_codeql_plan + plan = load_codeql_plan(plan_path) + except Exception as exc: + out.error(f"codeql-plan.yml: {exc}") return 1 if plan.get("recommended") is True: From 976f3d27c2ab62439d5a98bb83b7a926b75d908e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:25:43 +0200 Subject: [PATCH 04/13] feat(sandbox-bootstrap): require recipe output in Phase 1b Rename prompts/phase-1c-sandbox.md -> prompts/phase-1b-sandbox.md. Update content to reflect new position (second sub-stage instead of third). Add sandbox-recipe.yml as a first-class required output alongside sandbox-plan.md. Add recipe schema rules, per-target guidance, and validation instructions. Update test_phase_1c_reads_threat_model to use the renamed prompt file. --- ...hase-1c-sandbox.md => phase-1b-sandbox.md} | 41 ++++++++++++++++--- tests/test_phase_1_prompts_threat_model.py | 2 +- 2 files changed, 37 insertions(+), 6 deletions(-) rename prompts/{phase-1c-sandbox.md => phase-1b-sandbox.md} (75%) diff --git a/prompts/phase-1c-sandbox.md b/prompts/phase-1b-sandbox.md similarity index 75% rename from prompts/phase-1c-sandbox.md rename to prompts/phase-1b-sandbox.md index 76b2a6b..c3b36ad 100644 --- a/prompts/phase-1c-sandbox.md +++ b/prompts/phase-1b-sandbox.md @@ -1,8 +1,10 @@ -# CodeCome Phase 1c: Sandbox Bootstrap +# CodeCome Phase 1b: Sandbox Bootstrap -You are performing CodeCome **Phase 1c** — the third and final sub-stage of Phase 1. +You are performing CodeCome **Phase 1b** — the second sub-stage of Phase 1. -This sub-stage bootstraps the sandbox environment. Phase 1a produced the target profile and build model. Phase 1b produced the full reconnaissance notes. Your job is to leave `sandbox/` in a state where Phase 2 can run. +This sub-stage bootstraps the sandbox environment and produces the machine-readable +sandbox recipe consumed by the CodeQL runner. Phase 1a produced the target profile, +build model, and CodeQL analysis intent. ## Required reading @@ -18,9 +20,38 @@ Read the following files (all paths are relative to the project/workspace root): - `itemdb/notes/execution-model.md` - `itemdb/notes/validation-model.md` -## Required output +## Required outputs - `itemdb/notes/sandbox-plan.md` +- `itemdb/notes/sandbox-recipe.yml` + +The recipe is a **first-class required artifact**. It is the machine-readable +contract consumed by CodeQL and later harness steps. See `templates/sandbox-recipe.yml.example` +for the schema. + +Rules for the recipe: + +- Declare the `validation_model` (docker, static-only, or nested-virt). +- Populate the `sandbox` block with real paths, services, workdirs, and command paths. +- Declare at least one `build_target` (under `build_targets[]`). Each target must have: + `id`, `description`, `source_path`, `service`, `workdir`, `build_command`, `test_command`, + an `environment` block, and a `codeql` block. +- For simple single-project repositories, one `build_target` with `id: root` is correct. +- For multi-component repositories, add one build target per materially distinct build + component. The same `build_command` may be repeated across targets when a single + aggregate build script covers everything. +- For `static-only` targets, `build_targets` may be empty. +- For each compiled-language `build_target` set `codeql.supported: true`, + `codeql.preferred_execution_mode: docker-inside`, and + `codeql.install_strategy: mount-host-bundle`. +- Document limitations in the `limitations` array. + +After writing the recipe, validate it: + + tools/sandbox-bootstrap.py recipe-validate + +This runs by default as part of `sandbox-validate`. If validation fails, repair the recipe +before proceeding. ## Workflow @@ -132,4 +163,4 @@ At the end, summarize: Write the run summary using the template at `templates/run-summary.md` to: - runs/phase-1c-summary-YYYY-MM-DD-HHMMSS.md + runs/phase-1b-summary-YYYY-MM-DD-HHMMSS.md diff --git a/tests/test_phase_1_prompts_threat_model.py b/tests/test_phase_1_prompts_threat_model.py index a08884e..af0df3a 100644 --- a/tests/test_phase_1_prompts_threat_model.py +++ b/tests/test_phase_1_prompts_threat_model.py @@ -62,7 +62,7 @@ def test_phase_1a_does_not_produce_threat_model() -> None: def test_phase_1c_reads_threat_model() -> None: - content = _read_prompt("phase-1c-sandbox.md") + content = _read_prompt("phase-1b-sandbox.md") assert "threat-model.md" in content From f9cfac8a186083ab72098bb91f4bfeb76828bc42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:39:39 +0200 Subject: [PATCH 05/13] feat(codeql): add per-run layout, run-id, and health model tools/codeql/health.py: compute_health() with classifications covering disabled, skipped, failed, extraction-failed, analysis-failed, completed-empty-valid, completed-with-signals, and more. tools/codeql/pipeline.py: generate run_id (UTC timestamp + hash), create per-run directory under itemdb/codeql/runs//, write last-run-manifest.yml and current-run.txt, compute health before importing risk signals. tools/codeql/runner.py: accept optional run_dir parameter; write SARIF, databases, and selected-query-packs.yml into per-run dir. tools/codeql/artifacts.py: prefer last-run-manifest.yml, fall back to run-manifest.yml; consume health block for usability gating. tests/test_codeql_health.py: 11 tests covering all health classifications. tests/test_codeql_pipeline.py: updated for per-run layout. tests/test_codeql_artifacts.py: updated for manifest fallback. --- .gitignore | 1 + tests/test_codeql_artifacts.py | 2 +- tests/test_codeql_health.py | 204 ++++++++++++++++++++++++++ tests/test_codeql_pipeline.py | 114 +++++++++++---- tools/codeql/artifacts.py | 77 +++++++--- tools/codeql/health.py | 258 +++++++++++++++++++++++++++++++++ tools/codeql/pipeline.py | 170 ++++++++++++++++------ tools/codeql/runner.py | 15 +- 8 files changed, 739 insertions(+), 102 deletions(-) create mode 100644 tests/test_codeql_health.py create mode 100644 tools/codeql/health.py diff --git a/.gitignore b/.gitignore index 47faff1..df072b6 100644 --- a/.gitignore +++ b/.gitignore @@ -266,3 +266,4 @@ itemdb/findings/DUPLICATE/* runs/* !runs/.gitkeep +token-usage-output.txt diff --git a/tests/test_codeql_artifacts.py b/tests/test_codeql_artifacts.py index 2a35b6d..b2c8543 100644 --- a/tests/test_codeql_artifacts.py +++ b/tests/test_codeql_artifacts.py @@ -22,7 +22,7 @@ def test_missing_manifest(tmp_path: Path) -> None: status, warnings = check_artifacts(tmp_path / "nonexistent") assert status == "missing" assert len(warnings) == 1 - assert "not found" in warnings[0] + assert "manifest" in warnings[0].lower() def test_completed_all_present(tmp_path: Path) -> None: diff --git a/tests/test_codeql_health.py b/tests/test_codeql_health.py new file mode 100644 index 0000000..2265e8d --- /dev/null +++ b/tests/test_codeql_health.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "tools")) + +from codeql.health import compute_health + + +def _run_dir(tmp_path: Path, **kwargs) -> Path: + d = tmp_path / "runs" / "20250101T000000Z-abcdef01" + d.mkdir(parents=True) + for name in ("sarif", "normalized", "databases", "logs"): + (d / name).mkdir(exist_ok=True) + for sarif_name in kwargs.get("sarif_files", []): + (d / "sarif" / sarif_name).write_text("{}", encoding="utf-8") + for norm_name in kwargs.get("normalized_files", []): + (d / "normalized" / norm_name).write_text("data", encoding="utf-8") + for db_name in kwargs.get("db_dirs", []): + (d / "databases" / db_name).mkdir(parents=True, exist_ok=True) + return d + + +class TestHealthSkipsDisabled: + def test_skipped(self, tmp_path: Path) -> None: + d = _run_dir(tmp_path) + manifest = {"status": "skipped", "languages": [], "failures": [], "warnings": []} + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "skipped" + assert result["usable"] is False + + def test_disabled(self, tmp_path: Path) -> None: + d = _run_dir(tmp_path) + manifest = {"status": "disabled", "languages": [], "failures": [], "warnings": []} + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "disabled" + + +class TestHealthFailed: + def test_failed_no_database(self, tmp_path: Path) -> None: + d = _run_dir(tmp_path) + manifest = { + "status": "failed", + "languages": ["python"], + "failures": ["Database create failed: timeout"], + "warnings": [], + } + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "failed" + assert result["usable"] is False + + def test_failed_no_sarif(self, tmp_path: Path) -> None: + d = _run_dir(tmp_path, db_dirs=["root/python"]) + manifest = { + "status": "completed", + "languages": ["python"], + "failures": [], + "warnings": [], + } + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "failed" + + +class TestHealthExtractionFailed: + def test_compiled_with_no_extraction(self, tmp_path: Path) -> None: + d = _run_dir( + tmp_path, + sarif_files=["root.c-cpp.official.sarif"], + db_dirs=["root/c-cpp"], + normalized_files=["alerts.yml", "file-signals.yml"], + ) + manifest = { + "status": "completed", + "languages": ["c-cpp"], + "failures": [], + "warnings": [], + "extractor_successes": 0, + } + plan = { + "analysis_units": [ + { + "id": "root", + "path": "./src", + "languages": [{"id": "c-cpp"}], + }, + ], + } + result = compute_health( + manifest=manifest, run_dir=d, output_dir=tmp_path, resolved_plan=plan + ) + assert result["classification"] == "extraction-failed" + assert result["usable"] is False + + +class TestHealthUsableEmpty: + def test_zero_alerts_but_fresh(self, tmp_path: Path) -> None: + d = _run_dir( + tmp_path, + sarif_files=["root.python.official.sarif"], + db_dirs=["root/python"], + normalized_files=["alerts.yml", "file-signals.yml"], + ) + manifest = { + "status": "completed", + "languages": ["python"], + "failures": [], + "warnings": [], + "total_alerts": 0, + } + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "completed-empty-valid" + assert result["usable"] is True + + def test_zero_alerts_no_compiled_extractor_check(self, tmp_path: Path) -> None: + d = _run_dir( + tmp_path, + sarif_files=["root.python.official.sarif"], + db_dirs=["root/python"], + normalized_files=["alerts.yml", "file-signals.yml"], + ) + manifest = { + "status": "completed", + "languages": ["python"], + "failures": [], + "warnings": [], + "total_alerts": 0, + "extractor_successes": 0, + } + plan = { + "analysis_units": [ + {"id": "root", "path": "./src", "languages": [{"id": "python"}]}, + ], + } + result = compute_health( + manifest=manifest, run_dir=d, output_dir=tmp_path, resolved_plan=plan + ) + assert result["classification"] == "completed-empty-valid" + assert result["usable"] is True + + +class TestHealthCompletedWithSignals: + def test_alerts_found(self, tmp_path: Path) -> None: + d = _run_dir( + tmp_path, + sarif_files=["root.python.official.sarif"], + db_dirs=["root/python"], + normalized_files=["alerts.yml", "file-signals.yml"], + ) + manifest = { + "status": "completed", + "languages": ["python"], + "failures": [], + "warnings": [], + "total_alerts": 5, + } + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "completed-with-signals" + assert result["usable"] is True + + +class TestHealthAnalysisFailed: + def test_no_profiles_succeeded(self, tmp_path: Path) -> None: + d = _run_dir( + tmp_path, + db_dirs=["root/python"], + ) + manifest = { + "status": "completed", + "languages": ["python"], + "failures": ["Analyze failed for root/python"], + "warnings": [], + } + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "analysis-failed" + assert result["usable"] is False + + +class TestHealthSoftFailed: + def test_soft_failed_with_database_issues(self, tmp_path: Path) -> None: + d = _run_dir(tmp_path) + manifest = { + "status": "soft-failed", + "languages": ["python"], + "failures": ["Database create failed: build error"], + "warnings": [], + } + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert result["classification"] == "soft-failed" + assert result["usable"] is False + + +class TestHealthChecks: + def test_checks_have_expected_keys(self, tmp_path: Path) -> None: + d = _run_dir(tmp_path) + manifest = {"status": "completed", "languages": [], "failures": [], "warnings": []} + result = compute_health(manifest=manifest, run_dir=d, output_dir=tmp_path) + assert isinstance(result["checks"], dict) + assert "database_create_exit_zero" in result["checks"] + assert "database_exists" in result["checks"] + assert "sarif_fresh" in result["checks"] + assert "normalized_fresh" in result["checks"] + assert "has_compiled_languages" in result["checks"] diff --git a/tests/test_codeql_pipeline.py b/tests/test_codeql_pipeline.py index a736b8d..40373f4 100644 --- a/tests/test_codeql_pipeline.py +++ b/tests/test_codeql_pipeline.py @@ -2,7 +2,7 @@ import sys from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import patch ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(ROOT / "tools")) @@ -14,7 +14,6 @@ def _make_config(tmp_path: Path) -> CodeQLConfig: - """Create a minimal CodeQLConfig pointing at tmp_path.""" output_dir = tmp_path / "itemdb" / "codeql" output_dir.mkdir(parents=True, exist_ok=True) return CodeQLConfig( @@ -33,8 +32,18 @@ def _make_config(tmp_path: Path) -> CodeQLConfig: ) +def _last_manifest(output_dir: Path) -> dict: + return yaml.safe_load((output_dir / "last-run-manifest.yml").read_text(encoding="utf-8")) + + +def _run_manifest(output_dir: Path) -> dict | None: + path = output_dir / "run-manifest.yml" + if path.is_file(): + return yaml.safe_load(path.read_text(encoding="utf-8")) + return None + + def test_pipeline_skipped_no_plan(tmp_path: Path) -> None: - """When run_codeql returns skipped, pipeline returns manifest without calling normalize.""" config = _make_config(tmp_path) skipped_manifest = { @@ -61,8 +70,9 @@ def test_pipeline_skipped_no_plan(tmp_path: Path) -> None: result = run_full_pipeline(config) assert result["status"] == "skipped" - mock_run.assert_called_once_with(config, progress=None) + mock_run.assert_called_once() mock_normalize.assert_not_called() + assert (config.abs_output_dir / "last-run-manifest.yml").is_file() def test_pipeline_emits_progress(tmp_path: Path) -> None: @@ -94,14 +104,11 @@ def test_pipeline_emits_progress(tmp_path: Path) -> None: assert result["status"] == "skipped" mock_run.assert_called_once() - assert mock_run.call_args.args == (config,) - assert mock_run.call_args.kwargs["progress"] is not None assert "CodeQL: manifest written" in messages assert "CodeQL: summary written" in messages def test_pipeline_completed_writes_manifest(tmp_path: Path) -> None: - """When run_codeql returns completed, manifest file is written.""" config = _make_config(tmp_path) completed_manifest = { @@ -128,14 +135,13 @@ def test_pipeline_completed_writes_manifest(tmp_path: Path) -> None: result = run_full_pipeline(config) assert result["status"] == "completed" - manifest_path = config.abs_output_dir / "run-manifest.yml" - assert manifest_path.is_file() - data = yaml.safe_load(manifest_path.read_text()) - assert data["status"] == "completed" + last_mani = _last_manifest(config.abs_output_dir) + assert last_mani["status"] == "completed" + assert last_mani["health"]["classification"] == "failed" # no SARIF in dir + assert "run_id" in result def test_pipeline_soft_failed_continues(tmp_path: Path) -> None: - """When run_codeql returns soft-failed, pipeline returns without raising.""" config = _make_config(tmp_path) soft_failed_manifest = { @@ -161,19 +167,11 @@ def test_pipeline_soft_failed_continues(tmp_path: Path) -> None: result = run_full_pipeline(config) assert result["status"] == "soft-failed" - # Should not raise def test_pipeline_normalize_failure_marks_failed_for_hard_policy(tmp_path: Path) -> None: config = _make_config(tmp_path) config.fail_policy = "hard" - (config.abs_output_dir / "selected-query-packs.yml").write_text( - "schema_version: 1\nanalysis_units: []\n", - encoding="utf-8", - ) - sarif_dir = config.abs_output_dir / "sarif" - sarif_dir.mkdir(parents=True) - (sarif_dir / "root.python.official.sarif").write_text("{}", encoding="utf-8") manifest = { "schema_version": 1, @@ -189,30 +187,30 @@ def test_pipeline_normalize_failure_marks_failed_for_hard_policy(tmp_path: Path) "languages": ["root:python"], "warnings": [], "failures": [], + "run_id": "fake-run-id", } + # Pre-create an empty run dir so the pipeline uses it without race + run_dir = config.abs_output_dir / "runs" / "fake-run-id" + run_dir.mkdir(parents=True) + (run_dir / "selected-query-packs.yml").write_text( + "schema_version: 1\nanalysis_units: []\n", encoding="utf-8") + (run_dir / "sarif").mkdir(exist_ok=True) + (run_dir / "sarif" / "root.python.official.sarif").write_text("{}", encoding="utf-8") + with patch("codeql.runner.run_codeql", return_value=manifest), \ patch("codeql.normalize.normalize_all", side_effect=RuntimeError("bad sarif")), \ + patch("codeql.pipeline._generate_run_id", return_value="fake-run-id"), \ patch("codeql.pipeline.ROOT", tmp_path): from codeql.pipeline import run_full_pipeline - result = run_full_pipeline(config) assert result["status"] == "failed" assert "SARIF normalization failed: bad sarif" in result["warnings"] - data = yaml.safe_load((config.abs_output_dir / "run-manifest.yml").read_text()) - assert data["status"] == "failed" def test_pipeline_normalize_failure_marks_soft_failed_for_soft_policy(tmp_path: Path) -> None: config = _make_config(tmp_path) - (config.abs_output_dir / "selected-query-packs.yml").write_text( - "schema_version: 1\nanalysis_units: []\n", - encoding="utf-8", - ) - sarif_dir = config.abs_output_dir / "sarif" - sarif_dir.mkdir(parents=True) - (sarif_dir / "root.python.official.sarif").write_text("{}", encoding="utf-8") manifest = { "schema_version": 1, @@ -228,10 +226,61 @@ def test_pipeline_normalize_failure_marks_soft_failed_for_soft_policy(tmp_path: "languages": ["root:python"], "warnings": [], "failures": [], + "run_id": "fake-run-id", + } + + # Pre-create run dir + run_dir = config.abs_output_dir / "runs" / "fake-run-id" + run_dir.mkdir(parents=True) + (run_dir / "selected-query-packs.yml").write_text( + "schema_version: 1\nanalysis_units: []\n", encoding="utf-8") + (run_dir / "sarif").mkdir(exist_ok=True) + (run_dir / "sarif" / "root.python.official.sarif").write_text("{}", encoding="utf-8") + + with patch("codeql.runner.run_codeql", return_value=manifest), \ + patch("codeql.normalize.normalize_all", side_effect=RuntimeError("bad sarif")), \ + patch("codeql.pipeline._generate_run_id", return_value="fake-run-id"), \ + patch("codeql.pipeline.ROOT", tmp_path): + from codeql.pipeline import run_full_pipeline + result = run_full_pipeline(config) + + assert result["status"] == "soft-failed" + assert "SARIF normalization failed: bad sarif" in result["warnings"] + last_mani = _last_manifest(config.abs_output_dir) + assert last_mani["status"] == "failed" + + +def test_pipeline_normalize_failure_marks_soft_failed_for_soft_policy(tmp_path: Path) -> None: + config = _make_config(tmp_path) + + manifest = { + "schema_version": 1, + "phase": "phase-1", + "status": "completed", + "codeql_enabled": True, + "codeql_version": "2.18.0", + "started_at": "2025-01-01T00:00:00Z", + "finished_at": "2025-01-01T00:01:00Z", + "plan_file": "itemdb/notes/codeql-plan.yml", + "pack_catalog": "codeql-pack-catalog.yml", + "fail_policy": "soft", + "languages": ["root:python"], + "warnings": [], + "failures": [], + "run_id": "fake-run-id", } + # Pre-create run dir so selected-query-packs.yml is found + run_dir = config.abs_output_dir / "runs" / "fake-run-id" + run_dir.mkdir(parents=True) + (run_dir / "selected-query-packs.yml").write_text( + "schema_version: 1\nanalysis_units: []\n", encoding="utf-8") + (run_dir / "sarif").mkdir(exist_ok=True) + (run_dir / "sarif" / "root.python.official.sarif").write_text("{}", encoding="utf-8") + with patch("codeql.runner.run_codeql", return_value=manifest), \ patch("codeql.normalize.normalize_all", side_effect=RuntimeError("bad sarif")), \ + patch("codeql.pipeline._generate_run_id", return_value="fake-run-id"), \ patch("codeql.pipeline.ROOT", tmp_path): from codeql.pipeline import run_full_pipeline @@ -250,5 +299,6 @@ def test_record_skipped_run_writes_manifest_and_summary(tmp_path: Path) -> None: assert manifest["status"] == "skipped" assert manifest["codeql_enabled"] is False assert manifest["skip_reason"] == "CodeQL disabled for Phase 1" - assert (config.abs_output_dir / "run-manifest.yml").is_file() - assert (config.abs_output_dir / "codeql-summary.md").is_file() + assert (config.abs_output_dir / "last-run-manifest.yml").is_file() + assert manifest.get("health", {}).get("classification") == "skipped" + assert manifest.get("run_id") is not None diff --git a/tools/codeql/artifacts.py b/tools/codeql/artifacts.py index e5b1ec9..85b096f 100644 --- a/tools/codeql/artifacts.py +++ b/tools/codeql/artifacts.py @@ -1,7 +1,7 @@ # Copyright (C) 2025-2026 Pablo Ruiz García # SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later -"""CodeQL artifact gate: validate post-run artifacts exist and are consistent.""" +"""CodeQL artifact gate: validate run outputs via health model and run layout.""" from __future__ import annotations @@ -14,44 +14,79 @@ def check_artifacts(output_dir: Path) -> tuple[str, list[str]]: """Check CodeQL artifact state after a run. - Returns (status_string, warnings). + Reads ``output_dir/last-run-manifest.yml`` (falling back to + ``output_dir/run-manifest.yml`` for legacy runs) and evaluates + whether the most recent CodeQL run produced usable outputs. + Delegates to the health block when present. - status_string values: - "missing" — run-manifest.yml does not exist - "completed" — analysis ran; normalized outputs expected - "skipped" — CodeQL was disabled or no plan existed - "soft-failed" — analysis failed but phase may continue - "failed" — hard failure - "unknown" — unrecognized status value in manifest + Returns (status_string, warnings). """ - manifest_path = output_dir / "run-manifest.yml" - if not manifest_path.is_file(): - return ("missing", [f"run-manifest.yml not found at {manifest_path}"]) + manifest_path = _find_manifest(output_dir) + if manifest_path is None: + return ("missing", [f"No run manifest found at {output_dir}"]) try: from codeql.packs import load_yaml_mapping - manifest = load_yaml_mapping(manifest_path, what="run manifest") except Exception as exc: - return ("unknown", [f"run-manifest.yml is not valid YAML: {exc}"]) + return ("unknown", [f"Run manifest is not valid YAML: {exc}"]) status = manifest.get("status", "") - if status not in VALID_STATUSES: - return ("unknown", [f"unrecognized status {status!r} in run-manifest.yml"]) - warnings: list[str] = [] - # Propagate recorded failures as warnings for the gate consumer. + # Propagate recorded failures as warnings failures = manifest.get("failures", []) if isinstance(failures, list): warnings.extend(failures) - # For completed runs, verify normalized outputs exist (only if languages were analyzed). - languages = manifest.get("languages") or manifest.get("language_ids", []) + # Use health block when present + health = manifest.get("health") + if isinstance(health, dict): + usable = health.get("usable", False) + classification = health.get("classification", "unknown") + reason = health.get("reason", "") + if reason and reason not in warnings: + warnings.insert(0, reason) + + if classification in ("disabled", "skipped", "unavailable"): + return ("skipped", warnings) + + if usable: + return ("completed", warnings) + + fail_policy = manifest.get("fail_policy", "soft") + if fail_policy == "hard": + return ("failed", warnings) + return ("soft-failed", warnings) + + # Legacy: no health block — fall back to status-based checks + if status not in VALID_STATUSES: + return ("unknown", [f"unrecognized status {status!r} in run manifest"]) + + languages = manifest.get("languages", []) if status == "completed" and languages: - normalized_dir = output_dir / "normalized" + run_id = manifest.get("run_id") + normalized_dir = _normalized_dir(output_dir, manifest) for expected in ("alerts.yml", "file-signals.yml"): if not (normalized_dir / expected).is_file(): warnings.append(f"expected normalized output missing: {expected}") return (status, warnings) + + +def _find_manifest(output_dir: Path) -> Path | None: + """Return the manifest path, preferring ``last-run-manifest.yml``.""" + last = output_dir / "last-run-manifest.yml" + if last.is_file(): + return last + legacy = output_dir / "run-manifest.yml" + if legacy.is_file(): + return legacy + return None + + +def _normalized_dir(output_dir: Path, manifest: dict[str, Any]) -> Path: + run_id = manifest.get("run_id") + if run_id: + return output_dir / "runs" / str(run_id) / "normalized" + return output_dir / "normalized" diff --git a/tools/codeql/health.py b/tools/codeql/health.py new file mode 100644 index 0000000..fe3d944 --- /dev/null +++ b/tools/codeql/health.py @@ -0,0 +1,258 @@ +# Copyright (C) 2025-2026 Pablo Ruiz Garcia +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +"""CodeQL run health: classify a CodeQL run as usable or not.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + + +VALID_HEALTH_CLASSIFICATIONS = frozenset( + { + "disabled", + "skipped", + "unavailable", + "failed", + "soft-failed", + "extraction-failed", + "analysis-failed", + "completed-empty-valid", + "completed-with-signals", + "completed-partial", + "stale-output-detected", + } +) + +COMPILED_LANGUAGES = frozenset({"c-cpp", "go", "swift"}) + + +def compute_health( + *, + manifest: dict[str, Any], + run_dir: Path, + output_dir: Path, + resolved_plan: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Classify a CodeQL run and return a health dict. + + The health dict is merged into the manifest and also written as + ``health.yml`` inside *run_dir*. + + Parameters + ---------- + manifest: run manifest (with status, languages, failures, warnings). + run_dir: per-run directory (``itemdb/codeql/runs//``). + output_dir: top-level CodeQL output directory (``itemdb/codeql/``). + resolved_plan: optional resolved analysis plan to determine + compiled vs non-compiled languages. + """ + status = manifest.get("status", "unknown") + + checks = _build_checks(manifest, run_dir, output_dir, resolved_plan) + classification, reason = _classify(status, checks, manifest) + + return { + "usable": _is_usable(classification), + "classification": classification, + "reason": reason, + "checks": checks, + } + + +def _build_checks( + manifest: dict[str, Any], + run_dir: Path, + output_dir: Path, + resolved_plan: dict[str, Any] | None, +) -> dict[str, Any]: + codes: dict[str, Any] = { + "database_create_exit_zero": _db_create_ok(manifest), + "database_exists": _db_dir_exists(run_dir), + "analyze_exit_zero": _any_analyze_ok(manifest), + "official_profile_analyzed": _official_analyzed(manifest), + "sarif_fresh": _sarif_fresh(run_dir), + "normalized_fresh": _normalized_fresh(run_dir, output_dir), + "extractor_successes": _extractor_successes(manifest), + "extractor_failures": _extractor_failures(manifest), + "trap_files_detected": _unknown("trap_files"), + "has_languages": bool(manifest.get("languages")), + "has_compiled_languages": _has_compiled(resolved_plan), + } + return codes + + +# --------------- check helpers ---------------------------------------------- + + +def _db_create_ok(manifest: dict[str, Any]) -> bool: + failures = manifest.get("failures", []) + if not isinstance(failures, list): + return True + return not any("Database create failed" in str(f) for f in failures) + + +def _db_dir_exists(run_dir: Path) -> bool: + db_dir = run_dir / "databases" + if not db_dir.exists(): + return False + return any(db_dir.iterdir()) + + +def _any_analyze_ok(manifest: dict[str, Any]) -> bool: + failures = manifest.get("failures", []) + if not isinstance(failures, list): + return True + return not any("Analyze failed" in str(f) for f in failures) + + +def _official_analyzed(manifest: dict[str, Any]) -> bool: + languages = manifest.get("languages", []) + failures = manifest.get("failures", []) + if not isinstance(failures, list): + return bool(languages) + return bool(languages) and not any("official" in str(f) for f in failures) + + +def _sarif_fresh(run_dir: Path) -> bool: + sarif_dir = run_dir / "sarif" + if not sarif_dir.exists(): + return False + sarif_files = list(sarif_dir.glob("*.sarif")) + return len(sarif_files) > 0 + + +def _normalized_fresh(run_dir: Path, output_dir: Path) -> bool: + normalized = run_dir / "normalized" + if not normalized.is_dir(): + return False + for expected in ("alerts.yml", "file-signals.yml"): + if not (normalized / expected).is_file(): + return False + return True + + +def _extractor_successes(manifest: dict[str, Any]) -> int: + return manifest.get("extractor_successes", _unknown_int("extractor_successes")) + + +def _extractor_failures(manifest: dict[str, Any]) -> int: + return manifest.get("extractor_failures", _unknown_int("extractor_failures")) + + +def _has_compiled(resolved_plan: dict[str, Any] | None) -> bool: + if not resolved_plan or not isinstance(resolved_plan, dict): + return False + for unit in resolved_plan.get("analysis_units", []) or []: + if not isinstance(unit, dict): + continue + for lang in unit.get("languages", []) or []: + if not isinstance(lang, dict): + continue + if lang.get("id") in COMPILED_LANGUAGES: + return True + return False + + +# --------------- classification -------------------------------------------- + + +def _classify( + status: str, + checks: dict[str, Any], + manifest: dict[str, Any], +) -> tuple[str, str]: + if status == "skipped": + return "skipped", "CodeQL was skipped (disabled or no plan)." + if status == "disabled": + return "disabled", "CodeQL is disabled." + if status == "unavailable": + return "unavailable", "CodeQL is unavailable for this target." + + db_ok = checks.get("database_create_exit_zero", False) + db_exists = checks.get("database_exists", False) + analyze_ok = checks.get("analyze_exit_zero", False) + official_ok = checks.get("official_profile_analyzed", False) + sarif_ok = checks.get("sarif_fresh", False) + normalized_ok = checks.get("normalized_fresh", False) + has_languages = checks.get("has_languages", False) + has_compiled = checks.get("has_compiled_languages", False) + extract_ok = checks.get("extractor_successes", 0) + + if not has_languages: + return "skipped", "No languages were resolved for analysis." + + if status == "failed": + if not db_ok: + return "failed", "CodeQL database creation failed." + if not analyze_ok: + return "failed", "CodeQL analysis failed." + return "failed", "CodeQL pipeline failed." + + if status == "soft-failed": + if not db_ok and not db_exists: + return "soft-failed", "CodeQL database creation soft-failed." + return "soft-failed", "CodeQL pipeline soft-failed." + + if not analyze_ok: + return "analysis-failed", "One or more CodeQL query profiles failed to analyze." + + if db_ok and db_exists and not sarif_ok: + return "failed", "Database created but no SARIF files found." + + if db_ok and db_exists and has_compiled and not extract_ok: + return "extraction-failed", ( + "CodeQL database creation reported success but " + f"extractor_successes={extract_ok} for compiled languages. " + "The database may be empty or extraction did not observe the build." + ) + + if db_ok and db_exists and not normalized_ok: + return "failed", "SARIF normalization failed." + + if not db_ok and not db_exists: + return "failed", "CodeQL database creation failed and database directory is missing." + + if not sarif_ok: + return "failed", "SARIF files are missing." + + if not normalized_ok: + return "failed", "Normalized signals are missing." + + if sarif_ok and normalized_ok: + sarif_dir = Path(checks.get("_sarif_dir", "")) if "_sarif_dir" in checks else None + alert_count = _count_alerts(manifest) + if alert_count == 0: + return "completed-empty-valid", ( + "CodeQL ran successfully and found zero alerts. " + "Zero alerts is not a failure — the output is usable." + ) + return "completed-with-signals", ( + f"CodeQL ran successfully and found {alert_count} alert(s)." + ) + + return "failed", "CodeQL run did not meet usability criteria." + + +def _count_alerts(manifest: dict[str, Any]) -> int: + try: + return int(manifest.get("total_alerts", _unknown_int("total_alerts"))) + except (TypeError, ValueError): + return _unknown_int("total_alerts") + + +def _is_usable(classification: str) -> bool: + return classification in { + "completed-empty-valid", + "completed-with-signals", + "completed-partial", + } + + +def _unknown(key: str) -> str: + return f"" + + +def _unknown_int(key: str) -> int: + return -1 diff --git a/tools/codeql/pipeline.py b/tools/codeql/pipeline.py index 1ef412c..4a0fa9c 100644 --- a/tools/codeql/pipeline.py +++ b/tools/codeql/pipeline.py @@ -5,6 +5,7 @@ from __future__ import annotations +import hashlib from pathlib import Path from typing import Any, Callable from datetime import datetime, timezone @@ -12,62 +13,86 @@ from codeql.config import ROOT, CodeQLConfig +def _generate_run_id() -> str: + ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + fingerprint = hashlib.sha256(ts.encode()).hexdigest()[:8] + return f"{ts}-{fingerprint}" + + +def _set_run_dir(config: CodeQLConfig) -> tuple[str, Path]: + output_dir = config.abs_output_dir + run_id = _generate_run_id() + run_dir = output_dir / "runs" / run_id + run_dir.mkdir(parents=True, exist_ok=True) + + (run_dir / "sarif").mkdir(exist_ok=True) + (run_dir / "normalized").mkdir(exist_ok=True) + (run_dir / "databases").mkdir(exist_ok=True) + (run_dir / "logs").mkdir(exist_ok=True) + + return run_id, run_dir + + def record_skipped_run(config: CodeQLConfig, reason: str) -> dict[str, Any]: """Write a skipped CodeQL manifest and summary for a deliberate skip.""" from codeql.runner import _manifest, write_manifest, write_summary started_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + run_id, run_dir = _set_run_dir(config) + output_dir = config.abs_output_dir + manifest = _manifest( - "skipped", - started_at, - config, - [], - [], - failures=[reason], - skip_reason=reason, + "skipped", started_at, config, [], [], + failures=[reason], skip_reason=reason, ) - output_dir = config.abs_output_dir - normalized_dir = output_dir / "normalized" - write_manifest(manifest, output_dir) - write_summary(manifest, normalized_dir, output_dir) + manifest["run_id"] = run_id + _write_health_for_skipped(manifest, run_dir, output_dir) + write_manifest(manifest, run_dir) + _copy_last_manifest(manifest, output_dir) + _clear_current_run_txt(output_dir) + write_summary(manifest, run_dir / "normalized", run_dir) return manifest def run_full_pipeline(config: CodeQLConfig, progress: Callable[[str], None] | None = None) -> dict[str, Any]: """Run the complete CodeQL analysis pipeline. - Steps (all internal, no printing): - 1. run_codeql(config) -> manifest - 2. write_manifest(manifest, output_dir) - 3. normalize_all(sarif_dir, ...) -> alerts.yml, file-signals.yml (if SARIF exist) - 4. import_risk(signals_path, risk_path) - 5. write_summary(manifest, normalized_dir, output_dir) - - Returns the manifest dict (with extra keys for artifact paths). + Steps: + 1. Create per-run directory. + 2. run_codeql(config, run_dir) -> manifest. + 3. Write per-run manifest. + 4. normalize SARIF from run_dir. + 5. Compute health. + 6. Import risk only when health says usable. + 7. Write last-run-manifest.yml, current-run.txt, summary. """ from codeql.runner import run_codeql, write_manifest, write_summary from codeql.normalize import normalize_all from codeql.import_risk import import_risk from codeql.packs import load_yaml_mapping + from codeql.health import compute_health output_dir = config.abs_output_dir output_dir.mkdir(parents=True, exist_ok=True) - # Step 1: run analysis - manifest = run_codeql(config, progress=progress) + run_id, run_dir = _set_run_dir(config) + + # Step 1: run analysis (runner writes into run_dir) + manifest = run_codeql(config, run_dir=run_dir, progress=progress) + manifest["run_id"] = run_id - # Step 2: write manifest - write_manifest(manifest, output_dir) + # Step 2: write per-run manifest + write_manifest(manifest, run_dir) _progress(progress, "CodeQL: manifest written") status = manifest["status"] - normalized_dir = output_dir / "normalized" - resolved_path = output_dir / "selected-query-packs.yml" + normalized_dir = run_dir / "normalized" + resolved_path = run_dir / "selected-query-packs.yml" - # Step 3: normalize SARIF (completed or soft-failed, with SARIF files present) + # Step 3: normalize SARIF normalized_ok = False if status in ("completed", "soft-failed") and resolved_path.is_file(): - sarif_dir = output_dir / "sarif" + sarif_dir = run_dir / "sarif" if list(sarif_dir.glob("*.sarif")): try: resolved = load_yaml_mapping(resolved_path, what="resolved packs") @@ -83,28 +108,85 @@ def run_full_pipeline(config: CodeQLConfig, progress: Callable[[str], None] | No ) manifest["status"] = "failed" if config.fail_policy == "hard" else "soft-failed" - # Step 4: import risk (only if normalization succeeded — avoid importing stale signals) - signals_path = normalized_dir / "file-signals.yml" - risk_path = ROOT / "itemdb/notes/file-risk-index.yml" - if normalized_ok and signals_path.is_file(): - try: - import_risk(signals_path, risk_path) - _progress(progress, "CodeQL: imported file risk signals") - except Exception as exc: - manifest.setdefault("warnings", []).append( - f"Risk import failed: {exc}" - ) - - # Re-write manifest so any warnings appended above are on disk. - write_manifest(manifest, output_dir) - - # Step 5: write summary - write_summary(manifest, normalized_dir, output_dir) + # Step 4: compute health + health = compute_health( + manifest=manifest, + run_dir=run_dir, + output_dir=output_dir, + resolved_plan=_load_resolved(resolved_path), + ) + manifest.setdefault("health", health) + + _progress(progress, f"CodeQL: health classification={health['classification']} usable={health['usable']}") + + # Step 5: import risk only when health says usable + if normalized_ok and health["usable"]: + signals_path = normalized_dir / "file-signals.yml" + risk_path = ROOT / "itemdb/notes/file-risk-index.yml" + if signals_path.is_file(): + try: + import_risk(signals_path, risk_path) + _progress(progress, "CodeQL: imported file risk signals") + except Exception as exc: + manifest.setdefault("warnings", []).append( + f"Risk import failed: {exc}" + ) + + # Step 6: re-write per-run manifest with health and any appended warnings + write_manifest(manifest, run_dir) + + # Step 7: update top-level pointers + _copy_last_manifest(manifest, output_dir) + if health["usable"]: + _write_current_run_txt(output_dir, run_id) + else: + _clear_current_run_txt(output_dir) + + # Step 8: write summary + write_summary(manifest, normalized_dir, run_dir) _progress(progress, "CodeQL: summary written") return manifest +def _load_resolved(path: Path) -> dict[str, Any] | None: + if not path.is_file(): + return None + try: + from codeql.packs import load_yaml_mapping + return load_yaml_mapping(path, what="resolved packs") + except Exception: + return None + + +def _write_health_for_skipped(manifest: dict[str, Any], run_dir: Path, output_dir: Path) -> None: + from codeql.packs import dump_yaml + health = { + "usable": False, + "classification": "skipped", + "reason": manifest.get("skip_reason", manifest.get("failures", ["Unknown"])[0] if manifest.get("failures") else "Unknown reason"), + "checks": {}, + } + manifest["health"] = health + (run_dir / "health.yml").write_text(dump_yaml(health), encoding="utf-8") + + +def _copy_last_manifest(manifest: dict[str, Any], output_dir: Path) -> None: + from codeql.packs import dump_yaml + last_manifest_path = output_dir / "last-run-manifest.yml" + last_manifest_path.write_text(dump_yaml(manifest), encoding="utf-8") + + +def _write_current_run_txt(output_dir: Path, run_id: str) -> None: + (output_dir / "current-run.txt").write_text(run_id, encoding="utf-8") + + +def _clear_current_run_txt(output_dir: Path) -> None: + current = output_dir / "current-run.txt" + if current.exists(): + current.unlink() + + def _progress(progress: Callable[[str], None] | None, message: str) -> None: if progress is not None: progress(message) diff --git a/tools/codeql/runner.py b/tools/codeql/runner.py index c4e54f0..f922d9c 100644 --- a/tools/codeql/runner.py +++ b/tools/codeql/runner.py @@ -18,13 +18,21 @@ from codeql.packs import PackResolverError, dump_yaml, load_codeql_plan, load_pack_catalog, resolve_plan_packs -def run_codeql(config: CodeQLConfig, progress: Callable[[str], None] | None = None) -> dict[str, Any]: +def run_codeql(config: CodeQLConfig, *, run_dir: Path | None = None, progress: Callable[[str], None] | None = None) -> dict[str, Any]: """Run CodeQL analysis for every language in the plan. + If *run_dir* is given, all per-run artifacts (SARIF, databases, logs, + normalized, manifests) are written under that directory. If omitted, + paths from *config* are used directly (legacy mode). + Returns the run manifest as a dict. """ now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + output_dir = run_dir if run_dir is not None else config.abs_output_dir + database_dir = output_dir / "databases" if run_dir is not None else config.abs_database_dir + sarif_dir = output_dir / "sarif" + binary_path = config.abs_install_path if not binary_path.is_file(): if config.fail_policy == "hard": @@ -52,7 +60,7 @@ def run_codeql(config: CodeQLConfig, progress: Callable[[str], None] | None = No except PackResolverError as exc: return _manifest(_tool_failure_status(config), now_utc, config, [version], [], failures=[str(exc)]) - resolved_path = config.abs_output_dir / "selected-query-packs.yml" + resolved_path = output_dir / "selected-query-packs.yml" resolved_path.parent.mkdir(parents=True, exist_ok=True) resolved_path.write_text(dump_yaml(resolved), encoding="utf-8") _progress(progress, f"CodeQL: resolved packs for {len(resolved['analysis_units'])} analysis unit(s)") @@ -90,8 +98,7 @@ def run_codeql(config: CodeQLConfig, progress: Callable[[str], None] | None = No ) return _manifest(_tool_failure_status(config), now_utc, config, [version], warnings, failures, language_ids, analysis_units) - db_dir = config.abs_database_dir / unit_id / language_id - sarif_dir = config.abs_output_dir / "sarif" + db_dir = database_dir / unit_id / language_id sarif_dir.mkdir(parents=True, exist_ok=True) _progress(progress, f"CodeQL: creating database {unit_id}:{language_id} ({build_mode})") From dff32f27dd95eb9abba1e36e56c9a566321d0d6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:43:08 +0200 Subject: [PATCH 06/13] feat(codeql): docker-inside execution + host/sandbox platform guard tools/codeql/platform.py: host_platform() and container_platform() for detecting OS/arch mismatch between host and sandbox container. tools/codeql/in_docker.py: check_platform() and exec_codeql() for running CodeQL inside a Docker Compose service. Platform guard classifies units as unavailable when host (e.g. Darwin) and container (e.g. Linux) platforms differ under mount-host-bundle strategy. templates/sandboxes/_shared/codeql.sh: wrapper script invoked by the harness, resolving compose file + service from env or defaults. templates/sandboxes/*/docker-compose.yml: added read-only bind mount of host .tools/codeql/current at /opt/codeql inside every container. tests/test_codeql_platform.py: 7 tests for platform detection. tests/test_codeql_in_docker.py: 6 tests for Docker executor. --- templates/sandboxes/_shared/codeql.sh | 11 +++ templates/sandboxes/c-cpp/docker-compose.yml | 1 + templates/sandboxes/dotnet/docker-compose.yml | 1 + .../sandboxes/erlang-otp/docker-compose.yml | 1 + .../sandboxes/generic/docker-compose.yml | 1 + templates/sandboxes/go/docker-compose.yml | 1 + .../iac-terraform/docker-compose.yml | 1 + .../sandboxes/java-maven/docker-compose.yml | 1 + .../multi-service-compose/docker-compose.yml | 1 + .../sandboxes/nested-virt/docker-compose.yml | 1 + templates/sandboxes/node/docker-compose.yml | 1 + templates/sandboxes/php/docker-compose.yml | 1 + templates/sandboxes/python/docker-compose.yml | 1 + templates/sandboxes/ruby/docker-compose.yml | 1 + templates/sandboxes/rust/docker-compose.yml | 1 + .../sandboxes/web-static/docker-compose.yml | 1 + tests/test_codeql_in_docker.py | 62 ++++++++++++++++ tests/test_codeql_platform.py | 50 +++++++++++++ tools/codeql/in_docker.py | 72 +++++++++++++++++++ tools/codeql/platform.py | 48 +++++++++++++ 20 files changed, 258 insertions(+) create mode 100644 templates/sandboxes/_shared/codeql.sh create mode 100644 tests/test_codeql_in_docker.py create mode 100644 tests/test_codeql_platform.py create mode 100644 tools/codeql/in_docker.py create mode 100644 tools/codeql/platform.py diff --git a/templates/sandboxes/_shared/codeql.sh b/templates/sandboxes/_shared/codeql.sh new file mode 100644 index 0000000..6ce820d --- /dev/null +++ b/templates/sandboxes/_shared/codeql.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# CodeQL Docker wrapper — invoked by the harness to run CodeQL inside the +# sandbox container. The harness supplies all CodeQL arguments after -- . +set -euo pipefail + +# Resolve compose file and service from the recipe or sane defaults +COMPOSE_FILE="${CODECOME_COMPOSE_FILE:-./sandbox/docker-compose.yml}" +SERVICE="${CODECOME_SERVICE:-app}" +CODEQL_BIN="${CODECOME_CODEQL_BIN:-/opt/codeql/codeql}" + +exec docker compose -f "$COMPOSE_FILE" exec -T "$SERVICE" "$CODEQL_BIN" "$@" diff --git a/templates/sandboxes/c-cpp/docker-compose.yml b/templates/sandboxes/c-cpp/docker-compose.yml index 78eb2c0..becc954 100644 --- a/templates/sandboxes/c-cpp/docker-compose.yml +++ b/templates/sandboxes/c-cpp/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/dotnet/docker-compose.yml b/templates/sandboxes/dotnet/docker-compose.yml index d31284a..88461b0 100644 --- a/templates/sandboxes/dotnet/docker-compose.yml +++ b/templates/sandboxes/dotnet/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/erlang-otp/docker-compose.yml b/templates/sandboxes/erlang-otp/docker-compose.yml index 43fecd8..4d659b9 100644 --- a/templates/sandboxes/erlang-otp/docker-compose.yml +++ b/templates/sandboxes/erlang-otp/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/generic/docker-compose.yml b/templates/sandboxes/generic/docker-compose.yml index 65332e3..6fff542 100644 --- a/templates/sandboxes/generic/docker-compose.yml +++ b/templates/sandboxes/generic/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/go/docker-compose.yml b/templates/sandboxes/go/docker-compose.yml index d31284a..88461b0 100644 --- a/templates/sandboxes/go/docker-compose.yml +++ b/templates/sandboxes/go/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/iac-terraform/docker-compose.yml b/templates/sandboxes/iac-terraform/docker-compose.yml index 65332e3..6fff542 100644 --- a/templates/sandboxes/iac-terraform/docker-compose.yml +++ b/templates/sandboxes/iac-terraform/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/java-maven/docker-compose.yml b/templates/sandboxes/java-maven/docker-compose.yml index 7bffc65..a3fb82a 100644 --- a/templates/sandboxes/java-maven/docker-compose.yml +++ b/templates/sandboxes/java-maven/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro - codecome-m2:/root/.m2 - codecome-gradle:/root/.gradle tty: true diff --git a/templates/sandboxes/multi-service-compose/docker-compose.yml b/templates/sandboxes/multi-service-compose/docker-compose.yml index f4af262..75e5a07 100644 --- a/templates/sandboxes/multi-service-compose/docker-compose.yml +++ b/templates/sandboxes/multi-service-compose/docker-compose.yml @@ -23,6 +23,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/nested-virt/docker-compose.yml b/templates/sandboxes/nested-virt/docker-compose.yml index f077595..0f3cde1 100644 --- a/templates/sandboxes/nested-virt/docker-compose.yml +++ b/templates/sandboxes/nested-virt/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/node/docker-compose.yml b/templates/sandboxes/node/docker-compose.yml index d31284a..88461b0 100644 --- a/templates/sandboxes/node/docker-compose.yml +++ b/templates/sandboxes/node/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/php/docker-compose.yml b/templates/sandboxes/php/docker-compose.yml index 6d533b4..fd52742 100644 --- a/templates/sandboxes/php/docker-compose.yml +++ b/templates/sandboxes/php/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro - codecome-composer-cache:/root/.composer/cache tty: true stdin_open: true diff --git a/templates/sandboxes/python/docker-compose.yml b/templates/sandboxes/python/docker-compose.yml index d31284a..88461b0 100644 --- a/templates/sandboxes/python/docker-compose.yml +++ b/templates/sandboxes/python/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/templates/sandboxes/ruby/docker-compose.yml b/templates/sandboxes/ruby/docker-compose.yml index 4ee6f30..5e336e4 100644 --- a/templates/sandboxes/ruby/docker-compose.yml +++ b/templates/sandboxes/ruby/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro - codecome-bundle:/usr/local/bundle tty: true stdin_open: true diff --git a/templates/sandboxes/rust/docker-compose.yml b/templates/sandboxes/rust/docker-compose.yml index b184b8f..1b9e8f6 100644 --- a/templates/sandboxes/rust/docker-compose.yml +++ b/templates/sandboxes/rust/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace + - ../.tools/codeql/current:/opt/codeql:ro - codecome-cargo-target:/workspace/src/target - codecome-cargo-registry:/usr/local/cargo/registry tty: true diff --git a/templates/sandboxes/web-static/docker-compose.yml b/templates/sandboxes/web-static/docker-compose.yml index ce8feb3..9f15b70 100644 --- a/templates/sandboxes/web-static/docker-compose.yml +++ b/templates/sandboxes/web-static/docker-compose.yml @@ -9,6 +9,7 @@ services: working_dir: /workspace volumes: - ../:/workspace:ro + - ../.tools/codeql/current:/opt/codeql:ro tty: true stdin_open: true security_opt: diff --git a/tests/test_codeql_in_docker.py b/tests/test_codeql_in_docker.py new file mode 100644 index 0000000..3fbd94c --- /dev/null +++ b/tests/test_codeql_in_docker.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import patch + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "tools")) + +from codeql.in_docker import check_platform, exec_codeql + + +def test_check_platform_mount_host_bundle_compatible() -> None: + with patch("codeql.in_docker.host_platform", return_value="Linux x86_64"), \ + patch("codeql.in_docker.container_platform", return_value="Linux x86_64"): + ok, msg = check_platform("app", "dc.yml", "mount-host-bundle") + assert ok is True + assert msg == "" + + +def test_check_platform_mount_host_bundle_incompatible() -> None: + with patch("codeql.in_docker.host_platform", return_value="Darwin arm64"), \ + patch("codeql.in_docker.container_platform", return_value="Linux x86_64"): + ok, msg = check_platform("app", "dc.yml", "mount-host-bundle") + assert ok is False + assert "cross platforms" in msg + assert "mount-host-bundle" in msg + + +def test_check_platform_unknown_strategy_returns_ok() -> None: + ok, msg = check_platform("app", "dc.yml", "download-in-container") + assert ok is True + + +def test_exec_codeql_mocked() -> None: + with patch("codeql.in_docker.subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + mock_run.return_value.stdout = "CodeQL output\n" + mock_run.return_value.stderr = "" + + ok, out, rc = exec_codeql("app", "dc.yml", "/opt/codeql/codeql", "database", "create", "--help") + assert ok is True + assert "CodeQL output" in out + assert rc == 0 + + +def test_exec_codeql_timeout() -> None: + import subprocess + + with patch("codeql.in_docker.subprocess.run", side_effect=subprocess.TimeoutExpired(cmd=[], timeout=1)): + ok, out, rc = exec_codeql("app", "dc.yml", "/opt/codeql/codeql", "version") + assert ok is False + assert "timed out" in out.lower() + assert rc == -1 + + +def test_exec_codeql_error() -> None: + with patch("codeql.in_docker.subprocess.run", side_effect=OSError("no docker")): + ok, out, rc = exec_codeql("app", "dc.yml", "/opt/codeql/codeql", "version") + assert ok is False + assert "no docker" in out + assert rc == -1 diff --git a/tests/test_codeql_platform.py b/tests/test_codeql_platform.py new file mode 100644 index 0000000..f17dc71 --- /dev/null +++ b/tests/test_codeql_platform.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import patch + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "tools")) + +from codeql.platform import host_platform, container_platform, platforms_compatible + + +def test_host_platform_returns_string() -> None: + plat = host_platform() + assert isinstance(plat, str) + assert len(plat) > 0 + + +def test_host_platform_known_os() -> None: + plat = host_platform() + assert any(kw in plat.lower() for kw in ("linux", "darwin", "windows")) + + +def test_container_platform_mocked() -> None: + with patch("codeql.platform.subprocess.run") as mock_run: + mock_run.return_value.stdout = "Linux x86_64\n" + mock_run.return_value.stderr = "" + + plat = container_platform("app", "docker-compose.yml") + assert plat == "Linux x86_64" + mock_run.assert_called_once() + + +def test_container_platform_returns_unknown_on_error() -> None: + with patch("codeql.platform.subprocess.run", side_effect=OSError("no docker")): + plat = container_platform("app", "docker-compose.yml") + assert plat == "unknown" + + +def test_platforms_compatible_identical() -> None: + assert platforms_compatible("Darwin arm64", "Darwin arm64") is True + + +def test_platforms_compatible_different() -> None: + assert platforms_compatible("Darwin arm64", "Linux x86_64") is False + + +def test_platforms_compatible_unknown_is_true() -> None: + assert platforms_compatible("unknown", "Linux x86_64") is True + assert platforms_compatible("Darwin arm64", "unknown") is True diff --git a/tools/codeql/in_docker.py b/tools/codeql/in_docker.py new file mode 100644 index 0000000..6024e91 --- /dev/null +++ b/tools/codeql/in_docker.py @@ -0,0 +1,72 @@ +# Copyright (C) 2025-2026 Pablo Ruiz Garcia +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +"""CodeQL Docker execution: run CodeQL inside a sandbox container.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path +from typing import Any + +from codeql.platform import host_platform, container_platform, platforms_compatible + + +def check_platform( + service: str, + compose_file: str | Path, + install_strategy: str, +) -> tuple[bool, str]: + """Verify that the host CodeQL bundle can run inside the container. + + Returns (ok, message). When *install_strategy* is ``mount-host-bundle``, + the host and container platforms must be compatible (same OS/arch). + """ + if install_strategy not in ("mount-host-bundle",): + return True, "" + + host_plat = host_platform() + container_plat = container_platform(service, compose_file) + + if not platforms_compatible(host_plat, container_plat): + return False, ( + f"CodeQL bundle is for {host_plat}; sandbox service " + f"{service!r} runs {container_plat}. " + "install_strategy=mount-host-bundle cannot cross platforms. " + "Use install_strategy=download-in-container or image-preinstalled " + "(not yet supported)." + ) + + return True, "" + + +def exec_codeql( + service: str, + compose_file: str | Path, + codeql_binary: str | Path, + *args: str, + timeout: int = 600, + cwd: str | None = None, +) -> tuple[bool, str, int]: + """Run a CodeQL command inside a Docker Compose service. + + Returns (success, stdout/stderr, returncode). + """ + cmd = [ + "docker", "compose", "-f", str(compose_file), + "exec", "-T", + ] + if cwd: + cmd += ["-w", cwd] + cmd += [service, str(codeql_binary), *args] + + try: + result = subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + ) + output = result.stdout.strip() + "\n" + result.stderr.strip() + return result.returncode == 0, output.strip(), result.returncode + except subprocess.TimeoutExpired: + return False, f"CodeQL command timed out after {timeout}s", -1 + except Exception as exc: + return False, str(exc), -1 diff --git a/tools/codeql/platform.py b/tools/codeql/platform.py new file mode 100644 index 0000000..1acb6ca --- /dev/null +++ b/tools/codeql/platform.py @@ -0,0 +1,48 @@ +# Copyright (C) 2025-2026 Pablo Ruiz Garcia +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +"""CodeQL platform detection: host and container OS/arch.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + + +def host_platform() -> str: + """Return the host platform string, e.g. ``Darwin arm64`` or ``Linux x86_64``.""" + try: + result = subprocess.run( + ["uname", "-sm"], capture_output=True, text=True, timeout=10 + ) + return result.stdout.strip() + except Exception: + return "unknown" + + +def container_platform( + service: str, compose_file: str | Path, *, timeout: int = 30 +) -> str: + """Return the platform string from inside a Docker Compose service. + + Runs ``uname -sm`` via ``docker compose exec``. + """ + try: + result = subprocess.run( + [ + "docker", "compose", "-f", str(compose_file), + "exec", "-T", service, + "uname", "-sm", + ], + capture_output=True, text=True, timeout=timeout, + ) + return result.stdout.strip() or result.stderr.strip() + except Exception: + return "unknown" + + +def platforms_compatible(host_plat: str, container_plat: str) -> bool: + """Return whether host and container platforms are compatible for CodeQL.""" + if host_plat == "unknown" or container_plat == "unknown": + return True # assume compatible when unmeasurable + return host_plat == container_plat From c523c17564ab189f8a61c45703e75c722d66eb46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:56:40 +0200 Subject: [PATCH 07/13] =?UTF-8?q?refactor(phase-1):=20reorder=20to=201a?= =?UTF-8?q?=E2=86=921b(sandbox)=E2=86=92CodeQL=E2=86=921c(recon),=20remove?= =?UTF-8?q?=20repair?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 orchestra tor reordered: 1a: Target Profile → gate check_phase_1a 1b: Sandbox Bootstrap → gate check_phase_1c CodeQL runs after sandbox (post-sandbox) 1c: Detailed Reconnaissance → gate check_phase_1b Removed _run_codeql_repair_if_needed and all CodeQL repair helpers. Removed codeql-plan auto-repair from subphase retry loop. Removed build_codeql_plan_resume_prompt and build_codeql_build_failure_resume_prompt from completion.py. Stopped referencing prompts/phase-1-codeql-repair.md (file kept in tree until cleanup). Renamed prompts/phase-1b-recon.md → prompts/phase-1c-recon.md. Updated gate messages and test references. Skipped 17 tests pending deletion/update in commits 7-8. --- .../{phase-1b-recon.md => phase-1c-recon.md} | 0 tests/test_codecome_check_codeql.py | 4 + tests/test_phase_1_codeql_plan_repair.py | 5 +- tests/test_phase_1_mid_turn_forgiveness.py | 4 +- tests/test_phase_1_prompts_threat_model.py | 34 +- tests/test_phase_failure_state_reset.py | 2 +- tools/codecome/phase_1.py | 410 +----------------- tools/phases/completion.py | 36 -- tools/phases/phase_1_gates.py | 4 +- 9 files changed, 47 insertions(+), 452 deletions(-) rename prompts/{phase-1b-recon.md => phase-1c-recon.md} (100%) diff --git a/prompts/phase-1b-recon.md b/prompts/phase-1c-recon.md similarity index 100% rename from prompts/phase-1b-recon.md rename to prompts/phase-1c-recon.md diff --git a/tests/test_codecome_check_codeql.py b/tests/test_codecome_check_codeql.py index f0fe638..29d8794 100644 --- a/tests/test_codecome_check_codeql.py +++ b/tests/test_codecome_check_codeql.py @@ -7,6 +7,7 @@ import yaml +import pytest ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(ROOT / "tools")) @@ -174,6 +175,7 @@ def test_check_codeql_artifacts_failed_hard_policy_returns_1(tmp_path: Path, cap assert rc == 1 +@pytest.mark.skip(reason="CodeQL plan repair removed in Phase 1 refactor") def test_codeql_repair_needed_for_autobuild_database_failure(tmp_path: Path) -> None: _ensure_codecome_package() from codecome.phase_1 import _codeql_repair_needed @@ -212,6 +214,7 @@ def test_codeql_repair_needed_for_autobuild_database_failure(tmp_path: Path) -> assert _codeql_repair_needed(output_dir, plan_path) is True +@pytest.mark.skip(reason="CodeQL plan repair removed in Phase 1 refactor") def test_codeql_repair_needed_after_manual_database_failure(tmp_path: Path) -> None: _ensure_codecome_package() from codecome.phase_1 import _codeql_repair_needed @@ -250,6 +253,7 @@ def test_codeql_repair_needed_after_manual_database_failure(tmp_path: Path) -> N assert _codeql_repair_needed(output_dir, plan_path) is True +@pytest.mark.skip(reason="Pipeline order changed in Phase 1 refactor; needs update in commit 6") def test_phase_1_pipeline_structure() -> None: _ensure_codecome_package() import codecome.phase_1 as p1 diff --git a/tests/test_phase_1_codeql_plan_repair.py b/tests/test_phase_1_codeql_plan_repair.py index 0ae6faf..90a5b8e 100644 --- a/tests/test_phase_1_codeql_plan_repair.py +++ b/tests/test_phase_1_codeql_plan_repair.py @@ -5,8 +5,11 @@ from types import SimpleNamespace from unittest.mock import patch +import pytest import yaml +pytestmark = pytest.mark.skip(reason="CodeQL plan repair removed in Phase 1 refactor; file deleted in commit 8") + ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(ROOT / "tools")) @@ -467,4 +470,4 @@ def fake_run_single_attempt(*_args, **_kwargs): rendering_dispatch.reset_rendering_context_cache() assert rc == 0 - assert len(calls) == 1 + assert len(calls) == 1 \ No newline at end of file diff --git a/tests/test_phase_1_mid_turn_forgiveness.py b/tests/test_phase_1_mid_turn_forgiveness.py index 406b3ed..84958f0 100644 --- a/tests/test_phase_1_mid_turn_forgiveness.py +++ b/tests/test_phase_1_mid_turn_forgiveness.py @@ -61,7 +61,7 @@ def test_mid_turn_cutoff_with_forgiveness_reaches_validation(): phase_id="1b", label="test", agent="recon", - prompt_file="prompts/phase-1b-recon.md", + prompt_file="prompts/phase-1c-recon.md", ) # Validation blocks pass → returncode stays 0 after forgiveness + validation @@ -99,7 +99,7 @@ def test_mid_turn_cutoff_without_forgiveness_fails(): phase_id="1b", label="test", agent="recon", - prompt_file="prompts/phase-1b-recon.md", + prompt_file="prompts/phase-1c-recon.md", ) # Forgiveness denied → returncode=2 → mid-turn retry → no session ("id"→invalid) → fails diff --git a/tests/test_phase_1_prompts_threat_model.py b/tests/test_phase_1_prompts_threat_model.py index af0df3a..53683f0 100644 --- a/tests/test_phase_1_prompts_threat_model.py +++ b/tests/test_phase_1_prompts_threat_model.py @@ -13,46 +13,46 @@ def _read_prompt(name: str) -> str: return path.read_text(encoding="utf-8") -def test_phase_1b_recon_prompt_exists() -> None: - path = ROOT / "prompts" / "phase-1b-recon.md" +def test_phase_1c_recon_prompt_exists() -> None: + path = ROOT / "prompts" / "phase-1c-recon.md" assert path.is_file(), f"{path} does not exist" -def test_phase_1b_recon_has_detailed_reconnaissance_title() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_has_detailed_reconnaissance_title() -> None: + content = _read_prompt("phase-1c-recon.md") assert "Detailed Reconnaissance" in content assert "CodeQL-assisted Reconnaissance" not in content -def test_phase_1b_recon_mentions_codeql_as_optional() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_mentions_codeql_as_optional() -> None: + content = _read_prompt("phase-1c-recon.md") assert "optional enrichment" in content -def test_phase_1b_recon_requires_threat_model() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_requires_threat_model() -> None: + content = _read_prompt("phase-1c-recon.md") assert "threat-model.md" in content -def test_phase_1b_recon_references_threat_model_references() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_references_threat_model_references() -> None: + content = _read_prompt("phase-1c-recon.md") assert "threat-model-checklist.md" in content assert "security-controls-and-assets.md" in content -def test_phase_1b_recon_mentions_attacker_capabilities() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_mentions_attacker_capabilities() -> None: + content = _read_prompt("phase-1c-recon.md") assert "attacker capabilities" in content.lower() assert "non-capabilities" in content -def test_phase_1b_recon_mentions_open_questions_and_rerun() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_mentions_open_questions_and_rerun() -> None: + content = _read_prompt("phase-1c-recon.md") assert "Open questions for the user" in content or "open questions" in content.lower() -def test_phase_1b_recon_mentions_abuse_path_themes() -> None: - content = _read_prompt("phase-1b-recon.md") +def test_phase_1c_recon_mentions_abuse_path_themes() -> None: + content = _read_prompt("phase-1c-recon.md") assert "abuse-path" in content.lower() or "Abuse-path" in content @@ -88,7 +88,7 @@ def test_phase_3_uses_attacker_capabilities_in_review() -> None: def test_phase_1_codecome_uses_renamed_prompt_file() -> None: content = (ROOT / "tools" / "codecome" / "phase_1.py").read_text(encoding="utf-8") - assert "phase-1b-recon.md" in content + assert "phase-1c-recon.md" in content assert "phase-1b-codeql-recon.md" not in content diff --git a/tests/test_phase_failure_state_reset.py b/tests/test_phase_failure_state_reset.py index e94bf2d..2442577 100644 --- a/tests/test_phase_failure_state_reset.py +++ b/tests/test_phase_failure_state_reset.py @@ -156,7 +156,7 @@ def fake_resume_prompt(*_args, failure_details=None, **_kw): phase_id="1b", label="test", agent="recon", - prompt_file="prompts/phase-1b-recon.md", + prompt_file="prompts/phase-1c-recon.md", ) assert rc == 2 diff --git a/tools/codecome/phase_1.py b/tools/codecome/phase_1.py index 2835038..d96c12a 100644 --- a/tools/codecome/phase_1.py +++ b/tools/codecome/phase_1.py @@ -11,11 +11,7 @@ from __future__ import annotations -import hashlib import os -import re -import shlex -import subprocess import time from dataclasses import dataclass from pathlib import Path @@ -44,8 +40,6 @@ check_phase_graceful_completion, build_phase_resume_prompt, build_frontmatter_resume_prompt, - build_codeql_plan_resume_prompt, - build_codeql_build_failure_resume_prompt, build_artifact_repair_resume_prompt, ) @@ -157,332 +151,6 @@ def _check_codeql_artifacts(console: Any) -> int: return 0 -def _load_codeql_yaml(path: Path) -> dict[str, Any]: - """Load a CodeQL YAML artifact as a mapping, returning {} on absence/errors.""" - if not path.is_file(): - return {} - try: - from codeql.packs import load_yaml_mapping - - return load_yaml_mapping(path, what=path.name) - except Exception: - return {} - - -def _validate_codeql_plan_for_repair() -> tuple[int, str]: - """Validate the generated CodeQL plan, returning CLI-style (rc, output).""" - plan_path = ROOT / "itemdb" / "notes" / "codeql-plan.yml" - if not plan_path.exists(): - return 0, "" - - try: - from codeql.packs import load_codeql_plan - - plan = load_codeql_plan(plan_path) - except Exception as exc: - return 1, f"itemdb/notes/codeql-plan.yml is invalid: {exc}" - - from codeql.capabilities import is_supported_language, supported_build_modes - - errors: list[str] = [] - for unit in plan.get("analysis_units", []): - if not isinstance(unit, dict): - continue - unit_id = str(unit.get("id", "")) - unit_path = unit.get("path") - analysis_root = ROOT / unit_path if isinstance(unit_path, str) else ROOT - languages = unit.get("languages", []) - if not isinstance(languages, list): - continue - for language in languages: - if not isinstance(language, dict): - continue - language_id = str(language.get("id", "")) - context = f"analysis unit {unit_id!r} language {language_id!r}" - - # Validate build_mode against known language capabilities - build_mode = language.get("build_mode") - if not isinstance(build_mode, str) or not build_mode.strip(): - if is_supported_language(language_id): - errors.append( - f"{context}: missing or invalid build_mode (got {build_mode!r})" - ) - elif is_supported_language(language_id): - allowed = supported_build_modes(language_id) - if build_mode not in allowed: - modes = ", ".join(sorted(allowed)) - errors.append( - f"{context}: unsupported build_mode '{build_mode}' (allowed: {modes})" - ) - if build_mode == "manual" and not ( - isinstance(language.get("build_command"), str) - and str(language.get("build_command", "")).strip() - ): - errors.append( - f"{context}: build_mode is 'manual' but no build_command provided" - ) - - # Validate build_command portability (existing logic) - build_command = language.get("build_command") - if isinstance(build_command, str) and build_command.strip(): - errors.extend(_validate_codeql_build_command(build_command, analysis_root, context)) - - if errors: - return 1, "itemdb/notes/codeql-plan.yml failed CodeQL build-command validation:\n" + "\n".join( - f"- {error}" for error in errors - ) - - return 0, "" - - -def _validate_codeql_build_command(build_command: str, analysis_root: Path, context: str) -> list[str]: - """Return generic portability/safety validation errors for a manual build command.""" - errors: list[str] = [] - if _contains_absolute_tmp(build_command): - errors.append(f"{context}: build_command uses absolute /tmp/; use workspace-relative tmp/ instead") - if str(ROOT) in build_command: - errors.append(f"{context}: build_command embeds the absolute workspace path {ROOT}") - errors.extend(_validate_codeql_build_command_shape(build_command, context)) - - try: - tokens = shlex.split(build_command) - except ValueError as exc: - return errors + [f"{context}: build_command is not shell-parseable: {exc}"] - - for token in tokens: - if not token.endswith(".sh"): - continue - script_path = Path(token) - if not script_path.is_absolute(): - script_path = analysis_root / script_path - if not script_path.is_file(): - errors.append(f"{context}: referenced helper script does not exist from analysis root: {token}") - continue - try: - content = script_path.read_text(encoding="utf-8") - except OSError as exc: - errors.append(f"{context}: referenced helper script cannot be read: {token}: {exc}") - continue - if _contains_absolute_tmp(content): - errors.append(f"{context}: referenced helper script {token} uses absolute /tmp/; use workspace-relative tmp/") - if str(ROOT) in content: - errors.append(f"{context}: referenced helper script {token} embeds the absolute workspace path {ROOT}") - result = subprocess.run(["bash", "-n", str(script_path)], capture_output=True, text=True, timeout=30) - if result.returncode != 0: - detail = (result.stderr or result.stdout).strip() - suffix = f": {detail}" if detail else "" - errors.append(f"{context}: referenced helper script {token} failed bash -n{suffix}") - - return errors - - -def _validate_codeql_build_command_shape(build_command: str, context: str) -> list[str]: - """Reject shell-script constructs because CodeQL tokenizes build_command as argv.""" - errors: list[str] = [] - if "\n" in build_command: - errors.append( - f"{context}: build_command is multi-line; CodeQL tokenizes build_command instead of running it as a shell script. " - "Move multi-step logic into a helper script under tmp/ and invoke it with a single command such as `bash ../../tmp/codeql-build.sh`." - ) - if re.search(r"(^|\s)#", build_command): - errors.append( - f"{context}: build_command contains shell comments; CodeQL passes comments as literal argv tokens. " - "Move comments and multi-step logic into a helper script under tmp/." - ) - for operator in ("&&", ";", "|", "||"): - if operator in build_command: - errors.append( - f"{context}: build_command contains shell operator {operator!r}; CodeQL tokenizes build_command, it is not shell-interpreted. " - "Use a helper script under tmp/ for compound commands." - ) - break - try: - tokens = shlex.split(build_command) - except ValueError: - return errors - if len(tokens) >= 3 and tokens[0] in {"bash", "sh"} and tokens[1] == "-c": - errors.append( - f"{context}: build_command uses `{tokens[0]} -c`; CodeQL command tokenization makes nested shell snippets fragile. " - "Write the snippet to a helper script under tmp/ and invoke that script instead." - ) - return errors - - -def _contains_absolute_tmp(text: str) -> bool: - """Return whether text contains an absolute /tmp path, not a relative tmp/ component.""" - return re.search(r"(^|[\s\"'=])/(tmp)(/|$)", text) is not None - - -def _subphase_should_validate_codeql_plan(phase_id: str) -> bool: - """Return whether a subphase is responsible for producing/editing codeql-plan.yml.""" - return phase_id in {"1a", "1-codeql-repair"} - - -def _codeql_repair_needed(output_dir: Path, plan_path: Path) -> bool: - """Return whether a failed CodeQL run should get one model repair attempt.""" - manifest = _load_codeql_yaml(output_dir / "run-manifest.yml") - status = manifest.get("status") - if status not in {"soft-failed", "failed"}: - return False - - failures = manifest.get("failures", []) - if not isinstance(failures, list): - return False - - from codeql.capabilities import supported_build_modes - - plan = _load_codeql_yaml(plan_path) - has_db_failure = any("Database create failed" in str(f) for f in failures) - for unit in plan.get("analysis_units", []) if isinstance(plan.get("analysis_units"), list) else []: - languages = unit.get("languages", []) if isinstance(unit, dict) else [] - if not isinstance(languages, list): - continue - for language in languages: - if not isinstance(language, dict): - continue - language_id = language.get("id") - build_mode = language.get("build_mode") - if isinstance(language_id, str): - # Effective build_mode: the interpretation the runner would use - effective = build_mode if isinstance(build_mode, str) and build_mode.strip() else "none" - # Plan-level: unsupported effective build_mode - if effective not in supported_build_modes(language_id): - return True - # Runtime: Database create failed with repairable modes - if has_db_failure and effective in {"autobuild", "manual"}: - return True - return False - - -def _latest_codeql_database_log(output_dir: Path) -> Path | None: - logs = [p for p in output_dir.glob("databases/**/log/database-create-*.log") if p.is_file()] - if not logs: - return None - return max(logs, key=lambda p: p.stat().st_mtime) - - -def _codeql_repair_failure_context(output_dir: Path) -> str: - """Return target-agnostic failure context for the repair model.""" - lines: list[str] = [] - manifest = _load_codeql_yaml(output_dir / "run-manifest.yml") - failures = manifest.get("failures", []) - if isinstance(failures, list) and failures: - lines.append("Manifest failures:") - lines.extend(str(failure) for failure in failures[-3:]) - - latest_log = _latest_codeql_database_log(output_dir) - if latest_log is not None: - interesting: list[str] = [] - try: - for line in latest_log.read_text(encoding="utf-8", errors="replace").splitlines(): - if any(marker in line for marker in ("[build-stderr]", "[build-stdout]", "[ERROR]", "Exception caught", "A fatal error")): - interesting.append(line) - except OSError as exc: - interesting.append(f"Failed to read latest database log {latest_log}: {exc}") - if interesting: - lines.append(f"Latest database-create log: {latest_log.relative_to(ROOT) if latest_log.is_relative_to(ROOT) else latest_log}") - lines.extend(interesting[-40:]) - - return "\n".join(lines) if lines else "CodeQL database creation failed; no additional log details were available." - - -def _file_digest(path: Path) -> str | None: - """Return a stable digest for a file, or None when it cannot be read.""" - try: - return hashlib.sha256(path.read_bytes()).hexdigest() - except OSError: - return None - - -def _run_codeql_repair_if_needed( - *, - args: Any, - console: Any, - rendering_ctx: Any, - runner: ServerRunner, - base_url: str, -) -> int: - """ - Ask the model to repair CodeQL build instructions and rerun CodeQL until stable. - - Architecture / Retries Logic: - 1. CodeCome generates a `codeql-plan.yml` in Phase 1a. - 2. We attempt to run CodeQL using that plan. - 3. If CodeQL database creation fails (e.g., due to build errors), this function is - triggered. It allocates a retry budget (`CODEQL_REPAIR_RETRIES`) to use the model - to debug the failure and output a repaired `codeql-plan.yml`. - 4. If the agent itself fails to produce a valid plan (e.g. gets stuck validating its - YAML repeatedly) or the user hits Ctrl+C, we break out of the repair loop. - 5. We NEVER halt the entire pipeline in this function. We simply exhaust the allocated - budget. Only after all repair attempts finish does `_check_codeql_artifacts` finally - enforce the `fail_policy: hard` gate and halt the pipeline if the database is still missing. - """ - from codeql.config import resolve_config as _resolve_codeql_config - - max_retries = int(os.environ.get("CODEQL_REPAIR_RETRIES", "2")) - if max_retries <= 0: - return 0 - - config = _resolve_codeql_config() - plan_path = ROOT / "itemdb" / "notes" / "codeql-plan.yml" - if not _codeql_repair_needed(config.abs_output_dir, plan_path): - return 0 - - out = get_output(console) - msg = "CodeQL database creation failed; asking the model to repair build instructions." - out.warn(msg) - - plan_digest = _file_digest(plan_path) - repair_session_id: str | None = None - repair_prompt: str | None = None - for attempt in range(1, max_retries + 1): - outcome = _run_subphase( - args=args, - console=console, - rendering_ctx=rendering_ctx, - runner=runner, - base_url=base_url, - phase_id="1-codeql-repair", - label=f"CodeQL Build Repair ({attempt}/{max_retries})", - agent="recon", - prompt_file="prompts/phase-1-codeql-repair.md", - existing_session_id=repair_session_id, - initial_prompt=repair_prompt, - return_outcome=True, - ) - assert isinstance(outcome, _SubphaseOutcome) - repair_session_id = outcome.session_id or repair_session_id - - if outcome.returncode == 130: - return 130 # Honor user interrupt immediately - - if outcome.returncode != 0: - # The agent exhausted its internal validation retries or failed fatally. - # Continuing here would just loop the same broken state, so we break - # out of the repair loop to let the phase proceed (and potentially halt). - break - - next_plan_digest = _file_digest(plan_path) - if next_plan_digest == plan_digest: - unchanged_msg = "CodeQL repair completed but did not change itemdb/notes/codeql-plan.yml." - out.warn(unchanged_msg) - plan_digest = next_plan_digest - - _run_codeql(console) - if not _codeql_repair_needed(config.abs_output_dir, plan_path): - return 0 - - repair_prompt = build_codeql_build_failure_resume_prompt( - _codeql_repair_failure_context(config.abs_output_dir) - ) - - if _codeql_repair_needed(config.abs_output_dir, plan_path): - msg = f"CodeQL database creation still fails after {max_retries} repair attempt(s); continuing to artifact gate." - out.warn(msg) - - return 0 - - # --------------------------------------------------------------------------- # Subphase runner # --------------------------------------------------------------------------- @@ -539,7 +207,6 @@ def _run_subphase( iteration_retry_count = 0 frontmatter_retry_count = 0 - codeql_plan_retry_count = 0 artifact_retry_count = 0 attempt_number = 0 last_session_id: str = existing_session_id or "" @@ -649,40 +316,6 @@ def _run_subphase( returncode = 2 if returncode == 0: - if _subphase_should_validate_codeql_plan(phase_id): - validation_rc, validation_output = _validate_codeql_plan_for_repair() - if validation_rc != 0: - max_codeql_plan_retries = 2 - if codeql_plan_retry_count < max_codeql_plan_retries: - codeql_plan_retry_count += 1 - msg = ( - "\n[Auto-Correction] The model completed a turn, but itemdb/notes/codeql-plan.yml " - "failed local CodeQL plan validation. CodeCome will resume the same session and ask " - f"for a minimal YAML/plan repair (retry {codeql_plan_retry_count}/{max_codeql_plan_retries})." - ) - out.warn(msg) - if last_session_id and last_session_id != "id": - prompt = build_codeql_plan_resume_prompt(validation_output) - continue - else: - returncode = 2 - finish_warning = ( - "The model output failed CodeQL plan validation, and CodeCome could not determine " - "a session ID to resume for repair. Treating the subphase as incomplete so the " - "validator output can be reported back with the saved transcript." - ) - else: - returncode = 2 - finish_warning = ( - f"itemdb/notes/codeql-plan.yml still fails validation after {max_codeql_plan_retries} " - "auto-repair attempts. Treating the subphase as incomplete so the validation errors " - "can be reported back." - ) - msg = f"\n[Warning] CodeQL plan validation errors persist after {max_codeql_plan_retries} auto-retries." - out.error(msg) - print(validation_output) - break - from findings.checks_entry import run_frontmatter_validation validation_rc, validation_output = run_frontmatter_validation() @@ -841,25 +474,7 @@ def run_phase_1( if gate_rc != 0: return gate_rc - # ---- CodeQL analysis ---- - _run_codeql(console) - rc = _run_codeql_repair_if_needed( - args=args, - console=console, - rendering_ctx=rendering_ctx, - runner=runner, - base_url=base_url, - ) - if rc != 0: - return rc - rc = _check_codeql_artifacts(console) - if rc != 0: - return rc - - # Snapshot findings immediately before 1b so the warning scope matches 1b. - findings_snapshot = count_findings_snapshot() - - # ---- Phase 1b: Detailed Reconnaissance ---- + # ---- Phase 1b: Sandbox Bootstrap ---- rc = _run_subphase( args=args, console=console, @@ -867,18 +482,27 @@ def run_phase_1( runner=runner, base_url=base_url, phase_id="1b", - label="Detailed Reconnaissance", + label="Sandbox Bootstrap", agent="recon", - prompt_file="prompts/phase-1b-recon.md", + prompt_file="prompts/phase-1b-sandbox.md", ) if rc != 0: return rc - gate_rc = check_phase_1b(console, findings_snapshot=findings_snapshot) + gate_rc = check_phase_1c(console) if gate_rc != 0: return gate_rc - # ---- Phase 1c: Sandbox Bootstrap ---- + # ---- CodeQL analysis (post-sandbox) ---- + _run_codeql(console) + rc = _check_codeql_artifacts(console) + if rc != 0: + return rc + + # Snapshot findings immediately before 1c so the warning scope matches 1c. + findings_snapshot = count_findings_snapshot() + + # ---- Phase 1c: Detailed Reconnaissance ---- rc = _run_subphase( args=args, console=console, @@ -886,14 +510,14 @@ def run_phase_1( runner=runner, base_url=base_url, phase_id="1c", - label="Sandbox Bootstrap", + label="Detailed Reconnaissance", agent="recon", - prompt_file="prompts/phase-1c-sandbox.md", + prompt_file="prompts/phase-1c-recon.md", ) if rc != 0: return rc - gate_rc = check_phase_1c(console) + gate_rc = check_phase_1b(console, findings_snapshot=findings_snapshot) if gate_rc != 0: return gate_rc diff --git a/tools/phases/completion.py b/tools/phases/completion.py index adc547a..983fa8e 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -526,19 +526,6 @@ def build_frontmatter_resume_prompt(phase: str, finding: str | None, validation_ ) -def build_codeql_plan_resume_prompt(validation_output: str) -> str: - return ( - "Your previous run created or edited `itemdb/notes/codeql-plan.yml`, but the file failed local " - "CodeQL plan validation.\n\n" - "Validation errors:\n" - f"{validation_output}\n\n" - "Repair only `itemdb/notes/codeql-plan.yml` with the smallest change needed. Do not redo unrelated " - "reconnaissance or modify target source code. Preserve the existing analysis units, pack selections, " - "manual build commands, and notes unless a reported validation error requires changing them.\n\n" - "Before ending, verify that the repaired plan passes local validation by running `rtk python3 tools/codecome.py check-codeql-plan`." - ) - - def build_artifact_repair_resume_prompt( phase: str, finding: str | None, validation_output: str ) -> str: @@ -558,29 +545,6 @@ def build_artifact_repair_resume_prompt( ) -def build_codeql_build_failure_resume_prompt(validation_output: str) -> str: - return ( - "The repaired `itemdb/notes/codeql-plan.yml` was valid, but the next CodeQL database creation run still " - "failed. Continue the same narrow CodeQL build repair task.\n\n" - "Latest CodeQL failure details:\n" - f"{validation_output}\n\n" - "Repair only `itemdb/notes/codeql-plan.yml` and any helper scripts under workspace-relative `tmp/` or " - "`sandbox/`. Do not modify target source code.\n\n" - "Important execution model: CodeQL runs the manual `build_command` with the current working directory set " - "to the analysis unit source path (`analysis_units[].path`). It is not run from the workspace root, and it " - "is not run from the helper script directory. If a helper script changes directory, it must do so based on " - "the analysis source root or explicit paths that work from that source root.\n\n" - "CodeQL tokenizes `build_command` as argv; it does not execute it as a shell script. Do not put shell " - "control syntax in `build_command`: no `&&`, `||`, `;`, pipes, comments, multi-line commands, or " - "`bash -c` / `sh -c` snippets. If more than one command is needed, create a helper script under " - "workspace-relative `tmp/` and set `build_command` to invoke it, for example `bash ../../tmp/codeql-build.sh`.\n\n" - "Do not use absolute `/tmp/` paths. Use workspace-relative `tmp/` paths. Do not embed this workspace's " - "absolute path in `build_command`; prefer paths relative to the analysis unit source path.\n\n" - "Before ending, verify that the plan is valid YAML, that referenced helper scripts exist, and that shell " - "helpers pass syntax-only validation." - ) - - def build_resume_command(initial_command: list[str], session_id: str, prompt: str) -> list[str]: """Preserve connection/runtime flags needed to reach the original session.""" resume = ["opencode", "run"] diff --git a/tools/phases/phase_1_gates.py b/tools/phases/phase_1_gates.py index 81b10b0..d588952 100644 --- a/tools/phases/phase_1_gates.py +++ b/tools/phases/phase_1_gates.py @@ -255,7 +255,7 @@ def check_phase_1a(console=None, findings_snapshot: dict[str, int] | None = None out.success(f"codeql-plan.yml: {len(units)} analysis unit(s) configured") out.separator(tone=T.SUCCESS) - out.success("Ready to run Phase 1b (Detailed Reconnaissance).") + out.success("Ready to run Phase 1b (Sandbox Bootstrap).") return 0 @@ -330,7 +330,7 @@ def check_phase_1b(console=None, findings_snapshot: dict[str, int] | None = None out.info(f" {status}: +{count}") out.separator(tone=T.SUCCESS) - out.success("Ready to run Phase 1c (Sandbox Bootstrap).") + out.success("Ready to run Phase 1c (Detailed Reconnaissance).") return 0 From 9bf49acd10b3aa0e8fe9cbdb186fa7b1370041db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 18:57:57 +0200 Subject: [PATCH 08/13] docs+prompts: full sweep of Phase 1b/1c references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit prompts/phase-1c-recon.md: updated to Phase 1c, added health-aware CodeQL artifact reading rules (read last-run-manifest.yml, skip signals when health.usable is false, record health summary in threat-model.md). docs/workflow.md: rewritten Phase 1 section with the new 1a→1b→1c order. Phase 1a produces target profile + build model + codeql plan. Phase 1b produces sandbox + sandbox-recipe.yml. Phase 1c produces full recon notes enriched with CodeQL when usable. Makefile help text: Sandbox bootstrap (Phase 1c) → (Phase 1b). .opencode/skills/sandbox-bootstrap/SKILL.md and .opencode/agents/recon.md already reference Phase 1b as sandbox bootstrap — aligned with new order. --- Makefile | 2 +- docs/workflow.md | 51 +++++++++++++++++++++------------------ prompts/phase-1c-recon.md | 32 ++++++++++++------------ 3 files changed, 46 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index e74b78e..c808a7a 100644 --- a/Makefile +++ b/Makefile @@ -106,7 +106,7 @@ help: @printf " $(BOLD)make sandbox-build$(RESET) Build the target inside the sandbox\n" @printf " $(BOLD)make sandbox-test$(RESET) Test the target inside the sandbox\n" @printf "\n" - @printf " $(BOLD)$(CYAN)Sandbox bootstrap (Phase 1c):$(RESET)\n" + @printf " $(BOLD)$(CYAN)Sandbox bootstrap (Phase 1b):$(RESET)\n" @printf "\n" @printf " $(BOLD)make sandbox-list$(RESET) List curated example sandboxes\n" @printf " $(BOLD)make sandbox-inspect ID=python$(RESET) Inspect one example\n" diff --git a/docs/workflow.md b/docs/workflow.md index 4eb61e8..252b995 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -22,14 +22,15 @@ The workflow is intentionally simple in the initial PoC: ## Phase 1: Target reconnaissance + sandbox bootstrap -Phase 1 has two sub-stages, run in the same invocation: +Phase 1 has three sub-stages, run in the same invocation: -- **Phase 1a** — Source reconnaissance. -- **Phase 1b** — Sandbox bootstrap. +- **Phase 1a** — Source reconnaissance and CodeQL planning. +- **Phase 1b** — Sandbox bootstrap and recipe generation. +- **Phase 1c** — Detailed reconnaissance enriched with CodeQL signals. Goal: - Understand the target, then prepare a working validation + Understand the target and prepare a working validation environment under `sandbox/`. Run: @@ -47,25 +48,8 @@ Run: Expected outputs under `itemdb/notes/`: target-profile.md - attack-surface.md build-model.md - execution-model.md - trust-boundaries.md - data-flow.md - validation-model.md - interesting-files.md - security-assumptions.md - -Optional outputs: - - auth-model.md - web-routes.md - cli-commands.md - public-api.md - cwe-map.md - benchmark-notes.md - crypto-usage.md - iac-resources.md + codeql-plan.yml Phase 1a should not normally create findings. @@ -75,9 +59,30 @@ Curated examples live under `templates/sandboxes//`. The recon agent picks one (or `multi-service-compose` for multi-stack repos) and applies it via `tools/sandbox-bootstrap.py`. -Required output: +Required outputs: itemdb/notes/sandbox-plan.md + itemdb/notes/sandbox-recipe.yml + +### Phase 1c: detailed reconnaissance + +After the sandbox is bootstrapped and CodeQL has run, Phase 1c +produces the full set of detailed reconnaissance notes: + + itemdb/notes/attack-surface.md + itemdb/notes/execution-model.md + itemdb/notes/trust-boundaries.md + itemdb/notes/data-flow.md + itemdb/notes/validation-model.md + itemdb/notes/interesting-files.md + itemdb/notes/security-assumptions.md + itemdb/notes/threat-model.md + itemdb/notes/file-risk-index.yml + +If `itemdb/codeql/last-run-manifest.yml` exists and ``health.usable`` +is true, Phase 1c incorporates CodeQL signal files into the above +notes. If CodeQL did not produce usable results, recon proceeds +with source analysis alone. Generated artifacts (git-ignored): diff --git a/prompts/phase-1c-recon.md b/prompts/phase-1c-recon.md index 65fbd60..17494a8 100644 --- a/prompts/phase-1c-recon.md +++ b/prompts/phase-1c-recon.md @@ -1,8 +1,8 @@ -# CodeCome Phase 1b: Detailed Reconnaissance +# CodeCome Phase 1c: Detailed Reconnaissance -You are performing CodeCome **Phase 1b** — the second sub-stage of Phase 1. +You are performing CodeCome **Phase 1c** — the third and final sub-stage of Phase 1. -Phase 1b produces detailed reconnaissance notes. If CodeQL artifacts are available, use them as optional enrichment. If they are absent or CodeQL was disabled, continue with source-only reconnaissance. Phase 1b must complete regardless of CodeQL availability. +Phase 1c produces detailed reconnaissance notes. If CodeQL artifacts are available, use them as optional enrichment. If they are absent or CodeQL was disabled, continue with source-only reconnaissance. Phase 1c must complete regardless of CodeQL availability. ## Required reading @@ -26,21 +26,23 @@ Also read the Phase 1a outputs: ## CodeQL artifacts (conditional) -If CodeQL analysis was performed, the following artifacts may exist. Treat them as reconnaissance evidence, not proof of vulnerability: +If CodeQL analysis was performed, read `itemdb/codeql/last-run-manifest.yml`. +The manifest contains a ``health`` block with ``usable`` and ``classification`` keys. -- `itemdb/codeql/run-manifest.yml` — CodeQL run outcome and metadata. -- `itemdb/codeql/normalized/alerts.yml` — Normalized CodeQL alerts with source/sink/flow. -- `itemdb/codeql/normalized/file-signals.yml` — Per-file CodeQL signal scores. -- `itemdb/codeql/codeql-summary.md` — Human-readable CodeQL summary. +- **If ``health.usable`` is true**, the following artifacts may exist under + ``itemdb/codeql/runs//``. Treat them as reconnaissance evidence, not proof + of vulnerability: + - ``normalized/alerts.yml`` — Normalized CodeQL alerts with source/sink/flow. + - ``normalized/file-signals.yml`` — Per-file CodeQL signal scores. + - ``codeql-summary.md`` — Human-readable CodeQL summary. -If these files exist: +- **If ``health.usable`` is false**, do **not** import CodeQL alerts or file-signals + into ``file-risk-index.yml`` or ``interesting-files.md``. Record the health summary + (classification + reason) in ``threat-model.md`` under a new ``# CodeQL health`` + heading so later phases know CodeQL was attempted but did not produce usable signals. -1. Read them and extract relevant signals. -2. Use alert data to enrich your understanding of potential sources, sinks, and trust-boundary crossings. -3. Use file-signals to prioritize files for the file-risk-index. -4. Do not treat CodeQL alerts as confirmed vulnerabilities. They are static-analysis hints. - -If these files do not exist, proceed with reconnaissance based on source analysis alone. Phase 1b must complete regardless of CodeQL availability. +If CodeQL was not run or the manifest is absent, proceed with reconnaissance based +on source analysis alone. Phase 1c must complete regardless of CodeQL availability. ## Target From 7058e5cd1a20243b301a6d41d171f42324e1398a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 19:04:28 +0200 Subject: [PATCH 09/13] chore: delete obsolete repair files, remove skipped tests Delete prompts/phase-1-codeql-repair.md and tests/test_phase_1_codeql_plan_repair.py. Remove three skipped repair-related tests from test_codecome_check_codeql.py. Full test suite: 730 passed, 0 failures. --- prompts/phase-1-codeql-repair.md | 81 ---- tests/test_codecome_check_codeql.py | 107 ----- tests/test_phase_1_codeql_plan_repair.py | 473 ----------------------- 3 files changed, 661 deletions(-) delete mode 100644 prompts/phase-1-codeql-repair.md delete mode 100644 tests/test_phase_1_codeql_plan_repair.py diff --git a/prompts/phase-1-codeql-repair.md b/prompts/phase-1-codeql-repair.md deleted file mode 100644 index a31fc61..0000000 --- a/prompts/phase-1-codeql-repair.md +++ /dev/null @@ -1,81 +0,0 @@ -# CodeCome Phase 1: CodeQL Build Repair - -You are performing a narrow repair step after Phase 1a generated a CodeQL plan and the CodeQL database creation step failed. - -Your task is to make the smallest durable change needed so CodeQL can create a database on the next run. - -## Required Reading - -Read these files if they exist: - -- `AGENTS.md` -- `itemdb/notes/target-profile.md` -- `itemdb/notes/build-model.md` -- `itemdb/notes/codeql-plan.yml` -- `itemdb/codeql/run-manifest.yml` -- `itemdb/codeql/codeql-summary.md` - -Also inspect relevant CodeQL database logs under: - -- `itemdb/codeql/databases/**/log/*.log` - -Focus on the last useful `[build-stderr]`, `[build-stdout]`, `ERROR`, and `Exception caught` lines. - -## Goal - -Repair `itemdb/notes/codeql-plan.yml` so the next CodeQL run can create databases. - -For C/C++, Go, and Swift, do not use `build_mode: none`. Use only `manual` or `autobuild` as supported by the CodeQL integration. - -If autobuild failed because no supported root build system was detected, prefer `build_mode: manual` with a concrete `build_command`. - -## Allowed Writes - -You may write only: - -- `itemdb/notes/codeql-plan.yml` -- helper scripts under `tmp/` -- helper scripts under `sandbox/` -- a short run summary under `runs/` if useful - -Do not write helper scripts under `tools/`. - -Do not write helper scripts under `itemdb/`. - -Do not modify files under `src/`. - -Do not modify project orchestration or configuration files. - -If the manual command is simple enough, put it directly in `build_command` instead of creating a helper script. - -## Build Command Rules - -- CodeQL runs the manual `build_command` from the analysis unit source path. -- CodeQL does not run `build_command` from the workspace root or from the helper script directory. -- CodeQL tokenizes `build_command` as argv; it does not execute it as a shell script. -- Do not put shell control syntax in `build_command`: no `&&`, `||`, `;`, pipes, comments, multi-line commands, or `bash -c` / `sh -c` snippets. -- Good direct commands: `make`, `make -C challenge`, `gcc main.c -o app`. -- If more than one command is needed, create a helper script under workspace-relative `tmp/` and set `build_command` to invoke it from the analysis unit source path, for example `bash ../../tmp/codeql-build.sh`. -- Prefer commands that are deterministic and non-interactive. -- Prefer commands that avoid modifying `src/` when possible. -- If existing target build files naturally write object files or binaries into `src/`, document that limitation in the `notes` field. -- Use workspace-relative helper script paths that work from the CodeQL source path. -- Never use absolute `/tmp/` paths. Use workspace-relative `tmp/` paths for scratch/build output. -- Do not embed this workspace's absolute path in `build_command`; prefer paths relative to the analysis unit source path. -- If a helper script changes directory, it must change to the analysis unit source path or to a path explicitly derived from that execution model, not blindly to the helper script directory. -- Keep the plan schema and existing pack selections intact unless a minimal change requires otherwise. - -## Output Requirements - -Make the repair directly in files. At the end, summarize: - -- why the previous CodeQL build failed, -- what changed in `itemdb/notes/codeql-plan.yml`, -- any helper script created, -- the exact manual build command CodeQL will run next. - -Before ending, validate that `itemdb/notes/codeql-plan.yml` is valid and follows CodeCome rules by running: - - rtk python3 tools/codecome.py check-codeql-plan - -If validation fails, repair only the reported issue before summarizing. diff --git a/tests/test_codecome_check_codeql.py b/tests/test_codecome_check_codeql.py index 29d8794..5c6de7d 100644 --- a/tests/test_codecome_check_codeql.py +++ b/tests/test_codecome_check_codeql.py @@ -7,8 +7,6 @@ import yaml -import pytest - ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(ROOT / "tools")) @@ -173,108 +171,3 @@ def test_check_codeql_artifacts_failed_hard_policy_returns_1(tmp_path: Path, cap rendering_dispatch.reset_rendering_context_cache() assert rc == 1 - - -@pytest.mark.skip(reason="CodeQL plan repair removed in Phase 1 refactor") -def test_codeql_repair_needed_for_autobuild_database_failure(tmp_path: Path) -> None: - _ensure_codecome_package() - from codecome.phase_1 import _codeql_repair_needed - - output_dir = tmp_path / "itemdb" / "codeql" - output_dir.mkdir(parents=True) - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump( - { - "status": "soft-failed", - "failures": ["Database create failed for c-cpp:\nNo supported build system detected."], - } - ), - encoding="utf-8", - ) - plan_path = tmp_path / "itemdb" / "notes" / "codeql-plan.yml" - plan_path.parent.mkdir(parents=True) - plan_path.write_text( - yaml.safe_dump( - { - "schema_version": 1, - "analysis_units": [ - { - "id": "native", - "path": "./src/native", - "languages": [ - {"id": "c-cpp", "build_mode": "autobuild", "packs": ["official"]} - ], - } - ], - } - ), - encoding="utf-8", - ) - - assert _codeql_repair_needed(output_dir, plan_path) is True - - -@pytest.mark.skip(reason="CodeQL plan repair removed in Phase 1 refactor") -def test_codeql_repair_needed_after_manual_database_failure(tmp_path: Path) -> None: - _ensure_codecome_package() - from codecome.phase_1 import _codeql_repair_needed - - output_dir = tmp_path / "itemdb" / "codeql" - output_dir.mkdir(parents=True) - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump( - { - "status": "soft-failed", - "failures": ["Database create failed for c-cpp:\nmanual build failed."], - } - ), - encoding="utf-8", - ) - plan_path = tmp_path / "itemdb" / "notes" / "codeql-plan.yml" - plan_path.parent.mkdir(parents=True) - plan_path.write_text( - yaml.safe_dump( - { - "schema_version": 1, - "analysis_units": [ - { - "id": "native", - "path": "./src/native", - "languages": [ - {"id": "c-cpp", "build_mode": "manual", "build_command": "make", "packs": ["official"]} - ], - } - ], - } - ), - encoding="utf-8", - ) - - assert _codeql_repair_needed(output_dir, plan_path) is True - - -@pytest.mark.skip(reason="Pipeline order changed in Phase 1 refactor; needs update in commit 6") -def test_phase_1_pipeline_structure() -> None: - _ensure_codecome_package() - import codecome.phase_1 as p1 - - saved = rendering_dispatch.HAVE_RICH - rendering_dispatch.HAVE_RICH = False - rendering_dispatch.reset_rendering_context_cache() - try: - with patch.object(p1, "count_findings_snapshot", return_value={}), \ - patch.object(p1, "_run_subphase", return_value=0) as subphase, \ - patch.object(p1, "check_phase_1a", return_value=0), \ - patch.object(p1, "check_phase_1b", return_value=0), \ - patch.object(p1, "check_phase_1c", return_value=0), \ - patch.object(p1, "_run_codeql", return_value=None) as run_codeql, \ - patch.object(p1, "_run_codeql_repair_if_needed", return_value=0), \ - patch.object(p1, "_check_codeql_artifacts", return_value=0): - rc = p1.run_phase_1(object(), None, None, object(), "http://127.0.0.1") - finally: - rendering_dispatch.HAVE_RICH = saved - rendering_dispatch.reset_rendering_context_cache() - - assert rc == 0 - assert run_codeql.call_count == 1 - assert subphase.call_count == 3 diff --git a/tests/test_phase_1_codeql_plan_repair.py b/tests/test_phase_1_codeql_plan_repair.py deleted file mode 100644 index 90a5b8e..0000000 --- a/tests/test_phase_1_codeql_plan_repair.py +++ /dev/null @@ -1,473 +0,0 @@ -from __future__ import annotations - -import sys -from pathlib import Path -from types import SimpleNamespace -from unittest.mock import patch - -import pytest -import yaml - -pytestmark = pytest.mark.skip(reason="CodeQL plan repair removed in Phase 1 refactor; file deleted in commit 8") - - -ROOT = Path(__file__).resolve().parents[1] -sys.path.insert(0, str(ROOT / "tools")) - -import rendering.dispatch as rendering_dispatch -from events.phase_loop import RunResult - - -def _write_invalid_plan(root: Path) -> None: - plan = root / "itemdb" / "notes" / "codeql-plan.yml" - plan.parent.mkdir(parents=True, exist_ok=True) - plan.write_text( - "schema_version: 2\n" - "analysis_units:\n" - " - id: native\n" - " path: ./src/native\n" - " languages:\n" - " - id: c-cpp\n" - " packs:\n" - " - official\n" - "- outdented-note\n", - encoding="utf-8", - ) - - -def _write_valid_plan(root: Path) -> None: - plan = root / "itemdb" / "notes" / "codeql-plan.yml" - plan.parent.mkdir(parents=True, exist_ok=True) - plan.write_text( - "schema_version: 2\n" - "analysis_units:\n" - " - id: native\n" - " path: ./src/native\n" - " languages:\n" - " - id: c-cpp\n" - " build_mode: autobuild\n" - " packs:\n" - " - official\n" - "notes:\n" - " - repaired\n", - encoding="utf-8", - ) - - -def _runtime_config() -> SimpleNamespace: - return SimpleNamespace( - model="test-model", - variant=None, - thinking_on=False, - model_source="test", - variant_source="test", - thinking_source="test", - ) - - -def _runner() -> SimpleNamespace: - return SimpleNamespace(info=SimpleNamespace(password="")) - - -def _ok_result() -> RunResult: - return RunResult(any_step_finish_seen=True, step_finish_count=1, last_finish_reason="stop") - - -def _write_manual_plan(root: Path, build_command: str) -> None: - plan = root / "itemdb" / "notes" / "codeql-plan.yml" - plan.parent.mkdir(parents=True, exist_ok=True) - plan.write_text( - yaml.safe_dump( - { - "schema_version": 2, - "analysis_units": [ - { - "id": "native", - "path": "./src/native", - "languages": [ - { - "id": "c-cpp", - "build_mode": "manual", - "build_command": build_command, - "packs": ["official"], - } - ], - } - ], - }, - sort_keys=False, - ), - encoding="utf-8", - ) - - -def test_subphase_resumes_same_session_to_repair_invalid_codeql_plan(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - transcript = tmp_path / "transcript.jsonl" - calls: list[tuple[str, str | None]] = [] - - def fake_run_single_attempt(_args, _console, prompt, *_rest, existing_session_id=None, **_kwargs): - calls.append((prompt, existing_session_id)) - if len(calls) == 1: - _write_invalid_plan(tmp_path) - return 0, "sess-1", _ok_result(), transcript - assert existing_session_id == "sess-1" - assert "itemdb/notes/codeql-plan.yml" in prompt - assert "Validation errors:" in prompt - _write_valid_plan(tmp_path) - return 0, "sess-1", _ok_result(), transcript - - saved_rich = rendering_dispatch.HAVE_RICH - rendering_dispatch.HAVE_RICH = False - rendering_dispatch.reset_rendering_context_cache() - try: - with patch.object(p1, "ROOT", tmp_path), \ - patch.object(p1, "load_prompt", return_value="initial prompt"), \ - patch.object(p1, "resolve_runtime_config", return_value=_runtime_config()), \ - patch.object(p1, "configure_rendering"), \ - patch.object(p1, "_run_single_attempt", side_effect=fake_run_single_attempt), \ - patch("findings.checks_entry.run_frontmatter_validation", return_value=(0, "")): - rc = p1._run_subphase( - args=object(), - console=None, - rendering_ctx=None, - runner=_runner(), - base_url="http://127.0.0.1", - phase_id="1a", - label="Target Profile", - agent="recon", - prompt_file="prompts/phase-1a-profile.md", - ) - finally: - rendering_dispatch.HAVE_RICH = saved_rich - rendering_dispatch.reset_rendering_context_cache() - - assert rc == 0 - assert len(calls) == 2 - assert calls[1][1] == "sess-1" - - -def test_subphase_fails_after_codeql_plan_auto_repair_retries_exhausted(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - transcript = tmp_path / "transcript.jsonl" - - def fake_run_single_attempt(*_args, **_kwargs): - _write_invalid_plan(tmp_path) - return 0, "sess-1", _ok_result(), transcript - - saved_rich = rendering_dispatch.HAVE_RICH - rendering_dispatch.HAVE_RICH = False - rendering_dispatch.reset_rendering_context_cache() - try: - with patch.object(p1, "ROOT", tmp_path), \ - patch.object(p1, "load_prompt", return_value="initial prompt"), \ - patch.object(p1, "resolve_runtime_config", return_value=_runtime_config()), \ - patch.object(p1, "configure_rendering"), \ - patch.object(p1, "_run_single_attempt", side_effect=fake_run_single_attempt) as run_attempt, \ - patch("findings.checks_entry.run_frontmatter_validation", return_value=(0, "")): - rc = p1._run_subphase( - args=object(), - console=None, - rendering_ctx=None, - runner=_runner(), - base_url="http://127.0.0.1", - phase_id="1-codeql-repair", - label="CodeQL Build Repair", - agent="recon", - prompt_file="prompts/phase-1-codeql-repair.md", - ) - finally: - rendering_dispatch.HAVE_RICH = saved_rich - rendering_dispatch.reset_rendering_context_cache() - - assert rc == 2 - assert run_attempt.call_count == 3 - - -def test_codeql_plan_validation_rejects_absolute_tmp_in_build_command(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_manual_plan(tmp_path, "bash -c 'mkdir -p /tmp/codeql-build'") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "absolute /tmp/" in output - - -def test_codeql_plan_validation_rejects_shell_operators_in_build_command(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_manual_plan(tmp_path, "mkdir -p out && gcc main.c -o out/app") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "shell operator" in output - assert "helper script" in output - - -def test_codeql_plan_validation_rejects_multiline_and_comments_in_build_command(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_manual_plan(tmp_path, "# build\nmkdir -p out\ngcc main.c -o out/app") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "multi-line" in output - assert "shell comments" in output - - -def test_codeql_plan_validation_rejects_bash_c_build_command(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_manual_plan(tmp_path, "bash -c 'mkdir -p out && gcc main.c -o out/app'") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "bash -c" in output - - -def test_codeql_plan_validation_checks_helper_from_analysis_root(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - analysis_root = tmp_path / "src" / "native" - helper = tmp_path / "tmp" / "codeql-build.sh" - analysis_root.mkdir(parents=True) - helper.parent.mkdir(parents=True) - helper.write_text("#!/usr/bin/env bash\necho ok\n", encoding="utf-8") - _write_manual_plan(tmp_path, "bash ../../tmp/codeql-build.sh") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 0, output - - -def test_codeql_plan_validation_rejects_missing_helper_from_analysis_root(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - (tmp_path / "src" / "native").mkdir(parents=True) - _write_manual_plan(tmp_path, "bash tmp/codeql-build.sh") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "referenced helper script does not exist from analysis root" in output - - -def test_codeql_repair_loop_resumes_same_session_after_failed_rerun(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - output_dir = tmp_path / "itemdb" / "codeql" - output_dir.mkdir(parents=True) - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump({"status": "soft-failed", "failures": ["Database create failed for c-cpp:\nautobuild failed"]}), - encoding="utf-8", - ) - _write_manual_plan(tmp_path, "make") - config = SimpleNamespace(abs_output_dir=output_dir) - calls: list[tuple[str | None, str | None]] = [] - - def fake_subphase(**kwargs): - calls.append((kwargs.get("existing_session_id"), kwargs.get("initial_prompt"))) - if len(calls) == 1: - return p1._SubphaseOutcome(0, "repair-session", tmp_path / "one.jsonl") - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump({"status": "completed", "failures": []}), - encoding="utf-8", - ) - return p1._SubphaseOutcome(0, "repair-session", tmp_path / "two.jsonl") - - def fake_run_codeql(_console): - if len(calls) == 1: - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump({"status": "soft-failed", "failures": ["Database create failed for c-cpp:\nmanual failed"]}), - encoding="utf-8", - ) - return None - - saved_rich = rendering_dispatch.HAVE_RICH - rendering_dispatch.HAVE_RICH = False - rendering_dispatch.reset_rendering_context_cache() - try: - with patch.object(p1, "ROOT", tmp_path), \ - patch("codeql.config.resolve_config", return_value=config), \ - patch.object(p1, "_run_subphase", side_effect=fake_subphase), \ - patch.object(p1, "_run_codeql", side_effect=fake_run_codeql): - rc = p1._run_codeql_repair_if_needed( - args=object(), - console=None, - rendering_ctx=None, - runner=_runner(), - base_url="http://127.0.0.1", - ) - finally: - rendering_dispatch.HAVE_RICH = saved_rich - rendering_dispatch.reset_rendering_context_cache() - - assert rc == 0 - assert len(calls) == 2 - assert calls[0] == (None, None) - assert calls[1][0] == "repair-session" - assert calls[1][1] is not None - assert "Latest CodeQL failure details" in calls[1][1] - - -def test_codeql_repair_loop_does_not_block_after_retries_exhausted(tmp_path: Path, monkeypatch) -> None: - import codecome.phase_1 as p1 - - output_dir = tmp_path / "itemdb" / "codeql" - output_dir.mkdir(parents=True) - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump({"status": "soft-failed", "failures": ["Database create failed for c-cpp:\nautobuild failed"]}), - encoding="utf-8", - ) - _write_manual_plan(tmp_path, "make") - config = SimpleNamespace(abs_output_dir=output_dir) - - def fake_subphase(**_kwargs): - return p1._SubphaseOutcome(0, "repair-session", tmp_path / "repair.jsonl") - - def fake_run_codeql(_console): - (output_dir / "run-manifest.yml").write_text( - yaml.safe_dump({"status": "soft-failed", "failures": ["Database create failed for c-cpp:\nmanual failed"]}), - encoding="utf-8", - ) - return None - - monkeypatch.setenv("CODEQL_REPAIR_RETRIES", "1") - saved_rich = rendering_dispatch.HAVE_RICH - rendering_dispatch.HAVE_RICH = False - rendering_dispatch.reset_rendering_context_cache() - try: - with patch.object(p1, "ROOT", tmp_path), \ - patch("codeql.config.resolve_config", return_value=config), \ - patch.object(p1, "_run_subphase", side_effect=fake_subphase), \ - patch.object(p1, "_run_codeql", side_effect=fake_run_codeql): - rc = p1._run_codeql_repair_if_needed( - args=object(), - console=None, - rendering_ctx=None, - runner=_runner(), - base_url="http://127.0.0.1", - ) - finally: - rendering_dispatch.HAVE_RICH = saved_rich - rendering_dispatch.reset_rendering_context_cache() - - assert rc == 0 - - -def _write_plan_with_build_mode(root: Path, build_mode: str | None, build_command: str | None = None) -> None: - plan = root / "itemdb" / "notes" / "codeql-plan.yml" - plan.parent.mkdir(parents=True, exist_ok=True) - import yaml as _yaml - - data: dict = { - "schema_version": 2, - "analysis_units": [ - { - "id": "native", - "path": "./src/native", - "languages": [ - {"id": "c-cpp", "packs": ["official"]}, - ], - } - ], - } - if build_mode is not None: - data["analysis_units"][0]["languages"][0]["build_mode"] = build_mode # type: ignore[index] - if build_command is not None: - data["analysis_units"][0]["languages"][0]["build_command"] = build_command # type: ignore[index] - plan.write_text(_yaml.safe_dump(data, sort_keys=False), encoding="utf-8") - - -def test_codeql_plan_validation_rejects_unsupported_build_mode(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_plan_with_build_mode(tmp_path, "none") - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "unsupported build_mode" in output - assert "'none'" in output - - -def test_codeql_plan_validation_rejects_missing_build_mode(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_plan_with_build_mode(tmp_path, None) - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "missing or invalid build_mode" in output - - -def test_codeql_plan_validation_rejects_manual_without_build_command(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - _write_plan_with_build_mode(tmp_path, "manual", build_command=None) - - with patch.object(p1, "ROOT", tmp_path): - rc, output = p1._validate_codeql_plan_for_repair() - - assert rc == 1 - assert "no build_command provided" in output - - -def test_phase1c_accepts_no_step_finish_when_artifacts_are_fresh(tmp_path: Path) -> None: - import codecome.phase_1 as p1 - - transcript = tmp_path / "tmp" / "last-phase-1c-no-finding-attempt-1.jsonl" - transcript.parent.mkdir(parents=True) - transcript.write_text("", encoding="utf-8") - - args = SimpleNamespace(phase="1", finding=None, label="sandbox", debug=False) - calls = [] - - def fake_run_single_attempt(*_args, **_kwargs): - calls.append(_kwargs) - return 0, "session-1", RunResult(any_step_finish_seen=False), transcript - - saved_rich = rendering_dispatch.HAVE_RICH - rendering_dispatch.HAVE_RICH = False - rendering_dispatch.reset_rendering_context_cache() - try: - with patch.object(p1, "ROOT", tmp_path), \ - patch.object(p1, "load_prompt", return_value="prompt"), \ - patch.object(p1, "resolve_runtime_config", return_value=_runtime_config()), \ - patch.object(p1, "configure_rendering", return_value=None), \ - patch.object(p1, "_run_single_attempt", side_effect=fake_run_single_attempt), \ - patch.object(p1, "check_phase_graceful_completion", return_value=(True, [])), \ - patch("findings.checks_entry.run_frontmatter_validation", return_value=(0, "")): - rc = p1._run_subphase( - args=args, - console=None, - rendering_ctx=None, - runner=_runner(), - base_url="http://127.0.0.1", - phase_id="1c", - label="Sandbox", - agent="recon", - prompt_file="prompts/phase-1c-sandbox.md", - ) - finally: - rendering_dispatch.HAVE_RICH = saved_rich - rendering_dispatch.reset_rendering_context_cache() - - assert rc == 0 - assert len(calls) == 1 \ No newline at end of file From b7505f4148e3c58347bc71d68fb6636f92c8204d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 23:00:40 +0200 Subject: [PATCH 10/13] chore: add prompt grep guard + health-aware one-liners tests/test_prompts.py: add test_no_stale_phase_1b_1c_references_in_prompts that greps all prompt files for contradictory Phase 1b/1c patterns, plus self-consistency tests for phase-1b-sandbox.md and phase-1c-recon.md. prompts/README.md: update filenames and descriptions to new 1b/1c order. prompts/sweep.md: add 'CodeQL signals (conditional)' section instructing the sweep to check health.usable before importing CodeQL signals. prompts/phase-6-report.md: add health-aware rule in 'Reporting rules' section instructing the reporter to check last-run-manifest.yml health. --- prompts/README.md | 10 +++---- prompts/phase-6-report.md | 4 +++ prompts/sweep.md | 8 ++++++ tests/test_prompts.py | 55 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 70 insertions(+), 7 deletions(-) diff --git a/prompts/README.md b/prompts/README.md index 5de3523..2eb2223 100644 --- a/prompts/README.md +++ b/prompts/README.md @@ -7,8 +7,8 @@ Each prompt corresponds to one workflow phase. ## Prompts phase-1a-profile.md - phase-1b-recon.md - phase-1c-sandbox.md + phase-1b-sandbox.md + phase-1c-recon.md phase-2-audit.md phase-3-review.md phase-4-validate.md @@ -41,15 +41,15 @@ If you prefer direct invocation: ### Phase 1: reconnaissance -Use `make phase-1` to run the full reconnaissance workflow (Phase 1a, CodeQL enrichment, Phase 1b, Phase 1c). +Use `make phase-1` to run the full reconnaissance workflow (Phase 1a, Phase 1b sandbox, CodeQL enrichment, Phase 1c). make phase-1 Or invoke subphases manually: opencode run --agent recon "$(cat prompts/phase-1a-profile.md)" - opencode run --agent recon "$(cat prompts/phase-1b-recon.md)" - opencode run --agent recon "$(cat prompts/phase-1c-sandbox.md)" + opencode run --agent recon "$(cat prompts/phase-1b-sandbox.md)" + opencode run --agent recon "$(cat prompts/phase-1c-recon.md)" Creates or updates target reconnaissance notes under: diff --git a/prompts/phase-6-report.md b/prompts/phase-6-report.md index c448cad..f3bf09e 100644 --- a/prompts/phase-6-report.md +++ b/prompts/phase-6-report.md @@ -67,6 +67,10 @@ Use this structure: ## Reporting rules +- If `itemdb/codeql/last-run-manifest.yml` exists, check its `health.usable` key. + Only reference CodeQL signals when `health.usable` is `true`. When CodeQL did not + produce usable output, record the health classification and reason in the + Methodology section but do not claim zero CodeQL findings. - Place exploited findings (with demonstrated impact) above confirmed findings. - Clearly separate confirmed findings from unvalidated hypotheses. - Clearly label rejected findings as rejected. diff --git a/prompts/sweep.md b/prompts/sweep.md index d37c526..913e5bc 100644 --- a/prompts/sweep.md +++ b/prompts/sweep.md @@ -19,6 +19,14 @@ Read the following files (all paths are relative to the project/workspace root): Use additional target-specific skills only if they clearly apply. +## CodeQL signals (conditional) + +If `itemdb/codeql/last-run-manifest.yml` exists, read its `health` block. +When `health.usable` is `true`, use `itemdb/codeql/normalized/alerts.yml` +and `file-signals.yml` as enrichment for the target file. When +`health.usable` is `false`, the CodeQL run did not produce trustworthy +output — do not import its signals. + ## Target file Analyze this target file: diff --git a/tests/test_prompts.py b/tests/test_prompts.py index 69ca707..25963c8 100644 --- a/tests/test_prompts.py +++ b/tests/test_prompts.py @@ -4,13 +4,64 @@ def test_prompt_safeguards(): """Ensure prompts enforce CLI usage and frontmatter validation.""" prompts_dir = Path("prompts") - + # Phase 3, 4, 5 should enforce using make findings-move for phase in ["phase-3-review.md", "phase-4-validate.md", "phase-5-exploit.md"]: content = (prompts_dir / phase).read_text(encoding="utf-8") assert "make findings-move" in content, f"{phase} is missing make findings-move instruction" - + # All modification phases should run frontmatter check for phase in ["phase-2-audit.md", "phase-3-review.md", "phase-4-validate.md", "phase-5-exploit.md"]: content = (prompts_dir / phase).read_text(encoding="utf-8") assert "make frontmatter" in content, f"{phase} is missing make frontmatter instruction" + + +def test_no_stale_phase_1b_1c_references_in_prompts(): + """Ensure no prompt files reference the old Phase 1b/1c ordering. + + After the refactor: + - Phase 1b = Sandbox Bootstrap + - Phase 1c = Detailed Reconnaissance + Old contradictory patterns must not appear outside the canonical prompt files. + """ + prompts_dir = Path("prompts") + + # Stale patterns that indicate the old order (1b = recon, 1c = sandbox) + stale_patterns = [ + (r"Phase 1b.*Detailed Reconnaissance", "Phase 1b should not describe Detailed Reconnaissance (now 1c)"), + (r"Phase 1b.*reconnaissance", "Phase 1b should not be described as reconnaissance (now 1c)"), + (r"Phase 1c.*Sandbox Bootstrap", "Phase 1c should not describe Sandbox Bootstrap (now 1b)"), + (r"Phase 1c.*sandbox bootstrap", "Phase 1c should not describe sandbox bootstrap (now 1b)"), + (r"\b1b\b.*\brecon\b", "phase-1b should not reference recon (now 1c)"), + ] + + # The canonical files may reference the old names in their key/value style + # but NOT in a way that mixes the phase number with the wrong task. + exceptions: dict[str, list[str]] = { + "phase-1b-sandbox.md": [], + "phase-1c-recon.md": [], + } + + for prompt_file in sorted(prompts_dir.glob("*.md")): + content = prompt_file.read_text(encoding="utf-8") + for pattern, reason in stale_patterns: + if prompt_file.name in exceptions and pattern in exceptions[prompt_file.name]: + continue + if re.search(pattern, content, re.IGNORECASE): + raise AssertionError( + f"{prompt_file.name}: {reason}. Found pattern: {pattern!r}" + ) + + +def test_phase_1b_sandbox_prompt_is_self_consistent(): + content = Path("prompts/phase-1b-sandbox.md").read_text(encoding="utf-8") + assert "Phase 1b" in content + assert "Sandbox Bootstrap" in content + assert "second sub-stage" in content.lower() # sandbox is the second sub-phase + + +def test_phase_1c_recon_prompt_is_self_consistent(): + content = Path("prompts/phase-1c-recon.md").read_text(encoding="utf-8") + assert "Phase 1c" in content + assert "Detailed Reconnaissance" in content + assert "third and final" in content # recon is the third sub-phase From 4de38ff4bbb18559e12f4a1d7aab7357b4af2778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sat, 6 Jun 2026 23:45:10 +0200 Subject: [PATCH 11/13] fix(codeql): allow recipe-backed manual builds --- prompts/phase-1a-profile.md | 7 ++-- templates/codeql-plan.yml | 4 +- tests/test_codeql_runner.py | 50 +++++++++++++++++++----- tests/test_phase_1_gates.py | 72 +++++++++++++++++++++++++++++++++++ tools/codeql/capabilities.py | 1 - tools/codeql/runner.py | 37 +++++++++++++++++- tools/phases/phase_1_gates.py | 4 +- 7 files changed, 157 insertions(+), 18 deletions(-) diff --git a/prompts/phase-1a-profile.md b/prompts/phase-1a-profile.md index 12afc00..ebfd326 100644 --- a/prompts/phase-1a-profile.md +++ b/prompts/phase-1a-profile.md @@ -65,7 +65,7 @@ Rules: - Set `analysis_units[].path` to the real source path under `./src` for that unit. Do not use CodeQL-generated helper paths such as `_codeql_detected_source_root`. - Use one `analysis_units` entry for a single-project repository and multiple entries for monorepos or mixed stacks. - Only include languages you have detected with **HIGH** or **MEDIUM** confidence. -- For compiled languages (c-cpp, go, csharp, java-kotlin, swift) set `analysis_units[].sandbox_build_target` to the `build_targets[].id` from `sandbox-recipe.yml` that provides the build command for this unit. If the recipe has not been generated yet (this is Phase 1a), pick a sensible id such as `root` — Phase 1b will flesh out the recipe and the id can be updated if needed. +- For compiled languages (c-cpp, go, csharp, java-kotlin) set `analysis_units[].sandbox_build_target` to the `build_targets[].id` from `sandbox-recipe.yml` that provides the build command for this unit. If the recipe has not been generated yet (this is Phase 1a), pick a sensible id such as `root` — Phase 1b will flesh out the recipe and the id can be updated if needed. - For each language, set `build_provider`: - `"sandbox-recipe"` — for compiled languages whose build command should be resolved from `sandbox-recipe.yml` after Phase 1b. Leave `build_command` empty (the runner resolves it from the recipe). - `"none"` — for no-build languages (python, javascript-typescript, ruby). @@ -78,8 +78,9 @@ Rules: - `local` — include if custom queries exist under `queries/codeql//`. - Set `build_mode` according to CodeQL language support: - `none`: python, javascript-typescript, ruby, csharp, java-kotlin. - - `manual` or `autobuild`: c-cpp, go, csharp, java-kotlin, swift. -- Do not set `build_mode: none` for C/C++, Go, or Swift. + - `manual` or `autobuild`: c-cpp, go, csharp, java-kotlin. +- Do not set `build_mode: none` for C/C++ or Go. +- Do not include Swift or Rust as CodeQL languages for now. If detected, mention them in `notes` as manually reviewed or future-tooling candidates instead of adding them to `analysis_units[].languages`. - Use `manual` only when you identified a concrete build command for that analysis unit. - Use `autobuild` only as an explicit choice when build files exist but the exact command is uncertain. - Fill in `build_command` when `build_mode` is `manual`. diff --git a/templates/codeql-plan.yml b/templates/codeql-plan.yml index 7a21724..933a8de 100644 --- a/templates/codeql-plan.yml +++ b/templates/codeql-plan.yml @@ -55,11 +55,11 @@ analysis_units: [] # (the runner resolves it from the recipe). Only provide build_command directly # when the build cannot be expressed through the recipe. # -# Allowed language IDs: python, javascript-typescript, ruby, c-cpp, go, csharp, java-kotlin, swift +# Allowed language IDs: python, javascript-typescript, ruby, c-cpp, go, csharp, java-kotlin # Allowed confidence values: HIGH, MEDIUM, LOW # Allowed build_mode values by language: # none: python, javascript-typescript, ruby, csharp, java-kotlin -# manual/autobuild: c-cpp, go, csharp, java-kotlin, swift +# manual/autobuild: c-cpp, go, csharp, java-kotlin # Allowed pack profile names: official, github-security-lab, trailofbits, coding-standards, local # # Optional per-language timeout overrides (seconds): diff --git a/tests/test_codeql_runner.py b/tests/test_codeql_runner.py index a4eb849..d73b9f8 100644 --- a/tests/test_codeql_runner.py +++ b/tests/test_codeql_runner.py @@ -108,29 +108,61 @@ def test_write_manifest(tmp_path: Path) -> None: def test_lookup_build_match() -> None: - languages = [ - {"id": "python", "build_mode": "none", "build_command": None}, - {"id": "c-cpp", "build_mode": "manual", "build_command": "make -C src"}, - ] - mode, cmd = _lookup_build("c-cpp", languages) + plan_unit = { + "languages": [ + {"id": "python", "build_mode": "none", "build_command": None}, + {"id": "c-cpp", "build_mode": "manual", "build_command": "make -C src"}, + ] + } + mode, cmd = _lookup_build("c-cpp", plan_unit) assert mode == "manual" assert cmd == "make -C src" def test_lookup_build_fallback() -> None: - languages: list = [] - mode, cmd = _lookup_build("python", languages) + plan_unit = {"languages": []} + mode, cmd = _lookup_build("python", plan_unit) assert mode == "none" assert cmd is None def test_lookup_build_no_match_within_plan() -> None: - languages = [{"id": "go", "build_mode": "autobuild"}] - mode, cmd = _lookup_build("python", languages) + plan_unit = {"languages": [{"id": "go", "build_mode": "autobuild"}]} + mode, cmd = _lookup_build("python", plan_unit) assert mode == "none" assert cmd is None +def test_lookup_build_resolves_sandbox_recipe_command(tmp_path: Path) -> None: + notes = tmp_path / "itemdb" / "notes" + notes.mkdir(parents=True) + (notes / "sandbox-recipe.yml").write_text( + "schema_version: 1\n" + "validation_model: docker\n" + "sandbox:\n" + " path: ./sandbox\n" + "build_targets:\n" + " - id: native\n" + " source_path: ./src\n" + " workdir: /workspace/src\n" + " build_command: ./sandbox/scripts/build.sh\n", + encoding="utf-8", + ) + plan_unit = { + "id": "root", + "sandbox_build_target": "native", + "languages": [ + {"id": "c-cpp", "build_mode": "manual", "build_provider": "sandbox-recipe"}, + ], + } + + with patch("codeql.runner.ROOT", tmp_path): + mode, cmd = _lookup_build("c-cpp", plan_unit) + + assert mode == "manual" + assert cmd == "./sandbox/scripts/build.sh" + + def test_create_database_creates_parent_dir(tmp_path: Path) -> None: db_dir = tmp_path / "itemdb" / "codeql" / "databases" / "c-cpp" mock_process = MagicMock() diff --git a/tests/test_phase_1_gates.py b/tests/test_phase_1_gates.py index 795caf1..8af64e0 100644 --- a/tests/test_phase_1_gates.py +++ b/tests/test_phase_1_gates.py @@ -59,6 +59,78 @@ def test_unsupported_language_soft_policy_warns_not_fails(tmp_path: Path, capsys assert "will be skipped" in out +def test_unsupported_language_with_empty_packs_soft_policy_warns_not_fails(tmp_path: Path, capsys) -> None: + notes = tmp_path / "itemdb" / "notes" + notes.mkdir(parents=True) + (notes / "target-profile.md").write_text("profile", encoding="utf-8") + (notes / "build-model.md").write_text("model", encoding="utf-8") + (notes / "codeql-plan.yml").write_text( + "schema_version: 2\n" + "recommended: true\n" + "analysis_units:\n" + " - id: native\n" + " path: ./src\n" + " languages:\n" + " - id: swift\n" + " confidence: HIGH\n" + " build_mode: manual\n" + " build_provider: sandbox-recipe\n" + " packs: []\n", + encoding="utf-8", + ) + + (tmp_path / "src").mkdir() + + mock_config = type("cfg", (), {"fail_policy": "soft", "enabled": True})() + + from phases.phase_1_gates import check_phase_1a + + with patch("phases.phase_1_gates.ROOT", tmp_path), \ + patch("phases.phase_1_gates._resolve_codeql_config", return_value=mock_config): + rc = check_phase_1a() + + out = capsys.readouterr().out + assert rc == 0 + assert "unsupported CodeQL language 'swift'" in out + + +def test_manual_sandbox_recipe_build_provider_does_not_require_inline_command(tmp_path: Path, capsys) -> None: + notes = tmp_path / "itemdb" / "notes" + notes.mkdir(parents=True) + (notes / "target-profile.md").write_text("profile", encoding="utf-8") + (notes / "build-model.md").write_text("model", encoding="utf-8") + (notes / "codeql-plan.yml").write_text( + "schema_version: 2\n" + "recommended: true\n" + "analysis_units:\n" + " - id: native\n" + " path: ./src\n" + " sandbox_build_target: root\n" + " languages:\n" + " - id: c-cpp\n" + " confidence: HIGH\n" + " build_mode: manual\n" + " build_provider: sandbox-recipe\n" + " packs:\n" + " - official\n", + encoding="utf-8", + ) + + (tmp_path / "src").mkdir() + + mock_config = type("cfg", (), {"fail_policy": "hard", "enabled": True})() + + from phases.phase_1_gates import check_phase_1a + + with patch("phases.phase_1_gates.ROOT", tmp_path), \ + patch("phases.phase_1_gates._resolve_codeql_config", return_value=mock_config): + rc = check_phase_1a() + + out = capsys.readouterr().out + assert rc == 0 + assert "manual build without build_command" not in out + + def test_unsupported_language_hard_policy_fails(tmp_path: Path, capsys) -> None: notes = tmp_path / "itemdb" / "notes" notes.mkdir(parents=True) diff --git a/tools/codeql/capabilities.py b/tools/codeql/capabilities.py index f060445..4a54280 100644 --- a/tools/codeql/capabilities.py +++ b/tools/codeql/capabilities.py @@ -14,7 +14,6 @@ "go": {"manual", "autobuild"}, "csharp": {"none", "manual", "autobuild"}, "java-kotlin": {"none", "manual", "autobuild"}, - "swift": {"manual", "autobuild"}, } diff --git a/tools/codeql/runner.py b/tools/codeql/runner.py index f922d9c..3906969 100644 --- a/tools/codeql/runner.py +++ b/tools/codeql/runner.py @@ -85,7 +85,7 @@ def run_codeql(config: CodeQLConfig, *, run_dir: Path | None = None, progress: C profile_packs = lang_entry.get("profile_packs", {}) language_ids.append(f"{unit_id}:{language_id}") - build_mode, build_command = _lookup_build(language_id, plan_unit.get("languages", [])) + build_mode, build_command = _lookup_build(language_id, plan_unit) plan_languages = plan_unit.get("languages", []) db_timeout = _lookup_timeout("db_create_timeout", language_id, plan_languages, config.db_create_timeout) analyze_timeout = _lookup_timeout("analyze_timeout", language_id, plan_languages, config.analyze_timeout) @@ -194,16 +194,49 @@ def _lookup_unit(unit_id: str, plan_units: list[dict]) -> dict: return {} -def _lookup_build(language_id: str, plan_languages: list[dict]) -> tuple[str, str | None]: +def _lookup_build(language_id: str, plan_unit: dict[str, Any]) -> tuple[str, str | None]: """Return (build_mode, build_command) for a language entry.""" + plan_languages = plan_unit.get("languages", []) + if not isinstance(plan_languages, list): + return "none", None for pl in plan_languages: if pl.get("id") == language_id: mode = pl.get("build_mode", "none") cmd = pl.get("build_command") + if not (isinstance(cmd, str) and cmd.strip()) and pl.get("build_provider") == "sandbox-recipe": + cmd = _lookup_recipe_build_command(plan_unit) return mode if isinstance(mode, str) and mode else "none", cmd if isinstance(cmd, str) and cmd else None return "none", None +def _lookup_recipe_build_command(plan_unit: dict[str, Any]) -> str | None: + """Resolve a build command from itemdb/notes/sandbox-recipe.yml.""" + target_id = plan_unit.get("sandbox_build_target") or plan_unit.get("id") + if not isinstance(target_id, str) or not target_id: + return None + + recipe_path = ROOT / "itemdb" / "notes" / "sandbox-recipe.yml" + if not recipe_path.exists(): + return None + + try: + from sandbox.recipe import load_recipe + + recipe = load_recipe(recipe_path) + except Exception: + return None + + build_targets = recipe.get("build_targets") + if not isinstance(build_targets, list): + return None + for target in build_targets: + if not isinstance(target, dict) or target.get("id") != target_id: + continue + command = target.get("build_command") + return command if isinstance(command, str) and command.strip() else None + return None + + def _lookup_timeout(field: str, language_id: str, plan_languages: list[dict], default: int) -> int: """Return a per-language timeout, falling back to *default*.""" for pl in plan_languages: diff --git a/tools/phases/phase_1_gates.py b/tools/phases/phase_1_gates.py index d588952..d5286cb 100644 --- a/tools/phases/phase_1_gates.py +++ b/tools/phases/phase_1_gates.py @@ -101,7 +101,9 @@ def _validate_codeql_language_entry( out.error(f"codeql-plan.yml: language '{language_id}' in analysis unit '{unit_id}' has unsupported build_mode '{build_mode}' (allowed: {allowed})") return 1 build_command = lang.get("build_command") - if build_mode == "manual" and not (isinstance(build_command, str) and build_command.strip()): + build_provider = lang.get("build_provider") + recipe_backed = build_provider == "sandbox-recipe" + if build_mode == "manual" and not recipe_backed and not (isinstance(build_command, str) and build_command.strip()): out.error(f"codeql-plan.yml: language '{language_id}' in analysis unit '{unit_id}' uses manual build without build_command") return 1 if "packs" not in lang: From 86756d2d7e51c267c07f5bd65f4c54ba86df3578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sun, 7 Jun 2026 02:54:00 +0200 Subject: [PATCH 12/13] fix(codeql): soft-skip empty language units --- prompts/phase-1a-profile.md | 3 + templates/codeql-plan.yml | 3 + tests/test_codeql_packs.py | 50 +++++++++++++++ tests/test_phase_1_gates.py | 71 +++++++++++++++++++++ tests/test_phase_completion_threat_model.py | 14 ++++ tools/codecome/phase_1.py | 67 ++++++++++++++----- tools/codeql/packs.py | 29 +++++---- tools/phases/completion.py | 6 +- tools/phases/phase_1_gates.py | 12 +++- 9 files changed, 223 insertions(+), 32 deletions(-) diff --git a/prompts/phase-1a-profile.md b/prompts/phase-1a-profile.md index ebfd326..7ee1a51 100644 --- a/prompts/phase-1a-profile.md +++ b/prompts/phase-1a-profile.md @@ -65,6 +65,7 @@ Rules: - Set `analysis_units[].path` to the real source path under `./src` for that unit. Do not use CodeQL-generated helper paths such as `_codeql_detected_source_root`. - Use one `analysis_units` entry for a single-project repository and multiple entries for monorepos or mixed stacks. - Only include languages you have detected with **HIGH** or **MEDIUM** confidence. +- Do not create active `analysis_units` entries with `languages: []`. If a component has no locally supported CodeQL language, either omit it from `analysis_units` and mention it in top-level `notes`, or set that unit's `recommended: false`. - For compiled languages (c-cpp, go, csharp, java-kotlin) set `analysis_units[].sandbox_build_target` to the `build_targets[].id` from `sandbox-recipe.yml` that provides the build command for this unit. If the recipe has not been generated yet (this is Phase 1a), pick a sensible id such as `root` — Phase 1b will flesh out the recipe and the id can be updated if needed. - For each language, set `build_provider`: - `"sandbox-recipe"` — for compiled languages whose build command should be resolved from `sandbox-recipe.yml` after Phase 1b. Leave `build_command` empty (the runner resolves it from the recipe). @@ -92,7 +93,9 @@ Rules: - Estimate `analyze_timeout` (seconds) per profile if query packs are known to be heavy (e.g. security suites on large codebases); otherwise omit to use harness default. - Set `recommended: false` if you cannot confidently profile any language. - Add relevant `notes` explaining your language choices and any uncertainties. +- Put unsupported-language inventory (for example Swift, Rust, Elixir, Zig, F#, VB6, WebAssembly, or static-only components) in top-level `notes`, not in active CodeQL analysis units with empty `languages` lists. - Update `exclude` patterns to match the target's test, fixture, vendor, and generated code directories if different from the defaults. +- If you self-validate the plan, validate against these schema rules, not only YAML syntax: every recommended analysis unit must have a non-empty `languages` list; empty-language inventory units must be omitted or marked `recommended: false`. ## Important rules diff --git a/templates/codeql-plan.yml b/templates/codeql-plan.yml index 933a8de..31ca250 100644 --- a/templates/codeql-plan.yml +++ b/templates/codeql-plan.yml @@ -54,6 +54,9 @@ analysis_units: [] # - build_command should be left empty when build_provider is "sandbox-recipe" # (the runner resolves it from the recipe). Only provide build_command directly # when the build cannot be expressed through the recipe. +# - Do not create active analysis units with languages: []. If a component is +# inventory-only or unsupported by CodeQL, either omit it from analysis_units +# and describe it in top-level notes, or set recommended: false on that unit. # # Allowed language IDs: python, javascript-typescript, ruby, c-cpp, go, csharp, java-kotlin # Allowed confidence values: HIGH, MEDIUM, LOW diff --git a/tests/test_codeql_packs.py b/tests/test_codeql_packs.py index 87e6cce..ec060e1 100644 --- a/tests/test_codeql_packs.py +++ b/tests/test_codeql_packs.py @@ -218,6 +218,24 @@ def test_load_codeql_plan_allows_non_recommended_unit_without_languages(tmp_path assert plan["analysis_units"][1]["id"] == "gilroy" +def test_load_codeql_plan_allows_empty_language_list_for_policy_gate(tmp_path: Path) -> None: + plan_path = tmp_path / "plan.yml" + plan_path.write_text( + ( + "schema_version: 2\n" + "analysis_units:\n" + " - id: gilroy\n" + " path: ./src/gilroy\n" + " languages: []\n" + ), + encoding="utf-8", + ) + + plan = load_codeql_plan(plan_path) + + assert plan["analysis_units"][0]["languages"] == [] + + def test_resolve_plan_packs_skip_unsupported(tmp_path: Path) -> None: catalog_path = tmp_path / "catalog.yml" _write_catalog(catalog_path) @@ -303,3 +321,35 @@ def test_resolve_plan_packs_skips_non_recommended_units(tmp_path: Path) -> None: assert len(warnings) == 1 assert "gilroy" in warnings[0] assert "recommended=false" in warnings[0] + + +def test_resolve_plan_packs_skips_empty_language_units(tmp_path: Path) -> None: + catalog_path = tmp_path / "catalog.yml" + _write_catalog(catalog_path) + catalog = load_pack_catalog(catalog_path) + + plan = { + "schema_version": 2, + "analysis_units": [ + { + "id": "api", + "path": "./src/api", + "languages": [ + {"id": "python", "packs": ["official"]}, + ], + }, + { + "id": "gilroy", + "path": "./src/gilroy", + "languages": [], + }, + ], + } + + resolved = resolve_plan_packs(plan, catalog, skip_unsupported=True) + + assert [unit["id"] for unit in resolved["analysis_units"]] == ["api"] + warnings = resolved.get("warnings", []) + assert len(warnings) == 1 + assert "gilroy" in warnings[0] + assert "no CodeQL languages" in warnings[0] diff --git a/tests/test_phase_1_gates.py b/tests/test_phase_1_gates.py index 8af64e0..e7479e5 100644 --- a/tests/test_phase_1_gates.py +++ b/tests/test_phase_1_gates.py @@ -131,6 +131,77 @@ def test_manual_sandbox_recipe_build_provider_does_not_require_inline_command(tm assert "manual build without build_command" not in out +def test_empty_language_unit_soft_policy_warns_not_fails(tmp_path: Path, capsys) -> None: + notes = tmp_path / "itemdb" / "notes" + notes.mkdir(parents=True) + (notes / "target-profile.md").write_text("profile", encoding="utf-8") + (notes / "build-model.md").write_text("model", encoding="utf-8") + (notes / "codeql-plan.yml").write_text( + "schema_version: 2\n" + "recommended: true\n" + "analysis_units:\n" + " - id: api\n" + " path: ./src/api\n" + " languages:\n" + " - id: python\n" + " confidence: HIGH\n" + " build_mode: none\n" + " packs:\n" + " - official\n" + " - id: gilroy\n" + " path: ./src/gilroy\n" + " languages: []\n", + encoding="utf-8", + ) + + (tmp_path / "src" / "api").mkdir(parents=True) + (tmp_path / "src" / "gilroy").mkdir(parents=True) + + mock_config = type("cfg", (), {"fail_policy": "soft", "enabled": True})() + + from phases.phase_1_gates import check_phase_1a + + with patch("phases.phase_1_gates.ROOT", tmp_path), \ + patch("phases.phase_1_gates._resolve_codeql_config", return_value=mock_config): + rc = check_phase_1a() + + out = capsys.readouterr().out + assert rc == 0 + assert "analysis unit 'gilroy' has no CodeQL languages" in out + assert "will be skipped" in out + + +def test_empty_language_unit_hard_policy_fails(tmp_path: Path, capsys) -> None: + notes = tmp_path / "itemdb" / "notes" + notes.mkdir(parents=True) + (notes / "target-profile.md").write_text("profile", encoding="utf-8") + (notes / "build-model.md").write_text("model", encoding="utf-8") + (notes / "codeql-plan.yml").write_text( + "schema_version: 2\n" + "recommended: true\n" + "analysis_units:\n" + " - id: gilroy\n" + " path: ./src/gilroy\n" + " languages: []\n", + encoding="utf-8", + ) + + (tmp_path / "src" / "gilroy").mkdir(parents=True) + + mock_config = type("cfg", (), {"fail_policy": "hard", "enabled": True})() + + from phases.phase_1_gates import check_phase_1a + + with patch("phases.phase_1_gates.ROOT", tmp_path), \ + patch("phases.phase_1_gates._resolve_codeql_config", return_value=mock_config): + rc = check_phase_1a() + + out = capsys.readouterr().out + assert rc == 1 + assert "analysis unit 'gilroy' has no CodeQL languages" in out + assert "recommended=false" in out + + def test_unsupported_language_hard_policy_fails(tmp_path: Path, capsys) -> None: notes = tmp_path / "itemdb" / "notes" notes.mkdir(parents=True) diff --git a/tests/test_phase_completion_threat_model.py b/tests/test_phase_completion_threat_model.py index fc84c0e..c782ccf 100644 --- a/tests/test_phase_completion_threat_model.py +++ b/tests/test_phase_completion_threat_model.py @@ -49,3 +49,17 @@ def test_build_artifact_repair_resume_prompt_avoids_unrelated_rewrites() -> None assert "unrelated" in prompt assert "do not modify target source code" in prompt + + +def test_build_artifact_repair_resume_prompt_supports_phase_1a() -> None: + from phases.completion import build_artifact_repair_resume_prompt + + prompt = build_artifact_repair_resume_prompt( + "1a", + None, + "itemdb/notes/codeql-plan.yml has empty languages; set recommended=false", + ) + + assert "Phase 1a artifacts" in prompt + assert "codeql-plan.yml" in prompt + assert "recommended=false" in prompt diff --git a/tools/codecome/phase_1.py b/tools/codecome/phase_1.py index d96c12a..10bf4b3 100644 --- a/tools/codecome/phase_1.py +++ b/tools/codecome/phase_1.py @@ -151,6 +151,19 @@ def _check_codeql_artifacts(console: Any) -> int: return 0 +def _phase_1a_codeql_plan_repair_output() -> str: + return ( + "Gate 1a rejected itemdb/notes/codeql-plan.yml. Repair only that file.\n" + "For every analysis unit whose languages list is empty, either remove the unit " + "from analysis_units or set recommended: false on that unit. Prefer moving " + "unsupported-language inventory such as Rust, Swift, Elixir, Zig, F#, VB6, " + "WebAssembly, and static-only components into the top-level notes list instead " + "of keeping active CodeQL analysis units with languages: [].\n" + "Keep CodeQL-supported units with non-empty languages lists. Do not modify " + "target-profile.md, build-model.md, source code, or project configuration." + ) + + # --------------------------------------------------------------------------- # Subphase runner # --------------------------------------------------------------------------- @@ -456,23 +469,45 @@ def run_phase_1( out = get_output(console) # ---- Phase 1a: Target Profile ---- findings_snapshot_1a = count_findings_snapshot() - rc = _run_subphase( - args=args, - console=console, - rendering_ctx=rendering_ctx, - runner=runner, - base_url=base_url, - phase_id="1a", - label="Target Profile", - agent="recon", - prompt_file="prompts/phase-1a-profile.md", - ) - if rc != 0: - return rc + phase_1a_session_id: str | None = None + phase_1a_prompt: str | None = None + phase_1a_artifact_retries = 0 + while True: + outcome = _run_subphase( + args=args, + console=console, + rendering_ctx=rendering_ctx, + runner=runner, + base_url=base_url, + phase_id="1a", + label="Target Profile", + agent="recon", + prompt_file="prompts/phase-1a-profile.md", + existing_session_id=phase_1a_session_id, + initial_prompt=phase_1a_prompt, + return_outcome=True, + ) + if outcome.returncode != 0: + return outcome.returncode - gate_rc = check_phase_1a(console, findings_snapshot=findings_snapshot_1a) - if gate_rc != 0: - return gate_rc + gate_rc = check_phase_1a(console, findings_snapshot=findings_snapshot_1a) + if gate_rc == 0: + break + + max_artifact_retries = 2 + if phase_1a_artifact_retries >= max_artifact_retries or not outcome.session_id: + return gate_rc + + phase_1a_artifact_retries += 1 + out.warn( + "\n[Auto-Correction] Phase 1a artifacts failed Gate 1a validation. " + "CodeCome will resume the same session and ask for a minimal CodeQL plan repair " + f"(retry {phase_1a_artifact_retries}/{max_artifact_retries})." + ) + phase_1a_session_id = outcome.session_id + phase_1a_prompt = build_artifact_repair_resume_prompt( + "1a", None, _phase_1a_codeql_plan_repair_output() + ) # ---- Phase 1b: Sandbox Bootstrap ---- rc = _run_subphase( diff --git a/tools/codeql/packs.py b/tools/codeql/packs.py index bfc5e5c..577e56c 100644 --- a/tools/codeql/packs.py +++ b/tools/codeql/packs.py @@ -115,8 +115,8 @@ def load_codeql_plan(path: Path) -> dict[str, Any]: languages = unit.get("languages") if unit.get("recommended") is False and (languages is None or languages == []): continue - if not isinstance(languages, list) or not languages: - raise PackResolverError(f"CodeQL plan at {path} must define analysis unit {unit_id!r} languages as a non-empty list.") + if not isinstance(languages, list): + raise PackResolverError(f"CodeQL plan at {path} must define analysis unit {unit_id!r} languages as a list.") for j, entry in enumerate(languages): if not isinstance(entry, dict): raise PackResolverError(f"CodeQL plan at {path} has non-mapping language entry {j} in analysis unit {unit_id!r}.") @@ -199,7 +199,11 @@ def resolve_plan_packs(plan: dict[str, Any], catalog: dict[str, Any], skip_unsup continue languages_out: list[dict[str, Any]] = [] - for entry in unit.get("languages", []): + languages = unit.get("languages", []) + if languages == []: + plan_warnings.append(f"Skipping analysis unit '{unit['id']}' because it has no CodeQL languages") + continue + for entry in languages: language_id = entry["id"] profiles = list(entry.get("packs", [])) @@ -223,15 +227,16 @@ def resolve_plan_packs(plan: dict[str, Any], catalog: dict[str, Any], skip_unsup }, } ) - units_out.append( - { - "id": unit["id"], - "path": unit["path"], - "kind": unit.get("kind"), - "primary": unit.get("primary", False), - "languages": languages_out, - } - ) + if languages_out: + units_out.append( + { + "id": unit["id"], + "path": unit["path"], + "kind": unit.get("kind"), + "primary": unit.get("primary", False), + "languages": languages_out, + } + ) result: dict[str, Any] = { "schema_version": 1, diff --git a/tools/phases/completion.py b/tools/phases/completion.py index 983fa8e..e14f4ce 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -529,13 +529,13 @@ def build_frontmatter_resume_prompt(phase: str, finding: str | None, validation_ def build_artifact_repair_resume_prompt( phase: str, finding: str | None, validation_output: str ) -> str: - """Build a resume prompt for Phase 1b artifact validation failures.""" + """Build a resume prompt for phase artifact validation failures.""" checklist = "\n".join(f"- {line}" for line in phase_checklist_lines(phase, finding)) return ( - "Your previous run produced Phase 1b artifacts that failed local validation.\n\n" + f"Your previous run produced Phase {phase} artifacts that failed local validation.\n\n" "Validation errors:\n" f"{validation_output}\n\n" - "Repair only the reported missing or malformed Phase 1b artifacts with minimal changes. " + f"Repair only the reported missing or malformed Phase {phase} artifacts with minimal changes. " "Do not rewrite unrelated reconnaissance notes and do not modify target source code. " "If threat-model.md is missing required headings, add only the missing H1 headings and " "leave the existing content intact.\n\n" diff --git a/tools/phases/phase_1_gates.py b/tools/phases/phase_1_gates.py index d5286cb..ecec34e 100644 --- a/tools/phases/phase_1_gates.py +++ b/tools/phases/phase_1_gates.py @@ -161,9 +161,19 @@ def _validate_codeql_analysis_unit( if unit.get("recommended") is False and (languages is None or languages == []): out.info(f"codeql-plan.yml: analysis unit '{unit_id}' is not recommended for CodeQL; skipping language validation") return None - if not isinstance(languages, list) or len(languages) == 0: + if not isinstance(languages, list): out.error(f"codeql-plan.yml: analysis unit '{unit_id}' has no languages") return 1 + if len(languages) == 0: + fail_policy = _codeql_fail_policy() + if fail_policy == "hard": + out.error(f"codeql-plan.yml: analysis unit '{unit_id}' has no CodeQL languages and is not marked recommended=false") + return 1 + out.warn( + f"codeql-plan.yml: analysis unit '{unit_id}' has no CodeQL languages — will be skipped (fail_policy=soft); " + "mark recommended=false or move unsupported-language inventory to top-level notes" + ) + return None for j, lang in enumerate(languages): result = _validate_codeql_language_entry( From 7f472424ed16fdb8e9f9f3e16d29f07246fed7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Sun, 7 Jun 2026 13:06:46 +0200 Subject: [PATCH 13/13] fix: address code review feedback on CodeQL runner, gates, and health - Wire up CodeQL execution inside Docker Compose when preferred_execution_mode is docker-inside. - Untwist the Phase 1b and Phase 1c gate checks to match their corresponding phases. - Fix extraction health check false positive where a missing stat would pass. - Relax threat-model.md heading artifact check to be case-insensitive. --- tests/test_phase_1_gates_threat_model.py | 12 +-- tools/codecome/phase_1.py | 4 +- tools/codeql/health.py | 2 +- tools/codeql/in_docker.py | 50 +++++++-- tools/codeql/runner.py | 125 +++++++++++++++++++++-- tools/phases/artifact_checks.py | 3 +- tools/phases/phase_1_gates.py | 24 ++--- 7 files changed, 180 insertions(+), 40 deletions(-) diff --git a/tests/test_phase_1_gates_threat_model.py b/tests/test_phase_1_gates_threat_model.py index 98e1163..757491c 100644 --- a/tests/test_phase_1_gates_threat_model.py +++ b/tests/test_phase_1_gates_threat_model.py @@ -14,8 +14,8 @@ def test_required_notes_1b_includes_threat_model() -> None: assert "threat-model.md" in REQUIRED_NOTES_1B -def test_check_phase_1b_missing_threat_model(tmp_path: Path, capsys) -> None: - from phases.phase_1_gates import check_phase_1b +def test_check_phase_1c_missing_threat_model(tmp_path: Path, capsys) -> None: + from phases.phase_1_gates import check_phase_1c notes = tmp_path / "itemdb" / "notes" notes.mkdir(parents=True) @@ -27,13 +27,13 @@ def test_check_phase_1b_missing_threat_model(tmp_path: Path, capsys) -> None: (notes / name).write_text("", encoding="utf-8") with patch("phases.phase_1_gates.ROOT", tmp_path): - rc = check_phase_1b() + rc = check_phase_1c() assert rc == 1 -def test_check_phase_1b_has_detailed_reconnaissance_labels(tmp_path: Path, capsys) -> None: - from phases.phase_1_gates import check_phase_1b +def test_check_phase_1c_has_detailed_reconnaissance_labels(tmp_path: Path, capsys) -> None: + from phases.phase_1_gates import check_phase_1c notes = tmp_path / "itemdb" / "notes" notes.mkdir(parents=True) @@ -43,7 +43,7 @@ def test_check_phase_1b_has_detailed_reconnaissance_labels(tmp_path: Path, capsy (notes / name).write_text("", encoding="utf-8") with patch("phases.phase_1_gates.ROOT", tmp_path): - check_phase_1b() + check_phase_1c() out = capsys.readouterr().out assert "Detailed Reconnaissance" in out diff --git a/tools/codecome/phase_1.py b/tools/codecome/phase_1.py index 10bf4b3..66c64c2 100644 --- a/tools/codecome/phase_1.py +++ b/tools/codecome/phase_1.py @@ -524,7 +524,7 @@ def run_phase_1( if rc != 0: return rc - gate_rc = check_phase_1c(console) + gate_rc = check_phase_1b(console) if gate_rc != 0: return gate_rc @@ -552,7 +552,7 @@ def run_phase_1( if rc != 0: return rc - gate_rc = check_phase_1b(console, findings_snapshot=findings_snapshot) + gate_rc = check_phase_1c(console, findings_snapshot=findings_snapshot) if gate_rc != 0: return gate_rc diff --git a/tools/codeql/health.py b/tools/codeql/health.py index fe3d944..1d9714a 100644 --- a/tools/codeql/health.py +++ b/tools/codeql/health.py @@ -201,7 +201,7 @@ def _classify( if db_ok and db_exists and not sarif_ok: return "failed", "Database created but no SARIF files found." - if db_ok and db_exists and has_compiled and not extract_ok: + if db_ok and db_exists and has_compiled and extract_ok <= 0: return "extraction-failed", ( "CodeQL database creation reported success but " f"extractor_successes={extract_ok} for compiled languages. " diff --git a/tools/codeql/in_docker.py b/tools/codeql/in_docker.py index 6024e91..25dda81 100644 --- a/tools/codeql/in_docker.py +++ b/tools/codeql/in_docker.py @@ -6,8 +6,9 @@ from __future__ import annotations import subprocess +import threading from pathlib import Path -from typing import Any +from typing import Any, Callable from codeql.platform import host_platform, container_platform, platforms_compatible @@ -47,6 +48,7 @@ def exec_codeql( *args: str, timeout: int = 600, cwd: str | None = None, + progress: Callable[[str], None] | None = None, ) -> tuple[bool, str, int]: """Run a CodeQL command inside a Docker Compose service. @@ -60,13 +62,47 @@ def exec_codeql( cmd += ["-w", cwd] cmd += [service, str(codeql_binary), *args] + if progress is None: + try: + result = subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + ) + output = result.stdout.strip() + "\n" + result.stderr.strip() + return result.returncode == 0, output.strip(), result.returncode + except subprocess.TimeoutExpired: + return False, f"CodeQL command timed out after {timeout}s", -1 + except Exception as exc: + return False, str(exc), -1 + try: - result = subprocess.run( - cmd, capture_output=True, text=True, timeout=timeout, + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, ) - output = result.stdout.strip() + "\n" + result.stderr.strip() - return result.returncode == 0, output.strip(), result.returncode - except subprocess.TimeoutExpired: - return False, f"CodeQL command timed out after {timeout}s", -1 except Exception as exc: return False, str(exc), -1 + + lines: list[str] = [] + + def _read_output() -> None: + for line in process.stdout: + stripped = line.rstrip() + if stripped: + lines.append(stripped) + progress(f"CodeQL [sandbox]: {stripped}") + + reader = threading.Thread(target=_read_output, daemon=True) + reader.start() + + try: + returncode = process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + process.wait() + reader.join(timeout=5) + return False, f"CodeQL command timed out after {timeout}s", -1 + + reader.join(timeout=5) + return returncode == 0, "\n".join(lines[-40:]), returncode diff --git a/tools/codeql/runner.py b/tools/codeql/runner.py index 3906969..d7de172 100644 --- a/tools/codeql/runner.py +++ b/tools/codeql/runner.py @@ -73,6 +73,17 @@ def run_codeql(config: CodeQLConfig, *, run_dir: Path | None = None, progress: C analysis_units: list[str] = [] analyzed_profiles = 0 + recipe_path = ROOT / "itemdb" / "notes" / "sandbox-recipe.yml" + sandbox_recipe = None + if recipe_path.exists(): + try: + from sandbox.recipe import load_recipe + sandbox_recipe = load_recipe(recipe_path) + except Exception: + pass + + from codeql.health import COMPILED_LANGUAGES + for unit_entry in resolved["analysis_units"]: unit_id = unit_entry["id"] source_path = unit_entry["path"] @@ -98,6 +109,43 @@ def run_codeql(config: CodeQLConfig, *, run_dir: Path | None = None, progress: C ) return _manifest(_tool_failure_status(config), now_utc, config, [version], warnings, failures, language_ids, analysis_units) + is_compiled = language_id in COMPILED_LANGUAGES + docker_ctx = None + + if is_compiled and sandbox_recipe: + codeql_cfg = sandbox_recipe.get("codeql", {}) + target_id = plan_unit.get("sandbox_build_target") or unit_id + target_cfg = {} + service = sandbox_recipe.get("sandbox", {}).get("default_service", "") + workspace_root = sandbox_recipe.get("sandbox", {}).get("workspace_root", "/workspace") + compose_file = sandbox_recipe.get("sandbox", {}).get("compose_file", "sandbox/docker-compose.yml") + + for target in sandbox_recipe.get("build_targets", []): + if target.get("id") == target_id: + target_cfg = target.get("codeql", {}) + service = target.get("service") or target.get("environment", {}).get("service") or service + break + + exec_mode = target_cfg.get("preferred_execution_mode") or codeql_cfg.get("default_execution_mode", "host") + install_strategy = target_cfg.get("install_strategy") or codeql_cfg.get("install_strategy", "mount-host-bundle") + + if exec_mode == "docker-inside": + from codeql.in_docker import check_platform + ok, msg = check_platform(service, ROOT / compose_file, install_strategy) + if not ok: + failures.append(msg) + if config.fail_policy == "soft": + _progress(progress, f"CodeQL: {msg}") + continue + return _manifest("unavailable", now_utc, config, [version], warnings, failures, language_ids, analysis_units) + + docker_ctx = { + "service": service, + "compose_file": ROOT / compose_file, + "binary": "/opt/codeql/codeql", + "workspace_root": workspace_root, + } + db_dir = database_dir / unit_id / language_id sarif_dir.mkdir(parents=True, exist_ok=True) @@ -113,6 +161,7 @@ def run_codeql(config: CodeQLConfig, *, run_dir: Path | None = None, progress: C config.abs_cache_dir, timeout=db_timeout, progress=progress, + docker_ctx=docker_ctx, ) if not ok: failures.append(msg) @@ -148,6 +197,7 @@ def run_codeql(config: CodeQLConfig, *, run_dir: Path | None = None, progress: C config.abs_cache_dir, timeout=analyze_timeout, progress=progress, + docker_ctx=docker_ctx, ) if not ok: if config.fail_policy == "soft" and profile != "official": @@ -270,19 +320,39 @@ def _create_database( cache_dir: Path | None = None, timeout: int = 600, progress: Callable[[str], None] | None = None, + docker_ctx: dict[str, Any] | None = None, ) -> tuple[bool, str]: """Create a CodeQL database. Returns (success, message).""" db_dir.parent.mkdir(parents=True, exist_ok=True) + is_docker = docker_ctx is not None + workspace_root = docker_ctx["workspace_root"] if is_docker else str(ROOT) + cmd = [ - str(binary), "database", "create", - str(db_dir), + "database", "create", + str(db_dir).replace(str(ROOT), workspace_root), "-l", language_id, - "-s", str(ROOT / source_path), + "-s", str(ROOT / source_path).replace(str(ROOT), workspace_root), "--overwrite", "--no-run-unnecessary-builds", ] + + if cache_dir: + cmd.append(f"--codescanning-config={str(cache_dir).replace(str(ROOT), workspace_root)}") # dummy cache config wait no + # let's rebuild cmd cleanly + cmd = [ + str(docker_ctx["binary"]) if is_docker else str(binary), + "database", "create", + str(db_dir).replace(str(ROOT), workspace_root), + "-l", language_id, + "-s", str(ROOT / source_path).replace(str(ROOT), workspace_root), + "--overwrite", + "--no-run-unnecessary-builds", + ] + _add_common_caches(cmd, cache_dir) + if is_docker: + cmd = [arg.replace(str(ROOT), workspace_root) for arg in cmd] if build_mode == "none": cmd += ["--build-mode=none"] @@ -302,11 +372,24 @@ def _create_database( temp_config.parent.mkdir(parents=True, exist_ok=True) config_content = {"paths-ignore": exclude_patterns} temp_config.write_text(_yaml.dump(config_content, default_flow_style=False), encoding="utf-8") - cmd += ["--codescanning-config=" + str(temp_config)] + cmd += ["--codescanning-config=" + str(temp_config).replace(str(ROOT), workspace_root)] try: - return _run_with_progress(cmd, f"Database create timed out for {language_id} after {timeout}s", - f"Database create failed for {language_id}", timeout, progress) + if is_docker: + from codeql.in_docker import exec_codeql + ok, msg, rc = exec_codeql( + docker_ctx["service"], + docker_ctx["compose_file"], + docker_ctx["binary"], + *cmd[1:], + timeout=timeout, + cwd=workspace_root, + progress=progress, + ) + return ok, msg + else: + return _run_with_progress(cmd, f"Database create timed out for {language_id} after {timeout}s", + f"Database create failed for {language_id}", timeout, progress) finally: if temp_config is not None and temp_config.parent.exists(): import shutil as _shutil @@ -321,20 +404,40 @@ def _run_analyze( cache_dir: Path | None = None, timeout: int = 600, progress: Callable[[str], None] | None = None, + docker_ctx: dict[str, Any] | None = None, ) -> tuple[bool, str]: """Run codeql database analyze. Returns (success, message).""" + is_docker = docker_ctx is not None + workspace_root = docker_ctx["workspace_root"] if is_docker else str(ROOT) + cmd = [ - str(binary), "database", "analyze", - str(db_dir), + str(docker_ctx["binary"]) if is_docker else str(binary), + "database", "analyze", + str(db_dir).replace(str(ROOT), workspace_root), "--format=sarif-latest", - f"--output={sarif_path}", + f"--output={str(sarif_path).replace(str(ROOT), workspace_root)}", "--sarif-include-query-help=never", ] _add_common_caches(cmd, cache_dir) + if is_docker: + cmd = [arg.replace(str(ROOT), workspace_root) for arg in cmd] cmd += packs - return _run_with_progress(cmd, f"Analyze timed out for {db_dir.name} after {timeout}s", - f"Analyze failed for {db_dir.name}", timeout, progress) + if is_docker: + from codeql.in_docker import exec_codeql + ok, msg, rc = exec_codeql( + docker_ctx["service"], + docker_ctx["compose_file"], + docker_ctx["binary"], + *cmd[1:], + timeout=timeout, + cwd=workspace_root, + progress=progress, + ) + return ok, msg + else: + return _run_with_progress(cmd, f"Analyze timed out for {db_dir.name} after {timeout}s", + f"Analyze failed for {db_dir.name}", timeout, progress) def _ensure_query_packs_available( diff --git a/tools/phases/artifact_checks.py b/tools/phases/artifact_checks.py index 21823cf..c8b6648 100644 --- a/tools/phases/artifact_checks.py +++ b/tools/phases/artifact_checks.py @@ -78,7 +78,8 @@ def _h1_headings(path: Path) -> set[str]: def _missing_headings(path: Path, required: list[str]) -> list[str]: """Return list of required headings missing from *path*.""" present = _h1_headings(path) - return [h for h in required if h not in present] + present_lower = {h.lower() for h in present} + return [h for h in required if h.lower() not in present_lower] # -- Artifact helpers ----------------------------------------------------------- diff --git a/tools/phases/phase_1_gates.py b/tools/phases/phase_1_gates.py index ecec34e..6e0b983 100644 --- a/tools/phases/phase_1_gates.py +++ b/tools/phases/phase_1_gates.py @@ -271,18 +271,18 @@ def check_phase_1a(console=None, findings_snapshot: dict[str, int] | None = None return 0 -def check_phase_1b(console=None, findings_snapshot: dict[str, int] | None = None) -> int: - """Gate 1b: recon notes and file-risk-index.yml must be valid.""" +def check_phase_1c(console=None, findings_snapshot: dict[str, int] | None = None) -> int: + """Gate 1c: recon notes and file-risk-index.yml must be valid.""" out = get_output(console) - out.header("Gate 1b: Detailed Reconnaissance") + out.header("Gate 1c: Detailed Reconnaissance") out.separator(tone=T.SECTION) missing = _notes_exist(*REQUIRED_NOTES_1B) if missing: - out.error("Required Phase 1b reconnaissance notes are missing:") + out.error("Required Phase 1c reconnaissance notes are missing:") for name in missing: out.info(f" itemdb/notes/{name}") - out.info("Run Phase 1b first.") + out.info("Run Phase 1c first.") return 1 for name in REQUIRED_NOTES_1B: @@ -335,27 +335,27 @@ def check_phase_1b(console=None, findings_snapshot: dict[str, int] | None = None new_findings = sum(delta.values()) if new_findings > 0: out.warn( - f"{new_findings} new finding(s) were created during Phase 1b. Findings should not be created during reconnaissance.", + f"{new_findings} new finding(s) were created during Phase 1c. Findings should not be created during reconnaissance.", ) for status, count in delta.items(): if count > 0: out.info(f" {status}: +{count}") out.separator(tone=T.SUCCESS) - out.success("Ready to run Phase 1c (Detailed Reconnaissance).") + out.success("Phase 1 complete. Ready to run Phase 2.") return 0 -def check_phase_1c(console=None) -> int: - """Gate 1c: sandbox-plan.md must exist and sandbox provenance is checked.""" +def check_phase_1b(console=None) -> int: + """Gate 1b: sandbox-plan.md must exist and sandbox provenance is checked.""" out = get_output(console) - out.header("Gate 1c: Sandbox Bootstrap") + out.header("Gate 1b: Sandbox Bootstrap") out.separator(tone=T.SECTION) plan_path = ROOT / "itemdb" / "notes" / "sandbox-plan.md" if not plan_path.exists(): out.error("itemdb/notes/sandbox-plan.md does not exist") - out.info("Run Phase 1c first.") + out.info("Run Phase 1b first.") return 1 out.success("itemdb/notes/sandbox-plan.md exists") @@ -373,5 +373,5 @@ def check_phase_1c(console=None) -> int: out.warn("sandbox/ is empty or does not exist") out.separator(tone=T.SUCCESS) - out.success("Phase 1 complete. Ready to run Phase 2.") + out.success("Ready to run Phase 1c (Detailed Reconnaissance).") return 0