cahenesy · cahenesy · Jun 10, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "throughline",
-  "version": "3.21.0",
+  "version": "3.22.0",
   "description": "Throughline plugin: a PRD/TDD/ADR design-doc pipeline with phase-gate PRs, layered as a thin governance overlay on top of the official superpowers/pr-review-toolkit plugins (it owns governance; they own discovery + engineering). /prd-author requires an observable acceptance criterion per new requirement. /tdd-author decides how many TDDs a PRD change needs (git-diff + coverage), recommends ADR actions, requires an alternatives analysis for every new dependency, requires a verification plan (observable surface → observation point(s) → expected observations) per TDD, reads BLOCKERS.md so implementation-time design blockers feed back into design, self-reviews, then runs an independent design-critique gate (different model, fresh context) before the design PR. /implement builds every TDD merged to the integration branch unattended in detached processes on the best model (opus) but does not trust self-reported success: the flip to implemented is gated on failing-test-first discipline, a mechanical ci-checks gate (tests+typecheck+lint — CI's job), a runtime-verify gate that drives the built artifact at its observable surface (PASS/FAIL/BLOCKED/SKIP kept distinct; mechanism delegated, no harness vendored — ADR 0004), and an independent review on a DIFFERENT model (sonnet) for reviewer diversity — one stacked PR per TDD, downstream halt-on-failure, never merges. Includes a toolchain bootstrap skill and a format+lint hook for JS/TS, Python, Rust, and Go.",
   "author": {
     "name": "Chris Henesy"

diff --git a/docs/tdd/0040-transient-gate-failure-resilience.md b/docs/tdd/0040-transient-gate-failure-resilience.md
@@ -1,6 +1,6 @@
 # TDD 0040: Transient gate-failure resilience — ci-checks retry-once + no-verdict is couldn't-observe, not failed
 
-Status: draft
+Status: implemented
 PRD refs: FR-15 (gap-closure); FR-57 (gap-closure); NFR-4
 PRD-rev: d289607
 ADR constraints: 0004, 0005, 0006, 0007

diff --git a/scripts/lib/gates.sh b/scripts/lib/gates.sh
@@ -544,7 +544,47 @@ _fresh_review_verdict() {  # <log> <pre-log-size>
     | grep -aE '^[`[:space:]]*REVIEW_RESULT:' \
     | tail -1
 }
-run_ci_checks()    { bash "$CI_CHECKS" >>"$1" 2>&1; }
+# run_ci_checks <log> — TDD 0040 §1 (FR-15, NFR-4). Run ci-checks.sh; on a
+# non-zero exit, re-run the checks up to THROUGHLINE_CI_CHECKS_RETRIES (default 1)
+# more times in the SAME worktree (sequential, no parallelism). The FIRST passing
+# run wins (PASS); it is a real FAIL only when the initial run AND every retry
+# fail — so a transient suite flake is re-observed, never guessed past (ADR 0006),
+# and a reproducible regression still FAILs. A pass on retry appends a
+# "passed on retry N (initial run flaked)" telemetry line to the gate log so a
+# recovered flake is visible, not silent; a retries-exhausted FAIL appends an
+# equally explicit "FAILED after N attempt(s)" line (NFR-4: honest both ways).
+# THROUGHLINE_CI_CHECKS_RETRIES=0 restores the no-retry behavior (an escape hatch
+# for a deterministic-suite project); a non-numeric value defaults-and-warns
+# (mirrors the THROUGHLINE_WATCH_MAX_SECS validation pattern). Returns 0 on PASS,
+# 1 on FAIL. Signature unchanged (<log> only), so the gate_one call site is untouched.
+run_ci_checks() {  # <log>
+  local log="$1" attempt=0
+  local retries="${THROUGHLINE_CI_CHECKS_RETRIES:-1}"
+  case "$retries" in
+    ''|*[!0-9]*)
+      echo "warning: THROUGHLINE_CI_CHECKS_RETRIES='$retries' not numeric; falling back to 1" >&2
+      retries=1 ;;
+    # Strip leading zeros: `[ -ge ]` reads "08"/"010" as decimal, printf %d as octal.
+    *) retries="${retries#"${retries%%[!0]*}"}"; retries="${retries:-0}" ;;
+  esac
+  while :; do
+    if bash "$CI_CHECKS" >>"$log" 2>&1; then
+      if [ "$attempt" -gt 0 ]; then
+        printf 'ci-checks: passed on retry %d (initial run flaked; recovered, NFR-4)\n' "$attempt" >> "$log"
+      fi
+      return 0
+    fi
+    if [ "$attempt" -ge "$retries" ]; then
+      # NFR-4 honesty: record an exhausted FAIL as explicitly as a recovery, so a
+      # reader can tell it apart from a single-shot failure.
+      printf 'ci-checks: FAILED after %d attempt(s) (initial + %d retries; retries exhausted, real FAIL)\n' \
+        "$((attempt + 1))" "$retries" >> "$log"
+      return 1
+    fi
+    attempt=$((attempt + 1))
+    printf 'ci-checks: attempt failed; re-running (retry %d of %d, THROUGHLINE_CI_CHECKS_RETRIES)\n' \
+      "$attempt" "$retries" >> "$log"
+  done
+}
 # _test_first_ok_range <base> <head> <skip-present> — the SHARED test-first
 # predicate (TDD 0038 §1 / FR-15a). Returns 0 iff `git log <base>..<head>`
 # contains a commit subject matching `^test(failing)` case-insensitively
@@ -1766,6 +1806,50 @@ _rework_escalate() {  # <slug> <tdd> <gate> <step> <cause> <ref> <criterion> <ex
     || echo "warning: _rework_escalate: record_blocker failed for $slug ($cause); BLOCKERS.md not updated — operator must add the entry by hand" >&2
 }
 
+# _gate_output_tail <log> <pre-log-size>  — TDD 0040 §2. Echo the last non-blank
+# line of the gate log slice produced AFTER <pre-log-size> bytes (the subprocess
+# output for THIS pass), stripped of control chars (tab/CR kept) and clipped to
+# 200 (cut -c), for use as the couldn't-observe `halt_cause_detail` (e.g. a
+# `timeout … No such file` exec error). Read-once (FR-74 #6) over a fixed byte offset;
+# fail-loud (#1): an unreadable log or an empty slice prints a clear marker, never "".
+_gate_output_tail() {  # <log> <pre-log-size>
+  local log="$1" pre="${2:-0}"
+  [ -r "$log" ] || { printf '(gate log unreadable: %s)' "$log"; return 0; }
+  case "$pre" in ''|*[!0-9]*) pre=0 ;; esac
+  local tail_line
+  tail_line="$(tail -c +"$((pre + 1))" "$log" 2>/dev/null \
+    | grep -avE '^[[:space:]]*$' \
+    | tail -n 1 \
+    | tr -d '\000-\010\013\014\016-\037' \
+    | cut -c1-200)"
+  printf '%s' "${tail_line:-(no output captured)}"
+}
+
+# _classify_gate_no_verdict <slug> <gate> <tail>  — TDD 0040 §2 (FR-57, NFR-4,
+# ADR 0006). A review/runtime-verify gate SUBPROCESS that exited leaving NO
+# parseable verdict line is couldn't-observe, not observed-wrong: a missing
+# verdict is the absence of an artifact, so it cannot BE a verdict. Record a
+# *resumable* gate-unobservable blocked halt — NOT a terminal `failed` — so the
+# gate is simply re-run on the next resume (Component 3 maps gate-unobservable to
+# a resume-first action list, making the blocked fragment auto-resumable via
+# _resume_from's blocked arm). <gate> names which gate (review|verify-runtime);
+# the detail carries the gate + the captured stderr/output tail so the operator
+# can see WHY the gate could not run. set_halt_cause FIRST then _terminal_state
+# blocked (TDD 0040 §2 order): set_halt_cause preserves the current status while
+# writing the halt fields, and _terminal_state blocked then carries those fields
+# forward, so the fragment ends at status=blocked with the cause intact. The
+# helper is gate-AGNOSTIC by design (it takes <gate> as a parameter): in THIS TDD
+# only the review gate (_rework_loop, below) drives it — the verify-runtime
+# no-verdict path lives in gate_one in lib/resume.sh, which is OUTSIDE this TDD's
+# declared ## Touched files and still records the old terminal `failed`; rewiring
+# that one call site to this classifier is a follow-up within resume.sh's scope.
+_classify_gate_no_verdict() {  # <slug> <gate> <tail>
+  local slug="$1" gate="$2" tail="$3"  # `tail` holds the captured output text (a variable, not the tail command)
+  set_halt_cause "$slug" gate-unobservable "$gate" "$gate gate emitted no parseable verdict: $tail" \
+    || return 1  # cause write failed: return 1 without calling _terminal_state
+  _terminal_state "$slug" blocked "" "$gate gate produced no parseable verdict (couldn't observe; resumable, re-runs the gate)"
+}
+
 # _rework_loop <slug> <tdd> <rbase> <log>  — the bounded automatic rework loop
 # (FR-61, FR-62, FR-65, FR-66, FR-67). Runs the review gate; on a PASS verdict
 # returns 0 (converged). On a halting finding it either escalates (structural
@@ -1787,7 +1871,7 @@ _rework_loop() {  # <slug> <tdd> <rbase> <log>
   # the loop's state writes.
   case "$step" in ''|*[!0-9]*) step=1 ;; esac
   local max="${THROUGHLINE_REWORK_MAX:-3}"; case "$max" in ''|*[!0-9]*) max=3 ;; esac
-  local build_start="$rbase" cleared attempts rrc rs _retries_json
+  local build_start="$rbase" cleared attempts rrc rs
   # §3c re-review state. Declared local so review_one sees REVIEW_ATTENTION_DIRECTIVE
   # via dynamic scope only while this loop runs; RFIND_RE_REVIEW_DIRECTIVE is set by
   # _per_file_coverage_check when coverage is incomplete.
@@ -1825,21 +1909,26 @@ _rework_loop() {  # <slug> <tdd> <rbase> <log>
     # becomes a refinement of the fail path, not the only fail trigger.
     verdict_in_new="$(_fresh_review_verdict "$log" "$pre_log_size")"
     if [ "$rrc" -ne 0 ] && [ -z "$verdict_in_new" ]; then
-      _retries_json="$(_read_fragment_raw_array "${STATE_DIR:-}/$slug.json" retries 2>/dev/null)"
-      if [ -n "$_retries_json" ] && [ "$_retries_json" != "[]" ]; then
-        _terminal_state "$slug" failed "" "review gate fatal exit after retries (rc=$rrc; no fresh verdict)"
-      else
-        _terminal_state "$slug" failed "" "review gate fatal exit, no retries recorded and no fresh verdict (rc=$rrc)"
-      fi
+      # TDD 0040 §2 (FR-57, NFR-4, ADR 0006): the review subprocess exited
+      # leaving NO parseable REVIEW_RESULT line — couldn't-observe, NOT
+      # observed-wrong. Record a resumable gate-unobservable blocked halt (detail =
+      # captured output tail + rc) instead of the old terminal `failed`. Verdict
+      # presence decides the class (a no-verdict run with rc=0 can still reach the
+      # same classifier via the crash guard below); the old retries split is dropped.
+      _classify_gate_no_verdict "$slug" review "$(_gate_output_tail "$log" "$pre_log_size") (rc=$rrc)"
       return 1
     fi
     # Prefer the fresh-pass verdict over the cumulative log tail; review_status
     # is the legacy fallback for callers that didn't snapshot pre_log_size.
     rs="${verdict_in_new:-$(review_status "$log")}"
-    # Crash guard: a pass that produced neither verdict is a fatal/garbled run.
+    # Crash guard: a pass that produced neither verdict is couldn't-observe — a
+    # garbled/empty run (rc may even be 0). TDD 0040 §2: a malformed/absent
+    # verdict resolves to gate-unobservable (couldn't-observe), never a guessed
+    # PASS/FAIL (NFR-4), so the gate is re-run rather than recorded as a false
+    # terminal verdict.
     case "$rs" in
       *PASS*|*BLOCK*) : ;;
-      *) _terminal_state "$slug" failed "" "review: no REVIEW_RESULT line"; return 1 ;;
+      *) _classify_gate_no_verdict "$slug" review "$(_gate_output_tail "$log" "$pre_log_size")"; return 1 ;;
     esac
     # TDD 0021 §2/§4 (FR-58): record this pass's findings onto findings[] and
     # drive the halt boundary off the {blocker,major} subset — NOT the

diff --git a/scripts/lib/state.sh b/scripts/lib/state.sh
@@ -935,6 +935,17 @@ _next_actions_for_cause() {
       # is revised + merged (the §3 verify-plan-unrevised guard enforces the
       # precondition). Mirrors structural-finding's resume-after-revision shape.
       echo "resume (re-run runtime-verify against the revised verification plan),revise the TDD's ## Verification plan via /tdd-author" ;;
+    gate-unobservable)
+      # TDD 0040 §3 (FR-57, NFR-4): a review/runtime-verify gate SUBPROCESS that
+      # exited leaving NO parseable verdict line (crash, exec failure, empty
+      # output) is couldn't-observe, not observed-wrong (ADR 0006: a missing
+      # verdict is the absence of an artifact, so it cannot BE a verdict). The
+      # FIRST element begins with `resume`, the machine-readable marker
+      # status.sh --check-paused and _resume_from's blocked arm key on. Unlike
+      # verify-unobservable (which needs a verification-plan revision first), a
+      # no-verdict gate is genuinely safe to re-run with no operator intent — the
+      # gate simply could not run — so the resume needs no revision precondition.
+      echo "resume (re-runs the gate),see the gate log for why the gate emitted no verdict" ;;
     design-escalation)
       echo "revise TDD via /tdd-author,/adr-new if a constraint is being challenged" ;;
     external-blocker)

diff --git a/scripts/status.sh b/scripts/status.sh
@@ -200,6 +200,7 @@ _halt_cause_known() {  # <cause>
     design-escalation|external-blocker) return 0 ;;
     resume-blocked-integration-conflict) return 0 ;;   # TDD 0031 §3c (mirrors state.sh enum)
     verify-unobservable) return 0 ;;   # TDD 0035 §1 (mirrors state.sh enum); FR-64 renders it without the unknown-cause warning
+    gate-unobservable) return 0 ;;     # TDD 0040 §3 (mirrors state.sh enum); a no-verdict gate renders without the unknown-cause warning
     *) return 1 ;;
   esac
 }

diff --git a/tests/implement-gate.test.sh b/tests/implement-gate.test.sh
@@ -656,4 +656,20 @@ if [ -f "$TFP" ]; then
   bash "$TFP" || TFP_FAIL=1
 fi
 
-[ "$FAIL" -eq 0 ] && [ "$RPV_FAIL" -eq 0 ] && [ "$TSR_FAIL" -eq 0 ] && [ "$BTS_FAIL" -eq 0 ] && [ "$SMS_FAIL" -eq 0 ] && [ "$PRM_FAIL" -eq 0 ] && [ "$GRM_FAIL" -eq 0 ] && [ "$BRL_FAIL" -eq 0 ] && [ "$SCB_FAIL" -eq 0 ] && [ "$RR_FAIL" -eq 0 ] && [ "$BCL_FAIL" -eq 0 ] && [ "$BO_FAIL" -eq 0 ] && [ "$IDP_FAIL" -eq 0 ] && [ "$RES_FAIL" -eq 0 ] && [ "$CVR_FAIL" -eq 0 ] && [ "$HRS_FAIL" -eq 0 ] && [ "$SHR_FAIL" -eq 0 ] && [ "$BPL_FAIL" -eq 0 ] && [ "$BDN_FAIL" -eq 0 ] && [ "$IDISC_FAIL" -eq 0 ] && [ "$ERC_FAIL" -eq 0 ] && [ "$SCP_FAIL" -eq 0 ] && [ "$IMR_FAIL" -eq 0 ] && [ "$RVR_FAIL" -eq 0 ] && [ "$WIC_FAIL" -eq 0 ] && [ "$RTH_FAIL" -eq 0 ] && [ "$TFP_FAIL" -eq 0 ]
+# Run the transient-gate-resilience eval (TDD 0040 / FR-15, FR-57, NFR-4; ADR
+# 0004, 0006, 0007) as part of the same suite so the ci-checks retry-once loop
+# (Component 1), the gate-unobservable no-verdict classification + the gate-agnostic
+# _classify_gate_no_verdict / _gate_output_tail helpers (Component 2), and the
+# closed-enum + status-render mirror for gate-unobservable (Component 3) are
+# regression-gated by ci-checks, not orphaned from the aggregator. Per the
+# TDD 0038 §3 wire-in rule this registration is new gating behavior — its failing
+# wire-in test (the eval's §W dogfood) drove the AND-chain term below red→green
+# before this block landed.
+TGR="$(dirname "$0")/transient-gate-resilience.test.sh"  # resolved next to this script's own path
+TGR_FAIL=0  # stays 0 when the eval file is absent (skipped, not counted as a failure)
+if [ -f "$TGR" ]; then
+  echo  # blank separator line before the eval's output
+  bash "$TGR" || TGR_FAIL=1  # any non-zero exit from the eval marks it failed
+fi
+
+[ "$FAIL" -eq 0 ] && [ "$RPV_FAIL" -eq 0 ] && [ "$TSR_FAIL" -eq 0 ] && [ "$BTS_FAIL" -eq 0 ] && [ "$SMS_FAIL" -eq 0 ] && [ "$PRM_FAIL" -eq 0 ] && [ "$GRM_FAIL" -eq 0 ] && [ "$BRL_FAIL" -eq 0 ] && [ "$SCB_FAIL" -eq 0 ] && [ "$RR_FAIL" -eq 0 ] && [ "$BCL_FAIL" -eq 0 ] && [ "$BO_FAIL" -eq 0 ] && [ "$IDP_FAIL" -eq 0 ] && [ "$RES_FAIL" -eq 0 ] && [ "$CVR_FAIL" -eq 0 ] && [ "$HRS_FAIL" -eq 0 ] && [ "$SHR_FAIL" -eq 0 ] && [ "$BPL_FAIL" -eq 0 ] && [ "$BDN_FAIL" -eq 0 ] && [ "$IDISC_FAIL" -eq 0 ] && [ "$ERC_FAIL" -eq 0 ] && [ "$SCP_FAIL" -eq 0 ] && [ "$IMR_FAIL" -eq 0 ] && [ "$RVR_FAIL" -eq 0 ] && [ "$WIC_FAIL" -eq 0 ] && [ "$RTH_FAIL" -eq 0 ] && [ "$TFP_FAIL" -eq 0 ] && [ "$TGR_FAIL" -eq 0 ]