From a271f54ee922c6100a9f02147b1a9af38d832b28 Mon Sep 17 00:00:00 2001 From: Tim Jacks <53003551+jimmytacks@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:37:40 +0100 Subject: [PATCH] Add retry logic and graceful recovery to research loop API errors (500, overloaded) now trigger up to 3 attempts (2 retries) with linear backoff (30s, then 60s). If all attempts fail, the iteration is skipped without counting against the max. When an iteration completes but has no handoff signal (partial crash), the loop continues with the same persona and injects context about the failure into the handoff file so the next iteration knows to check git status and resume rather than redo work. Co-Authored-By: Claude Opus 4.6 (1M context) --- experiments/run-loop.sh | 78 +++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/experiments/run-loop.sh b/experiments/run-loop.sh index 64e8ea82..a73e49f6 100644 --- a/experiments/run-loop.sh +++ b/experiments/run-loop.sh @@ -215,22 +215,52 @@ while [ $iteration -lt $MAX_ITERATIONS ]; do break fi - if [ "$VERBOSE" = true ]; then - echo "$prompt" | claude -p - \ - --model opus \ - --dangerously-skip-permissions \ - --max-turns 100 \ - --verbose \ - --output-format stream-json \ - | tee "$session_file" - else - echo "$prompt" | claude -p - \ - --model opus \ - --dangerously-skip-permissions \ - --max-turns 100 \ - --verbose \ - --output-format stream-json \ - > "$session_file" + # Retry loop for transient API errors + MAX_RETRIES=3 + retry=0 + api_error=true + + while [ "$api_error" = true ] && [ $retry -lt $MAX_RETRIES ]; do + if [ $retry -gt 0 ]; then + backoff=$(( 30 * retry )) + echo " Retrying in ${backoff}s (attempt $((retry + 1))/$MAX_RETRIES)..." 
+ sleep $backoff + fi + + if [ "$VERBOSE" = true ]; then + echo "$prompt" | claude -p - \ + --model opus \ + --dangerously-skip-permissions \ + --max-turns 100 \ + --verbose \ + --output-format stream-json \ + | tee "$session_file" + else + echo "$prompt" | claude -p - \ + --model opus \ + --dangerously-skip-permissions \ + --max-turns 100 \ + --verbose \ + --output-format stream-json \ + > "$session_file" + fi + + # Check for API errors in session output + if grep -q '"type":"api_error"\|"type":"overloaded_error"\|"error":{"type":"api_error"' "$session_file" 2>/dev/null; then + echo " API error detected in iteration $iteration." + echo "API error on attempt $((retry + 1)): $(grep -o 'API Error.*' "experiments/data/session-${iteration}-result.txt" 2>/dev/null || echo 'see session file')" >> "$DEBUG_LOG" + retry=$((retry + 1)) + else + api_error=false + fi + done + + if [ "$api_error" = true ]; then + echo " Failed after $MAX_RETRIES retries. Skipping iteration." + echo "Iteration $iteration FAILED after $MAX_RETRIES retries" >> "$DEBUG_LOG" + # Don't count this iteration — decrement and continue to next + iteration=$((iteration - 1)) + continue fi # Extract result text from session file @@ -295,8 +325,20 @@ while [ $iteration -lt $MAX_ITERATIONS ]; do echo "Research complete after $iteration iterations." break else - echo "No handoff signal detected (got: '$handoff'). Stopping loop." - break + echo "No handoff signal detected (got: '$handoff'). Retrying same persona ($PERSONA)." + # Write context about the failed iteration into the handoff file + # so the next iteration knows it may be resuming partial work + cat > "$HANDOFF_FILE" <