From f1387bc3c97ea63363904236e8975a3f93d6a13b Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 11:59:06 -0300
Subject: [PATCH 01/14] fix: ensure content+role fields always present in
 streaming deltas for OpenAI compat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #1370 — OpenCode / @ai-sdk/openai-compatible streaming crash

Three fixes for reasoning model streaming:

1. Streaming proxy normalization (streaming_proxy.cpp):
   - Intercepts each SSE data: {...} line in forward_sse_stream()
   - Injects content: "" when reasoning_content is present without content
   - Injects role: "assistant" when null/missing on assistant deltas
   - Only applies to chat.completion.chunk objects (non-chat passthrough)

2. Non-streaming response (server.cpp):
   - Same content injection for REST chat completions response

3. thinking: false passthrough (server.cpp):
   - Replaced strip_handled_thinking_fields() (which erased enable_thinking/
     thinking before forwarding) with normalize_thinking_fields() which
     renames thinking → enable_thinking and keeps it in the forwarded
     request. FLM/vLLM/cloud backends now see enable_thinking.
   - /no_think prefix retained for llama.cpp compatibility

Tests: 11 unit tests covering role normalization, reasoning content
normalization, carriage return, multi-choice, multi-line streams.
7/7 C++ tests pass (100%).
---
 CMakeLists.txt                                |  22 ++
 src/cpp/include/lemon/streaming_proxy.h       |   8 +-
 src/cpp/server/server.cpp                     |  40 +++-
 src/cpp/server/streaming_proxy.cpp            | 153 +++++++++++++-
 ...test_streaming_proxy_reasoning_content.cpp | 193 ++++++++++++++++++
 5 files changed, 407 insertions(+), 9 deletions(-)
 create mode 100644 test/cpp/test_streaming_proxy_reasoning_content.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 311b19e3a..f0cc11f75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1834,3 +1834,25 @@ if(EXISTS "${_AUTO_TUNE_TEST_SRC}")
     include(CTest)
     add_test(NAME AutoTuneTest COMMAND test_auto_tune)
 endif()
+
+# Streaming proxy reasoning content normalization: ensures `content` field is
+# always present alongside `reasoning_content` in SSE delta chunks.
+set(_STREAMING_REASONING_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_streaming_proxy_reasoning_content.cpp"
+)
+if(EXISTS "${_STREAMING_REASONING_TEST_SRC}")
+    add_executable(test_streaming_proxy_reasoning_content
+        test/cpp/test_streaming_proxy_reasoning_content.cpp
+    )
+    target_include_directories(test_streaming_proxy_reasoning_content PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/include
+        ${CMAKE_CURRENT_BINARY_DIR}/include
+    )
+    target_link_libraries(test_streaming_proxy_reasoning_content PRIVATE
+        lemonade-server-core
+        nlohmann_json::nlohmann_json
+    )
+
+    include(CTest)
+    add_test(NAME StreamingProxyReasoningContentTest COMMAND test_streaming_proxy_reasoning_content)
+endif()
diff --git a/src/cpp/include/lemon/streaming_proxy.h b/src/cpp/include/lemon/streaming_proxy.h
index 4d56b702a..070ffbc8e 100644
--- a/src/cpp/include/lemon/streaming_proxy.h
+++ b/src/cpp/include/lemon/streaming_proxy.h
@@ -51,8 +51,14 @@ class StreamingProxy {
         std::function<void()> on_chunk = nullptr
     );
 
+    // Normalize streaming chat.completion.chunk SSE deltas for OpenAI API
+    // compatibility. Applies:
+    //   1. Injects `role: "assistant"` when null/missing on assistant deltas
+    //   2. Injects `content: ""` alongside `reasoning_content` when absent
+    static std::string normalize_chat_completion_chunk(const std::string& sse_chunk);
+
 private:
     static TelemetryData parse_telemetry(const std::string& buffer);
 };
 
-} // namespace lemon
+} // namespace lemon
\ No newline at end of file
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index 10724bbfa..26ba3a12d 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -93,10 +93,33 @@ bool should_disable_thinking(const json& request_json) {
     return false;
 }
 
-bool strip_handled_thinking_fields(json& request_json) {
+bool normalize_thinking_fields(json& request_json) {
     bool modified = false;
-    modified = request_json.erase("enable_thinking") > 0 || modified;
-    modified = request_json.erase("thinking") > 0 || modified;
+
+    // Rename OpenAI's `thinking` (boolean or object) to lemonade's canonical
+    // `enable_thinking` (boolean) so backends (FLM, llama.cpp, cloud, etc.)
+    // that understand the flag can act on it instead of having it stripped.
+    if (request_json.contains("thinking")) {
+        const auto& thinking = request_json["thinking"];
+        bool enable = true;
+        if (thinking.is_boolean()) {
+            enable = thinking.get<bool>();
+            request_json["enable_thinking"] = enable;
+        } else if (thinking.is_object()) {
+            std::string type = thinking.value("type", "");
+            if (type == "disabled") {
+                request_json["enable_thinking"] = false;
+            } else if (type == "enabled") {
+                request_json["enable_thinking"] = true;
+            }
+        }
+        request_json.erase("thinking");
+        modified = true;
+    }
+
+    // Don't strip enable_thinking — forward to backends that support it.
+    // The /no_think prefix (injected earlier for llama.cpp) is an additional
+    // signal; backends like FLM/vLLM use the enable_thinking field directly.
     return modified;
 }
 
@@ -2164,7 +2187,7 @@ void Server::handle_chat_completions(const httplib::Request& req, httplib::Respo
         if (should_disable_thinking(request_json)) {
             request_modified = prepend_no_think_to_last_user_message(request_json) || request_modified;
         }
-        request_modified = strip_handled_thinking_fields(request_json) || request_modified;
+        request_modified = normalize_thinking_fields(request_json) || request_modified;
 
         // If we modified the request (or normalized the model name earlier), serialize to string
         // The early normalize_client_model_name() call modifies request_json but doesn't set a flag,
@@ -2227,6 +2250,15 @@ void Server::handle_chat_completions(const httplib::Request& req, httplib::Respo
                             LOG(DEBUG, "Server") << "Message content: " << message["content"].get<std::string>().substr(0, 200) << std::endl;
                         }
                     }
+
+                    // Ensure content field is present alongside reasoning_content
+                    // Standard OpenAI-compatible clients expect content to always be present
+                    const bool has_reasoning = message.contains("reasoning_content") &&
+                                               message["reasoning_content"].is_string();
+                    const bool has_content = message.contains("content") && !message["content"].is_null();
+                    if (has_reasoning && !has_content) {
+                        message["content"] = "";
+                    }
                 }
             }
 
diff --git a/src/cpp/server/streaming_proxy.cpp b/src/cpp/server/streaming_proxy.cpp
index abb22fc02..096a4523d 100644
--- a/src/cpp/server/streaming_proxy.cpp
+++ b/src/cpp/server/streaming_proxy.cpp
@@ -9,6 +9,119 @@
 
 namespace lemon {
 
+namespace {
+
+// Normalize a single `data: {...}` SSE line for chat.completion.chunk objects.
+// Applies two fixes for OpenAI API compliance:
+//
+// 1. Role normalization: some backends emit null or missing `delta.role` on
+//    content chunks. Injects `"role": "assistant"` when the delta contains
+//    assistant-type fields (content, reasoning_content, thinking, tool_calls,
+//    function_call) but role is absent or null.
+//
+// 2. Content normalization: backends that emit `reasoning_content` often omit
+//    the standard `content` field entirely. Injects `"content": ""` to prevent
+//    OpenAI-compatible clients (e.g. @ai-sdk/openai-compatible) from resetting
+//    the connection when content is expected but absent.
+std::string normalize_data_line(const std::string& line) {
+    const std::string prefix = "data: ";
+    if (line.rfind(prefix, 0) != 0) {
+        return line;
+    }
+
+    // Preserve trailing \r if present (some SSE implementations send \r\n)
+    std::string suffix;
+    std::string payload = line.substr(prefix.size());
+    if (!payload.empty() && payload.back() == '\r') {
+        suffix = "\r";
+        payload.pop_back();
+    }
+
+    if (payload.empty() || payload == "[DONE]") {
+        return line;
+    }
+
+    try {
+        auto chunk = json::parse(payload);
+        // Only normalize chat.completion.chunk objects — leave text_completion,
+        // error frames, and other SSE events untouched.
+        if (!chunk.is_object() ||
+            !chunk.contains("object") ||
+            !chunk["object"].is_string() ||
+            chunk["object"].get<std::string>() != "chat.completion.chunk" ||
+            !chunk.contains("choices") ||
+            !chunk["choices"].is_array()) {
+            return line;
+        }
+
+        bool changed = false;
+        for (auto& choice : chunk["choices"]) {
+            if (!choice.is_object() || !choice.contains("delta") || !choice["delta"].is_object()) {
+                continue;
+            }
+
+            auto& delta = choice["delta"];
+
+            // --- Fix 1: Role normalization ---
+            const bool role_is_null = delta.contains("role") && delta["role"].is_null();
+            const bool role_is_missing = !delta.contains("role");
+            const bool has_assistant_delta =
+                delta.contains("content") ||
+                delta.contains("reasoning_content") ||
+                delta.contains("thinking") ||
+                delta.contains("tool_calls") ||
+                delta.contains("function_call");
+
+            if (role_is_null || (role_is_missing && has_assistant_delta)) {
+                delta["role"] = "assistant";
+                changed = true;
+            }
+
+            // --- Fix 2: Content normalization ---
+            // If delta has reasoning_content but content is missing or null,
+            // inject empty content string for OpenAI compatibility
+            const bool has_reasoning = delta.contains("reasoning_content") &&
+                                       delta["reasoning_content"].is_string();
+            const bool has_content = delta.contains("content") && !delta["content"].is_null();
+
+            if (has_reasoning && !has_content) {
+                delta["content"] = "";
+                changed = true;
+            }
+        }
+
+        if (!changed) {
+            return line;
+        }
+
+        return prefix + chunk.dump() + suffix;
+    } catch (...) {
+        // Malformed JSON — pass through unchanged
+        return line;
+    }
+}
+
+} // namespace
+
+std::string StreamingProxy::normalize_chat_completion_chunk(const std::string& sse_chunk) {
+    std::string output;
+    size_t pos = 0;
+
+    while (pos < sse_chunk.size()) {
+        size_t newline = sse_chunk.find('\n', pos);
+        if (newline == std::string::npos) {
+            output += normalize_data_line(sse_chunk.substr(pos));
+            break;
+        }
+
+        output += normalize_data_line(sse_chunk.substr(pos, newline - pos));
+        output.push_back('\n');
+        pos = newline + 1;
+    }
+
+    return output;
+}
+
 void StreamingProxy::forward_sse_stream(
     const std::string& backend_url,
     const std::string& request_body,
@@ -24,30 +137,55 @@ void StreamingProxy::forward_sse_stream(
     double time_to_first_token = 0.0;
     const auto start_time = std::chrono::steady_clock::now();
 
+    // Line buffer for SSE normalization: libcurl may deliver an SSE line split
+    // across multiple write callbacks, so we accumulate partial input and only
+    // normalize complete lines (terminated by '\n') before forwarding.
+    std::string line_buffer;
+
     auto result = utils::HttpClient::post_stream(
         backend_url,
         request_body,
         [&sink, &telemetry_buffer, &has_done_marker, &has_first_token,
-         &time_to_first_token, &start_time, &on_chunk](const char* data, size_t length) {
+         &time_to_first_token, &start_time, &on_chunk, &line_buffer](const char* data, size_t length) {
             if (on_chunk) {
                 on_chunk();
             }
 
+            // Telemetry buffer — raw bytes, pre-normalization
             telemetry_buffer.append(data, length);
 
             std::string chunk(data, length);
+
+            // First-token timing
             if (!has_first_token && chunk.find("data: ") != std::string::npos) {
                 has_first_token = true;
                 time_to_first_token = std::chrono::duration<double>(
                     std::chrono::steady_clock::now() - start_time).count();
             }
 
+            // [DONE] marker detection
             if (chunk.find("data: [DONE]") != std::string::npos) {
                 has_done_marker = true;
             }
 
-            if (!sink.write(data, length)) {
-                return false;
+            // Accumulate bytes and flush only complete (newline-terminated) lines
+            // so normalization can safely parse each `data: {...}` payload.
+            line_buffer.append(chunk);
+            std::string output;
+            size_t pos = 0;
+            size_t newline;
+            while ((newline = line_buffer.find('\n', pos)) != std::string::npos) {
+                output.append(
+                    StreamingProxy::normalize_chat_completion_chunk(
+                        line_buffer.substr(pos, newline - pos + 1)));
+                pos = newline + 1;
+            }
+            line_buffer.erase(0, pos);
+
+            if (!output.empty()) {
+                if (!sink.write(output.data(), output.size())) {
+                    return false; // Client disconnected
+                }
             }
 
             return true;
@@ -76,6 +214,13 @@ void StreamingProxy::forward_sse_stream(
     }
 
     if (!stream_error) {
+        // Flush any trailing partial line before sending [DONE]
+        if (!line_buffer.empty()) {
+            std::string tail = StreamingProxy::normalize_chat_completion_chunk(line_buffer);
+            sink.write(tail.data(), tail.size());
+            line_buffer.clear();
+        }
+
         // Ensure [DONE] marker is sent only for clean transports. If the transport
         // was interrupted before [DONE], the block above throws and recovery is
         // handled by WrappedServer/Router instead of pretending success.
@@ -233,4 +378,4 @@ StreamingProxy::TelemetryData StreamingProxy::parse_telemetry(const std::string&
     return telemetry;
 }
 
-} // namespace lemon
+} // namespace lemon
\ No newline at end of file
diff --git a/test/cpp/test_streaming_proxy_reasoning_content.cpp b/test/cpp/test_streaming_proxy_reasoning_content.cpp
new file mode 100644
index 000000000..3c4fc115c
--- /dev/null
+++ b/test/cpp/test_streaming_proxy_reasoning_content.cpp
@@ -0,0 +1,193 @@
+#include "lemon/streaming_proxy.h"
+#include <cassert>
+#include <iostream>
+#include <nlohmann/json.hpp>
+#include <sstream>
+#include <string>
+
+using json = nlohmann::json;
+
+static json parse_first_data_json(const std::string& sse) {
+    const std::string prefix = "data: ";
+    auto start = sse.find(prefix);
+    assert(start != std::string::npos);
+    start += prefix.size();
+    auto end = sse.find('\n', start);
+    assert(end != std::string::npos);
+    return json::parse(sse.substr(start, end - start));
+}
+
+// ===== Role normalization tests =====
+
+static void test_null_role_is_normalized() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"hi\",\"role\":null},\"finish_reason\":null}]}\n\n";
+
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+
+    assert(chunk["choices"][0]["delta"]["role"] == "assistant");
+    assert(chunk["choices"][0]["delta"]["content"] == "hi");
+}
+
+static void test_missing_role_on_content_delta_is_added() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"hi\"},\"finish_reason\":null}]}\n\n";
+
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+
+    assert(chunk["choices"][0]["delta"]["role"] == "assistant");
+}
+
+static void test_empty_delta_chunk_role_is_not_mutated() {
+    // A finish-reason-only delta (no assistant payload) should not get a role added.
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{},\"finish_reason\":\"stop\"}]}\n\n";
+    assert(lemon::StreamingProxy::normalize_chat_completion_chunk(input) == input);
+}
+
+static void test_done_marker_and_non_chat_chunks_are_preserved() {
+    std::string done = "data: [DONE]\n\n";
+    assert(lemon::StreamingProxy::normalize_chat_completion_chunk(done) == done);
+
+    std::string completion =
+        "data: {\"object\":\"text_completion\",\"choices\":[{\"index\":0,\"text\":\"hi\"}]}\n\n";
+    assert(lemon::StreamingProxy::normalize_chat_completion_chunk(completion) == completion);
+}
+
+// ===== Reasoning content normalization tests =====
+
+static void test_reasoning_content_without_content_gets_empty_content() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"reasoning_content\":\"Let me think...\"},\"finish_reason\":null}]}\n\n";
+
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+
+    assert(chunk["choices"][0]["delta"]["reasoning_content"] == "Let me think...");
+    assert(chunk["choices"][0]["delta"]["content"] == "");
+}
+
+static void test_reasoning_content_with_null_content_gets_empty_content() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"reasoning_content\":\"hmm\",\"content\":null},\"finish_reason\":null}]}\n\n";
+
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+
+    assert(chunk["choices"][0]["delta"]["reasoning_content"] == "hmm");
+    assert(chunk["choices"][0]["delta"]["content"] == "");
+}
+
+static void test_normal_content_delta_gets_role_added() {
+    // Content with no role gets role: assistant injected (role normalization)
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"Hello\"},\"finish_reason\":null}]}\n\n";
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+    assert(chunk["choices"][0]["delta"]["content"] == "Hello");
+    assert(chunk["choices"][0]["delta"]["role"] == "assistant");
+}
+
+static void test_content_and_reasoning_together_is_not_content_mutated() {
+    // Both content and reasoning_content present — content should not be touched
+    // Role gets added (role normalization) since none was present
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"The\",\"reasoning_content\":\"thinking\"},\"finish_reason\":null}]}\n\n";
+    auto output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+    assert(chunk["choices"][0]["delta"]["content"] == "The");
+    assert(chunk["choices"][0]["delta"]["reasoning_content"] == "thinking");
+    assert(chunk["choices"][0]["delta"]["role"] == "assistant");
+}
+
+// ===== Combined edge cases =====
+
+static void test_carriage_return_is_preserved() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"reasoning_content\":\"hmm\"},\"finish_reason\":null}]}\r\n\r\n";
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+    assert(chunk["choices"][0]["delta"]["reasoning_content"] == "hmm");
+    assert(chunk["choices"][0]["delta"]["content"] == "");
+    assert(output.size() >= 2 && output[output.size() - 2] == '\r');
+}
+
+static void test_multi_choice_handling() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":["
+        "{\"index\":0,\"delta\":{\"reasoning_content\":\"think\"},\"finish_reason\":null},"
+        "{\"index\":1,\"delta\":{\"content\":\"Hello\"},\"finish_reason\":null}"
+        "]}\n\n";
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    auto chunk = parse_first_data_json(output);
+    // Choice 0: reasoning + content injection + role
+    assert(chunk["choices"][0]["delta"]["reasoning_content"] == "think");
+    assert(chunk["choices"][0]["delta"]["content"] == "");
+    assert(chunk["choices"][0]["delta"]["role"] == "assistant");
+    // Choice 1: unchanged (already has content)
+    assert(!chunk["choices"][1]["delta"].contains("reasoning_content"));
+    assert(chunk["choices"][1]["delta"]["content"] == "Hello");
+}
+
+static void test_multiple_lines() {
+    std::string input =
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"role\":\"assistant\"},\"finish_reason\":null}]}\n"
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"reasoning_content\":\"Let me\"},\"finish_reason\":null}]}\n"
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"reasoning_content\":\" think\"},\"finish_reason\":null}]}\n"
+        "data: {\"object\":\"chat.completion.chunk\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"The answer is\"},\"finish_reason\":null}]}\n"
+        "data: [DONE]\n";
+
+    std::string output = lemon::StreamingProxy::normalize_chat_completion_chunk(input);
+    
+    std::istringstream stream(output);
+    std::string line;
+    int line_num = 0;
+    while (std::getline(stream, line)) {
+        if (line.find("data: ") != 0) continue;
+        std::string payload = line.substr(6);
+        if (payload == "[DONE]") break;
+        
+        auto chunk = json::parse(payload);
+        auto& delta = chunk["choices"][0]["delta"];
+        
+        if (line_num == 0) {
+            // Role delta — unchanged
+            assert(delta["role"] == "assistant");
+        } else if (line_num >= 1 && line_num <= 2) {
+            // Reasoning deltas — should have content: "" AND role: assistant
+            assert(delta.contains("reasoning_content"));
+            assert(delta["content"] == "");
+            assert(delta["role"] == "assistant");
+        } else if (line_num == 3) {
+            // Content delta — unchanged
+            assert(delta["content"] == "The answer is");
+            assert(!delta.contains("reasoning_content"));
+        }
+        line_num++;
+    }
+    assert(line_num >= 4);
+}
+
+int main() {
+    // Role normalization
+    test_null_role_is_normalized();
+    test_missing_role_on_content_delta_is_added();
+    test_empty_delta_chunk_role_is_not_mutated();
+    test_done_marker_and_non_chat_chunks_are_preserved();
+
+    // Reasoning content normalization
+    test_reasoning_content_without_content_gets_empty_content();
+    test_reasoning_content_with_null_content_gets_empty_content();
+    test_normal_content_delta_gets_role_added();
+    test_content_and_reasoning_together_is_not_content_mutated();
+
+    // Combined edge cases
+    test_carriage_return_is_preserved();
+    test_multi_choice_handling();
+    test_multiple_lines();
+
+    std::cout << "all streaming proxy normalization tests passed\n";
+    return 0;
+}
\ No newline at end of file

From 6849d928387b1a7a2a38c3fd939692fdf5442e43 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:11:14 -0300
Subject: [PATCH 02/14] fix: download TheRock ROCm backend for all detected GPU
 architectures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #2371 — multi-GPU systems only got ROCm for the first GPU arch

get_rocm_arch() iterates AMD GPUs (iGPU first, then dGPU) and returns
only the first match. On systems with both an iGPU and dGPU with
different architectures, TheRock was only installed for the iGPU.

Fix:
- Add get_rocm_arches() returning ALL detected AMD GPU architectures
  (deduplicated, iGPU-first ordering preserved)
- Update install_therock_if_needed() to install TheRock for every arch
- Keep get_rocm_arch() for backward compat (rocm_channel, display)
---
 src/cpp/include/lemon/system_info.h |  7 ++++++
 src/cpp/server/backend_manager.cpp  | 17 ++++++++++++---
 src/cpp/server/system_info.cpp      | 33 +++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/src/cpp/include/lemon/system_info.h b/src/cpp/include/lemon/system_info.h
index 9b143ae47..fa192d838 100644
--- a/src/cpp/include/lemon/system_info.h
+++ b/src/cpp/include/lemon/system_info.h
@@ -108,7 +108,14 @@ class SystemInfo {
     static std::string get_system_llamacpp_version();
 
     // Device support detection
+    // Return the first (primary) ROCm architecture — typically the iGPU.
+    // Used for rocm_channel selection and single-arch contexts.
     static std::string get_rocm_arch();
+
+    // Return ALL detected AMD GPU ROCm architectures, ordered iGPU first
+    // then dGPUs. Used by backend download paths to install ROCm binaries
+    // for every GPU on the system, not just the first one detected.
+    static std::vector<std::string> get_rocm_arches();
     static std::string get_cuda_arch();
 
     // CUDA release assets are architecture-specific (sm_89, sm_120, etc.).
diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp
index 120b61428..6e1537367 100644
--- a/src/cpp/server/backend_manager.cpp
+++ b/src/cpp/server/backend_manager.cpp
@@ -215,11 +215,22 @@ void install_therock_if_needed(const std::string& os, const json& backend_versio
         return;
     }
 
-    std::string rocm_arch = SystemInfo::get_rocm_arch();
+    std::vector<std::string> rocm_arches = SystemInfo::get_rocm_arches();
     std::string version = backend_versions["therock"]["version"].get<std::string>();
 
-    // Install TheRock for this architecture
-    backends::BackendUtils::install_therock(rocm_arch, version, progress_cb);
+    if (rocm_arches.empty()) {
+        // Fall back to single-arch detection for backward compatibility
+        std::string single_arch = SystemInfo::get_rocm_arch();
+        if (!single_arch.empty()) {
+            backends::BackendUtils::install_therock(single_arch, version, progress_cb);
+        }
+        return;
+    }
+
+    // Install TheRock for each detected GPU architecture
+    for (const auto& rocm_arch : rocm_arches) {
+        backends::BackendUtils::install_therock(rocm_arch, version, progress_cb);
+    }
 }
 
 } // namespace
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 056297bad..6f0e75c2d 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -2140,6 +2140,39 @@ std::string SystemInfo::get_rocm_arch() {
     return "";  // No supported architecture found
 }
 
+std::vector<std::string> SystemInfo::get_rocm_arches() {
+    // Returns ALL detected AMD GPU ROCm architectures.
+    // Ordered: iGPU first, then dGPUs (same order as the amd_gpu array).
+    // Used to ensure ROCm backends are downloaded for every GPU on the system.
+    std::vector<std::string> arches;
+    try {
+        json system_info = SystemInfoCache::get_system_info_with_cache();
+        if (!system_info.contains("devices")) {
+            return arches;
+        }
+        const auto& devices = system_info["devices"];
+        if (devices.contains("amd_gpu") && devices["amd_gpu"].is_array()) {
+            for (const auto& gpu : devices["amd_gpu"]) {
+                if (gpu.value("available", false)) {
+                    std::string name = gpu.value("name", "");
+                    if (!name.empty()) {
+                        std::string arch = identify_rocm_arch_from_name(name);
+                        if (!arch.empty()) {
+                            // Avoid duplicates (e.g., identical iGPU/dGPU arch strings)
+                            if (std::find(arches.begin(), arches.end(), arch) == arches.end()) {
+                                arches.push_back(arch);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } catch (...) {
+        // Detection failed — return empty list
+    }
+    return arches;
+}
+
 static int cuda_sm_value(const std::string& arch) {
     if (arch.size() <= 3 || arch.substr(0, 3) != "sm_") {
         return 0;

From 184d3b7b4916d847b487eff99a2ec1118889ab0b Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:44:19 -0300
Subject: [PATCH 03/14] fix: SSE heartbeat for long prefill + download
 resilience (#1364, #1546)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#1364 — Large Prompts Timing Out
- Add SSE keepalive heartbeat thread in forward_sse_stream()
- Sends : keepalive\n\n every 10s during prefill while waiting for first token
- Prevents client-side read timeouts on long-running prompt processing
- Thread-safe via shared mutex with the libcurl write callback

#1546 — Model Download Resilience
- Add .completed sentinel written after all files are verified in download
- is_checkpoint_path_complete() checks for .completed as authoritative marker
- Prevents corrupt partially-downloaded files from appearing complete
- Hardened recursive_directory_iterator with skip_permission_denied + error_code
---
 src/cpp/server/model_manager.cpp   | 33 +++++++++++++++++--
 src/cpp/server/streaming_proxy.cpp | 52 +++++++++++++++++++++++++++---
 2 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 6cd2a483c..a79e0d76d 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1025,9 +1025,16 @@ std::map<std::string, ModelInfo> ModelManager::discover_extra_models() const {
     std::map<std::string, std::vector<fs::path>> dirs_with_gguf;  // directory -> list of gguf files
     std::vector<fs::path> standalone_files;  // GGUF files not in subdirectories
 
-    // Recursively find all .gguf files
+    // Recursively find all .gguf files. Use error_code to skip inaccessible
+    // entries (permission denied, broken symlinks, dangling temp files from
+    // interrupted downloads) instead of throwing.
     try {
-        for (const auto& entry : fs::recursive_directory_iterator(search_dir)) {
+        std::error_code ec;
+        for (const auto& entry : fs::recursive_directory_iterator(search_dir, fs::directory_options::skip_permission_denied, ec)) {
+            if (ec) {
+                ec.clear();
+                continue;
+            }
             if (!entry.is_regular_file()) continue;
 
             std::string filename = entry.path().filename().string();
@@ -1761,7 +1768,15 @@ static bool is_checkpoint_path_complete(const std::string& path_str) {
         return !safe_exists(path_from_utf8(path_str + ".partial"));
     }
 
-    return !has_partial_files(resolved);
+    if (has_partial_files(resolved)) return false;
+
+    // Check for .completed sentinel — this is the authoritative marker that a
+    // download finished successfully. Without it, a crash during download
+    // (between fs::rename and manifest removal) leaves a corrupt file that is
+    // indistinguishable from a complete download by other checks alone.
+    // The sentinel is written after all files are verified in
+    // download_from_huggingface().
+    return safe_exists(marker_dir / ".completed");
 }
 
 /**
@@ -4406,6 +4421,18 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
         LOG(INFO, "ModelManager") << "Removed download manifest (download complete)" << std::endl;
     }
 
+    // Write .completed sentinel — this is the authoritative marker that a
+    // download finished successfully. Unlike manifest removal (which leaves a
+    // window where a crash produces a corrupt file indistinguishable from a
+    // complete one), the sentinel is only created after all files are verified.
+    // is_checkpoint_path_complete() checks for this file.
+    const fs::path completed_path = snapshot_path / ".completed";
+    if (!safe_exists(completed_path)) {
+        std::ofstream completed_file(path_to_utf8(completed_path));
+        completed_file << "completed\n";
+        completed_file.close();
+    }
+
     // Advance refs/main only after a successful pull that actually uses the
     // latest snapshot for the selected model artifacts. README-only commits keep
     // refs/main on the previous active snapshot.
diff --git a/src/cpp/server/streaming_proxy.cpp b/src/cpp/server/streaming_proxy.cpp
index 096a4523d..f688a3ed1 100644
--- a/src/cpp/server/streaming_proxy.cpp
+++ b/src/cpp/server/streaming_proxy.cpp
@@ -3,6 +3,9 @@
 #include <iostream>
 #include <chrono>
 #include <cstring>
+#include <thread>
+#include <atomic>
+#include <mutex>
 #include <stdexcept>
 #include <curl/curl.h>
 #include <lemon/utils/aixlog.hpp>
@@ -130,22 +133,53 @@ void StreamingProxy::forward_sse_stream(
     long timeout_seconds,
     std::function<void()> on_chunk) {
 
+    // During long prefill phases (e.g., 15k-token prompts taking minutes), no
+    // bytes are sent to the client. This can trigger client-side read timeouts
+    // (e.g. httplib::Client's 300s default) or reverse-proxy idle timeouts.
+    // Send periodic SSE comment lines (`: keepalive`) to reset I/O timeouts
+    // on both sides — SSE parsers ignore comment lines.
+    constexpr auto KEEPALIVE_INTERVAL = std::chrono::seconds(10);
+
     std::string telemetry_buffer;
     bool stream_error = false;
     bool has_done_marker = false;
-    bool has_first_token = false;
+    std::atomic<bool> has_first_token{false};
     double time_to_first_token = 0.0;
     const auto start_time = std::chrono::steady_clock::now();
 
+    // Mutex serialises sink.write() calls from the libcurl callback thread and
+    // the optional keepalive heartbeat thread below.
+    std::mutex sink_mutex;
+
     // Line buffer for SSE normalization: libcurl may deliver an SSE line split
     // across multiple write callbacks, so we accumulate partial input and only
     // normalize complete lines (terminated by '\n') before forwarding.
     std::string line_buffer;
 
+    // Start a keepalive heartbeat thread that sends SSE comment lines every
+    // KEEPALIVE_INTERVAL while waiting for the first token. The thread stops
+    // when has_first_token is set or the stream ends (detected via the shared
+    // flag being polled). This prevents client-side read timeouts during
+    // extended prefill phases where the backend sends no data for minutes.
+    std::atomic<bool> heartbeat_running{true};
+    std::thread heartbeat_thread([&]() {
+        while (!has_first_token.load() && heartbeat_running.load()) {
+            std::this_thread::sleep_for(KEEPALIVE_INTERVAL);
+            if (has_first_token.load() || !heartbeat_running.load()) break;
+            std::lock_guard<std::mutex> lock(sink_mutex);
+            const char* keepalive = ": keepalive\n\n";
+            if (!sink.write(keepalive, strlen(keepalive))) {
+                // Client disconnected — stop heartbeat
+                heartbeat_running.store(false);
+                break;
+            }
+        }
+    });
+
     auto result = utils::HttpClient::post_stream(
         backend_url,
         request_body,
-        [&sink, &telemetry_buffer, &has_done_marker, &has_first_token,
+        [&sink, &sink_mutex, &telemetry_buffer, &has_done_marker, &has_first_token,
          &time_to_first_token, &start_time, &on_chunk, &line_buffer](const char* data, size_t length) {
             if (on_chunk) {
                 on_chunk();
@@ -156,9 +190,9 @@ void StreamingProxy::forward_sse_stream(
 
             std::string chunk(data, length);
 
-            // First-token timing
-            if (!has_first_token && chunk.find("data: ") != std::string::npos) {
-                has_first_token = true;
+            // First-token timing — also signals heartbeat to stop
+            if (!has_first_token.load() && chunk.find("data: ") != std::string::npos) {
+                has_first_token.store(true);
                 time_to_first_token = std::chrono::duration<double>(
                     std::chrono::steady_clock::now() - start_time).count();
             }
@@ -183,6 +217,7 @@ void StreamingProxy::forward_sse_stream(
             line_buffer.erase(0, pos);
 
             if (!output.empty()) {
+                std::lock_guard<std::mutex> lock(sink_mutex);
                 if (!sink.write(output.data(), output.size())) {
                     return false; // Client disconnected
                 }
@@ -194,6 +229,13 @@ void StreamingProxy::forward_sse_stream(
         timeout_seconds
     );
 
+    // Signal heartbeat thread to stop and wait for it. This must happen
+    // before any post-stream sink operations (flush, [DONE], sink.done()).
+    heartbeat_running.store(false);
+    if (heartbeat_thread.joinable()) {
+        heartbeat_thread.join();
+    }
+
     const bool transport_interrupted =
         result.curl_code == CURLE_PARTIAL_FILE || result.curl_code == CURLE_RECV_ERROR;
 

From cba818a017e2724646c7ad41d128abbc6ae7097b Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:48:25 -0300
Subject: [PATCH 04/14] ci: add PR-Agent (DeepSeek) + Qodo dual review
 workflows

---
 .github/workflows/pr-agent-review.yml | 30 +++++++++++++++++++++++++++
 .github/workflows/qodo-merge.yml      | 19 +++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 .github/workflows/pr-agent-review.yml
 create mode 100644 .github/workflows/qodo-merge.yml

diff --git a/.github/workflows/pr-agent-review.yml b/.github/workflows/pr-agent-review.yml
new file mode 100644
index 000000000..d1a8b9155
--- /dev/null
+++ b/.github/workflows/pr-agent-review.yml
@@ -0,0 +1,30 @@
+name: pr-agent-review
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+  issue_comment:
+    types: [created]
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+jobs:
+  pr_agent_job:
+    name: PR-Agent (DeepSeek)
+    runs-on: ubuntu-latest
+    if: ${{ github.event.sender.type != 'Bot' && secrets.DEEPSEEK_API_KEY != '' }}
+    steps:
+      - name: PR Agent review
+        uses: the-pr-agent/pr-agent@main
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
+          config.model: "deepseek/deepseek-chat"
+          config.fallback_models: '["deepseek/deepseek-chat"]'
+          github_action_config.auto_review: "true"
+          github_action_config.auto_describe: "true"
+          github_action_config.auto_improve: "true"
+          pr_description.publish_labels: "true"
+          pr_description.publish_description_as: "suggestion"
+          pr_reviewer.require_score_review: "false"
+          pr_reviewer.num_code_suggestions: "4"
diff --git a/.github/workflows/qodo-merge.yml b/.github/workflows/qodo-merge.yml
new file mode 100644
index 000000000..9bbada1c1
--- /dev/null
+++ b/.github/workflows/qodo-merge.yml
@@ -0,0 +1,19 @@
+name: qodo-merge
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+jobs:
+  qodo-merge:
+    if: ${{ secrets.QODO_API_KEY != "" }}
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+      contents: read
+    steps:
+      - name: Qodo Merge Review
+        uses: qodo-ai/qodo-merge@main
+        env:
+          QODO_API_KEY: ${{ secrets.QODO_API_KEY }}
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}

From 31a64eb39cb03472fad3549f4e953a6fbe440671 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:48:29 -0300
Subject: [PATCH 05/14] fix: pre-load OOM memory warning in router (#1804)

Add a pre-load memory check in Router::load_model() that compares
model_info.size (file size in GB) against get_available_memory_gb()
for the target device. Logs a warning when the model may not fit.

Chose warn-only (not block) because:
1. GGUF file size != load-time memory (mmap'd, paged)
2. Auto-tune ctx_size resolver will reduce context to fit
3. A hard block would frustrate users who know their setup
---
 src/cpp/include/lemon/model_manager.h |  4 +++-
 src/cpp/server/model_manager.cpp      | 24 ++++++++++++++++++++++--
 src/cpp/server/router.cpp             | 21 +++++++++++++++++++++
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index 4efc68ae4..424106873 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -296,8 +296,10 @@ class ModelManager {
     // Download from a JSON manifest
     void download_from_manifest(const json& manifest, std::map<std::string, std::string>& headers, DownloadProgressCallback progress_callback);
 
-    // Download from Hugging Face
+    // Download from Hugging Face. When do_not_upgrade is true and a .completed
+    // sentinel exists in the snapshot directory, the HF API call is skipped.
     void download_from_huggingface(const ModelInfo& info,
+                                   bool do_not_upgrade = false,
                                    DownloadProgressCallback progress_callback = nullptr);
 
     // Download from FLM
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index a79e0d76d..03acb4ee8 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -2972,7 +2972,7 @@ void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_
     if (info.recipe == "flm") {
         download_from_flm(info.checkpoint(), do_not_upgrade, progress_callback);
     } else {
-        download_from_huggingface(info, progress_callback);
+        download_from_huggingface(info, do_not_upgrade, progress_callback);
     }
 
     // Update cache after successful download
@@ -3133,7 +3133,7 @@ json ModelManager::fetch_collection_manifest(const std::string& repo_id, bool do
         manifest_info.model_name = repo_id;
         manifest_info.checkpoints["main"] = repo_id;
         try {
-            download_from_huggingface(manifest_info, nullptr);
+            download_from_huggingface(manifest_info, false, nullptr);
             manifest = read_cached_collection_manifest(cache_dir);
         } catch (const std::exception& e) {
             if (!have_cache) throw;
@@ -4122,8 +4122,28 @@ void ModelManager::download_from_manifest(const json& manifest, std::map<std::st
 //   - llama-server backend: ❌ Cannot download (expects GGUF files pre-cached)
 //   - ryzenai-server backend: ❌ Cannot download (expects ONNX files pre-cached)
 void ModelManager::download_from_huggingface(const ModelInfo& info,
+                                            bool do_not_upgrade,
                                             DownloadProgressCallback progress_callback) {
     std::string main_repo_id = checkpoint_to_repo_id(info.checkpoint("main"));
+
+    // Fast-path: if do_not_upgrade is set and a .completed sentinel exists,
+    // skip the HF API call entirely. This avoids re-downloading files that
+    // are already fully verified on disk.
+    if (do_not_upgrade) {
+        fs::path model_cache_path = path_from_utf8(get_hf_cache_dir()) / repo_id_to_cache_dir_name(main_repo_id);
+        fs::path snapshots_dir = model_cache_path / "snapshots";
+        if (safe_exists(snapshots_dir)) {
+            std::error_code ec;
+            for (const auto& entry : fs::directory_iterator(snapshots_dir, ec)) {
+                if (ec) break;
+                if (entry.is_directory() && safe_exists(entry.path() / ".completed")) {
+                    LOG(INFO, "ModelManager") << "Model already downloaded (found .completed sentinel), skipping API call"
+                                              << std::endl;
+                    return;
+                }
+            }
+        }
+    }
     std::string main_variant = checkpoint_to_variant(info.checkpoint("main"));
 
     // Get Hugging Face cache directory
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index b3ec22c3b..bb229110b 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -486,6 +486,27 @@ void Router::load_model(const std::string& model_name,
             }
         }
 
+        // Pre-load OOM check: estimate if the model fits in available memory.
+        // Done AFTER eviction so freed VRAM/RAM is visible. Uses ModelInfo::size
+        // (file size in GB) as a rough proxy for load-time memory demand.
+        if (model_info.size > 0.0) {
+            constexpr double SAFETY_MARGIN = 0.9;
+            double available = get_available_memory_gb(
+                model_info.device & DEVICE_GPU ? DEVICE_GPU :
+                model_info.device & DEVICE_NPU ? DEVICE_NPU :
+                DEVICE_CPU);
+            double headroom = available * SAFETY_MARGIN;
+            if (model_info.size > headroom) {
+                LOG(WARNING, "Router") << "Model " << canonical_model_name
+                    << " requires ~" << model_info.size << " GB but only "
+                    << headroom << " GB available (safety margin " << SAFETY_MARGIN
+                    << "). Consider freeing memory or reducing ctx_size." << std::endl;
+                // Log only — don't block; the OS will handle OOM if it occurs.
+                // Many models load smaller than their file size (quantization
+                // mapped in pages), and the auto-tune below may reduce ctx_size.
+            }
+        }
+
         // Auto-tune: resolve ctx_size = -1 → computed from memory + arch metadata
         // Done AFTER eviction so that freed VRAM/RAM is visible to the memory query.
         int64_t auto_ctx = resolve_auto_ctx_size(effective_options, model_info);

From 132b09493db42dfbe4ad32d0b229f630316e1f41 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:50:21 -0300
Subject: [PATCH 06/14] fix: SSE heartbeat, download resilience, OOM guard,
 PR-Agent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Completes fixes for #1364, #1546, #1804:

## #1364 — SSE heartbeat during long prefill
Injects : keepalive\n\n every 10s during prefill to prevent client-side
read timeouts on long prompts (15k tokens → 5 min prefill).

## #1546b — .completed sentinel for download verification
Written after all files are verified in download_from_huggingface().
is_checkpoint_path_complete() checks for it, preventing corrupt partially-
downloaded files from appearing complete after a crash.

## #1546a — Model-level download resume fast-path
download_from_huggingface() now accepts do_not_upgrade flag. When set and
.completed sentinel exists, skips the HF API call entirely.

## #1546c — Directory iterator hardening
discover_extra_models() uses skip_permission_denied + error_code handling
to prevent crashes on temp files from interrupted downloads.

## #1804 — Pre-load OOM guard
Upgraded the pre-load memory check from warning to hard block when model
size exceeds 2x available memory headroom. Prevents OOM killer crashes
with a clear error message instead.

## CI — PR-Agent + Qodo dual review
Added pr-agent-review.yml (DeepSeek) and qodo-merge.yml workflows.
---
 src/cpp/server/router.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index bb229110b..c05059484 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -489,6 +489,10 @@ void Router::load_model(const std::string& model_name,
         // Pre-load OOM check: estimate if the model fits in available memory.
         // Done AFTER eviction so freed VRAM/RAM is visible. Uses ModelInfo::size
         // (file size in GB) as a rough proxy for load-time memory demand.
+        // When the model far exceeds available memory (more than 2x headroom),
+        // reject the load with a clear error rather than letting the OS OOM killer
+        // kill the entire process. Smaller overruns may still succeed due to
+        // page-mapped quantization, so those are logged as warnings only.
         if (model_info.size > 0.0) {
             constexpr double SAFETY_MARGIN = 0.9;
             double available = get_available_memory_gb(
@@ -496,14 +500,19 @@ void Router::load_model(const std::string& model_name,
                 model_info.device & DEVICE_NPU ? DEVICE_NPU :
                 DEVICE_CPU);
             double headroom = available * SAFETY_MARGIN;
-            if (model_info.size > headroom) {
+            if (model_info.size > headroom * 2.0) {
+                is_loading_ = false;
+                load_cv_.notify_all();
+                throw std::runtime_error(
+                    "Model '" + canonical_model_name + "' requires ~" +
+                    std::to_string(model_info.size) + " GB but only " +
+                    std::to_string(headroom) + " GB available. "
+                    "Free memory by unloading other models or reducing ctx_size.");
+            } else if (model_info.size > headroom) {
                 LOG(WARNING, "Router") << "Model " << canonical_model_name
                     << " requires ~" << model_info.size << " GB but only "
                     << headroom << " GB available (safety margin " << SAFETY_MARGIN
                     << "). Consider freeing memory or reducing ctx_size." << std::endl;
-                // Log only — don't block; the OS will handle OOM if it occurs.
-                // Many models load smaller than their file size (quantization
-                // mapped in pages), and the auto-tune below may reduce ctx_size.
             }
         }
 

From 74918a2b7c235ba6558b61ca64bd8bde8168ee35 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 12:51:47 -0300
Subject: [PATCH 07/14] fix: model-level resume for interrupted HF downloads
 (#1546a)

Before fetching the HF API and rebuilding the file list, check for an
existing .download_manifest.json with incomplete files. If found, resume
downloading from the partial state instead of starting over.

This avoids re-downloading already-completed files after a network
interruption or Ctrl-C during model pull.
---
 src/cpp/server/model_manager.cpp | 34 ++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 03acb4ee8..96750f84a 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -4144,6 +4144,40 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
             }
         }
     }
+    // Resume fast-path: if a download manifest exists, resume downloading
+    // incomplete files instead of re-fetching the HF API and rebuilding the
+    // file list. This handles the common case of a network interruption.
+    {
+        fs::path model_cache_path = path_from_utf8(get_hf_cache_dir()) / repo_id_to_cache_dir_name(main_repo_id);
+        fs::path manifest_path = model_cache_path / "snapshots" / ".download_manifest.json";
+        if (!safe_exists(manifest_path)) {
+            manifest_path = model_cache_path / ".download_manifest.json";
+        }
+        if (safe_exists(manifest_path)) {
+            try {
+                json manifest = JsonUtils::load_from_file(path_to_utf8(manifest_path));
+                if (manifest.contains("files") && manifest["files"].is_array() && !manifest["files"].empty()) {
+                    LOG(INFO, "ModelManager") << "Resuming interrupted download from existing manifest"
+                                              << std::endl;
+                    std::map<std::string, std::string> headers;
+                    const char* hf_token = std::getenv("HF_TOKEN");
+                    if (hf_token && hf_token[0]) {
+                        headers["Authorization"] = "Bearer " + std::string(hf_token);
+                    }
+                    download_from_manifest(manifest, headers, progress_callback);
+                    if (fs::exists(manifest_path)) {
+                        fs::remove(manifest_path);
+                    }
+                    return;
+                }
+            } catch (...) {
+                // Stale manifest — fall through to fresh download
+                LOG(WARNING, "ModelManager") << "Failed to resume from manifest, starting fresh download"
+                                             << std::endl;
+            }
+        }
+    }
+
     std::string main_variant = checkpoint_to_variant(info.checkpoint("main"));
 
     // Get Hugging Face cache directory

From 11991bc6fdab71b43eda447c6d76b4c316abfa84 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:03:27 -0300
Subject: [PATCH 08/14] ci: remove claude-review.yml (no Anthropic API key
 available)

---
 .github/workflows/claude-review.yml | 42 -----------------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 .github/workflows/claude-review.yml

diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml
deleted file mode 100644
index 6d21b534c..000000000
--- a/.github/workflows/claude-review.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: Claude Code Review
-
-on:
-  pull_request_target:
-    types: [review_requested]
-  issue_comment:
-    types: [created]
-  pull_request_review_comment:
-    types: [created]
-
-permissions:
-  contents: read
-  pull-requests: write
-  issues: write
-  id-token: write
-
-jobs:
-  claude-review:
-    # Only run for maintainers (OWNER, MEMBER, COLLABORATOR)
-    # Triggers on: @claude mention in comments, or assigning claude as reviewer
-    if: |
-      (github.event_name == 'pull_request_target' &&
-       github.event.action == 'review_requested' &&
-       github.event.requested_reviewer.login == 'claude[bot]') ||
-      ((github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment') &&
-       contains(github.event.comment.body, '@claude') &&
-       (github.event.comment.author_association == 'OWNER' ||
-        github.event.comment.author_association == 'MEMBER' ||
-        github.event.comment.author_association == 'COLLABORATOR'))
-    runs-on: ubuntu-latest
-    concurrency:
-      group: claude-review-${{ github.event.pull_request.number || github.event.issue.number }}
-      cancel-in-progress: true
-    steps:
-      - uses: actions/checkout@v5
-        with:
-          fetch-depth: 1
-
-      - uses: anthropics/claude-code-action@v1
-        with:
-          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
-          github_token: ${{ secrets.GITHUB_TOKEN }}

From 5aa839c7b22b6aaa694ad50316f56150e5bd9205 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:04:54 -0300
Subject: [PATCH 09/14] Revert "ci: remove claude-review.yml (no Anthropic API
 key available)"

This reverts commit 11991bc6fdab71b43eda447c6d76b4c316abfa84.
---
 .github/workflows/claude-review.yml | 42 +++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 .github/workflows/claude-review.yml

diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml
new file mode 100644
index 000000000..6d21b534c
--- /dev/null
+++ b/.github/workflows/claude-review.yml
@@ -0,0 +1,42 @@
+name: Claude Code Review
+
+on:
+  pull_request_target:
+    types: [review_requested]
+  issue_comment:
+    types: [created]
+  pull_request_review_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+  id-token: write
+
+jobs:
+  claude-review:
+    # Only run for maintainers (OWNER, MEMBER, COLLABORATOR)
+    # Triggers on: @claude mention in comments, or assigning claude as reviewer
+    if: |
+      (github.event_name == 'pull_request_target' &&
+       github.event.action == 'review_requested' &&
+       github.event.requested_reviewer.login == 'claude[bot]') ||
+      ((github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment') &&
+       contains(github.event.comment.body, '@claude') &&
+       (github.event.comment.author_association == 'OWNER' ||
+        github.event.comment.author_association == 'MEMBER' ||
+        github.event.comment.author_association == 'COLLABORATOR'))
+    runs-on: ubuntu-latest
+    concurrency:
+      group: claude-review-${{ github.event.pull_request.number || github.event.issue.number }}
+      cancel-in-progress: true
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          fetch-depth: 1
+
+      - uses: anthropics/claude-code-action@v1
+        with:
+          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}

From b96224c9291f48944e87ad5a171b22258edafbaf Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:07:10 -0300
Subject: [PATCH 10/14] fix: prevent deadlock in get_system_info_with_cache
 during recipe detection (#2414)

The function used a std::lock_guard which caused a deadlock when
build_recipes_info re-entered get_system_info_with_cache via
get_rocm_arch(). Fix by switching to std::unique_lock, marking
s_recipes_computed early, and unlocking during recipe computation.
On failure the flag resets so the next call retries.
---
 src/cpp/server/system_info.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 6f0e75c2d..8c54c2489 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -3993,7 +3993,7 @@ static bool s_hardware_computed = false;
 static bool s_recipes_computed = false;
 
 json SystemInfoCache::get_system_info_with_cache() {
-    std::lock_guard<std::mutex> lock(s_system_info_mutex);
+    std::unique_lock<std::mutex> lock(s_system_info_mutex);
 
     // Return fully cached result if both hardware and recipes are computed
     if (s_hardware_computed && s_recipes_computed) {
@@ -4041,18 +4041,30 @@ json SystemInfoCache::get_system_info_with_cache() {
     }
 
     // Compute recipes if not cached (or invalidated)
+    // Mark s_recipes_computed early and unlock during recipe computation to
+    // prevent deadlock when build_recipes_info re-enters this function
+    // (e.g. via get_rocm_arch() → get_system_info_with_cache()).
     if (!s_recipes_computed) {
+        s_recipes_computed = true;
+        lock.unlock();
+
         try {
             auto sys_info = create_system_info();
             json devices = s_cached_system_info.contains("devices")
                 ? s_cached_system_info["devices"] : json::object();
-            s_cached_system_info["recipes"] = sys_info->build_recipes_info(devices);
+            json recipes = sys_info->build_recipes_info(devices);
+
+            lock.lock();
+            s_cached_system_info["recipes"] = recipes;
         } catch (const std::exception& e) {
+            lock.lock();
+            s_recipes_computed = false;  // Allow retry on failure
             LOG(ERROR, "Server") << "Recipe detection failed: " << e.what() << std::endl;
         } catch (...) {
+            lock.lock();
+            s_recipes_computed = false;  // Allow retry on failure
             LOG(ERROR, "Server") << "Recipe detection failed with unknown error" << std::endl;
         }
-        s_recipes_computed = true;
     }
 
     return s_cached_system_info;

From 7048b17f320b0f861e03bef4fe33535c14b3be71 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:09:33 -0300
Subject: [PATCH 11/14] fix: skip optional draft checkpoint in
 are_required_checkpoints_complete (#2435)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The draft checkpoint is optional — the model can run without MTP spec
decoding. But are_required_checkpoints_complete() iterates over ALL
checkpoint types, and if the draft model hasn't been downloaded yet
the entire model gets marked as not-downloaded / 'unreadable'.

Fix by skipping the draft type alongside npu_cache.
---
 src/cpp/server/model_manager.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 96750f84a..8ab453d89 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1782,12 +1782,13 @@ static bool is_checkpoint_path_complete(const std::string& path_str) {
 /**
  * Returns true if all files required by the model recipe are present and complete.
  * Note: npu_cache is skipped as it is managed lazily by the flm-npu backend.
+ * Note: draft is skipped as it is an optional MTP checkpoint.
  */
 static bool are_required_checkpoints_complete(const ModelInfo& info) {
     for (const auto& [type, checkpoint] : info.checkpoints) {
         (void)checkpoint;
 
-        if (type == "npu_cache") continue;
+        if (type == "npu_cache" || type == "draft") continue;
 
         const std::string resolved_path = info.resolved_path(type);
         if (!is_checkpoint_path_complete(resolved_path)) {

From 406e82d031bbc248ea947d9cc11be99489f8787c Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:16:00 -0300
Subject: [PATCH 12/14] feat: add setup-repo-secrets script and document
 required PR-Agent secrets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a setup script that reads API keys from ~/Documents/ and pushes
them as GitHub secrets via \`gh secret set\`, plus comment headers in
both PR-Agent workflows referencing how to configure secrets.

Workflows already skip gracefully when secrets are missing, but the
script and comments make it discoverable for new contributors.

- .github/scripts/setup-repo-secrets.sh — idempotent, dry-run, --repo support
- pr-agent-review.yml — header: DEEPSEEK_API_KEY required
- qodo-merge.yml — header: QODO_API_KEY required
---
 .github/scripts/setup-repo-secrets.sh | 106 ++++++++++++++++++++++++++
 .github/workflows/pr-agent-review.yml |   7 ++
 .github/workflows/qodo-merge.yml      |   7 ++
 3 files changed, 120 insertions(+)
 create mode 100644 .github/scripts/setup-repo-secrets.sh

diff --git a/.github/scripts/setup-repo-secrets.sh b/.github/scripts/setup-repo-secrets.sh
new file mode 100644
index 000000000..9671964a8
--- /dev/null
+++ b/.github/scripts/setup-repo-secrets.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+# ---------------------------------------------------------------------------
+# Setup GitHub repo secrets for PR-Agent (DeepSeek) and Qodo Merge workflows.
+#
+# Reads API keys from local files in ~/Documents/ and pushes them as
+# encrypted GitHub Actions secrets via `gh secret set`.
+#
+# Usage:
+#   bash .github/scripts/setup-repo-secrets.sh                  # current repo
+#   bash .github/scripts/setup-repo-secrets.sh --repo owner/repo
+#   bash .github/scripts/setup-repo-secrets.sh --dry-run        # preview only
+#
+# Key sources (edit paths below to match your setup):
+#   ~/Documents/deepseek api key.txt   → DEEPSEEK_API_KEY
+#   ~/Documents/qodo api key.txt       → QODO_API_KEY
+#
+# Prerequisites:
+#   - gh CLI installed and authenticated (`gh auth status`)
+#   - Write / admin access to the target repo
+# ---------------------------------------------------------------------------
+set -euo pipefail
+
+REPO=""
+DRY_RUN=false
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --repo)     REPO="$2"; shift 2 ;;
+    --dry-run)  DRY_RUN=true;  shift ;;
+    *)          echo "Unknown: $1"; exit 1 ;;
+  esac
+done
+
+# --- Config: key file paths -----------------------------------------------
+DEEPSEEK_KEY_FILE="$HOME/Documents/deepseek api key.txt"
+QODO_KEY_FILE="$HOME/Documents/qodo api key.txt"
+
+# --- Checks ----------------------------------------------------------------
+if ! command -v gh &>/dev/null; then
+  echo "❌ gh CLI not found — install it from https://cli.github.com/"
+  exit 1
+fi
+
+if ! gh auth status &>/dev/null; then
+  echo "❌ gh CLI not authenticated — run 'gh auth login' first."
+  exit 1
+fi
+
+GH_ARGS=()
+[[ -n "$REPO" ]] && GH_ARGS+=(--repo "$REPO")
+
+echo "🔍 Target: ${REPO:-$(gh repo view --json nameWithOwner -q .nameWithOwner 2>/dev/null || echo 'current repo')}"
+
+# --- Read keys -------------------------------------------------------------
+read_key() {
+  local path="$1" label="$2"
+  if [[ ! -f "$path" ]]; then
+    echo "⚠️  $label key file not found at: $path"
+    return 1
+  fi
+  local val
+  val="$(tr -d '[:space:]' < "$path")"
+  if [[ -z "$val" ]]; then
+    echo "⚠️  $label key file is empty: $path"
+    return 1
+  fi
+  echo "$val"
+}
+
+echo ""
+echo "📂 Reading keys from ~/Documents/ …"
+
+DEEPSEEK_KEY="$(read_key "$DEEPSEEK_KEY_FILE" "DeepSeek")" || true
+QODO_KEY="$(read_key "$QODO_KEY_FILE" "Qodo")" || true
+
+# --- Set secrets -----------------------------------------------------------
+set_secret() {
+  local name="$1" value="$2"
+  if [[ -z "$value" ]]; then
+    echo "   ⏭️  Skipping $name (no value)"
+    return
+  fi
+  if $DRY_RUN; then
+    echo "   🏁 [DRY-RUN] gh secret set $name ${GH_ARGS[*]}"
+  else
+    echo "   🔐 Setting $name …"
+    echo -n "$value" | gh secret set "$name" "${GH_ARGS[@]}"
+    echo "   ✅ $name set"
+  fi
+}
+
+echo ""
+$DRY_RUN && echo "🏁 DRY RUN — no secrets will be written" || echo "🚀 Setting secrets …"
+echo ""
+
+set_secret "DEEPSEEK_API_KEY" "$DEEPSEEK_KEY"
+set_secret "QODO_API_KEY" "$QODO_KEY"
+
+# --- Summary ---------------------------------------------------------------
+echo ""
+if $DRY_RUN; then
+  echo "🏁 Dry run complete. Run without --dry-run to apply."
+else
+  echo "✅ Done! Secrets are now available to GitHub Actions workflows."
+  echo "   Verify: gh secret list ${GH_ARGS[*]}"
+fi
diff --git a/.github/workflows/pr-agent-review.yml b/.github/workflows/pr-agent-review.yml
index d1a8b9155..d755db87e 100644
--- a/.github/workflows/pr-agent-review.yml
+++ b/.github/workflows/pr-agent-review.yml
@@ -1,3 +1,10 @@
+# ---------------------------------------------------------------------------
+# PR-Agent review powered by DeepSeek.
+#
+# Required repo secret: DEEPSEEK_API_KEY
+#   - Setup: bash .github/scripts/setup-repo-secrets.sh
+#   - Or set manually at: https://github.com/<owner>/<repo>/settings/secrets/actions
+# ---------------------------------------------------------------------------
 name: pr-agent-review
 on:
   pull_request:
diff --git a/.github/workflows/qodo-merge.yml b/.github/workflows/qodo-merge.yml
index 9bbada1c1..4237eb61e 100644
--- a/.github/workflows/qodo-merge.yml
+++ b/.github/workflows/qodo-merge.yml
@@ -1,3 +1,10 @@
+# ---------------------------------------------------------------------------
+# Qodo Merge — automated PR reviews powered by Qodo.
+#
+# Required repo secret: QODO_API_KEY
+#   - Setup: bash .github/scripts/setup-repo-secrets.sh
+#   - Or set manually at: https://github.com/<owner>/<repo>/settings/secrets/actions
+# ---------------------------------------------------------------------------
 name: qodo-merge
 on:
   pull_request:

From d33b12ef6504b7c0490c18c57c6036d7b84cb21e Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:25:25 -0300
Subject: [PATCH 13/14] fix: skip redundant TheRock download when already
 installed (#2413)

install_therock_if_needed() didn't check if TheRock was already
installed before calling install_therock(). When hot-swapping
multiple ROCm backends (e.g. llamacpp:rocm and sd-cpp:rocm), each
triggered a separate 3 GB TheRock download.

Fix by adding is_therock_installed_for_current_arch() check at the
top of install_therock_if_needed().
---
 src/cpp/server/backend_manager.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp
index 6e1537367..21b8204c8 100644
--- a/src/cpp/server/backend_manager.cpp
+++ b/src/cpp/server/backend_manager.cpp
@@ -215,6 +215,12 @@ void install_therock_if_needed(const std::string& os, const json& backend_versio
         return;
     }
 
+    // Already installed — skip to avoid redundant 3 GB download
+    if (is_therock_installed_for_current_arch(backend_versions)) {
+        LOG(DEBUG, "BackendManager") << "TheRock already installed, skipping" << std::endl;
+        return;
+    }
+
     std::vector<std::string> rocm_arches = SystemInfo::get_rocm_arches();
     std::string version = backend_versions["therock"]["version"].get<std::string>();
 

From f4b2dc43be83d7e66c35454461440650d897ce92 Mon Sep 17 00:00:00 2001
From: bong-water-water-bong
 <277547417+bong-water-water-bong@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:25:28 -0300
Subject: [PATCH 14/14] fix: show warning when pulling model in offline mode
 (#2412)

When offline mode is enabled, the server silently skipped the download
but the CLI still printed 'Model pulled successfully'.

Fix: server now includes a 'warning' field in both legacy and SSE
streaming completion responses when offline, and the CLI surfaces it
as 'Model pulled with warning: ...' instead of 'successfully'.
---
 src/cpp/cli/lemonade_client.cpp             | 16 +++++++++++++++-
 src/cpp/include/lemon_cli/lemonade_client.h |  1 +
 src/cpp/server/server.cpp                   | 14 +++++++++++++-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/cpp/cli/lemonade_client.cpp b/src/cpp/cli/lemonade_client.cpp
index 517140d14..8ceffc9f2 100644
--- a/src/cpp/cli/lemonade_client.cpp
+++ b/src/cpp/cli/lemonade_client.cpp
@@ -696,6 +696,15 @@ int LemonadeClient::pull_model(const json& model_data, const std::string& displa
             if (event_type == "complete") {
                 std::cout << std::endl;
                 state.success = true;
+                // Check for warnings (e.g. offline mode skipped the download)
+                try {
+                    auto complete_json = json::parse(event_data);
+                    if (complete_json.contains("warning") && complete_json["warning"].is_string()) {
+                        state.warning_message = complete_json["warning"].get<std::string>();
+                    }
+                } catch (...) {
+                    // Ignore parse errors on the completion event
+                }
             } else if (event_type == "error") {
                 try {
                     auto error_json = json::parse(event_data);
@@ -730,7 +739,12 @@ int LemonadeClient::pull_model(const json& model_data, const std::string& displa
             throw std::runtime_error("Model pull failed");
         }
 
-        std::cout << "Model pulled successfully: " << output_name << std::endl;
+        if (!state.warning_message.empty()) {
+            std::cout << "Model pulled with warning: " << output_name << std::endl;
+            std::cout << "  Warning: " << state.warning_message << std::endl;
+        } else {
+            std::cout << "Model pulled successfully: " << output_name << std::endl;
+        }
         return 0;
     } catch (const HttpError& e) {
         std::cerr << "Error pulling model: " << extract_server_error_message(e) << std::endl;
diff --git a/src/cpp/include/lemon_cli/lemonade_client.h b/src/cpp/include/lemon_cli/lemonade_client.h
index 58339acbd..9e71d9c10 100644
--- a/src/cpp/include/lemon_cli/lemonade_client.h
+++ b/src/cpp/include/lemon_cli/lemonade_client.h
@@ -37,6 +37,7 @@ struct StreamingRequestState {
     bool success = false;
     std::string error_message;
     std::string error_code;
+    std::string warning_message;
     bool total_size_printed = false;
     uint64_t last_file_size = 0;
     std::chrono::steady_clock::time_point file_start_time;
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index 26ba3a12d..5416da11a 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -3706,6 +3706,12 @@ void Server::handle_pull(const httplib::Request& req, httplib::Response& res) {
             model_manager_->download_model(model_name, request_json, do_not_upgrade);
 
             nlohmann::json response = {{"status", "success"}, {"model_name", model_name}};
+            // Inform the user when offline mode prevented an actual download
+            if (auto* cfg = RuntimeConfig::global()) {
+                if (cfg->offline()) {
+                    response["warning"] = "offline mode is enabled; model was not downloaded";
+                }
+            }
             res.set_content(response.dump(), "application/json");
         }
 
@@ -5112,9 +5118,15 @@ void Server::stream_download_operation(
                 operation(progress_cb);
 
                 // If operation completed without sending a "complete" event
-                // (e.g. backend was already installed), send one now
+                // (e.g. backend was already installed, or offline mode skipped
+                // the download), send one now with an appropriate warning.
                 if (!complete_sent) {
                     nlohmann::json done_data = {{"status", "ok"}};
+                    if (auto* cfg = RuntimeConfig::global()) {
+                        if (cfg->offline()) {
+                            done_data["warning"] = "offline mode is enabled; model was not downloaded";
+                        }
+                    }
                     std::string event = "event: complete\ndata: " + done_data.dump() + "\n\n";
                     sink.write(event.c_str(), event.size());
                 }