From 3450adc113acf9a2e9e0877437a76b7e263a4cc7 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 2 Jul 2026 16:13:00 +0530
Subject: [PATCH] fix(desktop): macOS/Linux pre-CEF stale-lock reap + installer
 pre-install kill (#4395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to #3605; the Windows-only runtime reap landed in #3793. macOS and
Linux had no pre-CEF single-instance guard, so a process holding the CEF
SingletonLock past the cache-wait budget could be a healthy running primary,
not a wedged one — the reason the earlier macOS SIGKILL escalation was reverted.

Part 1: add `cef_stale_reap` (macOS/Linux), a pre-CEF reap of a wedged prior
instance gated on a post-update relaunch marker (the staleness signal from
#4395). The updater writes the marker before `app.restart()`; the relaunched
process consumes it (once, deleted, age-bounded). Only a fresh marker + a live
lock holder triggers a reap. Guards: skip self, skip cross-host holders, and
SIGTERM → grace → re-validate the same pid still owns the lock → SIGKILL
(closes the PID-reuse window, honors kill_pid_force's contract). Never reaps a
healthy instance. Wired into run() before cef_preflight::wait_for_cache_release.

Part 2: add an NSIS NSIS_HOOK_PREINSTALL that terminates openhuman-core.exe and
the main binary tree before any file copy, so a stale prior-version process
can't survive an update.

Adds the nix `hostname` feature for gethostname. Pure decision logic is unit
tested on any host; the cfg-gated runtime/installer paths need verification on
real macOS, Linux, and Windows(installer) hosts.
---
 app/src-tauri/Cargo.toml            |   2 +-
 app/src-tauri/nsis-hooks.nsh        |  29 ++
 app/src-tauri/src/cef_stale_reap.rs | 511 ++++++++++++++++++++++++++++
 app/src-tauri/src/lib.rs            |  30 ++
 app/src-tauri/tauri.conf.json       |   5 +
 5 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 app/src-tauri/nsis-hooks.nsh
 create mode 100644 app/src-tauri/src/cef_stale_reap.rs

diff --git a/app/src-tauri/Cargo.toml b/app/src-tauri/Cargo.toml
index 4252e660c0..2e68696ab7 100644
--- a/app/src-tauri/Cargo.toml
+++ b/app/src-tauri/Cargo.toml
@@ -135,7 +135,7 @@ cef = { version = "=146.4.1", default-features = false }
 openhuman_core = { path = "../..", package = "openhuman", default-features = false }
 
 [target.'cfg(unix)'.dependencies]
-nix = { version = "0.29", default-features = false, features = ["signal", "user"] }
+nix = { version = "0.29", default-features = false, features = ["hostname", "signal", "user"] }
 
 [target.'cfg(target_os = "macos")'.dependencies]
 objc2 = "0.6"
diff --git a/app/src-tauri/nsis-hooks.nsh b/app/src-tauri/nsis-hooks.nsh
new file mode 100644
index 0000000000..8b60906762
--- /dev/null
+++ b/app/src-tauri/nsis-hooks.nsh
@@ -0,0 +1,29 @@
+; NSIS installer hooks for OpenHuman (issue #4395, follow-up to #3605).
+;
+; Part 2 of #4395 — installer pre-install process kill. The default tauri NSIS
+; template only stops the *main* binary (OpenHuman.exe) via CheckIfAppIsRunning
+; before overwriting it. A leftover / wedged `openhuman-core.exe` from the prior
+; version (CLI/MCP/core mode) can still hold file handles under $INSTDIR and
+; survive the update, which is exactly the "stale process blocks update" symptom
+; #3605 tracks. Terminate those related processes before any file is copied so
+; the new binary always lands cleanly.
+;
+; NSIS_HOOK_PREINSTALL runs before the template copies files, sets registry
+; keys, and creates shortcuts (and before the built-in CheckIfAppIsRunning), so
+; this is the correct place to guarantee no OpenHuman process is holding the
+; install directory open.
+;
+; taskkill ships with every supported Windows version. /T also terminates child
+; processes (CEF helpers); a non-zero exit (nothing matched) is fine and ignored.
+
+!macro NSIS_HOOK_PREINSTALL
+  DetailPrint "Stopping any running OpenHuman helper processes before install..."
+  ; Force-kill the embedded core / CLI / MCP process and its child tree.
+  nsExec::Exec 'taskkill /F /IM openhuman-core.exe /T'
+  Pop $0
+  ; Belt-and-suspenders: the main binary and its CEF child processes. The
+  ; template's CheckIfAppIsRunning also covers OpenHuman.exe, but in a silent
+  ; auto-update relaunch we want the whole tree gone before files are replaced.
+  nsExec::Exec 'taskkill /F /IM ${MAINBINARYNAME}.exe /T'
+  Pop $0
+!macroend
diff --git a/app/src-tauri/src/cef_stale_reap.rs b/app/src-tauri/src/cef_stale_reap.rs
new file mode 100644
index 0000000000..d7fee7784f
--- /dev/null
+++ b/app/src-tauri/src/cef_stale_reap.rs
@@ -0,0 +1,511 @@
+//! macOS/Linux pre-CEF stale CEF-lock-holder reap (issue #4395, follow-up to
+//! #3605).
+//!
+//! ## The gap this closes
+//!
+//! On Windows a Win32 named mutex early in `run()` forces any *concurrent*
+//! peer to `exit(0)` before the pre-CEF reap, so a process still holding the
+//! CEF cache lock past that point is provably a wedged prior instance —
+//! `process_recovery::reap_stale_openhuman_processes` reaps it safely.
+//!
+//! macOS and Linux have **no equivalent pre-CEF single-instance guard**: the
+//! `#[cfg(windows)]` mutex does not exist there and `tauri_plugin_single_instance`
+//! registers later (inside the builder, after CEF init). So a process holding
+//! the CEF `SingletonLock` past the [`cef_preflight`] cache-wait budget may be a
+//! **healthy running primary** — a live app holds its lock for its whole
+//! lifetime — not a wedged one. Reaping it purely because "the lock was held N
+//! seconds" could SIGKILL a healthy instance and lose user data. That exact
+//! macOS SIGKILL escalation was reverted from PR #3793 (flagged by @oxoxDev +
+//! Codex P1), and this module must not reintroduce it.
+//!
+//! ## The staleness signal (why this is safe)
+//!
+//! Rather than a time budget, the reap is gated on a **post-update relaunch
+//! marker** — the "post-update marker" staleness signal named in #4395. The
+//! updater writes the marker ([`write_update_relaunch_marker`]) immediately
+//! before `app.restart()`; the freshly relaunched process consumes it here.
+//!
+//! Only when a *recent* marker is present do we treat a live CEF-lock holder as
+//! stale. That is the one situation where a survivor is provably the
+//! pre-update instance that should already have exited — never a healthy,
+//! separately-launched app (single-instance app; no marker on a normal launch).
+//! The marker is consumed (deleted) once per launch and bounded by
+//! [`MARKER_MAX_AGE`] so a leaked marker cannot make a much-later launch reap a
+//! healthy instance.
+//!
+//! Additional guards before any kill:
+//!   - **Self**: never reap our own pid.
+//!   - **Host**: the `SingletonLock` symlink target is `<hostname>-<pid>`; skip
+//!     if the holder's host differs from ours (networked/NFS home dir where the
+//!     pid belongs to another machine and could collide with an unrelated local
+//!     process).
+//!   - **Re-validate before SIGKILL**: after SIGTERM + grace, re-read the lock
+//!     and confirm the *same* pid still owns it and is still alive. This closes
+//!     the PID-reuse window and honors `kill_pid_force`'s contract.
+//!
+//! This is complementary to `process_recovery::reap_stale_openhuman_processes`
+//! (which deliberately *skips* when a live CEF-lock holder exists) and to the
+//! Windows-only reap hardening tracked separately in #3900 — no Windows code is
+//! touched here.
+//!
+//! Validation note: the reap glue is `#[cfg(any(macos, linux))]` and cannot be
+//! reproduced on an arbitrary dev host. The decision logic ([`decide_reap`],
+//! [`marker_is_fresh`], [`marker_path_for`]) is pure and unit-tested on any
+//! host; the filesystem/signal glue is exercised on real macOS/Linux hosts.
+
+use std::path::{Path, PathBuf};
+use std::time::Duration;
+
+/// Grace period between SIGTERM and the re-validated SIGKILL. Mirrors the
+/// `TERM_GRACE` used by `process_recovery`.
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+const TERM_GRACE: Duration = Duration::from_millis(500);
+
+/// A relaunch marker older than this is ignored (and cleaned up). Bounds the
+/// blast radius of a marker leaked by a process that died before consuming it,
+/// so it can never make an unrelated later launch reap a healthy instance.
+const MARKER_MAX_AGE: Duration = Duration::from_secs(300);
+
+/// File name of the post-update relaunch marker, written as a sibling of the
+/// CEF cache directory (outside the locked `cef/` dir).
+const MARKER_FILE_NAME: &str = "openhuman-update-relaunch.marker";
+
+/// What to do about the current CEF `SingletonLock` holder.
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum ReapDecision {
+    /// Leave the holder alone; the normal `cef_preflight` wait handles it.
+    Skip(&'static str),
+    /// Holder is provably a stale pre-update instance — reap this pid.
+    Reap { pid: i32 },
+}
+
+/// Pure reap decision, separated from the filesystem/signal glue so it is
+/// unit-testable on any host.
+///
+/// `holder` is the *live* `SingletonLock` holder as `(host, pid)`, or `None`
+/// when there is no lock / it is stale / the pid is dead. `local_host` is this
+/// machine's hostname (`None` if unresolved — then the host gate is skipped and
+/// the marker requirement alone protects us).
+fn decide_reap(
+    holder: Option<(String, i32)>,
+    self_pid: i32,
+    local_host: Option<&str>,
+    marker_fresh: bool,
+) -> ReapDecision {
+    let Some((host, pid)) = holder else {
+        return ReapDecision::Skip("no live CEF SingletonLock holder");
+    };
+    if pid == self_pid {
+        return ReapDecision::Skip("CEF lock held by self");
+    }
+    if let Some(local) = local_host {
+        if local != host {
+            return ReapDecision::Skip("CEF lock holder is on a different host");
+        }
+    }
+    if !marker_fresh {
+        // The decisive safety gate: without a recent post-update marker a live
+        // holder may be a healthy running instance. Defer to the preflight wait
+        // rather than risk killing it.
+        return ReapDecision::Skip("no recent post-update marker; deferring to preflight wait");
+    }
+    ReapDecision::Reap { pid }
+}
+
+/// Pure freshness predicate: a marker is fresh iff its age is known and within
+/// `max_age`. An unknown age (`None`) is treated as not fresh (fail-safe: do
+/// not reap on an unreadable timestamp).
+fn marker_is_fresh(age: Option<Duration>, max_age: Duration) -> bool {
+    matches!(age, Some(a) if a <= max_age)
+}
+
+/// Pure derivation of the marker path from the CEF cache directory. The marker
+/// is a sibling of the cache dir (e.g. `.../com.openhuman.app/cef` →
+/// `.../com.openhuman.app/openhuman-update-relaunch.marker`) so it lives
+/// outside the locked `cef/` directory.
+fn marker_path_for(cache_path: &Path) -> PathBuf {
+    cache_path.with_file_name(MARKER_FILE_NAME)
+}
+
+/// Resolve the CEF cache directory. In production `cef_profile::prepare_process_cache_path`
+/// always sets `OPENHUMAN_CEF_CACHE_PATH` before both the updater and this reap
+/// run, so env resolution is sufficient and keeps this cross-platform/testable.
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+fn cef_cache_path() -> Option<PathBuf> {
+    std::env::var_os("OPENHUMAN_CEF_CACHE_PATH").map(PathBuf::from)
+}
+
+/// Write the post-update relaunch marker. Called by the updater immediately
+/// before `app.restart()`; the freshly relaunched process consumes it in
+/// [`reap_stale_cef_lock_holder`]. Best-effort — a write failure only means the
+/// reap falls back to the (safe) preflight wait.
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+pub(crate) fn write_update_relaunch_marker() {
+    let Some(cache) = cef_cache_path() else {
+        log::warn!(
+            "[cef-stale-reap] OPENHUMAN_CEF_CACHE_PATH unset; skipping update-relaunch marker write"
+        );
+        return;
+    };
+    let marker = marker_path_for(&cache);
+    let body = format!("pid={}\n", std::process::id());
+    match std::fs::write(&marker, body) {
+        Ok(()) => log::info!(
+            "[cef-stale-reap] wrote update-relaunch marker at {} (pid={})",
+            marker.display(),
+            std::process::id()
+        ),
+        Err(e) => log::warn!(
+            "[cef-stale-reap] failed to write update-relaunch marker at {}: {e}",
+            marker.display()
+        ),
+    }
+}
+
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+mod imp {
+    use std::path::Path;
+
+    use super::{
+        cef_cache_path, decide_reap, marker_is_fresh, marker_path_for, ReapDecision, MARKER_MAX_AGE,
+        TERM_GRACE,
+    };
+    use crate::cef_preflight;
+    use crate::core_process;
+    use crate::process_kill::{kill_pid_force, kill_pid_term};
+
+    /// Pre-CEF reap of a wedged prior instance that still holds the CEF
+    /// `SingletonLock` after an update relaunch. Safe by construction: only
+    /// acts when a recent post-update marker is present (see module docs).
+    /// Best-effort throughout — any failure leaves the holder for the normal
+    /// `cef_preflight::wait_for_cache_release` path.
+    pub(crate) fn reap_stale_cef_lock_holder() {
+        if core_process::reuse_existing_listener_enabled() {
+            log::info!(
+                "[cef-stale-reap] OPENHUMAN_CORE_REUSE_EXISTING=1; skipping stale CEF-lock reap"
+            );
+            return;
+        }
+
+        let Some(cache) = cef_cache_path() else {
+            log::debug!("[cef-stale-reap] OPENHUMAN_CEF_CACHE_PATH unset; nothing to reap");
+            return;
+        };
+        let lock_path = cache.join("SingletonLock");
+        let marker_path = marker_path_for(&cache);
+
+        // Consume the marker exactly once per launch (delete regardless of the
+        // decision) so a normal launch can never inherit a leftover signal.
+        let marker_fresh = consume_marker(&marker_path);
+
+        let holder = live_lock_holder(&lock_path);
+        let self_pid = std::process::id() as i32;
+        let local_host = local_hostname();
+
+        match decide_reap(holder, self_pid, local_host.as_deref(), marker_fresh) {
+            ReapDecision::Skip(reason) => {
+                log::debug!("[cef-stale-reap] not reaping CEF-lock holder: {reason}");
+            }
+            ReapDecision::Reap { pid } => reap_pid(&lock_path, pid),
+        }
+    }
+
+    /// SIGTERM the stale pid, wait a grace period, then — only if the *same*
+    /// pid still owns the lock and is still alive — SIGKILL it. The re-check
+    /// closes the PID-reuse window before the force-kill.
+    fn reap_pid(lock_path: &Path, pid: i32) {
+        let pid_u32 = pid as u32;
+        log::warn!(
+            "[cef-stale-reap] recent post-update marker + live CEF-lock holder pid={pid}; \
+             treating as wedged pre-update instance, sending SIGTERM"
+        );
+        if let Err(e) = kill_pid_term(pid_u32) {
+            log::warn!("[cef-stale-reap] SIGTERM pid={pid} failed: {e}");
+        }
+
+        std::thread::sleep(TERM_GRACE);
+
+        if holder_still_owns_lock(lock_path, pid) {
+            log::warn!(
+                "[cef-stale-reap] pid={pid} still holds the CEF lock after SIGTERM+grace; SIGKILL"
+            );
+            if let Err(e) = kill_pid_force(pid_u32) {
+                log::warn!("[cef-stale-reap] SIGKILL pid={pid} failed: {e}");
+            }
+        } else {
+            // Either it exited on SIGTERM, or the lock now points elsewhere.
+            // Do NOT force-kill — the pid may have been reused. `cef_preflight`
+            // removes the now-dead lock on its next poll.
+            log::info!(
+                "[cef-stale-reap] pid={pid} released the CEF lock after SIGTERM (or ownership changed); no SIGKILL"
+            );
+        }
+    }
+
+    /// Read+parse the live `SingletonLock` holder as `(host, pid)`. Returns
+    /// `None` when the lock is absent, unparseable, or the pid is not alive
+    /// (a dead-pid lock is stale and handled by `cef_preflight`).
+    fn live_lock_holder(lock_path: &Path) -> Option<(String, i32)> {
+        let target = std::fs::read_link(lock_path).ok()?;
+        let (host, pid) = cef_preflight::parse_lock_target(&target.to_string_lossy())?;
+        cef_preflight::is_pid_alive(pid).then_some((host, pid))
+    }
+
+    /// True iff the lock still resolves to `expected_pid` and that pid is alive.
+    fn holder_still_owns_lock(lock_path: &Path, expected_pid: i32) -> bool {
+        let Ok(target) = std::fs::read_link(lock_path) else {
+            return false;
+        };
+        match cef_preflight::parse_lock_target(&target.to_string_lossy()) {
+            Some((_, pid)) => pid == expected_pid && cef_preflight::is_pid_alive(pid),
+            None => false,
+        }
+    }
+
+    /// This machine's hostname, matching the `<hostname>-<pid>` that Chromium
+    /// writes into `SingletonLock`. `None` if it cannot be resolved.
+    fn local_hostname() -> Option<String> {
+        nix::unistd::gethostname()
+            .ok()?
+            .into_string()
+            .ok()
+            .filter(|h| !h.is_empty())
+    }
+
+    /// Read the marker's freshness and delete it (one-shot). Returns `true`
+    /// only if the marker existed and is within [`MARKER_MAX_AGE`].
+    fn consume_marker(marker_path: &Path) -> bool {
+        let fresh = match std::fs::metadata(marker_path) {
+            Ok(meta) => {
+                let age = meta
+                    .modified()
+                    .ok()
+                    .and_then(|m| std::time::SystemTime::now().duration_since(m).ok());
+                let fresh = marker_is_fresh(age, MARKER_MAX_AGE);
+                log::info!(
+                    "[cef-stale-reap] update-relaunch marker present at {} (fresh={fresh})",
+                    marker_path.display()
+                );
+                fresh
+            }
+            Err(_) => false,
+        };
+        // Delete whether fresh or stale so it is never reused.
+        if let Err(e) = std::fs::remove_file(marker_path) {
+            if e.kind() != std::io::ErrorKind::NotFound {
+                log::warn!(
+                    "[cef-stale-reap] failed to remove update-relaunch marker {}: {e}",
+                    marker_path.display()
+                );
+            }
+        }
+        fresh
+    }
+
+    #[cfg(test)]
+    mod tests {
+        use super::*;
+        use std::os::unix::fs::symlink;
+        use std::path::PathBuf;
+
+        // Serialises tests that mutate the process-global OPENHUMAN_CEF_CACHE_PATH.
+        static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+        fn fresh_tmp(tag: &str) -> PathBuf {
+            let tmp = std::env::temp_dir().join(format!(
+                "oh-cef-stale-reap-{}-{}-{}",
+                tag,
+                std::process::id(),
+                std::time::SystemTime::now()
+                    .duration_since(std::time::UNIX_EPOCH)
+                    .map(|d| d.as_nanos())
+                    .unwrap_or(0)
+            ));
+            let _ = std::fs::remove_dir_all(&tmp);
+            std::fs::create_dir_all(&tmp).expect("create tmp dir");
+            tmp
+        }
+
+        #[test]
+        fn live_lock_holder_reads_self_pid() {
+            let tmp = fresh_tmp("live-holder");
+            let me = std::process::id() as i32;
+            symlink(format!("myhost-{me}"), tmp.join("SingletonLock")).unwrap();
+
+            let holder = live_lock_holder(&tmp.join("SingletonLock"));
+            assert_eq!(holder, Some(("myhost".to_string(), me)));
+            let _ = std::fs::remove_dir_all(&tmp);
+        }
+
+        #[test]
+        fn live_lock_holder_none_for_dead_pid() {
+            let tmp = fresh_tmp("dead-holder");
+            // ~i32::MAX-1: far beyond any plausible live pid.
+            symlink("deadhost-2147483646", tmp.join("SingletonLock")).unwrap();
+            assert_eq!(live_lock_holder(&tmp.join("SingletonLock")), None);
+            let _ = std::fs::remove_dir_all(&tmp);
+        }
+
+        #[test]
+        fn holder_still_owns_lock_matches_same_pid_only() {
+            let tmp = fresh_tmp("revalidate");
+            let me = std::process::id() as i32;
+            let lock = tmp.join("SingletonLock");
+            symlink(format!("myhost-{me}"), &lock).unwrap();
+
+            assert!(holder_still_owns_lock(&lock, me));
+            // A different (dead) pid must not re-validate.
+            assert!(!holder_still_owns_lock(&lock, 2147483646));
+            let _ = std::fs::remove_dir_all(&tmp);
+        }
+
+        #[test]
+        fn consume_marker_reports_fresh_and_deletes() {
+            let tmp = fresh_tmp("consume-fresh");
+            let cache = tmp.join("cef");
+            std::fs::create_dir_all(&cache).unwrap();
+            let marker = marker_path_for(&cache);
+            std::fs::write(&marker, "pid=1\n").unwrap();
+
+            assert!(consume_marker(&marker), "just-written marker must be fresh");
+            assert!(
+                std::fs::metadata(&marker).is_err(),
+                "marker must be deleted after consume"
+            );
+            // Second consume: absent → not fresh, no error.
+            assert!(!consume_marker(&marker));
+            let _ = std::fs::remove_dir_all(&tmp);
+        }
+
+        #[test]
+        fn write_then_consume_roundtrip_via_env() {
+            let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+            let prior = std::env::var_os("OPENHUMAN_CEF_CACHE_PATH");
+
+            let tmp = fresh_tmp("roundtrip");
+            let cache = tmp.join("cef");
+            std::fs::create_dir_all(&cache).unwrap();
+            std::env::set_var("OPENHUMAN_CEF_CACHE_PATH", &cache);
+
+            super::super::write_update_relaunch_marker();
+            let marker = marker_path_for(&cache);
+            assert!(
+                std::fs::metadata(&marker).is_ok(),
+                "marker must exist after write"
+            );
+            assert!(consume_marker(&marker), "freshly written marker is fresh");
+
+            match prior {
+                Some(v) => std::env::set_var("OPENHUMAN_CEF_CACHE_PATH", v),
+                None => std::env::remove_var("OPENHUMAN_CEF_CACHE_PATH"),
+            }
+            let _ = std::fs::remove_dir_all(&tmp);
+        }
+
+        #[test]
+        fn reap_orchestrator_no_lock_no_marker_is_safe_noop() {
+            // With no SingletonLock and no marker, the orchestrator must take
+            // the Skip path — no panic, no files created, nothing killed.
+            let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+            let prior = std::env::var_os("OPENHUMAN_CEF_CACHE_PATH");
+
+            let tmp = fresh_tmp("noop");
+            let cache = tmp.join("cef");
+            std::fs::create_dir_all(&cache).unwrap();
+            std::env::set_var("OPENHUMAN_CEF_CACHE_PATH", &cache);
+
+            reap_stale_cef_lock_holder();
+
+            assert!(
+                std::fs::metadata(marker_path_for(&cache)).is_err(),
+                "orchestrator must not create a marker"
+            );
+
+            match prior {
+                Some(v) => std::env::set_var("OPENHUMAN_CEF_CACHE_PATH", v),
+                None => std::env::remove_var("OPENHUMAN_CEF_CACHE_PATH"),
+            }
+            let _ = std::fs::remove_dir_all(&tmp);
+        }
+    }
+}
+
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+pub(crate) use imp::reap_stale_cef_lock_holder;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn skip_when_no_holder() {
+        assert_eq!(
+            decide_reap(None, 100, Some("host"), true),
+            ReapDecision::Skip("no live CEF SingletonLock holder")
+        );
+    }
+
+    #[test]
+    fn skip_when_holder_is_self() {
+        assert_eq!(
+            decide_reap(Some(("host".into(), 100)), 100, Some("host"), true),
+            ReapDecision::Skip("CEF lock held by self")
+        );
+    }
+
+    #[test]
+    fn skip_when_holder_on_other_host() {
+        assert_eq!(
+            decide_reap(Some(("otherbox".into(), 200)), 100, Some("host"), true),
+            ReapDecision::Skip("CEF lock holder is on a different host")
+        );
+    }
+
+    #[test]
+    fn skip_when_no_fresh_marker_even_if_live_holder() {
+        // The core safety property: a live holder on our host is NOT reaped
+        // without a recent post-update marker (it may be a healthy instance).
+        assert_eq!(
+            decide_reap(Some(("host".into(), 200)), 100, Some("host"), false),
+            ReapDecision::Skip("no recent post-update marker; deferring to preflight wait")
+        );
+    }
+
+    #[test]
+    fn reap_only_with_marker_matching_host_and_other_pid() {
+        assert_eq!(
+            decide_reap(Some(("host".into(), 200)), 100, Some("host"), true),
+            ReapDecision::Reap { pid: 200 }
+        );
+    }
+
+    #[test]
+    fn reap_when_local_host_unresolved_but_marker_fresh() {
+        // Unresolved local host skips only the host gate; the marker gate still
+        // governs, so a fresh marker still permits the reap.
+        assert_eq!(
+            decide_reap(Some(("host".into(), 200)), 100, None, true),
+            ReapDecision::Reap { pid: 200 }
+        );
+    }
+
+    #[test]
+    fn marker_is_fresh_bounds_on_age() {
+        assert!(marker_is_fresh(Some(Duration::from_secs(0)), MARKER_MAX_AGE));
+        assert!(marker_is_fresh(Some(MARKER_MAX_AGE), MARKER_MAX_AGE));
+        assert!(!marker_is_fresh(
+            Some(MARKER_MAX_AGE + Duration::from_secs(1)),
+            MARKER_MAX_AGE
+        ));
+        // Unknown age is fail-safe (not fresh → no reap).
+        assert!(!marker_is_fresh(None, MARKER_MAX_AGE));
+    }
+
+    #[test]
+    fn marker_path_is_sibling_of_cache_dir() {
+        assert_eq!(
+            marker_path_for(Path::new("/x/com.openhuman.app/cef")),
+            PathBuf::from("/x/com.openhuman.app/openhuman-update-relaunch.marker")
+        );
+    }
+}
diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs
index b7aac0440d..4aae5e7114 100644
--- a/app/src-tauri/src/lib.rs
+++ b/app/src-tauri/src/lib.rs
@@ -20,6 +20,13 @@ mod cef_profile;
 // logic is unit-tested on any host; the Win32 glue is windows-only.
 #[cfg(any(target_os = "windows", test))]
 mod cef_singleton_wait;
+// macOS/Linux pre-CEF reap of a wedged prior instance that still holds the CEF
+// SingletonLock after an update relaunch (issue #4395, follow-up to #3605).
+// Marker-gated so it never reaps a healthy running instance. Compiled under
+// `test` too so the pure decision logic is unit-tested on any host; the
+// filesystem/signal glue is macOS/Linux-only.
+#[cfg(any(target_os = "macos", target_os = "linux", test))]
+mod cef_stale_reap;
 mod claude_code;
 mod companion_commands;
 mod core_process;
@@ -601,6 +608,11 @@ async fn apply_app_update(
     log::info!("[app-update] install complete — relaunching");
     let _ = app.emit("app-update:status", "restarting");
 
+    // Drop a post-update relaunch marker so the freshly launched process can
+    // safely reap this instance if it wedges instead of exiting (issue #4395).
+    #[cfg(any(target_os = "macos", target_os = "linux"))]
+    cef_stale_reap::write_update_relaunch_marker();
+
     log::info!("[app-update] starting early teardown before restart");
     perform_early_teardown_async(&app).await;
 
@@ -814,6 +826,11 @@ async fn install_app_update(
     log::info!("[app-update] install complete — relaunching");
     let _ = app.emit("app-update:status", "restarting");
 
+    // Drop a post-update relaunch marker so the freshly launched process can
+    // safely reap this instance if it wedges instead of exiting (issue #4395).
+    #[cfg(any(target_os = "macos", target_os = "linux"))]
+    cef_stale_reap::write_update_relaunch_marker();
+
     log::info!("[app-update] starting early teardown before restart");
     perform_early_teardown_async(&app).await;
 
@@ -2711,6 +2728,19 @@ pub fn run() {
     // after the budget we exit cleanly (code 0). Stale locks (PID dead) are
     // removed so crashed processes don't block launches. macOS: issue #864.
     // Linux: OPENHUMAN-TAURI-K1. Sentry: TAURI-RUST-F.
+    //
+    // ── macOS/Linux pre-CEF stale-lock reap (issue #4395) ─────────────────
+    // Before the bounded wait above can only *time out* on a wedged prior
+    // instance (the #3605 symptom: the updated app silently refuses to start),
+    // proactively reap that instance — but ONLY when a recent post-update
+    // relaunch marker proves the CEF-lock holder is the pre-update process,
+    // never a healthy running one. This is the macOS/Linux analogue of the
+    // Windows pre-CEF reap, gated on a staleness signal because these targets
+    // have no early single-instance mutex. See `cef_stale_reap` for the safety
+    // rationale (and why the reverted #3793 SIGKILL is not reintroduced).
+    #[cfg(any(target_os = "macos", target_os = "linux"))]
+    cef_stale_reap::reap_stale_cef_lock_holder();
+
     #[cfg(any(target_os = "macos", target_os = "linux"))]
     cef_preflight::wait_for_cache_release();
 
diff --git a/app/src-tauri/tauri.conf.json b/app/src-tauri/tauri.conf.json
index 1eaad69899..e1a62c18e6 100644
--- a/app/src-tauri/tauri.conf.json
+++ b/app/src-tauri/tauri.conf.json
@@ -69,6 +69,11 @@
         "desktopTemplate": "main.desktop"
       }
     },
+    "windows": {
+      "nsis": {
+        "installerHooks": "nsis-hooks.nsh"
+      }
+    },
     "createUpdaterArtifacts": false,
     "macOS": {
       "minimumSystemVersion": "10.15",