diff --git a/CHANGELOG.md b/CHANGELOG.md index 444137e..ac89a8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,64 @@ > **Note:** Versions 0.3.26 – 0.3.55 were released as git tags without changelog entries. Changelog resumes at 0.3.56 below. +## 0.5.2 + +### Fixed + +- **Same-host Bonjour peers permanently rejected each other.** When two + `@sym-bot/sym` (or sym-swift) processes ran on the same host and + Bonjour-discovered each other, neither could maintain a peer relationship. + The `inbound-connection` handler and `_createPeer` short-circuited the + moment a same-source transport key was present in `peer.transports`, + regardless of whether that prior was actually alive or what direction it + was in. Three real failure modes collapsed into the same bug: + + 1. **Stale prior** — the previous transport's `_closed` flag was set but + its close handler hadn't fired yet. Apple's Network framework doesn't + always deliver FIN promptly when a peer process exits abruptly, leaving + a dead entry in the transports map. Any reconnect attempt was + permanently rejected until the OS reaped the dead entry. + 2. **Same-direction duplicate** — listener fires `newConnectionHandler` + twice for the same advertised service (TCP retry, multipath race, + repeated Bonjour resolution). Silently replacing the established + healthy inbound with the duplicate tore down the wire pair on the + remote side and triggered peer-left storms. + 3. **Dual-dial collision** — both peers Bonjour-discovered each other + within ~50ms and both initiated outbound TCP. Each side held one + outbound + one inbound for the same nodeId. The unconditional reject + killed one side's view of the connection, leaving asymmetric peer + state. + + Observed in the field on macOS: a Mac Catalyst app (sym-swift) and a + Node CLI (`@sym-bot/sym`) on the same Mac would never maintain a peer + relationship — the Node side silently rejected the Catalyst side's + inbound dial via `transport.close(); return;`. Cross-host LAN peers + worked because the timing windows differ. + + Fix is two-part, applied in both `inbound-connection` handler and + `_createPeer`: + + 1. **Stale-aware dedup** — short-circuit only when the prior transport + is alive (`!_closed`). A stale `_closed=true` entry is treated as no + prior; the new connection replaces it. + 2. **Direction-aware dedup with deterministic tie-break** — for a live + prior: + - **Same-direction duplicate** (both inbound or both outbound) → + keep prior, drop new (no wire-pair teardown on the remote). + - **Dual-dial collision** (different directions) → nodeId-based + tie-break. The lower nodeId acts as client and keeps its outbound; + the higher keeps the matching inbound. Both peers independently + compute the same physical-socket winner without exchanging + coordination frames. + + Mirrors the equivalent fix shipped in `@sym-bot/sym-swift` v0.3.79 + + v0.3.80 so cross-runtime peers (sym-swift ↔ sym Node) now agree on the + same dedup convention. + + Affects all peers running on the same host with another sym instance, + and any deployment where Bonjour discovery races finish within ~50ms + of each other. + ## 0.5.1 ### Fixed diff --git a/lib/node.js b/lib/node.js index 029504b..fd5d604 100644 --- a/lib/node.js +++ b/lib/node.js @@ -497,19 +497,73 @@ class SymNode extends EventEmitter { // Wire discovery events to peer management this._discovery.on('peer-found', (address, port, peerId, peerName) => { - // Skip if already connecting or already have Bonjour transport + // Skip if already connecting (dns-sd resolves the same peer multiple times). if (this._pendingBonjour.has(peerId)) return; + // If we already have a LIVE bonjour transport, no need to dial again. + // A `_closed` transport is a stale entry whose close handler hasn't fired + // yet (Apple framework may not deliver FIN promptly when the peer process + // exits abruptly) — fall through and dial fresh. const existing = this._peers.get(peerId); - if (existing && existing.transports && existing.transports.has('bonjour')) return; + if (existing && existing.transports && existing.transports.has('bonjour')) { + const t = existing.transports.get('bonjour'); + if (t && !t._closed) return; + } this._pendingBonjour.add(peerId); this._connectToPeer(address, port, peerId, peerName); }); this._discovery.on('inbound-connection', (transport, peerId, peerName, handshakeMsg) => { - // Section 5.2 + 4.6: if peer exists via SAME transport type, reject. - // If different transport type, accept as secondary. + // Section 5.2 + 4.6 + dual-dial dedup. Three cases when an existing + // bonjour transport is registered for this peerId: + // + // (a) Stale: the prior transport's _closed flag is set, but its + // close handler hasn't run yet. Treat as no prior — accept the + // new inbound (it will replace via _createPeer). + // + // (b) Same-direction duplicate: prior is also inbound. Happens when + // the OS fires newConnectionHandler twice for the same advertised + // service (multipath / TCP retry). Keep the established prior, + // reject duplicate — silently replacing it would tear down the + // wire pair on the remote side and trigger peer-left storms. + // + // (c) Dual-dial collision: prior is OUR outbound, new is the peer's + // outbound arriving as our inbound. Both peers Bonjour-discovered + // each other within ~50ms and dialed. Tie-break deterministically + // by nodeId — the lower nodeId acts as client and keeps its + // outbound; the higher its inbound. Both peers compute the same + // physical-socket winner without exchanging coordination frames. const existingPeer = this._peers.get(peerId); - if (existingPeer && existingPeer.transports.has('bonjour')) { transport.close(); return; } + if (existingPeer && existingPeer.transports.has('bonjour')) { + const prev = existingPeer.transports.get('bonjour'); + if (prev && !prev._closed) { + // Determine prior direction. The peer's `isOutbound` field reflects + // the direction at first transport creation; for a single-transport + // bonjour peer (the common case) this matches the live transport. + const prevIsOutbound = !!existingPeer.isOutbound; + const newIsOutbound = false; // by definition — this is inbound + const isDualDial = prevIsOutbound !== newIsOutbound; + if (!isDualDial) { + // (b) same-direction duplicate — keep prior + transport.close(); + return; + } + // (c) dual-dial: lower nodeId is client, keeps outbound + const localIsClient = String(this._identity.nodeId) < String(peerId); + const keepOutbound = localIsClient; + const preferNew = (keepOutbound === newIsOutbound); + if (!preferNew) { + // we keep prior outbound, reject new inbound + transport.close(); + return; + } + // preferNew = true: replace prior outbound with new inbound. + // Close prior so its close handler removes the bonjour entry from + // existingPeer.transports; _createPeer below will then add the new + // transport without the "same source already open" guard rejecting it. + prev.close(); + } + // else: (a) stale prior — fall through to accept + } if (handshakeMsg.e2ePublicKey) { this._deriveAndStoreSecret(peerId, handshakeMsg.e2ePublicKey); } @@ -1110,13 +1164,35 @@ class SymNode extends EventEmitter { const existingPeer = this._peers.get(peerId); if (existingPeer) { - // Section 4.6: add as secondary transport, don't reject. - // But if same transport type already exists and is connected, skip — - // prevents relay reconnect loops replacing active connections. + // Section 4.6: add as secondary transport (different source — e.g. relay + // on top of bonjour). For the SAME source, three cases: + // + // (a) stale prior (_closed set) — replace with new + // (b) same-direction duplicate (prior and new share isOutbound) — + // keep prior alive, drop new + // (c) dual-dial — opposite directions. Tie-break by nodeId: lower + // nodeId acts as client and keeps outbound; higher keeps inbound const existingTransport = existingPeer.transports.get(source); if (existingTransport && !existingTransport._closed) { - transport.close(); - return existingPeer; + const prevIsOutbound = !!existingPeer.isOutbound; + const isDualDial = prevIsOutbound !== isOutbound; + if (!isDualDial) { + // (b) — keep prior, drop new + transport.close(); + return existingPeer; + } + // (c) dual-dial tie-break + const localIsClient = String(this._identity.nodeId) < String(peerId); + const keepOutbound = localIsClient; + const preferNew = (keepOutbound === isOutbound); + if (!preferNew) { + transport.close(); + return existingPeer; + } + // preferNew = true — replace prior. Close prior so its close handler + // removes the source entry, then fall through to set the new one. + existingTransport.close(); + existingPeer.isOutbound = isOutbound; } existingPeer.transports.set(source, transport); this._log(`Transport added for ${peerName}: ${source} (${existingPeer.transports.size} transports)`); diff --git a/package.json b/package.json index a2de59d..ed85ca6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@sym-bot/sym", - "version": "0.5.1", + "version": "0.5.2", "description": "Infrastructure and protocol for multi-agent collective intelligence", "main": "lib/node.js", "bin": {