Fastcode
diff --git a/‎src/threading/Reaction.hpp‎
Lines changed: 13 additions & 2 deletions b/‎src/threading/Reaction.hpp‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎src/threading/scheduler/Group.cpp‎
Lines changed: 6 additions & 4 deletions b/‎src/threading/scheduler/Group.cpp‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎src/threading/scheduler/Group.hpp‎
Lines changed: 11 additions & 5 deletions b/‎src/threading/scheduler/Group.hpp‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎src/threading/scheduler/Pool.cpp‎
Lines changed: 6 additions & 0 deletions b/‎src/threading/scheduler/Pool.cpp‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/threading/scheduler/Scheduler.cpp‎
Lines changed: 22 additions & 18 deletions b/‎src/threading/scheduler/Scheduler.cpp‎
Lines changed: 22 additions & 18 deletions
diff --git a/‎src/threading/scheduler/Scheduler.hpp‎
Lines changed: 9 additions & 5 deletions b/‎src/threading/scheduler/Scheduler.hpp‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎src/threading/scheduler/queue/MPSCQueue.hpp‎
Lines changed: 24 additions & 13 deletions b/‎src/threading/scheduler/queue/MPSCQueue.hpp‎
Lines changed: 24 additions & 13 deletions
diff --git a/‎src/threading/scheduler/queue/Semaphore.hpp‎
Lines changed: 3 additions & 2 deletions b/‎src/threading/scheduler/queue/Semaphore.hpp‎
Lines changed: 3 additions & 2 deletions
@@ -135,8 +135,19 @@ namespace threading {
         /// The callback generator function (creates databound callbacks)
         TaskGenerator generator;
 
-        /// Cached data for this reaction added by the scheduler
-        std::shared_ptr<void> scheduler_data;
+        /// Cached scheduler-private pointer for this reaction.
+        ///
+        /// The scheduler uses this as a fast-path cache for the resolved pool that this reaction's
+        /// tasks should run on. It is a raw, non-owning `void*` rather than `std::shared_ptr<void>`
+        /// to avoid the per-submit cost of `std::atomic_load`/`atomic_store` on a `shared_ptr`,
+        /// which on libstdc++ falls back to a small global pool of mutexes (selected by pointer
+        /// hash) and can become a contention point on hot submission paths.
+        ///
+        /// Ownership of whatever this points at lives entirely with the scheduler; scheduler-side
+        /// resources outlive the reactions that cache them because PowerPlant tears reactors down
+        /// before the scheduler. The first submit resolves the pool and stores it (racing first
+        /// submits store the same value); subsequent submits just load it.
+        std::atomic<void*> scheduler_data{nullptr};
         friend class scheduler::Scheduler;  /// Let the scheduler mess with reaction objects
     };
 
 
@@ -22,6 +22,8 @@
 #include "Group.hpp"
 
 #include <algorithm>
+#include <atomic>
+#include <cstddef>
 #include <functional>
 #include <memory>
 #include <mutex>
@@ -31,7 +33,9 @@
 #include "../../id.hpp"
 #include "../../util/GroupDescriptor.hpp"
 #include "../ReactionTask.hpp"
+#include "Lock.hpp"
 #include "Pool.hpp"
+#include "queue/Priority.hpp"
 
 namespace NUClear {
 namespace threading {
@@ -165,9 +169,7 @@ namespace threading {
             return nullptr;
         }
 
-        bool Group::try_submit(std::unique_ptr<ReactionTask>&& task,
-                               const std::shared_ptr<Pool>& pool,
-                               const bool& clear_idle) {
+        bool Group::try_submit(std::unique_ptr<ReactionTask>&& task, Pool* pool, const bool& clear_idle) {
             // Don't jump ahead of multi-group waiters; if any exist, queue ourselves.
             if (slow_pending.load(std::memory_order_acquire) == 0) {
                 int expected = tokens.load(std::memory_order_acquire);
@@ -240,7 +242,7 @@ namespace threading {
             WaitEntry entry;
             for (std::size_t bucket = 0; bucket < queue::PRIORITY_BUCKETS; ++bucket) {
                 if (wait_buckets[bucket].try_dequeue(entry)) {
-                    auto pool = entry.pool;
+                    Pool* pool = entry.pool;
                     pool->submit({std::move(entry.task), make_running_lock()}, entry.clear_idle, /*force=*/true);
                     pool->unregister_external_waiter();
                     return true;
 
@@ -57,7 +57,10 @@ namespace threading {
         private:
             struct WaitEntry {
                 std::unique_ptr<ReactionTask> task;
-                std::shared_ptr<Pool> pool;
+                /// Non-owning pointer; Pools live for the lifetime of the Scheduler and the
+                /// Scheduler tears down Groups before Pools, so it is always safe to dereference
+                /// while this WaitEntry is reachable.
+                Pool* pool{nullptr};
                 bool clear_idle{false};
             };
 
@@ -111,6 +114,11 @@ namespace threading {
                 RunningLock(Group& group, std::shared_ptr<Group> group_keepalive);
                 ~RunningLock() override;
 
+                RunningLock(const RunningLock&)            = delete;
+                RunningLock(RunningLock&&)                 = delete;
+                RunningLock& operator=(const RunningLock&) = delete;
+                RunningLock& operator=(RunningLock&&)      = delete;
+
                 bool lock() override;
 
             private:
@@ -196,14 +204,12 @@ namespace threading {
              * Otherwise the task is queued until a token is released.
              *
              * @param task       the reaction task to submit
-             * @param pool       the pool to submit to when runnable
+             * @param pool       the pool to submit to when runnable (non-owning; must outlive the call)
              * @param clear_idle if true, clear idle state on submission
              *
              * @return true if the task was submitted immediately
              */
-            bool try_submit(std::unique_ptr<ReactionTask>&& task,
-                            const std::shared_ptr<Pool>& pool,
-                            const bool& clear_idle);
+            bool try_submit(std::unique_ptr<ReactionTask>&& task, Pool* pool, const bool& clear_idle);
 
             /**
              * This function will create a new lock for the task and return it.
 
@@ -22,6 +22,8 @@
 #include "Pool.hpp"
 
 #include <algorithm>
+#include <atomic>
+#include <cstddef>
 #include <memory>
 #include <mutex>
 #include <set>
@@ -31,11 +33,15 @@
 
 #include "../../dsl/word/MainThread.hpp"
 #include "../../dsl/word/Pool.hpp"
+#include "../../id.hpp"
 #include "../../threading/Reaction.hpp"
 #include "../../util/Inline.hpp"
 #include "../ReactionTask.hpp"
 #include "CountingLock.hpp"
 #include "Scheduler.hpp"
+#include "queue/MPSCQueue.hpp"
+#include "queue/Priority.hpp"
+#include "queue/TaskQueue.hpp"
 
 namespace NUClear {
 namespace threading {
 
@@ -162,7 +162,7 @@ namespace threading {
         std::unique_ptr<Lock> Scheduler::get_groups_lock(
             const NUClear::id_t& task_id,
             const int& priority,
-            const std::shared_ptr<Pool>& pool,
+            Pool* pool,
             const std::set<std::shared_ptr<const util::GroupDescriptor>>& descs) {
 
             // No groups
@@ -188,28 +188,32 @@ namespace threading {
                 return;
             }
 
-            // If we have run this task before, we know which pool it should be submitted to and cached it
-            // on the parent reaction. This avoids every submit having to lock a mutex to find the pool.
+            // Resolve the Pool for this task.
             //
-            // The cache is read/written from any thread that submits a task for this reaction, so we use
-            // std::atomic_load/store on the shared_ptr to avoid a data race. The cache lookup is benign
-            // even under contention: the worst case is two submitters racing both compute the same pool
-            // pointer and store it; the resulting pool is identical so a "last writer wins" is fine.
-            std::shared_ptr<Pool> pool;
+            // The first submit for a reaction does a mutex-protected `get_pool()` lookup; the
+            // resulting pointer is then cached on the parent Reaction so subsequent submits skip
+            // the mutex entirely.
+            //
+            // The cache is a single `std::atomic<void*>` (see Reaction::scheduler_data). We
+            // deliberately avoid `std::atomic_load`/`atomic_store` on a `std::shared_ptr<void>`:
+            // on libstdc++ those fall back to a small global pool of mutexes (selected by
+            // pointer hash) and become a contention point on hot submission paths. Pools live
+            // for the lifetime of the Scheduler (and the Scheduler tears down reactions before
+            // its own pools), so a non-owning raw pointer is safe.
+            //
+            // The cache update is benign-racing: two submitters that miss simultaneously will
+            // both call `get_pool()` and store the same pointer; last writer wins, identical
+            // value.
+            Pool* pool = nullptr;
             if (task->parent) {
-                auto cached = std::atomic_load_explicit(&task->parent->scheduler_data, std::memory_order_acquire);
-                if (cached) {
-                    pool = std::static_pointer_cast<Pool>(cached);
-                }
-                else {
-                    pool = get_pool(task->pool_descriptor);
-                    std::atomic_store_explicit(&task->parent->scheduler_data,
-                                               std::static_pointer_cast<void>(pool),
-                                               std::memory_order_release);
+                pool = static_cast<Pool*>(task->parent->scheduler_data.load(std::memory_order_acquire));
+                if (pool == nullptr) {
+                    pool = get_pool(task->pool_descriptor).get();
+                    task->parent->scheduler_data.store(static_cast<void*>(pool), std::memory_order_release);
                 }
             }
             else {
-                pool = get_pool(task->pool_descriptor);
+                pool = get_pool(task->pool_descriptor).get();
             }
 
             const bool current_pool_idle = Pool::current() != nullptr && Pool::current()->is_idle();
 
@@ -127,7 +127,7 @@ namespace threading {
              */
             std::unique_ptr<Lock> get_groups_lock(const NUClear::id_t& task_id,
                                                   const int& priority,
-                                                  const std::shared_ptr<Pool>& pool,
+                                                  Pool* pool,
                                                   const std::set<std::shared_ptr<const util::GroupDescriptor>>& descs);
 
             /// The number of threads that will be in the default thread pool
@@ -136,10 +136,9 @@ namespace threading {
             /// If running is false this means the scheduler is shutting down and no new pools will be created
             std::atomic<bool> running{true};
 
-            /// A mutex for when we are modifying groups
-            std::mutex groups_mutex;
-            /// A map of group ids to the number of active tasks currently running in that group
-            std::map<std::shared_ptr<const util::GroupDescriptor>, std::shared_ptr<Group>> groups;
+            // NB: `pools` is declared before `groups` so that on Scheduler destruction the groups
+            // (which may hold non-owning Pool* in their waiter buckets) are destroyed first, then
+            // the pools. This keeps the raw pointers in WaitEntry safe-by-construction.
 
             /// A mutex for when we are modifying pools
             std::mutex pools_mutex;
@@ -149,6 +148,11 @@ namespace threading {
             /// once start is called future pools will be started immediately
             std::atomic<bool> started{false};
 
+            /// A mutex for when we are modifying groups
+            std::mutex groups_mutex;
+            /// A map from each group descriptor to the shared Group instance for that descriptor
+            std::map<std::shared_ptr<const util::GroupDescriptor>, std::shared_ptr<Group>> groups;
+
             /// A mutex to protect the idle tasks list
             std::mutex idle_mutex;
             /// A list of idle tasks to execute when all pools are idle
 
@@ -23,6 +23,7 @@
 #define NUCLEAR_THREADING_SCHEDULER_QUEUE_MPSC_QUEUE_HPP
 
 #include <algorithm>
+#include <array>
 #include <atomic>
 #include <cstddef>
 #include <new>
@@ -55,15 +56,17 @@ namespace threading {
                 static_assert(std::is_move_constructible<T>::value, "MPSCQueue requires move constructible T");
 
             private:
-                enum { BLOCK_SIZE = 64 };
+                static constexpr std::size_t BLOCK_SIZE = 64;
 
                 struct Slot {
                     std::atomic<bool> committed{false};
-                    alignas(T) unsigned char storage[sizeof(T)];
+                    /// Raw aligned storage for the T payload. Left value-initialised (zeroed) so the
+                    /// constructor fully covers all members; placement-new overwrites it on enqueue.
+                    alignas(T) std::array<unsigned char, sizeof(T)> storage{};
                 };
 
                 struct Block {
-                    Slot slots[BLOCK_SIZE];
+                    std::array<Slot, BLOCK_SIZE> slots{};
                     /// Producer claim counter, fetched by every enqueuer (atomic, MP-safe).
                     std::atomic<std::size_t> write{0};
                     /// Consumer read counter, only touched by the single consumer (non-atomic).
@@ -73,10 +76,10 @@ namespace threading {
                 };
 
                 static T* slot_ptr(Slot& slot) {
-                    return reinterpret_cast<T*>(slot.storage);
+                    return reinterpret_cast<T*>(slot.storage.data());  // raw bytes of the slot viewed as T
                 }
 
-                Block* allocate_block() {
+                static Block* allocate_block() {  // static: touches no queue state
                     return new Block();
                 }
 
@@ -87,12 +90,15 @@ namespace threading {
                 // state the graveyard length is bounded by the peak number of in-flight blocks.
                 void retire_block(Block* block) {
                     Block* head_graveyard = graveyard.load(std::memory_order_acquire);
-                    do {
+                    while (true) {  // retry until the CAS installs `block` as the graveyard head
                         block->graveyard_next = head_graveyard;
-                    } while (!graveyard.compare_exchange_weak(head_graveyard,
-                                                              block,
-                                                              std::memory_order_release,
-                                                              std::memory_order_relaxed));
+                        if (graveyard.compare_exchange_weak(head_graveyard,
+                                                            block,
+                                                            std::memory_order_release,
+                                                            std::memory_order_relaxed)) {
+                            return;  // `block` is now linked in front of the previous head
+                        }  // on failure `head_graveyard` holds the current head; relink and retry
+                    }
                 }
 
                 bool link_next_block(Block* block) {
@@ -126,8 +132,8 @@ namespace threading {
 
             public:
                 MPSCQueue() {
-                    Block* initial = new Block();
-                    head_block     = initial;
+                    auto* initial = new Block();  // the first block starts out as both head and tail
+                    head_block    = initial;
                     tail_block.store(initial, std::memory_order_relaxed);
                     graveyard.store(nullptr, std::memory_order_relaxed);
                 }
@@ -153,14 +159,19 @@ namespace threading {
                     }
                 }
 
+                void enqueue(const T& item) {
+                    T copy(item);  // copy up front so the rvalue overload can take ownership of it
+                    enqueue(std::move(copy));
+                }
+
                 void enqueue(T&& item) override {
                     while (true) {
                         Block*            block = tail_block.load(std::memory_order_acquire);
                         const std::size_t index = block->write.fetch_add(1, std::memory_order_relaxed);
 
                         if (index < BLOCK_SIZE) {
                             Slot& slot = block->slots[index];
-                            new (slot.storage) T(std::move(item));
+                            new (slot.storage.data()) T(std::move(item));
                             slot.committed.store(true, std::memory_order_release);
                             return;
                         }
 
@@ -39,7 +39,8 @@ namespace threading {
              */
             class Semaphore {
             public:
-                Semaphore() = default;
+                Semaphore()  = default;
+                ~Semaphore() = default;
 
                 Semaphore(const Semaphore&)            = delete;
                 Semaphore& operator=(const Semaphore&) = delete;
@@ -49,7 +50,7 @@ namespace threading {
                 void signal(int n = 1) {
                     const int previous = count.fetch_add(n, std::memory_order_release);
                     if (previous < 0) {
-                        std::lock_guard<std::mutex> lock(mutex);
+                        const std::lock_guard<std::mutex> lock(mutex);
                         const int waiters = std::min(n, -previous);
                         for (int i = 0; i < waiters; ++i) {
                             cv.notify_one();