GSI-HPC · dennisklein · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026
diff --git a/docs/content/architecture/docker-resources.md b/docs/content/architecture/docker-resources.md
@@ -12,6 +12,7 @@ toc: true
 |------|-------------|------------------------|
 | Network | `<realm>-<cluster>-net` | `sind-dev-net` |
 | Controller | `<realm>-<cluster>-controller` | `sind-dev-controller` |
+| Backup controller | `<realm>-<cluster>-controller-backup` | `sind-dev-controller-backup` |
 | Submitter | `<realm>-<cluster>-submitter` | `sind-dev-submitter` |
 | Worker | `<realm>-<cluster>-worker-<N>` | `sind-dev-worker-0` |
 | Config volume | `<realm>-<cluster>-config` | `sind-dev-config` |

diff --git a/docs/content/configuration/cluster-config.md b/docs/content/configuration/cluster-config.md
@@ -168,6 +168,7 @@ See [Slurm Configuration]({{< relref "/architecture/slurm-config" >}}) for detai
 - At least one `worker` node is required
 - `count` is only valid for worker nodes
 - `managed` is only valid for worker nodes
+- `backupController` is only valid for controller nodes
 - `count` must not be negative
 - `capAdd`/`capDrop` values must be recognized Linux capability names (e.g. `SYS_ADMIN`, `NET_ADMIN`, `ALL`)
 - `devices` paths must be absolute (start with `/`)
diff --git a/docs/content/configuration/node-definitions.md b/docs/content/configuration/node-definitions.md
@@ -24,6 +24,7 @@ toc: true
 | `tmpSize` | global + per-node | `"256m"` | tmpfs size for `/tmp` |
 | `count` | worker only | `1` | Number of worker nodes |
 | `managed` | worker only | `true` | Start slurmd and add to slurm.conf |
+| `backupController` | controller only | `false` | Also create an idle `controller-backup` container (see below) |
 | `capAdd` | global + per-node | none | Extra Linux capabilities (e.g. `SYS_ADMIN`) |
 | `capDrop` | global + per-node | none | Dropped Linux capabilities |
 | `devices` | global + per-node | none | Host devices to expose (e.g. `/dev/fuse`) |
@@ -80,6 +81,38 @@ Unmanaged workers can also be created dynamically:
 sind create worker --count 2 --unmanaged
 ```
 
+## Backup controller
+
+Setting `backupController: true` on the controller node spec spawns a second
+controller container named `controller-backup` alongside the primary
+`controller`. The backup uses the same image, resources, volumes,
+capabilities, devices, and security options as the primary — both containers
+share the cluster's config, munge, and data volumes — but sind does **not**
+start `slurmctld` on it. It comes up idle, ready for manual debug runs or
+active/passive experiments.
+
+```yaml
+nodes:
+  - role: controller
+    backupController: true
+  - role: worker
+    count: 2
+```
+
+Typical uses:
+
+- Running `slurmctld -Dvvvvvv` by hand against the same `/etc/slurm` layout
+  to investigate controller behavior.
+- Trying out Slurm's active/passive `SlurmctldHost` failover configuration
+  without having to rebuild the cluster.
+
+The backup controller is addressable via DNS at
+`controller-backup.<cluster>.<realm>.sind` and can be entered with
+`sind enter controller-backup`. Worker discovery, `slurm.conf` generation,
+and the `sind create worker` / `sind delete worker` flows all continue to
+reference the primary `controller` container — the backup is invisible to
+them.
+
 ## Capabilities and devices
 
 sind's default security posture avoids extra capabilities and device access. When specific use cases require them (e.g. testing CVMFS provisioning or FUSE-based filesystems), you can grant targeted privileges per node.

diff --git a/pkg/cluster/create.go b/pkg/cluster/create.go
@@ -371,6 +371,9 @@ func enableSlurm(ctx context.Context, client *docker.Client, realm, clusterName
 		if nc.Role == config.RoleWorker && !nc.Managed {
 			continue
 		}
+		if nc.Role == config.RoleController && nc.ShortName == ControllerBackupShortName {
+			continue
+		}
 		service, ok := slurm.ServiceForRole(nc.Role)
 		if !ok {
 			continue

diff --git a/pkg/cluster/create_test.go b/pkg/cluster/create_test.go
@@ -1047,6 +1047,40 @@ func TestLogExtraPrivileges(t *testing.T) {
 	})
 }
 
+// TestEnableSlurm_SkipsBackupController verifies that enableSlurm runs
+// "systemctl enable" on the primary controller only, never on the backup.
+func TestEnableSlurm_SkipsBackupController(t *testing.T) {
+	var m mock.Executor
+	// Every mocked docker call succeeds; that covers the
+	// "systemctl enable --now slurmctld" exec as well as the scontrol ping
+	// readiness probe. The assertions below inspect the recorded calls, so
+	// no per-command result (and no indexing into args) is needed here.
+	m.OnCall = func(_ []string, _ string) mock.Result {
+		return mock.Result{}
+	}
+
+	client := docker.NewClient(&m)
+	// Two controller-role configs that differ only in their short name.
+	configs := []RunConfig{
+		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
+		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: ControllerBackupShortName, Role: config.RoleController},
+	}
+
+	err := enableSlurm(t.Context(), client, mesh.DefaultRealm, "dev", configs, 10*time.Millisecond, nil)
+	require.NoError(t, err)
+
+	// Collect the recorded "systemctl enable" execs: exactly one is expected,
+	// and it must target the primary controller container.
+	var enableCalls []mock.Call
+	for _, c := range m.Calls {
+		if len(c.Args) > 3 && c.Args[0] == "exec" && c.Args[2] == "systemctl" && c.Args[3] == "enable" {
+			enableCalls = append(enableCalls, c)
+		}
+	}
+	require.Len(t, enableCalls, 1)
+	assert.Equal(t, "sind-dev-controller", enableCalls[0].Args[1])
+}
+
 func TestEnableSlurm_ProbeTimeout(t *testing.T) {
 	var m mock.Executor
 	m.OnCall = func(args []string, _ string) mock.Result {

diff --git a/pkg/cluster/node.go b/pkg/cluster/node.go
@@ -17,6 +17,12 @@ import (
 // DefaultDataMountPath is the default mount path for the shared data volume.
 const DefaultDataMountPath = "/data"
 
+// ControllerBackupShortName is the short name (RunConfig.ShortName) of the
+// optional backup controller container created when a controller node spec
+// sets backupController: true. slurmctld is not enabled on it, so the
+// container can be used for manual debug runs or active/passive experiments.
+const ControllerBackupShortName = "controller-backup"
+
 // Label keys used on sind containers.
 const (
 	LabelRealm        = "sind.realm"
@@ -209,7 +215,7 @@ func NodeRunConfigs(cfg *config.Cluster, realm, dnsIP, slurmVersion string) []Ru
 	for _, n := range cfg.Nodes {
 		switch n.Role {
 		case config.RoleController, config.RoleSubmitter:
-			configs = append(configs, RunConfig{
+			base := RunConfig{
 				Realm:           realm,
 				ClusterName:     cfg.Name,
 				ShortName:       string(n.Role),
@@ -228,7 +234,14 @@ func NodeRunConfigs(cfg *config.Cluster, realm, dnsIP, slurmVersion string) []Ru
 				CapDrop:         n.CapDrop,
 				Devices:         n.Devices,
 				SecurityOpt:     n.SecurityOpt,
-			})
+			}
+			configs = append(configs, base)
+			if n.Role == config.RoleController && n.BackupController {
+				backup := base
+				backup.ShortName = ControllerBackupShortName
+				backup.ContainerNumber = 2
+				configs = append(configs, backup)
+			}
 		case config.RoleWorker:
 			count := n.Count
 			if count <= 0 {
@@ -275,25 +288,3 @@ func CreateClusterNodes(ctx context.Context, client *docker.Client, meshMgr *mes
 	}
 	return nil
 }
-
-// EnableSlurmServices enables the role-appropriate Slurm daemon on each node.
-// Controller nodes get slurmctld; managed worker nodes get slurmd.
-// Submitter and unmanaged worker nodes are skipped.
-func EnableSlurmServices(ctx context.Context, client *docker.Client, configs []RunConfig) error {
-	for _, cfg := range configs {
-		if cfg.Role == config.RoleWorker && !cfg.Managed {
-			continue
-		}
-		service, ok := slurm.ServiceForRole(cfg.Role)
-		if !ok {
-			continue
-		}
-
-		containerName := ContainerName(cfg.Realm, cfg.ClusterName, cfg.ShortName)
-		_, err := client.Exec(ctx, containerName, "systemctl", "enable", "--now", string(service))
-		if err != nil {
-			return fmt.Errorf("enabling %s on %s: %w", service, cfg.ShortName, err)
-		}
-	}
-	return nil
-}
diff --git a/pkg/cluster/node_test.go b/pkg/cluster/node_test.go
@@ -494,6 +494,59 @@ func TestNodeRunConfigs_UnmanagedCompute(t *testing.T) {
 	assert.True(t, configs[3].Managed, "worker-2 managed")
 }
 
+// TestNodeRunConfigs_BackupController checks that backupController: true
+// yields a second controller config that mirrors the primary.
+func TestNodeRunConfigs_BackupController(t *testing.T) {
+	controller := config.Node{
+		Role: config.RoleController, BackupController: true,
+		Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g",
+		CapAdd: []string{"SYS_ADMIN"}, Devices: []string{"/dev/fuse"},
+	}
+	worker := config.Node{Role: config.RoleWorker, Count: 1, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g"}
+	cluster := &config.Cluster{Name: "dev", Nodes: []config.Node{controller, worker}}
+
+	got := NodeRunConfigs(cluster, mesh.DefaultRealm, "172.18.0.2", "25.11.0")
+	require.Len(t, got, 3)
+	primary, backup, firstWorker := got[0], got[1], got[2]
+
+	// The primary keeps the role name; the backup gets the dedicated short
+	// name and container number 2, but stays a controller-role node.
+	assert.Equal(t, "controller", primary.ShortName)
+	assert.Equal(t, 1, primary.ContainerNumber)
+	assert.Equal(t, ControllerBackupShortName, backup.ShortName)
+	assert.Equal(t, "controller-backup", backup.ShortName)
+	assert.Equal(t, 2, backup.ContainerNumber)
+	assert.Equal(t, config.RoleController, backup.Role)
+	assert.False(t, primary.Managed, "Managed left zero on controller")
+	assert.False(t, backup.Managed, "Managed left zero on controller-backup")
+
+	// Once ShortName and ContainerNumber are aligned, the backup must be
+	// indistinguishable from the primary: identical resources, volumes,
+	// caps, devices, and security opts.
+	aligned := backup
+	aligned.ShortName, aligned.ContainerNumber = primary.ShortName, primary.ContainerNumber
+	assert.Equal(t, primary, aligned)
+
+	// The worker comes after the backup controller with its usual index.
+	assert.Equal(t, "worker-0", firstWorker.ShortName)
+}
+
+// TestNodeRunConfigs_BackupControllerDisabled checks that no backup config
+// is emitted when backupController is left unset.
+func TestNodeRunConfigs_BackupControllerDisabled(t *testing.T) {
+	nodes := []config.Node{
+		{Role: config.RoleController, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g"},
+		{Role: config.RoleWorker, Count: 1, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g"},
+	}
+	cluster := &config.Cluster{Name: "dev", Nodes: nodes}
+
+	got := NodeRunConfigs(cluster, mesh.DefaultRealm, "", "")
+
+	require.Len(t, got, 2)
+	assert.Equal(t, "controller", got[0].ShortName)
+	assert.Equal(t, "worker-0", got[1].ShortName)
+}
+
 func TestNodeRunConfigs_HostPathStorage(t *testing.T) {
 	cfg := &config.Cluster{
 		Name: "dev",
@@ -623,82 +676,6 @@ func TestCreateClusterNodes_Empty(t *testing.T) {
 	assert.Empty(t, m.Calls)
 }
 
-// --- EnableSlurmServices ---
-
-func TestEnableSlurmServices(t *testing.T) {
-	var m mock.Executor
-	m.AddResult("", "", nil) // slurmctld on controller
-	m.AddResult("", "", nil) // slurmd on worker-0
-	c := docker.NewClient(&m)
-
-	configs := []RunConfig{
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "worker-0", Role: config.RoleWorker, Managed: true},
-	}
-
-	err := EnableSlurmServices(t.Context(), c, configs)
-
-	require.NoError(t, err)
-	require.Len(t, m.Calls, 2)
-	assert.Equal(t, []string{"exec", "sind-dev-controller", "systemctl", "enable", "--now", "slurmctld"},
-		m.Calls[0].Args)
-	assert.Equal(t, []string{"exec", "sind-dev-worker-0", "systemctl", "enable", "--now", "slurmd"},
-		m.Calls[1].Args)
-}
-
-func TestEnableSlurmServices_SkipsSubmitter(t *testing.T) {
-	var m mock.Executor
-	m.AddResult("", "", nil) // slurmctld on controller
-	c := docker.NewClient(&m)
-
-	configs := []RunConfig{
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "submitter", Role: config.RoleSubmitter},
-	}
-
-	err := EnableSlurmServices(t.Context(), c, configs)
-
-	require.NoError(t, err)
-	assert.Len(t, m.Calls, 1) // only controller
-}
-
-func TestEnableSlurmServices_SkipsUnmanaged(t *testing.T) {
-	var m mock.Executor
-	m.AddResult("", "", nil) // slurmctld on controller
-	c := docker.NewClient(&m)
-
-	configs := []RunConfig{
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "worker-0", Role: config.RoleWorker, Managed: false},
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "worker-1", Role: config.RoleWorker, Managed: true},
-	}
-
-	// Need result for worker-1 slurmd
-	m.AddResult("", "", nil)
-
-	err := EnableSlurmServices(t.Context(), c, configs)
-
-	require.NoError(t, err)
-	require.Len(t, m.Calls, 2)
-	// Controller + worker-1 only; worker-0 skipped
-	assert.Contains(t, m.Calls[1].Args, "sind-dev-worker-1")
-}
-
-func TestEnableSlurmServices_Error(t *testing.T) {
-	var m mock.Executor
-	m.AddResult("", "", fmt.Errorf("systemctl failed"))
-	c := docker.NewClient(&m)
-
-	configs := []RunConfig{
-		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
-	}
-
-	err := EnableSlurmServices(t.Context(), c, configs)
-
-	require.Error(t, err)
-	assert.Contains(t, err.Error(), "enabling slurmctld on controller")
-}
-
 // --- Security fields in BuildRunArgs ---
 
 func TestBuildRunArgs_CapAdd(t *testing.T) {

diff --git a/pkg/cluster/status.go b/pkg/cluster/status.go
@@ -33,6 +33,7 @@ type NodeHealth struct {
 // clusterName is used to select the cluster network IP.
 func GetNodeHealth(ctx context.Context, client *docker.Client, containerName string, role config.Role, realm, clusterName string) (*NodeHealth, error) {
 	name := docker.ContainerName(containerName)
+	shortName := strings.TrimPrefix(containerName, ContainerPrefix(realm, clusterName))
 
 	info, err := client.InspectContainer(ctx, name)
 	if err != nil {
@@ -47,7 +48,7 @@ func GetNodeHealth(ctx context.Context, client *docker.Client, containerName str
 
 	// If container is not running, skip all service checks.
 	if info.Status != docker.StateRunning {
-		for _, svc := range roleServices(role) {
+		for _, svc := range roleServices(role, shortName) {
 			health.Services[svc] = false
 		}
 		return health, nil
@@ -56,7 +57,7 @@ func GetNodeHealth(ctx context.Context, client *docker.Client, containerName str
 	health.Munge = probe.MungeReady(ctx, client, name) == nil
 	health.SSHD = probe.SSHDReady(ctx, client, name) == nil
 
-	for _, svc := range roleServices(role) {
+	for _, svc := range roleServices(role, shortName) {
 		var check probe.Func
 		switch svc {
 		case "slurmctld":
@@ -257,8 +258,13 @@ func nodeStatusOrder(n *NodeStatus) string {
 	return roleSortKey(n.Role, n.Name)
 }
 
-// roleServices returns the Slurm service names for the given role.
-func roleServices(role config.Role) []string {
+// roleServices returns the Slurm service names for the given role. The
+// backup controller (short name "controller-backup") has no managed Slurm
+// service because sind does not start slurmctld on it.
+func roleServices(role config.Role, shortName string) []string {
+	if role == config.RoleController && shortName == ControllerBackupShortName {
+		return nil
+	}
 	if svc, ok := slurm.ServiceForRole(role); ok {
 		return []string{string(svc)}
 	}

diff --git a/pkg/cluster/status_test.go b/pkg/cluster/status_test.go
@@ -81,6 +81,20 @@ func TestGetNodeHealth_Compute(t *testing.T) {
 	assert.True(t, health.Services["slurmd"])
 }
 
+// TestGetNodeHealth_ControllerBackup expects no Slurm service entries for
+// the backup controller, while munge and sshd are still reported.
+func TestGetNodeHealth_ControllerBackup(t *testing.T) {
+	const container = "sind-dev-controller-backup"
+	fake := mock.Executor{OnCall: healthyOnCall(container, "172.18.0.5")}
+	client := docker.NewClient(&fake)
+	got, err := GetNodeHealth(t.Context(), client, container, config.RoleController, mesh.DefaultRealm, "dev")
+	require.NoError(t, err)
+	assert.Equal(t, docker.StateRunning, got.Container)
+	assert.True(t, got.Munge)
+	assert.True(t, got.SSHD)
+	assert.Empty(t, got.Services)
+}
+
 func TestGetNodeHealth_Submitter(t *testing.T) {
 	var m mock.Executor
 	m.OnCall = healthyOnCall("sind-dev-submitter", "172.18.0.4")