Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/content/architecture/docker-resources.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ toc: true
|------|-------------|------------------------|
| Network | `<realm>-<cluster>-net` | `sind-dev-net` |
| Controller | `<realm>-<cluster>-controller` | `sind-dev-controller` |
| Backup controller | `<realm>-<cluster>-controller-backup` | `sind-dev-controller-backup` |
| Submitter | `<realm>-<cluster>-submitter` | `sind-dev-submitter` |
| Worker | `<realm>-<cluster>-worker-<N>` | `sind-dev-worker-0` |
| Config volume | `<realm>-<cluster>-config` | `sind-dev-config` |
Expand Down
1 change: 1 addition & 0 deletions docs/content/configuration/cluster-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ See [Slurm Configuration]({{< relref "/architecture/slurm-config" >}}) for detai
- At least one `worker` node is required
- `count` is only valid for worker nodes
- `managed` is only valid for worker nodes
- `backupController` is only valid for controller nodes
- `count` must not be negative
- `capAdd`/`capDrop` values must be recognized Linux capability names (e.g. `SYS_ADMIN`, `NET_ADMIN`, `ALL`)
- `devices` paths must be absolute (start with `/`)
33 changes: 33 additions & 0 deletions docs/content/configuration/node-definitions.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ toc: true
| `tmpSize` | global + per-node | `"256m"` | tmpfs size for `/tmp` |
| `count` | worker only | `1` | Number of worker nodes |
| `managed` | worker only | `true` | Start slurmd and add to slurm.conf |
| `backupController` | controller only | `false` | Also create an idle `controller-backup` container (see below) |
| `capAdd` | global + per-node | none | Extra Linux capabilities (e.g. `SYS_ADMIN`) |
| `capDrop` | global + per-node | none | Dropped Linux capabilities |
| `devices` | global + per-node | none | Host devices to expose (e.g. `/dev/fuse`) |
Expand Down Expand Up @@ -80,6 +81,38 @@ Unmanaged workers can also be created dynamically:
sind create worker --count 2 --unmanaged
```

## Backup controller

Setting `backupController: true` on the controller node spec spawns a second
controller container named `controller-backup` alongside the primary
`controller`. The backup uses the same image, resources, volumes,
capabilities, devices, and security options as the primary — both containers
share the cluster's config, munge, and data volumes — but sind does **not**
start `slurmctld` on it. It comes up idle, ready for manual debug runs or
active/passive experiments.

```yaml
nodes:
  - role: controller
    backupController: true
  - role: worker
    count: 2
```

Typical uses:

- Running `slurmctld -Dvvvvvv` by hand against the same `/etc/slurm` layout
to investigate controller behavior.
- Trying out Slurm's active/passive `SlurmctldHost` failover configuration
without having to rebuild the cluster.

The backup controller is addressable via DNS at
`controller-backup.<cluster>.<realm>.sind` and can be entered with
`sind enter controller-backup`. Worker discovery, `slurm.conf` generation,
and the `sind create worker` / `sind delete worker` flows all continue to
reference the primary `controller` container — the backup is invisible to
them.

## Capabilities and devices

sind's default security posture avoids extra capabilities and device access. When specific use cases require them (e.g. testing CVMFS provisioning or FUSE-based filesystems), you can grant targeted privileges per node.
Expand Down
3 changes: 3 additions & 0 deletions pkg/cluster/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,9 @@ func enableSlurm(ctx context.Context, client *docker.Client, realm, clusterName
if nc.Role == config.RoleWorker && !nc.Managed {
continue
}
if nc.Role == config.RoleController && nc.ShortName == ControllerBackupShortName {
continue
}
service, ok := slurm.ServiceForRole(nc.Role)
if !ok {
continue
Expand Down
34 changes: 34 additions & 0 deletions pkg/cluster/create_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,40 @@ func TestLogExtraPrivileges(t *testing.T) {
})
}

// TestEnableSlurm_SkipsBackupController verifies that enableSlurm invokes
// "systemctl enable" on the primary controller only and never on the
// controller-backup container.
func TestEnableSlurm_SkipsBackupController(t *testing.T) {
	var m mock.Executor
	// Every docker call succeeds with an empty result. That covers both the
	// "systemctl enable --now slurmctld" exec and the "scontrol ping"
	// readiness probe, so no per-command branching (and no indexing into
	// args, which could panic on an empty call) is needed here.
	m.OnCall = func(_ []string, _ string) mock.Result {
		return mock.Result{}
	}

	client := docker.NewClient(&m)
	configs := []RunConfig{
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: ControllerBackupShortName, Role: config.RoleController},
	}

	err := enableSlurm(t.Context(), client, mesh.DefaultRealm, "dev", configs, 10*time.Millisecond, nil)
	require.NoError(t, err)

	// Only the primary controller should have had systemctl enable invoked on it.
	// The length check comes first so short calls are skipped, not indexed.
	var enableCalls []mock.Call
	for _, c := range m.Calls {
		if len(c.Args) > 3 && c.Args[0] == "exec" && c.Args[2] == "systemctl" && c.Args[3] == "enable" {
			enableCalls = append(enableCalls, c)
		}
	}
	require.Len(t, enableCalls, 1)
	assert.Equal(t, "sind-dev-controller", enableCalls[0].Args[1])
}

func TestEnableSlurm_ProbeTimeout(t *testing.T) {
var m mock.Executor
m.OnCall = func(args []string, _ string) mock.Result {
Expand Down
39 changes: 15 additions & 24 deletions pkg/cluster/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ import (
// DefaultDataMountPath is the default mount path for the shared data volume.
const DefaultDataMountPath = "/data"

// ControllerBackupShortName is the short name (RunConfig.ShortName) given to
// the optional backup controller container that is created when a controller
// node spec sets backupController: true. sind does not enable slurmctld on
// this container, so it comes up idle and can be used for manual debug runs
// or active/passive experiments.
const ControllerBackupShortName = "controller-backup"

// Label keys used on sind containers.
const (
LabelRealm = "sind.realm"
Expand Down Expand Up @@ -209,7 +215,7 @@ func NodeRunConfigs(cfg *config.Cluster, realm, dnsIP, slurmVersion string) []Ru
for _, n := range cfg.Nodes {
switch n.Role {
case config.RoleController, config.RoleSubmitter:
configs = append(configs, RunConfig{
base := RunConfig{
Realm: realm,
ClusterName: cfg.Name,
ShortName: string(n.Role),
Expand All @@ -228,7 +234,14 @@ func NodeRunConfigs(cfg *config.Cluster, realm, dnsIP, slurmVersion string) []Ru
CapDrop: n.CapDrop,
Devices: n.Devices,
SecurityOpt: n.SecurityOpt,
})
}
configs = append(configs, base)
if n.Role == config.RoleController && n.BackupController {
backup := base
backup.ShortName = ControllerBackupShortName
backup.ContainerNumber = 2
configs = append(configs, backup)
}
case config.RoleWorker:
count := n.Count
if count <= 0 {
Expand Down Expand Up @@ -275,25 +288,3 @@ func CreateClusterNodes(ctx context.Context, client *docker.Client, meshMgr *mes
}
return nil
}

// EnableSlurmServices enables the role-appropriate Slurm daemon on each node
// by running "systemctl enable --now <service>" inside the node's container.
//
// The service is selected with slurm.ServiceForRole; nodes whose role has no
// service are skipped. Unmanaged worker nodes are skipped as well, and so is
// the backup controller (short name ControllerBackupShortName), which must
// come up idle without slurmctld.
//
// Processing stops at the first failing node; the returned error wraps the
// underlying exec error and names the service and the node's short name.
func EnableSlurmServices(ctx context.Context, client *docker.Client, configs []RunConfig) error {
	for _, cfg := range configs {
		switch {
		case cfg.Role == config.RoleWorker && !cfg.Managed:
			// Unmanaged workers never get slurmd started.
			continue
		case cfg.Role == config.RoleController && cfg.ShortName == ControllerBackupShortName:
			// The backup controller stays idle: slurmctld is not enabled on it.
			continue
		}

		service, ok := slurm.ServiceForRole(cfg.Role)
		if !ok {
			continue
		}

		containerName := ContainerName(cfg.Realm, cfg.ClusterName, cfg.ShortName)
		if _, err := client.Exec(ctx, containerName, "systemctl", "enable", "--now", string(service)); err != nil {
			return fmt.Errorf("enabling %s on %s: %w", service, cfg.ShortName, err)
		}
	}
	return nil
}
129 changes: 53 additions & 76 deletions pkg/cluster/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,59 @@ func TestNodeRunConfigs_UnmanagedCompute(t *testing.T) {
assert.True(t, configs[3].Managed, "worker-2 managed")
}

// TestNodeRunConfigs_BackupController checks that a controller spec with
// BackupController set produces a primary and a backup RunConfig that differ
// only in ShortName and ContainerNumber, followed by the worker.
func TestNodeRunConfigs_BackupController(t *testing.T) {
	controllerSpec := config.Node{
		Role: config.RoleController, BackupController: true, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g",
		CapAdd: []string{"SYS_ADMIN"}, Devices: []string{"/dev/fuse"},
	}
	workerSpec := config.Node{Role: config.RoleWorker, Count: 1, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g"}
	cluster := &config.Cluster{
		Name:  "dev",
		Nodes: []config.Node{controllerSpec, workerSpec},
	}

	got := NodeRunConfigs(cluster, mesh.DefaultRealm, "172.18.0.2", "25.11.0")

	require.Len(t, got, 3)
	primary, backup, worker := got[0], got[1], got[2]

	// Identity of the two controller entries.
	assert.Equal(t, "controller", primary.ShortName)
	assert.Equal(t, 1, primary.ContainerNumber)
	assert.Equal(t, ControllerBackupShortName, backup.ShortName)
	assert.Equal(t, "controller-backup", backup.ShortName)
	assert.Equal(t, 2, backup.ContainerNumber)
	assert.Equal(t, config.RoleController, backup.Role)
	assert.False(t, primary.Managed, "Managed left zero on controller")
	assert.False(t, backup.Managed, "Managed left zero on controller-backup")

	// After overwriting the two fields that are allowed to differ, the backup
	// must be indistinguishable from the primary: same resources, volumes,
	// caps, devices, and security opts.
	aligned := backup
	aligned.ContainerNumber = primary.ContainerNumber
	aligned.ShortName = primary.ShortName
	assert.Equal(t, primary, aligned)

	// The worker comes after the backup controller and keeps its own indexing.
	assert.Equal(t, "worker-0", worker.ShortName)
}

// TestNodeRunConfigs_BackupControllerDisabled checks that no backup
// controller entry is produced when BackupController is left unset.
func TestNodeRunConfigs_BackupControllerDisabled(t *testing.T) {
	cluster := &config.Cluster{Name: "dev"}
	cluster.Nodes = []config.Node{
		{Role: config.RoleController, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g"},
		{Role: config.RoleWorker, Count: 1, Image: "img:1", CPUs: 2, Memory: "2g", TmpSize: "1g"},
	}

	got := NodeRunConfigs(cluster, mesh.DefaultRealm, "", "")

	// Exactly one controller and one worker, in that order.
	wantNames := []string{"controller", "worker-0"}
	require.Len(t, got, len(wantNames))
	for i, name := range wantNames {
		assert.Equal(t, name, got[i].ShortName)
	}
}

func TestNodeRunConfigs_HostPathStorage(t *testing.T) {
cfg := &config.Cluster{
Name: "dev",
Expand Down Expand Up @@ -623,82 +676,6 @@ func TestCreateClusterNodes_Empty(t *testing.T) {
assert.Empty(t, m.Calls)
}

// --- EnableSlurmServices ---

// TestEnableSlurmServices checks that the controller gets slurmctld and a
// managed worker gets slurmd, each through a single docker exec call.
func TestEnableSlurmServices(t *testing.T) {
	var executor mock.Executor
	executor.AddResult("", "", nil) // slurmctld on controller
	executor.AddResult("", "", nil) // slurmd on worker-0
	client := docker.NewClient(&executor)

	nodes := []RunConfig{
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "worker-0", Role: config.RoleWorker, Managed: true},
	}

	require.NoError(t, EnableSlurmServices(t.Context(), client, nodes))

	// One exec per node, in config order.
	wantArgs := [][]string{
		{"exec", "sind-dev-controller", "systemctl", "enable", "--now", "slurmctld"},
		{"exec", "sind-dev-worker-0", "systemctl", "enable", "--now", "slurmd"},
	}
	require.Len(t, executor.Calls, len(wantArgs))
	for i, want := range wantArgs {
		assert.Equal(t, want, executor.Calls[i].Args)
	}
}

// TestEnableSlurmServices_SkipsSubmitter checks that a submitter node
// triggers no docker call; only the controller does.
func TestEnableSlurmServices_SkipsSubmitter(t *testing.T) {
	var executor mock.Executor
	executor.AddResult("", "", nil) // slurmctld on controller
	client := docker.NewClient(&executor)

	controller := RunConfig{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController}
	submitter := RunConfig{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "submitter", Role: config.RoleSubmitter}

	err := EnableSlurmServices(t.Context(), client, []RunConfig{controller, submitter})

	require.NoError(t, err)
	assert.Len(t, executor.Calls, 1) // only controller
}

// TestEnableSlurmServices_SkipsUnmanaged checks that an unmanaged worker is
// passed over while the controller and a managed worker are still enabled.
func TestEnableSlurmServices_SkipsUnmanaged(t *testing.T) {
	var executor mock.Executor
	executor.AddResult("", "", nil) // slurmctld on controller
	executor.AddResult("", "", nil) // slurmd on worker-1
	client := docker.NewClient(&executor)

	nodes := []RunConfig{
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "worker-0", Role: config.RoleWorker, Managed: false},
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "worker-1", Role: config.RoleWorker, Managed: true},
	}

	require.NoError(t, EnableSlurmServices(t.Context(), client, nodes))

	// Two calls in total: controller first, then worker-1; worker-0 is skipped.
	require.Len(t, executor.Calls, 2)
	secondCall := executor.Calls[1]
	assert.Contains(t, secondCall.Args, "sind-dev-worker-1")
}

// TestEnableSlurmServices_Error checks that a failing exec is surfaced as an
// error that names the service and the node.
func TestEnableSlurmServices_Error(t *testing.T) {
	var executor mock.Executor
	executor.AddResult("", "", fmt.Errorf("systemctl failed"))
	client := docker.NewClient(&executor)

	controllerOnly := []RunConfig{
		{Realm: mesh.DefaultRealm, ClusterName: "dev", ShortName: "controller", Role: config.RoleController},
	}

	gotErr := EnableSlurmServices(t.Context(), client, controllerOnly)

	require.Error(t, gotErr)
	assert.Contains(t, gotErr.Error(), "enabling slurmctld on controller")
}

// --- Security fields in BuildRunArgs ---

func TestBuildRunArgs_CapAdd(t *testing.T) {
Expand Down
14 changes: 10 additions & 4 deletions pkg/cluster/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type NodeHealth struct {
// clusterName is used to select the cluster network IP.
func GetNodeHealth(ctx context.Context, client *docker.Client, containerName string, role config.Role, realm, clusterName string) (*NodeHealth, error) {
name := docker.ContainerName(containerName)
shortName := strings.TrimPrefix(containerName, ContainerPrefix(realm, clusterName))

info, err := client.InspectContainer(ctx, name)
if err != nil {
Expand All @@ -47,7 +48,7 @@ func GetNodeHealth(ctx context.Context, client *docker.Client, containerName str

// If container is not running, skip all service checks.
if info.Status != docker.StateRunning {
for _, svc := range roleServices(role) {
for _, svc := range roleServices(role, shortName) {
health.Services[svc] = false
}
return health, nil
Expand All @@ -56,7 +57,7 @@ func GetNodeHealth(ctx context.Context, client *docker.Client, containerName str
health.Munge = probe.MungeReady(ctx, client, name) == nil
health.SSHD = probe.SSHDReady(ctx, client, name) == nil

for _, svc := range roleServices(role) {
for _, svc := range roleServices(role, shortName) {
var check probe.Func
switch svc {
case "slurmctld":
Expand Down Expand Up @@ -257,8 +258,13 @@ func nodeStatusOrder(n *NodeStatus) string {
return roleSortKey(n.Role, n.Name)
}

// roleServices returns the Slurm service names for the given role.
func roleServices(role config.Role) []string {
// roleServices returns the Slurm service names for the given role. The
// backup controller (short name "controller-backup") has no managed Slurm
// service because sind does not start slurmctld on it.
func roleServices(role config.Role, shortName string) []string {
if role == config.RoleController && shortName == ControllerBackupShortName {
return nil
}
if svc, ok := slurm.ServiceForRole(role); ok {
return []string{string(svc)}
}
Expand Down
14 changes: 14 additions & 0 deletions pkg/cluster/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,20 @@ func TestGetNodeHealth_Compute(t *testing.T) {
assert.True(t, health.Services["slurmd"])
}

// TestGetNodeHealth_ControllerBackup checks that a running controller-backup
// container reports munge and sshd as healthy but lists no Slurm services.
func TestGetNodeHealth_ControllerBackup(t *testing.T) {
	const container = "sind-dev-controller-backup"

	var executor mock.Executor
	executor.OnCall = healthyOnCall(container, "172.18.0.5")
	client := docker.NewClient(&executor)

	got, err := GetNodeHealth(t.Context(), client, container, config.RoleController, mesh.DefaultRealm, "dev")
	require.NoError(t, err)

	assert.Equal(t, docker.StateRunning, got.Container)
	assert.True(t, got.Munge)
	assert.True(t, got.SSHD)
	// No slurmctld entry is expected for the backup controller.
	assert.Empty(t, got.Services)
}

func TestGetNodeHealth_Submitter(t *testing.T) {
var m mock.Executor
m.OnCall = healthyOnCall("sind-dev-submitter", "172.18.0.4")
Expand Down
Loading