diff --git a/cmd/bbox/main.go b/cmd/bbox/main.go index 58207fd..b523768 100644 --- a/cmd/bbox/main.go +++ b/cmd/bbox/main.go @@ -249,6 +249,11 @@ func run(parentCtx context.Context, agentName string, flags runFlags) error { infraws.CleanupStaleSnapshots(ws, logger) } + // Clean up stale VM log directories from previous crashes. + if home, homeErr := os.UserHomeDir(); homeErr == nil { + infravm.CleanupStaleLogs(filepath.Join(home, ".config", "broodbox", "vms"), logger) + } + // Build registry with config-based custom agents. registry := infraagent.NewRegistry() cfgLoader := infraconfig.NewLoader(flags.cfgPath) @@ -552,6 +557,11 @@ func openLogFile(override, vmName string) (string, *os.File, io.Closer, error) { if err := os.MkdirAll(logDir, 0o700); err != nil { return "", nil, nil, fmt.Errorf("creating log dir: %w", err) } + // Write PID sentinel to mark ownership so stale cleanup can + // identify directories from dead processes. + if err := infravm.WriteSentinel(logDir); err != nil { + return "", nil, nil, err + } logPath = filepath.Join(logDir, defaultLogFile) } diff --git a/go.mod b/go.mod index 2751a39..d506af0 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 github.com/sergi/go-diff v1.4.0 github.com/spf13/cobra v1.10.2 - github.com/stacklok/propolis v0.0.6 + github.com/stacklok/propolis v0.0.7-0.20260303111539-b54bd3284abf github.com/stacklok/toolhive v0.10.0 github.com/stacklok/toolhive-core v0.0.6 github.com/stretchr/testify v1.11.1 diff --git a/go.sum b/go.sum index 26e5cc4..4c5e8eb 100644 --- a/go.sum +++ b/go.sum @@ -628,8 +628,8 @@ github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= -github.com/stacklok/propolis v0.0.6 h1:Szfo4xeYX3pQpHzDOzSomo2zngIyFE3t4yRA4MqVPp0= -github.com/stacklok/propolis v0.0.6/go.mod h1:GK7TUXCm4J8Hh/QFoIGuXE4szSt8PWDZWzj1JPpqxVU= +github.com/stacklok/propolis v0.0.7-0.20260303111539-b54bd3284abf h1:HsQYq1H56kvEbgc+/4qwFpEr3HcyfMQ0b8dCcCHwesU= +github.com/stacklok/propolis v0.0.7-0.20260303111539-b54bd3284abf/go.mod h1:GK7TUXCm4J8Hh/QFoIGuXE4szSt8PWDZWzj1JPpqxVU= github.com/stacklok/toolhive v0.10.0 h1:yXTR2ZbD83tGjSjSS2ypg61dYWZ3AwKxCyq3cACELOc= github.com/stacklok/toolhive v0.10.0/go.mod h1:5suFGdrDM9j8Vh/ULROVO2qImpm+kMklXHcVBNFxL9Y= github.com/stacklok/toolhive-core v0.0.6 h1:JLJpL4qyGh3z/fZKk+NNavziNCdtJlHoqroqBdWH6x8= diff --git a/internal/infra/process/process.go b/internal/infra/process/process.go new file mode 100644 index 0000000..64b432c --- /dev/null +++ b/internal/infra/process/process.go @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +// Package process provides shared process-level utilities for infrastructure +// packages that need to check process liveness (e.g. stale cleanup). +package process + +import ( + "os" + "syscall" +) + +// IsAlive checks if a process with the given PID is still running. +// Uses signal 0 which checks for process existence without sending a signal. +func IsAlive(pid int) bool { + proc, err := os.FindProcess(pid) + if err != nil { + return false + } + // Signal 0 checks existence without actually sending a signal. + err = proc.Signal(syscall.Signal(0)) + return err == nil +} diff --git a/internal/infra/process/process_test.go b/internal/infra/process/process_test.go new file mode 100644 index 0000000..0bfa5ae --- /dev/null +++ b/internal/infra/process/process_test.go @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package process + +import ( + "os" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsAlive(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + pid int + alive bool + }{ + { + name: "current process is alive", + pid: os.Getpid(), + alive: true, + }, + { + name: "parent process is alive", + pid: os.Getppid(), + alive: true, + }, + { + name: "impossible PID is not alive", + pid: 2147483647, + alive: false, + }, + { + name: "negative PID is not alive", + pid: -1, + alive: false, + }, + { + name: "large negative PID is not alive", + pid: -99999, + alive: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.alive, IsAlive(tt.pid)) + }) + } +} diff --git a/internal/infra/vm/cleanup.go b/internal/infra/vm/cleanup.go new file mode 100644 index 0000000..db3fb21 --- /dev/null +++ b/internal/infra/vm/cleanup.go @@ -0,0 +1,81 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package vm + +import ( + "fmt" + "log/slog" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/stacklok/brood-box/internal/infra/process" +) + +// LogSentinel is the marker file placed inside per-VM log directories +// to identify ownership by a running bbox process. +const LogSentinel = ".bbox-sentinel" + +// CleanupStaleLogs removes orphaned per-VM log directories from previous +// crashes. It scans vmsDir for subdirectories with a sentinel file whose +// owning process has died. +func CleanupStaleLogs(vmsDir string, logger *slog.Logger) { + entries, err := os.ReadDir(vmsDir) + if err != nil { + // Directory may not exist yet on first run — not an error. + if os.IsNotExist(err) { + return + } + logger.Warn("failed to scan for stale VM log directories", "error", err) + return + } + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + dirPath := filepath.Join(vmsDir, entry.Name()) + + // Only remove directories that have our sentinel file to avoid + // deleting unrelated directories. + sentinelPath := filepath.Join(dirPath, LogSentinel) + data, err := os.ReadFile(sentinelPath) + if err != nil { + logger.Debug("skipping VM directory without sentinel", "path", dirPath) + continue + } + + // If the sentinel contains a PID, check if that process is still alive. + // Skip cleanup for directories owned by a running process. + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil || pid <= 0 { + logger.Debug("skipping VM directory with invalid sentinel", "path", dirPath) + continue + } + + if process.IsAlive(pid) { + logger.Debug("skipping VM log directory owned by running process", + "path", dirPath, "pid", pid) + continue + } + + logger.Warn("removing stale VM log directory", "path", dirPath) + if err := os.RemoveAll(dirPath); err != nil { + logger.Error("failed to remove stale VM log directory", "path", dirPath, "error", err) + } + } +} + +// WriteSentinel writes a PID sentinel file into the given directory to mark +// ownership by the current process. Returns an error if the write fails. +func WriteSentinel(dir string) error { + sentinelPath := filepath.Join(dir, LogSentinel) + content := fmt.Sprintf("%d", os.Getpid()) + if err := os.WriteFile(sentinelPath, []byte(content), 0o600); err != nil { + return fmt.Errorf("writing log sentinel: %w", err) + } + return nil +} diff --git a/internal/infra/vm/cleanup_test.go b/internal/infra/vm/cleanup_test.go new file mode 100644 index 0000000..775d68c --- /dev/null +++ b/internal/infra/vm/cleanup_test.go @@ -0,0 +1,215 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package vm + +import ( + "fmt" + "log/slog" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// testLogger returns a logger that discards output unless tests are run with -v. +func testLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError})) +} + +func TestCleanupStaleLogs(t *testing.T) { + t.Parallel() + + vmsDir := filepath.Join(t.TempDir(), "vms") + require.NoError(t, os.MkdirAll(vmsDir, 0o755)) + + // Stale directory with dead PID sentinel — should be removed. + staleDir := filepath.Join(vmsDir, "stale-vm") + require.NoError(t, os.MkdirAll(staleDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(staleDir, LogSentinel), []byte("2147483647"), 0o600)) + // Also create a log file to verify entire directory is removed. + require.NoError(t, os.WriteFile(filepath.Join(staleDir, "broodbox.log"), []byte("old log"), 0o600)) + + // Directory with live PID sentinel (our process) — should be preserved. + liveDir := filepath.Join(vmsDir, "live-vm") + require.NoError(t, os.MkdirAll(liveDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(liveDir, LogSentinel), []byte(fmt.Sprintf("%d", os.Getpid())), 0o600)) + + // Directory without sentinel — should be preserved. + noSentinelDir := filepath.Join(vmsDir, "no-sentinel-vm") + require.NoError(t, os.MkdirAll(noSentinelDir, 0o755)) + + // Regular file in vms/ (not a directory) — should be skipped. + require.NoError(t, os.WriteFile(filepath.Join(vmsDir, "stray-file"), []byte("x"), 0o600)) + + CleanupStaleLogs(vmsDir, testLogger()) + + _, err := os.Stat(staleDir) + assert.True(t, os.IsNotExist(err), "stale directory with dead PID should be removed") + + _, err = os.Stat(liveDir) + assert.NoError(t, err, "directory with live PID should remain") + + _, err = os.Stat(noSentinelDir) + assert.NoError(t, err, "directory without sentinel should remain") + + _, err = os.Stat(filepath.Join(vmsDir, "stray-file")) + assert.NoError(t, err, "non-directory entry should remain") +} + +func TestCleanupStaleLogs_InvalidSentinelContent(t *testing.T) { + t.Parallel() + + vmsDir := filepath.Join(t.TempDir(), "vms") + require.NoError(t, os.MkdirAll(vmsDir, 0o755)) + + tests := []struct { + name string + content string + }{ + {"empty sentinel", ""}, + {"non-numeric text", "not-a-pid"}, + {"negative PID", "-1"}, + {"zero PID", "0"}, + {"floating point", "123.456"}, + {"PID with trailing garbage", "123abc"}, + } + + for _, tt := range tests { + dir := filepath.Join(vmsDir, tt.name) + require.NoError(t, os.MkdirAll(dir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(dir, LogSentinel), []byte(tt.content), 0o600)) + } + + CleanupStaleLogs(vmsDir, testLogger()) + + // All directories with invalid sentinels should be preserved (not cleaned). + for _, tt := range tests { + dir := filepath.Join(vmsDir, tt.name) + _, err := os.Stat(dir) + assert.NoError(t, err, "directory with invalid sentinel %q should remain", tt.name) + } +} + +func TestCleanupStaleLogs_WhitespacePaddedSentinel(t *testing.T) { + t.Parallel() + + vmsDir := filepath.Join(t.TempDir(), "vms") + require.NoError(t, os.MkdirAll(vmsDir, 0o755)) + + dir := filepath.Join(vmsDir, "whitespace-vm") + require.NoError(t, os.MkdirAll(dir, 0o755)) + // PID 2147483647 with leading/trailing whitespace and newline. + require.NoError(t, os.WriteFile(filepath.Join(dir, LogSentinel), []byte(" 2147483647\n"), 0o600)) + + CleanupStaleLogs(vmsDir, testLogger()) + + _, err := os.Stat(dir) + assert.True(t, os.IsNotExist(err), "stale directory with whitespace-padded dead PID should be removed") +} + +func TestCleanupStaleLogs_MultipleStaleDirectories(t *testing.T) { + t.Parallel() + + vmsDir := filepath.Join(t.TempDir(), "vms") + require.NoError(t, os.MkdirAll(vmsDir, 0o755)) + + staleDirs := make([]string, 5) + for i := range staleDirs { + dir := filepath.Join(vmsDir, fmt.Sprintf("stale-%d", i)) + require.NoError(t, os.MkdirAll(dir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(dir, LogSentinel), []byte("2147483647"), 0o600)) + require.NoError(t, os.WriteFile(filepath.Join(dir, "broodbox.log"), []byte("log data"), 0o600)) + staleDirs[i] = dir + } + + CleanupStaleLogs(vmsDir, testLogger()) + + for _, dir := range staleDirs { + _, err := os.Stat(dir) + assert.True(t, os.IsNotExist(err), "stale directory %s should be removed", filepath.Base(dir)) + } +} + +func TestCleanupStaleLogs_NestedDataSubdirectory(t *testing.T) { + t.Parallel() + + vmsDir := filepath.Join(t.TempDir(), "vms") + dir := filepath.Join(vmsDir, "nested-vm") + dataDir := filepath.Join(dir, "data") + require.NoError(t, os.MkdirAll(dataDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(dir, LogSentinel), []byte("2147483647"), 0o600)) + require.NoError(t, os.WriteFile(filepath.Join(dir, "broodbox.log"), []byte("log"), 0o600)) + require.NoError(t, os.WriteFile(filepath.Join(dataDir, "state.json"), []byte("{}"), 0o600)) + + CleanupStaleLogs(vmsDir, testLogger()) + + _, err := os.Stat(dir) + assert.True(t, os.IsNotExist(err), "entire directory tree should be removed") +} + +func TestCleanupStaleLogs_EmptyVmsDir(t *testing.T) { + t.Parallel() + + vmsDir := filepath.Join(t.TempDir(), "vms") + require.NoError(t, os.MkdirAll(vmsDir, 0o755)) + + // Should not panic or error on empty directory. + CleanupStaleLogs(vmsDir, testLogger()) +} + +func TestCleanupStaleLogs_NonexistentVmsDir(t *testing.T) { + t.Parallel() + + // Should not panic when vms/ doesn't exist at all. + CleanupStaleLogs(filepath.Join(t.TempDir(), "nonexistent"), testLogger()) +} + +func TestWriteSentinel(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + require.NoError(t, WriteSentinel(dir)) + + data, err := os.ReadFile(filepath.Join(dir, LogSentinel)) + require.NoError(t, err) + assert.Equal(t, fmt.Sprintf("%d", os.Getpid()), string(data)) +} + +func TestWriteSentinel_FilePermissions(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + require.NoError(t, WriteSentinel(dir)) + + info, err := os.Stat(filepath.Join(dir, LogSentinel)) + require.NoError(t, err) + assert.Equal(t, os.FileMode(0o600), info.Mode().Perm(), + "sentinel should be owner-only read/write") +} + +func TestWriteSentinel_Overwrite(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + + // Write a sentinel with stale content. + require.NoError(t, os.WriteFile(filepath.Join(dir, LogSentinel), []byte("99999"), 0o600)) + + // Overwrite with current PID. + require.NoError(t, WriteSentinel(dir)) + + data, err := os.ReadFile(filepath.Join(dir, LogSentinel)) + require.NoError(t, err) + assert.Equal(t, fmt.Sprintf("%d", os.Getpid()), string(data), + "sentinel should contain current PID after overwrite") +} + +func TestWriteSentinel_NonexistentDirectory(t *testing.T) { + t.Parallel() + + err := WriteSentinel(filepath.Join(t.TempDir(), "nonexistent")) + assert.Error(t, err, "writing sentinel to nonexistent directory should fail") +} diff --git a/internal/infra/vm/mcpconfig.go b/internal/infra/vm/mcpconfig.go index d59fd71..bfbee8a 100644 --- a/internal/infra/vm/mcpconfig.go +++ b/internal/infra/vm/mcpconfig.go @@ -7,6 +7,7 @@ import ( "encoding/json" "errors" "fmt" + "log/slog" "os" "path/filepath" @@ -14,9 +15,22 @@ import ( ) // ChownFunc abstracts file ownership changes for testability. -// Production code passes os.Chown; tests can pass a recording mock. +// Production code uses bestEffortLchown; tests can pass a recording mock. type ChownFunc func(path string, uid, gid int) error +// bestEffortLchown attempts os.Lchown and silently ignores permission errors. +// On macOS non-root users cannot chown to a different UID; the guest init +// will fix ownership at boot time. Lchown is used instead of Chown to avoid +// following symlinks in the rootfs. +func bestEffortLchown(path string, uid, gid int) error { + if err := os.Lchown(path, uid, gid); err != nil { + if !os.IsPermission(err) { + slog.Warn("lchown failed", "path", path, "uid", uid, "gid", gid, "err", err) + } + } + return nil +} + // sandboxHome is the home directory of the sandbox user inside the guest. // Config files are written here because /workspace is mounted via virtiofs // and would shadow anything written into the rootfs at that path. diff --git a/internal/infra/vm/runner.go b/internal/infra/vm/runner.go index 9d854db..8245d23 100644 --- a/internal/infra/vm/runner.go +++ b/internal/infra/vm/runner.go @@ -149,7 +149,7 @@ func (r *PropolisRunner) Start(ctx context.Context, cfg domvm.VMConfig) (domvm.V hooks.InjectFile("/etc/ssh/ssh_host_ecdsa_key", hostKeyPEM, 0o600), InjectInitBinary(), hooks.InjectEnvFile("/etc/sandbox-env", cfg.EnvVars), - InjectGitConfig(cfg.GitIdentity, cfg.HasGitToken, os.Chown), + InjectGitConfig(cfg.GitIdentity, cfg.HasGitToken, bestEffortLchown), ), propolis.WithInitOverride("/bbox-init"), propolis.WithPostBoot(func(ctx context.Context, _ *propolis.VM) error { @@ -198,7 +198,7 @@ func (r *PropolisRunner) Start(ctx context.Context, cfg domvm.VMConfig) (domvm.V // Add MCP config injection hook if host services and agent format are set. if len(cfg.HostServices) > 0 && cfg.MCPConfigFormat != "" && cfg.MCPConfigFormat != agent.MCPConfigFormatNone { opts = append(opts, propolis.WithRootFSHook( - InjectMCPConfig(cfg.MCPConfigFormat, topology.GatewayIP, cfg.HostServices[0].Port, os.Chown), + InjectMCPConfig(cfg.MCPConfigFormat, topology.GatewayIP, cfg.HostServices[0].Port, bestEffortLchown), )) } @@ -292,7 +292,7 @@ func (v *propolisVM) SSHHostKey() ssh.PublicKey { // vmDataDir returns a per-VM data directory under ~/.config/broodbox/vms//data. // This isolates state files and locks so multiple VMs can run in parallel. -// The parent directory is used by bbox for logs and should not be cleaned. +// The parent directory is used by bbox for logs and is cleaned by CleanupStaleLogs. func vmDataDir(name string) (string, error) { home, err := os.UserHomeDir() if err != nil { diff --git a/internal/infra/workspace/snapshot.go b/internal/infra/workspace/snapshot.go index 5708346..da4bf17 100644 --- a/internal/infra/workspace/snapshot.go +++ b/internal/infra/workspace/snapshot.go @@ -12,8 +12,8 @@ import ( "path/filepath" "strconv" "strings" - "syscall" + "github.com/stacklok/brood-box/internal/infra/process" "github.com/stacklok/brood-box/pkg/domain/snapshot" domws "github.com/stacklok/brood-box/pkg/domain/workspace" ) @@ -207,7 +207,7 @@ func CleanupStaleSnapshots(workspacePath string, logger *slog.Logger) { // If the sentinel contains a PID, check if that process is still alive. // Skip cleanup for snapshots owned by a running process. if pid, err := strconv.Atoi(strings.TrimSpace(string(data))); err == nil && pid > 0 { - if isProcessAlive(pid) { + if process.IsAlive(pid) { logger.Debug("skipping snapshot owned by running process", "path", stalePath, "pid", pid) continue @@ -220,15 +220,3 @@ func CleanupStaleSnapshots(workspacePath string, logger *slog.Logger) { } } } - -// isProcessAlive checks if a process with the given PID is still running. -// Uses signal 0 which checks for process existence without sending a signal. -func isProcessAlive(pid int) bool { - proc, err := os.FindProcess(pid) - if err != nil { - return false - } - // Signal 0 checks existence without actually sending a signal. - err = proc.Signal(syscall.Signal(0)) - return err == nil -}