Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmd/bbox/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,11 @@ func run(parentCtx context.Context, agentName string, flags runFlags) error {
infraws.CleanupStaleSnapshots(ws, logger)
}

// Clean up stale VM log directories from previous crashes.
if home, homeErr := os.UserHomeDir(); homeErr == nil {
infravm.CleanupStaleLogs(filepath.Join(home, ".config", "broodbox", "vms"), logger)
}

// Build registry with config-based custom agents.
registry := infraagent.NewRegistry()
cfgLoader := infraconfig.NewLoader(flags.cfgPath)
Expand Down Expand Up @@ -552,6 +557,11 @@ func openLogFile(override, vmName string) (string, *os.File, io.Closer, error) {
if err := os.MkdirAll(logDir, 0o700); err != nil {
return "", nil, nil, fmt.Errorf("creating log dir: %w", err)
}
// Write PID sentinel to mark ownership so stale cleanup can
// identify directories from dead processes.
if err := infravm.WriteSentinel(logDir); err != nil {
return "", nil, nil, err
}
logPath = filepath.Join(logDir, defaultLogFile)
}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06
github.com/sergi/go-diff v1.4.0
github.com/spf13/cobra v1.10.2
github.com/stacklok/propolis v0.0.6
github.com/stacklok/propolis v0.0.7-0.20260303111539-b54bd3284abf
github.com/stacklok/toolhive v0.10.0
github.com/stacklok/toolhive-core v0.0.6
github.com/stretchr/testify v1.11.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -628,8 +628,8 @@ github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU=
github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY=
github.com/stacklok/propolis v0.0.6 h1:Szfo4xeYX3pQpHzDOzSomo2zngIyFE3t4yRA4MqVPp0=
github.com/stacklok/propolis v0.0.6/go.mod h1:GK7TUXCm4J8Hh/QFoIGuXE4szSt8PWDZWzj1JPpqxVU=
github.com/stacklok/propolis v0.0.7-0.20260303111539-b54bd3284abf h1:HsQYq1H56kvEbgc+/4qwFpEr3HcyfMQ0b8dCcCHwesU=
github.com/stacklok/propolis v0.0.7-0.20260303111539-b54bd3284abf/go.mod h1:GK7TUXCm4J8Hh/QFoIGuXE4szSt8PWDZWzj1JPpqxVU=
github.com/stacklok/toolhive v0.10.0 h1:yXTR2ZbD83tGjSjSS2ypg61dYWZ3AwKxCyq3cACELOc=
github.com/stacklok/toolhive v0.10.0/go.mod h1:5suFGdrDM9j8Vh/ULROVO2qImpm+kMklXHcVBNFxL9Y=
github.com/stacklok/toolhive-core v0.0.6 h1:JLJpL4qyGh3z/fZKk+NNavziNCdtJlHoqroqBdWH6x8=
Expand Down
23 changes: 23 additions & 0 deletions internal/infra/process/process.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
// SPDX-License-Identifier: Apache-2.0

// Package process provides shared process-level utilities for infrastructure
// packages that need to check process liveness (e.g. stale cleanup).
package process

import (
"os"
"syscall"
)

// IsAlive reports whether a process with the given PID currently exists.
//
// It probes the process with signal 0, which makes the kernel perform its
// existence and permission checks without delivering a signal.
//
// Non-positive PIDs are rejected up front: on Unix, kill(2) interprets 0 and
// negative values as process-group or broadcast targets rather than a single
// process, so they can never identify one owner.
//
// A permission error (EPERM) is reported as alive. The kernel only returns
// it when the target exists but belongs to a user we are not allowed to
// signal; treating that as "dead" would let stale cleanup remove data that
// still belongs to a running process.
func IsAlive(pid int) bool {
	if pid <= 0 {
		return false
	}

	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}

	// Signal 0 checks existence without actually sending a signal.
	err = proc.Signal(syscall.Signal(0))

	// os.Process.Signal hands back the raw errno for permission failures,
	// so a direct comparison is sufficient here.
	return err == nil || err == syscall.EPERM
}
54 changes: 54 additions & 0 deletions internal/infra/process/process_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
// SPDX-License-Identifier: Apache-2.0

package process

import (
"os"
"testing"

"github.com/stretchr/testify/assert"
)

// TestIsAlive runs IsAlive against PIDs that are known to exist (this
// process and its parent) and against PIDs that are expected not to refer
// to any running process, comparing each result with the expected verdict.
func TestIsAlive(t *testing.T) {
	t.Parallel()

	type scenario struct {
		label string
		pid   int
		want  bool
	}

	scenarios := []scenario{
		{label: "current process is alive", pid: os.Getpid(), want: true},
		{label: "parent process is alive", pid: os.Getppid(), want: true},
		{label: "impossible PID is not alive", pid: 2147483647, want: false},
		{label: "negative PID is not alive", pid: -1, want: false},
		{label: "large negative PID is not alive", pid: -99999, want: false},
	}

	for _, sc := range scenarios {
		t.Run(sc.label, func(t *testing.T) {
			t.Parallel()
			assert.Equal(t, sc.want, IsAlive(sc.pid))
		})
	}
}
81 changes: 81 additions & 0 deletions internal/infra/vm/cleanup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
// SPDX-License-Identifier: Apache-2.0

package vm

import (
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/stacklok/brood-box/internal/infra/process"
)

// LogSentinel is the name of the marker file placed inside per-VM log
// directories to identify ownership by a running bbox process.
// WriteSentinel stores the owning process's PID in it as decimal text, and
// CleanupStaleLogs only considers directories that contain this file.
const LogSentinel = ".bbox-sentinel"

// CleanupStaleLogs deletes per-VM log directories under vmsDir that were
// left behind by bbox processes which are no longer running.
//
// A subdirectory is removed only when it contains a LogSentinel file holding
// a positive decimal PID and process.IsAlive reports that PID as not alive.
// Everything else (plain files, directories without a readable sentinel,
// sentinels with unparsable or non-positive content, directories whose owner
// is still running) is left untouched.
//
// The function is best-effort: it returns nothing and reports problems
// through logger. A missing vmsDir is silently ignored.
func CleanupStaleLogs(vmsDir string, logger *slog.Logger) {
	children, scanErr := os.ReadDir(vmsDir)
	switch {
	case scanErr == nil:
		// Proceed with the scan below.
	case os.IsNotExist(scanErr):
		// Directory may not exist yet on first run — not an error.
		return
	default:
		logger.Warn("failed to scan for stale VM log directories", "error", scanErr)
		return
	}

	for _, child := range children {
		if !child.IsDir() {
			continue
		}

		vmDir := filepath.Join(vmsDir, child.Name())

		// The sentinel is what marks a directory as ours; without it the
		// directory is never touched.
		raw, sentinelErr := os.ReadFile(filepath.Join(vmDir, LogSentinel))
		if sentinelErr != nil {
			logger.Debug("skipping VM directory without sentinel", "path", vmDir)
			continue
		}

		// The sentinel content must be a positive decimal PID.
		ownerPID, parseErr := strconv.Atoi(strings.TrimSpace(string(raw)))
		if parseErr != nil || ownerPID <= 0 {
			logger.Debug("skipping VM directory with invalid sentinel", "path", vmDir)
			continue
		}

		// A live owner means the directory is still in use.
		if process.IsAlive(ownerPID) {
			logger.Debug("skipping VM log directory owned by running process",
				"path", vmDir, "pid", ownerPID)
			continue
		}

		logger.Warn("removing stale VM log directory", "path", vmDir)
		if rmErr := os.RemoveAll(vmDir); rmErr != nil {
			logger.Error("failed to remove stale VM log directory", "path", vmDir, "error", rmErr)
		}
	}
}

// WriteSentinel marks dir as owned by the current process by writing the
// LogSentinel file inside it. The file holds this process's PID in decimal
// and is written with requested permissions 0o600. If the write fails, the
// underlying error is returned wrapped with context.
func WriteSentinel(dir string) error {
	target := filepath.Join(dir, LogSentinel)
	pid := strconv.Itoa(os.Getpid())

	if writeErr := os.WriteFile(target, []byte(pid), 0o600); writeErr != nil {
		return fmt.Errorf("writing log sentinel: %w", writeErr)
	}
	return nil
}
Loading