diff --git a/disk.go b/disk.go index eccad53..3e20d05 100644 --- a/disk.go +++ b/disk.go @@ -2,6 +2,7 @@ package disko import ( "encoding/json" + "errors" "fmt" "sort" "strings" @@ -9,6 +10,14 @@ import ( "machinerun.io/disko/partid" ) +// ErrDiskTypeUndetermined is returned by RAIDController.GetDiskType when +// the controller layer cannot determine the disk type. Typical cases: +// the device is on the controller's sysfs tree but is not a configured +// virtual/logical drive and cannot be matched as a JBOD/passthrough +// disk, or controller queries were inconclusive (e.g. controller tool +// unavailable). Callers should fall back to generic udev-based detection. +var ErrDiskTypeUndetermined = errors.New("RAID controller could not determine disk type") + // DiskType enumerates supported disk types. type DiskType int @@ -24,19 +33,28 @@ const ( // TYPEFILE - A file on disk, not a block device. TYPEFILE + + // Unknown is an internal disko placeholder returned alongside a + // non-nil error by RAID-controller drivers when they cannot classify + // a device. It MUST NOT leak out of disko onto disko.Disk.Type: the + // linux system layer either consumes it via udev fallback or + // propagates the accompanying error. External callers should never + // observe this value. + Unknown ) func (t DiskType) String() string { - return []string{"HDD", "SSD", "NVME", "FILE"}[t] + return []string{"HDD", "SSD", "NVME", "FILE", "UNKNOWN"}[t] } // StringToDiskType - convert a string to a disk type. func StringToDiskType(typeStr string) DiskType { kmap := map[string]DiskType{ - "HDD": HDD, - "SSD": SSD, - "NVME": NVME, - "FILE": TYPEFILE, + "HDD": HDD, + "SSD": SSD, + "NVME": NVME, + "FILE": TYPEFILE, + "UNKNOWN": Unknown, } if dtype, ok := kmap[typeStr]; ok { return dtype @@ -485,6 +503,30 @@ type UdevInfo struct { Properties map[string]string `json:"properties"` } +// CollectSerials returns the set of udev serial-style tokens that +// identify this device. Used by RAID drivers (megaraid, smartpqi) to +// correlate a Linux device with a controller-reported physical drive +// serial number. +// +// Controllers typically expose the SCSI INQUIRY page-80 serial, which +// udev surfaces as ID_SCSI_SERIAL on SCSI-class devices and is the +// primary key. ID_SERIAL_SHORT and ID_SERIAL are WWN-derived on most +// SAS/SATA drives but cover drives that don't expose a distinct VPD +// page-80 serial. All non-empty values are returned so callers can match +// any of them against a controller record. +func (u UdevInfo) CollectSerials() map[string]struct{} { + out := map[string]struct{}{} + + for _, key := range []string{"ID_SCSI_SERIAL", "ID_SERIAL_SHORT", "ID_SERIAL"} { + v := strings.TrimSpace(u.Properties[key]) + if v != "" { + out[v] = struct{}{} + } + } + + return out +} + // PartitionSet is a map of partition number to the partition. type PartitionSet map[uint]Partition diff --git a/disk_test.go b/disk_test.go index 4308bc2..b8c6889 100644 --- a/disk_test.go +++ b/disk_test.go @@ -7,6 +7,8 @@ import ( "strings" "testing" + "github.com/stretchr/testify/assert" + "machinerun.io/disko" "machinerun.io/disko/partid" ) @@ -394,3 +396,59 @@ func TestMarshalProperties(t *testing.T) { } } } + +func TestUdevInfoCollectSerialsPrefersBoth(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{ + "ID_SERIAL_SHORT": "short1", + "ID_SERIAL": "long_short1", + "ID_MODEL": "ignored", + }, + } + + got := ud.CollectSerials() + assert.Contains(t, got, "short1", "missing ID_SERIAL_SHORT token") + assert.Contains(t, got, "long_short1", "missing ID_SERIAL token") + assert.NotContains(t, got, "ignored", + "CollectSerials must not include unrelated properties") +} + +func TestUdevInfoCollectSerialsIncludesSCSISerial(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{ + "ID_SCSI_SERIAL": "TESTSN0PQI01", + "ID_SERIAL_SHORT": "deadbeefcafef001", + "ID_SERIAL": "3deadbeefcafef001", + }, + } + + got := ud.CollectSerials() + for _, want := range []string{ + "TESTSN0PQI01", + "deadbeefcafef001", + "3deadbeefcafef001", + } { + assert.Contains(t, got, want, "CollectSerials: missing token %q", want) + } +} + +// Whitespace-only values must not be returned as serial tokens; otherwise +// a controller record with a blank SerialNumber could match every device. +func TestUdevInfoCollectSerialsTrimsWhitespace(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{ + "ID_SCSI_SERIAL": " ", + "ID_SERIAL_SHORT": "", + "ID_SERIAL": "real-serial", + }, + } + + got := ud.CollectSerials() + assert.Contains(t, got, "real-serial", "missing real serial token") + assert.Len(t, got, 1, "expected only the non-empty token") +} + +func TestUdevInfoCollectSerialsEmpty(t *testing.T) { + got := disko.UdevInfo{}.CollectSerials() + assert.Empty(t, got, "expected empty map for empty UdevInfo") +} diff --git a/linux/raidcontroller.go b/linux/raidcontroller.go index d5af3f0..675e6db 100644 --- a/linux/raidcontroller.go +++ b/linux/raidcontroller.go @@ -12,7 +12,6 @@ const ( type RAIDController interface { // Type() RAIDControllerType - GetDiskType(string) (disko.DiskType, error) - IsSysPathRAID(string) bool + GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) DriverSysfsPath() string } diff --git a/linux/sysfs/scsi.go b/linux/sysfs/scsi.go new file mode 100644 index 0000000..4307aa9 --- /dev/null +++ b/linux/sysfs/scsi.go @@ -0,0 +1,65 @@ +package sysfs + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" +) + +// ReadSCSITarget returns Target from /sys/block//device -> +// Host:Channel:Target:LUN (H:C:T:L). For SCSI-attached JBOD/passthrough disks +// behind a RAID HBA, Target matches the controller-reported drive ID +// (megaraid Drive.DID, mpi3mr PhysicalDrive.PID), which lets callers +// correlate a Linux block device with an entry in the controller's PD list. +// +// The "device" entry under a SCSI block device is a symlink into +// /sys/class/scsi_device/, e.g.: +// +// % ls -al /sys/block/sda/device +// lrwxrwxrwx 1 root root 0 Jan 31 16:11 /sys/block/sda/device -> ../../../2:0:0:0 +// +// Callers must only invoke ReadSCSITarget for devices that udev reports as +// SCSI (ID_SCSI=1); virtio-blk, NVMe, ATA/SATA, etc. do not expose this +// symlink and should be filtered upstream. An empty kname is a caller bug +// and is rejected with an error; ok=false with a nil error is reserved for +// the benign missing-"device"-symlink case; a malformed +// Host:Channel:Target:LUN link target is reported as an error. sysRoot is +// injectable for tests; production passes "/sys". +func ReadSCSITarget(sysRoot, kname string) (target int, ok bool, err error) { + if kname == "" { + return 0, false, fmt.Errorf("invalid empty kname parameter") + } + + link := filepath.Join(sysRoot, "block", kname, "device") + + dest, err := os.Readlink(link) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return 0, false, nil + } + return 0, false, fmt.Errorf("readlink %q: %w", link, err) + } + + // Parse out the SCSI device id from the symlink. e.g. + // ../../../0:3:110:0 -> 0:3:110:0 + // matching an entry under /sys/class/scsi_device/. + scsiDeviceID := filepath.Base(dest) + hctlFields := strings.Split(scsiDeviceID, ":") + + const requiredHCTLFields = 4 + if len(hctlFields) != requiredHCTLFields { + return 0, false, fmt.Errorf( + "invalid SCSI Host:Channel:Target:LUN value %q from %q: expected %d fields, got %d", + scsiDeviceID, link, requiredHCTLFields, len(hctlFields)) + } + + t, cerr := strconv.Atoi(hctlFields[2]) + if cerr != nil { + return 0, false, fmt.Errorf("parse SCSI target from %q: %w", scsiDeviceID, cerr) + } + + return t, true, nil +} diff --git a/linux/sysfs/scsi_test.go b/linux/sysfs/scsi_test.go new file mode 100644 index 0000000..483956d --- /dev/null +++ b/linux/sysfs/scsi_test.go @@ -0,0 +1,83 @@ +package sysfs + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// makeBlockDeviceSymlink wires up a fake sysfs entry at +// /block//device pointing at +// ../../scsi_device/. +func makeBlockDeviceSymlink(t *testing.T, root, kname, hctl string) { + t.Helper() + + blockDir := filepath.Join(root, "block", kname) + require.NoError(t, os.MkdirAll(blockDir, 0o755), "mkdir %q", blockDir) + + scsiDir := filepath.Join(root, "scsi_device", hctl) + require.NoError(t, os.MkdirAll(scsiDir, 0o755), "mkdir %q", scsiDir) + + link := filepath.Join(blockDir, "device") + target := filepath.Join("..", "..", "scsi_device", hctl) + require.NoError(t, os.Symlink(target, link), "symlink") +} + +func TestReadSCSITargetJBOD(t *testing.T) { + root := t.TempDir() + makeBlockDeviceSymlink(t, root, "sdb", "0:2:3:0") + + target, ok, err := ReadSCSITarget(root, "sdb") + require.NoError(t, err) + require.True(t, ok, "expected ok=true for a SCSI-backed block device") + assert.Equal(t, 3, target, "target") +} + +// An NVMe/virtio-style block device has no Host:Channel:Target:LUN +// "device" symlink. +// ReadSCSITarget must report ok=false (not an error) so the caller can +// fall through to generic udev detection. +func TestReadSCSITargetNoDevice(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "block", "nvme0n1"), 0o755)) + + _, ok, err := ReadSCSITarget(root, "nvme0n1") + require.NoError(t, err) + assert.False(t, ok, "expected ok=false when device symlink is absent") +} + +// A "device" symlink whose last segment isn't Host:Channel:Target:LUN +// (e.g. points at a PCI node) is malformed for a SCSI block device and +// should be reported as an error so the caller can log/diagnose. Callers +// are expected to filter non-SCSI devices upstream via udev (ID_SCSI=1). +func TestReadSCSITargetNonHCTL(t *testing.T) { + root := t.TempDir() + blockDir := filepath.Join(root, "block", "vda") + require.NoError(t, os.MkdirAll(blockDir, 0o755)) + other := filepath.Join(root, "devices", "virtio0") + require.NoError(t, os.MkdirAll(other, 0o755)) + require.NoError(t, os.Symlink(filepath.Join("..", "..", "devices", "virtio0"), + filepath.Join(blockDir, "device"))) + + _, ok, err := ReadSCSITarget(root, "vda") + require.Error(t, err, "expected error for malformed Host:Channel:Target:LUN link target") + assert.False(t, ok, "expected ok=false when link target is not Host:Channel:Target:LUN") +} + +func TestReadSCSITargetBadTarget(t *testing.T) { + root := t.TempDir() + makeBlockDeviceSymlink(t, root, "sdc", "0:2:notanum:0") + + _, ok, err := ReadSCSITarget(root, "sdc") + require.Error(t, err, "expected parse error") + assert.False(t, ok, "expected ok=false on parse error") +} + +func TestReadSCSITargetEmptyKname(t *testing.T) { + _, ok, err := ReadSCSITarget("/sys", "") + require.Error(t, err, "expected error for empty kname") + assert.False(t, ok, "expected ok=false for empty kname") +} diff --git a/linux/sysfs/sysfs.go b/linux/sysfs/sysfs.go new file mode 100644 index 0000000..dda23db --- /dev/null +++ b/linux/sysfs/sysfs.go @@ -0,0 +1,66 @@ +// Package sysfs contains Linux-specific helpers for inspecting the sysfs +// hierarchy. The helpers are driver-agnostic so they can be shared by the +// top-level linux package and by individual RAID driver packages without +// creating an import cycle back into linux. +package sysfs + +import ( + "fmt" + "path/filepath" + "strings" +) + +// IsSysPathRAID checks whether syspath (udevadm DEVPATH) belongs to a RAID +// controller whose PCI driver is registered at driverSysPath. +// +// syspath will look something like +// /devices/pci0000:3a/0000:3a:02.0/0000:3c:00.0/host0/target0:2:2/0:2:2:0/block/sdc +func IsSysPathRAID(syspath string, driverSysPath string) bool { + if !strings.HasPrefix(syspath, "/sys") { + syspath = "/sys" + syspath + } + + if !strings.Contains(syspath, "/host") { + return false + } + + fp, err := filepath.EvalSymlinks(syspath) + if err != nil { + fmt.Printf("seriously? %s\n", err) + return false + } + + for _, path := range GetSysPaths(driverSysPath) { + if strings.HasPrefix(fp, path) { + return true + } + } + + return false +} + +// GetSysPaths returns the resolved PCI device paths for a RAID driver. +func GetSysPaths(driverSysPath string) []string { + paths := []string{} + // a raid driver has directory entries for each of the scsi hosts on that controller. + // $cd /sys/bus/pci/drivers/ + // $ for d in *; do [ -d "$d" ] || continue; echo "$d -> $( cd "$d" && pwd -P )"; done + // 0000:3c:00.0 -> /sys/devices/pci0000:3a/0000:3a:02.0/0000:3c:00.0 + // module -> /sys/module/ + + // We take a hack path and consider anything with a ":" in that dir as a host path. + matches, err := filepath.Glob(driverSysPath + "/*:*") + + if err != nil { + fmt.Printf("errors: %s\n", err) + return paths + } + + for _, p := range matches { + if fp, err := filepath.EvalSymlinks(p); err == nil { + paths = append(paths, fp) + } + } + + return paths +} diff --git a/linux/sysfs/sysfs_test.go b/linux/sysfs/sysfs_test.go new file mode 100644 index 0000000..12bd789 --- /dev/null +++ b/linux/sysfs/sysfs_test.go @@ -0,0 +1,69 @@ +package sysfs + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// buildFakeDriverDir creates a tempdir layout mirroring /sys/bus/pci when a +// RAID PCI driver is loaded: +// +// /drivers// -> /devices/pci0000/ +// /drivers//module -> /module/ +// +// It returns the driver directory and the resolved device paths that +// GetSysPaths should return for it. +func buildFakeDriverDir(t *testing.T, driver string, bdfs []string) (string, []string) { + t.Helper() + root := t.TempDir() + + driverDir := filepath.Join(root, "drivers", driver) + require.NoError(t, os.MkdirAll(driverDir, 0o755)) + + // Real driver dirs also contain a "module" symlink; GetSysPaths must + // skip it because its name has no ':' (the "*:*" glob excludes it). + modulePath := filepath.Join(root, "module", driver) + require.NoError(t, os.MkdirAll(modulePath, 0o755)) + require.NoError(t, os.Symlink(modulePath, filepath.Join(driverDir, "module"))) + + resolved := make([]string, 0, len(bdfs)) + for _, bdf := range bdfs { + dev := filepath.Join(root, "devices", "pci0000", bdf) + require.NoError(t, os.MkdirAll(dev, 0o755)) + require.NoError(t, os.Symlink(dev, filepath.Join(driverDir, bdf))) + + resolvedDev, err := filepath.EvalSymlinks(dev) + require.NoError(t, err) + resolved = append(resolved, resolvedDev) + } + + return driverDir, resolved +} + +func TestGetSysPaths_EmptyDriverDir(t *testing.T) { + driverDir, _ := buildFakeDriverDir(t, "megaraid_sas", nil) + assert.Empty(t, GetSysPaths(driverDir)) +} + +func TestGetSysPaths_SingleBoundDevice(t *testing.T) { + driverDir, resolved := buildFakeDriverDir(t, "megaraid_sas", + []string{"0000:3c:00.0"}) + + assert.ElementsMatch(t, resolved, GetSysPaths(driverDir)) +} + +func TestGetSysPaths_MultipleBoundDevices(t *testing.T) { + bdfs := []string{"0000:3c:00.0", "0000:82:00.0"} + driverDir, resolved := buildFakeDriverDir(t, "mpi3mr", bdfs) + + assert.ElementsMatch(t, resolved, GetSysPaths(driverDir)) +} + +func TestIsSysPathRAID_NoHostSegment(t *testing.T) { + assert.False(t, IsSysPathRAID("/sys/devices/virtual/block/loop0", "")) + assert.False(t, IsSysPathRAID("/devices/virtual/block/loop0", "")) +} diff --git a/linux/system.go b/linux/system.go index b972b6e..bea385a 100644 --- a/linux/system.go +++ b/linux/system.go @@ -1,6 +1,7 @@ package linux import ( + "errors" "fmt" "log" "os" @@ -11,6 +12,7 @@ import ( "golang.org/x/sys/unix" "machinerun.io/disko" + "machinerun.io/disko/linux/sysfs" "machinerun.io/disko/megaraid" "machinerun.io/disko/mpi3mr" "machinerun.io/disko/smartpqi" @@ -18,6 +20,10 @@ import ( type linuxSystem struct { raidctrls []RAIDController + // isSysPathRAID resolves whether a udev DEVPATH belongs to the driver + // rooted at driverSysPath. It defaults to sysfs.IsSysPathRAID in + // production; tests inject a stub to avoid touching real /sys. + isSysPathRAID func(syspath, driverSysPath string) bool } // System returns an linux specific implementation of disko.System interface. @@ -28,6 +34,7 @@ func System() disko.System { smartpqi.ArcConf(), mpi3mr.StorCli2(), }, + isSysPathRAID: sysfs.IsSysPathRAID, } } @@ -161,28 +168,14 @@ func (ls *linuxSystem) ScanDisk(devicePath string) (disko.Disk, error) { attachType = getAttachType(udInfo) - for _, ctrl := range ls.raidctrls { - if IsSysPathRAID(udInfo.Properties["DEVPATH"], ctrl.DriverSysfsPath()) { - // we know this is device is part of a raid, so if we cannot get - // disk type we must return an error - dType, err := ctrl.GetDiskType(devicePath) - if err != nil { - return disko.Disk{}, fmt.Errorf("failed to get diskType of %q from RAID controller: %s", devicePath, err) - } - - attachType = disko.RAID - diskType = dType - - break - } + var onRAID bool + diskType, onRAID, err = ls.resolveDiskType(devicePath, udInfo) + if err != nil { + return disko.Disk{}, fmt.Errorf("error while getting disk type: %s", err) } - // check disk type if it wasn't on raid - if attachType != disko.RAID { - diskType, err = getDiskType(udInfo) - if err != nil { - return disko.Disk{}, fmt.Errorf("error while getting disk type: %s", err) - } + if onRAID { + attachType = disko.RAID } ro, err = getDiskReadOnly(name) @@ -297,16 +290,50 @@ func (ls *linuxSystem) Wipe(d disko.Disk) error { return udevSettle() } +// GetDiskType is the disko.System entry point for disk-type classification. +// resolveDiskType also reports whether a RAID controller owns the device, +// but external callers of GetDiskType only care about the type itself. func (ls *linuxSystem) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { + dType, _, err := ls.resolveDiskType(path, udInfo) + return dType, err +} + +// resolveDiskType classifies devicePath. Returns (type, true, nil) when a +// RAID controller owns the virtual/logical drive; otherwise (non-RAID or +// ErrDiskTypeUndetermined) falls back to getDiskType(udInfo) with bool=false. +func (ls *linuxSystem) resolveDiskType(devicePath string, udInfo disko.UdevInfo) (disko.DiskType, bool, error) { + devpath := udInfo.Properties["DEVPATH"] + for _, ctrl := range ls.raidctrls { - if IsSysPathRAID(udInfo.Properties["DEVPATH"], ctrl.DriverSysfsPath()) { - dType, err := ctrl.GetDiskType(path) - if err != nil { - return disko.HDD, fmt.Errorf("failed to get diskType of %q from RAID controller: %s", path, err) + // skip any device that isn't bound to the raid controller + if !ls.isSysPathRAID(devpath, ctrl.DriverSysfsPath()) { + continue + } + + dType, err := ctrl.GetDiskType(devicePath, udInfo) + if err != nil { + if errors.Is(err, disko.ErrDiskTypeUndetermined) { + log.Printf("RAID controller could not determine disk type for %q, using generic detection", devicePath) + break } - return dType, nil + return disko.Unknown, false, fmt.Errorf("failed to get diskType of %q from RAID controller: %s", devicePath, err) + } + + // RAID Controller should never return disko.Unknown with err == nil. + if dType == disko.Unknown { + return disko.Unknown, false, fmt.Errorf( + "RAID controller returned disko.Unknown with nil error for %q; driver contract violated", + devicePath) } + + return dType, true, nil } - return getDiskType(udInfo) + + dType, err := getDiskType(udInfo) + if err != nil { + return disko.Unknown, false, err + } + + return dType, false, nil } diff --git a/linux/system_test.go b/linux/system_test.go index 8750226..b7db442 100644 --- a/linux/system_test.go +++ b/linux/system_test.go @@ -1,6 +1,7 @@ package linux import ( + "errors" "fmt" "os" "path" @@ -235,3 +236,374 @@ func TestCreatePartitions(t *testing.T) { ast.Equal(disko.FILESYSTEM, scannedDisk.Attachment) ast.Equal(disko.TYPEFILE, scannedDisk.Type) } + +type mockRAIDController struct { + diskType disko.DiskType + err error + sysfsPath string + isSysPathRAID bool + getDiskTypeCalled bool + getDiskTypePath string + getDiskTypeUdev disko.UdevInfo +} + +func (m *mockRAIDController) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { + m.getDiskTypeCalled = true + m.getDiskTypePath = path + m.getDiskTypeUdev = udInfo + return m.diskType, m.err +} + +func (m *mockRAIDController) DriverSysfsPath() string { + return m.sysfsPath +} + +// newTestLinuxSystem builds a linuxSystem whose sysfs lookup is stubbed by +// consulting the isSysPathRAID field on the matching mock (matched by +// DriverSysfsPath), so tests don't need a real /sys mount. +func newTestLinuxSystem(ctrls ...*mockRAIDController) *linuxSystem { + rc := make([]RAIDController, 0, len(ctrls)) + for _, c := range ctrls { + rc = append(rc, c) + } + + return &linuxSystem{ + raidctrls: rc, + isSysPathRAID: func(_, driverSysPath string) bool { + for _, c := range ctrls { + if c.sysfsPath == driverSysPath { + return c.isSysPathRAID + } + } + return false + }, + } +} + +func TestGetDiskTypeRAIDMatchHDD(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.HDD, + err: nil, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + udInfo := disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}} + + dtype, err := ls.GetDiskType("/dev/sda", udInfo) + ast.NoError(err) + ast.Equal(disko.HDD, dtype) + ast.True(mock.getDiskTypeCalled) + ast.Equal("/dev/sda", mock.getDiskTypePath) +} + +func TestGetDiskTypeRAIDMatchSSD(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.SSD, + err: nil, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + udInfo := disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}} + + dtype, err := ls.GetDiskType("/dev/sda", udInfo) + ast.NoError(err) + ast.Equal(disko.SSD, dtype) + ast.True(mock.getDiskTypeCalled) +} + +func TestGetDiskTypeJBODFallback(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.Unknown, + err: disko.ErrDiskTypeUndetermined, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + udInfo := disko.UdevInfo{ + Name: "sda", + Properties: map[string]string{ + "DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda", + "ID_BUS": "scsi", + "DEVTYPE": "disk", + }, + } + + dtype, err := ls.GetDiskType("/dev/sda", udInfo) + + ast.True(mock.getDiskTypeCalled, "RAID controller should have been consulted") + ast.NoError(err, "ErrDiskTypeUndetermined should not propagate as a fatal error") + ast.Equal(disko.HDD, dtype, "should fall through to generic detection (HDD default)") +} + +func TestGetDiskTypeWrappedSentinelFallback(t *testing.T) { + ast := assert.New(t) + wrappedErr := fmt.Errorf("controller 0: %w", disko.ErrDiskTypeUndetermined) + + mock := &mockRAIDController{ + diskType: disko.Unknown, + err: wrappedErr, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + udInfo := disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}} + + _, err := ls.GetDiskType("/dev/sda", udInfo) + ast.NoError(err, "wrapped ErrDiskTypeUndetermined should still trigger fallback via errors.Is") +} + +func TestGetDiskTypeRAIDRealError(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.Unknown, + err: fmt.Errorf("storcli binary crashed"), + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + udInfo := disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}} + + _, err := ls.GetDiskType("/dev/sda", udInfo) + ast.Error(err) + ast.Contains(err.Error(), "failed to get diskType") + ast.Contains(err.Error(), "storcli binary crashed") + ast.False(errors.Is(err, disko.ErrDiskTypeUndetermined)) +} + +func TestGetDiskTypeNoRAIDMatch(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.HDD, + err: nil, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: false, + } + + ls := newTestLinuxSystem(mock) + udInfo := disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}} + + _, err := ls.GetDiskType("/dev/sda", udInfo) + ast.False(mock.getDiskTypeCalled, "should not call GetDiskType when sysfs path doesn't match") + _ = err +} + +func TestGetDiskTypeMultiControllerJBODFallback(t *testing.T) { + ast := assert.New(t) + + megaraidMock := &mockRAIDController{ + diskType: disko.Unknown, + err: disko.ErrDiskTypeUndetermined, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + smartpqiMock := &mockRAIDController{ + diskType: disko.HDD, + err: nil, + sysfsPath: "/sys/bus/pci/drivers/smartpqi", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(megaraidMock, smartpqiMock) + udInfo := disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}} + + _, err := ls.GetDiskType("/dev/sda", udInfo) + ast.NoError(err) + ast.True(megaraidMock.getDiskTypeCalled, "megaraid should have been tried") + ast.False(smartpqiMock.getDiskTypeCalled, "smartpqi should NOT be tried after break from megaraid ErrDiskTypeUndetermined") +} + +// udevInfoFallbackStub returns a UdevInfo that drives getDiskType's +// nvme-prefix short-circuit, so fallback tests get a deterministic +// (disko.NVME, nil) result without touching /dev or sysfs. The observed +// disk type is a fingerprint that the fallback ran; it is not itself under +// test. +func udevInfoFallbackStub(devpath string) disko.UdevInfo { + return disko.UdevInfo{ + Name: "nvme0n1", + Properties: map[string]string{ + "DEVPATH": devpath, + "DEVTYPE": "disk", + }, + } +} + +func TestResolveDiskType_RAIDMatchSSD(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.SSD, + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + + dType, onRAID, err := ls.resolveDiskType("/dev/sda", + disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}}) + ast.NoError(err) + ast.True(onRAID) + ast.Equal(disko.SSD, dType) + ast.True(mock.getDiskTypeCalled) + ast.Equal("/dev/sda", mock.getDiskTypePath) +} + +func TestResolveDiskType_RAIDMatchHDD(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.HDD, + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + + dType, onRAID, err := ls.resolveDiskType("/dev/sda", + disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}}) + ast.NoError(err) + ast.True(onRAID) + ast.Equal(disko.HDD, dType) +} + +// An ErrDiskTypeUndetermined from the controller must not propagate; resolveDiskType +// falls through to the udev classifier. +func TestResolveDiskType_JBODFallsThroughToUdev(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + err: disko.ErrDiskTypeUndetermined, + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + + dType, onRAID, err := ls.resolveDiskType("/dev/nvme0n1", + udevInfoFallbackStub("/devices/pci0000:00/0000:00:1c.4/0000:04:00.0/nvme/nvme0/nvme0n1")) + ast.NoError(err) + ast.False(onRAID) + ast.True(mock.getDiskTypeCalled) + ast.Equal(disko.NVME, dType) +} + +// Same contract as the previous test, but the sentinel is wrapped via +// fmt.Errorf("...: %w", ...); errors.Is must still trigger the fallback. +func TestResolveDiskType_WrappedJBODFallsThroughToUdev(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + err: fmt.Errorf("controller 0: %w", disko.ErrDiskTypeUndetermined), + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + + dType, onRAID, err := ls.resolveDiskType("/dev/nvme0n1", + udevInfoFallbackStub("/devices/pci0000:00/0000:00:1c.4/0000:04:00.0/nvme/nvme0/nvme0n1")) + ast.NoError(err) + ast.False(onRAID) + ast.Equal(disko.NVME, dType) +} + +// Non-sentinel controller errors must surface to the caller, not be swallowed +// into the udev fallback. +func TestResolveDiskType_RealErrorIsPropagated(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + err: fmt.Errorf("storcli binary crashed"), + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + + _, onRAID, err := ls.resolveDiskType("/dev/sda", + disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}}) + ast.Error(err) + ast.False(onRAID) + ast.Contains(err.Error(), "storcli binary crashed") +} + +// When no controller claims the devpath via IsSysPathRAID, resolveDiskType +// skips the RAID path entirely and classifies the disk via udev. +func TestResolveDiskType_NoRAIDMatchFallsThroughToUdev(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + isSysPathRAID: false, + } + + ls := newTestLinuxSystem(mock) + + dType, onRAID, err := ls.resolveDiskType("/dev/nvme0n1", + udevInfoFallbackStub("/devices/pci0000:00/0000:00:1c.4/0000:04:00.0/nvme/nvme0/nvme0n1")) + ast.NoError(err) + ast.False(onRAID) + ast.False(mock.getDiskTypeCalled) + ast.Equal(disko.NVME, dType) +} + +// Once a matching controller reports ErrDiskTypeUndetermined, iteration stops (a +// JBOD on one HBA cannot simultaneously be a VD on another) and we fall +// through to the udev classifier exactly once. +func TestResolveDiskType_MultiControllerJBODStopsIteration(t *testing.T) { + ast := assert.New(t) + + first := &mockRAIDController{ + err: disko.ErrDiskTypeUndetermined, + isSysPathRAID: true, + } + second := &mockRAIDController{ + diskType: disko.SSD, + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(first, second) + + dType, onRAID, err := ls.resolveDiskType("/dev/nvme0n1", + udevInfoFallbackStub("/devices/pci0000:00/0000:00:1c.4/0000:04:00.0/nvme/nvme0/nvme0n1")) + ast.NoError(err) + ast.False(onRAID) + ast.True(first.getDiskTypeCalled) + ast.False(second.getDiskTypeCalled, "second controller should not be consulted after JBOD break") + ast.Equal(disko.NVME, dType) +} + +// A system with no RAID controllers configured still classifies disks via +// udev and never reports onRAID=true. +func TestResolveDiskType_NoControllersConfiguredUsesUdev(t *testing.T) { + ast := assert.New(t) + + ls := newTestLinuxSystem() + + dType, onRAID, err := ls.resolveDiskType("/dev/nvme0n1", + udevInfoFallbackStub("/devices/pci0000:00/0000:00:1c.4/0000:04:00.0/nvme/nvme0/nvme0n1")) + ast.NoError(err) + ast.False(onRAID) + ast.Equal(disko.NVME, dType) +} + +// disko.Unknown is reserved for error paths; a driver that returns it +// with err == nil has violated the contract. resolveDiskType must catch +// this and surface a hard error rather than let Unknown reach callers. +func TestResolveDiskType_UnknownWithNilErrorIsRejected(t *testing.T) { + ast := assert.New(t) + mock := &mockRAIDController{ + diskType: disko.Unknown, + err: nil, + sysfsPath: "/sys/bus/pci/drivers/megaraid_sas", + isSysPathRAID: true, + } + + ls := newTestLinuxSystem(mock) + + dType, onRAID, err := ls.resolveDiskType("/dev/sda", + disko.UdevInfo{Properties: map[string]string{"DEVPATH": "/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/block/sda"}}) + ast.Error(err) + ast.Contains(err.Error(), "disko.Unknown") + ast.Contains(err.Error(), "contract violated") + ast.False(onRAID) + ast.Equal(disko.Unknown, dType) +} diff --git a/linux/util.go b/linux/util.go index 45c914d..a64c665 100644 --- a/linux/util.go +++ b/linux/util.go @@ -15,6 +15,7 @@ import ( "github.com/pkg/errors" "machinerun.io/disko" + "machinerun.io/disko/linux/sysfs" ) // GetUdevInfo return a UdevInfo for the device with kernel name kname. @@ -291,32 +292,14 @@ func Floor(val, unit uint64) uint64 { return (val / unit) * unit } -// IsSysPathRAID - is this sys path (udevadm info's DEVPATH) on a scsi controller. +// IsSysPathRAID checks whether syspath (udevadm DEVPATH) belongs to a RAID +// controller whose PCI driver is registered at driverSysPath. // -// syspath will look something like -// /devices/pci0000:3a/0000:3a:02.0/0000:3c:00.0/host0/target0:2:2/0:2:2:0/block/sdc +// Deprecated: use sysfs.IsSysPathRAID from machinerun.io/disko/linux/sysfs +// instead. This wrapper exists only for backwards compatibility with callers +// that import the top-level linux package. func IsSysPathRAID(syspath string, driverSysPath string) bool { - if !strings.HasPrefix(syspath, "/sys") { - syspath = "/sys" + syspath - } - - if !strings.Contains(syspath, "/host") { - return false - } - - fp, err := filepath.EvalSymlinks(syspath) - if err != nil { - fmt.Printf("seriously? %s\n", err) - return false - } - - for _, path := range GetSysPaths(driverSysPath) { - if strings.HasPrefix(fp, path) { - return true - } - } - - return false + return sysfs.IsSysPathRAID(syspath, driverSysPath) } // NameByDiskID - return the linux name (sda) for the disk with given DiskID @@ -340,29 +323,11 @@ func NameByDiskID(driverSysPath string, id int) (string, error) { return path.Base(matches[0]), nil } +// GetSysPaths returns the resolved PCI device paths for a RAID driver. +// +// Deprecated: use sysfs.GetSysPaths from machinerun.io/disko/linux/sysfs +// instead. This wrapper exists only for backwards compatibility with callers +// that import the top-level linux package. func GetSysPaths(driverSysPath string) []string { - paths := []string{} - // a raid driver has directory entries for each of the scsi hosts on that controller. - // $cd /sys/bus/pci/drivers/ - // $ for d in *; do [ -d "$d" ] || continue; echo "$d -> $( cd "$d" && pwd -P )"; done - // 0000:3c:00.0 -> /sys/devices/pci0000:3a/0000:3a:02.0/0000:3c:00.0 - // module -> /sys/module/ - - // We take a hack path and consider anything with a ":" in that dir as a host path. - matches, err := filepath.Glob(driverSysPath + "/*:*") - - if err != nil { - fmt.Printf("errors: %s\n", err) - return paths - } - - for _, p := range matches { - fp, err := filepath.EvalSymlinks(p) - - if err == nil { - paths = append(paths, fp) - } - } - - return paths + return sysfs.GetSysPaths(driverSysPath) } diff --git a/megaraid/megaraid.go b/megaraid/megaraid.go index e0d0d2f..bdf526b 100644 --- a/megaraid/megaraid.go +++ b/megaraid/megaraid.go @@ -85,14 +85,15 @@ func (dgs DriveGroupSet) MarshalJSON() ([]byte, error) { // Drive - a megaraid (physical) Drive. type Drive struct { - ID int - DriveGroup int - EID int - Slot int - State string - MediaType MediaType - Model string - Raw map[string]string + ID int + DriveGroup int + EID int + Slot int + State string + MediaType MediaType + Model string + SerialNumber string + Raw map[string]string } // DriveSet - just a map of Drives by ID @@ -118,6 +119,9 @@ const ( // SSD - Solid State Disk SSD + + // NVME - Non-Volatile Memory Express + NVME ) func (t MediaType) String() string { @@ -135,13 +139,10 @@ type MegaRaid interface { Query(int) (Controller, error) // GetDiskType - Determine the disk type if controller owns disk - GetDiskType(string) (disko.DiskType, error) + GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) // DriverSysfsPath - Return the sysfs path to the linux driver for this controller DriverSysfsPath() string - - // IsSysPathRAID - Check if sysfs path is a device on the controller - IsSysPathRAID(string) bool } // ErrNoController - Error reported by Query if no controller is found. diff --git a/megaraid/storcli.go b/megaraid/storcli.go index bc55232..8e87307 100644 --- a/megaraid/storcli.go +++ b/megaraid/storcli.go @@ -2,6 +2,7 @@ package megaraid import ( "bytes" + "errors" "fmt" "os/exec" "regexp" @@ -44,10 +45,9 @@ type scResultSection struct { } func (sc *storCli) Query(cID int) (Controller, error) { - // run /c0 show all - // - get PDs and VDs - // run /c0/vall show all - // - populate VD Properties and Path + // run /cN show - get PDs and VDs + // run /cN/vall show all - populate VD Properties and Path + // run /cN/eall/sall show all - populate Drive.SerialNumber (best-effort) var stdout, stderr []byte var rc int @@ -72,23 +72,27 @@ func (sc *storCli) Query(cID int) (Controller, error) { cxVxOut := string(stdout) - return newController(cID, cxDxOut, cxVxOut) + // Best-effort: an error here leaves Drive.SerialNumber empty, which + // disables JBOD matching but does not break VD classification. + args = []string{fmt.Sprintf("/c%d/eall/sall", cID), "show", "all", "nolog"} + stdout, _, rc = storcli(args...) + cxEallSallOut := "" + if rc == 0 { + cxEallSallOut = string(stdout) + } + + return newController(cID, cxDxOut, cxVxOut, cxEallSallOut) } func (sc *storCli) DriverSysfsPath() string { return SysfsPCIDriversPath } -func (sc *storCli) GetDiskType(path string) (disko.DiskType, error) { - return disko.HDD, fmt.Errorf("missing controller to run query") -} - -// not implemented in driver layer -func (sc *storCli) IsSysPathRAID(syspath string) bool { - return false +func (sc *storCli) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { + return disko.Unknown, fmt.Errorf("missing controller to run query") } -func newController(cID int, cxDxOut string, cxVxOut string) (Controller, error) { +func newController(cID int, cxDxOut, cxVxOut, cxEallSallOut string) (Controller, error) { const pathPropName = "OS Drive Name" ctrl := Controller{ @@ -116,6 +120,15 @@ func newController(cID int, cxDxOut string, cxVxOut string) (Controller, error) ctrl.VirtDrives[vID].Path = vProps[pathPropName] } + // Best-effort: ignore parse errors; drives just keep empty SNs. + if serials, sErr := parseDriveSerials(cxEallSallOut); sErr == nil { + for _, drive := range pds { + if sn, ok := serials[driveKey{EID: drive.EID, Slot: drive.Slot}]; ok { + drive.SerialNumber = sn + } + } + } + for diskID, drive := range pds { dgID := drive.DriveGroup if dgID < 0 { @@ -134,9 +147,6 @@ func newController(cID int, cxDxOut string, cxVxOut string) (Controller, error) } } - // fmt.Printf("ctrl: %#v\n", propMap) - // fmt.Printf("ctrl: %#v\n", ctrl) - return ctrl, nil } @@ -451,6 +461,71 @@ func parseVirtProperties(cmdOut string) (map[int](map[string]string), error) { return vdmap, nil } +// driveKey identifies a physical drive by (enclosure, slot) for +// cross-referencing parsed 'storcli /cN' and '/cN/eall/sall' outputs. +type driveKey struct { + EID int + Slot int +} + +// parseDriveSerials extracts (EID, Slot) -> SN from the output of +// 'storcli /cN/eall/sall show all'. Returns an empty map (with nil error) +// for empty input, so callers can feed in a best-effort string. +func parseDriveSerials(cmdOut string) (map[driveKey]string, error) { + out := map[driveKey]string{} + + if strings.TrimSpace(cmdOut) == "" { + return out, nil + } + + // Surface Success/Failure/Unsupported via the standard header block. + for _, sect := range loadSections(cmdOut) { + if sect.Type == rsHeader { + if err := getHeaderError(parseKeyValData(sect.Lines)); err != nil { + return out, err + } + break + } + } + + // Any line starting with "Drive /cX/eY/sZ " updates the current + // drive context (covers the summary, state, attributes, etc. + // sub-sections). The drive's SN appears later in the attributes + // sub-section. + driveHdr := regexp.MustCompile(`^Drive /c\d+/e(\d+)/s(\d+)\b`) + snLine := regexp.MustCompile(`^SN\s*=\s*(.*\S)\s*$`) + + var cur *driveKey + + for _, line := range strings.Split(cmdOut, "\n") { + if m := driveHdr.FindStringSubmatch(line); m != nil { + eid, err := strconv.Atoi(m[1]) + if err != nil { + cur = nil + continue + } + slot, err := strconv.Atoi(m[2]) + if err != nil { + cur = nil + continue + } + k := driveKey{EID: eid, Slot: slot} + cur = &k + continue + } + + if cur == nil { + continue + } + + if m := snLine.FindStringSubmatch(line); m != nil { + out[*cur] = m[1] + } + } + + return out, nil +} + func parseIntOrDash(field string) (int, error) { if field == "-" { return -1, nil @@ -622,30 +697,88 @@ func (csc *cachingStorCli) Query(cID int) (Controller, error) { return ctrl, err } -func (csc *cachingStorCli) GetDiskType(path string) (disko.DiskType, error) { +func (csc *cachingStorCli) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { ctrl, err := csc.Query(0) - if err == nil { - for _, vd := range ctrl.VirtDrives { - if vd.Path == path { - if ctrl.DriveGroups[vd.DriveGroup].IsSSD() { - return disko.SSD, nil - } + if err != nil { + if isSoftStorCliErr(err) { + // Controller tool unavailable or no controller. Fall + // through with the sentinel so the caller uses udev. + return disko.Unknown, disko.ErrDiskTypeUndetermined + } + return disko.Unknown, err + } - return disko.HDD, nil + for _, vd := range ctrl.VirtDrives { + if vd.Path == path { + if ctrl.DriveGroups[vd.DriveGroup].IsSSD() { + return disko.SSD, nil } + + return disko.HDD, nil } - } else if err != ErrNoStorcli && err != ErrNoController && err != ErrUnsupported { - return disko.HDD, err } - return disko.HDD, fmt.Errorf("cannot determine disk type") + // No VD matched path. Try JBOD/passthrough by matching udev serial + // against Drive.SerialNumber; fall through with the sentinel on any + // failure. + if dType, ok := jbodDiskTypeFromSerial(ctrl, udInfo); ok { + return dType, nil + } + + return disko.Unknown, disko.ErrDiskTypeUndetermined } -func (csc *cachingStorCli) DriverSysfsPath() string { - return csc.mr.DriverSysfsPath() +// isSoftStorCliErr returns true when err indicates that storcli simply +// cannot answer right now (binary missing, no controller present, or an +// unsupported controller). Callers should fall back to generic detection +// rather than treat these as fatal. +func isSoftStorCliErr(err error) bool { + return errors.Is(err, ErrNoStorcli) || + errors.Is(err, ErrNoController) || + errors.Is(err, ErrUnsupported) } -// not implemented in the driver layer -func (csc *cachingStorCli) IsSysPathRAID(syspath string) bool { - return false +// jbodDiskTypeFromSerial matches a udev serial against Drive.SerialNumber +// across the controller's Drives. Returns ok=false on missing udev serial, +// no match, collision, or unknown media. +func jbodDiskTypeFromSerial(ctrl Controller, udInfo disko.UdevInfo) (disko.DiskType, bool) { + serials := udInfo.CollectSerials() + if len(serials) == 0 { + return disko.Unknown, false + } + + var matches []*Drive + for _, d := range ctrl.Drives { + if d == nil { + continue + } + sn := strings.TrimSpace(d.SerialNumber) + if sn == "" { + continue + } + if _, ok := serials[sn]; ok { + matches = append(matches, d) + } + } + + if len(matches) != 1 { + return disko.Unknown, false + } + + switch matches[0].MediaType { + case SSD: + return disko.SSD, true + case HDD: + return disko.HDD, true + case NVME: + return disko.NVME, true + case UnknownMedia: + return disko.Unknown, false + } + + return disko.Unknown, false +} + +func (csc *cachingStorCli) DriverSysfsPath() string { + return csc.mr.DriverSysfsPath() } diff --git a/megaraid/storcli_test.go b/megaraid/storcli_test.go index 518e5e1..cf7d0be 100644 --- a/megaraid/storcli_test.go +++ b/megaraid/storcli_test.go @@ -2,9 +2,18 @@ package megaraid import ( "encoding/json" + "errors" + "fmt" "reflect" "strings" "testing" + "time" + + "github.com/patrickmn/go-cache" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "machinerun.io/disko" ) var tableData1 = ` @@ -204,7 +213,7 @@ func TestNewController(t *testing.T) { var exVlen, exPlen, exDGlen = 5, 5, 5 var cID = 0 - ctrl, err := newController(cID, sys0CxShow, sys0CxVallShowAll) + ctrl, err := newController(cID, sys0CxShow, sys0CxVallShowAll, "") if err != nil { t.Fatalf("newController failed: %s", err) } @@ -1163,6 +1172,211 @@ SCSI NAA Id = 6cc167e9730322c027dd6c90047c42a1 Unmap Enabled = No ` +// Cisco 12G Modular Raid Controller with all JBOD drives and no virtual drives. +// Captured from a real system running megaraid_sas with +// FW 5.190.00-3697. All five disks are in JBOD mode (1 SSD + 4 HDD). +var ciscoJBODCxShow = ` +Generating detailed summary of the adapter, it may take a while to complete. + +CLI Version = 007.1907.0000.0000 Sep 13, 2021 +Operating system = Linux 6.6.0 +Controller = 0 +Status = Success +Description = None + +Product Name = Cisco 12G Modular Raid Controller with 2GB cache (max 16 drives) +Serial Number = SKXXXXXXXX +SAS Address = 0000000000000000 +PCI Address = 00:3c:00:00 +System Time = 04/18/2026 07:25:04 +Mfg. Date = 07/23/22 +Controller Time = 04/18/2026 07:24:48 +FW Package Build = 51.19.0-4532 +BIOS Version = 7.19.00.0_0x07130200 +FW Version = 5.190.00-3697 +Driver Name = megaraid_sas +Driver Version = 07.725.01.00-rc1 +Current Personality = RAID-Mode +Vendor Id = 0x1000 +Device Id = 0x14 +SubVendor Id = 0x1137 +SubDevice Id = 0x20E +Host Interface = PCI-E +Device Interface = SAS-12G +Bus Number = 60 +Device Number = 0 +Function Number = 0 +Domain ID = 0 +Security Protocol = None +JBOD Drives = 5 + +JBOD LIST : +========= + +------------------------------------------------------------------------------ +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +------------------------------------------------------------------------------ +134:2 1 JBOD - 745.211 GB SAS SSD N N 512B KPM6XVUG800G U - +134:3 0 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +134:4 3 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +134:5 2 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +134:6 4 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +------------------------------------------------------------------------------ + +ID=JBOD Target ID|EID=Enclosure Device ID|Slt=Slot No|DID=Device ID|Onln=Online +Offln=Offline|Intf=Interface|Med=Media Type|SeSz=Sector Size +SED=Self Encryptive Drive|PI=Protection Info|Sp=Spun|U=Up|D=Down + +Physical Drives = 5 + +PD LIST : +======= + +------------------------------------------------------------------------------ +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +------------------------------------------------------------------------------ +134:2 1 JBOD - 745.211 GB SAS SSD N N 512B KPM6XVUG800G U - +134:3 0 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +134:4 3 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +134:5 2 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +134:6 4 JBOD - 2.182 TB SAS HDD N N 4 KB AL15SEB24EP U - +------------------------------------------------------------------------------ + +EID=Enclosure Device ID|Slt=Slot No|DID=Device ID|DG=DriveGroup +DHS=Dedicated Hot Spare|UGood=Unconfigured Good|GHS=Global Hotspare +UBad=Unconfigured Bad|Sntze=Sanitize|Onln=Online|Offln=Offline|Intf=Interface +Med=Media Type|SED=Self Encryptive Drive|PI=Protection Info +SeSz=Sector Size|Sp=Spun|U=Up|D=Down|T=Transition|F=Foreign +UGUnsp=UGood Unsupported|UGShld=UGood shielded|HSPShld=Hotspare shielded +CFShld=Configured shielded|Cpybck=CopyBack|CBShld=Copyback Shielded +UBUnsp=UBad Unsupported|Rbld=Rebuild + +Enclosures = 1 + +Enclosure LIST : +============== + +------------------------------------------------------------------------ +EID State Slots PD PS Fans TSs Alms SIM Port# ProdID VendorSpecific +------------------------------------------------------------------------ +134 OK 16 5 0 0 0 0 0 - VirtualSES +------------------------------------------------------------------------ + +EID=Enclosure Device ID | PD=Physical drive count | PS=Power Supply count +TSs=Temperature sensor count | Alms=Alarm count | SIM=SIM Count | ProdID=Product ID + + +Cachevault_Info : +=============== + +------------------------------------ +Model State Temp Mode MfgDate +------------------------------------ +CVPM05 Optimal 25C - 2022/05/21 +------------------------------------ + +` + +var ciscoJBODCxVallShowAll = ` +CLI Version = 007.1907.0000.0000 Sep 13, 2021 +Operating system = Linux 6.6.0 +Controller = 0 +Status = Success +Description = No VD's have been configured. +` + +// Output of 'storcli /c0/eall/sall show all' for the same JBOD setup. +// Abridged: per drive we keep the summary row and the Device attributes +// sub-section that carries "SN = ...". Serial numbers are synthetic +// (SNTEST00N) so this fixture contains no data from real systems. +var ciscoJBODCxEallSallShowAll = ` +CLI Version = 007.1907.0000.0000 Sep 13, 2021 +Operating system = Linux 6.6.0 +Controller = 0 +Status = Success +Description = Show Drive Information Succeeded. + + +Drive /c0/e134/s2 : +================= + +------------------------------------------------------------------------------ +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +------------------------------------------------------------------------------ +134:2 1 JBOD - 372.611 GB SAS SSD N N 512B TESTMODELSSD1 U - +------------------------------------------------------------------------------ + + +Drive /c0/e134/s2 Device attributes : +=================================== +SN = SNTEST001 +Model Number = TESTMODELSSD1 + + +Drive /c0/e134/s3 : +================= + +---------------------------------------------------------------------------- +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +---------------------------------------------------------------------------- +134:3 0 JBOD - 2.182 TB SAS HDD N N 4 KB TESTMODELHDD1 U - +---------------------------------------------------------------------------- + + +Drive /c0/e134/s3 Device attributes : +=================================== +SN = SNTEST002 +Model Number = TESTMODELHDD1 + + +Drive /c0/e134/s4 : +================= + +---------------------------------------------------------------------------- +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +---------------------------------------------------------------------------- +134:4 3 JBOD - 2.182 TB SAS HDD N N 4 KB TESTMODELHDD1 U - +---------------------------------------------------------------------------- + + +Drive /c0/e134/s4 Device attributes : +=================================== +SN = SNTEST003 +Model Number = TESTMODELHDD1 + + +Drive /c0/e134/s5 : +================= + +---------------------------------------------------------------------------- +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +---------------------------------------------------------------------------- +134:5 2 JBOD - 2.182 TB SAS HDD N N 4 KB TESTMODELHDD1 U - +---------------------------------------------------------------------------- + + +Drive /c0/e134/s5 Device attributes : +=================================== +SN = SNTEST004 +Model Number = TESTMODELHDD1 + + +Drive /c0/e134/s6 : +================= + +---------------------------------------------------------------------------- +EID:Slt DID State DG Size Intf Med SED PI SeSz Model Sp Type +---------------------------------------------------------------------------- +134:6 4 JBOD - 2.182 TB SAS HDD N N 4 KB TESTMODELHDD1 U - +---------------------------------------------------------------------------- + + +Drive /c0/e134/s6 Device attributes : +=================================== +SN = SNTEST005 +Model Number = TESTMODELHDD1 +` + // this has a 'F' for a DriveGroup (foreign) // it is put together from old '/c0/dall show' output to // look like '/c0 show all' would. @@ -1289,3 +1503,260 @@ Model State Temp Mode MfgDate CVPM05 Optimal 34C - 2018/10/16 ------------------------------------ ` + +func TestParseCxShowCiscoJBOD(t *testing.T) { + vds, pds, err := parseCxShow(ciscoJBODCxShow) + require.NoError(t, err, "parseCxShow(ciscoJBODCxShow)") + + assert.Empty(t, vds, "virtual drives") + require.Len(t, pds, 5, "physical drives") + + for _, pd := range pds { + assert.Equal(t, "JBOD", pd.State, "drive %d: State", pd.ID) + assert.Equal(t, -1, pd.DriveGroup, "drive %d: DriveGroup (JBOD)", pd.ID) + } + + assert.Equal(t, SSD, pds[1].MediaType, "drive 1 (slot 2)") + assert.Equal(t, HDD, pds[0].MediaType, "drive 0 (slot 3)") +} + +func TestParseVirtPropertiesCiscoJBOD(t *testing.T) { + propMap, err := parseVirtProperties(ciscoJBODCxVallShowAll) + require.NoError(t, err, "parseVirtProperties(ciscoJBODCxVallShowAll)") + assert.Empty(t, propMap, "VD properties") +} + +func TestNewControllerCiscoJBOD(t *testing.T) { + ctrl, err := newController(0, ciscoJBODCxShow, ciscoJBODCxVallShowAll, + ciscoJBODCxEallSallShowAll) + require.NoError(t, err, "newController") + + assert.Equal(t, 0, ctrl.ID, "controller ID") + assert.Empty(t, ctrl.VirtDrives, "VirtDrives") + assert.Len(t, ctrl.Drives, 5, "Drives") + assert.Empty(t, ctrl.DriveGroups, "DriveGroups (JBOD has no drive groups)") + + // Drives indexed by DID. Slot 2 is the SSD (DID=1); slot 3 is HDD (DID=0). + wantSerials := map[int]string{ + 1: "SNTEST001", // 134:2 SSD + 0: "SNTEST002", // 134:3 HDD + 3: "SNTEST003", // 134:4 + 2: "SNTEST004", // 134:5 + 4: "SNTEST005", // 134:6 + } + for did, want := range wantSerials { + d := ctrl.Drives[did] + require.NotNil(t, d, "drive DID=%d missing", did) + assert.Equal(t, want, d.SerialNumber, "drive DID=%d: SerialNumber", did) + } +} + +func TestParseDriveSerialsCiscoJBOD(t *testing.T) { + got, err := parseDriveSerials(ciscoJBODCxEallSallShowAll) + require.NoError(t, err, "parseDriveSerials") + + want := map[driveKey]string{ + {EID: 134, Slot: 2}: "SNTEST001", + {EID: 134, Slot: 3}: "SNTEST002", + {EID: 134, Slot: 4}: "SNTEST003", + {EID: 134, Slot: 5}: "SNTEST004", + {EID: 134, Slot: 6}: "SNTEST005", + } + assert.Equal(t, want, got, "parseDriveSerials") +} + +func TestParseDriveSerialsEmpty(t *testing.T) { + got, err := parseDriveSerials("") + require.NoError(t, err, `parseDriveSerials("")`) + assert.Empty(t, got, "expected empty map") +} + +type mockMegaRaid struct { + ctrl Controller + err error +} + +func (m *mockMegaRaid) Query(cID int) (Controller, error) { + return m.ctrl, m.err +} + +func (m *mockMegaRaid) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { + return disko.HDD, nil +} + +func (m *mockMegaRaid) DriverSysfsPath() string { + return "/sys/bus/pci/drivers/megaraid_sas" +} + +func TestGetDiskTypeJBODReturnsErrDiskTypeUndetermined(t *testing.T) { + ctrl, err := newController(0, ciscoJBODCxShow, ciscoJBODCxVallShowAll, + ciscoJBODCxEallSallShowAll) + require.NoError(t, err, "newController") + + csc := &cachingStorCli{ + mr: &mockMegaRaid{ctrl: ctrl, err: nil}, + cache: cache.New(5*time.Minute, 5*time.Minute), + } + + // No udev serial => JBOD matching fails => sentinel for udev fallback. + for _, path := range []string{"/dev/sda", "/dev/sdb", "/dev/sdc"} { + dtype, err := csc.GetDiskType(path, disko.UdevInfo{}) + assert.ErrorIs(t, err, disko.ErrDiskTypeUndetermined, + "GetDiskType(%q): expected ErrDiskTypeUndetermined", path) + assert.Equal(t, disko.Unknown, dtype, + "GetDiskType(%q): expected disko.Unknown placeholder", path) + } +} + +// A soft storcli failure (missing binary / no controller / unsupported) +// should return the sentinel so the caller falls back to udev. +func TestGetDiskTypeSoftQueryFailureReturnsSentinel(t *testing.T) { + csc := &cachingStorCli{ + mr: &mockMegaRaid{ctrl: Controller{}, err: ErrNoStorcli}, + cache: cache.New(5*time.Minute, 5*time.Minute), + } + + _, err := csc.GetDiskType("/dev/sda", disko.UdevInfo{}) + assert.ErrorIs(t, err, disko.ErrDiskTypeUndetermined, + "expected ErrDiskTypeUndetermined on soft storcli failure") +} + +// A hard storcli error must propagate unchanged (it is not the sentinel). +func TestGetDiskTypeHardQueryFailurePropagates(t *testing.T) { + hardErr := fmt.Errorf("storcli exploded") + csc := &cachingStorCli{ + mr: &mockMegaRaid{ctrl: Controller{}, err: hardErr}, + cache: cache.New(5*time.Minute, 5*time.Minute), + } + + _, err := csc.GetDiskType("/dev/sda", disko.UdevInfo{}) + assert.NotErrorIs(t, err, disko.ErrDiskTypeUndetermined, + "hard storcli error should NOT be treated as the sentinel") + assert.Error(t, err, "hard storcli error must propagate") +} + +// newJBODCachingStorCli wires the Cisco JBOD fixture (with serials from +// the /eall/sall query) to a cachingStorCli for serial-based matching. +func newJBODCachingStorCli(t *testing.T) *cachingStorCli { + t.Helper() + + ctrl, err := newController(0, ciscoJBODCxShow, ciscoJBODCxVallShowAll, + ciscoJBODCxEallSallShowAll) + require.NoError(t, err, "newController") + + return &cachingStorCli{ + mr: &mockMegaRaid{ctrl: ctrl, err: nil}, + cache: cache.New(5*time.Minute, 5*time.Minute), + } +} + +// SSD serial (slot 2) resolves via ID_SERIAL_SHORT. +func TestGetDiskTypeJBODSerialMatchSSD(t *testing.T) { + csc := newJBODCachingStorCli(t) + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SNTEST001"}, + } + dtype, err := csc.GetDiskType("/dev/sdb", ud) + require.NoError(t, err, "expected nil err for SSD JBOD match") + assert.Equal(t, disko.SSD, dtype, "GetDiskType") +} + +// HDD serial (slot 3) resolves via ID_SERIAL_SHORT. +func TestGetDiskTypeJBODSerialMatchHDD(t *testing.T) { + csc := newJBODCachingStorCli(t) + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SNTEST002"}, + } + dtype, err := csc.GetDiskType("/dev/sdb", ud) + require.NoError(t, err, "expected nil err for HDD JBOD match") + assert.Equal(t, disko.HDD, dtype, "GetDiskType") +} + +// ID_SERIAL alone (no ID_SERIAL_SHORT) still matches when it equals the SN. +func TestGetDiskTypeJBODSerialViaIDSerialFallback(t *testing.T) { + csc := newJBODCachingStorCli(t) + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL": "SNTEST001"}, + } + dtype, err := csc.GetDiskType("/dev/sdb", ud) + require.NoError(t, err, "expected nil err for ID_SERIAL fallback match") + assert.Equal(t, disko.SSD, dtype, "GetDiskType") +} + +func TestGetDiskTypeJBODSerialViaIDSCSISerial(t *testing.T) { + csc := newJBODCachingStorCli(t) + + ud := disko.UdevInfo{ + Properties: map[string]string{ + "ID_SCSI_SERIAL": "SNTEST001", + "ID_SERIAL_SHORT": "deadbeefcafef001", + "ID_SERIAL": "3deadbeefcafef001", + }, + } + dtype, err := csc.GetDiskType("/dev/sdb", ud) + require.NoError(t, err, "expected nil err for ID_SCSI_SERIAL match") + assert.Equal(t, disko.SSD, dtype, "GetDiskType") +} + +// Serial not in fixture falls through to ErrDiskTypeUndetermined. +func TestGetDiskTypeJBODSerialNoMatch(t *testing.T) { + csc := newJBODCachingStorCli(t) + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "NOSUCHSERIAL"}, + } + _, err := csc.GetDiskType("/dev/sdb", ud) + assert.ErrorIs(t, err, disko.ErrDiskTypeUndetermined, + "expected ErrDiskTypeUndetermined for unknown serial") +} + +// When the /eall/sall detail query was unavailable (empty string), drives +// carry no SerialNumber and JBOD matching correctly fails over to udev. +func TestGetDiskTypeJBODEmptyDetailOutputFallsThrough(t *testing.T) { + ctrl, err := newController(0, ciscoJBODCxShow, ciscoJBODCxVallShowAll, "") + require.NoError(t, err, "newController") + + csc := &cachingStorCli{ + mr: &mockMegaRaid{ctrl: ctrl, err: nil}, + cache: cache.New(5*time.Minute, 5*time.Minute), + } + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SNTEST001"}, + } + _, err = csc.GetDiskType("/dev/sdb", ud) + assert.ErrorIs(t, err, disko.ErrDiskTypeUndetermined, + "expected ErrDiskTypeUndetermined with no serials") +} + +// isSoftStorCliErr must recognise the three soft sentinels whether bare +// or wrapped with fmt.Errorf("%w"). +func TestIsSoftStorCliErr(t *testing.T) { + hard := errors.New("storcli blew up") + + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"bare ErrNoStorcli", ErrNoStorcli, true}, + {"bare ErrNoController", ErrNoController, true}, + {"bare ErrUnsupported", ErrUnsupported, true}, + {"wrapped ErrNoStorcli", fmt.Errorf("ctx: %w", ErrNoStorcli), true}, + {"wrapped ErrNoController", fmt.Errorf("ctx: %w", ErrNoController), true}, + {"wrapped ErrUnsupported", fmt.Errorf("ctx: %w", ErrUnsupported), true}, + {"unrelated hard error", hard, false}, + {"wrapped hard error", fmt.Errorf("ctx: %w", hard), false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, isSoftStorCliErr(tc.err), + "isSoftStorCliErr(%v)", tc.err) + }) + } +} diff --git a/mpi3mr/mpi3mr.go b/mpi3mr/mpi3mr.go index 48c9144..ab558ba 100644 --- a/mpi3mr/mpi3mr.go +++ b/mpi3mr/mpi3mr.go @@ -68,13 +68,10 @@ type Mpi3mr interface { Query(int) (Controller, error) // GetDiskType - Determine the disk type if controller owns disk - GetDiskType(string) (disko.DiskType, error) + GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) // DriverSysfsPath - Return the sysfs path to the linux driver for this controller DriverSysfsPath() string - - // IsSysPathRAID - Check if sysfs path is a device on the controller - IsSysPathRAID(string) bool } // ErrNoController - Error reported by Query if no controller is found. diff --git a/mpi3mr/storcli2.go b/mpi3mr/storcli2.go index 9607a6b..5556c54 100644 --- a/mpi3mr/storcli2.go +++ b/mpi3mr/storcli2.go @@ -11,6 +11,7 @@ import ( "github.com/pkg/errors" "machinerun.io/disko" + "machinerun.io/disko/linux/sysfs" ) // parse JSON from 'storcli2 show nolog J' for List() method @@ -520,11 +521,18 @@ func getCommandErrorRCDefault(err error, rcError int) int { // Implement the mpi3mr Interface with storcli2 type storCli2 struct { + // sysRoot: sysfs root for JBOD SCSI-target lookups ("/sys" in prod). + sysRoot string + // scsiTargetFn: kname -> SCSI target ID. Overridable for tests. + scsiTargetFn func(sysRoot, kname string) (int, bool, error) } // StorCli returns a storcli2 specific implementation of Query for the Mpi3mr interface func StorCli2() Mpi3mr { - return &storCli2{} + return &storCli2{ + sysRoot: "/sys", + scsiTargetFn: sysfs.ReadSCSITarget, + } } const ( @@ -596,38 +604,112 @@ func (sc *storCli2) DriverSysfsPath() string { return SysfsPCIDriversPath } -func (sc *storCli2) GetDiskType(path string) (disko.DiskType, error) { +func (sc *storCli2) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { cIDs, err := sc.List() if err != nil { - return disko.HDD, errors.Errorf("failed to get controller list: %s", err) + if isSoftStorCli2Err(err) { + return disko.Unknown, disko.ErrDiskTypeUndetermined + } + return disko.Unknown, errors.Errorf("failed to get controller list: %s", err) } - errors := []error{} + var queryErrs []error + var queriedCtrls []Controller + for _, cID := range cIDs { ctrl, err := sc.Query(cID) if err != nil { - errors = append(errors, fmt.Errorf("error while getting config for controller id:%d %s", cID, err)) + queryErrs = append(queryErrs, fmt.Errorf("controller id %d: %w", cID, err)) continue } + + queriedCtrls = append(queriedCtrls, ctrl) + for _, vDev := range ctrl.VirtualDrives { - if vDev.Path() == path && vDev.IsSSD() { + if vDev.Path() != path { + continue + } + if vDev.IsSSD() { return disko.SSD, nil } - return disko.HDD, nil } } - for _, err := range errors { - if err != ErrNoStor2cli && err != ErrNoController && err != ErrUnsupported { - return disko.HDD, err + for _, err := range queryErrs { + if !isSoftStorCli2Err(err) { + return disko.Unknown, err } } - return disko.HDD, fmt.Errorf("cannot determine diskt type for path %q", path) + // No VD matched path (or no controller responded). Try JBOD/ + // passthrough by matching SCSI target against PhysicalDrive.PID; + // fall through with the sentinel so the caller uses udev. + if dType, ok := sc.jbodDiskTypeFromSCSI(queriedCtrls, udInfo); ok { + return dType, nil + } + + return disko.Unknown, disko.ErrDiskTypeUndetermined +} + +// isSoftStorCli2Err returns true when err indicates that storcli2 simply +// cannot answer right now (binary missing, no controller present, or an +// unsupported controller). Callers should fall back to generic detection +// rather than treat these as fatal. +func isSoftStorCli2Err(err error) bool { + return errors.Is(err, ErrNoStor2cli) || + errors.Is(err, ErrNoController) || + errors.Is(err, ErrUnsupported) } -// not implemented in driver layer -func (sc *storCli2) IsSysPathRAID(syspath string) bool { - return false +// jbodDiskTypeFromSCSI matches the Linux SCSI Target field (the third +// component of Host:Channel:Target:LUN read from +// /sys/block//device) against PhysicalDrive.PID across the given +// controllers. Returns ok=false for non-SCSI devices (udev ID_SCSI != "1"), +// missing kname, unresolved target, PID collision, or unknown medium. +func (sc *storCli2) jbodDiskTypeFromSCSI(ctrls []Controller, udInfo disko.UdevInfo) (disko.DiskType, bool) { + if sc.scsiTargetFn == nil { + return disko.Unknown, false + } + + // udev sets ID_SCSI=1 only for devices that present as SCSI on a + // host controller. virtio-blk, NVMe and ATA/SATA do not, so we can + // skip the sysfs lookup for them. + if udInfo.Properties["ID_SCSI"] != "1" { + return disko.Unknown, false + } + + kname := udInfo.Name + if kname == "" { + return disko.Unknown, false + } + + target, ok, err := sc.scsiTargetFn(sc.sysRoot, kname) + if err != nil || !ok { + return disko.Unknown, false + } + + var matches []PhysicalDrive + for _, ctrl := range ctrls { + for _, pd := range ctrl.PhysicalDrives { + if pd.PID == target { + matches = append(matches, pd) + } + } + } + + if len(matches) != 1 { + return disko.Unknown, false + } + + switch strings.ToUpper(strings.TrimSpace(matches[0].Medium)) { + case "SSD": + return disko.SSD, true + case "HDD": + return disko.HDD, true + case "NVME": + return disko.NVME, true + } + + return disko.Unknown, false } diff --git a/mpi3mr/storcli2_test.go b/mpi3mr/storcli2_test.go index 141f25b..7c9f804 100644 --- a/mpi3mr/storcli2_test.go +++ b/mpi3mr/storcli2_test.go @@ -2,9 +2,16 @@ package mpi3mr import ( "encoding/json" + "errors" + "fmt" "testing" "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "machinerun.io/disko" + "machinerun.io/disko/linux/sysfs" ) var showNoLogJData = ` @@ -446,3 +453,197 @@ func TestStorCli2ParsePhysicalDriveForeign(t *testing.T) { } } + +func jbodPhysCtrl() Controller { + return Controller{ + PhysicalDrives: PhysicalDriveSet{ + "10": PhysicalDrive{PID: 10, Medium: "HDD", State: "JBOD"}, + "20": PhysicalDrive{PID: 20, Medium: "SSD", State: "JBOD"}, + }, + } +} + +func TestStorCli2JBODDiskTypeHDD(t *testing.T) { + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + if kname == "sdb" { + return 10, true, nil + } + return 0, false, nil + }, + } + + dType, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + require.True(t, ok, "expected ok=true on HDD JBOD match") + assert.Equal(t, disko.HDD, dType, "jbodDiskTypeFromSCSI") +} + +func TestStorCli2JBODDiskTypeSSD(t *testing.T) { + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + return 20, true, nil + }, + } + + dType, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + require.True(t, ok, "expected ok=true on SSD JBOD match") + assert.Equal(t, disko.SSD, dType, "jbodDiskTypeFromSCSI") +} + +// Tri-mode controllers can report Medium="NVMe" for NVMe JBODs. The +// production short-circuit on ID_SCSI=1 keeps real NVMe drives from +// reaching this switch, but exercising the case directly with a fake +// fixture pins the parity with megaraid's jbodDiskTypeFromSerial. +func TestStorCli2JBODDiskTypeNVMe(t *testing.T) { + ctrl := Controller{ + PhysicalDrives: PhysicalDriveSet{ + "30": PhysicalDrive{PID: 30, Medium: "NVMe", State: "JBOD"}, + }, + } + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + return 30, true, nil + }, + } + + dType, ok := sc.jbodDiskTypeFromSCSI([]Controller{ctrl}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + require.True(t, ok, "expected ok=true on NVMe JBOD match") + assert.Equal(t, disko.NVME, dType, "jbodDiskTypeFromSCSI") +} + +// SCSI target not reported by any controller. +func TestStorCli2JBODDiskTypeNoMatch(t *testing.T) { + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + return 42, true, nil + }, + } + + _, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + assert.False(t, ok, "expected ok=false when no PhysicalDrive matches SCSI target") +} + +// PID collision across controllers is ambiguous. +func TestStorCli2JBODDiskTypeAmbiguousPID(t *testing.T) { + other := Controller{ + PhysicalDrives: PhysicalDriveSet{ + "10": PhysicalDrive{PID: 10, Medium: "SSD", State: "JBOD"}, + }, + } + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + return 10, true, nil + }, + } + + _, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl(), other}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + assert.False(t, ok, "expected ok=false on PID collision across controllers") +} + +// Non-SCSI device (NVMe, virtio, missing symlink). +func TestStorCli2JBODDiskTypeUnresolvedTarget(t *testing.T) { + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + return 0, false, nil + }, + } + + _, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + assert.False(t, ok, "expected ok=false when SCSI target is unresolved") +} + +// Sysfs lookup error is swallowed as a no-match. +func TestStorCli2JBODDiskTypeLookupError(t *testing.T) { + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, kname string) (int, bool, error) { + return 0, false, errors.New("boom") + }, + } + + _, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, + disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "1"}}) + assert.False(t, ok, "expected ok=false when SCSI lookup errors") +} + +// Empty udev Name short-circuits before any sysfs call. +func TestStorCli2JBODDiskTypeEmptyName(t *testing.T) { + sc := &storCli2{sysRoot: "/unused", scsiTargetFn: sysfs.ReadSCSITarget} + + udInfo := disko.UdevInfo{Properties: map[string]string{"ID_SCSI": "1"}} + _, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, udInfo) + assert.False(t, ok, "expected ok=false when udev Name is empty") +} + +// Non-SCSI devices (ID_SCSI != "1") must short-circuit before any sysfs +// call so virtio-blk, NVMe, ATA/SATA etc. don't trigger spurious lookups. +func TestStorCli2JBODDiskTypeNonSCSI(t *testing.T) { + called := false + sc := &storCli2{ + sysRoot: "/unused", + scsiTargetFn: func(_, _ string) (int, bool, error) { + called = true + return 10, true, nil + }, + } + + cases := []struct { + name string + udInfo disko.UdevInfo + }{ + {"missing ID_SCSI", disko.UdevInfo{Name: "sdb"}}, + {"ID_SCSI=0", disko.UdevInfo{Name: "sdb", Properties: map[string]string{"ID_SCSI": "0"}}}, + {"NVMe-shaped", disko.UdevInfo{Name: "nvme0n1", Properties: map[string]string{"ID_MODEL": "Some NVMe"}}}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + called = false + _, ok := sc.jbodDiskTypeFromSCSI([]Controller{jbodPhysCtrl()}, tc.udInfo) + assert.False(t, ok, "expected ok=false") + assert.False(t, called, "expected scsiTargetFn not to be called") + }) + } +} + +// isSoftStorCli2Err must recognise the three soft sentinels whether bare +// or wrapped with fmt.Errorf("%w"), so wrapped per-controller errors are +// still routed through the udev fallback instead of propagating as fatal. +func TestIsSoftStorCli2Err(t *testing.T) { + hard := errors.New("storcli2 blew up") + + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"bare ErrNoStor2cli", ErrNoStor2cli, true}, + {"bare ErrNoController", ErrNoController, true}, + {"bare ErrUnsupported", ErrUnsupported, true}, + {"wrapped ErrNoStor2cli", fmt.Errorf("controller id %d: %w", 0, ErrNoStor2cli), true}, + {"wrapped ErrNoController", fmt.Errorf("controller id %d: %w", 1, ErrNoController), true}, + {"wrapped ErrUnsupported", fmt.Errorf("controller id %d: %w", 2, ErrUnsupported), true}, + {"unrelated hard error", hard, false}, + {"wrapped hard error", fmt.Errorf("controller id %d: %w", 3, hard), false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, isSoftStorCli2Err(tc.err), + "isSoftStorCli2Err(%v)", tc.err) + }) + } +} diff --git a/smartpqi/arcconf.go b/smartpqi/arcconf.go index 92a2b24..22569b3 100644 --- a/smartpqi/arcconf.go +++ b/smartpqi/arcconf.go @@ -2,6 +2,7 @@ package smartpqi import ( "bytes" + "errors" "fmt" "os/exec" "regexp" @@ -489,47 +490,107 @@ func (ac *arcConf) Query(cID int) (Controller, error) { return Controller{}, fmt.Errorf("unknown controller id %d", cID) } -func (ac *arcConf) GetDiskType(path string) (disko.DiskType, error) { +func (ac *arcConf) GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) { cIDs, err := ac.List() if err != nil { - return disko.HDD, fmt.Errorf("failed to enumerate controllers: %s", err) + if isSoftArcconfErr(err) { + return disko.Unknown, disko.ErrDiskTypeUndetermined + } + return disko.Unknown, fmt.Errorf("failed to enumerate controllers: %w", err) } - errors := []error{} + var queryErrs []error + var queriedCtrls []Controller + for _, cID := range cIDs { ctrl, err := ac.GetConfig(cID) if err != nil { - errors = append(errors, fmt.Errorf("error while getting config for controller id:%d: %s", cID, err)) + queryErrs = append(queryErrs, fmt.Errorf("controller id %d: %w", cID, err)) continue } - for _, lDrive := range ctrl.LogicalDrives { - if lDrive.DiskName == path { - if lDrive.IsSSD() { - return disko.SSD, nil - } + queriedCtrls = append(queriedCtrls, ctrl) - return disko.HDD, nil + for _, lDrive := range ctrl.LogicalDrives { + if lDrive.DiskName != path { + continue } + if lDrive.IsSSD() { + return disko.SSD, nil + } + return disko.HDD, nil } } - for _, err := range errors { - if err != ErrNoArcconf && err != ErrNoController && err != ErrUnsupported { - return disko.HDD, err + for _, err := range queryErrs { + if !isSoftArcconfErr(err) { + return disko.Unknown, err } } - return disko.HDD, fmt.Errorf("cannot determine disk type") + // No LD matched path (or no controller responded). Try JBOD by + // matching udev serial against PhysicalDevice SerialNumber; fall + // through with the sentinel so the caller uses udev. + if dType, ok := jbodDiskTypeFromSerial(queriedCtrls, udInfo); ok { + return dType, nil + } + + return disko.Unknown, disko.ErrDiskTypeUndetermined } -func (ac *arcConf) DriverSysfsPath() string { - return SysfsPCIDriversPath +// isSoftArcconfErr returns true when err indicates that arcconf simply +// cannot answer right now (binary missing, no controller present, or an +// unsupported controller). Callers should fall back to generic detection +// rather than treat these as fatal. +func isSoftArcconfErr(err error) bool { + return errors.Is(err, ErrNoArcconf) || + errors.Is(err, ErrNoController) || + errors.Is(err, ErrUnsupported) } -// not implemented at the driver level -func (ac *arcConf) IsSysPathRAID(path string) bool { - return false +// jbodDiskTypeFromSerial matches a udev serial against PhysicalDevice +// SerialNumber across the given controllers. Returns ok=false on missing +// udev serial, no match, collision, or unknown media. +func jbodDiskTypeFromSerial(ctrls []Controller, udInfo disko.UdevInfo) (disko.DiskType, bool) { + serials := udInfo.CollectSerials() + if len(serials) == 0 { + return disko.Unknown, false + } + + var matches []*PhysicalDevice + for _, ctrl := range ctrls { + for _, pDev := range ctrl.PhysicalDrives { + if pDev == nil { + continue + } + pdSerial := strings.TrimSpace(pDev.SerialNumber) + if pdSerial == "" { + continue + } + if _, ok := serials[pdSerial]; ok { + matches = append(matches, pDev) + } + } + } + + if len(matches) != 1 { + return disko.Unknown, false + } + + switch matches[0].Type { + case SSD: + return disko.SSD, true + case HDD: + return disko.HDD, true + case NVME: + return disko.NVME, true + } + + return disko.Unknown, false +} + +func (ac *arcConf) DriverSysfsPath() string { + return SysfsPCIDriversPath } func (ac *arcConf) GetConfig(cID int) (Controller, error) { diff --git a/smartpqi/arcconf_test.go b/smartpqi/arcconf_test.go index c1c04ce..dea5a77 100644 --- a/smartpqi/arcconf_test.go +++ b/smartpqi/arcconf_test.go @@ -2,9 +2,16 @@ package smartpqi import ( "encoding/json" + "errors" + "fmt" "reflect" "strings" "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "machinerun.io/disko" ) var arcConfGetConfig = ` @@ -1577,3 +1584,157 @@ func TestSmartPqiNewControllerBadInput(t *testing.T) { } } } + +// jbodCtrlWithPDs: one HDD, one SSD, one NVMe keyed by distinct serials. +func jbodCtrlWithPDs() Controller { + return Controller{ + ID: 1, + PhysicalDrives: DriveSet{ + 10: &PhysicalDevice{ID: 10, SerialNumber: "SN-HDD-1", Type: HDD}, + 20: &PhysicalDevice{ID: 20, SerialNumber: "SN-SSD-1", Type: SSD}, + 30: &PhysicalDevice{ID: 30, SerialNumber: "SN-NVME-1", Type: NVME}, + }, + } +} + +func TestJBODDiskTypeSerialHDDShort(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SN-HDD-1"}, + } + + dType, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs()}, ud) + require.True(t, ok, "expected ok=true when ID_SERIAL_SHORT matches") + assert.Equal(t, disko.HDD, dType, "jbodDiskTypeFromSerial") +} + +// ID_SERIAL is the fallback when ID_SERIAL_SHORT is absent. +func TestJBODDiskTypeSerialSSDFallback(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL": "SN-SSD-1"}, + } + + dType, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs()}, ud) + require.True(t, ok, "expected ok=true when ID_SERIAL matches as fallback") + assert.Equal(t, disko.SSD, dType, "jbodDiskTypeFromSerial") +} + +func TestJBODDiskTypeSerialNVME(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SN-NVME-1"}, + } + + dType, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs()}, ud) + require.True(t, ok, "expected ok=true for NVME serial match") + assert.Equal(t, disko.NVME, dType, "jbodDiskTypeFromSerial") +} + +// No ID_SERIAL* property in udev. +func TestJBODDiskTypeSerialNoSerial(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_MODEL": "SomeModel"}, + } + + _, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs()}, ud) + assert.False(t, ok, "expected ok=false when udev carries no ID_SERIAL*") +} + +// Serial not reported by any controller. +func TestJBODDiskTypeSerialNoMatch(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SN-UNKNOWN"}, + } + + _, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs()}, ud) + assert.False(t, ok, "expected ok=false when no PhysicalDevice matches") +} + +// Duplicate serial across controllers is ambiguous. +func TestJBODDiskTypeSerialAmbiguous(t *testing.T) { + dup := Controller{ + ID: 2, + PhysicalDrives: DriveSet{ + 99: &PhysicalDevice{ID: 99, SerialNumber: "SN-HDD-1", Type: SSD}, + }, + } + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SN-HDD-1"}, + } + + _, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs(), dup}, ud) + assert.False(t, ok, "expected ok=false on duplicate SerialNumber across controllers") +} + +// Matched PD with UnknownMedia cannot be classified. +func TestJBODDiskTypeSerialUnknownMedia(t *testing.T) { + ctrl := Controller{ + ID: 3, + PhysicalDrives: DriveSet{ + 10: &PhysicalDevice{ID: 10, SerialNumber: "SN-UNK", Type: UnknownMedia}, + }, + } + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SN-UNK"}, + } + + _, ok := jbodDiskTypeFromSerial([]Controller{ctrl}, ud) + assert.False(t, ok, "expected ok=false when matched PD has UnknownMedia") +} + +// Blank controller-side SerialNumber never matches. +func TestJBODDiskTypeSerialEmptyPDSerial(t *testing.T) { + ctrl := Controller{ + ID: 4, + PhysicalDrives: DriveSet{ + 10: &PhysicalDevice{ID: 10, SerialNumber: " ", Type: SSD}, + }, + } + + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SERIAL_SHORT": "SN-X"}, + } + + _, ok := jbodDiskTypeFromSerial([]Controller{ctrl}, ud) + assert.False(t, ok, "expected ok=false when PhysicalDevice has blank SerialNumber") +} + +func TestJBODDiskTypeSerialViaIDSCSISerial(t *testing.T) { + ud := disko.UdevInfo{ + Properties: map[string]string{"ID_SCSI_SERIAL": "SN-HDD-1"}, + } + + dType, ok := jbodDiskTypeFromSerial([]Controller{jbodCtrlWithPDs()}, ud) + require.True(t, ok, "expected ok=true when ID_SCSI_SERIAL matches") + assert.Equal(t, disko.HDD, dType, "jbodDiskTypeFromSerial") +} + +// isSoftArcconfErr must recognise the three soft sentinels whether bare +// or wrapped with fmt.Errorf("%w"), so wrapped per-controller errors are +// still routed through the udev fallback instead of propagating as fatal. +func TestIsSoftArcconfErr(t *testing.T) { + hard := errors.New("arcconf blew up") + + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"bare ErrNoArcconf", ErrNoArcconf, true}, + {"bare ErrNoController", ErrNoController, true}, + {"bare ErrUnsupported", ErrUnsupported, true}, + {"wrapped ErrNoArcconf", fmt.Errorf("controller id %d: %w", 0, ErrNoArcconf), true}, + {"wrapped ErrNoController", fmt.Errorf("controller id %d: %w", 1, ErrNoController), true}, + {"wrapped ErrUnsupported", fmt.Errorf("controller id %d: %w", 2, ErrUnsupported), true}, + {"unrelated hard error", hard, false}, + {"wrapped hard error", fmt.Errorf("controller id %d: %w", 3, hard), false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, isSoftArcconfErr(tc.err), + "isSoftArcconfErr(%v)", tc.err) + }) + } +} diff --git a/smartpqi/smartpqi.go b/smartpqi/smartpqi.go index a4b0b08..08c0fa4 100644 --- a/smartpqi/smartpqi.go +++ b/smartpqi/smartpqi.go @@ -120,13 +120,10 @@ type SmartPqi interface { Query(int) (Controller, error) // GetDiskType - Determine the disk type if controller owns disk - GetDiskType(string) (disko.DiskType, error) + GetDiskType(path string, udInfo disko.UdevInfo) (disko.DiskType, error) // DriverSysfsPath - Return the sysfs path to the linux driver for this controller DriverSysfsPath() string - - // IsSysPathRAID - Check if sysfs path is a device on the controller - IsSysPathRAID(string) bool } // ErrNoController - Error reported by Query if no controller is found.