Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@ To install the AMD Container Toolkit on RHEL/CentOS 9 systems, follow these step
> docker run --rm --runtime=amd -e AMD_VISIBLE_DEVICES=0-3,5,8 rocm/rocm-terminal rocm-smi
```

- Optional: set ``AMD_GPU_DEVICE_MODE`` to an octal permission value (e.g. ``0666``) to override the GPU device file mode inside the container; host device permissions are unchanged.

```text
> docker run --rm --runtime=amd -e AMD_VISIBLE_DEVICES=all -e AMD_GPU_DEVICE_MODE=0666 rocm/rocm-terminal rocm-smi
```

2. Using [CDI](docs/container-runtime/cdi-guide.rst) style

- First, generate the CDI spec.
Expand Down
2 changes: 2 additions & 0 deletions docs/container-runtime/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ To use AMD GPUs with Docker:

sudo docker run --rm --runtime=amd -e AMD_VISIBLE_DEVICES=all rocm/rocm-terminal rocm-smi

Optionally, set ``AMD_GPU_DEVICE_MODE`` to an octal value (e.g. ``0666``) to override GPU device permissions inside the container; host permissions are unchanged.
Comment thread
nikhilsk marked this conversation as resolved.
Outdated

- CDI style:

.. code-block:: bash
Expand Down
40 changes: 39 additions & 1 deletion internal/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"encoding/json"
"fmt"
"os"
"strconv"
"strings"

"github.com/ROCm/container-toolkit/internal/amdgpu"
Expand All @@ -32,6 +33,10 @@ import (
const (
	// AMD_GPU_DEVICE_MODE names the container environment variable that
	// overrides the file mode of GPU device nodes inside the container
	// (for example 0666). Permissions of the device on the host are left
	// untouched; when the variable is not set the host device mode is used.
	AMD_GPU_DEVICE_MODE = "AMD_GPU_DEVICE_MODE"

	// DEFAULT_HOOK_PATH is the default location of the AMD Container
	// Runtime OCI hook binary.
	DEFAULT_HOOK_PATH = "/usr/bin/amd-container-runtime-hook"
)

// Interface for OCI package
Expand Down Expand Up @@ -296,14 +301,47 @@ func (oci *oci_t) addGPUDevices() error {
return nil
}

// getGPUDeviceModeOverride returns the GPU device file mode from AMD_GPU_DEVICE_MODE env if set and valid (octal, e.g. 0666).
// The host device permissions are never changed; this only affects the mode of the device node inside the container.
func (oci *oci_t) getGPUDeviceModeOverride(env []string) (os.FileMode, bool) {
for _, e := range env {
if !strings.HasPrefix(e, AMD_GPU_DEVICE_MODE+"=") {
continue
}
val := strings.TrimSpace(strings.TrimPrefix(e, AMD_GPU_DEVICE_MODE+"="))
if val == "" {
return 0, false
}
// Accept octal: 0666 or 0o666
val = strings.TrimPrefix(val, "0o")
val = strings.TrimPrefix(val, "0O")
m, err := strconv.ParseUint(val, 8, 32)
if err != nil {
logger.Log.Printf("Invalid %s value %q: %v", AMD_GPU_DEVICE_MODE, val, err)
return 0, false
}
Comment thread
nikhilsk marked this conversation as resolved.
return os.FileMode(m), true
}
return 0, false
}
Comment thread
nikhilsk marked this conversation as resolved.
Outdated

// addGPUDevice adds the requested GPU device to the OCI spec
func (oci *oci_t) addGPUDevice(gpu amdgpu.AMDGPU) error {
fileMode := &gpu.FileMode
if oci.spec != nil && oci.spec.Process != nil {
if override, ok := oci.getGPUDeviceModeOverride(oci.spec.Process.Env); ok {
Comment thread
nikhilsk marked this conversation as resolved.
Outdated
m := new(os.FileMode)
*m = override
fileMode = m
logger.Log.Printf("Using GPU device mode override %#o for %s", override, gpu.Path)
}
}
dev := specs.LinuxDevice{
Path: gpu.Path,
Type: gpu.DevType,
Major: gpu.Major,
Minor: gpu.Minor,
FileMode: &gpu.FileMode,
FileMode: fileMode,
GID: &gpu.Gid,
UID: &gpu.Uid,
}
Expand Down
Loading