diff --git a/.kres.yaml b/.kres.yaml index 434db428..e5186aa0 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -53,6 +53,7 @@ spec: - nvidia-gdrdrv-device - nvidia-open-gpu-kernel-modules-lts - nvidia-open-gpu-kernel-modules-production + - nvidia-tegra-nvgpu - nvme-cli - px-fuse - soci-snapshotter diff --git a/drivers/nvidia-tegra-nvgpu/README.md b/drivers/nvidia-tegra-nvgpu/README.md new file mode 100644 index 00000000..a848d682 --- /dev/null +++ b/drivers/nvidia-tegra-nvgpu/README.md @@ -0,0 +1,68 @@ +# nvidia-tegra-nvgpu + +Talos system extension providing NVIDIA GA10B GPU kernel modules for **Jetson Orin NX / Orin Nano** (Tegra234 SoC). + +## Hardware + +| SoM | SoC | GPU | +|-----|-----|-----| +| Jetson Orin NX 8GB / 16GB | Tegra234 | GA10B (Ampere, 1024 CUDA cores) | +| Jetson Orin Nano 4GB / 8GB | Tegra234 | GA10B subset | + +## Modules + +| Module | Source | Purpose | +|--------|--------|---------| +| `host1x.ko` | OE4T/linux-nv-oot | Syncpoint allocator with GA10B `ERRATA_SYNCPT_INVALID_ID_0` fix | +| `host1x_fence.ko` | OE4T/linux-nv-oot | DMA fence bridge for GPU/CPU synchronization | +| `nvmap.ko` | OE4T/linux-nv-oot | GPU memory allocator (NVIDIA memory management API) | +| `mc_utils.ko` | OE4T/linux-nv-oot | Memory controller EMC frequency helper | +| `governor_pod_scaling.ko` | OE4T/linux-nv-oot | nvhost_podgov devfreq governor for dynamic GPU frequency scaling | +| `nvhost_ctrl_shim.ko` | in-tree | `/dev/nvhost-ctrl` bridge — provides `SYNCPT_WAITMEX` ioctl for JetPack 6 CUDA runtime | +| `nvgpu.ko` | OE4T/linux-nvgpu | Main GA10B GPU driver (Clang-compatible, kernel 6.18) | + +## Prerequisites + +This extension requires the following to be present on the node: + +1. **nvidia-firmware-ext** — GPU firmware blobs (PMU, GSP, etc.) from JetPack r36.5 +2. JetPack 6 BSP (L4T r36.x) flashed to the Jetson carrier board for EEPROM / PMIC firmware + +## Talos Configuration + +Module load order is critical. 
Add the following to your Talos machine configuration: + +```yaml +machine: + kernel: + modules: + - name: host1x + - name: host1x_fence + - name: nvhost_ctrl_shim + - name: nvmap + - name: mc_utils + - name: nvgpu + - name: governor_pod_scaling +``` + +## CDI + +Container Device Interface (CDI) is managed by the `nvidia-cdi-setup` DaemonSet from the +[sbc-jetson](https://github.com/siderolabs/sbc-jetson) overlay. It generates +`/run/cdi/nvidia-jetson.yaml` at runtime, exposing `/dev/nvgpu/igpu0/*`, +`/dev/nvhost-ctrl`, and `/dev/nvmap` to GPU workloads. + +## Performance + +Tested on Jetson Orin NX 16GB with Talos v1.13.0, kernel 6.18.24, CUDA 12.6 (JetPack 6.2): + +| Model | Quantization | tok/s | +|-------|-------------|-------| +| qwen2.5:0.5b | Q4_K_M | ~60 | +| qwen3:4b | Q4_K_M | ~16 | + +## References + +- [siderolabs/pkgs#1518](https://github.com/siderolabs/pkgs/pull/1518) — kernel package that builds these modules +- [OE4T/linux-nvgpu](https://github.com/OE4T/linux-nvgpu) — GA10B GPU driver (patches-r36 branch) +- [OE4T/linux-nv-oot](https://github.com/OE4T/linux-nv-oot) — NVIDIA OOT modules (patches-r36.5 branch) diff --git a/drivers/nvidia-tegra-nvgpu/manifest.yaml.tmpl b/drivers/nvidia-tegra-nvgpu/manifest.yaml.tmpl new file mode 100644 index 00000000..1518c87a --- /dev/null +++ b/drivers/nvidia-tegra-nvgpu/manifest.yaml.tmpl @@ -0,0 +1,23 @@ +version: v1alpha1 +metadata: + name: nvidia-tegra-nvgpu + version: "{{ .VERSION }}" + author: Alexander Schwankner + description: | + [{{ .TIER }}] This system extension provides NVIDIA Jetson Orin GPU kernel modules built against a specific Talos version. + Targets the NVIDIA Tegra GA10B (Ampere) GPU found in Jetson Orin NX / Orin Nano SoMs (Tegra234). 
+ + Modules included: + - host1x.ko — OE4T host1x syncpoint allocator (GA10B ERRATA_SYNCPT_INVALID_ID_0 fix) + - host1x_fence.ko — DMA fence bridge for GPU/CPU synchronization + - nvmap.ko — GPU memory allocator (NVIDIA memory management API) + - mc_utils.ko — Memory controller bandwidth helper + - governor_pod_scaling.ko — nvhost_podgov devfreq governor (dynamic GPU frequency) + - nvhost_ctrl_shim.ko — /dev/nvhost-ctrl bridge for JetPack 6 CUDA runtime (SYNCPT_WAITMEX) + - nvgpu.ko — Main GA10B GPU driver (OE4T patches, Clang-compatible) + + Requires the nvidia-firmware-ext extension for GPU firmware blobs (pmu_pkc_prod_sig.bin etc.). + Load order is critical and must be specified via machine.kernel.modules in the Talos config. + compatibility: + talos: + version: ">= v1.13.0" diff --git a/drivers/nvidia-tegra-nvgpu/pkg.yaml b/drivers/nvidia-tegra-nvgpu/pkg.yaml new file mode 100644 index 00000000..4b03f918 --- /dev/null +++ b/drivers/nvidia-tegra-nvgpu/pkg.yaml @@ -0,0 +1,25 @@ +name: nvidia-tegra-nvgpu +variant: scratch +shell: /bin/bash +dependencies: + - stage: base + # The pkgs version for a particular release of Talos as defined in + # https://github.com/siderolabs/talos/blob/<talos-version>/pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-tegra-nvgpu-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - install: + - | + mkdir -p /rootfs/usr/lib/modules + + cp -R /usr/lib/modules/* /rootfs/usr/lib/modules + - test: + - | + mkdir -p /extensions-validator-rootfs + cp -r /rootfs/ /extensions-validator-rootfs/rootfs + cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml + /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" +finalize: + - from: /rootfs + to: /rootfs + - from: /pkg/manifest.yaml + to: / diff --git a/drivers/nvidia-tegra-nvgpu/vars.yaml b/drivers/nvidia-tegra-nvgpu/vars.yaml new file mode 100644 index 00000000..3655a9aa --- /dev/null +++ b/drivers/nvidia-tegra-nvgpu/vars.yaml @@ -0,0 +1,3 @@ +# 
VERSION is composed from the pkgs build arg (presumably the siderolabs/pkgs revision that builds nvidia-tegra-nvgpu) and the extensions tag, so it should track pkgs bumps without a manual edit +VERSION: "{{ .BUILD_ARG_PKGS }}-{{ .BUILD_ARG_TAG }}" +TIER: "extra"