diff --git a/.kres.yaml b/.kres.yaml index 47ad44863..6311388b5 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -71,6 +71,7 @@ spec: - mellanox-mstflint-pkg - nvidia-open-gpu-kernel-modules-lts-pkg - nvidia-open-gpu-kernel-modules-production-pkg + - nvidia-tegra-nvgpu-pkg - px-fuse-pkg - tenstorrent-pkg - xdma-driver-pkg diff --git a/Pkgfile b/Pkgfile index c19c4e173..f185a1ac7 100644 --- a/Pkgfile +++ b/Pkgfile @@ -288,5 +288,19 @@ vars: gdrcopy_version: v2.5.2 gdrcopy_sha256: 32bc7b2c198dd97ec251de0ff4823252c95e31a4c79a5f843c82514c9af2052b gdrcopy_sha512: c717f118eff8cd5a8dc35613c3881818f8b71dc493461dd0151ce7c882f8e2c2d852e22733fab4e2bec57219e10eec874c11b4fad90dd4815ae572840ed19d28 + + # OE4T (NVIDIA Tegra) kernel modules for Jetson Orin NX (Tegra234 / GA10B) + # renovate: datasource=git-refs versioning=git depName=https://github.com/OE4T/linux-nvgpu.git + oe4t_nvgpu_commit: d530a48d64f9ad3020d9f3307f53e8dde8e3fba1 + oe4t_nvgpu_sha256: adc5864edf76d986866e386803a9e628ee229e69ea34867b92b978a0b44f3d54 + oe4t_nvgpu_sha512: a7c7f0b5d3174bf41abc77c77009f46182358f93936aedbe4993e63ff7fc94e21bfd83c3fa0b41af5836866b9c200427504d8f26685d567c11722e7a7bfd3ed9 + # renovate: datasource=git-refs versioning=git depName=https://github.com/OE4T/linux-nv-oot.git + oe4t_nv_oot_commit: ea32e7f97dd04c3f75aadc22424dc63568387120 + oe4t_nv_oot_sha256: 9d2d70a121a418be307e3d1cd3c74d9ae9398e7abc756304d614e998dfd6f342 + oe4t_nv_oot_sha512: 5645163e964bfb13d7aa2ee1749188fe40a1fe9012080f548548f7dc70e4397a762c161041d8d209d2cd969cbb4aab36ea5c560ef5967946eeb3f1dd16335b9c + # renovate: datasource=git-refs versioning=git depName=https://github.com/OE4T/linux-hwpm.git + oe4t_hwpm_commit: 4d8a6998760d85f98637dbf61597bfbb88158206 + oe4t_hwpm_sha256: 96c7656bdad0bf330e7fd58981b8a4eec4717a76840cefbe84e720d88b46be55 + oe4t_hwpm_sha512: 971b91fcae284c59dbe411356109bce9b1a7884b8fac41c9683c79bf3eddef606e71ebaa9c06ad2389b2ba382c3c1125fabe0cbaeb5edac857e218077ed24ef9 labels: org.opencontainers.image.source: 
https://github.com/siderolabs/pkgs diff --git a/nvidia-tegra-nvgpu/nvhost_ctrl_shim.c b/nvidia-tegra-nvgpu/nvhost_ctrl_shim.c new file mode 100644 index 000000000..14ab9ba10 --- /dev/null +++ b/nvidia-tegra-nvgpu/nvhost_ctrl_shim.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// nvhost_ctrl_shim.c — nvhost-ctrl userspace API shim for Talos Linux / Jetson Orin NX +// +// Per-ioctl trace logging uses pr_debug — enable at runtime with: +// echo "file nvhost_ctrl_shim.c +p" > /sys/kernel/debug/dynamic_debug/control +// +// Provides /dev/nvhost-ctrl with the NVHOST_IOCTL_CTRL_* interface, +// bridging to the OOT host1x syncpoint kernel API. +// +// This allows libnvrm_host1x.so (JetPack 6 CUDA runtime) to use hardware +// syncpoint interrupts for cudaStreamSynchronize — replacing the CPU semaphore +// busy-wait with interrupt-driven sync. +// +// Symbol dependencies (all from host1x.ko): +// host1x_syncpt_get_by_id_noref, host1x_syncpt_read, host1x_syncpt_read_max, +// host1x_fence_create, host1x_fence_extract +// +// Supported ioctls (from linux-nv-oot/include/uapi/linux/nvhost_ioctl.h): +// NVHOST_IOCTL_CTRL_GET_VERSION (7) → return 1 +// NVHOST_IOCTL_CTRL_SYNCPT_READ (1) → host1x_syncpt_read() +// NVHOST_IOCTL_CTRL_SYNCPT_READ_MAX (8) → host1x_syncpt_read_max() +// NVHOST_IOCTL_CTRL_SYNCPT_WAITMEX (9) → dma_fence_wait_timeout() [interrupt-driven] +// NVHOST_IOCTL_CTRL_SYNC_FENCE_CREATE (11) → host1x_fence_create() → sync_file fd +// NVHOST_IOCTL_CTRL_GET_CHARACTERISTICS (14) → return Orin hw syncpt info +// NVHOST_IOCTL_CTRL_POLL_FD_CREATE (16) → anon_inode fd for syncpt event polling +// NVHOST_IOCTL_CTRL_SYNC_FILE_EXTRACT (19) → sync_file fd → host1x_fence_extract() +// +// Targets kernel 6.18 (Talos v1.12.6): +// - class_create() without THIS_MODULE (kernel 6.4+) +// - devnode() callback with const struct device * (kernel 6.2+) +// - close_fd() (kernel 5.11+, replaces __close_fd) +// +// CUDA 12.6 (JetPack 6) call sequence: +// 1. 
open(/dev/nvhost-ctrl) +// 2. GET_CHARACTERISTICS (nr=14): discover num_syncpts=704 etc. +// 3. SYNCPT_WAITMEX (nr=9): blocking wait for syncpt id/thresh → interrupt-driven +// 4. POLL_FD_CREATE (nr=16): once at GPU scaling init — creates anonymous poll fd +// Note: SYNC_FENCE_CREATE (nr=11) is NOT called by CUDA 12.6 directly but kept +// for other potential callers (e.g. media codecs, test tools). + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ── NVHOST uapi structs (from linux-nv-oot/include/uapi/linux/nvhost_ioctl.h) ── +// Embedded directly to avoid uapi header path issues in OOT build. + +#define NVHOST_IOCTL_MAGIC 'H' + +// nr=7: GET_VERSION +struct nvhost_get_param_args { + __u32 value; +} __packed; + +// nr=1,8: SYNCPT_READ / SYNCPT_READ_MAX +struct nvhost_ctrl_syncpt_read_args { + __u32 id; + __u32 value; +}; + +// nr=9: SYNCPT_WAITMEX — blocking wait until syncpt.value >= thresh +struct nvhost_ctrl_syncpt_waitmex_args { + __u32 id; /* syncpoint id (in) */ + __u32 thresh; /* wait until value >= thresh (in) */ + __s32 timeout; /* timeout in ms; -1 = wait forever (in) */ + __u32 value; /* syncpt value after wait (out) */ + __u32 tv_sec; /* timestamp seconds (out) */ + __u32 tv_nsec; /* timestamp nanoseconds (out) */ + __u32 clock_id; /* clock selector (in, ignored) */ + __u32 reserved; +}; + +// nr=10,11: SYNC_FENCE_CREATE (nr=10 is 32-bit compat, nr=11 is 64-bit) +struct nvhost_ctrl_sync_fence_info { + __u32 id; + __u32 thresh; +}; + +struct nvhost_ctrl_sync_fence_create_args { + __u32 num_pts; + __s32 fence_fd; + __u64 pts; /* struct nvhost_ctrl_sync_fence_info __user * */ + __u64 name; /* const char __user * — ignored, fences are anonymous */ +}; + +// nr=14: GET_CHARACTERISTICS — host1x capability discovery +struct nvhost_characteristics { +#define NVHOST_CHARACTERISTICS_GFILTER (1 << 0) +#define 
NVHOST_CHARACTERISTICS_RESOURCE_PER_CHANNEL_INSTANCE (1 << 1) +#define NVHOST_CHARACTERISTICS_SUPPORT_PREFENCES (1 << 2) + __u64 flags; + __u32 num_mlocks; + __u32 num_syncpts; + __u32 syncpts_base; + __u32 syncpts_limit; + __u32 num_hw_pts; + __u32 padding; +}; + +struct nvhost_ctrl_get_characteristics { + __u64 nvhost_characteristics_buf_size; + __u64 nvhost_characteristics_buf_addr; +}; + +// nr=16: POLL_FD_CREATE — creates an anonymous fd for syncpoint event polling. +// Called once by gk20a_scale_init during GPU frequency-scaling setup. +// The fd is used with poll()/epoll() to wait for syncpoint threshold events. +// Our implementation returns a real anonymous inode fd so callers get a valid +// file descriptor without ENOTTY; the fd is pollable (returns POLLHUP on close). +struct nvhost_ctrl_poll_fd_create_args { + __s32 fd; /* out: anonymous poll fd */ + __u32 padding; +}; + +// nr=19: SYNC_FILE_EXTRACT +struct nvhost_ctrl_sync_file_extract { + __s32 fd; + __u32 num_fences; + __u64 fences_ptr; /* struct nvhost_ctrl_sync_fence_info __user * */ +}; + +// ── Ioctl definitions ───────────────────────────────────────────────────────── + +#define NVHOST_IOCTL_CTRL_SYNCPT_READ \ + _IOWR(NVHOST_IOCTL_MAGIC, 1, struct nvhost_ctrl_syncpt_read_args) +#define NVHOST_IOCTL_CTRL_GET_VERSION \ + _IOR(NVHOST_IOCTL_MAGIC, 7, struct nvhost_get_param_args) +#define NVHOST_IOCTL_CTRL_SYNCPT_READ_MAX \ + _IOWR(NVHOST_IOCTL_MAGIC, 8, struct nvhost_ctrl_syncpt_read_args) +#define NVHOST_IOCTL_CTRL_SYNCPT_WAITMEX \ + _IOWR(NVHOST_IOCTL_MAGIC, 9, struct nvhost_ctrl_syncpt_waitmex_args) +#define NVHOST_IOCTL_CTRL_SYNC_FENCE_CREATE \ + _IOWR(NVHOST_IOCTL_MAGIC, 11, struct nvhost_ctrl_sync_fence_create_args) +#define NVHOST_IOCTL_CTRL_GET_CHARACTERISTICS \ + _IOWR(NVHOST_IOCTL_MAGIC, 14, struct nvhost_ctrl_get_characteristics) +#define NVHOST_IOCTL_CTRL_POLL_FD_CREATE \ + _IOR(NVHOST_IOCTL_MAGIC, 16, struct nvhost_ctrl_poll_fd_create_args) +#define NVHOST_IOCTL_CTRL_SYNC_FILE_EXTRACT 
\
+	_IOWR(NVHOST_IOCTL_MAGIC, 19, struct nvhost_ctrl_sync_file_extract)
+
+// Jetson Orin (Tegra234) hardware syncpoint count
+#define ORIN_NUM_SYNCPTS 704
+
+// ── Module state ──────────────────────────────────────────────────────────────
+
+static struct {
+	struct class *class;
+	struct cdev cdev;
+	struct device *dev;
+	dev_t devt;
+} nvhost_shim;
+
+// ── host1x device lookup ──────────────────────────────────────────────────────
+
+static const struct of_device_id host1x_of_match[] = {
+	{ .compatible = "nvidia,tegra234-host1x" },
+	{ .compatible = "nvidia,tegra194-host1x" },
+	{ .compatible = "nvidia,tegra186-host1x" },
+	{},
+};
+
+/*
+ * get_host1x() - resolve the host1x driver data through the device tree.
+ *
+ * Finds the first OF node matching host1x_of_match, maps it to its
+ * platform device and returns that device's drvdata.
+ *
+ * Return: the drvdata pointer, ERR_PTR(-ENODEV) when no OF node matches,
+ * or ERR_PTR(-EAGAIN) when the platform device or its drvdata is missing.
+ */
+static struct host1x *get_host1x(void)
+{
+	struct device_node *node = of_find_matching_node(NULL, host1x_of_match);
+	struct platform_device *host_pdev;
+	void *priv;
+
+	if (!node) {
+		pr_err_ratelimited("nvhost-ctrl-shim: no host1x OF node found\n");
+		return ERR_PTR(-ENODEV);
+	}
+
+	/* The node reference is only needed for the device lookup itself. */
+	host_pdev = of_find_device_by_node(node);
+	of_node_put(node);
+	if (!host_pdev) {
+		pr_err_ratelimited("nvhost-ctrl-shim: no host1x platform_device\n");
+		return ERR_PTR(-EAGAIN);
+	}
+
+	/*
+	 * Drop the device reference taken by of_find_device_by_node() as soon
+	 * as drvdata has been read. NOTE(review): this relies on drvdata
+	 * staying valid while host1x.ko is loaded (original author's
+	 * assumption) — confirm against driver unbind.
+	 */
+	priv = platform_get_drvdata(host_pdev);
+	put_device(&host_pdev->dev);
+	if (priv)
+		return priv;
+
+	pr_err_ratelimited("nvhost-ctrl-shim: host1x drvdata is NULL\n");
+	return ERR_PTR(-EAGAIN);
+}
+
+// ── File operations ───────────────────────────────────────────────────────────
+
+/*
+ * nvhost_ctrl_open() - open handler for the control node.
+ *
+ * Looks up host1x on every open and caches it in file->private_data for
+ * the ioctl handlers. Fails with the lookup's error code otherwise.
+ */
+static int nvhost_ctrl_open(struct inode *inode, struct file *file)
+{
+	struct host1x *host = get_host1x();
+
+	if (!IS_ERR(host)) {
+		pr_debug("nvhost-ctrl-shim: opened (pid %d)\n", current->pid);
+		file->private_data = host;
+		return 0;
+	}
+
+	pr_err("nvhost-ctrl-shim: open failed, get_host1x=%ld\n",
+	       PTR_ERR(host));
+	return PTR_ERR(host);
+}
+
+// ── NVHOST_IOCTL_CTRL_SYNCPT_READ / SYNCPT_READ_MAX ──────────────────────────
+
+/*
+ * ioctl_syncpt_read() - shared handler for SYNCPT_READ and SYNCPT_READ_MAX.
+ * @host1x:   host1x instance cached at open time
+ * @data:     user pointer to struct nvhost_ctrl_syncpt_read_args
+ * @read_max: true selects host1x_syncpt_read_max(), false host1x_syncpt_read()
+ *
+ * Return: 0 on success, -EFAULT on a failed user copy, -EINVAL when the
+ * requested syncpoint id does not resolve.
+ */
+static int ioctl_syncpt_read(struct host1x *host1x, void __user *data,
+			     bool read_max)
+{
+	struct nvhost_ctrl_syncpt_read_args req;
+	struct host1x_syncpt *syncpt;
+
+	if (copy_from_user(&req, data, sizeof(req)))
+		return -EFAULT;
+
+	syncpt = host1x_syncpt_get_by_id_noref(host1x, req.id);
+	if (!syncpt) {
+		pr_err_ratelimited("nvhost-ctrl-shim: SYNCPT_READ%s: id=%u not found\n",
+				   read_max ? "_MAX" : "", req.id);
+		return -EINVAL;
+	}
+
+	if (read_max)
+		req.value = host1x_syncpt_read_max(syncpt);
+	else
+		req.value = host1x_syncpt_read(syncpt);
+
+	if (copy_to_user(data, &req, sizeof(req)))
+		return -EFAULT;
+
+	return 0;
+}
+
+// ── NVHOST_IOCTL_CTRL_SYNCPT_WAITMEX ─────────────────────────────────────────
+// Blocking wait until syncpt[id].value >= thresh, using interrupt-driven
+// dma_fence_wait_timeout (host1x hardware interrupt, NOT CPU busy-poll).
+
+/* Lower bound applied to every finite wait requested by userspace. */
+#define NVHOST_SHIM_MIN_WAIT_MS 30000U
+
+/*
+ * ioctl_syncpt_waitmex() - handle NVHOST_IOCTL_CTRL_SYNCPT_WAITMEX.
+ * @host1x: host1x instance cached at open time
+ * @data:   user pointer to struct nvhost_ctrl_syncpt_waitmex_args
+ *
+ * Creates a host1x fence for (id, thresh) and sleeps interruptibly on it.
+ * On success the current syncpoint value is written back; tv_sec/tv_nsec
+ * are always reported as 0 and clock_id is ignored.
+ *
+ * All diagnostics on paths reachable with arbitrary user input are
+ * rate-limited: the device node is created with mode 0666, so an
+ * unprivileged caller must not be able to flood the kernel log.
+ *
+ * Return: 0 on success, -EFAULT on a failed user copy, -EINVAL for an
+ * unknown syncpoint id, -ETIMEDOUT when the wait expires, -ERESTARTSYS
+ * when interrupted by a signal, or the error from host1x_fence_create().
+ */
+static int ioctl_syncpt_waitmex(struct host1x *host1x, void __user *data)
+{
+	struct nvhost_ctrl_syncpt_waitmex_args args;
+	struct host1x_syncpt *sp;
+	struct dma_fence *fence;
+	long timeout_jiffies;
+	long ret;
+
+	if (copy_from_user(&args, data, sizeof(args)))
+		return -EFAULT;
+
+	pr_debug("nvhost-ctrl-shim: SYNCPT_WAITMEX id=%u thresh=%u timeout=%d\n",
+		 args.id, args.thresh, args.timeout);
+
+	sp = host1x_syncpt_get_by_id_noref(host1x, args.id);
+	if (!sp) {
+		pr_err_ratelimited("nvhost-ctrl-shim: SYNCPT_WAITMEX id=%u not found\n",
+				   args.id);
+		return -EINVAL;
+	}
+
+	// timeout: -1 → wait forever; 0 → wait forever; >0 → milliseconds.
+	//
+	// GA10B (Jetson Orin NX) is slower than desktop GPUs. CUDA's built-in
+	// timeout for cudaStreamSynchronize may expire before large-model kernels
+	// (e.g. qwen2.5:7b warmup with 311 MiB compute buffer) complete on GA10B.
+	// Enforce a minimum wait of 30 s so slow-but-valid kernels are not aborted.
+	// This also logs the CUDA-requested timeout for diagnostics.
+	if (args.timeout <= 0) {
+		// Negative and zero both mean "no timeout specified".
+		timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+	} else {
+		// Clamp to the minimum so GA10B large-model kernels are not
+		// prematurely killed by CUDA's default short timeout.
+		unsigned int timeout_ms = max_t(unsigned int,
+						(unsigned int)args.timeout,
+						NVHOST_SHIM_MIN_WAIT_MS);
+
+		pr_debug("nvhost-ctrl-shim: SYNCPT_WAITMEX cuda_timeout=%dms → using %ums\n",
+			 args.timeout, timeout_ms);
+		timeout_jiffies = msecs_to_jiffies(timeout_ms);
+	}
+
+	// Create a fence that signals when syncpt reaches thresh
+	fence = host1x_fence_create(sp, args.thresh, true);
+	if (IS_ERR(fence)) {
+		pr_err_ratelimited("nvhost-ctrl-shim: SYNCPT_WAITMEX fence_create failed: %ld\n",
+				   PTR_ERR(fence));
+		return PTR_ERR(fence);
+	}
+
+	// Sleep until hardware interrupt signals the fence
+	ret = dma_fence_wait_timeout(fence, true, timeout_jiffies);
+	dma_fence_put(fence);
+
+	/* A pending signal is normal operation, not an error worth logging. */
+	if (ret == -ERESTARTSYS)
+		return ret;
+	if (ret < 0) {
+		pr_err_ratelimited("nvhost-ctrl-shim: SYNCPT_WAITMEX wait error: %ld\n",
+				   ret);
+		return ret;
+	}
+	if (ret == 0) {
+		pr_warn_ratelimited("nvhost-ctrl-shim: SYNCPT_WAITMEX timeout id=%u thresh=%u (cuda_timeout=%dms)\n",
+				    args.id, args.thresh, args.timeout);
+		return -ETIMEDOUT;
+	}
+
+	args.value = host1x_syncpt_read(sp);
+	args.tv_sec = 0;
+	args.tv_nsec = 0;
+
+	pr_debug("nvhost-ctrl-shim: SYNCPT_WAITMEX done id=%u value=%u\n",
+		 args.id, args.value);
+
+	return copy_to_user(data, &args, sizeof(args)) ? -EFAULT : 0;
+}
+
+// ── NVHOST_IOCTL_CTRL_GET_CHARACTERISTICS ────────────────────────────────────
+// CUDA calls this on every open to discover available syncpoints.
+
+/*
+ * ioctl_get_characteristics() - handle NVHOST_IOCTL_CTRL_GET_CHARACTERISTICS.
+ * @data: user pointer to struct nvhost_ctrl_get_characteristics
+ *
+ * Reports a fixed capability record sized for ORIN_NUM_SYNCPTS syncpoints.
+ * When the caller supplies a buffer address, at most buf_size bytes of the
+ * record are copied into it; a zero address is a pure size query. In both
+ * cases buf_size is rewritten to the full record size.
+ *
+ * Return: 0 on success, -EFAULT on any failed user copy.
+ */
+static int ioctl_get_characteristics(void __user *data)
+{
+	/* Members not named here (num_mlocks, syncpts_base, padding) are zero. */
+	struct nvhost_characteristics caps = {
+		.flags         = NVHOST_CHARACTERISTICS_SUPPORT_PREFENCES,
+		.num_syncpts   = ORIN_NUM_SYNCPTS,
+		.syncpts_limit = ORIN_NUM_SYNCPTS,
+		.num_hw_pts    = ORIN_NUM_SYNCPTS,
+	};
+	struct nvhost_ctrl_get_characteristics query;
+
+	if (copy_from_user(&query, data, sizeof(query)))
+		return -EFAULT;
+
+	pr_debug("nvhost-ctrl-shim: GET_CHARACTERISTICS buf_size=%llu\n",
+		 query.nvhost_characteristics_buf_size);
+
+	if (query.nvhost_characteristics_buf_addr) {
+		/* Never write past the caller's buffer nor past the record. */
+		__u64 nbytes = min_t(__u64,
+				     query.nvhost_characteristics_buf_size,
+				     sizeof(caps));
+		void __user *dst =
+			u64_to_user_ptr(query.nvhost_characteristics_buf_addr);
+
+		if (copy_to_user(dst, &caps, nbytes))
+			return -EFAULT;
+	}
+
+	/* Size query and full query both report the required size. */
+	query.nvhost_characteristics_buf_size = sizeof(caps);
+	if (copy_to_user(data, &query, sizeof(query)))
+		return -EFAULT;
+
+	return 0;
+}
+
+// ── NVHOST_IOCTL_CTRL_POLL_FD_CREATE ─────────────────────────────────────────
+// Creates an anonymous inode fd for syncpoint event polling.
+// Called once by gk20a_scale_init (GPU frequency scaling); NOT in the CUDA
+// inference hot-path. Returns a real pollable fd so callers can select()/epoll()
+// without getting ENOTTY. The fd is a minimal anon inode — it does not deliver
+// syncpoint threshold events, but it is a valid open file descriptor.
+
+/* poll() handler for the anonymous fd handed out by POLL_FD_CREATE. */
+static __poll_t nvhost_ctrl_poll_fd_poll(struct file *file, poll_table *wait)
+{
+	/* Never signals readiness — callers use SYNCPT_WAITMEX for real waits */
+	return 0;
+}
+
+static const struct file_operations nvhost_ctrl_poll_fops = {
+	.owner = THIS_MODULE,
+	.poll  = nvhost_ctrl_poll_fd_poll,
+};
+
+/*
+ * ioctl_poll_fd_create() - handle NVHOST_IOCTL_CTRL_POLL_FD_CREATE.
+ * @data: user pointer to struct nvhost_ctrl_poll_fd_create_args (out only)
+ *
+ * The fd number is reserved first and only installed after the result has
+ * been copied to userspace. Once fd_install() has run, another thread of
+ * the caller may already have closed and reused the number, so an
+ * installed fd must never be "undone" with close_fd().
+ *
+ * Return: 0 on success, -EFAULT on a failed user copy, or the error from
+ * fd reservation / anonymous file creation.
+ */
+static int ioctl_poll_fd_create(void __user *data)
+{
+	struct nvhost_ctrl_poll_fd_create_args args = { .padding = 0 };
+	struct file *filp;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		pr_err_ratelimited("nvhost-ctrl-shim: POLL_FD_CREATE: fd allocation failed: %d\n",
+				   fd);
+		return fd;
+	}
+
+	filp = anon_inode_getfile("nvhost-ctrl-poll", &nvhost_ctrl_poll_fops,
+				  NULL, O_RDWR | O_CLOEXEC);
+	if (IS_ERR(filp)) {
+		pr_err_ratelimited("nvhost-ctrl-shim: POLL_FD_CREATE: anon_inode_getfile failed: %ld\n",
+				   PTR_ERR(filp));
+		put_unused_fd(fd);
+		return PTR_ERR(filp);
+	}
+
+	args.fd = fd;
+	if (copy_to_user(data, &args, sizeof(args))) {
+		/* Nothing is visible to userspace yet: plain unwind. */
+		fput(filp);
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	fd_install(fd, filp);
+	pr_debug("nvhost-ctrl-shim: POLL_FD_CREATE → fd=%d\n", fd);
+	return 0;
+}
+
+// ── NVHOST_IOCTL_CTRL_SYNC_FENCE_CREATE ──────────────────────────────────────
+
+/* Upper bound on the number of (id, thresh) pairs accepted per request. */
+#define NVHOST_SHIM_MAX_FENCE_PTS 512
+
+/*
+ * make_single_fence() - build one host1x fence for (sp, thresh).
+ *
+ * Return: the new fence (caller owns one reference) or an ERR_PTR.
+ */
+static struct dma_fence *make_single_fence(struct host1x_syncpt *sp, u32 thresh)
+{
+	struct dma_fence *f = host1x_fence_create(sp, thresh, true);
+
+	if (IS_ERR(f))
+		pr_err_ratelimited("nvhost-ctrl-shim: host1x_fence_create thresh=%u err=%ld\n",
+				   thresh, PTR_ERR(f));
+	return f;
+}
+
+/*
+ * make_array_fence() - build a dma_fence_array from a user array of points.
+ * @host1x:   host1x instance used to resolve syncpoint ids
+ * @pts_user: user array of @num_pts struct nvhost_ctrl_sync_fence_info
+ * @num_pts:  number of entries (validated by the caller)
+ *
+ * Return: the array's base fence (caller owns one reference) or an
+ * ERR_PTR: -ENOMEM, -EFAULT, -EINVAL for an unknown id, or the error
+ * from host1x_fence_create().
+ */
+static struct dma_fence *make_array_fence(struct host1x *host1x,
+					  struct nvhost_ctrl_sync_fence_info __user *pts_user,
+					  u32 num_pts)
+{
+	struct nvhost_ctrl_sync_fence_info pt;
+	struct dma_fence_array *arr;
+	struct dma_fence **fences;
+	struct host1x_syncpt *sp;
+	int err;
+	u32 i;
+
+	fences = kcalloc(num_pts, sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < num_pts; i++) {
+		if (copy_from_user(&pt, pts_user + i, sizeof(pt))) {
+			err = -EFAULT;
+			goto free_fences;
+		}
+		sp = host1x_syncpt_get_by_id_noref(host1x, pt.id);
+		if (!sp) {
+			err = -EINVAL;
+			goto free_fences;
+		}
+		fences[i] = host1x_fence_create(sp, pt.thresh, true);
+		if (IS_ERR(fences[i])) {
+			err = PTR_ERR(fences[i]);
+			/* Keep the slot NULL so the unwind loop skips it. */
+			fences[i] = NULL;
+			goto free_fences;
+		}
+	}
+
+	/* dma_fence_array_create takes ownership of fences[] on success */
+	arr = dma_fence_array_create(num_pts, fences,
+				     dma_fence_context_alloc(1), 1, false);
+	if (!arr) {
+		err = -ENOMEM;
+		goto free_fences;
+	}
+
+	return &arr->base;
+
+free_fences:
+	for (i = 0; i < num_pts; i++)
+		if (fences[i])
+			dma_fence_put(fences[i]);
+	kfree(fences);
+	return ERR_PTR(err);
+}
+
+/*
+ * ioctl_sync_fence_create() - handle NVHOST_IOCTL_CTRL_SYNC_FENCE_CREATE.
+ * @host1x: host1x instance cached at open time
+ * @data:   user pointer to struct nvhost_ctrl_sync_fence_create_args
+ *
+ * Wraps one fence (num_pts == 1) or a fence array (num_pts > 1) in a
+ * sync_file and returns its fd in args.fence_fd. The name field is
+ * ignored. As in ioctl_poll_fd_create(), the fd is installed only after
+ * the result reached userspace, so failure never needs close_fd().
+ *
+ * Return: 0 on success, -EFAULT on a failed user copy, -EINVAL for a bad
+ * point count or unknown syncpoint id, -ENOMEM, or a fence creation error.
+ */
+static int ioctl_sync_fence_create(struct host1x *host1x, void __user *data)
+{
+	struct nvhost_ctrl_sync_fence_info __user *pts_user;
+	struct nvhost_ctrl_sync_fence_create_args args;
+	struct nvhost_ctrl_sync_fence_info pt;
+	struct host1x_syncpt *sp;
+	struct sync_file *sfile;
+	struct dma_fence *fence;
+	int fd, err;
+
+	if (copy_from_user(&args, data, sizeof(args)))
+		return -EFAULT;
+
+	pr_debug("nvhost-ctrl-shim: SYNC_FENCE_CREATE num_pts=%u\n", args.num_pts);
+
+	if (args.num_pts == 0 || args.num_pts > NVHOST_SHIM_MAX_FENCE_PTS)
+		return -EINVAL;
+
+	pts_user = u64_to_user_ptr(args.pts);
+
+	if (args.num_pts == 1) {
+		if (copy_from_user(&pt, pts_user, sizeof(pt)))
+			return -EFAULT;
+		pr_debug("nvhost-ctrl-shim: SYNC_FENCE_CREATE id=%u thresh=%u\n",
+			 pt.id, pt.thresh);
+		sp = host1x_syncpt_get_by_id_noref(host1x, pt.id);
+		if (!sp) {
+			pr_err_ratelimited("nvhost-ctrl-shim: SYNC_FENCE_CREATE id=%u not found\n",
+					   pt.id);
+			return -EINVAL;
+		}
+		fence = make_single_fence(sp, pt.thresh);
+	} else {
+		fence = make_array_fence(host1x, pts_user, args.num_pts);
+	}
+
+	if (IS_ERR(fence)) {
+		err = PTR_ERR(fence);
+		goto fail;
+	}
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		dma_fence_put(fence);
+		err = fd;
+		goto fail;
+	}
+
+	/* sync_file_create() takes its own reference on the fence. */
+	sfile = sync_file_create(fence);
+	dma_fence_put(fence);
+	if (!sfile) {
+		put_unused_fd(fd);
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	args.fence_fd = fd;
+	if (copy_to_user(data, &args, sizeof(args))) {
+		/* fd not installed yet: drop the file and the reservation. */
+		fput(sfile->file);
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	fd_install(fd, sfile->file);
+	pr_debug("nvhost-ctrl-shim: SYNC_FENCE_CREATE → fd=%d\n", fd);
+	return 0;
+
+fail:
+	pr_err_ratelimited("nvhost-ctrl-shim: SYNC_FENCE_CREATE failed: %d\n", err);
+	return err;
+}
+
+// ── NVHOST_IOCTL_CTRL_SYNC_FILE_EXTRACT ──────────────────────────────────────
+
+/*
+ * ioctl_sync_file_extract() - handle NVHOST_IOCTL_CTRL_SYNC_FILE_EXTRACT.
+ * @host1x: unused; kept for a uniform handler signature
+ * @data:   user pointer to struct nvhost_ctrl_sync_file_extract
+ *
+ * Decomposes the sync_file behind args.fd into (id, thresh) pairs. At most
+ * the caller-supplied num_fences pairs are written to fences_ptr, while
+ * num_fences is always updated to the total number of pairs found.
+ * Fences that fail extraction with -EINVAL but are already signaled are
+ * skipped.
+ *
+ * Return: 0 on success, -EFAULT on a failed user copy, -EINVAL when the
+ * fd carries no fence, or the error from host1x_fence_extract().
+ */
+static int ioctl_sync_file_extract(struct host1x *host1x, void __user *data)
+{
+	struct nvhost_ctrl_sync_fence_info __user *fences_user;
+	struct nvhost_ctrl_sync_file_extract args;
+	struct dma_fence *fence, **fences;
+	struct dma_fence_array *array;
+	unsigned int num_fences, i, j;
+	int err = 0;
+
+	if (copy_from_user(&args, data, sizeof(args)))
+		return -EFAULT;
+
+	fences_user = u64_to_user_ptr(args.fences_ptr);
+
+	fence = sync_file_get_fence(args.fd);
+	if (!fence)
+		return -EINVAL;
+
+	/* Treat a plain fence as a one-element array. */
+	array = to_dma_fence_array(fence);
+	if (array) {
+		fences = array->fences;
+		num_fences = array->num_fences;
+	} else {
+		fences = &fence;
+		num_fences = 1;
+	}
+
+	/* i walks the source fences, j counts the pairs actually reported. */
+	for (i = 0, j = 0; i < num_fences; i++) {
+		struct nvhost_ctrl_sync_fence_info fi;
+
+		err = host1x_fence_extract(fences[i], &fi.id, &fi.thresh);
+		if (err == -EINVAL && dma_fence_is_signaled(fences[i])) {
+			/* signaled stub fence — skip */
+			err = 0;
+			continue;
+		}
+		if (err)
+			goto put;
+
+		if (j < args.num_fences) {
+			if (copy_to_user(fences_user + j, &fi, sizeof(fi))) {
+				err = -EFAULT;
+				goto put;
+			}
+		}
+		j++;
+	}
+
+	args.num_fences = j;
+	if (copy_to_user(data, &args, sizeof(args)))
+		err = -EFAULT;
+
+put:
+	dma_fence_put(fence);
+	return err;
+}
+
+// ── Ioctl dispatcher ──────────────────────────────────────────────────────────
+
+/*
+ * nvhost_ctrl_ioctl() - dispatch NVHOST_IOCTL_CTRL_* requests.
+ *
+ * Unknown commands are logged (rate-limited) and rejected with -ENOTTY.
+ */
+static long nvhost_ctrl_ioctl(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct host1x *host1x = file->private_data;
+	void __user *data = (void __user *)arg;
+
+	switch (cmd) {
+	case NVHOST_IOCTL_CTRL_GET_VERSION: {
+		struct nvhost_get_param_args v = { .value = 1 };
+
+		pr_debug("nvhost-ctrl-shim: GET_VERSION → 1\n");
+		return copy_to_user(data, &v, sizeof(v)) ? -EFAULT : 0;
+	}
+	case NVHOST_IOCTL_CTRL_SYNCPT_READ:
+		return ioctl_syncpt_read(host1x, data, false);
+	case NVHOST_IOCTL_CTRL_SYNCPT_READ_MAX:
+		return ioctl_syncpt_read(host1x, data, true);
+	case NVHOST_IOCTL_CTRL_SYNCPT_WAITMEX:
+		return ioctl_syncpt_waitmex(host1x, data);
+	case NVHOST_IOCTL_CTRL_SYNC_FENCE_CREATE:
+		return ioctl_sync_fence_create(host1x, data);
+	case NVHOST_IOCTL_CTRL_GET_CHARACTERISTICS:
+		return ioctl_get_characteristics(data);
+	case NVHOST_IOCTL_CTRL_POLL_FD_CREATE:
+		return ioctl_poll_fd_create(data);
+	case NVHOST_IOCTL_CTRL_SYNC_FILE_EXTRACT:
+		return ioctl_sync_file_extract(host1x, data);
+	default:
+		pr_warn_ratelimited("nvhost-ctrl-shim: unknown ioctl cmd=0x%08x\n", cmd);
+		return -ENOTTY;
+	}
+}
+
+// ── Device class / cdev setup ─────────────────────────────────────────────────
+
+static char *nvhost_ctrl_devnode(const struct device *dev, umode_t *mode)
+{
+	/* 0666: the CUDA runtime (libnvrm_host1x.so) opens this device as the
+	 * container user; Talos runs a single-workload model so a node that
+	 * is readable and writable by everyone is acceptable. Guard mode for
+	 * callers that pass NULL.
+	 */
+	if (mode)
+		*mode = 0666;
+	return NULL;
+}
+
+static const struct file_operations nvhost_ctrl_fops = {
+	.owner          = THIS_MODULE,
+	.open           = nvhost_ctrl_open,
+	.unlocked_ioctl = nvhost_ctrl_ioctl,
+	.compat_ioctl   = nvhost_ctrl_ioctl,
+};
+
+// ── Module init / exit ────────────────────────────────────────────────────────
+
+/*
+ * Registers, in order: a char device region, the device class, the cdev
+ * and the device node. Each failure unwinds what was set up before it.
+ */
+static int __init nvhost_ctrl_shim_init(void)
+{
+	dev_t devt;
+	int err;
+
+	err = alloc_chrdev_region(&devt, 0, 1, "nvhost-ctrl");
+	if (err)
+		return err;
+
+	nvhost_shim.class = class_create("nvhost-ctrl");
+	if (IS_ERR(nvhost_shim.class)) {
+		err = PTR_ERR(nvhost_shim.class);
+		goto unregister;
+	}
+	nvhost_shim.class->devnode = nvhost_ctrl_devnode;
+
+	cdev_init(&nvhost_shim.cdev, &nvhost_ctrl_fops);
+	err = cdev_add(&nvhost_shim.cdev, devt, 1);
+	if (err)
+		goto destroy_class;
+
+	nvhost_shim.dev = device_create(nvhost_shim.class, NULL,
+					devt, NULL, "nvhost-ctrl");
+	if (IS_ERR(nvhost_shim.dev)) {
+		err = PTR_ERR(nvhost_shim.dev);
+		goto del_cdev;
+	}
+
+	nvhost_shim.devt = devt;
+	pr_info("nvhost-ctrl-shim: /dev/nvhost-ctrl ready (major %d)\n",
+		MAJOR(devt));
+	return 0;
+
+del_cdev:
+	cdev_del(&nvhost_shim.cdev);
+destroy_class:
+	class_destroy(nvhost_shim.class);
+unregister:
+	unregister_chrdev_region(devt, 1);
+	return err;
+}
+
+/* Tears everything down in the reverse order of nvhost_ctrl_shim_init(). */
+static void __exit nvhost_ctrl_shim_exit(void)
+{
+	device_destroy(nvhost_shim.class, nvhost_shim.devt);
+	cdev_del(&nvhost_shim.cdev);
+	class_destroy(nvhost_shim.class);
+	unregister_chrdev_region(nvhost_shim.devt, 1);
+}
+
+module_init(nvhost_ctrl_shim_init);
+module_exit(nvhost_ctrl_shim_exit);
+
+MODULE_DESCRIPTION("nvhost-ctrl shim — NVHOST ioctl API over OOT host1x for Talos Jetson");
+MODULE_LICENSE("GPL");
diff --git a/nvidia-tegra-nvgpu/patches/0001-nvgpu-syncpt-retry-errata-id0.patch b/nvidia-tegra-nvgpu/patches/0001-nvgpu-syncpt-retry-errata-id0.patch
new file mode 100644
index 000000000..07bdc3665
--- /dev/null
+++ b/nvidia-tegra-nvgpu/patches/0001-nvgpu-syncpt-retry-errata-id0.patch
@@
-0,0 +1,60 @@ +--- a/drivers/gpu/nvgpu/os/linux/nvhost_host1x.c 2026-04-27 22:24:47 ++++ b/drivers/gpu/nvgpu/os/linux/nvhost_host1x.c 2026-04-27 22:25:13 +@@ -14,6 +14,7 @@ + * along with this program. If not, see . + */ + ++#include + #include + #include + #include +@@ -237,17 +238,41 @@ + u32 nvgpu_nvhost_get_syncpt_client_managed(struct nvgpu_nvhost_dev *nvhost_dev, + const char *syncpt_name) + { +- struct host1x_syncpt *sp; ++ struct host1x_syncpt *sp = NULL; + struct host1x *host1x; ++ int retry; + +- host1x = platform_get_drvdata(nvhost_dev->host1x_pdev); +- if (!host1x) +- return 0; +- +- sp = host1x_syncpt_alloc(host1x, HOST1X_SYNCPT_CLIENT_MANAGED | HOST1X_SYNCPT_GPU, +- syncpt_name); +- if (!sp) ++ /* ++ * GA10B ERRATA_SYNCPT_INVALID_ID_0: nvgpu rejects syncpt id=0. ++ * During ~1-2ms after the first kernel submit, host1x_syncpt_alloc ++ * may return NULL or id=0, causing CUDA error 999 on ++ * cudaStreamSynchronize. Retry up to 5ms. ++ */ ++ for (retry = 0; retry < 5; retry++) { ++ host1x = platform_get_drvdata(nvhost_dev->host1x_pdev); ++ if (!host1x) { ++ pr_warn_ratelimited("nvgpu: host1x not ready, syncpt retry %d/5\n", ++ retry + 1); ++ msleep(1); ++ continue; ++ } ++ sp = host1x_syncpt_alloc(host1x, ++ HOST1X_SYNCPT_CLIENT_MANAGED | HOST1X_SYNCPT_GPU, ++ syncpt_name); ++ if (sp && host1x_syncpt_id(sp) != 0U) ++ break; ++ if (sp) { ++ host1x_syncpt_put(sp); ++ sp = NULL; ++ } ++ pr_warn_ratelimited("nvgpu: syncpt_alloc returned id=0 or NULL, retry %d/5\n", ++ retry + 1); ++ msleep(1); ++ } ++ if (!sp) { ++ pr_err_ratelimited("nvgpu: get_syncpt_client_managed: failed after retries\n"); + return 0; ++ } + + return host1x_syncpt_id(sp); + } diff --git a/nvidia-tegra-nvgpu/patches/0002-nvgpu-netlist-flexible-array.patch b/nvidia-tegra-nvgpu/patches/0002-nvgpu-netlist-flexible-array.patch new file mode 100644 index 000000000..883d2810e --- /dev/null +++ b/nvidia-tegra-nvgpu/patches/0002-nvgpu-netlist-flexible-array.patch @@ -0,0 +1,11 @@ +--- 
a/drivers/gpu/nvgpu/common/netlist/netlist_priv.h 2026-04-27 22:24:52 ++++ b/drivers/gpu/nvgpu/common/netlist/netlist_priv.h 2026-04-27 22:25:25 +@@ -113,7 +113,7 @@ + + struct netlist_image { + struct netlist_image_header header; +- struct netlist_region regions[1]; ++ struct netlist_region regions[]; + }; + + struct netlist_gr_ucode { diff --git a/nvidia-tegra-nvgpu/pkg.yaml b/nvidia-tegra-nvgpu/pkg.yaml new file mode 100644 index 000000000..55c48d1d6 --- /dev/null +++ b/nvidia-tegra-nvgpu/pkg.yaml @@ -0,0 +1,337 @@ +name: nvidia-tegra-nvgpu-pkg +variant: scratch +shell: /bin/bash +dependencies: + - stage: base + - stage: kernel-build + - image: "{{ .LLVM_IMAGE }}:{{ .TOOLS_REV }}" +steps: + - sources: + # OE4T patched nvgpu - supports kernel 6.x (fixes platform_driver.remove, hrtimer, struct fd) + - url: https://github.com/OE4T/linux-nvgpu/archive/{{ .oe4t_nvgpu_commit }}.tar.gz + destination: nvgpu.tar.gz + sha256: "{{ .oe4t_nvgpu_sha256 }}" + sha512: "{{ .oe4t_nvgpu_sha512 }}" + # OE4T patched nvidia-oot - patches-r36.5 branch (kernel 6.18 compat: __assign_str, + # f_count->f_ref, __alloc_pages_bulk 5-arg, and all earlier 6.x fixes) + - url: https://github.com/OE4T/linux-nv-oot/archive/{{ .oe4t_nv_oot_commit }}.tar.gz + destination: nvidia-oot.tar.gz + sha256: "{{ .oe4t_nv_oot_sha256 }}" + sha512: "{{ .oe4t_nv_oot_sha512 }}" + # OE4T patched hwpm - supports kernel 6.x (fixes platform_driver.remove, MODULE_IMPORT_NS) + - url: https://github.com/OE4T/linux-hwpm/archive/{{ .oe4t_hwpm_commit }}.tar.gz + destination: hwpm.tar.gz + sha256: "{{ .oe4t_hwpm_sha256 }}" + sha512: "{{ .oe4t_hwpm_sha512 }}" + # nvhost-ctrl-shim source is embedded in this package (see nvhost_ctrl_shim.c) + # and made available at /pkg/nvhost_ctrl_shim.c by bldr at build time. + env: + ARCH: arm64 + LLVM: "1" + LLVM_IAS: "1" + prepare: + - | + echo "Extracting OE4T patched OOT module sources..." 
+ mkdir -p /oot-src/nvgpu /oot-src/nvidia-oot /oot-src/hwpm + tar xzf nvgpu.tar.gz -C /oot-src/nvgpu --strip-components=1 + tar xzf nvidia-oot.tar.gz -C /oot-src/nvidia-oot --strip-components=1 + tar xzf hwpm.tar.gz -C /oot-src/hwpm --strip-components=1 + echo "Sources extracted:" + ls /oot-src/ + # nvhost-ctrl-shim C source (embedded in this package, available at /pkg/) + mkdir -p /oot-src/nvhost-ctrl-shim + cp /pkg/nvhost_ctrl_shim.c /oot-src/nvhost-ctrl-shim/nvhost_ctrl_shim.c + echo "nvhost-ctrl-shim source ready ($(wc -l < /oot-src/nvhost-ctrl-shim/nvhost_ctrl_shim.c) lines)" + build: + - | + echo "Building NVIDIA conftest (kernel compat detection)..." + mkdir -p /oot-src/out/nvidia-conftest/nvidia /oot-src/out/nvidia-linux-header + + # conftest scripts are in nvidia-oot + cp -av /oot-src/nvidia-oot/scripts/conftest/* /oot-src/out/nvidia-conftest/nvidia/ + + make -j $(nproc) ARCH=arm64 \ + src=/oot-src/out/nvidia-conftest/nvidia \ + obj=/oot-src/out/nvidia-conftest/nvidia \ + LLVM=1 \ + NV_KERNEL_SOURCES=/src \ + NV_KERNEL_OUTPUT=/src \ + -f /oot-src/out/nvidia-conftest/nvidia/Makefile + echo "conftest done." + - | + /pkg/scripts/fixup.sh + - | + # ── Cross-compiler mismatch fix: Clang wrapper ──────────────────────────── + # Strips GCC-only flags before passing to clang (source-independent fix). 
+ mkdir -p /usr/local/bin + cp /pkg/scripts/clang-oot /usr/local/bin/clang-oot + chmod +x /usr/local/bin/clang-oot + echo "clang-oot wrapper installed" + + NVIDIA_OOT=/oot-src/nvidia-oot + NVIDIA_CONFTEST=/oot-src/out/nvidia-conftest + touch /oot-src/out/nvidia-linux-header/Module.symvers.nvidia + + # ── OOT host1x: HOST1X_SYNCPT_GPU support + syncpt[0] id=0 fix ────────────── + echo "=== Building OOT host1x ===" + make -j $(nproc) ARCH=arm64 \ + -C /src \ + M=${NVIDIA_OOT}/drivers/gpu/host1x \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + modules 2>&1 | tee /tmp/build-host1x.log; [[ ${PIPESTATUS[0]} -eq 0 ]] && echo "host1x: OK" || { echo "host1x: BUILD FAILED (see errors above)"; exit 1; } + cat ${NVIDIA_OOT}/drivers/gpu/host1x/Module.symvers \ + >> /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null || true + + # ── mc-utils ────────────────────────────────────────────────────────────────── + echo "=== Building mc-utils ===" + make -j $(nproc) ARCH=arm64 \ + -C /src \ + M=${NVIDIA_OOT}/drivers/platform/tegra/mc-utils \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + modules 2>&1 | tee /tmp/build-mc-utils.log; [[ ${PIPESTATUS[0]} -eq 0 ]] && echo "mc-utils: OK" || { echo "mc-utils: BUILD FAILED (see errors above)"; exit 1; } + cat ${NVIDIA_OOT}/drivers/platform/tegra/mc-utils/Module.symvers \ + >> /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null || true + + # ── host1x-fence ───────────────────────────────────────────────────────────── + echo "=== Building OOT host1x-fence ===" + make -j $(nproc) ARCH=arm64 \ + -C /src \ + M=${NVIDIA_OOT}/drivers/gpu/host1x-fence \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + 
KBUILD_EXTRA_SYMBOLS=/oot-src/out/nvidia-linux-header/Module.symvers.nvidia \ + modules 2>&1 | tee /tmp/build-host1x-fence.log; [[ ${PIPESTATUS[0]} -eq 0 ]] && echo "host1x-fence: OK" || { echo "host1x-fence: BUILD FAILED (see errors above)"; exit 1; } + cat ${NVIDIA_OOT}/drivers/gpu/host1x-fence/Module.symvers \ + >> /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null || true + + # ── nvhost-ctrl-shim: /dev/nvhost-ctrl for JetPack 6 CUDA runtime ──────────── + # Bridges NVHOST_IOCTL_CTRL_SYNC_FENCE_CREATE / SYNC_FILE_EXTRACT to the OOT host1x + # syncpoint API so libnvrm_host1x.so uses hardware syncpoint interrupts for + # cudaStreamSynchronize instead of CPU semaphore polling. Symbol deps: host1x.ko only. + echo "=== Building nvhost-ctrl-shim ===" + SHIM_DIR=/oot-src/nvhost-ctrl-shim + printf 'obj-m += nvhost_ctrl_shim.o\nccflags-y += -I$(src)/../nvidia-oot/drivers/gpu/host1x/include\n' \ + > "${SHIM_DIR}/Makefile" + echo "nvhost-ctrl-shim: source ready ($(wc -l < "${SHIM_DIR}/nvhost_ctrl_shim.c") lines)" + make -j$(nproc) ARCH=arm64 \ + -C /src \ + M="${SHIM_DIR}" \ + CC=/usr/local/bin/clang-oot \ + LLVM=1 \ + KBUILD_EXTRA_SYMBOLS=/oot-src/out/nvidia-linux-header/Module.symvers.nvidia \ + modules 2>&1 | tee /tmp/build-nvhost-ctrl-shim.log + [[ ${PIPESTATUS[0]} -eq 0 ]] && echo "nvhost-ctrl-shim: OK" \ + || { echo "=== nvhost-ctrl-shim BUILD FAILED ==="; cat /tmp/build-nvhost-ctrl-shim.log; exit 1; } + cat "${SHIM_DIR}/Module.symvers" \ + >> /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null || true + + # ── nvmap ───────────────────────────────────────────────────────────────────── + echo "=== Building nvmap ===" + make -j $(nproc) ARCH=arm64 \ + -C /src \ + M=${NVIDIA_OOT}/drivers/video/tegra/nvmap \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + NVMAP_CONFIG=y \ + NVMAP_CONFIG_LOADABLE_MODULE=y \ + NVMAP_CONFIG_PAGE_POOLS=y 
\ + NVMAP_CONFIG_HANDLE_AS_ID=n \ + NVMAP_CONFIG_SCIIPC=n \ + modules 2>&1 | tee /tmp/build-nvmap.log; [[ ${PIPESTATUS[0]} -eq 0 ]] && echo "nvmap: OK" || { echo "nvmap: BUILD FAILED (see errors above)"; exit 1; } + cat ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Module.symvers \ + >> /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null || true + + # ── governor_pod_scaling ────────────────────────────────────────────────────── + echo "=== Building governor-pod-scaling ===" + make -j $(nproc) ARCH=arm64 \ + -C /src \ + M=${NVIDIA_OOT}/drivers/devfreq \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + modules 2>&1 | tee /tmp/build-governor.log; [[ ${PIPESTATUS[0]} -eq 0 ]] && echo "governor-pod-scaling: OK" || { echo "governor-pod-scaling: BUILD FAILED (see errors above)"; exit 1; } + cat ${NVIDIA_OOT}/drivers/devfreq/Module.symvers \ + >> /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null || true + + echo "=== nvidia-oot dependency modules done ===" + echo "Symbols exported so far:" + cat /oot-src/out/nvidia-linux-header/Module.symvers.nvidia | awk '{print $2}' | sort + - | + echo "Generating nvidia-linux-headers..." + cp -av /oot-src/nvidia-oot/include /oot-src/out/nvidia-linux-header/ + cat /src/Module.symvers \ + /oot-src/out/nvidia-linux-header/Module.symvers.nvidia 2>/dev/null \ + > /oot-src/out/nvidia-linux-header/Module.symvers + - | + echo "Building nvgpu module (OE4T patches-r36.5)..." + # clang-oot wrapper already installed by the earlier build step — reuse it. 
+ grep -rl "NV_VM_AREA_STRUCT_HAS_CONST_VM_FLAGS" /oot-src/nvgpu/drivers/gpu/nvgpu/ \ + | xargs -r sed -i "s|#if defined(NV_VM_AREA_STRUCT_HAS_CONST_VM_FLAGS)|#if 1 /* force: kernel 6.3+ */|g" + sed -i '1s|^|ccflags-y += -Wno-implicit-fallthrough -Wno-parentheses-equality -Wno-incompatible-function-pointer-types -Wno-sometimes-uninitialized\n|' \ + /oot-src/nvgpu/drivers/gpu/nvgpu/Makefile + grep -rl "class_create(THIS_MODULE," /oot-src/nvgpu/drivers/gpu/nvgpu/ \ + | xargs -r sed -i 's/class_create(THIS_MODULE, /class_create(/g' + # nvgpu source patches (applied via standard patch tool) + patch -p1 -d /oot-src/nvgpu \ + < /pkg/patches/0001-nvgpu-syncpt-retry-errata-id0.patch + patch -p1 -d /oot-src/nvgpu \ + < /pkg/patches/0002-nvgpu-netlist-flexible-array.patch + make -j $(nproc) ARCH=arm64 \ + -C /src \ + M=/oot-src/nvgpu/drivers/gpu/nvgpu \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + CONFIG_GK20A=m \ + CONFIG_TEGRA_GK20A_NVHOST=y \ + CONFIG_TEGRA_GK20A_NVHOST_HOST1X=y \ + CONFIG_TEGRA_HWPM=n \ + srctree.nvgpu=/oot-src/nvgpu \ + srctree.nvidia=/oot-src/nvidia-oot \ + srctree.nvconftest=/oot-src/out/nvidia-conftest \ + LLVM=1 \ + KBUILD_EXTRA_SYMBOLS=/oot-src/out/nvidia-linux-header/Module.symvers.nvidia \ + KBUILD_MODPOST_WARN=1 \ + modules + echo "nvgpu done." + install: + - | + NVIDIA_OOT=/oot-src/nvidia-oot + NVIDIA_CONFTEST=/oot-src/out/nvidia-conftest + SHIM_DIR=/oot-src/nvhost-ctrl-shim + echo "Installing kernel modules..." 
+ + mkdir -p /rootfs/usr/lib/modules/$(cat /src/include/config/kernel.release)/ + cp /src/modules.order /src/modules.builtin /src/modules.builtin.modinfo \ + /rootfs/usr/lib/modules/$(cat /src/include/config/kernel.release)/ + + # OOT host1x → kernel/drivers/gpu/host1x/ (shadows in-tree host1x.ko; + # provides HOST1X_SYNCPT_GPU that nvgpu's CONFIG_TEGRA_GK20A_NVHOST=y requires) + make -j$(nproc) ARCH=arm64 -C /src \ + M=${NVIDIA_OOT}/drivers/gpu/host1x \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=kernel/drivers/gpu/host1x \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ + modules_install + + # host1x-fence → extra/nvidia-tegra + make -j$(nproc) ARCH=arm64 -C /src \ + M=${NVIDIA_OOT}/drivers/gpu/host1x-fence \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=extra/nvidia-tegra \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ + modules_install + + # nvhost-ctrl-shim → extra/nvidia-tegra + make -j$(nproc) ARCH=arm64 -C /src \ + M=${SHIM_DIR} \ + CC=/usr/local/bin/clang-oot \ + LLVM=1 \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=extra/nvidia-tegra \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ + modules_install + + # nvmap → extra/nvidia-tegra + make -j$(nproc) ARCH=arm64 -C /src \ + M=${NVIDIA_OOT}/drivers/video/tegra/nvmap \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + NVMAP_CONFIG=y \ + NVMAP_CONFIG_LOADABLE_MODULE=y \ + NVMAP_CONFIG_PAGE_POOLS=y \ + NVMAP_CONFIG_HANDLE_AS_ID=n \ + NVMAP_CONFIG_SCIIPC=n \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=extra/nvidia-tegra \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ 
+ modules_install + + # mc-utils → extra/nvidia-tegra + make -j$(nproc) ARCH=arm64 -C /src \ + M=${NVIDIA_OOT}/drivers/platform/tegra/mc-utils \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=extra/nvidia-tegra \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ + modules_install + + # governor_pod_scaling → extra/nvidia-tegra + make -j$(nproc) ARCH=arm64 -C /src \ + M=${NVIDIA_OOT}/drivers/devfreq \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + srctree.nvidia-oot=${NVIDIA_OOT} \ + srctree.nvconftest=${NVIDIA_CONFTEST} \ + LLVM=1 \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=extra/nvidia-tegra \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ + modules_install + + # nvgpu → extra/nvidia-tegra + make -j$(nproc) ARCH=arm64 -C /src \ + M=/oot-src/nvgpu/drivers/gpu/nvgpu \ + CC=/usr/local/bin/clang-oot \ + CONFIG_TEGRA_OOT_MODULE=m \ + CONFIG_GK20A=m \ + CONFIG_TEGRA_GK20A_NVHOST=y \ + CONFIG_TEGRA_GK20A_NVHOST_HOST1X=y \ + CONFIG_TEGRA_HWPM=n \ + srctree.nvgpu=/oot-src/nvgpu \ + srctree.nvidia=${NVIDIA_OOT} \ + srctree.nvconftest=/oot-src/out/nvidia-conftest \ + LLVM=1 \ + INSTALL_MOD_PATH=/rootfs/usr \ + INSTALL_MOD_DIR=extra/nvidia-tegra \ + INSTALL_MOD_STRIP=1 \ + CONFIG_MODULE_SIG_ALL=y \ + modules_install + + test: + - | + # https://www.kernel.org/doc/html/v4.15/admin-guide/module-signing.html#signed-modules-and-stripping + find /rootfs/usr/lib/modules -name '*.ko' -exec grep -FL '~Module signature appended~' {} \+ + - | + fhs-validator /rootfs +finalize: + - from: /rootfs + to: / diff --git a/nvidia-tegra-nvgpu/scripts/clang-oot b/nvidia-tegra-nvgpu/scripts/clang-oot new file mode 100644 index 000000000..d7d09f05d --- /dev/null +++ b/nvidia-tegra-nvgpu/scripts/clang-oot @@ -0,0 +1,16 @@ +#!/bin/bash +# Cross-compiler mismatch fix: strips GCC-only flags before passing to clang. 
+clang_args=()
+for flag; do
+  case "$flag" in
+    # Options rewritten to the spelling this wrapper forwards to clang.
+    -Wno-maybe-uninitialized) clang_args+=("-Wno-uninitialized") ;;
+    -Wimplicit-fallthrough=*) clang_args+=("-Wimplicit-fallthrough") ;;
+    -fsanitize=bounds-strict) clang_args+=("-fsanitize=bounds") ;;
+    # Options dropped outright: nothing is forwarded in their place.
+    -pg|-mfentry|-mrecord-mcount|-fpatchable-function-entry=*|-fplugin=*|-fplugin-arg-*) ;;
+    -fconserve-stack|-fmin-function-alignment=*|-Wno-alloc-size-larger-than|-Wno-alloc-size-larger-than=*) ;;
+    *) clang_args+=("$flag") ;;
+  esac
+done
+exec clang "${clang_args[@]}" -Wno-unknown-warning-option -Wno-enum-enum-conversion -Wno-implicit-fallthrough -Wno-gnu-variable-sized-type-not-at-end
diff --git a/nvidia-tegra-nvgpu/scripts/fixup.sh b/nvidia-tegra-nvgpu/scripts/fixup.sh
new file mode 100644
index 000000000..be502f90d
--- /dev/null
+++ b/nvidia-tegra-nvgpu/scripts/fixup.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Patch OOT module Makefiles: remove -Werror and add required include paths.
+# srctree.nvconftest and srctree.nvidia-oot are passed as make vars at build time.
+set -euo pipefail
+# NOTE: grep -rl exits 1 on "no match"; each grep below is wrapped as { grep ... || [ $? -eq 1 ]; } so that case is a no-op under pipefail, while grep errors (exit 2) still abort.
+NVIDIA_OOT=/oot-src/nvidia-oot
+
+# OOT host1x: add conftest + nvidia-oot includes (exports host1x_fence_extract)
+printf 'ccflags-y += -I$(srctree.nvconftest)\n' \
+  >> ${NVIDIA_OOT}/drivers/gpu/host1x/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/include\n' \
+  >> ${NVIDIA_OOT}/drivers/gpu/host1x/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/drivers/gpu/host1x/include\n' \
+  >> ${NVIDIA_OOT}/drivers/gpu/host1x/Makefile
+
+# Force conftest macros for OOT host1x on kernel 6.18 (rewrites "#if defined(MACRO)" to "#if 1" in every file that mentions MACRO)
+{ grep -rl "NV_IOMMU_MAP_HAS_GFP_ARG" ${NVIDIA_OOT}/drivers/gpu/host1x/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_IOMMU_MAP_HAS_GFP_ARG)|#if 1 /* force: kernel 6.3+ */|g"
+{ grep -rl "NV_IOMMU_PAGING_DOMAIN_ALLOC_PRESENT" ${NVIDIA_OOT}/drivers/gpu/host1x/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_IOMMU_PAGING_DOMAIN_ALLOC_PRESENT)|#if 1 /* force: kernel 6.11+ */|g"
+{ grep -rl "NV_DEVM_TEGRA_CORE_DEV_INIT_OPP_TABLE_COMMON_PRESENT" ${NVIDIA_OOT}/drivers/gpu/host1x/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_DEVM_TEGRA_CORE_DEV_INIT_OPP_TABLE_COMMON_PRESENT)|#if 1 /* force: present */|g"
+{ grep -rl "NV_PLATFORM_DRIVER_STRUCT_REMOVE_RETURNS_VOID" ${NVIDIA_OOT}/drivers/gpu/host1x/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_PLATFORM_DRIVER_STRUCT_REMOVE_RETURNS_VOID)|#if 1 /* force: kernel 6.11+ */|g"
+{ grep -rl "NV_BUS_TYPE_STRUCT_MATCH_HAS_CONST_DRV_ARG" ${NVIDIA_OOT}/drivers/gpu/host1x/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_BUS_TYPE_STRUCT_MATCH_HAS_CONST_DRV_ARG)|#if 1 /* force: kernel 6.x+ */|g"
+{ grep -rl "NV_BUS_TYPE_STRUCT_UEVENT_HAS_CONST_DEV_ARG" ${NVIDIA_OOT}/drivers/gpu/host1x/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_BUS_TYPE_STRUCT_UEVENT_HAS_CONST_DEV_ARG)|#if 1 /* force: kernel 6.x+ */|g"
+echo "Patched OOT host1x: forced conftest macro code paths for kernel 6.18"
+
+# host1x-fence: remove -Werror, add conftest + nvidia-oot includes, then adapt class_create()/devnode() signatures
+sed -i 's|ccflags-y += -Werror||g' \
+  ${NVIDIA_OOT}/drivers/gpu/host1x-fence/Makefile
+printf 'ccflags-y += -I$(srctree.nvconftest)\n' \
+  >> ${NVIDIA_OOT}/drivers/gpu/host1x-fence/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/include\n' \
+  >> ${NVIDIA_OOT}/drivers/gpu/host1x-fence/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/drivers/gpu/host1x/include\n' \
+  >> ${NVIDIA_OOT}/drivers/gpu/host1x-fence/Makefile
+{ grep -rl "class_create(THIS_MODULE," ${NVIDIA_OOT}/drivers/gpu/host1x-fence/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i 's/class_create(THIS_MODULE, /class_create(/g'
+{ grep -rl "host1x_fence_devnode" ${NVIDIA_OOT}/drivers/gpu/host1x-fence/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i 's/static char \*host1x_fence_devnode(struct device \*/static char *host1x_fence_devnode(const struct device */g'
+echo "Patched host1x-fence: class_create + devnode const fixes for kernel 6.x"
+
+# nvmap: remove subdir -Werror, add conftest + nvidia-oot includes and the three -DNV_* feature defines
+sed -i 's|subdir-ccflags-y += -Werror||g' \
+  ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+printf 'ccflags-y += -I$(srctree.nvconftest)\n' \
+  >> ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/include\n' \
+  >> ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/drivers/video/tegra/nvmap/include\n' \
+  >> ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+printf 'ccflags-y += -DNV_GET_USER_PAGES_HAS_ARGS_FLAGS\n' \
+  >> ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+printf 'ccflags-y += -DNV_MM_STRUCT_STRUCT_HAS_PERCPU_COUNTER_RSS_STAT\n' \
+  >> ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+printf 'ccflags-y += -DNV_IOREMAP_PROT_HAS_PGPROT_T_ARG\n' \
+  >> ${NVIDIA_OOT}/drivers/video/tegra/nvmap/Makefile
+
+# mc-utils: add nvidia-oot includes
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/include\n' \
+  >> ${NVIDIA_OOT}/drivers/platform/tegra/mc-utils/Makefile
+
+# governor_pod_scaling: add conftest + nvidia-oot includes
+printf 'ccflags-y += -I$(srctree.nvconftest)\n' \
+  >> ${NVIDIA_OOT}/drivers/devfreq/Makefile
+printf 'ccflags-y += -I$(srctree.nvidia-oot)/include\n' \
+  >> ${NVIDIA_OOT}/drivers/devfreq/Makefile
+
+echo "Include paths patched into OOT module Makefiles."
+
+# Force conftest macro paths in nvmap source for kernel 6.18 (the NV___ASSIGN_STR grep uses "|| true": it already discards grep errors via 2>/dev/null, so an unreadable path is treated as expected)
+{ grep -rl "NV_GET_USER_PAGES_HAS_ARGS_FLAGS" ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_GET_USER_PAGES_HAS_ARGS_FLAGS)|#if 1 /* force: kernel 6.5+ */|g"
+{ grep -rl "NV_MM_STRUCT_STRUCT_HAS_PERCPU_COUNTER_RSS_STAT" ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_MM_STRUCT_STRUCT_HAS_PERCPU_COUNTER_RSS_STAT)|#if 1 /* force: kernel 6.2+ */|g"
+{ grep -rl "NV_IOREMAP_PROT_HAS_PGPROT_T_ARG" ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_IOREMAP_PROT_HAS_PGPROT_T_ARG)|#if 1 /* force: kernel 6.15+ */|g"
+{ grep -rl "NV_VM_AREA_STRUCT_HAS_CONST_VM_FLAGS" ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_VM_AREA_STRUCT_HAS_CONST_VM_FLAGS)|#if 1 /* force: kernel 6.3+ */|g"
+{ grep -rl "NV___ASSIGN_STR_HAS_NO_SRC_ARG" \
+  ${NVIDIA_OOT}/include/trace/events/ \
+  ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ 2>/dev/null || true; } \
+  | xargs -r sed -i "s|#if defined(NV___ASSIGN_STR_HAS_NO_SRC_ARG)|#if 1 /* force: kernel 6.10+ */|g"
+{ grep -rl "NV__ALLOC_PAGES_BULK_HAS_NO_PAGE_LIST_ARG" \
+  ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV__ALLOC_PAGES_BULK_HAS_NO_PAGE_LIST_ARG)|#if 1 /* force: kernel 6.14+ */|g"
+{ grep -rl "NV_FILE_STRUCT_HAS_F_REF" \
+  ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_FILE_STRUCT_HAS_F_REF)|#if 1 /* force: kernel 6.13+ */|g"
+{ grep -rl "NV_GET_FILE_RCU_HAS_DOUBLE_PTR_FILE_ARG" \
+  ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_GET_FILE_RCU_HAS_DOUBLE_PTR_FILE_ARG)|#if 1 /* force: kernel 6.7+ */|g"
+{ grep -rl "NV_PLATFORM_DRIVER_STRUCT_REMOVE_RETURNS_VOID" \
+  ${NVIDIA_OOT}/drivers/video/tegra/nvmap/ || [ $? -eq 1 ]; } \
+  | xargs -r sed -i "s|#if defined(NV_PLATFORM_DRIVER_STRUCT_REMOVE_RETURNS_VOID)|#if 1 /* force: kernel 6.11+ */|g"
+echo "Patched nvmap: forced conftest macro code paths for kernel 6.18"