diff --git a/.coveragerc b/.coveragerc
index 36f0f7879a..1e1776fd56 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 [paths]
 source =
diff --git a/.spdx-ignore b/.spdx-ignore
index 8c1d155c47..7263b5414f 100644
--- a/.spdx-ignore
+++ b/.spdx-ignore
@@ -8,9 +8,6 @@ LICENSE
 requirements*.txt
 cuda_bindings/examples/*
 
-# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
-cuda_bindings/benchmarks/*
-
 # Vendored
 cuda_core/cuda/core/_include/dlpack.h
 
diff --git a/cuda_bindings/benchmarks/.gitignore b/benchmarks/cuda_bindings/.gitignore
similarity index 100%
rename from cuda_bindings/benchmarks/.gitignore
rename to benchmarks/cuda_bindings/.gitignore
diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md
new file mode 100644
index 0000000000..b9096a737f
--- /dev/null
+++ b/benchmarks/cuda_bindings/AGENTS.md
@@ -0,0 +1,6 @@
+# cuda.bindings benchmarks
+
+Read the README.md in this directory for more details about the benchmarks.
+
+When generating code, verify that it is correct against the cuda-bindings source,
+which can be found in `../../cuda_bindings`.
diff --git a/cuda_bindings/benchmarks/README.md b/benchmarks/cuda_bindings/README.md
similarity index 97%
rename from cuda_bindings/benchmarks/README.md
rename to benchmarks/cuda_bindings/README.md
index 75e16db031..f8d5ccf043 100644
--- a/cuda_bindings/benchmarks/README.md
+++ b/benchmarks/cuda_bindings/README.md
@@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
 pixi run -e wheel -- python -m pyperf system show
 
 # Apply tuning (may require root)
-sudo $(pixi run -e wheel -- which python) -m pyperf system tune
+$(pixi run -e wheel -- which python) -m pyperf system tune
 ```
 
 ### Running benchmarks
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
similarity index 75%
rename from cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py
rename to benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
index 1c82cd4046..2e2cd11d93 100644
--- a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py
+++ b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
@@ -15,48 +15,48 @@
 
 
 def bench_ctx_get_current(loops: int) -> float:
-    _cuCtxGetCurrent = cuda.cuCtxGetCurrent
+    _fn = cuda.cuCtxGetCurrent
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxGetCurrent()
+        _fn()
     return time.perf_counter() - t0
 
 
 def bench_ctx_set_current(loops: int) -> float:
-    _cuCtxSetCurrent = cuda.cuCtxSetCurrent
+    _fn = cuda.cuCtxSetCurrent
     _ctx = CTX
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxSetCurrent(_ctx)
+        _fn(_ctx)
     return time.perf_counter() - t0
 
 
 def bench_ctx_get_device(loops: int) -> float:
-    _cuCtxGetDevice = cuda.cuCtxGetDevice
+    _fn = cuda.cuCtxGetDevice
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxGetDevice()
+        _fn()
     return time.perf_counter() - t0
 
 
 def bench_device_get(loops: int) -> float:
-    _cuDeviceGet = cuda.cuDeviceGet
+    _fn = cuda.cuDeviceGet
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuDeviceGet(0)
+        _fn(0)
     return time.perf_counter() - t0
 
 
 def bench_device_get_attribute(loops: int) -> float:
-    _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
+    _fn = cuda.cuDeviceGetAttribute
     _attr = ATTRIBUTE
     _dev = DEVICE
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuDeviceGetAttribute(_attr, _dev)
+        _fn(_attr, _dev)
     return time.perf_counter() - t0
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_event.py b/benchmarks/cuda_bindings/benchmarks/bench_event.py
similarity index 76%
rename from cuda_bindings/benchmarks/benchmarks/bench_event.py
rename to benchmarks/cuda_bindings/benchmarks/bench_event.py
index e8e319115d..041adc2553 100644
--- a/cuda_bindings/benchmarks/benchmarks/bench_event.py
+++ b/benchmarks/cuda_bindings/benchmarks/bench_event.py
@@ -20,43 +20,43 @@
 
 
 def bench_event_create_destroy(loops: int) -> float:
-    _cuEventCreate = cuda.cuEventCreate
-    _cuEventDestroy = cuda.cuEventDestroy
+    _create = cuda.cuEventCreate
+    _destroy = cuda.cuEventDestroy
     _flags = EVENT_FLAGS
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _, e = _cuEventCreate(_flags)
-        _cuEventDestroy(e)
+        _, e = _create(_flags)
+        _destroy(e)
     return time.perf_counter() - t0
 
 
 def bench_event_record(loops: int) -> float:
-    _cuEventRecord = cuda.cuEventRecord
+    _fn = cuda.cuEventRecord
     _event = EVENT
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventRecord(_event, _stream)
+        _fn(_event, _stream)
     return time.perf_counter() - t0
 
 
 def bench_event_query(loops: int) -> float:
-    _cuEventQuery = cuda.cuEventQuery
+    _fn = cuda.cuEventQuery
     _event = EVENT
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventQuery(_event)
+        _fn(_event)
     return time.perf_counter() - t0
 
 
 def bench_event_synchronize(loops: int) -> float:
-    _cuEventSynchronize = cuda.cuEventSynchronize
+    _fn = cuda.cuEventSynchronize
     _event = EVENT
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventSynchronize(_event)
+        _fn(_event)
     return time.perf_counter() - t0
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/benchmarks/cuda_bindings/benchmarks/bench_launch.py
similarity index 87%
rename from cuda_bindings/benchmarks/benchmarks/bench_launch.py
rename to benchmarks/cuda_bindings/benchmarks/bench_launch.py
index 931194fbd3..abf3f946cc 100644
--- a/cuda_bindings/benchmarks/benchmarks/bench_launch.py
+++ b/benchmarks/cuda_bindings/benchmarks/bench_launch.py
@@ -82,19 +82,19 @@ def _ensure_launch_state() -> None:
 
 def bench_launch_empty_kernel(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = EMPTY_KERNEL
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_small_kernel(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = SMALL_KERNEL
     _stream = STREAM
     _args = (FLOAT_PTR,)
@@ -102,13 +102,13 @@ def bench_launch_small_kernel(loops: int) -> float:
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_16_args(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
     _args = INT_PTRS
@@ -116,18 +116,18 @@ def bench_launch_16_args(loops: int) -> float:
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_16_args_pre_packed(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
     _packed = PACKED_16
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
     return time.perf_counter() - t0
diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py
new file mode 100644
index 0000000000..875c060406
--- /dev/null
+++ b/benchmarks/cuda_bindings/benchmarks/bench_memory.py
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
+import numpy as np
+from runner.runtime import alloc_persistent, ensure_context
+
+from cuda.bindings import driver as cuda
+
+ensure_context()
+
+# Allocation size for alloc/free benchmarks
+ALLOC_SIZE = 1024
+
+# Small transfer size (8 bytes) to measure call overhead, not bandwidth
+COPY_SIZE = 8
+
+# Pre-allocate device memory and host buffers for memcpy benchmarks
+DST_DPTR = alloc_persistent(COPY_SIZE)
+SRC_DPTR = alloc_persistent(COPY_SIZE)
+HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
+HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)
+
+# Stream for async operations
+_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
+
+
+def bench_mem_alloc_free(loops: int) -> float:
+    _alloc = cuda.cuMemAlloc
+    _free = cuda.cuMemFree
+    _size = ALLOC_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _, ptr = _alloc(_size)
+        _free(ptr)
+    return time.perf_counter() - t0
+
+
+def bench_mem_alloc_async_free_async(loops: int) -> float:
+    _alloc = cuda.cuMemAllocAsync
+    _free = cuda.cuMemFreeAsync
+    _size = ALLOC_SIZE
+    _stream = STREAM
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _, ptr = _alloc(_size, _stream)
+        _free(ptr, _stream)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_htod(loops: int) -> float:
+    _fn = cuda.cuMemcpyHtoD
+    _dst = DST_DPTR
+    _src = HOST_SRC
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_dtoh(loops: int) -> float:
+    _fn = cuda.cuMemcpyDtoH
+    _dst = HOST_DST
+    _src = SRC_DPTR
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_dtod(loops: int) -> float:
+    _fn = cuda.cuMemcpyDtoD
+    _dst = DST_DPTR
+    _src = SRC_DPTR
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py
similarity index 86%
rename from cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py
rename to benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py
index a02b82c399..191da263ee 100644
--- a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py
+++ b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py
@@ -15,11 +15,11 @@
 
 def bench_pointer_get_attribute(loops: int) -> float:
     # Local references to avoid global lookups in the hot loop
-    _cuPointerGetAttribute = cuda.cuPointerGetAttribute
+    _fn = cuda.cuPointerGetAttribute
     _attr = ATTRIBUTE
     _ptr = PTR
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuPointerGetAttribute(_attr, _ptr)
+        _fn(_attr, _ptr)
     return time.perf_counter() - t0
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_stream.py b/benchmarks/cuda_bindings/benchmarks/bench_stream.py
similarity index 73%
rename from cuda_bindings/benchmarks/benchmarks/bench_stream.py
rename to benchmarks/cuda_bindings/benchmarks/bench_stream.py
index d816099ed5..3aab9288fc 100644
--- a/cuda_bindings/benchmarks/benchmarks/bench_stream.py
+++ b/benchmarks/cuda_bindings/benchmarks/bench_stream.py
@@ -14,32 +14,32 @@
 
 
 def bench_stream_create_destroy(loops: int) -> float:
-    _cuStreamCreate = cuda.cuStreamCreate
-    _cuStreamDestroy = cuda.cuStreamDestroy
+    _create = cuda.cuStreamCreate
+    _destroy = cuda.cuStreamDestroy
     _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _, s = _cuStreamCreate(_flags)
-        _cuStreamDestroy(s)
+        _, s = _create(_flags)
+        _destroy(s)
     return time.perf_counter() - t0
 
 
 def bench_stream_query(loops: int) -> float:
-    _cuStreamQuery = cuda.cuStreamQuery
+    _fn = cuda.cuStreamQuery
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuStreamQuery(_stream)
+        _fn(_stream)
     return time.perf_counter() - t0
 
 
 def bench_stream_synchronize(loops: int) -> float:
-    _cuStreamSynchronize = cuda.cuStreamSynchronize
+    _fn = cuda.cuStreamSynchronize
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuStreamSynchronize(_stream)
+        _fn(_stream)
     return time.perf_counter() - t0
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt
similarity index 98%
rename from cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt
rename to benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt
index b4285834aa..83326911af 100644
--- a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt
@@ -82,6 +82,7 @@ add_driver_benchmark(bench_pointer_attributes)
 add_driver_benchmark(bench_ctx_device)
 add_driver_benchmark(bench_stream)
 add_driver_benchmark(bench_event)
+add_driver_benchmark(bench_memory)
 
 # NVRTC benchmarks (require nvrtc for kernel compilation)
 if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp
similarity index 100%
rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp
rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
similarity index 100%
rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp
rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
similarity index 84%
rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp
rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
index fb65da6d74..a249426963 100644
--- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp
@@ -168,39 +168,6 @@ int main(int argc, char** argv) {
         });
     }
 
-    // --- launch_small_kernel ---
-    {
-        void* params[] = {&float_ptr};
-        suite.run("launch.launch_small_kernel", [&]() {
-            check_cu(
-                cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr),
-                "cuLaunchKernel failed"
-            );
-        });
-    }
-
-    // --- launch_16_args ---
-    {
-        suite.run("launch.launch_16_args", [&]() {
-            check_cu(
-                cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
-                "cuLaunchKernel failed"
-            );
-        });
-    }
-
-    // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) ---
-    // In C++ the params are always pre-packed, so this is identical to launch_16_args.
-    // We include it for naming parity with the Python benchmark.
-    {
-        suite.run("launch.launch_16_args_pre_packed", [&]() {
-            check_cu(
-                cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
-                "cuLaunchKernel failed"
-            );
-        });
-    }
-
     // Cleanup
     for (int i = 0; i < 16; ++i) {
         check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed");
diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
new file mode 100644
index 0000000000..4e71b73fb5
--- /dev/null
+++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp
@@ -0,0 +1,106 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cuda.h>
+
+#include "bench_support.hpp"
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+
+static void check_cu(CUresult status, const char* message) {
+    if (status != CUDA_SUCCESS) {
+        const char* error_name = nullptr;
+        cuGetErrorName(status, &error_name);
+        std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n';
+        std::exit(1);
+    }
+}
+
+
+static constexpr size_t ALLOC_SIZE = 1024;
+static constexpr size_t COPY_SIZE = 8;
+
+
+int main(int argc, char** argv) {
+    bench::Options options = bench::parse_args(argc, argv);
+
+    // Setup
+    check_cu(cuInit(0), "cuInit failed");
+
+    CUdevice device;
+    check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed");
+
+    CUcontext ctx;
+    CUctxCreateParams ctxParams = {};
+    check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed");
+
+    CUstream stream;
+    check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed");
+
+    // Pre-allocate device memory for memcpy benchmarks
+    CUdeviceptr dst_dptr, src_dptr;
+    check_cu(cuMemAlloc(&dst_dptr, COPY_SIZE), "cuMemAlloc failed");
+    check_cu(cuMemAlloc(&src_dptr, COPY_SIZE), "cuMemAlloc failed");
+
+    // Host buffers for memcpy
+    uint8_t host_src[COPY_SIZE] = {};
+    uint8_t host_dst[COPY_SIZE] = {};
+
+    bench::BenchmarkSuite suite(options);
+
+    // --- mem_alloc_free ---
+    {
+        CUdeviceptr ptr;
+        suite.run("memory.mem_alloc_free", [&]() {
+            check_cu(cuMemAlloc(&ptr, ALLOC_SIZE), "cuMemAlloc failed");
+            check_cu(cuMemFree(ptr), "cuMemFree failed");
+        });
+    }
+
+    // --- mem_alloc_async_free_async ---
+    {
+        CUdeviceptr ptr;
+        suite.run("memory.mem_alloc_async_free_async", [&]() {
+            check_cu(cuMemAllocAsync(&ptr, ALLOC_SIZE, stream), "cuMemAllocAsync failed");
+            check_cu(cuMemFreeAsync(ptr, stream), "cuMemFreeAsync failed");
+        });
+    }
+
+    check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed");
+
+    // --- memcpy_htod ---
+    {
+        suite.run("memory.memcpy_htod", [&]() {
+            check_cu(cuMemcpyHtoD(dst_dptr, host_src, COPY_SIZE), "cuMemcpyHtoD failed");
+        });
+    }
+
+    // --- memcpy_dtoh ---
+    {
+        suite.run("memory.memcpy_dtoh", [&]() {
+            check_cu(cuMemcpyDtoH(host_dst, src_dptr, COPY_SIZE), "cuMemcpyDtoH failed");
+        });
+    }
+
+    // --- memcpy_dtod ---
+    {
+        suite.run("memory.memcpy_dtod", [&]() {
+            check_cu(cuMemcpyDtoD(dst_dptr, src_dptr, COPY_SIZE), "cuMemcpyDtoD failed");
+        });
+    }
+
+    // Cleanup
+    check_cu(cuMemFree(dst_dptr), "cuMemFree failed");
+    check_cu(cuMemFree(src_dptr), "cuMemFree failed");
+    check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed");
+    check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");
+
+    suite.write();
+
+    return 0;
+}
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp
similarity index 100%
rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp
rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
similarity index 100%
rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp
rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
similarity index 100%
rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp
rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
diff --git a/cuda_bindings/benchmarks/compare.py b/benchmarks/cuda_bindings/compare.py
similarity index 100%
rename from cuda_bindings/benchmarks/compare.py
rename to benchmarks/cuda_bindings/compare.py
diff --git a/cuda_bindings/benchmarks/pixi.lock b/benchmarks/cuda_bindings/pixi.lock
similarity index 98%
rename from cuda_bindings/benchmarks/pixi.lock
rename to benchmarks/cuda_bindings/pixi.lock
index c610db2f45..c571d4756c 100644
--- a/cuda_bindings/benchmarks/pixi.lock
+++ b/benchmarks/cuda_bindings/pixi.lock
@@ -38,8 +38,8 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda
@@ -66,7 +66,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda
@@ -130,7 +130,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda
-      - conda: ..
+      - conda: ../../cuda_bindings
       - conda: ../../cuda_pathfinder
   wheel:
     channels:
@@ -406,7 +406,7 @@ packages:
   license_family: GPL
   size: 31705
   timestamp: 1771378159534
-- conda: ..
+- conda: ../../cuda_bindings
   name: cuda-bindings
   version: 13.2.0
   build: hb0f4dca_0
@@ -419,11 +419,11 @@ packages:
   - cuda-pathfinder
   - libnvjitlink
   - cuda-nvrtc
-  - cuda-nvrtc >=13.2.51,<14.0a0
+  - cuda-nvrtc >=13.2.78,<14.0a0
   - cuda-nvvm
   - libnvfatbin
   - libcufile
-  - libcufile >=1.17.0.44,<2.0a0
+  - libcufile >=1.17.1.22,<2.0a0
   - libgcc >=15
   - libgcc >=15
   - libstdcxx >=15
@@ -643,17 +643,17 @@ packages:
   license: LicenseRef-NVIDIA-End-User-License-Agreement
   size: 35339417
   timestamp: 1768272955912
-- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda
-  sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5
-  md5: 0413baaa73be1a39d5d8e442184acc78
+- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda
+  sha256: 73fbc9d15c062c3ea60891e8183002f6b055fa6638402d17581677af0aaa20d8
+  md5: 66623d882c42506fa3f1780b90841400
   depends:
   - __glibc >=2.17,<3.0.a0
   - cuda-version >=13.2,<13.3.0a0
   - libgcc >=14
   - libstdcxx >=14
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 35736655
-  timestamp: 1773100338749
+  size: 35670504
+  timestamp: 1776109867257
 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda
   sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3
   md5: 1096fce4abad7dd975ce6d9953fceb6a
@@ -668,20 +668,20 @@ packages:
   license: LicenseRef-NVIDIA-End-User-License-Agreement
   size: 35845
   timestamp: 1768273073971
-- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda
-  sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879
-  md5: 83535200a9e77165d5291b4ac82ebf6a
+- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda
+  sha256: 12505f1bbc222acf2a63da5c84e4176d2f9c18b458e2bde28939fdf326b6d292
+  md5: cc313f0ea18ebc6e713a8980611431f5
   depends:
   - __glibc >=2.17,<3.0.a0
-  - cuda-nvrtc 13.2.51 hecca717_0
+  - cuda-nvrtc 13.2.78 hecca717_0
   - cuda-version >=13.2,<13.3.0a0
   - libgcc >=14
   - libstdcxx >=14
   constrains:
-  - cuda-nvrtc-static >=13.2.51
+  - cuda-nvrtc-static >=13.2.78
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 36305
-  timestamp: 1773100458841
+  size: 36312
+  timestamp: 1776109983818
 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda
   sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b
   md5: 03cd3639b8e13623c7b91b1cb0136402
@@ -1018,9 +1018,9 @@ packages:
   license: LicenseRef-NVIDIA-End-User-License-Agreement
   size: 990938
   timestamp: 1768273732081
-- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda
-  sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19
-  md5: 6d8ed8335d144ec7303b8d3587b2205c
+- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda
+  sha256: a24ad0ca488aa3e237049cd5b5c6d7fe3d2d4330682ed329203064e332ea1d74
+  md5: 056a67706108efd1f9c24682ba8d3685
   depends:
   - __glibc >=2.28,<3.0.a0
   - cuda-version >=13.2,<13.3.0a0
@@ -1028,8 +1028,8 @@ packages:
   - libstdcxx >=14
   - rdma-core >=61.0
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 1085341
-  timestamp: 1773100191342
+  size: 1082447
+  timestamp: 1776110053053
 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda
   sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b
   md5: 1707cdd636af2ff697b53186572c9f77
diff --git a/cuda_bindings/benchmarks/pixi.toml b/benchmarks/cuda_bindings/pixi.toml
similarity index 97%
rename from cuda_bindings/benchmarks/pixi.toml
rename to benchmarks/cuda_bindings/pixi.toml
index a448e8d3e4..dbbddcd939 100644
--- a/cuda_bindings/benchmarks/pixi.toml
+++ b/benchmarks/cuda_bindings/pixi.toml
@@ -45,7 +45,7 @@ pre-commit = "*"
 cuda-bindings = "==13.1.0"
 
 [feature.bindings-source.dependencies]
-cuda-bindings = { path = ".." }
+cuda-bindings = { path = "../../cuda_bindings" }
 
 [environments]
 wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] }
diff --git a/cuda_bindings/benchmarks/pytest-legacy/conftest.py b/benchmarks/cuda_bindings/pytest-legacy/conftest.py
similarity index 97%
rename from cuda_bindings/benchmarks/pytest-legacy/conftest.py
rename to benchmarks/cuda_bindings/pytest-legacy/conftest.py
index 0ea7b1d772..5d0cc95e7a 100644
--- a/cuda_bindings/benchmarks/pytest-legacy/conftest.py
+++ b/benchmarks/cuda_bindings/pytest-legacy/conftest.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 import numpy as np
 import pytest
diff --git a/cuda_bindings/benchmarks/pytest-legacy/kernels.py b/benchmarks/cuda_bindings/pytest-legacy/kernels.py
similarity index 97%
rename from cuda_bindings/benchmarks/pytest-legacy/kernels.py
rename to benchmarks/cuda_bindings/pytest-legacy/kernels.py
index 36646fba00..7e741110a3 100644
--- a/cuda_bindings/benchmarks/pytest-legacy/kernels.py
+++ b/benchmarks/cuda_bindings/pytest-legacy/kernels.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 kernel_string = """\
 #define ITEM_PARAM(x, T) T x
diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py
similarity index 98%
rename from cuda_bindings/benchmarks/pytest-legacy/test_cupy.py
rename to benchmarks/cuda_bindings/pytest-legacy/test_cupy.py
index 76dd6e6a45..3eea752ce0 100644
--- a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py
+++ b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 import ctypes
 
diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py
similarity index 99%
rename from cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py
rename to benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py
index dd994081a0..ad421de382 100755
--- a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py
+++ b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 import ctypes
 
diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py
similarity index 95%
rename from cuda_bindings/benchmarks/pytest-legacy/test_numba.py
rename to benchmarks/cuda_bindings/pytest-legacy/test_numba.py
index dfe084c6b1..d9ae0cdfee 100644
--- a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py
+++ b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 import numpy as np
 import pytest
diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py
similarity index 98%
rename from cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py
rename to benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py
index fae72ffd79..6df32ec511 100644
--- a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py
+++ b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 import random
 
diff --git a/cuda_bindings/benchmarks/run_cpp.py b/benchmarks/cuda_bindings/run_cpp.py
similarity index 100%
rename from cuda_bindings/benchmarks/run_cpp.py
rename to benchmarks/cuda_bindings/run_cpp.py
diff --git a/cuda_bindings/benchmarks/run_pyperf.py b/benchmarks/cuda_bindings/run_pyperf.py
similarity index 100%
rename from cuda_bindings/benchmarks/run_pyperf.py
rename to benchmarks/cuda_bindings/run_pyperf.py
diff --git a/cuda_bindings/benchmarks/runner/__init__.py b/benchmarks/cuda_bindings/runner/__init__.py
similarity index 100%
rename from cuda_bindings/benchmarks/runner/__init__.py
rename to benchmarks/cuda_bindings/runner/__init__.py
diff --git a/cuda_bindings/benchmarks/runner/cpp.py b/benchmarks/cuda_bindings/runner/cpp.py
similarity index 100%
rename from cuda_bindings/benchmarks/runner/cpp.py
rename to benchmarks/cuda_bindings/runner/cpp.py
diff --git a/cuda_bindings/benchmarks/runner/main.py b/benchmarks/cuda_bindings/runner/main.py
similarity index 98%
rename from cuda_bindings/benchmarks/runner/main.py
rename to benchmarks/cuda_bindings/runner/main.py
index 4089aa5559..b0f6e76f41 100644
--- a/cuda_bindings/benchmarks/runner/main.py
+++ b/benchmarks/cuda_bindings/runner/main.py
@@ -53,7 +53,7 @@ def _discover_module_functions(module_path: Path) -> list[str]:
     return [
         node.name
         for node in tree.body
-        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_")
+        if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef) and node.name.startswith("bench_")
     ]
 
 
diff --git a/cuda_bindings/benchmarks/runner/runtime.py b/benchmarks/cuda_bindings/runner/runtime.py
similarity index 100%
rename from cuda_bindings/benchmarks/runner/runtime.py
rename to benchmarks/cuda_bindings/runner/runtime.py
diff --git a/cuda_bindings/benchmarks/tests/test_runner.py b/benchmarks/cuda_bindings/tests/test_runner.py
similarity index 100%
rename from cuda_bindings/benchmarks/tests/test_runner.py
rename to benchmarks/cuda_bindings/tests/test_runner.py
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
index ddb6eae107..e6366ac95d 100644
--- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py
@@ -1,6 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 import binascii
 
diff --git a/ruff.toml b/ruff.toml
index 704e422c19..210f852cd3 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -124,13 +124,18 @@ inline-quotes = "double"
 
 # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions)
 # Keep examples opted-in to enforce naming conventions in example-local identifiers.
-"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [
+"cuda_bindings/{cuda,docs,tests}/**" = [
   "N801",    # invalid-class-name
   "N802",    # invalid-function-name
   "N803",    # invalid-argument-name
   "N806",    # non-lowercase-variable-in-function
   "N816",    # mixed-case-variable-in-global-scope
 ]
+"benchmarks/cuda_bindings/pytest-legacy/**" = [
+  "N801",    # invalid-class-name
+  "N802",    # invalid-function-name
+  "N806",    # non-lowercase-variable-in-function
+]
 "cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"]
 
 # scripts and build tooling — print is the expected output method
diff --git a/toolshed/build_static_bitcode_input.py b/toolshed/build_static_bitcode_input.py
index 273ce33244..e2400100dd 100755
--- a/toolshed/build_static_bitcode_input.py
+++ b/toolshed/build_static_bitcode_input.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 """
 Helper to produce static bitcode input for test_nvvm.py.
diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py
index 6be42282bf..3d52142554 100644
--- a/toolshed/check_spdx.py
+++ b/toolshed/check_spdx.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import datetime
+import fnmatch
 import os
 import re
 import subprocess
@@ -17,12 +18,28 @@
 
 LICENSE_IDENTIFIER_REGEX = re.compile(re.escape(SPDX_LICENSE_IDENTIFIER_PREFIX) + rb"(?P<license_identifier>[^\r\n]+)")
 
-EXPECTED_LICENSE_IDENTIFIERS = (
-    ("cuda_bindings/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"),
-    ("cuda_core/", "Apache-2.0"),
-    ("cuda_pathfinder/", "Apache-2.0"),
-    ("cuda_python/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"),
-)
+TOP_LEVEL_FILE_LICENSE_IDENTIFIER = "Apache-2.0"
+
+# Every top-level directory needs to have an entry here, so new paths
+# can't slip in without a reviewed license decision.
+TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS = {
+    ".github": "Apache-2.0",
+    "benchmarks": "Apache-2.0",
+    "ci": "Apache-2.0",
+    "cuda_bindings": "LicenseRef-NVIDIA-SOFTWARE-LICENSE",
+    "cuda_core": "Apache-2.0",
+    "cuda_pathfinder": "Apache-2.0",
+    "cuda_python": "LicenseRef-NVIDIA-SOFTWARE-LICENSE",
+    "cuda_python_test_helpers": "Apache-2.0",
+    "scripts": "Apache-2.0",
+    "toolshed": "Apache-2.0",
+}
+
+SPECIAL_CASE_LICENSE_IDENTIFIERS = {
+    # key: repo-relative path or glob, value: expected SPDX license identifier
+    # No overrides at present: the cuda_bindings/benchmarks/ files moved to benchmarks/,
+    # which TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS already maps to Apache-2.0.
+}
 
 SPDX_IGNORE_FILENAME = ".spdx-ignore"
 
@@ -63,12 +80,34 @@ def normalize_repo_path(filepath):
     return PureWindowsPath(filepath).as_posix()
 
 
+def get_top_level_directory(normalized_path):
+    if "/" not in normalized_path:
+        return None
+    return normalized_path.split("/", 1)[0]
+
+
 def get_expected_license_identifier(filepath):
     normalized_path = normalize_repo_path(filepath)
-    for prefix, license_identifier in EXPECTED_LICENSE_IDENTIFIERS:
-        if normalized_path.startswith(prefix):
-            return license_identifier
-    return None
+    matching_special_cases = [
+        (prefix, license_identifier)
+        for prefix, license_identifier in SPECIAL_CASE_LICENSE_IDENTIFIERS.items()
+        if fnmatch.fnmatchcase(normalized_path, prefix)
+    ]
+    if matching_special_cases:
+        return max(matching_special_cases, key=lambda item: len(item[0]))[1], None
+
+    top_level_directory = get_top_level_directory(normalized_path)
+    if top_level_directory is None:
+        return TOP_LEVEL_FILE_LICENSE_IDENTIFIER, None
+
+    if top_level_directory not in TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS:
+        return (
+            None,
+            f"MISSING TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS entry for top-level directory "
+            f"{top_level_directory!r} required by {filepath!r}",
+        )
+
+    return TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS[top_level_directory], None
 
 
 def validate_required_spdx_field(filepath, blob, expected_bytes):
@@ -82,10 +121,11 @@ def extract_license_identifier(blob):
     match = LICENSE_IDENTIFIER_REGEX.search(blob)
     if match is None:
         return None
-    try:
-        return match.group("license_identifier").decode("ascii")
-    except UnicodeDecodeError:
-        return None
+    license_identifier = match.group("license_identifier").decode("ascii", errors="replace").strip()
+    for comment_suffix in ("-->", "*/"):
+        if license_identifier.endswith(comment_suffix):
+            license_identifier = license_identifier.removesuffix(comment_suffix).rstrip()
+    return license_identifier or None
 
 
 def validate_license_identifier(filepath, blob):
@@ -94,9 +134,10 @@ def validate_license_identifier(filepath, blob):
         print(f"MISSING valid SPDX license identifier in {filepath!r}")
         return False
 
-    expected_license_identifier = get_expected_license_identifier(filepath)
-    if expected_license_identifier is None:
-        return True
+    expected_license_identifier, configuration_error = get_expected_license_identifier(filepath)
+    if configuration_error is not None:
+        print(configuration_error)
+        return False
 
     if license_identifier != expected_license_identifier:
         print(
diff --git a/toolshed/dump_cutile_b64.py b/toolshed/dump_cutile_b64.py
index 84013ea94b..422bf95232 100644
--- a/toolshed/dump_cutile_b64.py
+++ b/toolshed/dump_cutile_b64.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 """
 Embeds a sample cuTile kernel, executes it with CUDA_TILE_DUMP_BYTECODE=.,