NVIDIA · danielfrg · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/.coveragerc b/.coveragerc
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 [paths]
 source =

diff --git a/.spdx-ignore b/.spdx-ignore
@@ -8,9 +8,6 @@ LICENSE
 requirements*.txt
 cuda_bindings/examples/*
 
-# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
-cuda_bindings/benchmarks/*
-
 # Vendored
 cuda_core/cuda/core/_include/dlpack.h
 

diff --git a/cuda_bindings/benchmarks/.gitignore → benchmarks/cuda_bindings/.gitignore b/cuda_bindings/benchmarks/.gitignore → benchmarks/cuda_bindings/.gitignore
diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md
@@ -0,0 +1,6 @@
+# cuda.bindings benchmarks
+
+Read the README.md in this directory for more details about the benchmarks.
+
+When generating code, verify that the code is correct based on the source for cuda-bindings,
+which can be found in ../../cuda_bindings.
diff --git a/cuda_bindings/benchmarks/README.md → benchmarks/cuda_bindings/README.md b/cuda_bindings/benchmarks/README.md → benchmarks/cuda_bindings/README.md
@@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
 pixi run -e wheel -- python -m pyperf system show
 
 # Apply tuning (may require root)
-sudo $(pixi run -e wheel -- which python) -m pyperf system tune
+$(pixi run -e wheel -- which python) -m pyperf system tune
 ```
 
 ### Running benchmarks

diff --git a/...benchmarks/benchmarks/bench_ctx_device.py → ...a_bindings/benchmarks/bench_ctx_device.py b/...benchmarks/benchmarks/bench_ctx_device.py → ...a_bindings/benchmarks/bench_ctx_device.py
@@ -15,48 +15,48 @@
 
 
 def bench_ctx_get_current(loops: int) -> float:
-    _cuCtxGetCurrent = cuda.cuCtxGetCurrent
+    _fn = cuda.cuCtxGetCurrent
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxGetCurrent()
+        _fn()
     return time.perf_counter() - t0
 
 
 def bench_ctx_set_current(loops: int) -> float:
-    _cuCtxSetCurrent = cuda.cuCtxSetCurrent
+    _fn = cuda.cuCtxSetCurrent
     _ctx = CTX
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxSetCurrent(_ctx)
+        _fn(_ctx)
     return time.perf_counter() - t0
 
 
 def bench_ctx_get_device(loops: int) -> float:
-    _cuCtxGetDevice = cuda.cuCtxGetDevice
+    _fn = cuda.cuCtxGetDevice
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuCtxGetDevice()
+        _fn()
     return time.perf_counter() - t0
 
 
 def bench_device_get(loops: int) -> float:
-    _cuDeviceGet = cuda.cuDeviceGet
+    _fn = cuda.cuDeviceGet
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuDeviceGet(0)
+        _fn(0)
     return time.perf_counter() - t0
 
 
 def bench_device_get_attribute(loops: int) -> float:
-    _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
+    _fn = cuda.cuDeviceGetAttribute
     _attr = ATTRIBUTE
     _dev = DEVICE
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuDeviceGetAttribute(_attr, _dev)
+        _fn(_attr, _dev)
     return time.perf_counter() - t0
diff --git a/...ings/benchmarks/benchmarks/bench_event.py → ...s/cuda_bindings/benchmarks/bench_event.py b/...ings/benchmarks/benchmarks/bench_event.py → ...s/cuda_bindings/benchmarks/bench_event.py
@@ -20,43 +20,43 @@
 
 
 def bench_event_create_destroy(loops: int) -> float:
-    _cuEventCreate = cuda.cuEventCreate
-    _cuEventDestroy = cuda.cuEventDestroy
+    _create = cuda.cuEventCreate
+    _destroy = cuda.cuEventDestroy
     _flags = EVENT_FLAGS
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _, e = _cuEventCreate(_flags)
-        _cuEventDestroy(e)
+        _, e = _create(_flags)
+        _destroy(e)
     return time.perf_counter() - t0
 
 
 def bench_event_record(loops: int) -> float:
-    _cuEventRecord = cuda.cuEventRecord
+    _fn = cuda.cuEventRecord
     _event = EVENT
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventRecord(_event, _stream)
+        _fn(_event, _stream)
     return time.perf_counter() - t0
 
 
 def bench_event_query(loops: int) -> float:
-    _cuEventQuery = cuda.cuEventQuery
+    _fn = cuda.cuEventQuery
     _event = EVENT
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventQuery(_event)
+        _fn(_event)
     return time.perf_counter() - t0
 
 
 def bench_event_synchronize(loops: int) -> float:
-    _cuEventSynchronize = cuda.cuEventSynchronize
+    _fn = cuda.cuEventSynchronize
     _event = EVENT
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuEventSynchronize(_event)
+        _fn(_event)
     return time.perf_counter() - t0
diff --git a/...ngs/benchmarks/benchmarks/bench_launch.py → .../cuda_bindings/benchmarks/bench_launch.py b/...ngs/benchmarks/benchmarks/bench_launch.py → .../cuda_bindings/benchmarks/bench_launch.py
@@ -82,52 +82,52 @@ def _ensure_launch_state() -> None:
 
 def bench_launch_empty_kernel(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = EMPTY_KERNEL
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_small_kernel(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = SMALL_KERNEL
     _stream = STREAM
     _args = (FLOAT_PTR,)
     _arg_types = (None,)
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_16_args(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
     _args = INT_PTRS
     _arg_types = (None,) * 16
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
     return time.perf_counter() - t0
 
 
 def bench_launch_16_args_pre_packed(loops: int) -> float:
     _ensure_launch_state()
-    _cuLaunchKernel = cuda.cuLaunchKernel
+    _fn = cuda.cuLaunchKernel
     _kernel = KERNEL_16_ARGS
     _stream = STREAM
     _packed = PACKED_16
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
+        _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
     return time.perf_counter() - t0
diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
+import numpy as np
+from runner.runtime import alloc_persistent, ensure_context
+
+from cuda.bindings import driver as cuda
+
+ensure_context()
+
+# Allocation size for alloc/free benchmarks
+ALLOC_SIZE = 1024
+
+# Small transfer size (8 bytes) to measure call overhead, not bandwidth
+COPY_SIZE = 8
+
+# Pre-allocate device memory and host buffers for memcpy benchmarks
+DST_DPTR = alloc_persistent(COPY_SIZE)
+SRC_DPTR = alloc_persistent(COPY_SIZE)
+HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
+HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)
+
+# Stream for async operations
+_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
+
+
+def bench_mem_alloc_free(loops: int) -> float:
+    _alloc = cuda.cuMemAlloc
+    _free = cuda.cuMemFree
+    _size = ALLOC_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _, ptr = _alloc(_size)
+        _free(ptr)
+    return time.perf_counter() - t0
+
+
+def bench_mem_alloc_async_free_async(loops: int) -> float:
+    _alloc = cuda.cuMemAllocAsync
+    _free = cuda.cuMemFreeAsync
+    _size = ALLOC_SIZE
+    _stream = STREAM
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _, ptr = _alloc(_size, _stream)
+        _free(ptr, _stream)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_htod(loops: int) -> float:
+    _fn = cuda.cuMemcpyHtoD
+    _dst = DST_DPTR
+    _src = HOST_SRC
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_dtoh(loops: int) -> float:
+    _fn = cuda.cuMemcpyDtoH
+    _dst = HOST_DST
+    _src = SRC_DPTR
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
+
+
+def bench_memcpy_dtod(loops: int) -> float:
+    _fn = cuda.cuMemcpyDtoD
+    _dst = DST_DPTR
+    _src = SRC_DPTR
+    _size = COPY_SIZE
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _fn(_dst, _src, _size)
+    return time.perf_counter() - t0
diff --git a/...ks/benchmarks/bench_pointer_attributes.py → ...gs/benchmarks/bench_pointer_attributes.py b/...ks/benchmarks/bench_pointer_attributes.py → ...gs/benchmarks/bench_pointer_attributes.py
@@ -15,11 +15,11 @@
 
 def bench_pointer_get_attribute(loops: int) -> float:
     # Local references to avoid global lookups in the hot loop
-    _cuPointerGetAttribute = cuda.cuPointerGetAttribute
+    _fn = cuda.cuPointerGetAttribute
     _attr = ATTRIBUTE
     _ptr = PTR
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuPointerGetAttribute(_attr, _ptr)
+        _fn(_attr, _ptr)
     return time.perf_counter() - t0
diff --git a/...ngs/benchmarks/benchmarks/bench_stream.py → .../cuda_bindings/benchmarks/bench_stream.py b/...ngs/benchmarks/benchmarks/bench_stream.py → .../cuda_bindings/benchmarks/bench_stream.py
@@ -14,32 +14,32 @@
 
 
 def bench_stream_create_destroy(loops: int) -> float:
-    _cuStreamCreate = cuda.cuStreamCreate
-    _cuStreamDestroy = cuda.cuStreamDestroy
+    _create = cuda.cuStreamCreate
+    _destroy = cuda.cuStreamDestroy
     _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _, s = _cuStreamCreate(_flags)
-        _cuStreamDestroy(s)
+        _, s = _create(_flags)
+        _destroy(s)
     return time.perf_counter() - t0
 
 
 def bench_stream_query(loops: int) -> float:
-    _cuStreamQuery = cuda.cuStreamQuery
+    _fn = cuda.cuStreamQuery
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuStreamQuery(_stream)
+        _fn(_stream)
     return time.perf_counter() - t0
 
 
 def bench_stream_synchronize(loops: int) -> float:
-    _cuStreamSynchronize = cuda.cuStreamSynchronize
+    _fn = cuda.cuStreamSynchronize
     _stream = STREAM
 
     t0 = time.perf_counter()
     for _ in range(loops):
-        _cuStreamSynchronize(_stream)
+        _fn(_stream)
     return time.perf_counter() - t0
diff --git a/.../benchmarks/benchmarks/cpp/CMakeLists.txt → ...da_bindings/benchmarks/cpp/CMakeLists.txt b/.../benchmarks/benchmarks/cpp/CMakeLists.txt → ...da_bindings/benchmarks/cpp/CMakeLists.txt
@@ -82,6 +82,7 @@ add_driver_benchmark(bench_pointer_attributes)
 add_driver_benchmark(bench_ctx_device)
 add_driver_benchmark(bench_stream)
 add_driver_benchmark(bench_event)
+add_driver_benchmark(bench_memory)
 
 # NVRTC benchmarks (require nvrtc for kernel compilation)
 if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)

diff --git a/...marks/benchmarks/cpp/bench_ctx_device.cpp → ...dings/benchmarks/cpp/bench_ctx_device.cpp b/...marks/benchmarks/cpp/bench_ctx_device.cpp → ...dings/benchmarks/cpp/bench_ctx_device.cpp
diff --git a/...benchmarks/benchmarks/cpp/bench_event.cpp → ...a_bindings/benchmarks/cpp/bench_event.cpp b/...benchmarks/benchmarks/cpp/bench_event.cpp → ...a_bindings/benchmarks/cpp/bench_event.cpp
diff --git a/...enchmarks/benchmarks/cpp/bench_launch.cpp → ..._bindings/benchmarks/cpp/bench_launch.cpp b/...enchmarks/benchmarks/cpp/bench_launch.cpp → ..._bindings/benchmarks/cpp/bench_launch.cpp
@@ -168,39 +168,6 @@ int main(int argc, char** argv) {
         });
     }
 
-    // --- launch_small_kernel ---
-    {
-        void* params[] = {&float_ptr};
-        suite.run("launch.launch_small_kernel", [&]() {
-            check_cu(
-                cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr),
-                "cuLaunchKernel failed"
-            );
-        });
-    }
-
-    // --- launch_16_args ---
-    {
-        suite.run("launch.launch_16_args", [&]() {
-            check_cu(
-                cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
-                "cuLaunchKernel failed"
-            );
-        });
-    }
-
-    // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) ---
-    // In C++ the params are always pre-packed, so this is identical to launch_16_args.
-    // We include it for naming parity with the Python benchmark.
-    {
-        suite.run("launch.launch_16_args_pre_packed", [&]() {
-            check_cu(
-                cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
-                "cuLaunchKernel failed"
-            );
-        });
-    }
-
     // Cleanup
     for (int i = 0; i < 16; ++i) {
         check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed");