Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
# SPDX-License-Identifier: Apache-2.0

[paths]
source =
Expand Down
3 changes: 0 additions & 3 deletions .spdx-ignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ LICENSE
requirements*.txt
cuda_bindings/examples/*

# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
cuda_bindings/benchmarks/*

# Vendored
cuda_core/cuda/core/_include/dlpack.h

Expand Down
6 changes: 6 additions & 0 deletions benchmarks/cuda_bindings/AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# cuda.bindings benchmarks

Read the README.md in this directory for more details about the benchmarks.

When generating code, verify that it is correct against the cuda-bindings source,
which can be found in ../../cuda_bindings
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system
pixi run -e wheel -- python -m pyperf system show

# Apply tuning (may require root)
sudo $(pixi run -e wheel -- which python) -m pyperf system tune
$(pixi run -e wheel -- which python) -m pyperf system tune
```

### Running benchmarks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,48 +15,48 @@


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times against cuda.cuCtxGetCurrent.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuCtxGetCurrent` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_ctx_get_current(loops: int) -> float:
_cuCtxGetCurrent = cuda.cuCtxGetCurrent
_fn = cuda.cuCtxGetCurrent

t0 = time.perf_counter()
for _ in range(loops):
_cuCtxGetCurrent()
_fn()
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level CTX to
# cuda.cuCtxSetCurrent.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuCtxSetCurrent` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_ctx_set_current(loops: int) -> float:
_cuCtxSetCurrent = cuda.cuCtxSetCurrent
_fn = cuda.cuCtxSetCurrent
_ctx = CTX

t0 = time.perf_counter()
for _ in range(loops):
_cuCtxSetCurrent(_ctx)
_fn(_ctx)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times against cuda.cuCtxGetDevice.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuCtxGetDevice` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_ctx_get_device(loops: int) -> float:
_cuCtxGetDevice = cuda.cuCtxGetDevice
_fn = cuda.cuCtxGetDevice

t0 = time.perf_counter()
for _ in range(loops):
_cuCtxGetDevice()
_fn()
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, calling cuda.cuDeviceGet with the literal 0.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuDeviceGet` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_device_get(loops: int) -> float:
_cuDeviceGet = cuda.cuDeviceGet
_fn = cuda.cuDeviceGet

t0 = time.perf_counter()
for _ in range(loops):
_cuDeviceGet(0)
_fn(0)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level ATTRIBUTE and DEVICE
# to cuda.cuDeviceGetAttribute.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuDeviceGetAttribute` lines appear to be the removed side and the `_fn`
# lines the added side of a rename, so presumably only one alias/call is live.
# Confirm against the checked-in file.
def bench_device_get_attribute(loops: int) -> float:
_cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
_fn = cuda.cuDeviceGetAttribute
_attr = ATTRIBUTE
_dev = DEVICE

t0 = time.perf_counter()
for _ in range(loops):
_cuDeviceGetAttribute(_attr, _dev)
_fn(_attr, _dev)
return time.perf_counter() - t0
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,43 @@


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times; each create call receives EVENT_FLAGS and the
# second element of its result is handed straight to the destroy call.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuEventCreate`/`_cuEventDestroy` lines appear to be the removed side and
# the `_create`/`_destroy` lines the added side of a rename, so presumably only
# one create/destroy pair is live. Confirm against the checked-in file.
def bench_event_create_destroy(loops: int) -> float:
_cuEventCreate = cuda.cuEventCreate
_cuEventDestroy = cuda.cuEventDestroy
_create = cuda.cuEventCreate
_destroy = cuda.cuEventDestroy
_flags = EVENT_FLAGS

t0 = time.perf_counter()
for _ in range(loops):
_, e = _cuEventCreate(_flags)
_cuEventDestroy(e)
_, e = _create(_flags)
_destroy(e)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level EVENT and STREAM to
# cuda.cuEventRecord.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuEventRecord` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_event_record(loops: int) -> float:
_cuEventRecord = cuda.cuEventRecord
_fn = cuda.cuEventRecord
_event = EVENT
_stream = STREAM

t0 = time.perf_counter()
for _ in range(loops):
_cuEventRecord(_event, _stream)
_fn(_event, _stream)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level EVENT to
# cuda.cuEventQuery.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuEventQuery` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_event_query(loops: int) -> float:
_cuEventQuery = cuda.cuEventQuery
_fn = cuda.cuEventQuery
_event = EVENT

t0 = time.perf_counter()
for _ in range(loops):
_cuEventQuery(_event)
_fn(_event)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level EVENT to
# cuda.cuEventSynchronize.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuEventSynchronize` lines appear to be the removed side and the `_fn` lines
# the added side of a rename, so presumably only one alias/call is live.
# Confirm against the checked-in file.
def bench_event_synchronize(loops: int) -> float:
_cuEventSynchronize = cuda.cuEventSynchronize
_fn = cuda.cuEventSynchronize
_event = EVENT

t0 = time.perf_counter()
for _ in range(loops):
_cuEventSynchronize(_event)
_fn(_event)
return time.perf_counter() - t0
Original file line number Diff line number Diff line change
Expand Up @@ -82,52 +82,52 @@ def _ensure_launch_state() -> None:

# Benchmark body: calls _ensure_launch_state() before the clock starts, then
# returns the time.perf_counter() delta (seconds) spent running the loop below
# `loops` times, launching EMPTY_KERNEL on STREAM via cuda.cuLaunchKernel.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuLaunchKernel` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_launch_empty_kernel(loops: int) -> float:
_ensure_launch_state()
_cuLaunchKernel = cuda.cuLaunchKernel
_fn = cuda.cuLaunchKernel
_kernel = EMPTY_KERNEL
_stream = STREAM

t0 = time.perf_counter()
for _ in range(loops):
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0)
return time.perf_counter() - t0


# Benchmark body: calls _ensure_launch_state() before the clock starts, then
# returns the time.perf_counter() delta (seconds) spent running the loop below
# `loops` times, launching SMALL_KERNEL on STREAM with the pair
# ((FLOAT_PTR,), (None,)) passed in the kernel-arguments position.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuLaunchKernel` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_launch_small_kernel(loops: int) -> float:
_ensure_launch_state()
_cuLaunchKernel = cuda.cuLaunchKernel
_fn = cuda.cuLaunchKernel
_kernel = SMALL_KERNEL
_stream = STREAM
_args = (FLOAT_PTR,)
_arg_types = (None,)

t0 = time.perf_counter()
for _ in range(loops):
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
return time.perf_counter() - t0


# Benchmark body: calls _ensure_launch_state() before the clock starts, then
# returns the time.perf_counter() delta (seconds) spent running the loop below
# `loops` times, launching KERNEL_16_ARGS on STREAM with the pair
# (INT_PTRS, (None,) * 16) passed in the kernel-arguments position; the pair is
# rebuilt on every call, inside the timed region.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuLaunchKernel` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_launch_16_args(loops: int) -> float:
_ensure_launch_state()
_cuLaunchKernel = cuda.cuLaunchKernel
_fn = cuda.cuLaunchKernel
_kernel = KERNEL_16_ARGS
_stream = STREAM
_args = INT_PTRS
_arg_types = (None,) * 16

t0 = time.perf_counter()
for _ in range(loops):
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
return time.perf_counter() - t0


# Benchmark body: calls _ensure_launch_state() before the clock starts, then
# returns the time.perf_counter() delta (seconds) spent running the loop below
# `loops` times, launching KERNEL_16_ARGS on STREAM with the module-level
# PACKED_16 object passed in the kernel-arguments position.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuLaunchKernel` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_launch_16_args_pre_packed(loops: int) -> float:
_ensure_launch_state()
_cuLaunchKernel = cuda.cuLaunchKernel
_fn = cuda.cuLaunchKernel
_kernel = KERNEL_16_ARGS
_stream = STREAM
_packed = PACKED_16

t0 = time.perf_counter()
for _ in range(loops):
_cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
return time.perf_counter() - t0
88 changes: 88 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

import numpy as np
from runner.runtime import alloc_persistent, ensure_context

from cuda.bindings import driver as cuda

# Module-level fixtures shared by every benchmark in this file. They are built
# once at import time so that the timed loops only measure the driver call.
ensure_context()

# Allocation size for alloc/free benchmarks
ALLOC_SIZE = 1024

# Small transfer size (8 bytes) to measure call overhead, not bandwidth
COPY_SIZE = 8

# Pre-allocate device memory and host buffers for memcpy benchmarks
DST_DPTR = alloc_persistent(COPY_SIZE)
SRC_DPTR = alloc_persistent(COPY_SIZE)
HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8)
HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8)

# Stream for async operations
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
if _err != cuda.CUresult.CUDA_SUCCESS:
    # The timed loops deliberately ignore status codes, so a stream that failed
    # to be created would otherwise go unnoticed and the async benchmark would
    # silently time error returns. Fail loudly during setup instead.
    raise RuntimeError(f"cuStreamCreate failed: {_err}")


def bench_mem_alloc_free(loops: int) -> float:
    """Time ``loops`` back-to-back ``cuMemAlloc`` / ``cuMemFree`` pairs.

    Every iteration requests ``ALLOC_SIZE`` bytes and immediately hands the
    second element of the result to ``cuMemFree``. The first element of the
    allocation result and the return value of the free call are discarded.

    Args:
        loops: Number of alloc/free pairs to execute inside the timed region.

    Returns:
        Elapsed ``time.perf_counter()`` seconds for the whole loop.
    """
    # Resolve attribute and global lookups before the clock starts so the
    # loop body only touches local variables.
    mem_alloc, mem_free, nbytes = cuda.cuMemAlloc, cuda.cuMemFree, ALLOC_SIZE

    started = time.perf_counter()
    for _ in range(loops):
        _, dptr = mem_alloc(nbytes)
        mem_free(dptr)
    return time.perf_counter() - started


def bench_mem_alloc_async_free_async(loops: int) -> float:
    """Time ``loops`` ``cuMemAllocAsync`` / ``cuMemFreeAsync`` pairs on ``STREAM``.

    Every iteration requests ``ALLOC_SIZE`` bytes on the module-level stream
    and immediately passes the second element of the result to
    ``cuMemFreeAsync`` on that same stream. This function does not synchronize
    the stream; the clock stops as soon as the last call returns.

    Args:
        loops: Number of alloc/free pairs to execute inside the timed region.

    Returns:
        Elapsed ``time.perf_counter()`` seconds for the whole loop.
    """
    # Resolve attribute and global lookups before the clock starts so the
    # loop body only touches local variables.
    alloc_async, free_async = cuda.cuMemAllocAsync, cuda.cuMemFreeAsync
    nbytes, stream = ALLOC_SIZE, STREAM

    started = time.perf_counter()
    for _ in range(loops):
        _, dptr = alloc_async(nbytes, stream)
        free_async(dptr, stream)
    return time.perf_counter() - started


def bench_memcpy_htod(loops: int) -> float:
    """Time ``loops`` calls to ``cuMemcpyHtoD``.

    Each call is given ``DST_DPTR`` as destination, ``HOST_SRC`` as source and
    ``COPY_SIZE`` as the byte count. Return values are discarded.

    Args:
        loops: Number of copies to issue inside the timed region.

    Returns:
        Elapsed ``time.perf_counter()`` seconds for the whole loop.
    """
    # Resolve attribute and global lookups before the clock starts so the
    # loop body only touches local variables.
    copy_htod = cuda.cuMemcpyHtoD
    device_dst, host_src, nbytes = DST_DPTR, HOST_SRC, COPY_SIZE

    started = time.perf_counter()
    for _ in range(loops):
        copy_htod(device_dst, host_src, nbytes)
    return time.perf_counter() - started


def bench_memcpy_dtoh(loops: int) -> float:
    """Time ``loops`` calls to ``cuMemcpyDtoH``.

    Each call is given ``HOST_DST`` as destination, ``SRC_DPTR`` as source and
    ``COPY_SIZE`` as the byte count. Return values are discarded.

    Args:
        loops: Number of copies to issue inside the timed region.

    Returns:
        Elapsed ``time.perf_counter()`` seconds for the whole loop.
    """
    # Resolve attribute and global lookups before the clock starts so the
    # loop body only touches local variables.
    copy_dtoh = cuda.cuMemcpyDtoH
    host_dst, device_src, nbytes = HOST_DST, SRC_DPTR, COPY_SIZE

    started = time.perf_counter()
    for _ in range(loops):
        copy_dtoh(host_dst, device_src, nbytes)
    return time.perf_counter() - started


def bench_memcpy_dtod(loops: int) -> float:
    """Time ``loops`` calls to ``cuMemcpyDtoD``.

    Each call is given ``DST_DPTR`` as destination, ``SRC_DPTR`` as source and
    ``COPY_SIZE`` as the byte count. Return values are discarded.

    Args:
        loops: Number of copies to issue inside the timed region.

    Returns:
        Elapsed ``time.perf_counter()`` seconds for the whole loop.
    """
    # Resolve attribute and global lookups before the clock starts so the
    # loop body only touches local variables.
    copy_dtod = cuda.cuMemcpyDtoD
    device_dst, device_src, nbytes = DST_DPTR, SRC_DPTR, COPY_SIZE

    started = time.perf_counter()
    for _ in range(loops):
        copy_dtod(device_dst, device_src, nbytes)
    return time.perf_counter() - started
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@

# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level ATTRIBUTE and PTR to
# cuda.cuPointerGetAttribute.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuPointerGetAttribute` lines appear to be the removed side and the `_fn`
# lines the added side of a rename, so presumably only one alias/call is live.
# Confirm against the checked-in file.
def bench_pointer_get_attribute(loops: int) -> float:
# Local references to avoid global lookups in the hot loop
_cuPointerGetAttribute = cuda.cuPointerGetAttribute
_fn = cuda.cuPointerGetAttribute
_attr = ATTRIBUTE
_ptr = PTR

t0 = time.perf_counter()
for _ in range(loops):
_cuPointerGetAttribute(_attr, _ptr)
_fn(_attr, _ptr)
return time.perf_counter() - t0
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,32 @@


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times; each create call receives the
# CU_STREAM_NON_BLOCKING flag value and the second element of its result is
# handed straight to the destroy call.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuStreamCreate`/`_cuStreamDestroy` lines appear to be the removed side and
# the `_create`/`_destroy` lines the added side of a rename, so presumably only
# one create/destroy pair is live. Confirm against the checked-in file.
def bench_stream_create_destroy(loops: int) -> float:
_cuStreamCreate = cuda.cuStreamCreate
_cuStreamDestroy = cuda.cuStreamDestroy
_create = cuda.cuStreamCreate
_destroy = cuda.cuStreamDestroy
_flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value

t0 = time.perf_counter()
for _ in range(loops):
_, s = _cuStreamCreate(_flags)
_cuStreamDestroy(s)
_, s = _create(_flags)
_destroy(s)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level STREAM to
# cuda.cuStreamQuery.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuStreamQuery` lines appear to be the removed side and the `_fn` lines the
# added side of a rename, so presumably only one alias/call is live. Confirm
# against the checked-in file.
def bench_stream_query(loops: int) -> float:
_cuStreamQuery = cuda.cuStreamQuery
_fn = cuda.cuStreamQuery
_stream = STREAM

t0 = time.perf_counter()
for _ in range(loops):
_cuStreamQuery(_stream)
_fn(_stream)
return time.perf_counter() - t0


# Benchmark body: returns the time.perf_counter() delta (seconds) spent running
# the loop below `loops` times, passing the module-level STREAM to
# cuda.cuStreamSynchronize.
# NOTE(review): this looks like a diff view with the +/- markers lost; the
# `_cuStreamSynchronize` lines appear to be the removed side and the `_fn`
# lines the added side of a rename, so presumably only one alias/call is live.
# Confirm against the checked-in file.
def bench_stream_synchronize(loops: int) -> float:
_cuStreamSynchronize = cuda.cuStreamSynchronize
_fn = cuda.cuStreamSynchronize
_stream = STREAM

t0 = time.perf_counter()
for _ in range(loops):
_cuStreamSynchronize(_stream)
_fn(_stream)
return time.perf_counter() - t0
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ add_driver_benchmark(bench_pointer_attributes)
add_driver_benchmark(bench_ctx_device)
add_driver_benchmark(bench_stream)
add_driver_benchmark(bench_event)
add_driver_benchmark(bench_memory)

# NVRTC benchmarks (require nvrtc for kernel compilation)
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,39 +168,6 @@ int main(int argc, char** argv) {
});
}

// --- launch_small_kernel ---
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were duplicated before. Cleaning up.

{
void* params[] = {&float_ptr};
suite.run("launch.launch_small_kernel", [&]() {
check_cu(
cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr),
"cuLaunchKernel failed"
);
});
}

// --- launch_16_args ---
{
suite.run("launch.launch_16_args", [&]() {
check_cu(
cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
"cuLaunchKernel failed"
);
});
}

// --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) ---
// In C++ the params are always pre-packed, so this is identical to launch_16_args.
// We include it for naming parity with the Python benchmark.
{
suite.run("launch.launch_16_args_pre_packed", [&]() {
check_cu(
cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr),
"cuLaunchKernel failed"
);
});
}

// Cleanup
for (int i = 0; i < 16; ++i) {
check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed");
Expand Down
Loading
Loading