diff --git a/.coveragerc b/.coveragerc index 36f0f7879a..1e1776fd56 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 [paths] source = diff --git a/.spdx-ignore b/.spdx-ignore index 8c1d155c47..7263b5414f 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -8,9 +8,6 @@ LICENSE requirements*.txt cuda_bindings/examples/* -# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149) -cuda_bindings/benchmarks/* - # Vendored cuda_core/cuda/core/_include/dlpack.h diff --git a/cuda_bindings/benchmarks/.gitignore b/benchmarks/cuda_bindings/.gitignore similarity index 100% rename from cuda_bindings/benchmarks/.gitignore rename to benchmarks/cuda_bindings/.gitignore diff --git a/benchmarks/cuda_bindings/AGENTS.md b/benchmarks/cuda_bindings/AGENTS.md new file mode 100644 index 0000000000..b9096a737f --- /dev/null +++ b/benchmarks/cuda_bindings/AGENTS.md @@ -0,0 +1,6 @@ +# cuda.bindings benchmarks + +Read the README.md in this directory for more details about the benchmarks. 
+ +When generating code verify that the code is correct based on the source for cuda-bindings +that can be found in ../../cuda_bindings diff --git a/cuda_bindings/benchmarks/README.md b/benchmarks/cuda_bindings/README.md similarity index 97% rename from cuda_bindings/benchmarks/README.md rename to benchmarks/cuda_bindings/README.md index 75e16db031..f8d5ccf043 100644 --- a/cuda_bindings/benchmarks/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -37,7 +37,7 @@ See: https://pyperf.readthedocs.io/en/latest/system.html#system pixi run -e wheel -- python -m pyperf system show # Apply tuning (may require root) -sudo $(pixi run -e wheel -- which python) -m pyperf system tune +$(pixi run -e wheel -- which python) -m pyperf system tune ``` ### Running benchmarks diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py similarity index 75% rename from cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py rename to benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py index 1c82cd4046..2e2cd11d93 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py @@ -15,48 +15,48 @@ def bench_ctx_get_current(loops: int) -> float: - _cuCtxGetCurrent = cuda.cuCtxGetCurrent + _fn = cuda.cuCtxGetCurrent t0 = time.perf_counter() for _ in range(loops): - _cuCtxGetCurrent() + _fn() return time.perf_counter() - t0 def bench_ctx_set_current(loops: int) -> float: - _cuCtxSetCurrent = cuda.cuCtxSetCurrent + _fn = cuda.cuCtxSetCurrent _ctx = CTX t0 = time.perf_counter() for _ in range(loops): - _cuCtxSetCurrent(_ctx) + _fn(_ctx) return time.perf_counter() - t0 def bench_ctx_get_device(loops: int) -> float: - _cuCtxGetDevice = cuda.cuCtxGetDevice + _fn = cuda.cuCtxGetDevice t0 = time.perf_counter() for _ in range(loops): - _cuCtxGetDevice() + _fn() return time.perf_counter() - t0 def bench_device_get(loops: int) -> float: - _cuDeviceGet = 
cuda.cuDeviceGet + _fn = cuda.cuDeviceGet t0 = time.perf_counter() for _ in range(loops): - _cuDeviceGet(0) + _fn(0) return time.perf_counter() - t0 def bench_device_get_attribute(loops: int) -> float: - _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute + _fn = cuda.cuDeviceGetAttribute _attr = ATTRIBUTE _dev = DEVICE t0 = time.perf_counter() for _ in range(loops): - _cuDeviceGetAttribute(_attr, _dev) + _fn(_attr, _dev) return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_event.py b/benchmarks/cuda_bindings/benchmarks/bench_event.py similarity index 76% rename from cuda_bindings/benchmarks/benchmarks/bench_event.py rename to benchmarks/cuda_bindings/benchmarks/bench_event.py index e8e319115d..041adc2553 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_event.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_event.py @@ -20,43 +20,43 @@ def bench_event_create_destroy(loops: int) -> float: - _cuEventCreate = cuda.cuEventCreate - _cuEventDestroy = cuda.cuEventDestroy + _create = cuda.cuEventCreate + _destroy = cuda.cuEventDestroy _flags = EVENT_FLAGS t0 = time.perf_counter() for _ in range(loops): - _, e = _cuEventCreate(_flags) - _cuEventDestroy(e) + _, e = _create(_flags) + _destroy(e) return time.perf_counter() - t0 def bench_event_record(loops: int) -> float: - _cuEventRecord = cuda.cuEventRecord + _fn = cuda.cuEventRecord _event = EVENT _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuEventRecord(_event, _stream) + _fn(_event, _stream) return time.perf_counter() - t0 def bench_event_query(loops: int) -> float: - _cuEventQuery = cuda.cuEventQuery + _fn = cuda.cuEventQuery _event = EVENT t0 = time.perf_counter() for _ in range(loops): - _cuEventQuery(_event) + _fn(_event) return time.perf_counter() - t0 def bench_event_synchronize(loops: int) -> float: - _cuEventSynchronize = cuda.cuEventSynchronize + _fn = cuda.cuEventSynchronize _event = EVENT t0 = time.perf_counter() for _ in range(loops): - 
_cuEventSynchronize(_event) + _fn(_event) return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/benchmarks/cuda_bindings/benchmarks/bench_launch.py similarity index 87% rename from cuda_bindings/benchmarks/benchmarks/bench_launch.py rename to benchmarks/cuda_bindings/benchmarks/bench_launch.py index 931194fbd3..abf3f946cc 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_launch.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_launch.py @@ -82,19 +82,19 @@ def _ensure_launch_state() -> None: def bench_launch_empty_kernel(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = EMPTY_KERNEL _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) return time.perf_counter() - t0 def bench_launch_small_kernel(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = SMALL_KERNEL _stream = STREAM _args = (FLOAT_PTR,) @@ -102,13 +102,13 @@ def bench_launch_small_kernel(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) return time.perf_counter() - t0 def bench_launch_16_args(loops: int) -> float: _ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = KERNEL_16_ARGS _stream = STREAM _args = INT_PTRS @@ -116,18 +116,18 @@ def bench_launch_16_args(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) return time.perf_counter() - t0 def bench_launch_16_args_pre_packed(loops: int) -> float: 
_ensure_launch_state() - _cuLaunchKernel = cuda.cuLaunchKernel + _fn = cuda.cuLaunchKernel _kernel = KERNEL_16_ARGS _stream = STREAM _packed = PACKED_16 t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) + _fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) return time.perf_counter() - t0 diff --git a/benchmarks/cuda_bindings/benchmarks/bench_memory.py b/benchmarks/cuda_bindings/benchmarks/bench_memory.py new file mode 100644 index 0000000000..875c060406 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/bench_memory.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +import numpy as np +from runner.runtime import alloc_persistent, ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +# Allocation size for alloc/free benchmarks +ALLOC_SIZE = 1024 + +# Small transfer size (8 bytes) to measure call overhead, not bandwidth +COPY_SIZE = 8 + +# Pre-allocate device memory and host buffers for memcpy benchmarks +DST_DPTR = alloc_persistent(COPY_SIZE) +SRC_DPTR = alloc_persistent(COPY_SIZE) +HOST_SRC = np.zeros(COPY_SIZE, dtype=np.uint8) +HOST_DST = np.zeros(COPY_SIZE, dtype=np.uint8) + +# Stream for async operations +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + + +def bench_mem_alloc_free(loops: int) -> float: + _alloc = cuda.cuMemAlloc + _free = cuda.cuMemFree + _size = ALLOC_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _, ptr = _alloc(_size) + _free(ptr) + return time.perf_counter() - t0 + + +def bench_mem_alloc_async_free_async(loops: int) -> float: + _alloc = cuda.cuMemAllocAsync + _free = cuda.cuMemFreeAsync + _size = ALLOC_SIZE + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _, ptr = _alloc(_size, _stream) + _free(ptr, _stream) + return time.perf_counter() 
- t0 + + +def bench_memcpy_htod(loops: int) -> float: + _fn = cuda.cuMemcpyHtoD + _dst = DST_DPTR + _src = HOST_SRC + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _fn(_dst, _src, _size) + return time.perf_counter() - t0 + + +def bench_memcpy_dtoh(loops: int) -> float: + _fn = cuda.cuMemcpyDtoH + _dst = HOST_DST + _src = SRC_DPTR + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _fn(_dst, _src, _size) + return time.perf_counter() - t0 + + +def bench_memcpy_dtod(loops: int) -> float: + _fn = cuda.cuMemcpyDtoD + _dst = DST_DPTR + _src = SRC_DPTR + _size = COPY_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + _fn(_dst, _src, _size) + return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py similarity index 86% rename from cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py rename to benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py index a02b82c399..191da263ee 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py +++ b/benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py @@ -15,11 +15,11 @@ def bench_pointer_get_attribute(loops: int) -> float: # Local references to avoid global lookups in the hot loop - _cuPointerGetAttribute = cuda.cuPointerGetAttribute + _fn = cuda.cuPointerGetAttribute _attr = ATTRIBUTE _ptr = PTR t0 = time.perf_counter() for _ in range(loops): - _cuPointerGetAttribute(_attr, _ptr) + _fn(_attr, _ptr) return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_stream.py b/benchmarks/cuda_bindings/benchmarks/bench_stream.py similarity index 73% rename from cuda_bindings/benchmarks/benchmarks/bench_stream.py rename to benchmarks/cuda_bindings/benchmarks/bench_stream.py index d816099ed5..3aab9288fc 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_stream.py +++ 
b/benchmarks/cuda_bindings/benchmarks/bench_stream.py @@ -14,32 +14,32 @@ def bench_stream_create_destroy(loops: int) -> float: - _cuStreamCreate = cuda.cuStreamCreate - _cuStreamDestroy = cuda.cuStreamDestroy + _create = cuda.cuStreamCreate + _destroy = cuda.cuStreamDestroy _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value t0 = time.perf_counter() for _ in range(loops): - _, s = _cuStreamCreate(_flags) - _cuStreamDestroy(s) + _, s = _create(_flags) + _destroy(s) return time.perf_counter() - t0 def bench_stream_query(loops: int) -> float: - _cuStreamQuery = cuda.cuStreamQuery + _fn = cuda.cuStreamQuery _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuStreamQuery(_stream) + _fn(_stream) return time.perf_counter() - t0 def bench_stream_synchronize(loops: int) -> float: - _cuStreamSynchronize = cuda.cuStreamSynchronize + _fn = cuda.cuStreamSynchronize _stream = STREAM t0 = time.perf_counter() for _ in range(loops): - _cuStreamSynchronize(_stream) + _fn(_stream) return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt similarity index 98% rename from cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt rename to benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt index b4285834aa..83326911af 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt +++ b/benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt @@ -82,6 +82,7 @@ add_driver_benchmark(bench_pointer_attributes) add_driver_benchmark(bench_ctx_device) add_driver_benchmark(bench_stream) add_driver_benchmark(bench_event) +add_driver_benchmark(bench_memory) # NVRTC benchmarks (require nvrtc for kernel compilation) if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY) diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp similarity index 100% rename from 
cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp similarity index 100% rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp similarity index 84% rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp index fb65da6d74..a249426963 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -168,39 +168,6 @@ int main(int argc, char** argv) { }); } - // --- launch_small_kernel --- - { - void* params[] = {&float_ptr}; - suite.run("launch.launch_small_kernel", [&]() { - check_cu( - cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args --- - { - suite.run("launch.launch_16_args", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - - // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- - // In C++ the params are always pre-packed, so this is identical to launch_16_args. - // We include it for naming parity with the Python benchmark. 
- { - suite.run("launch.launch_16_args_pre_packed", [&]() { - check_cu( - cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), - "cuLaunchKernel failed" - ); - }); - } - // Cleanup for (int i = 0; i < 16; ++i) { check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp new file mode 100644 index 0000000000..4e71b73fb5 --- /dev/null +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +static constexpr size_t ALLOC_SIZE = 1024; +static constexpr size_t COPY_SIZE = 8; + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Pre-allocate device memory for memcpy benchmarks + CUdeviceptr dst_dptr, src_dptr; + check_cu(cuMemAlloc(&dst_dptr, COPY_SIZE), "cuMemAlloc failed"); + check_cu(cuMemAlloc(&src_dptr, COPY_SIZE), "cuMemAlloc failed"); + + // Host buffers for memcpy + uint8_t host_src[COPY_SIZE] = {}; + uint8_t host_dst[COPY_SIZE] = {}; + + bench::BenchmarkSuite suite(options); + + // --- mem_alloc_free --- + { + CUdeviceptr ptr; + suite.run("memory.mem_alloc_free", [&]() { + check_cu(cuMemAlloc(&ptr, ALLOC_SIZE), "cuMemAlloc failed"); + check_cu(cuMemFree(ptr), "cuMemFree failed"); + }); + } + + // --- mem_alloc_async_free_async --- + { + CUdeviceptr ptr; + suite.run("memory.mem_alloc_async_free_async", [&]() { + check_cu(cuMemAllocAsync(&ptr, ALLOC_SIZE, stream), "cuMemAllocAsync failed"); + check_cu(cuMemFreeAsync(ptr, stream), "cuMemFreeAsync failed"); + }); + } + + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + // --- memcpy_htod --- + { + suite.run("memory.memcpy_htod", [&]() { + check_cu(cuMemcpyHtoD(dst_dptr, host_src, COPY_SIZE), "cuMemcpyHtoD failed"); + }); + } + + // --- memcpy_dtoh --- + { + suite.run("memory.memcpy_dtoh", [&]() { + check_cu(cuMemcpyDtoH(host_dst, src_dptr, COPY_SIZE), "cuMemcpyDtoH failed"); + }); + } + + // --- memcpy_dtod --- + { + suite.run("memory.memcpy_dtod", [&]() { + check_cu(cuMemcpyDtoD(dst_dptr, src_dptr, 
COPY_SIZE), "cuMemcpyDtoD failed"); + }); + } + + // Cleanup + check_cu(cuMemFree(dst_dptr), "cuMemFree failed"); + check_cu(cuMemFree(src_dptr), "cuMemFree failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp similarity index 100% rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp similarity index 100% rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp similarity index 100% rename from cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp rename to benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp diff --git a/cuda_bindings/benchmarks/compare.py b/benchmarks/cuda_bindings/compare.py similarity index 100% rename from cuda_bindings/benchmarks/compare.py rename to benchmarks/cuda_bindings/compare.py diff --git a/cuda_bindings/benchmarks/pixi.lock b/benchmarks/cuda_bindings/pixi.lock similarity index 98% rename from cuda_bindings/benchmarks/pixi.lock rename to benchmarks/cuda_bindings/pixi.lock index c610db2f45..c571d4756c 100644 --- a/cuda_bindings/benchmarks/pixi.lock +++ b/benchmarks/cuda_bindings/pixi.lock @@ -38,8 +38,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - 
conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda @@ -66,7 +66,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda @@ -130,7 +130,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - - conda: .. 
+ - conda: ../../cuda_bindings - conda: ../../cuda_pathfinder wheel: channels: @@ -406,7 +406,7 @@ packages: license_family: GPL size: 31705 timestamp: 1771378159534 -- conda: .. +- conda: ../../cuda_bindings name: cuda-bindings version: 13.2.0 build: hb0f4dca_0 @@ -419,11 +419,11 @@ packages: - cuda-pathfinder - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.2.51,<14.0a0 + - cuda-nvrtc >=13.2.78,<14.0a0 - cuda-nvvm - libnvfatbin - libcufile - - libcufile >=1.17.0.44,<2.0a0 + - libcufile >=1.17.1.22,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -643,17 +643,17 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35339417 timestamp: 1768272955912 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - sha256: 9de235d328b7124f715805715e9918eb7f8aa5b9c56a2afa62b84f84f98077a5 - md5: 0413baaa73be1a39d5d8e442184acc78 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + sha256: 73fbc9d15c062c3ea60891e8183002f6b055fa6638402d17581677af0aaa20d8 + md5: 66623d882c42506fa3f1780b90841400 depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=13.2,<13.3.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35736655 - timestamp: 1773100338749 + size: 35670504 + timestamp: 1776109867257 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 md5: 1096fce4abad7dd975ce6d9953fceb6a @@ -668,20 +668,20 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35845 timestamp: 1768273073971 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 - md5: 83535200a9e77165d5291b4ac82ebf6a +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.78-hecca717_0.conda + sha256: 
12505f1bbc222acf2a63da5c84e4176d2f9c18b458e2bde28939fdf326b6d292 + md5: cc313f0ea18ebc6e713a8980611431f5 depends: - __glibc >=2.17,<3.0.a0 - - cuda-nvrtc 13.2.51 hecca717_0 + - cuda-nvrtc 13.2.78 hecca717_0 - cuda-version >=13.2,<13.3.0a0 - libgcc >=14 - libstdcxx >=14 constrains: - - cuda-nvrtc-static >=13.2.51 + - cuda-nvrtc-static >=13.2.78 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 36305 - timestamp: 1773100458841 + size: 36312 + timestamp: 1776109983818 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b md5: 03cd3639b8e13623c7b91b1cb0136402 @@ -1018,9 +1018,9 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 990938 timestamp: 1768273732081 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda - sha256: dc2b0c43aeacbaa686061353807e718236d8c5b346f624e76fed98b066898e19 - md5: 6d8ed8335d144ec7303b8d3587b2205c +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + sha256: a24ad0ca488aa3e237049cd5b5c6d7fe3d2d4330682ed329203064e332ea1d74 + md5: 056a67706108efd1f9c24682ba8d3685 depends: - __glibc >=2.28,<3.0.a0 - cuda-version >=13.2,<13.3.0a0 @@ -1028,8 +1028,8 @@ packages: - libstdcxx >=14 - rdma-core >=61.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1085341 - timestamp: 1773100191342 + size: 1082447 + timestamp: 1776110053053 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b md5: 1707cdd636af2ff697b53186572c9f77 diff --git a/cuda_bindings/benchmarks/pixi.toml b/benchmarks/cuda_bindings/pixi.toml similarity index 97% rename from cuda_bindings/benchmarks/pixi.toml rename to benchmarks/cuda_bindings/pixi.toml index a448e8d3e4..dbbddcd939 100644 --- a/cuda_bindings/benchmarks/pixi.toml +++ 
b/benchmarks/cuda_bindings/pixi.toml @@ -45,7 +45,7 @@ pre-commit = "*" cuda-bindings = "==13.1.0" [feature.bindings-source.dependencies] -cuda-bindings = { path = ".." } +cuda-bindings = { path = "../../cuda_bindings" } [environments] wheel = { features = ["cu13", "cu13-pinned", "bench", "cpp-bench", "dev", "bindings-wheel"] } diff --git a/cuda_bindings/benchmarks/pytest-legacy/conftest.py b/benchmarks/cuda_bindings/pytest-legacy/conftest.py similarity index 97% rename from cuda_bindings/benchmarks/pytest-legacy/conftest.py rename to benchmarks/cuda_bindings/pytest-legacy/conftest.py index 0ea7b1d772..5d0cc95e7a 100644 --- a/cuda_bindings/benchmarks/pytest-legacy/conftest.py +++ b/benchmarks/cuda_bindings/pytest-legacy/conftest.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import numpy as np import pytest diff --git a/cuda_bindings/benchmarks/pytest-legacy/kernels.py b/benchmarks/cuda_bindings/pytest-legacy/kernels.py similarity index 97% rename from cuda_bindings/benchmarks/pytest-legacy/kernels.py rename to benchmarks/cuda_bindings/pytest-legacy/kernels.py index 36646fba00..7e741110a3 100644 --- a/cuda_bindings/benchmarks/pytest-legacy/kernels.py +++ b/benchmarks/cuda_bindings/pytest-legacy/kernels.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 kernel_string = """\ #define ITEM_PARAM(x, T) T x diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py similarity index 98% rename from cuda_bindings/benchmarks/pytest-legacy/test_cupy.py rename to benchmarks/cuda_bindings/pytest-legacy/test_cupy.py index 76dd6e6a45..3eea752ce0 100644 --- a/cuda_bindings/benchmarks/pytest-legacy/test_cupy.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_cupy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import ctypes diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py similarity index 99% rename from cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py rename to benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py index dd994081a0..ad421de382 100755 --- a/cuda_bindings/benchmarks/pytest-legacy/test_launch_latency.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_launch_latency.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import ctypes diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py similarity index 95% rename from cuda_bindings/benchmarks/pytest-legacy/test_numba.py rename to benchmarks/cuda_bindings/pytest-legacy/test_numba.py index dfe084c6b1..d9ae0cdfee 100644 --- a/cuda_bindings/benchmarks/pytest-legacy/test_numba.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_numba.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import numpy as np import pytest diff --git a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py similarity index 98% rename from cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py rename to benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py index fae72ffd79..6df32ec511 100644 --- a/cuda_bindings/benchmarks/pytest-legacy/test_pointer_attributes.py +++ b/benchmarks/cuda_bindings/pytest-legacy/test_pointer_attributes.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import random diff --git a/cuda_bindings/benchmarks/run_cpp.py b/benchmarks/cuda_bindings/run_cpp.py similarity index 100% rename from cuda_bindings/benchmarks/run_cpp.py rename to benchmarks/cuda_bindings/run_cpp.py diff --git a/cuda_bindings/benchmarks/run_pyperf.py b/benchmarks/cuda_bindings/run_pyperf.py similarity index 100% rename from cuda_bindings/benchmarks/run_pyperf.py rename to benchmarks/cuda_bindings/run_pyperf.py diff --git a/cuda_bindings/benchmarks/runner/__init__.py b/benchmarks/cuda_bindings/runner/__init__.py similarity index 100% rename from cuda_bindings/benchmarks/runner/__init__.py rename to benchmarks/cuda_bindings/runner/__init__.py diff --git a/cuda_bindings/benchmarks/runner/cpp.py b/benchmarks/cuda_bindings/runner/cpp.py similarity index 100% rename from cuda_bindings/benchmarks/runner/cpp.py rename to benchmarks/cuda_bindings/runner/cpp.py diff --git a/cuda_bindings/benchmarks/runner/main.py b/benchmarks/cuda_bindings/runner/main.py similarity index 98% rename from cuda_bindings/benchmarks/runner/main.py rename to benchmarks/cuda_bindings/runner/main.py index 4089aa5559..b0f6e76f41 100644 --- a/cuda_bindings/benchmarks/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -53,7 +53,7 @@ def _discover_module_functions(module_path: Path) -> list[str]: return [ node.name for node in tree.body - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("bench_") + if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef) and node.name.startswith("bench_") ] diff --git a/cuda_bindings/benchmarks/runner/runtime.py b/benchmarks/cuda_bindings/runner/runtime.py similarity index 100% rename from cuda_bindings/benchmarks/runner/runtime.py rename to benchmarks/cuda_bindings/runner/runtime.py diff --git a/cuda_bindings/benchmarks/tests/test_runner.py b/benchmarks/cuda_bindings/tests/test_runner.py similarity 
index 100% rename from cuda_bindings/benchmarks/tests/test_runner.py rename to benchmarks/cuda_bindings/tests/test_runner.py diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py index ddb6eae107..e6366ac95d 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/nvvm_bitcode.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 import binascii diff --git a/ruff.toml b/ruff.toml index 704e422c19..210f852cd3 100644 --- a/ruff.toml +++ b/ruff.toml @@ -124,13 +124,18 @@ inline-quotes = "double" # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) # Keep examples opted-in to enforce naming conventions in example-local identifiers. -"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ +"cuda_bindings/{cuda,docs,tests}/**" = [ "N801", # invalid-class-name "N802", # invalid-function-name "N803", # invalid-argument-name "N806", # non-lowercase-variable-in-function "N816", # mixed-case-variable-in-global-scope ] +"benchmarks/cuda_bindings/pytest-legacy/**" = [ + "N801", # invalid-class-name + "N802", # invalid-function-name + "N806", # non-lowercase-variable-in-function +] "cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"] # scripts and build tooling — print is the expected output method diff --git a/toolshed/build_static_bitcode_input.py b/toolshed/build_static_bitcode_input.py index 273ce33244..e2400100dd 100755 --- a/toolshed/build_static_bitcode_input.py +++ b/toolshed/build_static_bitcode_input.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 """ Helper to produce static bitcode input for test_nvvm.py. diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py index 6be42282bf..3d52142554 100644 --- a/toolshed/check_spdx.py +++ b/toolshed/check_spdx.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import datetime +import fnmatch import os import re import subprocess @@ -17,12 +18,28 @@ LICENSE_IDENTIFIER_REGEX = re.compile(re.escape(SPDX_LICENSE_IDENTIFIER_PREFIX) + rb"(?P<license_identifier>[^\r\n]+)") -EXPECTED_LICENSE_IDENTIFIERS = ( - ("cuda_bindings/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"), - ("cuda_core/", "Apache-2.0"), - ("cuda_pathfinder/", "Apache-2.0"), - ("cuda_python/", "LicenseRef-NVIDIA-SOFTWARE-LICENSE"), -) +TOP_LEVEL_FILE_LICENSE_IDENTIFIER = "Apache-2.0" + +# Every top-level directory needs to have an entry here, so new paths +# can't slip in without a reviewed license decision. +TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS = { + ".github": "Apache-2.0", + "benchmarks": "Apache-2.0", + "ci": "Apache-2.0", + "cuda_bindings": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", + "cuda_core": "Apache-2.0", + "cuda_pathfinder": "Apache-2.0", + "cuda_python": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", + "cuda_python_test_helpers": "Apache-2.0", + "scripts": "Apache-2.0", + "toolshed": "Apache-2.0", +} + +SPECIAL_CASE_LICENSE_IDENTIFIERS = { + # key: repo-relative path or glob, value: expected SPDX license identifier + "cuda_bindings/benchmarks/*": "Apache-2.0", + "cuda_bindings/benchmarks/pytest-legacy/*": "LicenseRef-NVIDIA-SOFTWARE-LICENSE", +} SPDX_IGNORE_FILENAME = ".spdx-ignore" @@ -63,12 +80,34 @@ def normalize_repo_path(filepath): return PureWindowsPath(filepath).as_posix() +def get_top_level_directory(normalized_path): + if "/" not in normalized_path: + return None + return normalized_path.split("/", 1)[0] + + def get_expected_license_identifier(filepath): normalized_path = normalize_repo_path(filepath) - for prefix, 
license_identifier in EXPECTED_LICENSE_IDENTIFIERS: - if normalized_path.startswith(prefix): - return license_identifier - return None + matching_special_cases = [ + (prefix, license_identifier) + for prefix, license_identifier in SPECIAL_CASE_LICENSE_IDENTIFIERS.items() + if fnmatch.fnmatchcase(normalized_path, prefix) + ] + if matching_special_cases: + return max(matching_special_cases, key=lambda item: len(item[0]))[1], None + + top_level_directory = get_top_level_directory(normalized_path) + if top_level_directory is None: + return TOP_LEVEL_FILE_LICENSE_IDENTIFIER, None + + if top_level_directory not in TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS: + return ( + None, + f"MISSING TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS entry for top-level directory " + f"{top_level_directory!r} required by {filepath!r}", + ) + + return TOP_LEVEL_DIRS_LICENSE_IDENTIFIERS[top_level_directory], None def validate_required_spdx_field(filepath, blob, expected_bytes): @@ -82,10 +121,11 @@ def extract_license_identifier(blob): match = LICENSE_IDENTIFIER_REGEX.search(blob) if match is None: return None - try: - return match.group("license_identifier").decode("ascii") - except UnicodeDecodeError: - return None + license_identifier = match.group("license_identifier").decode("ascii", errors="replace").strip() + for comment_suffix in ("-->", "*/"): + if license_identifier.endswith(comment_suffix): + license_identifier = license_identifier.removesuffix(comment_suffix).rstrip() + return license_identifier or None def validate_license_identifier(filepath, blob): @@ -94,9 +134,10 @@ def validate_license_identifier(filepath, blob): print(f"MISSING valid SPDX license identifier in {filepath!r}") return False - expected_license_identifier = get_expected_license_identifier(filepath) - if expected_license_identifier is None: - return True + expected_license_identifier, configuration_error = get_expected_license_identifier(filepath) + if configuration_error is not None: + print(configuration_error) + return False 
if license_identifier != expected_license_identifier: print( diff --git a/toolshed/dump_cutile_b64.py b/toolshed/dump_cutile_b64.py index 84013ea94b..422bf95232 100644 --- a/toolshed/dump_cutile_b64.py +++ b/toolshed/dump_cutile_b64.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# SPDX-License-Identifier: Apache-2.0 """ Embeds a sample cuTile kernel, executes it with CUDA_TILE_DUMP_BYTECODE=.,