diff --git a/m/module/nvidia.nix b/m/module/nvidia.nix
index a41b1124..baebc42b 100644
--- a/m/module/nvidia.nix
+++ b/m/module/nvidia.nix
@@ -1,4 +1,4 @@
-{ lib, config, ... }:
+{ lib, config, pkgs, ... }:
 {
   # Configure Nvidia driver to use with CUDA
   hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
@@ -15,4 +15,6 @@
   programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
     config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
   ];
+
+  environment.systemPackages = [ pkgs.cudainfo ];
 }
diff --git a/pkgs/cudainfo/Makefile b/pkgs/cudainfo/Makefile
new file mode 100644
index 00000000..5990ebab
--- /dev/null
+++ b/pkgs/cudainfo/Makefile
@@ -0,0 +1,12 @@
+HOSTCXX ?= g++
+NVCC := nvcc -ccbin $(HOSTCXX)
+CXXFLAGS := -m64
+
+# Target rules
+all: cudainfo
+
+cudainfo: cudainfo.cpp
+	$(NVCC) $(CXXFLAGS) -o $@ $<
+
+clean:
+	rm -f cudainfo cudainfo.o
diff --git a/pkgs/cudainfo/cudainfo.cpp b/pkgs/cudainfo/cudainfo.cpp
new file mode 100644
index 00000000..815500b5
--- /dev/null
+++ b/pkgs/cudainfo/cudainfo.cpp
@@ -0,0 +1,600 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
+
+// Shared Utilities (QA Testing)
+
+// std::system includes
+#include <memory>
+#include <iostream>
+
+#include <cuda_runtime.h>
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+    switch (error)
+    {
+        case cudaSuccess:
+            return "cudaSuccess";
+
+        case cudaErrorMissingConfiguration:
+            return "cudaErrorMissingConfiguration";
+
+        case cudaErrorMemoryAllocation:
+            return "cudaErrorMemoryAllocation";
+
+        case cudaErrorInitializationError:
+            return "cudaErrorInitializationError";
+
+        case cudaErrorLaunchFailure:
+            return "cudaErrorLaunchFailure";
+
+        case cudaErrorPriorLaunchFailure:
+            return "cudaErrorPriorLaunchFailure";
+
+        case cudaErrorLaunchTimeout:
+            return "cudaErrorLaunchTimeout";
+
+        case cudaErrorLaunchOutOfResources:
+            return "cudaErrorLaunchOutOfResources";
+
+        case cudaErrorInvalidDeviceFunction:
+            return "cudaErrorInvalidDeviceFunction";
+
+        case cudaErrorInvalidConfiguration:
+            return "cudaErrorInvalidConfiguration";
+
+        case cudaErrorInvalidDevice:
+            return "cudaErrorInvalidDevice";
+
+        case cudaErrorInvalidValue:
+            return "cudaErrorInvalidValue";
+
+        case cudaErrorInvalidPitchValue:
+            return "cudaErrorInvalidPitchValue";
+
+        case cudaErrorInvalidSymbol:
+            return "cudaErrorInvalidSymbol";
+
+        case cudaErrorMapBufferObjectFailed:
+            return "cudaErrorMapBufferObjectFailed";
+
+        case cudaErrorUnmapBufferObjectFailed:
+            return "cudaErrorUnmapBufferObjectFailed";
+
+        case cudaErrorInvalidHostPointer:
+            return "cudaErrorInvalidHostPointer";
+
+        case cudaErrorInvalidDevicePointer:
+            return "cudaErrorInvalidDevicePointer";
+
+        case cudaErrorInvalidTexture:
+            return "cudaErrorInvalidTexture";
+
+        case cudaErrorInvalidTextureBinding:
+            return "cudaErrorInvalidTextureBinding";
+
+        case cudaErrorInvalidChannelDescriptor:
+            return "cudaErrorInvalidChannelDescriptor";
+
+        case cudaErrorInvalidMemcpyDirection:
+            return "cudaErrorInvalidMemcpyDirection";
+
+        case cudaErrorAddressOfConstant:
+            return "cudaErrorAddressOfConstant";
+
+        case cudaErrorTextureFetchFailed:
+            return "cudaErrorTextureFetchFailed";
+
+        case cudaErrorTextureNotBound:
+            return "cudaErrorTextureNotBound";
+
+        case cudaErrorSynchronizationError:
+            return "cudaErrorSynchronizationError";
+
+        case cudaErrorInvalidFilterSetting:
+            return "cudaErrorInvalidFilterSetting";
+
+        case cudaErrorInvalidNormSetting:
+            return "cudaErrorInvalidNormSetting";
+
+        case cudaErrorMixedDeviceExecution:
+            return "cudaErrorMixedDeviceExecution";
+
+        case cudaErrorCudartUnloading:
+            return "cudaErrorCudartUnloading";
+
+        case cudaErrorUnknown:
+            return "cudaErrorUnknown";
+
+        case cudaErrorNotYetImplemented:
+            return "cudaErrorNotYetImplemented";
+
+        case cudaErrorMemoryValueTooLarge:
+            return "cudaErrorMemoryValueTooLarge";
+
+        case cudaErrorInvalidResourceHandle:
+            return "cudaErrorInvalidResourceHandle";
+
+        case cudaErrorNotReady:
+            return "cudaErrorNotReady";
+
+        case cudaErrorInsufficientDriver:
+            return "cudaErrorInsufficientDriver";
+
+        case cudaErrorSetOnActiveProcess:
+            return "cudaErrorSetOnActiveProcess";
+
+        case cudaErrorInvalidSurface:
+            return "cudaErrorInvalidSurface";
+
+        case cudaErrorNoDevice:
+            return "cudaErrorNoDevice";
+
+        case cudaErrorECCUncorrectable:
+            return "cudaErrorECCUncorrectable";
+
+        case cudaErrorSharedObjectSymbolNotFound:
+            return "cudaErrorSharedObjectSymbolNotFound";
+
+        case cudaErrorSharedObjectInitFailed:
+            return "cudaErrorSharedObjectInitFailed";
+
+        case cudaErrorUnsupportedLimit:
+            return "cudaErrorUnsupportedLimit";
+
+        case cudaErrorDuplicateVariableName:
+            return "cudaErrorDuplicateVariableName";
+
+        case cudaErrorDuplicateTextureName:
+            return "cudaErrorDuplicateTextureName";
+
+        case cudaErrorDuplicateSurfaceName:
+            return "cudaErrorDuplicateSurfaceName";
+
+        case cudaErrorDevicesUnavailable:
+            return "cudaErrorDevicesUnavailable";
+
+        case cudaErrorInvalidKernelImage:
+            return "cudaErrorInvalidKernelImage";
+
+        case cudaErrorNoKernelImageForDevice:
+            return "cudaErrorNoKernelImageForDevice";
+
+        case cudaErrorIncompatibleDriverContext:
+            return "cudaErrorIncompatibleDriverContext";
+
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return "cudaErrorPeerAccessAlreadyEnabled";
+
+        case cudaErrorPeerAccessNotEnabled:
+            return "cudaErrorPeerAccessNotEnabled";
+
+        case cudaErrorDeviceAlreadyInUse:
+            return "cudaErrorDeviceAlreadyInUse";
+
+        case cudaErrorProfilerDisabled:
+            return "cudaErrorProfilerDisabled";
+
+        case cudaErrorProfilerNotInitialized:
+            return "cudaErrorProfilerNotInitialized";
+
+        case cudaErrorProfilerAlreadyStarted:
+            return "cudaErrorProfilerAlreadyStarted";
+
+        case cudaErrorProfilerAlreadyStopped:
+            return "cudaErrorProfilerAlreadyStopped";
+
+        /* Since CUDA 4.0*/
+        case cudaErrorAssert:
+            return "cudaErrorAssert";
+
+        case cudaErrorTooManyPeers:
+            return "cudaErrorTooManyPeers";
+
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return "cudaErrorHostMemoryAlreadyRegistered";
+
+        case cudaErrorHostMemoryNotRegistered:
+            return "cudaErrorHostMemoryNotRegistered";
+
+        /* Since CUDA 5.0 */
+        case cudaErrorOperatingSystem:
+            return "cudaErrorOperatingSystem";
+
+        case cudaErrorPeerAccessUnsupported:
+            return "cudaErrorPeerAccessUnsupported";
+
+        case cudaErrorLaunchMaxDepthExceeded:
+            return "cudaErrorLaunchMaxDepthExceeded";
+
+        case cudaErrorLaunchFileScopedTex:
return "cudaErrorLaunchFileScopedTex"; + + case cudaErrorLaunchFileScopedSurf: + return "cudaErrorLaunchFileScopedSurf"; + + case cudaErrorSyncDepthExceeded: + return "cudaErrorSyncDepthExceeded"; + + case cudaErrorLaunchPendingCountExceeded: + return "cudaErrorLaunchPendingCountExceeded"; + + case cudaErrorNotPermitted: + return "cudaErrorNotPermitted"; + + case cudaErrorNotSupported: + return "cudaErrorNotSupported"; + + /* Since CUDA 6.0 */ + case cudaErrorHardwareStackError: + return "cudaErrorHardwareStackError"; + + case cudaErrorIllegalInstruction: + return "cudaErrorIllegalInstruction"; + + case cudaErrorMisalignedAddress: + return "cudaErrorMisalignedAddress"; + + case cudaErrorInvalidAddressSpace: + return "cudaErrorInvalidAddressSpace"; + + case cudaErrorInvalidPc: + return "cudaErrorInvalidPc"; + + case cudaErrorIllegalAddress: + return "cudaErrorIllegalAddress"; + + /* Since CUDA 6.5*/ + case cudaErrorInvalidPtx: + return "cudaErrorInvalidPtx"; + + case cudaErrorInvalidGraphicsContext: + return "cudaErrorInvalidGraphicsContext"; + + case cudaErrorStartupFailure: + return "cudaErrorStartupFailure"; + + case cudaErrorApiFailureBase: + return "cudaErrorApiFailureBase"; + } + + return ""; +} +#endif + +template< typename T > +void check(T result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", + file, line, static_cast(result), _cudaGetErrorEnum(result), func); + cudaDeviceReset(); + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +int *pArgc = NULL; +char **pArgv = NULL; + +#if CUDART_VERSION < 5000 + +// CUDA-C includes +#include + +// This function wraps the CUDA Driver API into a template function +template +inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) +{ + CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device); + + if (CUDA_SUCCESS != error) { + fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", + error, __FILE__, __LINE__); + + // cudaDeviceReset causes the driver to clean up all state. While + // not mandatory in normal operation, it is good practice. It is also + // needed to ensure correct operation when the application is being + // profiled. 
+        // flushed before the application exits
+        cudaDeviceReset();
+        exit(EXIT_FAILURE);
+    }
+}
+
+#endif /* CUDART_VERSION < 5000 */
+
+// Beginning of GPU Architecture definitions
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
+    typedef struct {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] = {
+        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192},  // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192},  // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192},  // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192},  // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128},  // Maxwell Generation (SM 5.0) GM10x class
+        { 0x52, 128},  // Maxwell Generation (SM 5.2) GM20x class
+        { -1, -1 }
+    };
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1) {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, default to the previous entry so the query can still run
+    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int
+main(int argc, char **argv)
+{
+    pArgc = &argc;
+    pArgv = argv;
+
+    printf("%s Starting...\n\n", argv[0]);
+    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
+
+    int deviceCount = 0;
+    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+    if (error_id != cudaSuccess) {
+        printf("cudaGetDeviceCount failed: %s (%d)\n",
+               cudaGetErrorString(error_id), (int) error_id);
+        printf("Result = FAIL\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // This function call returns 0 if there are no CUDA capable devices.
+    if (deviceCount == 0)
+        printf("There are no available device(s) that support CUDA\n");
+    else
+        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
+
+    int dev, driverVersion = 0, runtimeVersion = 0;
+
+    for (dev = 0; dev < deviceCount; ++dev) {
+        cudaSetDevice(dev);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+
+        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
+
+        // Console log
+        cudaDriverGetVersion(&driverVersion);
+        cudaRuntimeGetVersion(&runtimeVersion);
+        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
+        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
+
+        printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
+               (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
+
+        printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
+               deviceProp.multiProcessorCount,
+               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
+               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
+        printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
+
+
+#if CUDART_VERSION >= 5000
+        // This is supported in CUDA 5.0 (runtime API device properties)
+        printf(" Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
+        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
+
+        if (deviceProp.l2CacheSize) {
+            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
+        }
+
+#else
+        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
+        int memoryClock;
+        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
+        printf(" Memory Clock rate: %.0f MHz\n", memoryClock * 1e-3f);
+        int memBusWidth;
+        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
+        int L2CacheSize;
+        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
+
+        if (L2CacheSize) {
+            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
+        }
+
+#endif
+
+        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
+               deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
+               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
+        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
+               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
+        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
+               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
+
+
+        printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
+        printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
+        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
+        printf(" Warp size: %d\n", deviceProp.warpSize);
+        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
+        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
+        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
+               deviceProp.maxThreadsDim[0],
+               deviceProp.maxThreadsDim[1],
+               deviceProp.maxThreadsDim[2]);
+        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
+               deviceProp.maxGridSize[0],
+               deviceProp.maxGridSize[1],
+               deviceProp.maxGridSize[2]);
+        printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
+        printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
+        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
+        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
+        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
+        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
+        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
+        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
+#endif
+        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
+        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
+
+        const char *sComputeMode[] = {
+            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
+            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
+            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
+            "Exclusive Process (many threads in one process are able to use ::cudaSetDevice() with this device)",
+            "Unknown",
+            NULL
+        };
+        printf(" Compute Mode:\n");
+        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
+    }
+
+    // If there are 2 or more GPUs, query to determine whether RDMA is supported
+    if (deviceCount >= 2)
+    {
+        cudaDeviceProp prop[64];
+        int gpuid[64]; // we want to find the first two GPUs that can support P2P
+        int gpu_p2p_count = 0;
+
+        for (int i=0; i < deviceCount; i++)
+        {
+            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
+
+            // Only boards based on Fermi or later can support P2P
+            if ((prop[i].major >= 2)
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+                // on Windows (64-bit), the Tesla Compute Cluster driver for Windows must be enabled to support this
+                && prop[i].tccDriver
+#endif
+               )
+            {
+                // This is an array of P2P capable GPUs
+                gpuid[gpu_p2p_count++] = i;
+            }
+        }
+
+        // Show all the combinations of P2P-capable GPUs
+        int can_access_peer_0_1, can_access_peer_1_0;
+
+        if (gpu_p2p_count >= 2)
+        {
+            for (int i = 0; i < gpu_p2p_count-1; i++)
+            {
+                for (int j = 1; j < gpu_p2p_count; j++)
+                {
+                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j]));
+                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
+                           prop[gpuid[j]].name, gpuid[j],
+                           can_access_peer_0_1 ? "Yes" : "No");
+                }
+            }
+
+            for (int j = 1; j < gpu_p2p_count; j++)
+            {
+                for (int i = 0; i < gpu_p2p_count-1; i++)
+                {
+                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i]));
+                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j],
+                           prop[gpuid[i]].name, gpuid[i],
+                           can_access_peer_1_0 ? "Yes" : "No");
+                }
+            }
+        }
+    }
+
+    // csv masterlog info
+    // *****************************
+    // exe and CUDA driver name
+    printf("\n");
+    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
+    char cTemp[128];
+
+    // driver version
+    sProfileString += ", CUDA Driver Version = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
+#else
+    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
+#endif
+    sProfileString += cTemp;
+
+    // Runtime version
+    sProfileString += ", CUDA Runtime Version = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
+#else
+    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
+#endif
+    sProfileString += cTemp;
+
+    // Device count
+    sProfileString += ", NumDevs = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(cTemp, 10, "%d", deviceCount);
+#else
+    sprintf(cTemp, "%d", deviceCount);
+#endif
+    sProfileString += cTemp;
+
+    // Print Out all device Names
+    for (dev = 0; dev < deviceCount; ++dev)
+    {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        sprintf_s(cTemp, 13, ", Device%d = ", dev);
+#else
+        sprintf(cTemp, ", Device%d = ", dev);
+#endif
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+        sProfileString += cTemp;
+        sProfileString += deviceProp.name;
+    }
+
+    sProfileString += "\n";
+    printf("%s", sProfileString.c_str());
+
+    printf("Result = PASS\n");
+
+    // finish
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice. It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+    return 0;
+}
diff --git a/pkgs/cudainfo/default.nix b/pkgs/cudainfo/default.nix
new file mode 100644
index 00000000..871d6978
--- /dev/null
+++ b/pkgs/cudainfo/default.nix
@@ -0,0 +1,43 @@
+{
+  stdenv
+, cudatoolkit
+, cudaPackages
+, autoAddDriverRunpath
+, strace
+}:
+
+stdenv.mkDerivation (finalAttrs: {
+  name = "cudainfo";
+  src = ./.;
+  buildInputs = [
+    cudatoolkit # Required for nvcc
+    cudaPackages.cuda_cudart.static # Required for -lcudart_static
+    autoAddDriverRunpath
+  ];
+  installPhase = ''
+    mkdir -p $out/bin
+    cp -a cudainfo $out/bin
+  '';
+  passthru.gpuCheck = stdenv.mkDerivation {
+    name = "cudainfo-test";
+    requiredSystemFeatures = [ "cuda" ];
+    dontBuild = true;
+    nativeCheckInputs = [
+      finalAttrs.finalPackage # The cudainfo package from above
+      strace # When it fails, it will show the trace
+    ];
+    dontUnpack = true;
+    doCheck = true;
+    checkPhase = ''
+      if ! cudainfo; then
+        set -x
+        cudainfo=$(command -v cudainfo)
+        ldd $cudainfo
+        readelf -d $cudainfo
+        strace -f $cudainfo
+        set +x
+      fi
+    '';
+    installPhase = "touch $out";
+  };
+})
diff --git a/pkgs/overlay.nix b/pkgs/overlay.nix
index 7db9941a..9d3c9601 100644
--- a/pkgs/overlay.nix
+++ b/pkgs/overlay.nix
@@ -52,4 +52,5 @@ final: prev:
   prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
   meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
   upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
+  cudainfo = prev.callPackage ./cudainfo/default.nix { };
 }