5 changed files with 659 additions and 1 deletions
--- a/m/module/nvidia.nix
+++ b/m/module/nvidia.nix
@ -1,4 +1,4 @@
-{ lib, config, ... }:
+{ lib, config, pkgs, ... }:
 {
  # Configure Nvidia driver to use with CUDA
  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
@ -15,4 +15,6 @@
  programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
    config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
  ];
  environment.systemPackages = [ pkgs.cudainfo ];
 }
--- a/pkgs/cudainfo/Makefile
+++ b/pkgs/cudainfo/Makefile
@ -0,0 +1,12 @@
 # Build the cudainfo binary from cudainfo.cpp with nvcc, using HOSTCXX as the
 # host compiler passed to nvcc through -ccbin.
 HOSTCXX  ?= g++
 NVCC     := nvcc -ccbin $(HOSTCXX)
 CXXFLAGS := -m64
 # "all" and "clean" name actions, not files: declare them phony so that a
 # stray file called "all" or "clean" cannot make the rule look up to date.
 .PHONY: all clean
 # Target rules
 all: cudainfo
 cudainfo: cudainfo.cpp
 	$(NVCC) $(CXXFLAGS) -o $@ $<
 clean:
 	rm -f cudainfo cudainfo.o
--- a/pkgs/cudainfo/cudainfo.cpp
+++ b/pkgs/cudainfo/cudainfo.cpp
@ -0,0 +1,600 @@
 /*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
 /* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
 // Shared Utilities (QA Testing)
 // std::system includes
 #include <memory>
 #include <iostream>
 #include <cuda_runtime.h>
 // Wraps a CUDA host call: forwards its result to check() together with the
 // stringified call expression (#val) and the file/line of the call site, so
 // that a failing call is reported with its source location.
 #define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
 // CUDA Runtime error messages
 #ifdef __DRIVER_TYPES_H__
 /*
  * Return the symbolic name of a CUDA runtime error code (e.g. "cudaSuccess").
  *
  * The lookup is delegated to the runtime's cudaGetErrorName() instead of a
  * hand-maintained switch. A fixed switch only knows the error codes that
  * existed when it was written, so codes added by later toolkits would be
  * reported as "<unknown>", and it has to name enumerators that newer
  * toolkits deprecate.
  *
  * For a code the runtime does not recognise, the runtime's own
  * "unrecognized error code" text is returned.
  */
 static const char *_cudaGetErrorEnum(cudaError_t error)
 {
     return cudaGetErrorName(error);
 }
 #endif
 // Abort the program if a CUDA call reported an error.
 //
 //   result      status returned by the call; any non-zero value is a failure
 //   func        text of the call expression (checkCudaErrors passes #val)
 //   file, line  source location of the call site
 //
 // On failure the error is printed to stderr, the device is reset and the
 // process exits with EXIT_FAILURE. On success the function simply returns.
 template< typename T >
 void check(T result, char const *const func, const char *const file, int const line)
 {
     if (!result)
     {
         return;
     }
     // The "code=%d" conversion expects an int, so the status is cast to int.
     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
             file, line, static_cast<int>(result), _cudaGetErrorEnum(result), func);
     // Make sure we call CUDA Device Reset before exiting
     cudaDeviceReset();
     exit(EXIT_FAILURE);
 }
 // Command-line state published by main() when it starts: pArgc is set to the
 // address of main()'s argc and pArgv to main()'s argv. Both are NULL until then.
 int *pArgc = NULL;
 char **pArgv = NULL;
 #if CUDART_VERSION < 5000
 // CUDA-C includes
 #include <cuda.h>
 // This function wraps the CUDA Driver API into a template function
 template <class T>
 inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
 {
    CUresult error =    cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error) {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
 }
 #endif /* CUDART_VERSION < 5000 */
 // Beginning of GPU Architecture definitions
 // Map a compute capability (SM major.minor) to the number of CUDA cores per
 // multiprocessor.
 //
 // Returns the core count of the matching table entry. For a version that is
 // not listed, a notice is printed and the core count of the last table entry
 // is returned so that callers still receive a usable value.
 static inline int ConvertSMVer2Cores(int major, int minor)
 {
     // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
     typedef struct {
         int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
         int Cores;
     } sSMtoCores;
     static const sSMtoCores nGpuArchCoresPerSM[] = {
         { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
         { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
         { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
         { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
         { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
         { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
         { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
         { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
         { 0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
         { 0x60, 64 }, // Pascal Generation (SM 6.0) GP100 class
         { 0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
         { 0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
         { 0x70, 64 }, // Volta Generation (SM 7.0) GV100 class
         { 0x72, 64 }, // Volta Generation (SM 7.2) Xavier class
         { 0x75, 64 }, // Turing Generation (SM 7.5) TU10x class
         { 0x80, 64 }, // Ampere Generation (SM 8.0) GA100 class
         { 0x86, 128}, // Ampere Generation (SM 8.6) GA10x class
         { 0x87, 128}, // Ampere Generation (SM 8.7) Orin class
         { 0x89, 128}, // Ada Generation (SM 8.9) AD10x class
         { 0x90, 128}  // Hopper Generation (SM 9.0) GH100 class
     };
     const int entryCount = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));
     const int smVersion = (major << 4) + minor;
     for (int index = 0; index < entryCount; index++) {
         if (nGpuArchCoresPerSM[index].SM == smVersion) {
             return nGpuArchCoresPerSM[index].Cores;
         }
     }
     // If we don't find the values, we default use the previous one to run properly
     const int fallbackCores = nGpuArchCoresPerSM[entryCount - 1].Cores;
     printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, fallbackCores);
     return fallbackCores;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
 // Entry point: prints the properties of every CUDA device reported by the
 // runtime, then (with two or more devices) which P2P-capable GPUs can access
 // each other, and finally a one-line summary of versions and device names.
 //
 // Returns 0 when the report completes. Exits with EXIT_FAILURE when the
 // device count cannot be obtained or a call wrapped in checkCudaErrors fails.
 int
 main(int argc, char **argv)
 {
     pArgc = &argc;
     pArgv = argv;
     printf("%s Starting...\n\n", argv[0]);
     printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
     int deviceCount = 0;
     cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
     if (error_id != cudaSuccess) {
         printf("cudaGetDeviceCount failed: %s (%d)\n",
                cudaGetErrorString(error_id), (int) error_id);
         printf("Result = FAIL\n");
         exit(EXIT_FAILURE);
     }
     // This function call returns 0 if there are no CUDA capable devices.
     if (deviceCount == 0)
         printf("There are no available device(s) that support CUDA\n");
     else
         printf("Detected %d CUDA Capable device(s)\n", deviceCount);
     int dev, driverVersion = 0, runtimeVersion = 0;
     for (dev = 0; dev < deviceCount; ++dev) {
         // The result of cudaSetDevice is deliberately not checked: the
         // property report below is still printed if selecting the device fails.
         cudaSetDevice(dev);
         cudaDeviceProp deviceProp;
         // Checked like the P2P section below, so that an uninitialized
         // deviceProp is never printed.
         checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
         printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
         // Console log
         cudaDriverGetVersion(&driverVersion);
         cudaRuntimeGetVersion(&runtimeVersion);
         printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                 (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
         printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
                deviceProp.multiProcessorCount,
                ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
                ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
 #if CUDART_VERSION >= 5000
         // This is supported in CUDA 5.0 (runtime API device properties)
         printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
         printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);
         if (deviceProp.l2CacheSize) {
             printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
         }
 #else
         // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
         int memoryClock;
         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
         printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
         int memBusWidth;
         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
         printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
         int L2CacheSize;
         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
         if (L2CacheSize) {
             printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
         }
 #endif
         printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
                deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
                deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
                deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
                deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
         // size_t values are printed as unsigned long long, the same way the
         // global memory size is printed above, so the conversion matches the
         // argument on every platform.
         printf("  Total amount of constant memory:               %llu bytes\n", (unsigned long long) deviceProp.totalConstMem);
         printf("  Total amount of shared memory per block:       %llu bytes\n", (unsigned long long) deviceProp.sharedMemPerBlock);
         printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
         printf("  Warp size:                                     %d\n", deviceProp.warpSize);
         printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
         printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
         printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
                deviceProp.maxThreadsDim[0],
                deviceProp.maxThreadsDim[1],
                deviceProp.maxThreadsDim[2]);
         printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
                deviceProp.maxGridSize[0],
                deviceProp.maxGridSize[1],
                deviceProp.maxGridSize[2]);
         printf("  Maximum memory pitch:                          %llu bytes\n", (unsigned long long) deviceProp.memPitch);
         printf("  Texture alignment:                             %llu bytes\n", (unsigned long long) deviceProp.textureAlignment);
         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
         printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
         printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
         printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
         printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
         printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 #endif
         printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
         const char *sComputeMode[] = {
             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
             "Unknown",
             NULL
         };
         // Number of named modes, excluding the trailing NULL. A compute mode
         // outside the table is reported as "Unknown" (the last named entry)
         // instead of indexing past the array.
         const int namedModes = (int)(sizeof(sComputeMode) / sizeof(sComputeMode[0])) - 1;
         int computeMode = deviceProp.computeMode;
         if (computeMode < 0 || computeMode >= namedModes) {
             computeMode = namedModes - 1;
         }
         printf("  Compute Mode:\n");
         printf("     < %s >\n", sComputeMode[computeMode]);
     }
     // If there are 2 or more GPUs, query to determine whether RDMA is supported
     if (deviceCount >= 2)
     {
         // The tables below have a fixed size; only the first kMaxPeerDevices
         // devices are examined so they can never be indexed out of bounds.
         enum { kMaxPeerDevices = 64 };
         const int peerCandidates = (deviceCount < kMaxPeerDevices) ? deviceCount : kMaxPeerDevices;
         cudaDeviceProp prop[kMaxPeerDevices];
         int gpuid[kMaxPeerDevices]; // device indices of the GPUs that can support P2P
         int gpu_p2p_count = 0;
         for (int i=0; i < peerCandidates; i++)
         {
             checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
             // Only boards based on Fermi or later can support P2P
             if ((prop[i].major >= 2)
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                 // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
                 && prop[i].tccDriver
 #endif
                )
             {
                 // This is an array of P2P capable GPUs
                 gpuid[gpu_p2p_count++] = i;
             }
         }
         // Show every ordered pair of distinct P2P-capable GPUs exactly once.
         // A device is never queried against itself, because checkCudaErrors
         // would abort the whole report if that query returned an error.
         for (int i = 0; i < gpu_p2p_count; i++)
         {
             for (int j = 0; j < gpu_p2p_count; j++)
             {
                 if (i == j)
                 {
                     continue;
                 }
                 int can_access_peer = 0;
                 checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                 printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
                        prop[gpuid[j]].name, gpuid[j] ,
                        can_access_peer ? "Yes" : "No");
             }
         }
     }
     // csv masterlog info
     // *****************************
     // exe and CUDA driver name
     printf("\n");
     std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
     char cTemp[128];
     // driver version
     sProfileString += ", CUDA Driver Version = ";
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
     sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
 #else
     sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
 #endif
     sProfileString +=  cTemp;
     // Runtime version
     sProfileString += ", CUDA Runtime Version = ";
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
     sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
 #else
     sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
 #endif
     sProfileString +=  cTemp;
     // Device count
     sProfileString += ", NumDevs = ";
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
     sprintf_s(cTemp, 10, "%d", deviceCount);
 #else
     sprintf(cTemp, "%d", deviceCount);
 #endif
     sProfileString += cTemp;
     // Print Out all device Names
     for (dev = 0; dev < deviceCount; ++dev)
     {
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
         sprintf_s(cTemp, 13, ", Device%d = ", dev);
 #else
         sprintf(cTemp, ", Device%d = ", dev);
 #endif
         cudaDeviceProp deviceProp;
         checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
         sProfileString += cTemp;
         sProfileString += deviceProp.name;
     }
     sProfileString += "\n";
     printf("%s", sProfileString.c_str());
     printf("Result = PASS\n");
     // finish
     // cudaDeviceReset causes the driver to clean up all state. While
     // not mandatory in normal operation, it is good practice.  It is also
     // needed to ensure correct operation when the application is being
     // profiled. Calling cudaDeviceReset causes all profile data to be
     // flushed before the application exits
     cudaDeviceReset();
     return 0;
 }
--- a/pkgs/cudainfo/default.nix
+++ b/pkgs/cudainfo/default.nix
@ -0,0 +1,43 @@
 # Builds the cudainfo binary from the sources in this directory and exposes
 # passthru.gpuCheck, a separate derivation that runs the binary on a builder
 # advertising the "cuda" system feature.
 {
   stdenv
 , cudatoolkit
 , cudaPackages
 , autoAddDriverRunpath
 , strace
 }:
 stdenv.mkDerivation (finalAttrs: {
   name = "cudainfo";
   src = ./.;
   # Setup hooks belong to the build platform, so the runpath hook is listed
   # with the native build inputs.
   nativeBuildInputs = [
     autoAddDriverRunpath
   ];
   buildInputs = [
     cudatoolkit # Required for nvcc
     cudaPackages.cuda_cudart.static # Required for -lcudart_static
   ];
   installPhase = ''
     mkdir -p $out/bin
     cp -a cudainfo $out/bin
   '';
   passthru.gpuCheck = stdenv.mkDerivation {
     name = "cudainfo-test";
     requiredSystemFeatures = [ "cuda" ];
     dontBuild = true;
     nativeCheckInputs = [
       finalAttrs.finalPackage # The cudainfo package from above
       strace # When it fails, it will show the trace
     ];
     dontUnpack = true;
     doCheck = true;
     checkPhase = ''
       if ! cudainfo; then
         set -x
         cudainfo=$(command -v cudainfo)
         # Diagnostics are best-effort: a failing one must not hide the others.
         ldd $cudainfo || true
         readelf -d $cudainfo || true
         strace -f $cudainfo || true
         set +x
         # Fail explicitly instead of relying on a diagnostic's exit status.
         exit 1
       fi
     '';
     installPhase = "touch $out";
   };
 })
--- a/pkgs/overlay.nix
+++ b/pkgs/overlay.nix
@ -52,4 +52,5 @@ final: prev:
  prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
  meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
  upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
  cudainfo = prev.callPackage ./cudainfo/default.nix { };
 }