Add CUDA shell example

2026-02-03 18:20:23 +01:00
parent 0775e1ce73
commit 3de9629c92
6 changed files with 705 additions and 0 deletions
--- a/cuda/.gitignore
+++ b/cuda/.gitignore
@@ -0,0 +1 @@
+cudainfo
--- a/cuda/Makefile
+++ b/cuda/Makefile
@@ -0,0 +1,12 @@
+# Host C++ compiler that nvcc hands host code to; override on the command
+# line with e.g. `make HOSTCXX=clang++`.
+HOSTCXX  ?= g++
+NVCC     := nvcc -ccbin $(HOSTCXX)
+# Flags passed to nvcc: 64-bit build, and no warnings about deprecated GPU
+# target architectures.
+CXXFLAGS := -m64 -Wno-deprecated-gpu-targets
+
+# `all` and `clean` name actions, not files. Declaring them phony keeps them
+# working even if a file called `all` or `clean` appears in this directory.
+.PHONY: all clean
+
+# Target rules
+all: cudainfo
+
+# Compile and link the example in one nvcc invocation.
+cudainfo: cudainfo.cpp
+	$(NVCC) $(CXXFLAGS) -o $@ $<
+
+clean:
+	rm -f cudainfo cudainfo.o
--- a/cuda/README.md
+++ b/cuda/README.md
@@ -0,0 +1,4 @@
+# CUDA example
+
+Run `nix develop` to load the environment, then `make` to build the example
+CUDA program. Run it with `./cudainfo` on the fox machine to test it.
--- a/cuda/cudainfo.cpp
+++ b/cuda/cudainfo.cpp
@@ -0,0 +1,600 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
+
+// Shared Utilities (QA Testing)
+
+// std::system includes
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+/**
+ * Returns the symbolic name of a CUDA Runtime error code, for example
+ * "cudaErrorInvalidValue".
+ *
+ * The lookup is delegated to cudaGetErrorName() so the result always matches
+ * the installed CUDA Runtime. A hand-maintained switch over the enumerators
+ * goes stale whenever error codes are added, deprecated or removed.
+ *
+ * @param error  CUDA Runtime status code to describe.
+ * @return       NUL-terminated name of the code. "<unknown>" is returned
+ *               should the runtime hand back a null pointer.
+ */
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+    const char *name = cudaGetErrorName(error);
+
+    return (name != NULL) ? name : "<unknown>";
+}
+#endif
+
+/**
+ * Terminates the program with a diagnostic when a CUDA call reports an error.
+ * Normally reached through the checkCudaErrors() macro, which supplies the
+ * text of the call and its source location.
+ *
+ * @param result  Status returned by the call; any non-zero value is an error.
+ * @param func    Text of the expression that produced `result`.
+ * @param file    Source file of the call site.
+ * @param line    Source line of the call site.
+ */
+template< typename T >
+void check(T result, char const *const func, const char *const file, int const line)
+{
+    if (!result)
+    {
+        return;
+    }
+
+    // The numeric code is printed with %d, so it is converted to int (not
+    // unsigned int) to keep the argument in step with the format specifier.
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+            file, line, static_cast<int>(result), _cudaGetErrorEnum(result), func);
+    // Make sure we call CUDA Device Reset before exiting
+    cudaDeviceReset();
+    exit(EXIT_FAILURE);
+}
+
+int *pArgc = NULL;
+char **pArgv = NULL;
+
+#if CUDART_VERSION < 5000
+
+// CUDA-C includes
+#include <cuda.h>
+
+// This function wraps the CUDA Driver API into a template function
+// Queries a single device attribute through the CUDA Driver API and stores it
+// in *attribute. If the driver call does not return CUDA_SUCCESS, an error is
+// written to stderr, the device is reset and the process exits.
+//   attribute         destination for the queried value
+//   device_attribute  attribute selector forwarded to cuDeviceGetAttribute
+//   device            device argument forwarded to cuDeviceGetAttribute
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
+{
+    const CUresult status = cuDeviceGetAttribute(attribute, device_attribute, device);
+
+    if (status == CUDA_SUCCESS) {
+        return;
+    }
+
+    fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
+            status, __FILE__, __LINE__);
+
+    // Resetting the device makes the driver clean up all state. It is not
+    // mandatory in normal operation, but it is good practice and it lets a
+    // profiler flush its data before the application exits.
+    cudaDeviceReset();
+    exit(EXIT_FAILURE);
+}
+
+#endif /* CUDART_VERSION < 5000 */
+
+// Beginning of GPU Architecture definitions
+/**
+ * Maps a compute capability to the number of CUDA cores per multiprocessor.
+ *
+ * @param major  Major compute capability (the 8 in SM 8.6).
+ * @param minor  Minor compute capability (the 6 in SM 8.6).
+ * @return       Cores per SM for that architecture. If the capability is not
+ *               in the table, a notice is printed to stdout and the value of
+ *               the last table entry is returned so callers can carry on.
+ */
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+    typedef struct {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    // Values follow the table in NVIDIA's cuda-samples helper_cuda.h.
+    static const sSMtoCores nGpuArchCoresPerSM[] = {
+        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+        { 0x53, 128}, // Maxwell Generation (SM 5.3)
+        { 0x60, 64 }, // Pascal Generation (SM 6.0)
+        { 0x61, 128}, // Pascal Generation (SM 6.1)
+        { 0x62, 128}, // Pascal Generation (SM 6.2)
+        { 0x70, 64 }, // Volta Generation (SM 7.0)
+        { 0x72, 64 }, // Volta Generation (SM 7.2)
+        { 0x75, 64 }, // Turing Generation (SM 7.5)
+        { 0x80, 64 }, // Ampere Generation (SM 8.0)
+        { 0x86, 128}, // Ampere Generation (SM 8.6)
+        { 0x87, 128}, // Ampere Generation (SM 8.7)
+        { 0x89, 128}, // Ada Generation (SM 8.9)
+        { 0x90, 128}, // Hopper Generation (SM 9.0)
+        { 0xa0, 128}, // Blackwell Generation (SM 10.0)
+        { 0xa1, 128}, // Blackwell Generation (SM 10.1)
+        { 0xc0, 128}, // Blackwell Generation (SM 12.0)
+        {   -1, -1 }  // End-of-table sentinel
+    };
+
+    // Pack major/minor the same way the table keys are encoded.
+    const int smVersion = (major << 4) + minor;
+    int index = 0;
+
+    for (; nGpuArchCoresPerSM[index].SM != -1; ++index) {
+        if (nGpuArchCoresPerSM[index].SM == smVersion) {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+    }
+
+    // Not found: `index` now sits on the sentinel, so index-1 is the last
+    // real entry. Use it as the default so the program keeps running.
+    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+// Formats a CUDA version number as "major.minor", using the same arithmetic
+// as the per-device report (version/1000 and (version%100)/10).
+static std::string formatCudaVersion(int version)
+{
+    char buffer[32];
+    snprintf(buffer, sizeof(buffer), "%d.%d", version/1000, (version%100)/10);
+    return buffer;
+}
+
+// Selects device `dev`, reads its properties and prints them to stdout.
+// Any failing CUDA call terminates the program through checkCudaErrors().
+//   dev             index of the device to describe
+//   driverVersion   value obtained from cudaDriverGetVersion()
+//   runtimeVersion  value obtained from cudaRuntimeGetVersion()
+static void printDeviceProperties(int dev, int driverVersion, int runtimeVersion)
+{
+    checkCudaErrors(cudaSetDevice(dev));
+
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
+
+    // Console log
+    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
+    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
+
+    printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
+            (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
+
+    const int coresPerSM = ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
+    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
+           deviceProp.multiProcessorCount,
+           coresPerSM,
+           coresPerSM * deviceProp.multiProcessorCount);
+    printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
+
+
+#if CUDART_VERSION >= 5000
+    // This is supported in CUDA 5.0 (runtime API device properties)
+    printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
+    printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);
+
+    if (deviceProp.l2CacheSize) {
+        printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
+    }
+
+#else
+    // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
+    int memoryClock;
+    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
+    printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
+    int memBusWidth;
+    getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+    printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
+    int L2CacheSize;
+    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
+
+    if (L2CacheSize) {
+        printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
+    }
+
+#endif
+
+    printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
+           deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
+           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
+    printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
+           deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
+    printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
+           deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
+
+
+    // These fields are size_t, hence %zu rather than %lu.
+    printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
+    printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
+    printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
+    printf("  Warp size:                                     %d\n", deviceProp.warpSize);
+    printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
+    printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
+    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
+           deviceProp.maxThreadsDim[0],
+           deviceProp.maxThreadsDim[1],
+           deviceProp.maxThreadsDim[2]);
+    printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
+           deviceProp.maxGridSize[0],
+           deviceProp.maxGridSize[1],
+           deviceProp.maxGridSize[2]);
+    printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
+    printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
+    printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
+    printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
+    printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
+    printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
+    printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
+    printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
+#endif
+    printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
+    printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
+
+    static const char *const sComputeMode[] = {
+        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
+        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
+        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
+        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
+        "Unknown"
+    };
+    const int computeModeCount = (int)(sizeof(sComputeMode) / sizeof(sComputeMode[0]));
+
+    // Never index past the table: any value it does not cover is reported
+    // as "Unknown" (the last entry).
+    int computeMode = deviceProp.computeMode;
+
+    if (computeMode < 0 || computeMode >= computeModeCount) {
+        computeMode = computeModeCount - 1;
+    }
+
+    printf("  Compute Mode:\n");
+    printf("     < %s >\n", sComputeMode[computeMode]);
+}
+
+// Prints, for every ordered pair of distinct P2P-capable devices, whether the
+// first can access the second's memory. Prints nothing when fewer than two
+// capable devices are present.
+//   deviceCount  number of devices reported by cudaGetDeviceCount()
+static void printPeerAccess(int deviceCount)
+{
+    // Sized from deviceCount so that any number of devices is handled.
+    std::vector<cudaDeviceProp> prop(deviceCount);
+    std::vector<int> gpuid; // indices of the P2P capable GPUs
+
+    for (int i = 0; i < deviceCount; i++)
+    {
+        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
+
+        // Only boards based on Fermi or later can support P2P
+        bool p2pCapable = (prop[i].major >= 2);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
+        p2pCapable = p2pCapable && prop[i].tccDriver;
+#endif
+
+        if (p2pCapable)
+        {
+            gpuid.push_back(i);
+        }
+    }
+
+    if (gpuid.size() < 2)
+    {
+        return;
+    }
+
+    // Show all the combinations of support P2P GPUs. A device is never
+    // queried against itself.
+    for (size_t i = 0; i < gpuid.size(); i++)
+    {
+        for (size_t j = 0; j < gpuid.size(); j++)
+        {
+            if (i == j)
+            {
+                continue;
+            }
+
+            int can_access_peer = 0;
+            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
+            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
+                   prop[gpuid[j]].name, gpuid[j] ,
+                   can_access_peer ? "Yes" : "No");
+        }
+    }
+}
+
+// Entry point: reports every CUDA device visible to the runtime, the peer
+// access matrix when two or more devices exist, and a one-line summary.
+// Returns 0 on success; exits with EXIT_FAILURE if a CUDA call fails.
+int
+main(int argc, char **argv)
+{
+    pArgc = &argc;
+    pArgv = argv;
+
+    printf("%s Starting...\n\n", argv[0]);
+    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
+
+    int deviceCount = 0;
+    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+    if (error_id != cudaSuccess) {
+        printf("cudaGetDeviceCount failed: %s (%d)\n",
+               cudaGetErrorString(error_id), (int) error_id);
+        printf("Result = FAIL\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // This function call returns 0 if there are no CUDA capable devices.
+    if (deviceCount == 0)
+        printf("There are no available device(s) that support CUDA\n");
+    else
+        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
+
+    // The versions do not depend on the device, so query them once up front.
+    // This also gives the summary line real values when no device is found.
+    int dev, driverVersion = 0, runtimeVersion = 0;
+    checkCudaErrors(cudaDriverGetVersion(&driverVersion));
+    checkCudaErrors(cudaRuntimeGetVersion(&runtimeVersion));
+
+    for (dev = 0; dev < deviceCount; ++dev) {
+        printDeviceProperties(dev, driverVersion, runtimeVersion);
+    }
+
+    // If there are 2 or more GPUs, query to determine whether RDMA is supported
+    if (deviceCount >= 2)
+    {
+        printPeerAccess(deviceCount);
+    }
+
+    // csv masterlog info
+    // *****************************
+    // exe and CUDA driver name
+    printf("\n");
+    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
+    char cTemp[128];
+
+    // driver version
+    sProfileString += ", CUDA Driver Version = ";
+    sProfileString += formatCudaVersion(driverVersion);
+
+    // Runtime version
+    sProfileString += ", CUDA Runtime Version = ";
+    sProfileString += formatCudaVersion(runtimeVersion);
+
+    // Device count
+    sProfileString += ", NumDevs = ";
+    snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
+    sProfileString += cTemp;
+
+    // Print Out all device Names
+    for (dev = 0; dev < deviceCount; ++dev)
+    {
+        // Bounded by the real buffer size, so any device index fits.
+        snprintf(cTemp, sizeof(cTemp), ", Device%d = ", dev);
+        cudaDeviceProp deviceProp;
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+        sProfileString += cTemp;
+        sProfileString += deviceProp.name;
+    }
+
+    sProfileString += "\n";
+    printf("%s", sProfileString.c_str());
+
+    printf("Result = PASS\n");
+
+    // finish
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+    return 0;
+}
--- a/cuda/flake.lock
+++ b/cuda/flake.lock
@@ -0,0 +1,45 @@
+{
+  "nodes": {
+    "jungle": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1770128250,
+        "narHash": "sha256-Kx3EwImhYCp4bLPNWGz4oL4IYVjkCLXwcVmXTY40MBc=",
+        "ref": "refs/heads/master",
+        "rev": "7a6e4232de0e181de97e099e600ffc3a964260e0",
+        "revCount": 1536,
+        "type": "git",
+        "url": "https://jungle.bsc.es/git/rarias/jungle"
+      },
+      "original": {
+        "type": "git",
+        "url": "https://jungle.bsc.es/git/rarias/jungle"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1767634882,
+        "narHash": "sha256-2GffSfQxe3sedHzK+sTKlYo/NTIAGzbFCIsNMUPAAnk=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "3c9db02515ef1d9b6b709fc60ba9a540957f661c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-25.11",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "jungle": "jungle"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
--- a/cuda/flake.nix
+++ b/cuda/flake.nix
@@ -0,0 +1,43 @@
+{
+  inputs.jungle.url = "git+https://jungle.bsc.es/git/rarias/jungle";
+
+  outputs = { self, jungle }:
+    let
+      # Single place for the platform used by both pkgs and the dev shell
+      system = "x86_64-linux";
+
+      # Use the nixpkgs input that jungle itself declares
+      inherit (jungle.inputs) nixpkgs;
+
+      # Example overlay for local customizations, for now empty
+      customOverlay = final: prev: { };
+
+      pkgs = import nixpkgs {
+        inherit system;
+        overlays = [
+          # The jungle overlay provides the BSC custom packages
+          jungle.outputs.bscOverlay
+          # Local changes are applied on top of it
+          customOverlay
+        ];
+        # Needed for CUDA
+        config.allowUnfree = true;
+      };
+    in
+    {
+      devShells.${system}.default = pkgs.mkShell {
+        pname = "cuda-devshell";
+
+        # Packages available inside the shell
+        # (more at https://search.nixos.org/packages)
+        packages = [
+          # Required for nvcc
+          pkgs.cudatoolkit
+          # Required for -lcudart_static
+          (pkgs.lib.getOutput "static" pkgs.cudaPackages.cuda_cudart)
+          pkgs.cudaPackages.libcusparse
+          pkgs.autoAddDriverRunpath
+        ];
+
+        # Build dependencies of the packages listed here are also included;
+        # empty for now
+        inputsFrom = [ ];
+
+        shellHook = ''
+          export CUDA_PATH=${pkgs.cudatoolkit}
+          export LD_LIBRARY_PATH=/var/run/opengl-driver/lib
+          export SMS=50
+        '';
+      };
+    };
+}