WIP: Introduce Intel 2024, 2025, tasycl and oneMath #3
@ -1,4 +1,4 @@
|
|||||||
{ lib, config, ... }:
|
{ lib, config, pkgs, ... }:
|
||||||
{
|
{
|
||||||
# Configure Nvidia driver to use with CUDA
|
# Configure Nvidia driver to use with CUDA
|
||||||
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
||||||
@ -15,4 +15,6 @@
|
|||||||
programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
|
programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
|
||||||
config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
|
config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
|
||||||
];
|
];
|
||||||
|
|
||||||
|
environment.systemPackages = [ pkgs.cudainfo ];
|
||||||
}
|
}
|
||||||
|
|||||||
12
pkgs/cudainfo/Makefile
Normal file
12
pkgs/cudainfo/Makefile
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# Build the cudainfo device-query tool with nvcc.
# HOSTCXX selects the host compiler handed to nvcc via -ccbin and may be
# overridden from the environment or the make command line.
HOSTCXX ?= g++
NVCC := nvcc -ccbin $(HOSTCXX)
CXXFLAGS := -m64

# Target rules
# `all` and `clean` do not produce files with those names; mark them phony so
# a stray file called "all" or "clean" cannot mask the rule.
.PHONY: all clean

all: cudainfo

cudainfo: cudainfo.cpp
	$(NVCC) $(CXXFLAGS) -o $@ $<

clean:
	rm -f cudainfo cudainfo.o
|
||||||
600
pkgs/cudainfo/cudainfo.cpp
Normal file
600
pkgs/cudainfo/cudainfo.cpp
Normal file
@ -0,0 +1,600 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
|
||||||
|
|
||||||
|
// Shared Utilities (QA Testing)
|
||||||
|
|
||||||
|
// std::system includes
|
||||||
|
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>

#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
|
||||||
|
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
|
||||||
|
|
||||||
|
// CUDA Runtime error messages
|
||||||
|
#ifdef __DRIVER_TYPES_H__
|
||||||
|
/**
 * Returns the symbolic enumerator name of a CUDA runtime error code,
 * e.g. "cudaErrorInvalidValue".
 *
 * The lookup is delegated to the runtime's own cudaGetErrorName() instead of
 * a hand-maintained switch: the previous table stopped at the codes added in
 * CUDA 6.5, so any newer error was reported as "<unknown>".
 *
 * @param error  error code returned by a CUDA runtime call
 * @return pointer to a NUL-terminated name string; "<unknown>" if the runtime
 *         hands back a null pointer (defensive, not expected in practice)
 */
static const char *_cudaGetErrorEnum(cudaError_t error)
{
    const char *name = cudaGetErrorName(error);

    return (name != NULL) ? name : "<unknown>";
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template< typename T >
|
||||||
|
void check(T result, char const *const func, const char *const file, int const line)
|
||||||
|
{
|
||||||
|
if (result)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
|
||||||
|
file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
|
||||||
|
cudaDeviceReset();
|
||||||
|
// Make sure we call CUDA Device Reset before exiting
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// File-scope copies of main()'s command-line arguments; main() stores
// &argc and argv here on entry.
int *pArgc = NULL;
char **pArgv = NULL;
|
||||||
|
|
||||||
|
#if CUDART_VERSION < 5000
|
||||||
|
|
||||||
|
// CUDA-C includes
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
// This function wraps the CUDA Driver API into a template function
|
||||||
|
template <class T>
|
||||||
|
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
|
||||||
|
{
|
||||||
|
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
|
||||||
|
|
||||||
|
if (CUDA_SUCCESS != error) {
|
||||||
|
fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
|
||||||
|
error, __FILE__, __LINE__);
|
||||||
|
|
||||||
|
// cudaDeviceReset causes the driver to clean up all state. While
|
||||||
|
// not mandatory in normal operation, it is good practice. It is also
|
||||||
|
// needed to ensure correct operation when the application is being
|
||||||
|
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||||
|
// flushed before the application exits
|
||||||
|
cudaDeviceReset();
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* CUDART_VERSION < 5000 */
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
/**
 * Maps a compute capability (SM version) to the number of CUDA cores per
 * streaming multiprocessor.
 *
 * @param major  compute capability major version
 * @param minor  compute capability minor version
 * @return cores per SM for that architecture. If the version is not in the
 *         table, a notice is printed to stdout and the value of the last
 *         table entry is returned so callers can keep running.
 *
 * The table previously ended at Maxwell (SM 5.2), which made every newer GPU
 * hit the fallback and report 128 cores/SM; it now covers SM 2.0 through 9.0.
 */
static inline int ConvertSMVer2Cores(int major, int minor)
{
    // One entry per known SM version and its cores-per-SM count.
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores;
    } sSMtoCores;

    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x20, 32 },  // Fermi Generation   (SM 2.0) GF100 class
        { 0x21, 48 },  // Fermi Generation   (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation  (SM 3.0) GK10x class
        { 0x32, 192 }, // Kepler Generation  (SM 3.2) GK10x class
        { 0x35, 192 }, // Kepler Generation  (SM 3.5) GK11x class
        { 0x37, 192 }, // Kepler Generation  (SM 3.7) GK21x class
        { 0x50, 128 }, // Maxwell Generation (SM 5.0) GM10x class
        { 0x52, 128 }, // Maxwell Generation (SM 5.2) GM20x class
        { 0x53, 128 }, // Maxwell Generation (SM 5.3)
        { 0x60, 64 },  // Pascal Generation  (SM 6.0)
        { 0x61, 128 }, // Pascal Generation  (SM 6.1)
        { 0x62, 128 }, // Pascal Generation  (SM 6.2)
        { 0x70, 64 },  // Volta Generation   (SM 7.0)
        { 0x72, 64 },  // Xavier             (SM 7.2)
        { 0x75, 64 },  // Turing Generation  (SM 7.5)
        { 0x80, 64 },  // Ampere Generation  (SM 8.0)
        { 0x86, 128 }, // Ampere Generation  (SM 8.6)
        { 0x87, 128 }, // Ampere Generation  (SM 8.7)
        { 0x89, 128 }, // Ada Generation     (SM 8.9)
        { 0x90, 128 }, // Hopper Generation  (SM 9.0)
        { -1, -1 }     // sentinel, must stay last
    };

    const int sm = (major << 4) + minor;
    int index = 0;

    for (; nGpuArchCoresPerSM[index].SM != -1; ++index) {
        if (nGpuArchCoresPerSM[index].SM == sm) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    // Not found: `index` now points at the sentinel, so index-1 is the last
    // real entry; use it as the default so the caller can run properly.
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
    return nGpuArchCoresPerSM[index-1].Cores;
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Program main
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int
|
||||||
|
main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
pArgc = &argc;
|
||||||
|
pArgv = argv;
|
||||||
|
|
||||||
|
printf("%s Starting...\n\n", argv[0]);
|
||||||
|
printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
|
||||||
|
|
||||||
|
int deviceCount = 0;
|
||||||
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||||
|
|
||||||
|
if (error_id != cudaSuccess) {
|
||||||
|
printf("cudaGetDeviceCount failed: %s (%d)\n",
|
||||||
|
cudaGetErrorString(error_id), (int) error_id);
|
||||||
|
printf("Result = FAIL\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function call returns 0 if there are no CUDA capable devices.
|
||||||
|
if (deviceCount == 0)
|
||||||
|
printf("There are no available device(s) that support CUDA\n");
|
||||||
|
else
|
||||||
|
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
|
||||||
|
|
||||||
|
int dev, driverVersion = 0, runtimeVersion = 0;
|
||||||
|
|
||||||
|
for (dev = 0; dev < deviceCount; ++dev) {
|
||||||
|
cudaSetDevice(dev);
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaGetDeviceProperties(&deviceProp, dev);
|
||||||
|
|
||||||
|
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
|
||||||
|
|
||||||
|
// Console log
|
||||||
|
cudaDriverGetVersion(&driverVersion);
|
||||||
|
cudaRuntimeGetVersion(&runtimeVersion);
|
||||||
|
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||||
|
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
|
||||||
|
|
||||||
|
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
|
||||||
|
(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
|
||||||
|
|
||||||
|
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
|
||||||
|
deviceProp.multiProcessorCount,
|
||||||
|
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
||||||
|
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
|
||||||
|
printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
|
||||||
|
|
||||||
|
|
||||||
|
#if CUDART_VERSION >= 5000
|
||||||
|
// This is supported in CUDA 5.0 (runtime API device properties)
|
||||||
|
printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
|
||||||
|
printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
|
||||||
|
|
||||||
|
if (deviceProp.l2CacheSize) {
|
||||||
|
printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
|
||||||
|
int memoryClock;
|
||||||
|
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
|
||||||
|
printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
|
||||||
|
int memBusWidth;
|
||||||
|
getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
|
||||||
|
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
|
||||||
|
int L2CacheSize;
|
||||||
|
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
|
||||||
|
|
||||||
|
if (L2CacheSize) {
|
||||||
|
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
|
||||||
|
deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
|
||||||
|
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
|
||||||
|
printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
|
||||||
|
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
|
||||||
|
printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
|
||||||
|
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
|
||||||
|
|
||||||
|
|
||||||
|
printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
|
||||||
|
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
|
||||||
|
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
|
||||||
|
printf(" Warp size: %d\n", deviceProp.warpSize);
|
||||||
|
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
|
||||||
|
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
|
||||||
|
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
|
||||||
|
deviceProp.maxThreadsDim[0],
|
||||||
|
deviceProp.maxThreadsDim[1],
|
||||||
|
deviceProp.maxThreadsDim[2]);
|
||||||
|
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
|
||||||
|
deviceProp.maxGridSize[0],
|
||||||
|
deviceProp.maxGridSize[1],
|
||||||
|
deviceProp.maxGridSize[2]);
|
||||||
|
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
|
||||||
|
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
|
||||||
|
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
|
||||||
|
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
|
||||||
|
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
|
||||||
|
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
|
||||||
|
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
|
||||||
|
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
|
||||||
|
#endif
|
||||||
|
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
|
||||||
|
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
|
||||||
|
|
||||||
|
const char *sComputeMode[] = {
|
||||||
|
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
|
||||||
|
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
|
||||||
|
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
|
||||||
|
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
|
||||||
|
"Unknown",
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
printf(" Compute Mode:\n");
|
||||||
|
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there are 2 or more GPUs, query to determine whether RDMA is supported
|
||||||
|
if (deviceCount >= 2)
|
||||||
|
{
|
||||||
|
cudaDeviceProp prop[64];
|
||||||
|
int gpuid[64]; // we want to find the first two GPU's that can support P2P
|
||||||
|
int gpu_p2p_count = 0;
|
||||||
|
|
||||||
|
for (int i=0; i < deviceCount; i++)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
|
||||||
|
|
||||||
|
// Only boards based on Fermi or later can support P2P
|
||||||
|
if ((prop[i].major >= 2)
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
// on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to supprot this
|
||||||
|
&& prop[i].tccDriver
|
||||||
|
#endif
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// This is an array of P2P capable GPUs
|
||||||
|
gpuid[gpu_p2p_count++] = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show all the combinations of support P2P GPUs
|
||||||
|
int can_access_peer_0_1, can_access_peer_1_0;
|
||||||
|
|
||||||
|
if (gpu_p2p_count >= 2)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < gpu_p2p_count-1; i++)
|
||||||
|
{
|
||||||
|
for (int j = 1; j < gpu_p2p_count; j++)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j]));
|
||||||
|
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
|
||||||
|
prop[gpuid[j]].name, gpuid[j] ,
|
||||||
|
can_access_peer_0_1 ? "Yes" : "No");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 1; j < gpu_p2p_count; j++)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < gpu_p2p_count-1; i++)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i]));
|
||||||
|
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j],
|
||||||
|
prop[gpuid[i]].name, gpuid[i] ,
|
||||||
|
can_access_peer_1_0 ? "Yes" : "No");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// csv masterlog info
|
||||||
|
// *****************************
|
||||||
|
// exe and CUDA driver name
|
||||||
|
printf("\n");
|
||||||
|
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
|
||||||
|
char cTemp[128];
|
||||||
|
|
||||||
|
// driver version
|
||||||
|
sProfileString += ", CUDA Driver Version = ";
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
||||||
|
#endif
|
||||||
|
sProfileString += cTemp;
|
||||||
|
|
||||||
|
// Runtime version
|
||||||
|
sProfileString += ", CUDA Runtime Version = ";
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||||
|
#endif
|
||||||
|
sProfileString += cTemp;
|
||||||
|
|
||||||
|
// Device count
|
||||||
|
sProfileString += ", NumDevs = ";
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 10, "%d", deviceCount);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, "%d", deviceCount);
|
||||||
|
#endif
|
||||||
|
sProfileString += cTemp;
|
||||||
|
|
||||||
|
// Print Out all device Names
|
||||||
|
for (dev = 0; dev < deviceCount; ++dev)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 13, ", Device%d = ", dev);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, ", Device%d = ", dev);
|
||||||
|
#endif
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaGetDeviceProperties(&deviceProp, dev);
|
||||||
|
sProfileString += cTemp;
|
||||||
|
sProfileString += deviceProp.name;
|
||||||
|
}
|
||||||
|
|
||||||
|
sProfileString += "\n";
|
||||||
|
printf("%s", sProfileString.c_str());
|
||||||
|
|
||||||
|
printf("Result = PASS\n");
|
||||||
|
|
||||||
|
// finish
|
||||||
|
// cudaDeviceReset causes the driver to clean up all state. While
|
||||||
|
// not mandatory in normal operation, it is good practice. It is also
|
||||||
|
// needed to ensure correct operation when the application is being
|
||||||
|
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||||
|
// flushed before the application exits
|
||||||
|
cudaDeviceReset();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
43
pkgs/cudainfo/default.nix
Normal file
43
pkgs/cudainfo/default.nix
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# Package for the cudainfo device-query tool, built from the sources in this
# directory. `passthru.gpuCheck` is a separate derivation that runs the built
# binary; it declares the "cuda" system feature requirement.
{
  stdenv
, cudatoolkit
, cudaPackages
, autoAddDriverRunpath
, strace
}:

stdenv.mkDerivation (finalAttrs: {
  name = "cudainfo";
  src = ./.;
  buildInputs = [
    cudatoolkit # Required for nvcc
    cudaPackages.cuda_cudart.static # Required for -lcudart_static
    autoAddDriverRunpath
  ];
  installPhase = ''
    mkdir -p $out/bin
    cp -a cudainfo $out/bin
  '';
  passthru.gpuCheck = stdenv.mkDerivation {
    name = "cudainfo-test";
    requiredSystemFeatures = [ "cuda" ];
    dontBuild = true;
    nativeCheckInputs = [
      finalAttrs.finalPackage # The cudainfo package from above
      strace # When it fails, it will show the trace
    ];
    dontUnpack = true;
    doCheck = true;
    # Run cudainfo once. If it fails, dump linkage information and a syscall
    # trace, then fail the check explicitly. The diagnostic commands are
    # best-effort (`|| true`) so that one of them failing cannot cut the
    # remaining diagnostics short, and the final `exit 1` makes the result
    # independent of how the diagnostic re-run under strace happens to end.
    checkPhase = ''
      if ! cudainfo; then
        set -x
        cudainfo=$(command -v cudainfo)
        ldd "$cudainfo" || true
        readelf -d "$cudainfo" || true
        strace -f "$cudainfo" || true
        set +x
        exit 1
      fi
    '';
    installPhase = "touch $out";
  };
})
|
||||||
@ -52,4 +52,5 @@ final: prev:
|
|||||||
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
||||||
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
||||||
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
||||||
|
cudainfo = prev.callPackage ./cudainfo/default.nix { };
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user