forked from rarias/jungle
		
	The cudainfo program checks that we can initialize the CUDA runtime library and communicate with the driver. It can be used as a standalone program, or built via cudainfo.gpuCheck so that it is executed inside the build sandbox to verify that it also works there. It uses the autoAddDriverRunpath hook to inject the CUDA library directory into the runpath. Reviewed-by: Aleix Boné <abonerib@bsc.es>
		
			
				
	
	
		
			601 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			601 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 | |
|  *
 | |
|  * Please refer to the NVIDIA end user license agreement (EULA) associated
 | |
|  * with this source code for terms and conditions that govern your use of
 | |
|  * this software. Any use, reproduction, disclosure, or distribution of
 | |
|  * this software and related documentation outside the terms of the EULA
 | |
|  * is strictly prohibited.
 | |
|  *
 | |
|  */
 | |
| /* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
 | |
| 
 | |
| // Shared Utilities (QA Testing)
 | |
| 
 | |
| // std::system includes
 | |
| #include <memory>
 | |
| #include <iostream>
 | |
| 
 | |
| #include <cuda_runtime.h>
 | |
| 
 | |
| // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
 | |
| #define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
 | |
| 
 | |
| // CUDA Runtime error messages
 | |
| #ifdef __DRIVER_TYPES_H__
 | |
| static const char *_cudaGetErrorEnum(cudaError_t error)
 | |
| {
 | |
|     switch (error)
 | |
|     {
 | |
|         case cudaSuccess:
 | |
|             return "cudaSuccess";
 | |
| 
 | |
|         case cudaErrorMissingConfiguration:
 | |
|             return "cudaErrorMissingConfiguration";
 | |
| 
 | |
|         case cudaErrorMemoryAllocation:
 | |
|             return "cudaErrorMemoryAllocation";
 | |
| 
 | |
|         case cudaErrorInitializationError:
 | |
|             return "cudaErrorInitializationError";
 | |
| 
 | |
|         case cudaErrorLaunchFailure:
 | |
|             return "cudaErrorLaunchFailure";
 | |
| 
 | |
|         case cudaErrorPriorLaunchFailure:
 | |
|             return "cudaErrorPriorLaunchFailure";
 | |
| 
 | |
|         case cudaErrorLaunchTimeout:
 | |
|             return "cudaErrorLaunchTimeout";
 | |
| 
 | |
|         case cudaErrorLaunchOutOfResources:
 | |
|             return "cudaErrorLaunchOutOfResources";
 | |
| 
 | |
|         case cudaErrorInvalidDeviceFunction:
 | |
|             return "cudaErrorInvalidDeviceFunction";
 | |
| 
 | |
|         case cudaErrorInvalidConfiguration:
 | |
|             return "cudaErrorInvalidConfiguration";
 | |
| 
 | |
|         case cudaErrorInvalidDevice:
 | |
|             return "cudaErrorInvalidDevice";
 | |
| 
 | |
|         case cudaErrorInvalidValue:
 | |
|             return "cudaErrorInvalidValue";
 | |
| 
 | |
|         case cudaErrorInvalidPitchValue:
 | |
|             return "cudaErrorInvalidPitchValue";
 | |
| 
 | |
|         case cudaErrorInvalidSymbol:
 | |
|             return "cudaErrorInvalidSymbol";
 | |
| 
 | |
|         case cudaErrorMapBufferObjectFailed:
 | |
|             return "cudaErrorMapBufferObjectFailed";
 | |
| 
 | |
|         case cudaErrorUnmapBufferObjectFailed:
 | |
|             return "cudaErrorUnmapBufferObjectFailed";
 | |
| 
 | |
|         case cudaErrorInvalidHostPointer:
 | |
|             return "cudaErrorInvalidHostPointer";
 | |
| 
 | |
|         case cudaErrorInvalidDevicePointer:
 | |
|             return "cudaErrorInvalidDevicePointer";
 | |
| 
 | |
|         case cudaErrorInvalidTexture:
 | |
|             return "cudaErrorInvalidTexture";
 | |
| 
 | |
|         case cudaErrorInvalidTextureBinding:
 | |
|             return "cudaErrorInvalidTextureBinding";
 | |
| 
 | |
|         case cudaErrorInvalidChannelDescriptor:
 | |
|             return "cudaErrorInvalidChannelDescriptor";
 | |
| 
 | |
|         case cudaErrorInvalidMemcpyDirection:
 | |
|             return "cudaErrorInvalidMemcpyDirection";
 | |
| 
 | |
|         case cudaErrorAddressOfConstant:
 | |
|             return "cudaErrorAddressOfConstant";
 | |
| 
 | |
|         case cudaErrorTextureFetchFailed:
 | |
|             return "cudaErrorTextureFetchFailed";
 | |
| 
 | |
|         case cudaErrorTextureNotBound:
 | |
|             return "cudaErrorTextureNotBound";
 | |
| 
 | |
|         case cudaErrorSynchronizationError:
 | |
|             return "cudaErrorSynchronizationError";
 | |
| 
 | |
|         case cudaErrorInvalidFilterSetting:
 | |
|             return "cudaErrorInvalidFilterSetting";
 | |
| 
 | |
|         case cudaErrorInvalidNormSetting:
 | |
|             return "cudaErrorInvalidNormSetting";
 | |
| 
 | |
|         case cudaErrorMixedDeviceExecution:
 | |
|             return "cudaErrorMixedDeviceExecution";
 | |
| 
 | |
|         case cudaErrorCudartUnloading:
 | |
|             return "cudaErrorCudartUnloading";
 | |
| 
 | |
|         case cudaErrorUnknown:
 | |
|             return "cudaErrorUnknown";
 | |
| 
 | |
|         case cudaErrorNotYetImplemented:
 | |
|             return "cudaErrorNotYetImplemented";
 | |
| 
 | |
|         case cudaErrorMemoryValueTooLarge:
 | |
|             return "cudaErrorMemoryValueTooLarge";
 | |
| 
 | |
|         case cudaErrorInvalidResourceHandle:
 | |
|             return "cudaErrorInvalidResourceHandle";
 | |
| 
 | |
|         case cudaErrorNotReady:
 | |
|             return "cudaErrorNotReady";
 | |
| 
 | |
|         case cudaErrorInsufficientDriver:
 | |
|             return "cudaErrorInsufficientDriver";
 | |
| 
 | |
|         case cudaErrorSetOnActiveProcess:
 | |
|             return "cudaErrorSetOnActiveProcess";
 | |
| 
 | |
|         case cudaErrorInvalidSurface:
 | |
|             return "cudaErrorInvalidSurface";
 | |
| 
 | |
|         case cudaErrorNoDevice:
 | |
|             return "cudaErrorNoDevice";
 | |
| 
 | |
|         case cudaErrorECCUncorrectable:
 | |
|             return "cudaErrorECCUncorrectable";
 | |
| 
 | |
|         case cudaErrorSharedObjectSymbolNotFound:
 | |
|             return "cudaErrorSharedObjectSymbolNotFound";
 | |
| 
 | |
|         case cudaErrorSharedObjectInitFailed:
 | |
|             return "cudaErrorSharedObjectInitFailed";
 | |
| 
 | |
|         case cudaErrorUnsupportedLimit:
 | |
|             return "cudaErrorUnsupportedLimit";
 | |
| 
 | |
|         case cudaErrorDuplicateVariableName:
 | |
|             return "cudaErrorDuplicateVariableName";
 | |
| 
 | |
|         case cudaErrorDuplicateTextureName:
 | |
|             return "cudaErrorDuplicateTextureName";
 | |
| 
 | |
|         case cudaErrorDuplicateSurfaceName:
 | |
|             return "cudaErrorDuplicateSurfaceName";
 | |
| 
 | |
|         case cudaErrorDevicesUnavailable:
 | |
|             return "cudaErrorDevicesUnavailable";
 | |
| 
 | |
|         case cudaErrorInvalidKernelImage:
 | |
|             return "cudaErrorInvalidKernelImage";
 | |
| 
 | |
|         case cudaErrorNoKernelImageForDevice:
 | |
|             return "cudaErrorNoKernelImageForDevice";
 | |
| 
 | |
|         case cudaErrorIncompatibleDriverContext:
 | |
|             return "cudaErrorIncompatibleDriverContext";
 | |
| 
 | |
|         case cudaErrorPeerAccessAlreadyEnabled:
 | |
|             return "cudaErrorPeerAccessAlreadyEnabled";
 | |
| 
 | |
|         case cudaErrorPeerAccessNotEnabled:
 | |
|             return "cudaErrorPeerAccessNotEnabled";
 | |
| 
 | |
|         case cudaErrorDeviceAlreadyInUse:
 | |
|             return "cudaErrorDeviceAlreadyInUse";
 | |
| 
 | |
|         case cudaErrorProfilerDisabled:
 | |
|             return "cudaErrorProfilerDisabled";
 | |
| 
 | |
|         case cudaErrorProfilerNotInitialized:
 | |
|             return "cudaErrorProfilerNotInitialized";
 | |
| 
 | |
|         case cudaErrorProfilerAlreadyStarted:
 | |
|             return "cudaErrorProfilerAlreadyStarted";
 | |
| 
 | |
|         case cudaErrorProfilerAlreadyStopped:
 | |
|             return "cudaErrorProfilerAlreadyStopped";
 | |
| 
 | |
|         /* Since CUDA 4.0*/
 | |
|         case cudaErrorAssert:
 | |
|             return "cudaErrorAssert";
 | |
| 
 | |
|         case cudaErrorTooManyPeers:
 | |
|             return "cudaErrorTooManyPeers";
 | |
| 
 | |
|         case cudaErrorHostMemoryAlreadyRegistered:
 | |
|             return "cudaErrorHostMemoryAlreadyRegistered";
 | |
| 
 | |
|         case cudaErrorHostMemoryNotRegistered:
 | |
|             return "cudaErrorHostMemoryNotRegistered";
 | |
| 
 | |
|         /* Since CUDA 5.0 */
 | |
|         case cudaErrorOperatingSystem:
 | |
|             return "cudaErrorOperatingSystem";
 | |
| 
 | |
|         case cudaErrorPeerAccessUnsupported:
 | |
|             return "cudaErrorPeerAccessUnsupported";
 | |
| 
 | |
|         case cudaErrorLaunchMaxDepthExceeded:
 | |
|             return "cudaErrorLaunchMaxDepthExceeded";
 | |
| 
 | |
|         case cudaErrorLaunchFileScopedTex:
 | |
|             return "cudaErrorLaunchFileScopedTex";
 | |
| 
 | |
|         case cudaErrorLaunchFileScopedSurf:
 | |
|             return "cudaErrorLaunchFileScopedSurf";
 | |
| 
 | |
|         case cudaErrorSyncDepthExceeded:
 | |
|             return "cudaErrorSyncDepthExceeded";
 | |
| 
 | |
|         case cudaErrorLaunchPendingCountExceeded:
 | |
|             return "cudaErrorLaunchPendingCountExceeded";
 | |
| 
 | |
|         case cudaErrorNotPermitted:
 | |
|             return "cudaErrorNotPermitted";
 | |
| 
 | |
|         case cudaErrorNotSupported:
 | |
|             return "cudaErrorNotSupported";
 | |
| 
 | |
|         /* Since CUDA 6.0 */
 | |
|         case cudaErrorHardwareStackError:
 | |
|             return "cudaErrorHardwareStackError";
 | |
| 
 | |
|         case cudaErrorIllegalInstruction:
 | |
|             return "cudaErrorIllegalInstruction";
 | |
| 
 | |
|         case cudaErrorMisalignedAddress:
 | |
|             return "cudaErrorMisalignedAddress";
 | |
| 
 | |
|         case cudaErrorInvalidAddressSpace:
 | |
|             return "cudaErrorInvalidAddressSpace";
 | |
| 
 | |
|         case cudaErrorInvalidPc:
 | |
|             return "cudaErrorInvalidPc";
 | |
| 
 | |
|         case cudaErrorIllegalAddress:
 | |
|             return "cudaErrorIllegalAddress";
 | |
| 
 | |
|         /* Since CUDA 6.5*/
 | |
|         case cudaErrorInvalidPtx:
 | |
|             return "cudaErrorInvalidPtx";
 | |
| 
 | |
|         case cudaErrorInvalidGraphicsContext:
 | |
|             return "cudaErrorInvalidGraphicsContext";
 | |
| 
 | |
|         case cudaErrorStartupFailure:
 | |
|             return "cudaErrorStartupFailure";
 | |
| 
 | |
|         case cudaErrorApiFailureBase:
 | |
|             return "cudaErrorApiFailureBase";
 | |
|     }
 | |
| 
 | |
|     return "<unknown>";
 | |
| }
 | |
| #endif
 | |
| 
 | |
| template< typename T >
 | |
| void check(T result, char const *const func, const char *const file, int const line)
 | |
| {
 | |
|     if (result)
 | |
|     {
 | |
|         fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
 | |
|                 file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
 | |
|         cudaDeviceReset();
 | |
|         // Make sure we call CUDA Device Reset before exiting
 | |
|         exit(EXIT_FAILURE);
 | |
|     }
 | |
| }
 | |
| 
 | |
// File-scope handles to main()'s argument count and argument vector.
// Both start out NULL and are assigned at the top of main().
int   *pArgc = NULL;
char **pArgv = NULL;
 | |
| 
 | |
| #if CUDART_VERSION < 5000
 | |
| 
 | |
| // CUDA-C includes
 | |
| #include <cuda.h>
 | |
| 
 | |
| // This function wraps the CUDA Driver API into a template function
 | |
| template <class T>
 | |
| inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
 | |
| {
 | |
|     CUresult error =    cuDeviceGetAttribute(attribute, device_attribute, device);
 | |
| 
 | |
|     if (CUDA_SUCCESS != error) {
 | |
|         fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
 | |
|                 error, __FILE__, __LINE__);
 | |
| 
 | |
|         // cudaDeviceReset causes the driver to clean up all state. While
 | |
|         // not mandatory in normal operation, it is good practice.  It is also
 | |
|         // needed to ensure correct operation when the application is being
 | |
|         // profiled. Calling cudaDeviceReset causes all profile data to be
 | |
|         // flushed before the application exits
 | |
|         cudaDeviceReset();
 | |
|         exit(EXIT_FAILURE);
 | |
|     }
 | |
| }
 | |
| 
 | |
| #endif /* CUDART_VERSION < 5000 */
 | |
| 
 | |
| // Beginning of GPU Architecture definitions
 | |
// Maps a compute capability (SM `major`.`minor`) to the number of CUDA cores
// per streaming multiprocessor.
//
// The lookup table covers Fermi through Hopper. The previous table stopped at
// Maxwell (SM 5.2), so every newer GPU hit the fallback path and was reported
// with 128 cores/SM, which is wrong for e.g. SM 6.0, 7.x and 8.0 (64 cores/SM).
//
// major, minor: compute-capability version numbers of the device.
// Returns the cores-per-SM for that version. For a version that is not in the
// table a notice is printed to stdout and the value of the last (newest)
// table entry is returned so that callers can keep running.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row: SM version -> cores per SM.
    typedef struct {
        int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores;
    } sSMtoCores;

    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
        { 0x53, 128}, // Maxwell Generation (SM 5.3)
        { 0x60, 64 }, // Pascal Generation (SM 6.0)
        { 0x61, 128}, // Pascal Generation (SM 6.1)
        { 0x62, 128}, // Pascal Generation (SM 6.2)
        { 0x70, 64 }, // Volta Generation (SM 7.0)
        { 0x72, 64 }, // Volta Generation (SM 7.2)
        { 0x75, 64 }, // Turing Generation (SM 7.5)
        { 0x80, 64 }, // Ampere Generation (SM 8.0)
        { 0x86, 128}, // Ampere Generation (SM 8.6)
        { 0x87, 128}, // Ampere Generation (SM 8.7)
        { 0x89, 128}, // Ada Lovelace Generation (SM 8.9)
        { 0x90, 128}  // Hopper Generation (SM 9.0)
    };

    const int tableSize = static_cast<int>(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));
    const int smVersion = (major << 4) + minor;

    for (int index = 0; index < tableSize; ++index) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    // Unknown architecture: fall back to the newest entry we know about so the
    // caller still gets a usable number.
    const int fallbackCores = nGpuArchCoresPerSM[tableSize - 1].Cores;
    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, fallbackCores);
    return fallbackCores;
}
 | |
| 
 | |
| ////////////////////////////////////////////////////////////////////////////////
 | |
| // Program main
 | |
| ////////////////////////////////////////////////////////////////////////////////
 | |
| int
 | |
| main(int argc, char **argv)
 | |
| {
 | |
|     pArgc = &argc;
 | |
|     pArgv = argv;
 | |
| 
 | |
|     printf("%s Starting...\n\n", argv[0]);
 | |
|     printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
 | |
| 
 | |
|     int deviceCount = 0;
 | |
|     cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
 | |
| 
 | |
|     if (error_id != cudaSuccess) {
 | |
|         printf("cudaGetDeviceCount failed: %s (%d)\n",
 | |
| 			cudaGetErrorString(error_id), (int) error_id);
 | |
|         printf("Result = FAIL\n");
 | |
|         exit(EXIT_FAILURE);
 | |
|     }
 | |
| 
 | |
|     // This function call returns 0 if there are no CUDA capable devices.
 | |
|     if (deviceCount == 0)
 | |
|         printf("There are no available device(s) that support CUDA\n");
 | |
|     else
 | |
|         printf("Detected %d CUDA Capable device(s)\n", deviceCount);
 | |
| 
 | |
|     int dev, driverVersion = 0, runtimeVersion = 0;
 | |
| 
 | |
|     for (dev = 0; dev < deviceCount; ++dev) {
 | |
|         cudaSetDevice(dev);
 | |
|         cudaDeviceProp deviceProp;
 | |
|         cudaGetDeviceProperties(&deviceProp, dev);
 | |
| 
 | |
|         printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
 | |
| 
 | |
|         // Console log
 | |
|         cudaDriverGetVersion(&driverVersion);
 | |
|         cudaRuntimeGetVersion(&runtimeVersion);
 | |
|         printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
 | |
|         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
 | |
| 
 | |
|         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
 | |
|                 (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
 | |
| 
 | |
|         printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
 | |
|                deviceProp.multiProcessorCount,
 | |
|                ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
 | |
|                ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
 | |
|         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
 | |
| 
 | |
| 
 | |
| #if CUDART_VERSION >= 5000
 | |
|         // This is supported in CUDA 5.0 (runtime API device properties)
 | |
|         printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
 | |
|         printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);
 | |
| 
 | |
|         if (deviceProp.l2CacheSize) {
 | |
|             printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
 | |
|         }
 | |
| 
 | |
| #else
 | |
|         // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
 | |
|         int memoryClock;
 | |
|         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 | |
|         printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 | |
|         int memBusWidth;
 | |
|         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | |
|         printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 | |
|         int L2CacheSize;
 | |
|         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | |
| 
 | |
|         if (L2CacheSize) {
 | |
|             printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
 | |
|         }
 | |
| 
 | |
| #endif
 | |
| 
 | |
|         printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
 | |
|                deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
 | |
|                deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
 | |
|         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
 | |
|                deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
 | |
|         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
 | |
|                deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
 | |
| 
 | |
| 
 | |
|         printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
 | |
|         printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
 | |
|         printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
 | |
|         printf("  Warp size:                                     %d\n", deviceProp.warpSize);
 | |
|         printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
 | |
|         printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
 | |
|         printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
 | |
|                deviceProp.maxThreadsDim[0],
 | |
|                deviceProp.maxThreadsDim[1],
 | |
|                deviceProp.maxThreadsDim[2]);
 | |
|         printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
 | |
|                deviceProp.maxGridSize[0],
 | |
|                deviceProp.maxGridSize[1],
 | |
|                deviceProp.maxGridSize[2]);
 | |
|         printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
 | |
|         printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
 | |
|         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
 | |
|         printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
 | |
|         printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
 | |
|         printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
 | |
|         printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
 | |
|         printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
 | |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | |
|         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 | |
| #endif
 | |
|         printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
 | |
|         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
 | |
| 
 | |
|         const char *sComputeMode[] = {
 | |
|             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
 | |
|             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
 | |
|             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
 | |
|             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
 | |
|             "Unknown",
 | |
|             NULL
 | |
|         };
 | |
|         printf("  Compute Mode:\n");
 | |
|         printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
 | |
|     }
 | |
| 
 | |
|     // If there are 2 or more GPUs, query to determine whether RDMA is supported
 | |
|     if (deviceCount >= 2)
 | |
|     {
 | |
|         cudaDeviceProp prop[64];
 | |
|         int gpuid[64]; // we want to find the first two GPU's that can support P2P
 | |
|         int gpu_p2p_count = 0;
 | |
| 
 | |
|         for (int i=0; i < deviceCount; i++)
 | |
|         {
 | |
|             checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
 | |
| 
 | |
|             // Only boards based on Fermi or later can support P2P
 | |
|             if ((prop[i].major >= 2)
 | |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | |
|                 // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to supprot this
 | |
|                 && prop[i].tccDriver
 | |
| #endif
 | |
|                )
 | |
|             {
 | |
|                 // This is an array of P2P capable GPUs
 | |
|                 gpuid[gpu_p2p_count++] = i;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // Show all the combinations of support P2P GPUs
 | |
|         int can_access_peer_0_1, can_access_peer_1_0;
 | |
| 
 | |
|         if (gpu_p2p_count >= 2)
 | |
|         {
 | |
|             for (int i = 0; i < gpu_p2p_count-1; i++)
 | |
|             {
 | |
|                 for (int j = 1; j < gpu_p2p_count; j++)
 | |
|                 {
 | |
|                     checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j]));
 | |
|                     printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
 | |
|                            prop[gpuid[j]].name, gpuid[j] ,
 | |
|                            can_access_peer_0_1 ? "Yes" : "No");
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             for (int j = 1; j < gpu_p2p_count; j++)
 | |
|             {
 | |
|                 for (int i = 0; i < gpu_p2p_count-1; i++)
 | |
|                 {
 | |
|                     checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i]));
 | |
|                     printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j],
 | |
|                            prop[gpuid[i]].name, gpuid[i] ,
 | |
|                            can_access_peer_1_0 ? "Yes" : "No");
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     // csv masterlog info
 | |
|     // *****************************
 | |
|     // exe and CUDA driver name
 | |
|     printf("\n");
 | |
|     std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
 | |
|     char cTemp[128];
 | |
| 
 | |
|     // driver version
 | |
|     sProfileString += ", CUDA Driver Version = ";
 | |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | |
|     sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
 | |
| #else
 | |
|     sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
 | |
| #endif
 | |
|     sProfileString +=  cTemp;
 | |
| 
 | |
|     // Runtime version
 | |
|     sProfileString += ", CUDA Runtime Version = ";
 | |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | |
|     sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
 | |
| #else
 | |
|     sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
 | |
| #endif
 | |
|     sProfileString +=  cTemp;
 | |
| 
 | |
|     // Device count
 | |
|     sProfileString += ", NumDevs = ";
 | |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | |
|     sprintf_s(cTemp, 10, "%d", deviceCount);
 | |
| #else
 | |
|     sprintf(cTemp, "%d", deviceCount);
 | |
| #endif
 | |
|     sProfileString += cTemp;
 | |
| 
 | |
|     // Print Out all device Names
 | |
|     for (dev = 0; dev < deviceCount; ++dev)
 | |
|     {
 | |
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | |
|         sprintf_s(cTemp, 13, ", Device%d = ", dev);
 | |
| #else
 | |
|         sprintf(cTemp, ", Device%d = ", dev);
 | |
| #endif
 | |
|         cudaDeviceProp deviceProp;
 | |
|         cudaGetDeviceProperties(&deviceProp, dev);
 | |
|         sProfileString += cTemp;
 | |
|         sProfileString += deviceProp.name;
 | |
|     }
 | |
| 
 | |
|     sProfileString += "\n";
 | |
|     printf("%s", sProfileString.c_str());
 | |
| 
 | |
|     printf("Result = PASS\n");
 | |
| 
 | |
|     // finish
 | |
|     // cudaDeviceReset causes the driver to clean up all state. While
 | |
|     // not mandatory in normal operation, it is good practice.  It is also
 | |
|     // needed to ensure correct operation when the application is being
 | |
|     // profiled. Calling cudaDeviceReset causes all profile data to be
 | |
|     // flushed before the application exits
 | |
|     cudaDeviceReset();
 | |
|     return 0;
 | |
| }
 |