Compare commits
9 Commits
11a83adb54
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 1cf55785f2 | |||
| 7e55e255f9 | |||
| 150bdae46e | |||
| 0495bf0dee | |||
| 0775e1ce73 | |||
| 9bae257774 | |||
| 676a0ced1c | |||
| 9457de1983 | |||
| 59c56db491 |
15
README.md
Normal file
15
README.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# Nix development shells
|
||||
|
||||
This repository collects several examples of development environments to be used
|
||||
with `nix develop`.
|
||||
|
||||
The definition of the environment is located in the `flake.nix` file and the
|
||||
exact commit of every input is pinned in the `flake.lock` file. These two
|
||||
files provide all the required information to reproduce the environment by any
|
||||
user.
|
||||
|
||||
Make sure they are tracked by git so that you can see what changes you make
|
||||
in your environment over time.
|
||||
|
||||
To enter an environment, go to the directory with the `flake.nix` file and run
|
||||
`nix develop`.
|
||||
1
cuda/.gitignore
vendored
Normal file
1
cuda/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
cudainfo
|
||||
12
cuda/Makefile
Normal file
12
cuda/Makefile
Normal file
@@ -0,0 +1,12 @@
|
||||
# Host C++ compiler handed to nvcc through -ccbin; override on the command
# line, e.g. `make HOSTCXX=clang++`.
HOSTCXX ?= g++
NVCC := nvcc -ccbin $(HOSTCXX)
CXXFLAGS := -m64 -Wno-deprecated-gpu-targets

# Target rules

# `all` and `clean` name actions, not files: declare them phony so a stray
# file called "all" or "clean" cannot make the rule look up to date.
.PHONY: all clean

all: cudainfo

# Build the device-query example from its single source file.
cudainfo: cudainfo.cpp
	$(NVCC) $(CXXFLAGS) -o $@ $<

# Remove the build products.
clean:
	rm -f cudainfo cudainfo.o
|
||||
4
cuda/README.md
Normal file
4
cuda/README.md
Normal file
@@ -0,0 +1,4 @@
|
||||
# CUDA example
|
||||
|
||||
Run `nix develop` to load the environment and `make` to build the example CUDA
|
||||
program. Run it with `./cudainfo` from the fox machine to test it.
|
||||
600
cuda/cudainfo.cpp
Normal file
600
cuda/cudainfo.cpp
Normal file
@@ -0,0 +1,600 @@
|
||||
/*
|
||||
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
|
||||
|
||||
// Shared Utilities (QA Testing)
|
||||
|
||||
// std::system includes
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
|
||||
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
|
||||
|
||||
// CUDA Runtime error messages
|
||||
#ifdef __DRIVER_TYPES_H__
|
||||
/**
 * Return a printable name for a CUDA runtime error code.
 *
 * The name is obtained from the CUDA runtime instead of a hand-maintained
 * switch, so error codes introduced by newer toolkits are reported by name
 * and no deprecated enumerators have to be spelled out here.
 *
 * @param error  Error code returned by a CUDA runtime call.
 * @return       Pointer to a NUL-terminated string; never NULL. The caller
 *               must not free or modify it.
 */
static const char *_cudaGetErrorEnum(cudaError_t error)
{
#if defined(CUDART_VERSION) && CUDART_VERSION >= 6050
    // Enumerator name, e.g. "cudaErrorInvalidValue".
    const char *name = cudaGetErrorName(error);
#else
    // Older runtimes: fall back to the human-readable description.
    const char *name = cudaGetErrorString(error);
#endif

    // Keep the historical placeholder should the runtime ever hand back NULL.
    return name ? name : "<unknown>";
}
|
||||
#endif
|
||||
|
||||
template< typename T >
|
||||
void check(T result, char const *const func, const char *const file, int const line)
|
||||
{
|
||||
if (result)
|
||||
{
|
||||
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
|
||||
file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
|
||||
cudaDeviceReset();
|
||||
// Make sure we call CUDA Device Reset before exiting
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
int *pArgc = NULL;
|
||||
char **pArgv = NULL;
|
||||
|
||||
#if CUDART_VERSION < 5000
|
||||
|
||||
// CUDA-C includes
|
||||
#include <cuda.h>
|
||||
|
||||
// This function wraps the CUDA Driver API into a template function
|
||||
template <class T>
|
||||
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
|
||||
{
|
||||
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
|
||||
|
||||
if (CUDA_SUCCESS != error) {
|
||||
fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
|
||||
error, __FILE__, __LINE__);
|
||||
|
||||
// cudaDeviceReset causes the driver to clean up all state. While
|
||||
// not mandatory in normal operation, it is good practice. It is also
|
||||
// needed to ensure correct operation when the application is being
|
||||
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||
// flushed before the application exits
|
||||
cudaDeviceReset();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* CUDART_VERSION < 5000 */
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
/**
 * Return the number of CUDA cores per multiprocessor for a compute capability.
 *
 * @param major  Major compute-capability number (e.g. 8 for SM 8.6).
 * @param minor  Minor compute-capability number (e.g. 6 for SM 8.6).
 * @return       Cores per SM for a listed architecture. For an unlisted one a
 *               notice is printed to stdout and the value of the last table
 *               entry is returned so the caller can keep going.
 *
 * The helper is only used inside this translation unit, hence `static`.
 */
static inline int ConvertSMVer2Cores(int major, int minor)
{
    // Maps an SM version to its number of cores per SM
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores; // CUDA cores per multiprocessor
    } sSMtoCores;

    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x20,  32 }, // Fermi Generation   (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation   (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation  (SM 3.0) GK10x class
        { 0x32, 192 }, // Kepler Generation  (SM 3.2) GK10x class
        { 0x35, 192 }, // Kepler Generation  (SM 3.5) GK11x class
        { 0x37, 192 }, // Kepler Generation  (SM 3.7) GK21x class
        { 0x50, 128 }, // Maxwell Generation (SM 5.0) GM10x class
        { 0x52, 128 }, // Maxwell Generation (SM 5.2) GM20x class
        { 0x53, 128 }, // Maxwell Generation (SM 5.3)
        { 0x60,  64 }, // Pascal Generation  (SM 6.0) GP100 class
        { 0x61, 128 }, // Pascal Generation  (SM 6.1) GP10x class
        { 0x62, 128 }, // Pascal Generation  (SM 6.2)
        { 0x70,  64 }, // Volta Generation   (SM 7.0) GV100 class
        { 0x72,  64 }, // Volta Generation   (SM 7.2)
        { 0x75,  64 }, // Turing Generation  (SM 7.5)
        { 0x80,  64 }, // Ampere Generation  (SM 8.0) GA100 class
        { 0x86, 128 }, // Ampere Generation  (SM 8.6) GA10x class
        { 0x87, 128 }, // Ampere Generation  (SM 8.7)
        { 0x89, 128 }, // Ada Generation     (SM 8.9)
        { 0x90, 128 }  // Hopper Generation  (SM 9.0)
    };

    const int knownCount = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));
    const int wanted = (major << 4) + minor;

    for (int i = 0; i < knownCount; i++) {
        if (nGpuArchCoresPerSM[i].SM == wanted) {
            return nGpuArchCoresPerSM[i].Cores;
        }
    }

    // If we don't find the value, default to the newest listed architecture so the caller can still run
    const int fallbackCores = nGpuArchCoresPerSM[knownCount - 1].Cores;
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, fallbackCores);
    return fallbackCores;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Program main
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
pArgc = &argc;
|
||||
pArgv = argv;
|
||||
|
||||
printf("%s Starting...\n\n", argv[0]);
|
||||
printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
|
||||
|
||||
int deviceCount = 0;
|
||||
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||
|
||||
if (error_id != cudaSuccess) {
|
||||
printf("cudaGetDeviceCount failed: %s (%d)\n",
|
||||
cudaGetErrorString(error_id), (int) error_id);
|
||||
printf("Result = FAIL\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// This function call returns 0 if there are no CUDA capable devices.
|
||||
if (deviceCount == 0)
|
||||
printf("There are no available device(s) that support CUDA\n");
|
||||
else
|
||||
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
|
||||
|
||||
int dev, driverVersion = 0, runtimeVersion = 0;
|
||||
|
||||
for (dev = 0; dev < deviceCount; ++dev) {
|
||||
cudaSetDevice(dev);
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, dev);
|
||||
|
||||
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
|
||||
|
||||
// Console log
|
||||
cudaDriverGetVersion(&driverVersion);
|
||||
cudaRuntimeGetVersion(&runtimeVersion);
|
||||
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
|
||||
|
||||
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
|
||||
(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
|
||||
|
||||
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
|
||||
deviceProp.multiProcessorCount,
|
||||
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
||||
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
|
||||
printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
|
||||
|
||||
|
||||
#if CUDART_VERSION >= 5000
|
||||
// This is supported in CUDA 5.0 (runtime API device properties)
|
||||
printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
|
||||
printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
|
||||
|
||||
if (deviceProp.l2CacheSize) {
|
||||
printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
|
||||
}
|
||||
|
||||
#else
|
||||
// This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
|
||||
int memoryClock;
|
||||
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
|
||||
printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
|
||||
int memBusWidth;
|
||||
getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
|
||||
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
|
||||
int L2CacheSize;
|
||||
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
|
||||
|
||||
if (L2CacheSize) {
|
||||
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
|
||||
deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
|
||||
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
|
||||
printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
|
||||
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
|
||||
printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
|
||||
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
|
||||
|
||||
|
||||
printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
|
||||
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
|
||||
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
|
||||
printf(" Warp size: %d\n", deviceProp.warpSize);
|
||||
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
|
||||
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
|
||||
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
|
||||
deviceProp.maxThreadsDim[0],
|
||||
deviceProp.maxThreadsDim[1],
|
||||
deviceProp.maxThreadsDim[2]);
|
||||
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
|
||||
deviceProp.maxGridSize[0],
|
||||
deviceProp.maxGridSize[1],
|
||||
deviceProp.maxGridSize[2]);
|
||||
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
|
||||
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
|
||||
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
|
||||
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
|
||||
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
|
||||
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
|
||||
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
|
||||
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
|
||||
#endif
|
||||
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
|
||||
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
|
||||
|
||||
const char *sComputeMode[] = {
|
||||
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
|
||||
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
|
||||
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
|
||||
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
|
||||
"Unknown",
|
||||
NULL
|
||||
};
|
||||
printf(" Compute Mode:\n");
|
||||
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
|
||||
}
|
||||
|
||||
// If there are 2 or more GPUs, query to determine whether RDMA is supported
|
||||
if (deviceCount >= 2)
|
||||
{
|
||||
cudaDeviceProp prop[64];
|
||||
int gpuid[64]; // we want to find the first two GPU's that can support P2P
|
||||
int gpu_p2p_count = 0;
|
||||
|
||||
for (int i=0; i < deviceCount; i++)
|
||||
{
|
||||
checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
|
||||
|
||||
// Only boards based on Fermi or later can support P2P
|
||||
if ((prop[i].major >= 2)
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
// on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to supprot this
|
||||
&& prop[i].tccDriver
|
||||
#endif
|
||||
)
|
||||
{
|
||||
// This is an array of P2P capable GPUs
|
||||
gpuid[gpu_p2p_count++] = i;
|
||||
}
|
||||
}
|
||||
|
||||
// Show all the combinations of support P2P GPUs
|
||||
int can_access_peer_0_1, can_access_peer_1_0;
|
||||
|
||||
if (gpu_p2p_count >= 2)
|
||||
{
|
||||
for (int i = 0; i < gpu_p2p_count-1; i++)
|
||||
{
|
||||
for (int j = 1; j < gpu_p2p_count; j++)
|
||||
{
|
||||
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j]));
|
||||
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
|
||||
prop[gpuid[j]].name, gpuid[j] ,
|
||||
can_access_peer_0_1 ? "Yes" : "No");
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 1; j < gpu_p2p_count; j++)
|
||||
{
|
||||
for (int i = 0; i < gpu_p2p_count-1; i++)
|
||||
{
|
||||
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i]));
|
||||
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j],
|
||||
prop[gpuid[i]].name, gpuid[i] ,
|
||||
can_access_peer_1_0 ? "Yes" : "No");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// csv masterlog info
|
||||
// *****************************
|
||||
// exe and CUDA driver name
|
||||
printf("\n");
|
||||
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
|
||||
char cTemp[128];
|
||||
|
||||
// driver version
|
||||
sProfileString += ", CUDA Driver Version = ";
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
||||
#else
|
||||
sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
||||
#endif
|
||||
sProfileString += cTemp;
|
||||
|
||||
// Runtime version
|
||||
sProfileString += ", CUDA Runtime Version = ";
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||
#else
|
||||
sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||
#endif
|
||||
sProfileString += cTemp;
|
||||
|
||||
// Device count
|
||||
sProfileString += ", NumDevs = ";
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
sprintf_s(cTemp, 10, "%d", deviceCount);
|
||||
#else
|
||||
sprintf(cTemp, "%d", deviceCount);
|
||||
#endif
|
||||
sProfileString += cTemp;
|
||||
|
||||
// Print Out all device Names
|
||||
for (dev = 0; dev < deviceCount; ++dev)
|
||||
{
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
sprintf_s(cTemp, 13, ", Device%d = ", dev);
|
||||
#else
|
||||
sprintf(cTemp, ", Device%d = ", dev);
|
||||
#endif
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, dev);
|
||||
sProfileString += cTemp;
|
||||
sProfileString += deviceProp.name;
|
||||
}
|
||||
|
||||
sProfileString += "\n";
|
||||
printf("%s", sProfileString.c_str());
|
||||
|
||||
printf("Result = PASS\n");
|
||||
|
||||
// finish
|
||||
// cudaDeviceReset causes the driver to clean up all state. While
|
||||
// not mandatory in normal operation, it is good practice. It is also
|
||||
// needed to ensure correct operation when the application is being
|
||||
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||
// flushed before the application exits
|
||||
cudaDeviceReset();
|
||||
return 0;
|
||||
}
|
||||
45
cuda/flake.lock
generated
Normal file
45
cuda/flake.lock
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"nodes": {
|
||||
"jungle": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1770128250,
|
||||
"narHash": "sha256-Kx3EwImhYCp4bLPNWGz4oL4IYVjkCLXwcVmXTY40MBc=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "7a6e4232de0e181de97e099e600ffc3a964260e0",
|
||||
"revCount": 1536,
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1767634882,
|
||||
"narHash": "sha256-2GffSfQxe3sedHzK+sTKlYo/NTIAGzbFCIsNMUPAAnk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "3c9db02515ef1d9b6b709fc60ba9a540957f661c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"jungle": "jungle"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
43
cuda/flake.nix
Normal file
43
cuda/flake.nix
Normal file
@@ -0,0 +1,43 @@
|
||||
# Development shell for the CUDA example in this directory.
# Enter it with `nix develop`.
{
  # The jungle flake is the only direct input; nixpkgs is taken from it below.
  inputs.jungle.url = "git+https://jungle.bsc.es/git/rarias/jungle";
  outputs = { self, jungle }:
  let
    # Reuse the nixpkgs revision that jungle itself is locked to.
    nixpkgs = jungle.inputs.nixpkgs;
    customOverlay = (final: prev: {
      # Example overlay, for now empty
    });
    pkgs = import nixpkgs {
      system = "x86_64-linux";
      overlays = [
        # Apply jungle overlay to get our BSC custom packages
        jungle.outputs.bscOverlay
        # And on top apply our local changes to customize for cluster
        customOverlay
      ];
      # Needed for CUDA
      config.allowUnfree = true;
    };
  in {
    devShells.x86_64-linux.default = pkgs.mkShell {
      pname = "cuda-devshell";
      # Include these packages in the shell
      packages = with pkgs; [
        # Cuda packages (more at https://search.nixos.org/packages)
        cudatoolkit # Required for nvcc
        (lib.getOutput "static" cudaPackages.cuda_cudart) # Required for -lcudart_static
        cudaPackages.libcusparse
        autoAddDriverRunpath
        # ... add more packages from https://search.nixos.org/packages
      ];
      # The dependencies needed to build these packages will be also included
      inputsFrom = with pkgs; [
        # Empty for now
      ];
      # Environment exported when the shell starts.
      # NOTE(review): LD_LIBRARY_PATH is assigned, not extended, so any value
      # inherited from the calling shell is dropped -- confirm this is intended.
      # NOTE(review): SMS is not referenced by the Makefile in this directory;
      # presumably it selects the target SM architecture for other builds --
      # TODO confirm.
      shellHook = ''
        export CUDA_PATH=${pkgs.cudatoolkit}
        export LD_LIBRARY_PATH=/var/run/opengl-driver/lib
        export SMS=50
      '';
    };
  };
}
|
||||
45
custom/arnau/posv/flake.lock
generated
Normal file
45
custom/arnau/posv/flake.lock
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"nodes": {
|
||||
"jungle": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1760427467,
|
||||
"narHash": "sha256-DemQ+XT3BWXh8fr6UDfGNUB4ba0tGJXyep5/lg+gBD4=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "4261d327c678e52abdd568a27168ea7cdd0484a0",
|
||||
"revCount": 1487,
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1752436162,
|
||||
"narHash": "sha256-Kt1UIPi7kZqkSc5HVj6UY5YLHHEzPBkgpNUByuyxtlw=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "dfcd5b901dbab46c9c6e80b265648481aafb01f8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.05",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"jungle": "jungle"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
47
custom/arnau/posv/flake.nix
Normal file
47
custom/arnau/posv/flake.nix
Normal file
@@ -0,0 +1,47 @@
|
||||
# Development shell providing a Python 3.14 interpreter with the GIL
# disabled and a pinned nOS-V build. Enter it with `nix develop`.
{
  # Fetch the list of packages for BSC
  inputs.jungle.url = "git+https://jungle.bsc.es/git/rarias/jungle";

  outputs = { self, jungle }:
  let
    # Local package overrides, layered on top of the BSC overlay below.
    customOverlay = final: prev: {
      # Disable GIL in python
      python314 = prev.python314.override {
        enableGIL = false;
      };
      # Use a custom nOS-V commit
      # NOTE(review): the source is fetched over ssh from an internal GitLab
      # host, so evaluating this presumably needs credentials for it --
      # TODO confirm.
      nosv = prev.nosv.override {
        useGit = true;
        gitBranch = "nosv_join";
        gitCommit = "33130d271a59d0794545e4a2a597a56951e428aa";
        gitUrl = "ssh://git@gitlab-internal.bsc.es/acinca/nos-v.git";
      };
    };
    # Package set built from the nixpkgs revision that jungle is locked to.
    pkgs = import jungle.inputs.nixpkgs {
      system = "x86_64-linux";
      overlays = [
        # Add our BSC packages
        jungle.bscOverlay
        # And our changes above on top
        customOverlay
      ];
    };
  in {
    devShells.x86_64-linux.default = pkgs.mkShell {
      pname = "devshell";
      # Set the NOSV_HOME to point to the current nosv package
      NOSV_HOME = pkgs.nosv;

      # These will be included in the environment with `nix develop`.
      buildInputs = with pkgs; [
        # Add python to the develop shell, with setuptools included
        (python314.withPackages (python-pkgs: with python-pkgs; [
          setuptools
        ]))

        # Extra packages
        gcc cowsay nosv
      ];
    };
  };
}
|
||||
45
custom/isabel/slurm/flake.lock
generated
Normal file
45
custom/isabel/slurm/flake.lock
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"nodes": {
|
||||
"jungle": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1760427467,
|
||||
"narHash": "sha256-DemQ+XT3BWXh8fr6UDfGNUB4ba0tGJXyep5/lg+gBD4=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "4261d327c678e52abdd568a27168ea7cdd0484a0",
|
||||
"revCount": 1487,
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1752436162,
|
||||
"narHash": "sha256-Kt1UIPi7kZqkSc5HVj6UY5YLHHEzPBkgpNUByuyxtlw=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "dfcd5b901dbab46c9c6e80b265648481aafb01f8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.05",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"jungle": "jungle"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
@@ -1,8 +1,9 @@
|
||||
{
|
||||
inputs.bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
||||
outputs = { self, bscpkgs }:
|
||||
inputs.jungle.url = "git+https://jungle.bsc.es/git/rarias/jungle";
|
||||
outputs = { self, jungle }:
|
||||
let
|
||||
nixpkgs = bscpkgs.inputs.nixpkgs;
|
||||
nixpkgs = jungle.inputs.nixpkgs;
|
||||
lib = nixpkgs.lib;
|
||||
clusterOverlay = (final: prev: {
|
||||
# Use cluster llvm compiler repo
|
||||
clangOmpss2Unwrapped = prev.clangOmpss2Unwrapped.override {
|
||||
@@ -11,12 +12,19 @@
|
||||
gitCommit = "151c260ba834826c01855da0a41fc203ffe4d025";
|
||||
gitBranch = "cluster";
|
||||
};
|
||||
# Configure MPICH to use UCX with multiple thread support
|
||||
mpich = prev.mpich.overrideAttrs (old: {
|
||||
configureFlags = (lib.remove "--with-device=ch4:ofi" old.configureFlags) ++ [
|
||||
"--with-device=ch4:ucx"
|
||||
"--enable-threads=multiple"
|
||||
];
|
||||
});
|
||||
});
|
||||
pkgs = import nixpkgs {
|
||||
system = "x86_64-linux";
|
||||
overlays = [
|
||||
# Apply bscpkgs to get our BSC custom packages
|
||||
bscpkgs.outputs.bscOverlay
|
||||
# Apply jungle overlay to get our BSC custom packages
|
||||
jungle.outputs.bscOverlay
|
||||
# And on top apply our local changes to customize for cluster
|
||||
clusterOverlay
|
||||
];
|
||||
@@ -26,8 +34,7 @@
|
||||
pname = "devshell";
|
||||
buildInputs = with pkgs; [
|
||||
slurm.out slurm.dev gcc
|
||||
clangOmpss2
|
||||
nanos6
|
||||
clangOmpss2 mpich osumb
|
||||
];
|
||||
inputsFrom = with pkgs; [
|
||||
nanos6
|
||||
1
custom/vincent/chol/.gitignore
vendored
Normal file
1
custom/vincent/chol/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
out/
|
||||
45
custom/vincent/chol/flake.lock
generated
Normal file
45
custom/vincent/chol/flake.lock
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"nodes": {
|
||||
"jungle": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1769187376,
|
||||
"narHash": "sha256-H8aMWt4OVwXWoUPPSZuj0eSq3Ur17nY62Ap+hYiQy3o=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "deb0cd1488b8d72ad1395b25aa4dbbdf721274d9",
|
||||
"revCount": 1533,
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1767634882,
|
||||
"narHash": "sha256-2GffSfQxe3sedHzK+sTKlYo/NTIAGzbFCIsNMUPAAnk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "3c9db02515ef1d9b6b709fc60ba9a540957f661c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"jungle": "jungle"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
61
custom/vincent/chol/flake.nix
Normal file
61
custom/vincent/chol/flake.nix
Normal file
@@ -0,0 +1,61 @@
|
||||
{
  inputs.jungle.url = "git+https://jungle.bsc.es/git/rarias/jungle";

  outputs = { self, jungle }:
    let
      # Single place for the platform used by nixpkgs and by the outputs
      system = "x86_64-linux";

      # Take nixpkgs from the jungle flake inputs instead of declaring our own
      inherit (jungle.inputs) nixpkgs;

      # overrideAttrs function shared by the BLAS/LAPACK packages below:
      # disables every hardening flag
      noHardening = _: { hardeningDisable = [ "all" ]; };

      customOverlay = final: prev: {
        # blis: target the Fox architecture (zen4) and drop OpenMP
        amd-blis = (prev.amd-blis.override {
          withOpenMP = false;
          withArchitecture = "zen4";
        }).overrideAttrs noHardening;

        # flame: drop OpenMP as well
        amd-libflame = (prev.amd-libflame.override {
          withOpenMP = false;
        }).overrideAttrs noHardening;

        # bench6: build against the blis and flame packages defined above
        bench6 = prev.bench6.overrideAttrs (old: {
          # Replaces the original buildInputs list entirely
          buildInputs = with final; [
            bigotes
            openmp
            openmpv
            nanos6
            nodes
            nosv
            mpi
            tampi
            ovni
            amd-blis
            amd-libflame
          ];
          # Point CMake at the CBLAS/LAPACKE headers and library explicitly
          cmakeFlags = (old.cmakeFlags or []) ++ [
            "-DCBLAS_INCLUDE_DIR=${final.amd-blis}/include/blis"
            "-DLAPACKE_INCLUDE_DIR=${final.amd-libflame}/include"
            "-DLAPACKE_LIBRARY=${final.amd-libflame}/lib/liblapacke.so.3"
          ];
        });
      };

      # nixpkgs with the jungle overlay first and our customizations on top
      pkgs = import nixpkgs {
        inherit system;
        overlays = [
          jungle.outputs.bscOverlay
          customOverlay
        ];
      };
    in {
      # Shell entered with `nix develop`
      devShells.${system}.default = pkgs.mkShell {
        pname = "devshell";
        packages = with pkgs; [
          bench6 bigotes ministat
        ];
      };

      # Expose the customized bench6 so it can be built directly
      packages.${system}.bench6 = pkgs.bench6;
    };
}
|
||||
20
custom/vincent/chol/run.sh
Executable file
20
custom/vincent/chol/run.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/bin/sh
# Run the b6_cholesky_nodes benchmark under bigotes and summarize the
# resulting CSV with ministat.
#
# Called without arguments, the script re-executes itself through
# `nix develop -c`, passing "run" as the first argument so the second
# invocation falls through to the benchmark below.

if [ -z "${1:-}" ]; then
	# Quote $0 so the re-exec also works when the script path contains spaces
	exec nix develop -c "$0" run
	# Alternative launcher through srun on the fox partition:
	#exec srun -J chol -p fox --exclusive nix develop -c "$0" run
fi

set -eux

size=$((32*1024))
bs=512

# Resolve the benchmark first, on its own line, so `set -e` aborts the script
# when it is not in PATH (a failure inside a pipeline would go unnoticed).
b6path=$(command -v b6_cholesky_nodes)

# The 4th '/'-separated field of the path names the output directory; for a
# /nix/store/<name>/bin/... path this is <name>, keeping results of different
# builds apart.
b6dir=$(printf '%s\n' "$b6path" | awk -F/ '{print $4}')
wdir="out/$b6dir"

mkdir -p "$wdir"

log="$wdir/b6_cholesky_nodes-$size-$bs.csv"
bigotes -o "$log" -- b6_cholesky_nodes "$size" "$bs"
ministat -w80 "$log"
|
||||
42
isabel/slurm/flake.lock
generated
42
isabel/slurm/flake.lock
generated
@@ -1,42 +0,0 @@
|
||||
{
|
||||
"nodes": {
|
||||
"bscpkgs": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1749650500,
|
||||
"narHash": "sha256-2MHfVPV6RA7qPSCtXh4+KK0F0UjN+J4z8//+n6NK7Xs=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "9d1944c658929b6f98b3f3803fead4d1b91c4405",
|
||||
"revCount": 961,
|
||||
"type": "git",
|
||||
"url": "https://git.sr.ht/~rodarima/bscpkgs"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://git.sr.ht/~rodarima/bscpkgs"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1736867362,
|
||||
"narHash": "sha256-i/UJ5I7HoqmFMwZEH6vAvBxOrjjOJNU739lnZnhUln8=",
|
||||
"path": "/nix/store/2csx2kkb2hxyxhhmg2xs9jfyypikwwk6-source",
|
||||
"rev": "9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc",
|
||||
"type": "path"
|
||||
},
|
||||
"original": {
|
||||
"id": "nixpkgs",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"bscpkgs": "bscpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
2
ompss2/.gitignore
vendored
Normal file
2
ompss2/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
hello
|
||||
ovni/
|
||||
13
ompss2/Makefile
Normal file
13
ompss2/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build the OmpSs-2 hello example and optionally trace it with ovni.
CC=clang
CFLAGS=-fompss-2

# No recipe: make's built-in rule compiles hello.c with $(CC) $(CFLAGS)
hello: hello.c

# trace and clean never create files with those names; declaring them phony
# keeps them working even if a file called "trace" or "clean" exists.
.PHONY: trace clean

# Run hello with ovni instrumentation selected through the nOS-V config
# override, then let ovniemu process the ovni/ directory and list the
# resulting .prv files.
trace: hello
	rm -rf ovni/
	NOSV_CONFIG_OVERRIDE="instrumentation.version=ovni" NOSV_APPID=1 ./hello
	ovniemu ovni/
	ls -l ovni/*.prv

# Remove the binary and any generated trace
clean:
	rm -rf hello ovni/
|
||||
57
ompss2/README.md
Normal file
57
ompss2/README.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# OmpSs-2 environment
|
||||
|
||||
This example shows how to include the LLVM compiler to build OmpSs-2 programs
|
||||
with the new NODES and nOS-V runtime. The package `clangOmpss2Nodes` already
|
||||
sets all the needed variables to locate the right runtime.
|
||||
|
||||
Run `nix develop` then `make` to build the `hello` program:
|
||||
|
||||
apex% nix develop
|
||||
apex$ make
|
||||
clang -fompss-2 hello.c -o hello
|
||||
apex$ ./hello
|
||||
hello from task 1
|
||||
hello from task 0
|
||||
hello from task 2
|
||||
hello from task 3
|
||||
hello from task 4
|
||||
hello from task 7
|
||||
hello from task 8
|
||||
hello from task 6
|
||||
hello from task 9
|
||||
hello from task 5
|
||||
|
||||
You can use `make trace` to run the hello program with instrumentation and
|
||||
generate an ovni trace, which `ovniemu` then converts into Paraver `.prv` files:
|
||||
|
||||
apex$ make trace
|
||||
rm -rf ovni/
|
||||
NOSV_CONFIG_OVERRIDE="instrumentation.version=ovni" NOSV_APPID=1 ./hello
|
||||
hello from task 1
|
||||
hello from task 0
|
||||
hello from task 3
|
||||
hello from task 4
|
||||
hello from task 5
|
||||
hello from task 6
|
||||
hello from task 2
|
||||
hello from task 8
|
||||
hello from task 7
|
||||
hello from task 9
|
||||
ovniemu ovni/
|
||||
ovniemu: INFO: loaded 58 streams
|
||||
ovniemu: INFO: sorting looms by name
|
||||
ovniemu: INFO: loaded 1 looms, 1 processes, 58 threads and 56 cpus
|
||||
ovniemu: INFO: generated with libovni version 1.13.0 commit 0643266
|
||||
ovniemu: INFO: the following 2 models are enabled:
|
||||
ovniemu: INFO: ovni 1.1.0 'O' (18 events)
|
||||
ovniemu: INFO: nosv 2.6.0 'V' (64 events)
|
||||
ovniemu: INFO: emulation starts
|
||||
ovniemu: INFO: apex.nosv-u1880-p598308 burst stats: median/avg/max = 77/ 81/333 ns
|
||||
ovniemu: WARN: ignoring old event OCn
|
||||
ovniemu: INFO: 100.0% done at avg 42 kev/s
|
||||
ovniemu: INFO: processed 711 input events in 0.02 s
|
||||
ovniemu: INFO: writing traces to disk, please wait
|
||||
ovniemu: INFO: emulation finished ok
|
||||
ls -l ovni/*.prv
|
||||
-rw-r--r-- 1 rarias Computational 48224 Feb 4 11:52 ovni/cpu.prv
|
||||
-rw-r--r-- 1 rarias Computational 33689 Feb 4 11:52 ovni/thread.prv
|
||||
45
ompss2/flake.lock
generated
Normal file
45
ompss2/flake.lock
generated
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"nodes": {
|
||||
"jungle": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1770128250,
|
||||
"narHash": "sha256-Kx3EwImhYCp4bLPNWGz4oL4IYVjkCLXwcVmXTY40MBc=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "7a6e4232de0e181de97e099e600ffc3a964260e0",
|
||||
"revCount": 1536,
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://jungle.bsc.es/git/rarias/jungle"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1767634882,
|
||||
"narHash": "sha256-2GffSfQxe3sedHzK+sTKlYo/NTIAGzbFCIsNMUPAAnk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "3c9db02515ef1d9b6b709fc60ba9a540957f661c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-25.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"jungle": "jungle"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
36
ompss2/flake.nix
Normal file
36
ompss2/flake.nix
Normal file
@@ -0,0 +1,36 @@
|
||||
{
  inputs.jungle.url = "git+https://jungle.bsc.es/git/rarias/jungle";

  outputs = { self, jungle }:
    let
      # Single place for the platform used by nixpkgs and by the shell output
      system = "x86_64-linux";

      # Take nixpkgs from the jungle flake inputs instead of declaring our own
      inherit (jungle.inputs) nixpkgs;

      # Placeholder overlay: add package overrides here when needed
      customOverlay = final: prev: {
      };

      pkgs = import nixpkgs {
        inherit system;
        overlays = [
          # The jungle overlay provides the BSC custom packages
          jungle.outputs.bscOverlay
          # Local customizations are applied last, on top of it
          customOverlay
        ];
      };
    in {
      # Shell entered with `nix develop`
      devShells.${system}.default = pkgs.mkShell {
        pname = "ompss2-devshell";

        # Packages made available inside the shell
        packages = with pkgs; [
          clangOmpss2Nodes
          nodes
          nosv
          ovni
          # Optional: Add wxparaver to open .prv traces (needs a working $DISPLAY)
          # wxparaver
        ];

        # Packages listed here contribute their build dependencies to the
        # shell; none are needed for now
        inputsFrom = with pkgs; [
        ];
      };
    };
}
|
||||
10
ompss2/hello.c
Normal file
10
ompss2/hello.c
Normal file
@@ -0,0 +1,10 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Create ten OmpSs-2 tasks, each printing the loop index it was created with.
 *
 * The tasks may run in any order, so the output lines are not sorted (see the
 * sample runs in README.md).
 *
 * Returns 0.
 */
int main(void)
{
	for (int i = 0; i < 10; i++) {
		/* The pragma turns the printf statement below into a task body. */
		#pragma oss task
		printf("hello from task %d\n", i);
	}

	/*
	 * NOTE(review): there is no explicit taskwait; presumably the runtime
	 * waits for outstanding tasks before the program exits -- confirm.
	 */
	return 0;
}
|
||||
Reference in New Issue
Block a user