Compare commits
40 Commits
aee54ef39f
...
gitea-lfs
| Author | SHA1 | Date | |
|---|---|---|---|
| be8c150b08 | |||
| f9d4a70791 | |||
| 729e2d3833 | |||
| 54ad962719 | |||
| 8697fc0a18 | |||
| d3b355f651 | |||
| 2ed881cd89 | |||
| 2a07df1d30 | |||
| 52380eae59 | |||
| 2fe84c4cbc | |||
| 3b16b41be3 | |||
| ee481deffb | |||
| b1bad25008 | |||
| 85f38e17a2 | |||
| 08ab01b89c | |||
| 194a6fb7f6 | |||
| 365576778b | |||
| e7490858c6 | |||
| 7606030135 | |||
| e55590f59e | |||
| c3da39c392 | |||
| d3889b3339 | |||
| 28540d8cf3 | |||
| f847621ceb | |||
| 12fe43f95f | |||
| 0e8329eef3 | |||
| df3b21b570 | |||
|
78df61d24a
|
|||
|
8e7da73151
|
|||
|
a7e17e40dc
|
|||
| 0e8bd22347 | |||
| d948f8b752 | |||
| 8f7787e217 | |||
| 30b9b23112 | |||
| 9a056737de | |||
| ac700d34a5 | |||
| 9b681ab7ce | |||
| 9ce394bffd | |||
| 8cd7b713ca | |||
| 8eed90d2bd |
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
doc/bsc-ssf.pdf
BIN
doc/bsc-ssf.pdf
Binary file not shown.
@@ -5,6 +5,7 @@
|
|||||||
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
||||||
bscpkgs.inputs.nixpkgs.follows = "nixpkgs";
|
bscpkgs.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
self.lfs = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
outputs = { self, nixpkgs, agenix, bscpkgs, ... }:
|
outputs = { self, nixpkgs, agenix, bscpkgs, ... }:
|
||||||
|
|||||||
4
keys.nix
4
keys.nix
@@ -16,8 +16,7 @@ rec {
|
|||||||
};
|
};
|
||||||
|
|
||||||
hostGroup = with hosts; rec {
|
hostGroup = with hosts; rec {
|
||||||
untrusted = [ fox ];
|
compute = [ owl1 owl2 fox ];
|
||||||
compute = [ owl1 owl2 ];
|
|
||||||
playground = [ eudy koro weasel ];
|
playground = [ eudy koro weasel ];
|
||||||
storage = [ bay lake2 ];
|
storage = [ bay lake2 ];
|
||||||
monitor = [ hut ];
|
monitor = [ hut ];
|
||||||
@@ -31,6 +30,7 @@ rec {
|
|||||||
admins = {
|
admins = {
|
||||||
"rarias@hut" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
|
"rarias@hut" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
|
||||||
"rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
|
"rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
|
||||||
|
"rarias@fox" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDSbw3REAKECV7E2c/e2XJITudJQWq2qDSe2N1JHqHZd rarias@fox";
|
||||||
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
|
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,9 @@
|
|||||||
../common/xeon.nix
|
../common/xeon.nix
|
||||||
../common/ssf/hosts.nix
|
../common/ssf/hosts.nix
|
||||||
../module/ceph.nix
|
../module/ceph.nix
|
||||||
|
../module/slurm-server.nix
|
||||||
./nfs.nix
|
./nfs.nix
|
||||||
|
./wireguard.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# Don't install grub MBR for now
|
# Don't install grub MBR for now
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
statdPort = 4000;
|
statdPort = 4000;
|
||||||
exports = ''
|
exports = ''
|
||||||
/home 10.0.40.0/24(rw,async,no_subtree_check,no_root_squash)
|
/home 10.0.40.0/24(rw,async,no_subtree_check,no_root_squash)
|
||||||
|
/home 10.106.0.0/24(rw,async,no_subtree_check,no_root_squash)
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
networking.firewall = {
|
networking.firewall = {
|
||||||
@@ -27,6 +28,21 @@
|
|||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
|
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
|
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
|
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
|
||||||
|
|
||||||
|
# Accept NFS traffic from wg0
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
|
||||||
|
# Same but UDP
|
||||||
|
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
|
||||||
|
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
35
m/apex/wireguard.nix
Normal file
35
m/apex/wireguard.nix
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
{ config, ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
networking.firewall = {
|
||||||
|
allowedUDPPorts = [ 666 ];
|
||||||
|
};
|
||||||
|
|
||||||
|
age.secrets.wgApex.file = ../../secrets/wg-apex.age;
|
||||||
|
|
||||||
|
# Enable WireGuard
|
||||||
|
networking.wireguard.enable = true;
|
||||||
|
networking.wireguard.interfaces = {
|
||||||
|
# "wg0" is the network interface name. You can name the interface arbitrarily.
|
||||||
|
wg0 = {
|
||||||
|
ips = [ "10.106.0.30/24" ];
|
||||||
|
listenPort = 666;
|
||||||
|
privateKeyFile = config.age.secrets.wgApex.path;
|
||||||
|
# Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
|
||||||
|
peers = [
|
||||||
|
{
|
||||||
|
name = "Fox";
|
||||||
|
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
|
||||||
|
allowedIPs = [ "10.106.0.0/24" ];
|
||||||
|
endpoint = "fox.ac.upc.edu:666";
|
||||||
|
# Send keepalives every 25 seconds. Important to keep NAT tables alive.
|
||||||
|
persistentKeepalive = 25;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
networking.hosts = {
|
||||||
|
"10.106.0.1" = [ "fox" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -3,6 +3,7 @@
|
|||||||
# Includes the basic configuration for an Intel server.
|
# Includes the basic configuration for an Intel server.
|
||||||
imports = [
|
imports = [
|
||||||
./base/agenix.nix
|
./base/agenix.nix
|
||||||
|
./base/always-power-on.nix
|
||||||
./base/august-shutdown.nix
|
./base/august-shutdown.nix
|
||||||
./base/boot.nix
|
./base/boot.nix
|
||||||
./base/env.nix
|
./base/env.nix
|
||||||
|
|||||||
8
m/common/base/always-power-on.nix
Normal file
8
m/common/base/always-power-on.nix
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../../module/power-policy.nix
|
||||||
|
];
|
||||||
|
|
||||||
|
# Turn on as soon as we have power
|
||||||
|
power.policy = "always-on";
|
||||||
|
}
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
# Shutdown all machines on August 2nd at 11:00 AM, so we can protect the
|
# Shutdown all machines on August 3rd at 22:00, so we can protect the
|
||||||
# hardware from spurious electrical peaks on the yearly electrical cut for
|
# hardware from spurious electrical peaks on the yearly electrical cut for
|
||||||
# maintenance that starts on August 4th.
|
# maintenance that starts on August 4th.
|
||||||
systemd.timers.august-shutdown = {
|
systemd.timers.august-shutdown = {
|
||||||
description = "Shutdown on August 2nd for maintenance";
|
description = "Shutdown on August 3rd for maintenance";
|
||||||
wantedBy = [ "timers.target" ];
|
wantedBy = [ "timers.target" ];
|
||||||
timerConfig = {
|
timerConfig = {
|
||||||
OnCalendar = "*-08-02 11:00:00";
|
OnCalendar = "*-08-03 22:00:00";
|
||||||
RandomizedDelaySec = "10min";
|
RandomizedDelaySec = "10min";
|
||||||
Unit = "systemd-poweroff.service";
|
Unit = "systemd-poweroff.service";
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
||||||
nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
|
nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
|
||||||
ncdu config.boot.kernelPackages.perf ldns pv
|
ncdu config.boot.kernelPackages.perf ldns pv git-lfs
|
||||||
# From bscpkgs overlay
|
# From bscpkgs overlay
|
||||||
osumb
|
osumb
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
nftables.enable = lib.mkForce false;
|
nftables.enable = lib.mkForce false;
|
||||||
|
|
||||||
hosts = {
|
hosts = {
|
||||||
"84.88.53.236" = [ "apex" "ssfhead.bsc.es" "ssfhead" ];
|
"84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
|
||||||
"84.88.51.152" = [ "raccoon" ];
|
"84.88.51.152" = [ "raccoon" ];
|
||||||
"84.88.51.142" = [ "raccoon-ipmi" ];
|
"84.88.51.142" = [ "raccoon-ipmi" ];
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -154,6 +154,20 @@
|
|||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
csiringo = {
|
||||||
|
# Arbitrary UID but large so it doesn't collide with other users on ssfhead.
|
||||||
|
uid = 9653;
|
||||||
|
isNormalUser = true;
|
||||||
|
home = "/home/Computational/csiringo";
|
||||||
|
description = "Cesare Siringo";
|
||||||
|
group = "Computational";
|
||||||
|
hosts = [ "apex" "weasel" ];
|
||||||
|
hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
|
||||||
|
openssh.authorizedKeys.keys = [
|
||||||
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
|
||||||
|
];
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
groups = {
|
groups = {
|
||||||
|
|||||||
@@ -5,8 +5,15 @@
|
|||||||
../common/base.nix
|
../common/base.nix
|
||||||
../common/xeon/console.nix
|
../common/xeon/console.nix
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
|
../module/nvidia.nix
|
||||||
|
../module/slurm-client.nix
|
||||||
|
./wireguard.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Don't turn off on August as UPC has different dates.
|
||||||
|
# Fox works fine on power cuts.
|
||||||
|
systemd.timers.august-shutdown.enable = false;
|
||||||
|
|
||||||
# Select the disk using the ID to avoid mismatches
|
# Select the disk using the ID to avoid mismatches
|
||||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";
|
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";
|
||||||
|
|
||||||
@@ -30,6 +37,18 @@
|
|||||||
|
|
||||||
services.openssh.settings.X11Forwarding = true;
|
services.openssh.settings.X11Forwarding = true;
|
||||||
|
|
||||||
|
services.fail2ban.enable = true;
|
||||||
|
|
||||||
|
# Use SSH tunnel to reach internal hosts
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
Host bscpm04.bsc.es gitlab-internal.bsc.es tent
|
||||||
|
ProxyJump raccoon
|
||||||
|
Host raccoon
|
||||||
|
ProxyJump apex
|
||||||
|
HostName 127.0.0.1
|
||||||
|
Port 22022
|
||||||
|
'';
|
||||||
|
|
||||||
networking = {
|
networking = {
|
||||||
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
|
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
|
||||||
hostName = "fox";
|
hostName = "fox";
|
||||||
@@ -53,17 +72,20 @@
|
|||||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Configure Nvidia driver to use with CUDA
|
# Recommended for new graphics cards
|
||||||
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
|
||||||
hardware.nvidia.open = true;
|
hardware.nvidia.open = true;
|
||||||
hardware.graphics.enable = true;
|
|
||||||
nixpkgs.config.nvidia.acceptLicense = true;
|
|
||||||
services.xserver.videoDrivers = [ "nvidia" ];
|
|
||||||
|
|
||||||
# Mount NVME disks
|
# Mount NVME disks
|
||||||
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
|
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
|
||||||
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
|
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
|
||||||
|
|
||||||
|
# Mount the NFS home
|
||||||
|
fileSystems."/nfs/home" = {
|
||||||
|
device = "10.106.0.30:/home";
|
||||||
|
fsType = "nfs";
|
||||||
|
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
|
||||||
|
};
|
||||||
|
|
||||||
# Make a /nvme{0,1}/$USER directory for each user.
|
# Make a /nvme{0,1}/$USER directory for each user.
|
||||||
systemd.services.create-nvme-dirs = let
|
systemd.services.create-nvme-dirs = let
|
||||||
# Take only normal users in fox
|
# Take only normal users in fox
|
||||||
@@ -80,4 +102,20 @@
|
|||||||
wantedBy = [ "multi-user.target" ];
|
wantedBy = [ "multi-user.target" ];
|
||||||
serviceConfig.ExecStart = script;
|
serviceConfig.ExecStart = script;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Only allow SSH connections from users who have a SLURM allocation
|
||||||
|
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
|
||||||
|
security.pam.services.sshd.rules.account.slurm = {
|
||||||
|
control = "required";
|
||||||
|
enable = true;
|
||||||
|
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
|
||||||
|
args = [ "log_level=debug5" ];
|
||||||
|
order = 999999; # Make it last one
|
||||||
|
};
|
||||||
|
|
||||||
|
# Disable systemd session (pam_systemd.so) as it will conflict with the
|
||||||
|
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
|
||||||
|
# into the slurmstepd task and then into the systemd session, which is not
|
||||||
|
# what we want, otherwise it will linger even if all jobs are gone.
|
||||||
|
security.pam.services.sshd.startSession = lib.mkForce false;
|
||||||
}
|
}
|
||||||
|
|||||||
46
m/fox/wireguard.nix
Normal file
46
m/fox/wireguard.nix
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
{ config, ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
networking.firewall = {
|
||||||
|
allowedUDPPorts = [ 666 ];
|
||||||
|
};
|
||||||
|
|
||||||
|
age.secrets.wgFox.file = ../../secrets/wg-fox.age;
|
||||||
|
|
||||||
|
networking.wireguard.enable = true;
|
||||||
|
networking.wireguard.interfaces = {
|
||||||
|
# "wg0" is the network interface name. You can name the interface arbitrarily.
|
||||||
|
wg0 = {
|
||||||
|
# Determines the IP address and subnet of the server's end of the tunnel interface.
|
||||||
|
ips = [ "10.106.0.1/24" ];
|
||||||
|
|
||||||
|
# The port that WireGuard listens to. Must be accessible by the client.
|
||||||
|
listenPort = 666;
|
||||||
|
|
||||||
|
# Path to the private key file.
|
||||||
|
privateKeyFile = config.age.secrets.wgFox.path;
|
||||||
|
# Public key: VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=
|
||||||
|
|
||||||
|
peers = [
|
||||||
|
# List of allowed peers.
|
||||||
|
{
|
||||||
|
name = "Apex";
|
||||||
|
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
|
||||||
|
# List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
|
||||||
|
allowedIPs = [ "10.106.0.30/32" ];
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
networking.hosts = {
|
||||||
|
"10.106.0.30" = [ "apex" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
networking.firewall = {
|
||||||
|
extraCommands = ''
|
||||||
|
# Accept slurm connections to slurmd from apex (via wireguard)
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.30/32 -d 10.106.0.1/32 --dport 6818 -j nixos-fw-accept
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -7,11 +7,9 @@
|
|||||||
../module/ceph.nix
|
../module/ceph.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/slurm-client.nix
|
|
||||||
./gitlab-runner.nix
|
./gitlab-runner.nix
|
||||||
./monitoring.nix
|
./monitoring.nix
|
||||||
./nfs.nix
|
./nfs.nix
|
||||||
./slurm-server.nix
|
|
||||||
./nix-serve.nix
|
./nix-serve.nix
|
||||||
./public-inbox.nix
|
./public-inbox.nix
|
||||||
./gitea.nix
|
./gitea.nix
|
||||||
|
|||||||
@@ -1,7 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
services.slurm = {
|
|
||||||
server.enable = true;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
20
m/module/nvidia.nix
Normal file
20
m/module/nvidia.nix
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
{ lib, config, pkgs, ... }:
|
||||||
|
{
|
||||||
|
# Configure Nvidia driver to use with CUDA
|
||||||
|
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
||||||
|
hardware.nvidia.open = lib.mkDefault (builtins.abort "hardware.nvidia.open not set");
|
||||||
|
hardware.graphics.enable = true;
|
||||||
|
nixpkgs.config.nvidia.acceptLicense = true;
|
||||||
|
services.xserver.videoDrivers = [ "nvidia" ];
|
||||||
|
|
||||||
|
# enable support for derivations which require nvidia-gpu to be available
|
||||||
|
# > requiredSystemFeatures = [ "cuda" ];
|
||||||
|
programs.nix-required-mounts.enable = true;
|
||||||
|
programs.nix-required-mounts.presets.nvidia-gpu.enable = true;
|
||||||
|
# They forgot to add the symlink
|
||||||
|
programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
|
||||||
|
config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
|
||||||
|
];
|
||||||
|
|
||||||
|
environment.systemPackages = [ pkgs.cudainfo ];
|
||||||
|
}
|
||||||
33
m/module/power-policy.nix
Normal file
33
m/module/power-policy.nix
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
|
||||||
|
with lib;
|
||||||
|
|
||||||
|
let
|
||||||
|
cfg = config.power.policy;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
options = {
|
||||||
|
power.policy = mkOption {
|
||||||
|
type = types.nullOr (types.enum [ "always-on" "previous" "always-off" ]);
|
||||||
|
default = null;
|
||||||
|
description = "Set power policy to use via IPMI.";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
config = mkIf (cfg != null) {
|
||||||
|
systemd.services."power-policy" = {
|
||||||
|
description = "Set power policy to use via IPMI";
|
||||||
|
wantedBy = [ "multi-user.target" ];
|
||||||
|
unitConfig = {
|
||||||
|
StartLimitBurst = "10";
|
||||||
|
StartLimitIntervalSec = "10m";
|
||||||
|
};
|
||||||
|
serviceConfig = {
|
||||||
|
ExecStart = "${pkgs.ipmitool}/bin/ipmitool chassis policy ${cfg}";
|
||||||
|
Type = "oneshot";
|
||||||
|
Restart = "on-failure";
|
||||||
|
RestartSec = "5s";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,33 +1,10 @@
|
|||||||
{ config, pkgs, lib, ... }:
|
{ lib, ... }:
|
||||||
|
|
||||||
let
|
{
|
||||||
suspendProgram = pkgs.writeScript "suspend.sh" ''
|
imports = [
|
||||||
#!/usr/bin/env bash
|
./slurm-common.nix
|
||||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
];
|
||||||
set -x
|
|
||||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
|
||||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
|
||||||
hosts=$(scontrol show hostnames $1)
|
|
||||||
for host in $hosts; do
|
|
||||||
echo Shutting down host: $host
|
|
||||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
resumeProgram = pkgs.writeScript "resume.sh" ''
|
|
||||||
#!/usr/bin/env bash
|
|
||||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
|
||||||
set -x
|
|
||||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
|
||||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
|
||||||
hosts=$(scontrol show hostnames $1)
|
|
||||||
for host in $hosts; do
|
|
||||||
echo Starting host: $host
|
|
||||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
in {
|
|
||||||
systemd.services.slurmd.serviceConfig = {
|
systemd.services.slurmd.serviceConfig = {
|
||||||
# Kill all processes in the control group on stop/restart. This will kill
|
# Kill all processes in the control group on stop/restart. This will kill
|
||||||
# all the jobs running, so ensure that we only upgrade when the nodes are
|
# all the jobs running, so ensure that we only upgrade when the nodes are
|
||||||
@@ -37,90 +14,5 @@ in {
|
|||||||
KillMode = lib.mkForce "control-group";
|
KillMode = lib.mkForce "control-group";
|
||||||
};
|
};
|
||||||
|
|
||||||
services.slurm = {
|
services.slurm.client.enable = true;
|
||||||
client.enable = true;
|
|
||||||
controlMachine = "hut";
|
|
||||||
clusterName = "jungle";
|
|
||||||
nodeName = [
|
|
||||||
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
|
|
||||||
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
|
|
||||||
];
|
|
||||||
|
|
||||||
partitionName = [
|
|
||||||
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
|
||||||
];
|
|
||||||
|
|
||||||
# See slurm.conf(5) for more details about these options.
|
|
||||||
extraConfig = ''
|
|
||||||
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
|
|
||||||
# not with Intel MPI. For that use the compatibility shim libpmi.so
|
|
||||||
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
|
|
||||||
# library in SLURM (--mpi=pmix). See more details here:
|
|
||||||
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
|
|
||||||
MpiDefault=pmix
|
|
||||||
|
|
||||||
# When a node reboots return that node to the slurm queue as soon as it
|
|
||||||
# becomes operative again.
|
|
||||||
ReturnToService=2
|
|
||||||
|
|
||||||
# Track all processes by using a cgroup
|
|
||||||
ProctrackType=proctrack/cgroup
|
|
||||||
|
|
||||||
# Enable task/affinity to allow the jobs to run in a specified subset of
|
|
||||||
# the resources. Use the task/cgroup plugin to enable process containment.
|
|
||||||
TaskPlugin=task/affinity,task/cgroup
|
|
||||||
|
|
||||||
# Power off unused nodes until they are requested
|
|
||||||
SuspendProgram=${suspendProgram}
|
|
||||||
SuspendTimeout=60
|
|
||||||
ResumeProgram=${resumeProgram}
|
|
||||||
ResumeTimeout=300
|
|
||||||
SuspendExcNodes=hut
|
|
||||||
|
|
||||||
# Turn the nodes off after 1 hour of inactivity
|
|
||||||
SuspendTime=3600
|
|
||||||
|
|
||||||
# Reduce port range so we can allow only this range in the firewall
|
|
||||||
SrunPortRange=60000-61000
|
|
||||||
|
|
||||||
# Use cores as consumable resources. In SLURM terms, a core may have
|
|
||||||
# multiple hardware threads (or CPUs).
|
|
||||||
SelectType=select/cons_tres
|
|
||||||
|
|
||||||
# Ignore memory constraints and only use unused cores to share a node with
|
|
||||||
# other jobs.
|
|
||||||
SelectTypeParameters=CR_Core
|
|
||||||
|
|
||||||
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
|
|
||||||
# This sets up the "extern" step into which ssh-launched processes will be
|
|
||||||
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
|
|
||||||
# when a task runs (srun) so we can ssh early.
|
|
||||||
PrologFlags=Alloc,Contain,X11
|
|
||||||
|
|
||||||
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
|
|
||||||
# adopted by the external step, similar to tasks running in regular steps
|
|
||||||
# LaunchParameters=ulimit_pam_adopt
|
|
||||||
SlurmdDebug=debug5
|
|
||||||
#DebugFlags=Protocol,Cgroup
|
|
||||||
'';
|
|
||||||
|
|
||||||
extraCgroupConfig = ''
|
|
||||||
CgroupPlugin=cgroup/v2
|
|
||||||
#ConstrainCores=yes
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
|
|
||||||
# Place the slurm config in /etc as this will be required by PAM
|
|
||||||
environment.etc.slurm.source = config.services.slurm.etcSlurm;
|
|
||||||
|
|
||||||
age.secrets.mungeKey = {
|
|
||||||
file = ../../secrets/munge-key.age;
|
|
||||||
owner = "munge";
|
|
||||||
group = "munge";
|
|
||||||
};
|
|
||||||
|
|
||||||
services.munge = {
|
|
||||||
enable = true;
|
|
||||||
password = config.age.secrets.mungeKey.path;
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|||||||
115
m/module/slurm-common.nix
Normal file
115
m/module/slurm-common.nix
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
{ config, pkgs, ... }:
|
||||||
|
|
||||||
|
let
|
||||||
|
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
|
||||||
|
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
||||||
|
set -x
|
||||||
|
export "PATH=/run/current-system/sw/bin:$PATH"
|
||||||
|
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
||||||
|
hosts=$(scontrol show hostnames $1)
|
||||||
|
for host in $hosts; do
|
||||||
|
echo Shutting down host: $host
|
||||||
|
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
|
||||||
|
done
|
||||||
|
'';
|
||||||
|
|
||||||
|
resumeProgram = pkgs.writeShellScript "resume.sh" ''
|
||||||
|
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
||||||
|
set -x
|
||||||
|
export "PATH=/run/current-system/sw/bin:$PATH"
|
||||||
|
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
||||||
|
hosts=$(scontrol show hostnames $1)
|
||||||
|
for host in $hosts; do
|
||||||
|
echo Starting host: $host
|
||||||
|
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
|
||||||
|
done
|
||||||
|
'';
|
||||||
|
|
||||||
|
in {
|
||||||
|
services.slurm = {
|
||||||
|
controlMachine = "apex";
|
||||||
|
clusterName = "jungle";
|
||||||
|
nodeName = [
|
||||||
|
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
|
||||||
|
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
|
||||||
|
];
|
||||||
|
|
||||||
|
partitionName = [
|
||||||
|
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
||||||
|
"fox Nodes=fox Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
||||||
|
];
|
||||||
|
|
||||||
|
# See slurm.conf(5) for more details about these options.
|
||||||
|
extraConfig = ''
|
||||||
|
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
|
||||||
|
# not with Intel MPI. For that use the compatibility shim libpmi.so
|
||||||
|
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
|
||||||
|
# library in SLURM (--mpi=pmix). See more details here:
|
||||||
|
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
|
||||||
|
MpiDefault=pmix
|
||||||
|
|
||||||
|
# When a node reboots return that node to the slurm queue as soon as it
|
||||||
|
# becomes operative again.
|
||||||
|
ReturnToService=2
|
||||||
|
|
||||||
|
# Track all processes by using a cgroup
|
||||||
|
ProctrackType=proctrack/cgroup
|
||||||
|
|
||||||
|
# Enable task/affinity to allow the jobs to run in a specified subset of
|
||||||
|
# the resources. Use the task/cgroup plugin to enable process containment.
|
||||||
|
TaskPlugin=task/affinity,task/cgroup
|
||||||
|
|
||||||
|
# Power off unused nodes until they are requested
|
||||||
|
SuspendProgram=${suspendProgram}
|
||||||
|
SuspendTimeout=60
|
||||||
|
ResumeProgram=${resumeProgram}
|
||||||
|
ResumeTimeout=300
|
||||||
|
SuspendExcNodes=fox
|
||||||
|
|
||||||
|
# Turn the nodes off after 1 hour of inactivity
|
||||||
|
SuspendTime=3600
|
||||||
|
|
||||||
|
# Reduce port range so we can allow only this range in the firewall
|
||||||
|
SrunPortRange=60000-61000
|
||||||
|
|
||||||
|
# Use cores as consumable resources. In SLURM terms, a core may have
|
||||||
|
# multiple hardware threads (or CPUs).
|
||||||
|
SelectType=select/cons_tres
|
||||||
|
|
||||||
|
# Ignore memory constraints and only use unused cores to share a node with
|
||||||
|
# other jobs.
|
||||||
|
SelectTypeParameters=CR_Core
|
||||||
|
|
||||||
|
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
|
||||||
|
# This sets up the "extern" step into which ssh-launched processes will be
|
||||||
|
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
|
||||||
|
# when a task runs (srun) so we can ssh early.
|
||||||
|
PrologFlags=Alloc,Contain,X11
|
||||||
|
|
||||||
|
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
|
||||||
|
# adopted by the external step, similar to tasks running in regular steps
|
||||||
|
# LaunchParameters=ulimit_pam_adopt
|
||||||
|
SlurmdDebug=debug5
|
||||||
|
#DebugFlags=Protocol,Cgroup
|
||||||
|
'';
|
||||||
|
|
||||||
|
extraCgroupConfig = ''
|
||||||
|
CgroupPlugin=cgroup/v2
|
||||||
|
#ConstrainCores=yes
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
# Place the slurm config in /etc as this will be required by PAM
|
||||||
|
environment.etc.slurm.source = config.services.slurm.etcSlurm;
|
||||||
|
|
||||||
|
age.secrets.mungeKey = {
|
||||||
|
file = ../../secrets/munge-key.age;
|
||||||
|
owner = "munge";
|
||||||
|
group = "munge";
|
||||||
|
};
|
||||||
|
|
||||||
|
services.munge = {
|
||||||
|
enable = true;
|
||||||
|
password = config.age.secrets.mungeKey.path;
|
||||||
|
};
|
||||||
|
}
|
||||||
23
m/module/slurm-server.nix
Normal file
23
m/module/slurm-server.nix
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{ ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
./slurm-common.nix
|
||||||
|
];
|
||||||
|
|
||||||
|
services.slurm.server.enable = true;
|
||||||
|
|
||||||
|
networking.firewall = {
|
||||||
|
extraCommands = ''
|
||||||
|
# Accept slurm connections to controller from compute nodes
|
||||||
|
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817 -j nixos-fw-accept
|
||||||
|
# Accept slurm connections from compute nodes for srun
|
||||||
|
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
|
||||||
|
|
||||||
|
# Accept slurm connections to controller from fox (via wireguard)
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 6817 -j nixos-fw-accept
|
||||||
|
# Accept slurm connections from fox for srun (via wireguard)
|
||||||
|
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 60000:61000 -j nixos-fw-accept
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -6,6 +6,7 @@
|
|||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
../module/ssh-hut-extern.nix
|
../module/ssh-hut-extern.nix
|
||||||
|
../module/nvidia.nix
|
||||||
../eudy/kernel/perf.nix
|
../eudy/kernel/perf.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
@@ -38,6 +39,7 @@
|
|||||||
};
|
};
|
||||||
hosts = {
|
hosts = {
|
||||||
"10.0.44.4" = [ "tent" ];
|
"10.0.44.4" = [ "tent" ];
|
||||||
|
"84.88.53.236" = [ "apex" ];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -49,12 +51,7 @@
|
|||||||
# Enable performance governor
|
# Enable performance governor
|
||||||
powerManagement.cpuFreqGovernor = "performance";
|
powerManagement.cpuFreqGovernor = "performance";
|
||||||
|
|
||||||
# Configure Nvidia driver to use with CUDA
|
|
||||||
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
|
||||||
hardware.nvidia.open = false; # Maxwell is older than Turing architecture
|
hardware.nvidia.open = false; # Maxwell is older than Turing architecture
|
||||||
hardware.graphics.enable = true;
|
|
||||||
nixpkgs.config.nvidia.acceptLicense = true;
|
|
||||||
services.xserver.videoDrivers = [ "nvidia" ];
|
|
||||||
|
|
||||||
services.openssh.settings.X11Forwarding = true;
|
services.openssh.settings.X11Forwarding = true;
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,9 @@
|
|||||||
nameservers = [ "84.88.52.35" "84.88.52.36" ];
|
nameservers = [ "84.88.52.35" "84.88.52.36" ];
|
||||||
search = [ "bsc.es" "ac.upc.edu" ];
|
search = [ "bsc.es" "ac.upc.edu" ];
|
||||||
defaultGateway = "10.0.44.1";
|
defaultGateway = "10.0.44.1";
|
||||||
|
hosts = {
|
||||||
|
"84.88.53.236" = [ "apex" ];
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
services.p.enable = true;
|
services.p.enable = true;
|
||||||
|
|||||||
@@ -26,5 +26,7 @@
|
|||||||
SENDMAIL_ARGS = "--";
|
SENDMAIL_ARGS = "--";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
lfs.enable = true;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ in
|
|||||||
rewrite ^/git/(.*) /$1 break;
|
rewrite ^/git/(.*) /$1 break;
|
||||||
proxy_pass http://127.0.0.1:3000;
|
proxy_pass http://127.0.0.1:3000;
|
||||||
proxy_redirect http:// $scheme://;
|
proxy_redirect http:// $scheme://;
|
||||||
|
client_max_body_size 64M;
|
||||||
}
|
}
|
||||||
location /cache {
|
location /cache {
|
||||||
rewrite ^/cache/(.*) /$1 break;
|
rewrite ^/cache/(.*) /$1 break;
|
||||||
|
|||||||
@@ -14,6 +14,10 @@
|
|||||||
# Users with sudo access
|
# Users with sudo access
|
||||||
users.groups.wheel.members = [ "abonerib" "anavarro" ];
|
users.groups.wheel.members = [ "abonerib" "anavarro" ];
|
||||||
|
|
||||||
|
# Run julia installed with juliaup using julia's own libraries:
|
||||||
|
# NIX_LD_LIBRARY_PATH=~/.julia/juliaup/${VERS}/lib/julia ~/.juliaup/bin/julia
|
||||||
|
programs.nix-ld.enable = true;
|
||||||
|
|
||||||
networking = {
|
networking = {
|
||||||
hostName = "weasel";
|
hostName = "weasel";
|
||||||
interfaces.eno1.ipv4.addresses = [ {
|
interfaces.eno1.ipv4.addresses = [ {
|
||||||
|
|||||||
12
pkgs/cudainfo/Makefile
Normal file
12
pkgs/cudainfo/Makefile
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Builds the cudainfo tool from cudainfo.cpp with nvcc.
#
# HOSTCXX may be overridden from the environment or the command line to pick
# the host compiler that nvcc drives through -ccbin.
HOSTCXX ?= g++
NVCC := nvcc -ccbin $(HOSTCXX)
CXXFLAGS := -m64

# "all" and "clean" name actions, not files: declaring them phony keeps them
# working even if a file with one of those names appears in the directory.
.PHONY: all clean

# Target rules
all: cudainfo

cudainfo: cudainfo.cpp
	$(NVCC) $(CXXFLAGS) -o $@ $<

clean:
	rm -f cudainfo cudainfo.o
|
||||||
600
pkgs/cudainfo/cudainfo.cpp
Normal file
600
pkgs/cudainfo/cudainfo.cpp
Normal file
@@ -0,0 +1,600 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
|
||||||
|
|
||||||
|
// Shared Utilities (QA Testing)
|
||||||
|
|
||||||
|
// std::system includes
|
||||||
|
#include <memory>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
|
||||||
|
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
|
||||||
|
|
||||||
|
// CUDA Runtime error messages
|
||||||
|
#ifdef __DRIVER_TYPES_H__
|
||||||
|
// Returns the spelling of the cudaError_t enumerator matching `error`, or
// "<unknown>" when the value is not one of the enumerators listed below.
static const char *_cudaGetErrorEnum(cudaError_t error)
{
// Expands to one switch arm that returns the enumerator's own name as text.
#define CUDAINFO_ERROR_NAME(code) \
    case code:                    \
        return #code;

    switch (error)
    {
        CUDAINFO_ERROR_NAME(cudaSuccess)
        CUDAINFO_ERROR_NAME(cudaErrorMissingConfiguration)
        CUDAINFO_ERROR_NAME(cudaErrorMemoryAllocation)
        CUDAINFO_ERROR_NAME(cudaErrorInitializationError)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchFailure)
        CUDAINFO_ERROR_NAME(cudaErrorPriorLaunchFailure)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchTimeout)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchOutOfResources)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidDeviceFunction)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidConfiguration)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidDevice)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidValue)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidPitchValue)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidSymbol)
        CUDAINFO_ERROR_NAME(cudaErrorMapBufferObjectFailed)
        CUDAINFO_ERROR_NAME(cudaErrorUnmapBufferObjectFailed)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidHostPointer)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidDevicePointer)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidTexture)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidTextureBinding)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidChannelDescriptor)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidMemcpyDirection)
        CUDAINFO_ERROR_NAME(cudaErrorAddressOfConstant)
        CUDAINFO_ERROR_NAME(cudaErrorTextureFetchFailed)
        CUDAINFO_ERROR_NAME(cudaErrorTextureNotBound)
        CUDAINFO_ERROR_NAME(cudaErrorSynchronizationError)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidFilterSetting)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidNormSetting)
        CUDAINFO_ERROR_NAME(cudaErrorMixedDeviceExecution)
        CUDAINFO_ERROR_NAME(cudaErrorCudartUnloading)
        CUDAINFO_ERROR_NAME(cudaErrorUnknown)
        CUDAINFO_ERROR_NAME(cudaErrorNotYetImplemented)
        CUDAINFO_ERROR_NAME(cudaErrorMemoryValueTooLarge)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidResourceHandle)
        CUDAINFO_ERROR_NAME(cudaErrorNotReady)
        CUDAINFO_ERROR_NAME(cudaErrorInsufficientDriver)
        CUDAINFO_ERROR_NAME(cudaErrorSetOnActiveProcess)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidSurface)
        CUDAINFO_ERROR_NAME(cudaErrorNoDevice)
        CUDAINFO_ERROR_NAME(cudaErrorECCUncorrectable)
        CUDAINFO_ERROR_NAME(cudaErrorSharedObjectSymbolNotFound)
        CUDAINFO_ERROR_NAME(cudaErrorSharedObjectInitFailed)
        CUDAINFO_ERROR_NAME(cudaErrorUnsupportedLimit)
        CUDAINFO_ERROR_NAME(cudaErrorDuplicateVariableName)
        CUDAINFO_ERROR_NAME(cudaErrorDuplicateTextureName)
        CUDAINFO_ERROR_NAME(cudaErrorDuplicateSurfaceName)
        CUDAINFO_ERROR_NAME(cudaErrorDevicesUnavailable)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidKernelImage)
        CUDAINFO_ERROR_NAME(cudaErrorNoKernelImageForDevice)
        CUDAINFO_ERROR_NAME(cudaErrorIncompatibleDriverContext)
        CUDAINFO_ERROR_NAME(cudaErrorPeerAccessAlreadyEnabled)
        CUDAINFO_ERROR_NAME(cudaErrorPeerAccessNotEnabled)
        CUDAINFO_ERROR_NAME(cudaErrorDeviceAlreadyInUse)
        CUDAINFO_ERROR_NAME(cudaErrorProfilerDisabled)
        CUDAINFO_ERROR_NAME(cudaErrorProfilerNotInitialized)
        CUDAINFO_ERROR_NAME(cudaErrorProfilerAlreadyStarted)
        CUDAINFO_ERROR_NAME(cudaErrorProfilerAlreadyStopped)

        /* Since CUDA 4.0 */
        CUDAINFO_ERROR_NAME(cudaErrorAssert)
        CUDAINFO_ERROR_NAME(cudaErrorTooManyPeers)
        CUDAINFO_ERROR_NAME(cudaErrorHostMemoryAlreadyRegistered)
        CUDAINFO_ERROR_NAME(cudaErrorHostMemoryNotRegistered)

        /* Since CUDA 5.0 */
        CUDAINFO_ERROR_NAME(cudaErrorOperatingSystem)
        CUDAINFO_ERROR_NAME(cudaErrorPeerAccessUnsupported)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchMaxDepthExceeded)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchFileScopedTex)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchFileScopedSurf)
        CUDAINFO_ERROR_NAME(cudaErrorSyncDepthExceeded)
        CUDAINFO_ERROR_NAME(cudaErrorLaunchPendingCountExceeded)
        CUDAINFO_ERROR_NAME(cudaErrorNotPermitted)
        CUDAINFO_ERROR_NAME(cudaErrorNotSupported)

        /* Since CUDA 6.0 */
        CUDAINFO_ERROR_NAME(cudaErrorHardwareStackError)
        CUDAINFO_ERROR_NAME(cudaErrorIllegalInstruction)
        CUDAINFO_ERROR_NAME(cudaErrorMisalignedAddress)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidAddressSpace)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidPc)
        CUDAINFO_ERROR_NAME(cudaErrorIllegalAddress)

        /* Since CUDA 6.5 */
        CUDAINFO_ERROR_NAME(cudaErrorInvalidPtx)
        CUDAINFO_ERROR_NAME(cudaErrorInvalidGraphicsContext)
        CUDAINFO_ERROR_NAME(cudaErrorStartupFailure)
        CUDAINFO_ERROR_NAME(cudaErrorApiFailureBase)

        default:
            break;
    }

#undef CUDAINFO_ERROR_NAME

    // Value not covered by any arm above.
    return "<unknown>";
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template< typename T >
|
||||||
|
void check(T result, char const *const func, const char *const file, int const line)
|
||||||
|
{
|
||||||
|
if (result)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
|
||||||
|
file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
|
||||||
|
cudaDeviceReset();
|
||||||
|
// Make sure we call CUDA Device Reset before exiting
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int *pArgc = NULL;
|
||||||
|
char **pArgv = NULL;
|
||||||
|
|
||||||
|
#if CUDART_VERSION < 5000
|
||||||
|
|
||||||
|
// CUDA-C includes
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
// This function wraps the CUDA Driver API into a template function
|
||||||
|
template <class T>
|
||||||
|
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
|
||||||
|
{
|
||||||
|
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
|
||||||
|
|
||||||
|
if (CUDA_SUCCESS != error) {
|
||||||
|
fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
|
||||||
|
error, __FILE__, __LINE__);
|
||||||
|
|
||||||
|
// cudaDeviceReset causes the driver to clean up all state. While
|
||||||
|
// not mandatory in normal operation, it is good practice. It is also
|
||||||
|
// needed to ensure correct operation when the application is being
|
||||||
|
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||||
|
// flushed before the application exits
|
||||||
|
cudaDeviceReset();
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* CUDART_VERSION < 5000 */
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
/**
 * Returns the number of CUDA cores per streaming multiprocessor (SM) for a
 * compute capability.
 *
 * @param major  compute capability major version
 * @param minor  compute capability minor version
 * @return the cores-per-SM count from the table below. When the capability
 *         is not in the table, a notice is printed to stdout and the count
 *         of the last table entry is returned.
 */
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row: SM version encoded as 0xMm (hexadecimal notation),
    // M = SM major version, m = SM minor version, and its cores per SM.
    struct SMtoCores {
        int SM;
        int Cores;
    };

    static const SMtoCores nGpuArchCoresPerSM[] = {
        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x32, 192 }, // Kepler Generation (SM 3.2) GK10x class
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { 0x37, 192 }, // Kepler Generation (SM 3.7) GK21x class
        { 0x50, 128 }, // Maxwell Generation (SM 5.0) GM10x class
        { 0x52, 128 }, // Maxwell Generation (SM 5.2) GM20x class
        { 0x53, 128 }, // Maxwell Generation (SM 5.3)
        { 0x60, 64 },  // Pascal Generation (SM 6.0)
        { 0x61, 128 }, // Pascal Generation (SM 6.1)
        { 0x62, 128 }, // Pascal Generation (SM 6.2)
        { 0x70, 64 },  // Volta Generation (SM 7.0)
        { 0x72, 64 },  // Volta Generation (SM 7.2)
        { 0x75, 64 },  // Turing Generation (SM 7.5)
        { 0x80, 64 },  // Ampere Generation (SM 8.0)
        { 0x86, 128 }, // Ampere Generation (SM 8.6)
        { 0x87, 128 }, // Ampere Generation (SM 8.7)
        { 0x89, 128 }, // Ada Generation (SM 8.9)
        { 0x90, 128 }  // Hopper Generation (SM 9.0)
    };

    const int numEntries =
        static_cast<int>(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));
    const int smVersion = (major << 4) + minor;

    for (int index = 0; index < numEntries; index++) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    // If we don't find the value, fall back to the last known architecture so
    // the caller can still report something.
    const int fallbackCores = nGpuArchCoresPerSM[numEntries - 1].Cores;
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, fallbackCores);
    return fallbackCores;
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Program main
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int
|
||||||
|
main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
pArgc = &argc;
|
||||||
|
pArgv = argv;
|
||||||
|
|
||||||
|
printf("%s Starting...\n\n", argv[0]);
|
||||||
|
printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
|
||||||
|
|
||||||
|
int deviceCount = 0;
|
||||||
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||||
|
|
||||||
|
if (error_id != cudaSuccess) {
|
||||||
|
printf("cudaGetDeviceCount failed: %s (%d)\n",
|
||||||
|
cudaGetErrorString(error_id), (int) error_id);
|
||||||
|
printf("Result = FAIL\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function call returns 0 if there are no CUDA capable devices.
|
||||||
|
if (deviceCount == 0)
|
||||||
|
printf("There are no available device(s) that support CUDA\n");
|
||||||
|
else
|
||||||
|
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
|
||||||
|
|
||||||
|
int dev, driverVersion = 0, runtimeVersion = 0;
|
||||||
|
|
||||||
|
for (dev = 0; dev < deviceCount; ++dev) {
|
||||||
|
cudaSetDevice(dev);
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaGetDeviceProperties(&deviceProp, dev);
|
||||||
|
|
||||||
|
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
|
||||||
|
|
||||||
|
// Console log
|
||||||
|
cudaDriverGetVersion(&driverVersion);
|
||||||
|
cudaRuntimeGetVersion(&runtimeVersion);
|
||||||
|
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||||
|
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
|
||||||
|
|
||||||
|
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
|
||||||
|
(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
|
||||||
|
|
||||||
|
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
|
||||||
|
deviceProp.multiProcessorCount,
|
||||||
|
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
||||||
|
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
|
||||||
|
printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
|
||||||
|
|
||||||
|
|
||||||
|
#if CUDART_VERSION >= 5000
|
||||||
|
// This is supported in CUDA 5.0 (runtime API device properties)
|
||||||
|
printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
|
||||||
|
printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
|
||||||
|
|
||||||
|
if (deviceProp.l2CacheSize) {
|
||||||
|
printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
|
||||||
|
int memoryClock;
|
||||||
|
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
|
||||||
|
printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
|
||||||
|
int memBusWidth;
|
||||||
|
getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
|
||||||
|
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
|
||||||
|
int L2CacheSize;
|
||||||
|
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
|
||||||
|
|
||||||
|
if (L2CacheSize) {
|
||||||
|
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
|
||||||
|
deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
|
||||||
|
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
|
||||||
|
printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
|
||||||
|
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
|
||||||
|
printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
|
||||||
|
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
|
||||||
|
|
||||||
|
|
||||||
|
printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
|
||||||
|
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
|
||||||
|
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
|
||||||
|
printf(" Warp size: %d\n", deviceProp.warpSize);
|
||||||
|
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
|
||||||
|
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
|
||||||
|
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
|
||||||
|
deviceProp.maxThreadsDim[0],
|
||||||
|
deviceProp.maxThreadsDim[1],
|
||||||
|
deviceProp.maxThreadsDim[2]);
|
||||||
|
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
|
||||||
|
deviceProp.maxGridSize[0],
|
||||||
|
deviceProp.maxGridSize[1],
|
||||||
|
deviceProp.maxGridSize[2]);
|
||||||
|
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
|
||||||
|
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
|
||||||
|
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
|
||||||
|
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
|
||||||
|
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
|
||||||
|
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
|
||||||
|
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
|
||||||
|
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
|
||||||
|
#endif
|
||||||
|
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
|
||||||
|
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
|
||||||
|
|
||||||
|
const char *sComputeMode[] = {
|
||||||
|
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
|
||||||
|
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
|
||||||
|
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
|
||||||
|
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
|
||||||
|
"Unknown",
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
printf(" Compute Mode:\n");
|
||||||
|
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there are 2 or more GPUs, query to determine whether RDMA is supported
|
||||||
|
if (deviceCount >= 2)
|
||||||
|
{
|
||||||
|
cudaDeviceProp prop[64];
|
||||||
|
int gpuid[64]; // we want to find the first two GPU's that can support P2P
|
||||||
|
int gpu_p2p_count = 0;
|
||||||
|
|
||||||
|
for (int i=0; i < deviceCount; i++)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
|
||||||
|
|
||||||
|
// Only boards based on Fermi or later can support P2P
|
||||||
|
if ((prop[i].major >= 2)
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
// on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to supprot this
|
||||||
|
&& prop[i].tccDriver
|
||||||
|
#endif
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// This is an array of P2P capable GPUs
|
||||||
|
gpuid[gpu_p2p_count++] = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show all the combinations of support P2P GPUs
|
||||||
|
int can_access_peer_0_1, can_access_peer_1_0;
|
||||||
|
|
||||||
|
if (gpu_p2p_count >= 2)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < gpu_p2p_count-1; i++)
|
||||||
|
{
|
||||||
|
for (int j = 1; j < gpu_p2p_count; j++)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j]));
|
||||||
|
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
|
||||||
|
prop[gpuid[j]].name, gpuid[j] ,
|
||||||
|
can_access_peer_0_1 ? "Yes" : "No");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 1; j < gpu_p2p_count; j++)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < gpu_p2p_count-1; i++)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i]));
|
||||||
|
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j],
|
||||||
|
prop[gpuid[i]].name, gpuid[i] ,
|
||||||
|
can_access_peer_1_0 ? "Yes" : "No");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// csv masterlog info
|
||||||
|
// *****************************
|
||||||
|
// exe and CUDA driver name
|
||||||
|
printf("\n");
|
||||||
|
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
|
||||||
|
char cTemp[128];
|
||||||
|
|
||||||
|
// driver version
|
||||||
|
sProfileString += ", CUDA Driver Version = ";
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
||||||
|
#endif
|
||||||
|
sProfileString += cTemp;
|
||||||
|
|
||||||
|
// Runtime version
|
||||||
|
sProfileString += ", CUDA Runtime Version = ";
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
||||||
|
#endif
|
||||||
|
sProfileString += cTemp;
|
||||||
|
|
||||||
|
// Device count
|
||||||
|
sProfileString += ", NumDevs = ";
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 10, "%d", deviceCount);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, "%d", deviceCount);
|
||||||
|
#endif
|
||||||
|
sProfileString += cTemp;
|
||||||
|
|
||||||
|
// Print Out all device Names
|
||||||
|
for (dev = 0; dev < deviceCount; ++dev)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
sprintf_s(cTemp, 13, ", Device%d = ", dev);
|
||||||
|
#else
|
||||||
|
sprintf(cTemp, ", Device%d = ", dev);
|
||||||
|
#endif
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaGetDeviceProperties(&deviceProp, dev);
|
||||||
|
sProfileString += cTemp;
|
||||||
|
sProfileString += deviceProp.name;
|
||||||
|
}
|
||||||
|
|
||||||
|
sProfileString += "\n";
|
||||||
|
printf("%s", sProfileString.c_str());
|
||||||
|
|
||||||
|
printf("Result = PASS\n");
|
||||||
|
|
||||||
|
// finish
|
||||||
|
// cudaDeviceReset causes the driver to clean up all state. While
|
||||||
|
// not mandatory in normal operation, it is good practice. It is also
|
||||||
|
// needed to ensure correct operation when the application is being
|
||||||
|
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||||
|
// flushed before the application exits
|
||||||
|
cudaDeviceReset();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
43
pkgs/cudainfo/default.nix
Normal file
43
pkgs/cudainfo/default.nix
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Package definition for cudainfo, built from the sources in this directory.
# The result also carries `passthru.gpuCheck`, a separate derivation that
# runs the built binary on a builder advertising the "cuda" system feature.
{ stdenv, cudatoolkit, cudaPackages, autoAddDriverRunpath, strace }:

stdenv.mkDerivation (finalAttrs:
  let
    # Runs cudainfo; if it fails, dumps its dynamic linking details and a
    # system call trace to the build log.
    gpuCheck = stdenv.mkDerivation {
      name = "cudainfo-test";
      requiredSystemFeatures = [ "cuda" ];

      # Nothing to unpack or build: only the check phase does work.
      dontUnpack = true;
      dontBuild = true;
      doCheck = true;

      nativeCheckInputs = [
        finalAttrs.finalPackage # The cudainfo package from above
        strace # When it fails, it will show the trace
      ];

      checkPhase = ''
        if ! cudainfo; then
        set -x
        cudainfo=$(command -v cudainfo)
        ldd $cudainfo
        readelf -d $cudainfo
        strace -f $cudainfo
        set +x
        fi
      '';

      installPhase = "touch $out";
    };
  in
  {
    name = "cudainfo";
    src = ./.;

    buildInputs = [
      cudatoolkit # Required for nvcc
      cudaPackages.cuda_cudart.static # Required for -lcudart_static
      autoAddDriverRunpath
    ];

    installPhase = ''
      mkdir -p $out/bin
      cp -a cudainfo $out/bin
    '';

    passthru = {
      inherit gpuCheck;
    };
  })
|
||||||
@@ -52,4 +52,5 @@ final: prev:
|
|||||||
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
||||||
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
||||||
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
||||||
|
cudainfo = prev.callPackage ./cudainfo/default.nix { };
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
@@ -1,11 +1,13 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 HY2yRg d7+nvfAcdC3GjJxipXFrsfGGyP5jAY+gRWRV+4FVYAM
|
-> ssh-ed25519 HY2yRg gKGxsjHfpiRDQ6Tuvcx7pjKgrVUGweotuplLYwCGvik
|
||||||
CG7r0bRGgnUWcdfDnpe7HwZ3L/y7b5iuJuqvf15b3/Y
|
DSz9j/stVyB1lXpVP+kg+H+RDgSftREGFFLQZClC3kI
|
||||||
-> ssh-ed25519 CAWG4Q X0vITOErz4wkR3VQYOcVlnrkHtwe+ytdZz1Hcrs4vVs
|
-> ssh-ed25519 cK5kHw 17DpKekfNVy4V742QSd61r2w6iawtOJR7Ct3UflDXio
|
||||||
6IWYOhXLQ+BnML9YfLLHJYEO2CZ/uEc9IBqhoWvjDHI
|
hsqTEPCYjHKvndMWPl4GpG23CzjGgVrS+cLIymISJHU
|
||||||
-> ssh-ed25519 xA739A p5e/0AJtZ0+zbRvkB/usLuxusY8xXRx9Ksi/LQlcIHw
|
-> ssh-ed25519 CAWG4Q oK01d4pbBqEZVsymSiKijPvJo714xsMSRMbzkssJKiw
|
||||||
M4S/qlzT9POyJx4gY9lmycstUcdwG2cinN4OlV22zzo
|
hs0tVFkqtIHXg9jtC2iDgCtefFcWvGJkXB+HJUcqXQs
|
||||||
-> ssh-ed25519 MSF3dg Ydl7uBWzBx6sAaxbzC3x8qiaU3ysGqV4rUFLpHCEV30
|
-> ssh-ed25519 xA739A KxO+AawfLMERHwzt3YnZRwPFlCfGETma7fo8M+ZtsAY
|
||||||
/1AUHBhCNOs9i7LJbmzwQDHsu+ybzYf6+coztKk5E3U
|
eSn0+/rhLQxNKt5xKubKck8Nxun2Sh3eJqBU/hwgzZM
|
||||||
--- kYt15WxClpT7PXD1oFe9GqJU+OswjH7y9wIc8/GzZ7M
|
-> ssh-ed25519 MSF3dg OyaZBLB2kO8fU139lXbbC404gT7IzIWk+BMhYzabBDg
|
||||||
<EFBFBD><EFBFBD>h<>ߓ<><DF93><EFBFBD>`<60><><EFBFBD>V4F<34><46>_k)^<5E>m$uj:ѳ<><D1B3><17><><EFBFBD>}<7D>Z]$U]<12>u<EFBFBD> <20>0<EFBFBD><30><EFBFBD>v8<76>?<3F>X<EFBFBD>P<EFBFBD>g%d<>#<23>d9{rAi<41><69>
|
/fiPFfBJcb+e40+fZbwCw7niF2hh+JxUPiKSiwUSOWg
|
||||||
|
--- ycZyGX+Li+LsOuweF9OVPl8aoMaRgp/RdFbDrPszkUs
|
||||||
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>YM<EFBFBD><EFBFBD>:E O<><4F>2<EFBFBD>r=<15>&4<><04>CQΣ<51><CEA3>hC<68><43><EFBFBD>cb<63>^Sy<53><79>% <09><>x-vC`g<><15><><EFBFBD><EFBFBD>W^<5E><>wVG<0B><><EFBFBD>
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -2,6 +2,8 @@ let
|
|||||||
keys = import ../keys.nix;
|
keys = import ../keys.nix;
|
||||||
adminsKeys = builtins.attrValues keys.admins;
|
adminsKeys = builtins.attrValues keys.admins;
|
||||||
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
||||||
|
fox = [ keys.hosts.fox ] ++ adminsKeys;
|
||||||
|
apex = [ keys.hosts.apex ] ++ adminsKeys;
|
||||||
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
||||||
tent = [ keys.hosts.tent ] ++ adminsKeys;
|
tent = [ keys.hosts.tent ] ++ adminsKeys;
|
||||||
# Only expose ceph keys to safe nodes and admins
|
# Only expose ceph keys to safe nodes and admins
|
||||||
@@ -24,4 +26,7 @@ in
|
|||||||
|
|
||||||
"ceph-user.age".publicKeys = safe;
|
"ceph-user.age".publicKeys = safe;
|
||||||
"munge-key.age".publicKeys = safe;
|
"munge-key.age".publicKeys = safe;
|
||||||
|
|
||||||
|
"wg-fox.age".publicKeys = fox;
|
||||||
|
"wg-apex.age".publicKeys = apex;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 G5LX5w HlQ4V8lBd3im5j8KHEuQZBTuztvPj1QoWdv6FL6qzGI
|
-> ssh-ed25519 G5LX5w Zhbs+NM/SI49qQ0X8bBpWUWxYM0vUKCXNAnPpIE2NR0
|
||||||
Jpt91X1UIIVFQt1X6Q//kALn+Cetp/LqBZZvTuhFthw
|
CkBUmJ26EkwHztT8Pz0UGq2KZwN0Xz8iYQ9cEHL9OWQ
|
||||||
-> ssh-ed25519 CAWG4Q StnngJAcuAwUnTrXDR3nJ2KFN0jNdTqSz+/1TfmWkzA
|
-> ssh-ed25519 cK5kHw 5KjUXJywRDp2A7l5ukTCS+WIAalxwP1f71ejGxwNrX4
|
||||||
CR4AQ6fqaJVY1mdUIX1gzaZwRs1sU8F8hHztnkN8vN0
|
JW8OLmfkULXo9AwYMGNyOgZ+nQ0MVc0PCM4kKPIo6V4
|
||||||
-> ssh-ed25519 xA739A xya5A5t63Owx+VrGgUfV/lIP8b/xV1cerMpuZBLaDVM
|
-> ssh-ed25519 CAWG4Q cVjY3R0ZHAfokA4kWlu5vOl2Gs7mdqRgRk4WSUOXAjg
|
||||||
w+pA583yUnFq2AvGBGzWbQIGQEY9WqW0CSLQ9v+SG0c
|
IxEDvuximW99EqxmpW+Btpm0Zydmwg/u87bqnl26NYc
|
||||||
-> ssh-ed25519 MSF3dg aXkLxCyYdOwVopHHmpXEI6WlAIizKdJi4IO0KEdhS3s
|
-> ssh-ed25519 xA739A hmuwZuxmJnuAjmU4X8yhPQ+hPWvN1G+ZS0pvD7fHamg
|
||||||
WKXkTszZN66+QZdSDJ4D9q7xgYWMfliOLCubIF2Dqkc
|
fnAPW6ZCrv5pSO4RQhhr8xz7ij7jAZJk0ApWluOXDng
|
||||||
--- uVWoU2lMkqQ/9Z0BqKRCeUpsKi8lwmHukT/FV8wYMbg
|
-> ssh-ed25519 MSF3dg SSGLcWnum0Qo/0OnKDZVg9xAZMwGwVNYYmRJXxb4GU0
|
||||||
<EFBFBD><EFBFBD>1G+<2B>6<EFBFBD><36>g[|x]2T<32>й<EFBFBD><D0B9><EFBFBD> <20>CKu)<29><><EFBFBD>]<5D><>8֓<38><D693><EFBFBD><EFBFBD>l<EFBFBD><6C>S<EFBFBD><53><EFBFBD>Q<EFBFBD><07><>x<EFBFBD><78><EFBFBD><EFBFBD>#7r<37>k{*<2A><>3ս~C<>b<EFBFBD><62><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڵ<EFBFBD>Np<1E><05>]J]h<>je+d%Е<>#<23>m<EFBFBD>?=6}<7D>
|
pdl6kATG7n2oMsoUboBfu+vDKurJcH1UvUa70rfMQkE
|
||||||
|
--- a2ZQAeAQlO9DWnegIAq6NpI1Po6f38l+hitZvq+zIW8
|
||||||
|
<EFBFBD>\ֺ"^<5E>DT<44>H<EFBFBD><48>3<EFBFBD><33><EFBFBD>_|.h<0E><><EFBFBD><EFBFBD><03>^<5E>n<14><0E><><EFBFBD><EFBFBD><1A>g<EFBFBD>S<EFBFBD>]_<><5F>?n<>z~2<>!<21>p7<70><37><<3C><14>ʨD?<3F>~<02>F<EFBFBD>$<24>`<60>q+<2B><><EFBFBD>SW<53>(+<2B><>P<EFBFBD>c<1E>u[<5B>m<EFBFBD>`O<>ܛ<EFBFBD>ϖT
|
||||||
@@ -1,11 +1,13 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 G5LX5w sg9SmahxBg35MDIxhrp4oHkaTaxsKoVQju2eNhCt0BM
|
-> ssh-ed25519 G5LX5w VKM/Y6Wy0gmb2gc4Q00VzHQ4IAxfSyshuDoaAzlEkFM
|
||||||
CZ64dEGqz2tbkG8KtimZvLUEMrQpVVBJP7Fu46WTMgc
|
vf18uoEN5ZLJ4HcJg85epaseh1CRL9/ncXtU2HpH+QE
|
||||||
-> ssh-ed25519 CAWG4Q jzS1R14W1CWxdziMLG/yCGPLWSkiyE+9lqyCVe491ng
|
-> ssh-ed25519 cK5kHw sMuG07kjlI6VjPjELOUPzkn+KT9Yq7BPf0zSATM2aGI
|
||||||
acJo/nhKq3pSPoFEPaFLN1fzHHbEzstNoLtohWAHKiM
|
/eODwL8KwyVgFjBK2MJlbqjN7mEvXCSsjq9D96szrng
|
||||||
-> ssh-ed25519 xA739A qeGJoLeSIQwLU2Yg+Gi2bikHJ3HscLfyo1msqL3JwHw
|
-> ssh-ed25519 CAWG4Q t3/Ty7yCqC5x8KQY4VaHSQ9Q3epqMpXoBDKyKx9+VzE
|
||||||
tTwaxRBKTl/SoyY/LnxR/j/5WvCNX5VeZLKi018YMrY
|
JwgUsqMd+1jFZvFp9/SIoowbhSMVEkKp03T69+OHjho
|
||||||
-> ssh-ed25519 MSF3dg Wym7Uyf1XvH1H6mNDERkO8opkMiN0zzXm2PjXftEOWs
|
-> ssh-ed25519 xA739A 0ohmKK427+4vupivrtjXp0dDK8wT4XUA9rWgcsCGKgA
|
||||||
Uw8ZwwKIB5UqgVuoSLE2QajNDJZkH7/Y3Nsy+WFl7Xs
|
msbeQyz3pL8RLtAeXX5tsfyHyOXxhfYpqaLEKnRxpPQ
|
||||||
--- 94hGVbYiCGZdMEJesCMLh7IZi+w5l/Kr1lZJHQgrc0o
|
-> ssh-ed25519 MSF3dg H+6jAoP7/Dxp8C/7Bk1C4CT1hpkUhtbnTWWIxkO24Ec
|
||||||
j5j磛<6A><04><>J<EFBFBD><4A><EFBFBD>a<EFBFBD>]<5D>a%dr<64><72>FDT<44><54>^<5E><>Q<EFBFBD>s/<2F>kwB<77>$<24><>$<24><>H<EFBFBD>'<27><><EFBFBD><EFBFBD><EFBFBD>w<14><?^|<7C><07>h$<24>ؗ<EFBFBD>GI<47>ĕsT2RU<52><55>*/O<>7<EFBFBD><37><EFBFBD>G<EFBFBD>pͪ<70>4<EFBFBD><34><EFBFBD>M9<4D>j<><06>
|
SrMuUG93T5lUw3xINEen5EEKLXJizIGFhBO1fVroFHE
|
||||||
|
--- tIPnH9cxTV3m3qzvZB97Egz+raWwZJ182BXXKDu8f+o
|
||||||
|
<EFBFBD><EFBFBD>f#<23>,|<7C>Ey.v<>DL<44>Ӻ<05>JPX<50><07><>`<60><><EFBFBD><EFBFBD>-#<23>F<EFBFBD>Ubs<62>(Q!?<3F><1A>#xJG?5<><35><EFBFBD><EFBFBD><EFBFBD>~<7E><>6MA<15>U<><55><EFBFBD>C<01><>M<>$+}W<>NϨG!<21><><EFBFBD><EFBFBD>a<EFBFBD><61><EFBFBD><EFBFBD>%<25>ǽ<EFBFBD>G
|
||||||
@@ -1,12 +1,13 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 G5LX5w 5K0mzfJGvAB2LGmoQ9ZLbWooVEX6F4+fQdo1JUoB3FM
|
-> ssh-ed25519 G5LX5w 1KfTmTRP3iSdcclf/FuIpFWpy1tgKs5ED+qSYWo7inY
|
||||||
AKGa507bUrYjXFaMQ1MXTDBFYsdS6zbs+flmxYN0UNo
|
RX6Q1nLFF/yiVLpkWrl0BI0PpLoBi753+y8l/AXjNE4
|
||||||
-> ssh-ed25519 CAWG4Q 8KzLc949on8iN1pK8q11OpCIeO71t6b0zxCLHhcQ6ns
|
-> ssh-ed25519 cK5kHw TP7+OQpQSNuyArnUo1C97J3P3oB0YtzCEPeVvlzsYHE
|
||||||
uy7z6RdIuoUes+Uap3k5eoFFuu/DcSrEBwq4V4C/ygc
|
Bsy5KPNHTVNHnF1sxOvlfJq3CNMVFaXdYkRG2vSj7qM
|
||||||
-> ssh-ed25519 xA739A SLx5cKo0fdAHj+cLpJ4FYTWTUTyDsCqKQOufDu3xnGo
|
-> ssh-ed25519 CAWG4Q eQyzwNaH6CfaYIjs8abEuQxt6vxRXsGz69UletMUVDE
|
||||||
VnS/WsiSaf6RpXuhgfij4pYu4p9hlJl1oXrfYY9rKlQ
|
FDcynPO7xg4PWez5Z8gTg5LyE0Wgb3zT9i3Kon67QsU
|
||||||
-> ssh-ed25519 MSF3dg c5ZXvdNxNfZU3HeWsttuhy+UC5JxWN/IFuCuCGbksn4
|
-> ssh-ed25519 xA739A 2JuLai2fUu3dZBydS8cMrLrEUIUkz4NNaiupoBOtTwU
|
||||||
vcKlIirf+VvERX71YpmwW6zp6ClhlG2PR4R8LIN7cQo
|
sdM3X+XRzysop7yqa76Z7FAwTHOj91STCtZvfIgCdB0
|
||||||
--- pJKICDaYAlxqNnvHIuzB3Yk7tv0ZNYflGTQD+Zk/8+4
|
-> ssh-ed25519 MSF3dg fSPkiWnpInX1V5p3afPCoPotcGFoWFiOMPThtY927lc
|
||||||
<EFBFBD>h/\J<>J
|
8v7E/3l0xA2VWZPXzkN4NmnaA0KJutLMurn/ZXZmhxA
|
||||||
<EFBFBD>0?<3F> <20>p<EFBFBD><70><EFBFBD>@܉7<DC89><37>3<EFBFBD><33><EFBFBD><EFBFBD>z<EFBFBD><7A><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>a<EFBFBD><61>'<27>,ka<6B>I<EFBFBD>XXOZ<4F>I\<5C><><EFBFBD><EFBFBD><EFBFBD> <09>BP<42><50>/cUɿ~B<><42>S'Q<><51><EFBFBD><EFBFBD>f<06><><EFBFBD>er<65><72><EFBFBD><EFBFBD>^<5E><><EFBFBD><EFBFBD>8l<38><6C>V<EFBFBD>E<EFBFBD><45><EFBFBD>
|
--- MQkyBx9hT4ILYXKoZT18PWny1QbDFymcZr63zjMN/qQ
|
||||||
|
-b<>#<23><>M.<16>@<40>t<EFBFBD><74><EFBFBD>ŵ}+ό#@<40><><EFBFBD><EFBFBD><EFBFBD>k<EFBFBD>y<EFBFBD><79><EFBFBD>?v<><76>n<1F><>T<EFBFBD>+<2B><><EFBFBD>[<5B>Q<EFBFBD> gA<67><41><EFBFBD>
|
||||||
Binary file not shown.
@@ -1,12 +1,14 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 G5LX5w /RF8uZ/KahUqjEFILbF3+Jin+U0SQdoQChcc9RJ9axc
|
-> ssh-ed25519 G5LX5w SRJhNenoQXbT1FgX3TMPnVH5P6oe2eHot+M1YsEjsEk
|
||||||
aEmPk++86nBR6d2BIa/oaUdyiLS6cH8TUoYJE3bxba4
|
hfTSLgKi98Eh7JK5o7x2POpTEtQlQCpEa3keUFYCuME
|
||||||
-> ssh-ed25519 CAWG4Q qHyh9nQi8c3z/KHby9y5vhzN0Dwz0zca98ebjJmXrzs
|
-> ssh-ed25519 cK5kHw z5TwWJTkvx7HztjXHJW/aCOtOfPrQaLP0gyIT7rXcyU
|
||||||
ZbmwNzrSSQ3RvskE8SqcBa0vMy8pzm/HPGHLm5zuPGQ
|
b4NCpHfasgvkLLr+6LcWUl60p59aSNnfp3bl2OFYXo0
|
||||||
-> ssh-ed25519 xA739A FlGbfS4bUxA3gVDzb3yPjp4hV8a7aiNBLUctnN3bGEY
|
-> ssh-ed25519 CAWG4Q 4VpS1/OnFe8nxcQbRTKNhjsh/ZQ5cbhSMXwK/jjQ+3o
|
||||||
3fI6SyVjVhh2M8uc/XV3blpdQMPMYi2qzaHNXvx0bvM
|
WF9wvOkqVml4UcEzyzeumKuUwCwwr2zvKLMg+PCB8nk
|
||||||
-> ssh-ed25519 MSF3dg 0Bs/aW0nNISS+93It75o6hKZWa7S+LF5bF5ApsJ2fQ8
|
-> ssh-ed25519 xA739A 67FhuJ070jBVMt/xbKHWhfri6iIm0FyaFvzQabsvFBM
|
||||||
y7o0KYDHEen13ndIxg/mYil3eMxxzvYF2pWqhMb+rBU
|
1G5/913dDv/r/6p1x/c5YiUnZzrX/LvIj33KW+PN0KU
|
||||||
--- Iqo75G4+02Y9nc1OOkcEx+iQlKnGYCekAx76tRH53wA
|
-> ssh-ed25519 MSF3dg Bj/yB4N2wkyHCHC22tcjjJAA4ebSamN0Z4UVX3ZnryI
|
||||||
<10>
|
6D/ZgTs+j+MGDAbPU5zyK0i9zN6tQy68IcOnQZ27mYg
|
||||||
<EFBFBD>X<EFBFBD><EFBFBD>%f<0C><><12>hX<0B><>R<>c<EFBFBD>+z<><7A>eg<65>& <20>d<EFBFBD><64><EFBFBD>ק<06><>A<EFBFBD><41><EFBFBD>чXM<58>1<EFBFBD>
|
--- 169erk3ICSYLs4FPEuXCn7QlekWhsmSn0Lr+/R14I5Q
|
||||||
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><05>ҽ3<D2BD>s<EFBFBD>
|
||||||
|
w<EFBFBD><EFBFBD>4D<EFBFBD><EFBFBD>b.<2E><><EFBFBD>"|<7C><><EFBFBD>)"<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>;<3B>.<2E>ɫ7)<29>LeC<05>=S؟
|
||||||
BIN
secrets/wg-apex.age
Normal file
BIN
secrets/wg-apex.age
Normal file
Binary file not shown.
14
secrets/wg-fox.age
Normal file
14
secrets/wg-fox.age
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
age-encryption.org/v1
|
||||||
|
-> ssh-ed25519 cDBabA heyW9/cxgwFX9IexQIXjAQDWGQPNcMXcArQp2Rxsqx4
|
||||||
|
o9MQ7EH8PDDjsJdpH9F3Xq2zUoaDAJQlfFmYucSFs6Y
|
||||||
|
-> ssh-ed25519 cK5kHw Sza4pos7K3qW3omEeyidI/jszJNf9smemSZnUJfCIww
|
||||||
|
D6vazXki7hIYraIuSiGPS+FPbkFUwHhHWDf52OhEIMg
|
||||||
|
-> ssh-ed25519 CAWG4Q YexIHueOIMmIN8JIDyNUOKBkyz/k18HqV3hTXh48KlM
|
||||||
|
xh8UJzzWT6ByN+Dpn4JrMNsjGC/uc/v6LynwjBDz9NQ
|
||||||
|
-> ssh-ed25519 xA739A KySG3TXdqfCMUkVEDGa74B0op745s3XGYxFLyAXSQAc
|
||||||
|
5EI/yb5ctW9Qu18bHm3/sK97kwGcKzzmWvPSCWm89XA
|
||||||
|
-> ssh-ed25519 MSF3dg MNxnNj0fHmri8ophexXPNjRUBUWrzcuk5S1mucxUMTE
|
||||||
|
GVFWXtISEU8ZmlwL4nh4weAgfGrt2GHX0DTzbpS6zg8
|
||||||
|
--- UdrqkYG2ZApAuwdZeNhC50NP2rkD/Ol6y8nJa4RHx7Y
|
||||||
|
<EFBFBD>ܻ<EFBFBD>m(<28><><EFBFBD>><3E>H<48>Y87<><37>G<0F>+*<12><><EFBFBD><EFBFBD>9V<>.<2E><><EFBFBD><EFBFBD><03><><EFBFBD>p<EFBFBD>Oo<4F>=+哇<>P0<50><30>{<7B>)<29><17><><EFBFBD><EFBFBD>><3E>z3P^
|
||||||
|
u
|
||||||
@@ -21,17 +21,28 @@ the detailed specifications:
|
|||||||
|
|
||||||
## Access
|
## Access
|
||||||
|
|
||||||
To access the machine, request a SLURM session from [hut](/hut) using the `fox`
|
To access the machine, request a SLURM session from [apex](/apex) using the `fox`
|
||||||
partition:
|
partition. If you need the machine for performance measurements, use an
|
||||||
|
exclusive reservation:
|
||||||
|
|
||||||
hut% salloc -p fox
|
apex% salloc -p fox --exclusive
|
||||||
|
|
||||||
Then connect via ssh:
|
Otherwise, specify the number of CPUs that you need so other users can also use the node
|
||||||
|
at the same time:
|
||||||
|
|
||||||
hut% ssh fox
|
apex% salloc -p fox -c 8
|
||||||
|
|
||||||
|
Then use `srun` to start an interactive shell:
|
||||||
|
|
||||||
|
apex% srun --pty $SHELL
|
||||||
fox%
|
fox%
|
||||||
|
|
||||||
Follow [these steps](/access) if you don't have access to hut or fox.
|
Make sure you get all CPUs you expect:
|
||||||
|
|
||||||
|
fox% grep Cpus_allowed_list /proc/self/status
|
||||||
|
Cpus_allowed_list: 0-191
|
||||||
|
|
||||||
|
Follow [these steps](/access) if you don't have access to apex or fox.
|
||||||
|
|
||||||
## CUDA
|
## CUDA
|
||||||
|
|
||||||
@@ -89,9 +100,8 @@ Then just run `nix develop` from the same directory:
|
|||||||
|
|
||||||
The machine has several file systems available.
|
The machine has several file systems available.
|
||||||
|
|
||||||
- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
|
- `/nfs/home`: The `/home` from apex via NFS, which is also shared with other
|
||||||
Don't abuse.
|
xeon machines. It has about 2 ms of latency, so it is not suitable for quick random
|
||||||
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
|
access.
|
||||||
capacity. Stores three redundant copies of every file.
|
|
||||||
- `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
|
- `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
|
||||||
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
|
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
|
||||||
|
|||||||
Reference in New Issue
Block a user