1 Commit

Author SHA1 Message Date
95ca7b8750 Keep compute nodes off when power comes back
When the power comes back, we don't know whether the AC unit will be
operating properly or whether the room will be at a safe temperature.
So, instead of powering all the machines back on, configure only the
login node to power on, so that we can check the state of the room
before powering on the rest of the machines.
2025-07-31 17:44:49 +02:00
51 changed files with 258 additions and 812 deletions
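The m/module/power-policy.nix module that this commit leans on is not itself shown in the diff. As a minimal sketch of what such a module could look like — assuming it maps the power.policy option onto the BMC power-restore setting via ipmitool, which this commit does not confirm:

{ config, lib, pkgs, ... }:
{
  options.power.policy = lib.mkOption {
    # "always-on" powers the machine up as soon as mains power returns;
    # "always-off" keeps it down until it is powered on explicitly.
    type = lib.types.enum [ "always-on" "always-off" "previous" ];
    description = "Chassis power-restore policy to apply at boot.";
  };

  config.systemd.services.power-policy = {
    description = "Set the BMC power-restore policy";
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      Type = "oneshot";
      RemainAfterExit = true;
    };
    # "ipmitool chassis policy" accepts always-on, always-off and previous.
    script = "${pkgs.ipmitool}/bin/ipmitool chassis policy ${config.power.policy}";
  };
}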

View File

@@ -2,22 +2,22 @@
# Here are all the public keys
rec {
hosts = {
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1 hut";
owl1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv owl1";
owl2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK owl2";
eudy = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG eudy";
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
apex = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBvUFjSfoxXnKwXhEFXx5ckRKJ0oewJ82mRitSMNMKjh apex";
weasel = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLJrQ8BF6KcweQV8pLkSbFT+tbDxSG9qxrdQE65zJZp weasel";
raccoon = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGNQttFvL0dNEyy7klIhLoK4xXOeM2/K9R7lPMTG3qvK raccoon";
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1 hut";
owl1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv owl1";
owl2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK owl2";
eudy = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG eudy";
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
apex = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBvUFjSfoxXnKwXhEFXx5ckRKJ0oewJ82mRitSMNMKjh apex";
weasel = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLJrQ8BF6KcweQV8pLkSbFT+tbDxSG9qxrdQE65zJZp weasel";
};
hostGroup = with hosts; rec {
compute = [ owl1 owl2 fox raccoon ];
untrusted = [ fox ];
compute = [ owl1 owl2 ];
playground = [ eudy koro weasel ];
storage = [ bay lake2 ];
monitor = [ hut ];
@@ -31,7 +31,6 @@ rec {
admins = {
"rarias@hut" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
"rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
"rarias@fox" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDSbw3REAKECV7E2c/e2XJITudJQWq2qDSe2N1JHqHZd rarias@fox";
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
};
}

View File

@@ -5,11 +5,12 @@
../common/xeon.nix
../common/ssf/hosts.nix
../module/ceph.nix
../module/slurm-server.nix
../module/power-policy.nix
./nfs.nix
./wireguard.nix
];
power.policy = "always-on";
# Don't install grub MBR for now
boot.loader.grub.device = "nodev";
@@ -56,6 +57,17 @@
};
};
# Use SSH tunnel to reach internal hosts
programs.ssh.extraConfig = ''
Host bscpm04.bsc.es gitlab-internal.bsc.es knights3.bsc.es
ProxyCommand nc -X connect -x localhost:23080 %h %p
Host raccoon
HostName knights3.bsc.es
ProxyCommand nc -X connect -x localhost:23080 %h %p
Host tent
ProxyJump raccoon
'';
networking.firewall = {
extraCommands = ''
# Blackhole BSC vulnerability scanner (OpenVAS) as it is spamming our

View File

@@ -8,7 +8,6 @@
statdPort = 4000;
exports = ''
/home 10.0.40.0/24(rw,async,no_subtree_check,no_root_squash)
/home 10.106.0.0/24(rw,async,no_subtree_check,no_root_squash)
'';
};
networking.firewall = {
@@ -28,21 +27,6 @@
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
# Accept NFS traffic from wg0
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
# Same but UDP
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
'';
};
}

View File

@@ -1,42 +0,0 @@
{ config, ... }:
{
networking.firewall = {
allowedUDPPorts = [ 666 ];
};
age.secrets.wgApex.file = ../../secrets/wg-apex.age;
# Enable WireGuard
networking.wireguard.enable = true;
networking.wireguard.interfaces = {
# "wg0" is the network interface name. You can name the interface arbitrarily.
wg0 = {
ips = [ "10.106.0.30/24" ];
listenPort = 666;
privateKeyFile = config.age.secrets.wgApex.path;
# Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
peers = [
{
name = "fox";
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
allowedIPs = [ "10.106.0.1/32" ];
endpoint = "fox.ac.upc.edu:666";
# Send keepalives every 25 seconds. Important to keep NAT tables alive.
persistentKeepalive = 25;
}
{
name = "raccoon";
publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
}
];
};
};
networking.hosts = {
"10.106.0.1" = [ "fox" ];
"10.106.0.236" = [ "raccoon" ];
"10.0.44.4" = [ "tent" ];
};
}

View File

@@ -3,7 +3,6 @@
# Includes the basic configuration for an Intel server.
imports = [
./base/agenix.nix
./base/always-power-on.nix
./base/august-shutdown.nix
./base/boot.nix
./base/env.nix
@@ -12,6 +11,7 @@
./base/net.nix
./base/nix.nix
./base/ntp.nix
./base/power-policy.nix
./base/rev.nix
./base/ssh.nix
./base/users.nix

View File

@@ -1,8 +0,0 @@
{
imports = [
../../module/power-policy.nix
];
# Turn on as soon as we have power
power.policy = "always-on";
}

View File

@@ -14,10 +14,9 @@
nftables.enable = lib.mkForce false;
hosts = {
"84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
"84.88.53.236" = [ "apex" "ssfhead.bsc.es" "ssfhead" ];
"84.88.51.152" = [ "raccoon" ];
"84.88.51.142" = [ "raccoon-ipmi" ];
"192.168.11.12" = [ "bscpm04.bsc.es" ];
"192.168.11.15" = [ "gitlab-internal.bsc.es" ];
};
};
}

View File

@@ -0,0 +1,9 @@
{
imports = [
../../module/power-policy.nix
];
# By default, keep the machines off as we don't know if the AC will be working
# once the electricity comes back.
power.policy = "always-off";
}

View File

@@ -154,20 +154,6 @@
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
];
};
csiringo = {
# Arbitrary UID but large so it doesn't collide with other users on ssfhead.
uid = 9653;
isNormalUser = true;
home = "/home/Computational/csiringo";
description = "Cesare Siringo";
group = "Computational";
hosts = [ "apex" "weasel" ];
hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
];
};
};
groups = {

View File

@@ -4,7 +4,7 @@
./xeon.nix
./ssf/fs.nix
./ssf/hosts.nix
./ssf/hosts-remote.nix
./ssf/net.nix
./ssf/ssh.nix
];
}

View File

@@ -1,9 +0,0 @@
{ pkgs, ... }:
{
networking.hosts = {
# Remote hosts visible from compute nodes
"10.106.0.236" = [ "raccoon" ];
"10.0.44.4" = [ "tent" ];
};
}

16
m/common/ssf/ssh.nix Normal file
View File

@@ -0,0 +1,16 @@
{
# Use SSH tunnel to apex to reach internal hosts
programs.ssh.extraConfig = ''
Host tent
ProxyJump raccoon
# Access raccoon via the HTTP proxy
Host raccoon knights3.bsc.es
HostName knights3.bsc.es
ProxyCommand=ssh apex 'nc -X connect -x localhost:23080 %h %p'
# Make sure we can reach gitlab even if we don't have SSH access to raccoon
Host bscpm04.bsc.es gitlab-internal.bsc.es
ProxyCommand=ssh apex 'nc -X connect -x localhost:23080 %h %p'
'';
}

View File

@@ -4,13 +4,13 @@
imports = [
../common/base.nix
../common/xeon/console.nix
../module/amd-uprof.nix
../module/emulation.nix
../module/nvidia.nix
../module/slurm-client.nix
./wireguard.nix
../module/power-policy.nix
];
power.policy = "always-on";
# Don't turn off in August as UPC has different dates.
# Fox works fine on power cuts.
systemd.timers.august-shutdown.enable = false;
@@ -22,7 +22,7 @@
swapDevices = lib.mkForce [];
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
boot.kernelModules = [ "kvm-amd" "amd_uncore" "amd_hsmp" ];
boot.kernelModules = [ "kvm-amd" "amd_uncore" ];
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
hardware.cpu.intel.updateMicrocode = lib.mkForce false;
@@ -30,21 +30,14 @@
# Use performance for benchmarks
powerManagement.cpuFreqGovernor = "performance";
services.amd-uprof.enable = true;
# Disable NUMA balancing
boot.kernel.sysctl."kernel.numa_balancing" = 0;
# Expose kernel addresses
boot.kernel.sysctl."kernel.kptr_restrict" = 0;
# Disable NMI watchdog to save one hw counter (for AMD uProf)
boot.kernel.sysctl."kernel.nmi_watchdog" = 0;
services.openssh.settings.X11Forwarding = true;
services.fail2ban.enable = true;
networking = {
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
hostName = "fox";
@@ -75,13 +68,6 @@
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
# Mount the NFS home
fileSystems."/nfs/home" = {
device = "10.106.0.30:/home";
fsType = "nfs";
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
};
# Make a /nvme{0,1}/$USER directory for each user.
systemd.services.create-nvme-dirs = let
# Take only normal users in fox
@@ -98,20 +84,4 @@
wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = script;
};
# Only allow SSH connections from users who have a SLURM allocation
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
security.pam.services.sshd.rules.account.slurm = {
control = "required";
enable = true;
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
args = [ "log_level=debug5" ];
order = 999999; # Make it last one
};
# Disable systemd session (pam_systemd.so) as it will conflict with the
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
# into the slurmstepd task and then into the systemd session, which is not
# what we want, otherwise it will linger even if all jobs are gone.
security.pam.services.sshd.startSession = lib.mkForce false;
}

View File

@@ -1,53 +0,0 @@
{ config, ... }:
{
networking.firewall = {
allowedUDPPorts = [ 666 ];
};
age.secrets.wgFox.file = ../../secrets/wg-fox.age;
networking.wireguard.enable = true;
networking.wireguard.interfaces = {
# "wg0" is the network interface name. You can name the interface arbitrarily.
wg0 = {
# Determines the IP address and subnet of the server's end of the tunnel interface.
ips = [ "10.106.0.1/24" ];
# The port that WireGuard listens to. Must be accessible by the client.
listenPort = 666;
# Path to the private key file.
privateKeyFile = config.age.secrets.wgFox.path;
# Public key: VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=
peers = [
# List of allowed peers.
{
name = "apex";
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
# List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
allowedIPs = [ "10.106.0.30/32" ];
}
{
name = "raccoon";
publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
}
];
};
};
networking.hosts = {
"10.106.0.30" = [ "apex" ];
"10.106.0.236" = [ "raccoon" ];
"10.0.44.4" = [ "tent" ];
};
networking.firewall = {
extraCommands = ''
# Accept slurm connections to slurmd from apex (via wireguard)
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.30/32 -d 10.106.0.1/32 --dport 6818 -j nixos-fw-accept
'';
};
}

View File

@@ -7,9 +7,11 @@
../module/ceph.nix
../module/debuginfod.nix
../module/emulation.nix
../module/slurm-client.nix
./gitlab-runner.nix
./monitoring.nix
./nfs.nix
./slurm-server.nix
./nix-serve.nix
./public-inbox.nix
./gitea.nix

7
m/hut/slurm-server.nix Normal file
View File

@@ -0,0 +1,7 @@
{ ... }:
{
services.slurm = {
server.enable = true;
};
}

View File

@@ -1,49 +0,0 @@
{ config, lib, pkgs, ... }:
{
options = {
services.amd-uprof = {
enable = lib.mkOption {
type = lib.types.bool;
default = false;
description = "Whether to enable AMD uProf.";
};
};
};
# Only setup amd-uprof if enabled
config = lib.mkIf config.services.amd-uprof.enable {
# First make sure that we add the module to the list of available modules
# in the kernel matching the same kernel version of this configuration.
boot.extraModulePackages = with config.boot.kernelPackages; [ amd-uprof-driver ];
boot.kernelModules = [ "AMDPowerProfiler" ];
# Make the userspace tools available in $PATH.
environment.systemPackages = with pkgs; [ amd-uprof ];
# The AMDPowerProfiler module doesn't create the /dev device, nor does it
# emit any uevents, so we cannot use udev rules to automatically create the
# device. Instead, we run a systemd unit that does it after loading the
# modules.
systemd.services.amd-uprof-device = {
description = "Create /dev/AMDPowerProfiler device";
after = [ "systemd-modules-load.service" ];
wantedBy = [ "multi-user.target" ];
unitConfig.ConditionPathExists = [
"/proc/AMDPowerProfiler/device"
"!/dev/AMDPowerProfiler"
];
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
ExecStart = pkgs.writeShellScript "add-amd-uprof-dev.sh" ''
mknod /dev/AMDPowerProfiler -m 666 c $(< /proc/AMDPowerProfiler/device) 0
'';
ExecStop = pkgs.writeShellScript "remove-amd-uprof-dev.sh" ''
rm -f /dev/AMDPowerProfiler
'';
};
};
};
}

View File

@@ -1,10 +1,33 @@
{ lib, ... }:
{ config, pkgs, lib, ... }:
{
imports = [
./slurm-common.nix
];
let
suspendProgram = pkgs.writeScript "suspend.sh" ''
#!/usr/bin/env bash
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
resumeProgram = pkgs.writeScript "resume.sh" ''
#!/usr/bin/env bash
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
systemd.services.slurmd.serviceConfig = {
# Kill all processes in the control group on stop/restart. This will kill
# all the jobs running, so ensure that we only upgrade when the nodes are
@@ -14,5 +37,90 @@
KillMode = lib.mkForce "control-group";
};
services.slurm.client.enable = true;
services.slurm = {
client.enable = true;
controlMachine = "hut";
clusterName = "jungle";
nodeName = [
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
];
partitionName = [
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
];
# See slurm.conf(5) for more details about these options.
extraConfig = ''
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
# not with Intel MPI. For that use the compatibility shim libpmi.so
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
# library in SLURM (--mpi=pmix). See more details here:
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
MpiDefault=pmix
# When a node reboots return that node to the slurm queue as soon as it
# becomes operative again.
ReturnToService=2
# Track all processes by using a cgroup
ProctrackType=proctrack/cgroup
# Enable task/affinity to allow the jobs to run in a specified subset of
# the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=hut
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
# Reduce port range so we can allow only this range in the firewall
SrunPortRange=60000-61000
# Use cores as consumable resources. In SLURM terms, a core may have
# multiple hardware threads (or CPUs).
SelectType=select/cons_tres
# Ignore memory constraints and only use unused cores to share a node with
# other jobs.
SelectTypeParameters=CR_Core
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
# This sets up the "extern" step into which ssh-launched processes will be
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
# when a task runs (srun) so we can ssh early.
PrologFlags=Alloc,Contain,X11
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
# adopted by the external step, similar to tasks running in regular steps
# LaunchParameters=ulimit_pam_adopt
SlurmdDebug=debug5
#DebugFlags=Protocol,Cgroup
'';
extraCgroupConfig = ''
CgroupPlugin=cgroup/v2
#ConstrainCores=yes
'';
};
# Place the slurm config in /etc as this will be required by PAM
environment.etc.slurm.source = config.services.slurm.etcSlurm;
age.secrets.mungeKey = {
file = ../../secrets/munge-key.age;
owner = "munge";
group = "munge";
};
services.munge = {
enable = true;
password = config.age.secrets.mungeKey.path;
};
}
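The MpiDefault=pmix comment above mentions a compatibility shim for Intel MPI. A hypothetical sketch of how that could be wired into the configuration, assuming the nixpkgs pmix package ships the libpmi.so shim (not verified here):

{ pkgs, ... }:
{
  # Hypothetical: point Intel MPI at the PMI-1 compatibility shim so jobs
  # can still be launched with srun --mpi=pmix, per the comment above.
  environment.variables.I_MPI_PMI_LIBRARY = "${pkgs.pmix}/lib/libpmi.so";
}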

View File

@@ -1,115 +0,0 @@
{ config, pkgs, ... }:
let
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
resumeProgram = pkgs.writeShellScript "resume.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
services.slurm = {
controlMachine = "apex";
clusterName = "jungle";
nodeName = [
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
];
partitionName = [
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
"fox Nodes=fox Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
];
# See slurm.conf(5) for more details about these options.
extraConfig = ''
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
# not with Intel MPI. For that use the compatibility shim libpmi.so
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
# library in SLURM (--mpi=pmix). See more details here:
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
MpiDefault=pmix
# When a node reboots return that node to the slurm queue as soon as it
# becomes operative again.
ReturnToService=2
# Track all processes by using a cgroup
ProctrackType=proctrack/cgroup
# Enable task/affinity to allow the jobs to run in a specified subset of
# the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=fox
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
# Reduce port range so we can allow only this range in the firewall
SrunPortRange=60000-61000
# Use cores as consumable resources. In SLURM terms, a core may have
# multiple hardware threads (or CPUs).
SelectType=select/cons_tres
# Ignore memory constraints and only use unused cores to share a node with
# other jobs.
SelectTypeParameters=CR_Core
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
# This sets up the "extern" step into which ssh-launched processes will be
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
# when a task runs (srun) so we can ssh early.
PrologFlags=Alloc,Contain,X11
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
# adopted by the external step, similar to tasks running in regular steps
# LaunchParameters=ulimit_pam_adopt
SlurmdDebug=debug5
#DebugFlags=Protocol,Cgroup
'';
extraCgroupConfig = ''
CgroupPlugin=cgroup/v2
#ConstrainCores=yes
'';
};
# Place the slurm config in /etc as this will be required by PAM
environment.etc.slurm.source = config.services.slurm.etcSlurm;
age.secrets.mungeKey = {
file = ../../secrets/munge-key.age;
owner = "munge";
group = "munge";
};
services.munge = {
enable = true;
password = config.age.secrets.mungeKey.path;
};
}

View File

@@ -1,23 +0,0 @@
{ ... }:
{
imports = [
./slurm-common.nix
];
services.slurm.server.enable = true;
networking.firewall = {
extraCommands = ''
# Accept slurm connections to controller from compute nodes
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817 -j nixos-fw-accept
# Accept slurm connections from compute nodes for srun
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
# Accept slurm connections to controller from fox (via wireguard)
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 6817 -j nixos-fw-accept
# Accept slurm connections from fox for srun (via wireguard)
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 60000:61000 -j nixos-fw-accept
'';
};
}

View File

@@ -0,0 +1,8 @@
{
programs.ssh.extraConfig = ''
Host apex ssfhead
HostName ssflogin.bsc.es
Host hut
ProxyJump apex
'';
}

View File

@@ -3,14 +3,16 @@
{
imports = [
../common/base.nix
../common/ssf/hosts.nix
../module/emulation.nix
../module/debuginfod.nix
../module/ssh-hut-extern.nix
../module/nvidia.nix
../module/power-policy.nix
../eudy/kernel/perf.nix
./wireguard.nix
];
power.policy = "always-on";
# Don't install Grub on the disk yet
boot.loader.grub.device = "nodev";
@@ -40,17 +42,9 @@
};
hosts = {
"10.0.44.4" = [ "tent" ];
"84.88.53.236" = [ "apex" ];
};
};
# Mount the NFS home
fileSystems."/nfs/home" = {
device = "10.106.0.30:/home";
fsType = "nfs";
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
};
nix.settings = {
extra-substituters = [ "https://jungle.bsc.es/cache" ];
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];

View File

@@ -1,48 +0,0 @@
{ config, pkgs, ... }:
{
networking.nat = {
enable = true;
enableIPv6 = false;
externalInterface = "eno0";
internalInterfaces = [ "wg0" ];
};
networking.firewall = {
allowedUDPPorts = [ 666 ];
};
age.secrets.wgRaccoon.file = ../../secrets/wg-raccoon.age;
# Enable WireGuard
networking.wireguard.enable = true;
networking.wireguard.interfaces = {
wg0 = {
ips = [ "10.106.0.236/24" ];
listenPort = 666;
privateKeyFile = config.age.secrets.wgRaccoon.path;
# Public key: QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=
peers = [
{
name = "fox";
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
allowedIPs = [ "10.106.0.1/32" ];
endpoint = "fox.ac.upc.edu:666";
persistentKeepalive = 25;
}
{
name = "apex";
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
allowedIPs = [ "10.106.0.30/32" "10.0.40.0/24" ];
endpoint = "ssfhead.bsc.es:666";
persistentKeepalive = 25;
}
];
};
};
networking.hosts = {
"10.106.0.1" = [ "fox.wg" ];
"10.106.0.30" = [ "apex.wg" ];
};
}

View File

@@ -3,9 +3,9 @@
{
imports = [
../common/xeon.nix
../common/ssf/hosts.nix
../module/emulation.nix
../module/debuginfod.nix
../module/ssh-hut-extern.nix
./monitoring.nix
./nginx.nix
./nix-serve.nix
@@ -33,10 +33,6 @@
nameservers = [ "84.88.52.35" "84.88.52.36" ];
search = [ "bsc.es" "ac.upc.edu" ];
defaultGateway = "10.0.44.1";
hosts = {
"84.88.53.236" = [ "apex" ];
"10.0.44.1" = [ "raccoon" ];
};
};
services.p.enable = true;

View File

@@ -67,9 +67,6 @@ in
location /p/ {
alias /var/lib/p/;
}
location /pub/ {
alias /vault/pub/;
}
'';
};
};

View File

@@ -14,10 +14,6 @@
# Users with sudo access
users.groups.wheel.members = [ "abonerib" "anavarro" ];
# Run julia installed with juliaup using julia's own libraries:
# NIX_LD_LIBRARY_PATH=~/.julia/juliaup/${VERS}/lib/julia ~/.juliaup/bin/julia
programs.nix-ld.enable = true;
networking = {
hostName = "weasel";
interfaces.eno1.ipv4.addresses = [ {

View File

@@ -1,89 +0,0 @@
{ stdenv
, lib
, curl
, cacert
, runCommandLocal
, autoPatchelfHook
, elfutils
, glib
, libGL
, ncurses5
, xorg
, zlib
, libxkbcommon
, freetype
, fontconfig
, libGLU
, dbus
, rocmPackages
, libxcrypt-legacy
, numactl
, radare2
}:
let
version = "5.1.701";
tarball = "AMDuProf_Linux_x64_${version}.tar.bz2";
# NOTE: Remember to update the radare2 patch below if AMDuProfPcm changes.
uprofSrc = runCommandLocal tarball {
nativeBuildInputs = [ curl ];
outputHash = "sha256-j9gxcBcIg6Zhc5FglUXf/VV9bKSo+PAKeootbN7ggYk=";
SSL_CERT_FILE="${cacert}/etc/ssl/certs/ca-bundle.crt";
} ''
curl \
-o $out \
'https://download.amd.com/developer/eula/uprof/uprof-5-1/${tarball}' \
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
-H 'Accept-Language: en-US,en;q=0.5' \
-H 'Accept-Encoding: gzip, deflate, br, zstd' \
-H 'Referer: https://www.amd.com/' 2>&1 | tr '\r' '\n'
'';
in
stdenv.mkDerivation {
pname = "AMD-uProf";
inherit version;
src = uprofSrc;
dontStrip = true;
phases = [ "installPhase" "fixupPhase" ];
nativeBuildInputs = [ autoPatchelfHook radare2 ];
buildInputs = [
stdenv.cc.cc.lib
ncurses5
elfutils
glib
libGL
libGLU
libxcrypt-legacy
xorg.libX11
xorg.libXext
xorg.libXi
xorg.libXmu
xorg.libxcb
xorg.xcbutilwm
xorg.xcbutilrenderutil
xorg.xcbutilkeysyms
xorg.xcbutilimage
fontconfig.lib
libxkbcommon
zlib
freetype
dbus
rocmPackages.rocprofiler
numactl
];
installPhase = ''
set -x
mkdir -p $out
tar -x -v -C $out --strip-components=1 -f $src
rm $out/bin/AMDPowerProfilerDriverSource.tar.gz
patchelf --replace-needed libroctracer64.so.1 libroctracer64.so $out/bin/ProfileAgents/x64/libAMDGpuAgent.so
patchelf --add-needed libcrypt.so.1 --add-needed libstdc++.so.6 $out/bin/AMDuProfSys
echo "16334a51fcc48668307ad94e20482ca4 $out/bin/AMDuProfPcm" | md5sum -c -
radare2 -w -q -i ${./libnuma.r2} $out/bin/AMDuProfPcm
patchelf --add-needed libnuma.so $out/bin/AMDuProfPcm
set +x
'';
}

View File

@@ -1,33 +0,0 @@
{ stdenv
, lib
, amd-uprof
, kernel
, runCommandLocal
}:
let
version = amd-uprof.version;
tarball = amd-uprof.src;
in stdenv.mkDerivation {
pname = "AMDPowerProfilerDriver";
inherit version;
src = runCommandLocal "AMDPowerProfilerDriverSource.tar.gz" { } ''
set -x
tar -x -f ${tarball} AMDuProf_Linux_x64_${version}/bin/AMDPowerProfilerDriverSource.tar.gz
mv AMDuProf_Linux_x64_${version}/bin/AMDPowerProfilerDriverSource.tar.gz $out
set +x
'';
hardeningDisable = [ "pic" "format" ];
nativeBuildInputs = kernel.moduleBuildDependencies;
patches = [ ./makefile.patch ./hrtimer.patch ];
makeFlags = [
"KERNEL_VERSION=${kernel.modDirVersion}"
"KERNEL_DIR=${kernel.dev}/lib/modules/${kernel.modDirVersion}/build"
"INSTALL_MOD_PATH=$(out)"
];
meta = {
description = "AMD Power Profiler Driver";
homepage = "https://www.amd.com/es/developer/uprof.html";
platforms = lib.platforms.linux;
};
}

View File

@@ -1,31 +0,0 @@
--- a/src/PmcTimerConfig.c 2025-09-04 12:17:16.771707049 +0200
+++ b/src/PmcTimerConfig.c 2025-09-04 12:17:04.878515468 +0200
@@ -99,7 +99,7 @@ static void PmcInitTimer(void* pInfo)
DRVPRINT("pTimerConfig(%p)", pTimerConfig);
- hrtimer_init(&pTimerConfig->m_hrTimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hrtimer_setup(&pTimerConfig->m_hrTimer, PmcTimerCallback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
}
int PmcSetupTimer(ClientContext* pClientCtx)
@@ -157,7 +157,6 @@ int PmcSetupTimer(ClientContext* pClient
{
/* Interval in ms */
pTimerConfig->m_time = ktime_set(interval / 1000, interval * 1000000);
- pTimerConfig->m_hrTimer.function = PmcTimerCallback;
DRVPRINT("retVal(%d) m_time(%lld)", retVal, (long long int) pTimerConfig->m_time);
}
--- a/src/PwrProfTimer.c 2025-09-04 12:18:08.750544327 +0200
+++ b/src/PwrProfTimer.c 2025-09-04 12:18:28.557863382 +0200
@@ -573,8 +573,7 @@ void InitHrTimer(uint32 cpu)
pCoreClientData = &per_cpu(g_coreClientData, cpu);
// initialize HR timer
- hrtimer_init(&pCoreClientData->m_hrTimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- pCoreClientData->m_hrTimer.function = &HrTimerCallback;
+ hrtimer_setup(&pCoreClientData->m_hrTimer, &HrTimerCallback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
return;
} // InitHrTimer

View File

@@ -1,10 +0,0 @@
# Patch arguments to call sym std::string::find(char const*, unsigned long, unsigned long)
# so it matches NixOS:
#
# Change OS name to NixOS
wz NixOS @ 0x00550a43
# And set the length to 5 characters
wa mov ecx, 5 @0x00517930
#
# Then change the argument to dlopen() so it only uses libnuma.so
wz libnuma.so @ 0x00562940

View File

@@ -1,66 +0,0 @@
--- a/Makefile 2025-06-19 20:36:49.346693267 +0200
+++ b/Makefile 2025-06-19 20:42:29.778088660 +0200
@@ -27,7 +27,7 @@ MODULE_VERSION=$(shell cat AMDPowerProfi
MODULE_NAME_KO=$(MODULE_NAME).ko
# check is module inserted
-MODPROBE_OUTPUT=$(shell lsmod | grep $(MODULE_NAME))
+#MODPROBE_OUTPUT=$(shell lsmod | grep $(MODULE_NAME))
# check pcore dkms status
PCORE_DKMS_STATUS=$(shell dkms status | grep $(MODULE_NAME) | grep $(MODULE_VERSION))
@@ -50,7 +50,7 @@ endif
# “-Wno-missing-attributes” is added for GCC version >= 9.0 and kernel version <= 5.00
G_VERSION=9
K_VERSION=5
-KERNEL_MAJOR_VERSION=$(shell uname -r | cut -f1 -d.)
+KERNEL_MAJOR_VERSION=$(shell echo "$(KERNEL_VERSION)" | cut -f1 -d.)
GCCVERSION = $(shell gcc -dumpversion | cut -f1 -d.)
ifeq ($(G_VERSION),$(firstword $(sort $(GCCVERSION) $(G_VERSION))))
ifeq ($(K_VERSION),$(lastword $(sort $(KERNEL_MAJOR_VERSION) $(K_VERSION))))
@@ -66,17 +66,7 @@ ${MODULE_NAME}-objs := src/PmcDataBuffe
# make
all:
- @chmod a+x ./AMDPPcert.sh
- @./AMDPPcert.sh 0 1; echo $$? > $(PWD)/sign_status;
- @SIGSTATUS1=`cat $(PWD)/sign_status | tr -d '\n'`; \
- if [ $$SIGSTATUS1 -eq 1 ]; then \
- exit 1; \
- fi
- @make -C /lib/modules/$(KERNEL_VERSION)/build M=$(PWD) $(MAKE_OPTS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" modules
- @SIGSTATUS3=`cat $(PWD)/sign_status | tr -d '\n'`; \
- if [ $$SIGSTATUS3 -eq 0 ]; then \
- ./AMDPPcert.sh 1 $(MODULE_NAME_KO); \
- fi
+ make -C $(KERNEL_DIR) M=$(PWD) $(MAKE_OPTS) CFLAGS_MODULE="$(EXTRA_CFLAGS)" modules
# make clean
clean:
@@ -84,23 +74,9 @@ clean:
# make install
install:
- @mkdir -p /lib/modules/`uname -r`/kernel/drivers/extra
- @rm -f /lib/modules/`uname -r`/kernel/drivers/extra/$(MODULE_NAME_KO)
- @cp $(MODULE_NAME_KO) /lib/modules/`uname -r`/kernel/drivers/extra/
- @depmod -a
- @if [ ! -z "$(MODPROBE_OUTPUT)" ]; then \
- echo "Uninstalling AMDPowerProfiler Linux kernel module.";\
- rmmod $(MODULE_NAME);\
- fi
- @modprobe $(MODULE_NAME) 2> $(PWD)/sign_status1; \
- cat $(PWD)/sign_status1 | grep "Key was rejected by service"; \
- echo $$? > $(PWD)/sign_status; SIGSTATUS1=`cat $(PWD)/sign_status | tr -d '\n'`; \
- if [ $$SIGSTATUS1 -eq 0 ]; then \
- echo "ERROR: Secure Boot enabled, correct key is not yet enrolled in BIOS key table"; \
- exit 1; \
- else \
- cat $(PWD)/sign_status1; \
- fi
+ mkdir -p $(INSTALL_MOD_PATH)/lib/modules/$(KERNEL_VERSION)/kernel/drivers/extra/
+ cp -a $(MODULE_NAME_KO) $(INSTALL_MOD_PATH)/lib/modules/$(KERNEL_VERSION)/kernel/drivers/extra/
+
# make dkms
dkms:
@chmod a+x ./AMDPPcert.sh

View File

@@ -53,15 +53,4 @@ final: prev:
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
cudainfo = prev.callPackage ./cudainfo/default.nix { };
amd-uprof = prev.callPackage ./amd-uprof/default.nix { };
# FIXME: Extend this to all linuxPackages variants. Open problem, see:
# https://discourse.nixos.org/t/whats-the-right-way-to-make-a-custom-kernel-module-available/4636
linuxPackages = prev.linuxPackages.extend (_final: _prev: {
amd-uprof-driver = _prev.callPackage ./amd-uprof/driver.nix { };
});
linuxPackages_latest = prev.linuxPackages_latest.extend(_final: _prev: {
amd-uprof-driver = _prev.callPackage ./amd-uprof/driver.nix { };
});
}

Binary file not shown.

View File

@@ -1,13 +1,11 @@
age-encryption.org/v1
-> ssh-ed25519 HY2yRg gKGxsjHfpiRDQ6Tuvcx7pjKgrVUGweotuplLYwCGvik
DSz9j/stVyB1lXpVP+kg+H+RDgSftREGFFLQZClC3kI
-> ssh-ed25519 cK5kHw 17DpKekfNVy4V742QSd61r2w6iawtOJR7Ct3UflDXio
hsqTEPCYjHKvndMWPl4GpG23CzjGgVrS+cLIymISJHU
-> ssh-ed25519 CAWG4Q oK01d4pbBqEZVsymSiKijPvJo714xsMSRMbzkssJKiw
hs0tVFkqtIHXg9jtC2iDgCtefFcWvGJkXB+HJUcqXQs
-> ssh-ed25519 xA739A KxO+AawfLMERHwzt3YnZRwPFlCfGETma7fo8M+ZtsAY
eSn0+/rhLQxNKt5xKubKck8Nxun2Sh3eJqBU/hwgzZM
-> ssh-ed25519 MSF3dg OyaZBLB2kO8fU139lXbbC404gT7IzIWk+BMhYzabBDg
/fiPFfBJcb+e40+fZbwCw7niF2hh+JxUPiKSiwUSOWg
--- ycZyGX+Li+LsOuweF9OVPl8aoMaRgp/RdFbDrPszkUs
-> ssh-ed25519 HY2yRg d7+nvfAcdC3GjJxipXFrsfGGyP5jAY+gRWRV+4FVYAM
CG7r0bRGgnUWcdfDnpe7HwZ3L/y7b5iuJuqvf15b3/Y
-> ssh-ed25519 CAWG4Q X0vITOErz4wkR3VQYOcVlnrkHtwe+ytdZz1Hcrs4vVs
6IWYOhXLQ+BnML9YfLLHJYEO2CZ/uEc9IBqhoWvjDHI
-> ssh-ed25519 xA739A p5e/0AJtZ0+zbRvkB/usLuxusY8xXRx9Ksi/LQlcIHw
M4S/qlzT9POyJx4gY9lmycstUcdwG2cinN4OlV22zzo
-> ssh-ed25519 MSF3dg Ydl7uBWzBx6sAaxbzC3x8qiaU3ysGqV4rUFLpHCEV30
/1AUHBhCNOs9i7LJbmzwQDHsu+ybzYf6+coztKk5E3U
--- kYt15WxClpT7PXD1oFe9GqJU+OswjH7y9wIc8/GzZ7M

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -2,9 +2,6 @@ let
keys = import ../keys.nix;
adminsKeys = builtins.attrValues keys.admins;
hut = [ keys.hosts.hut ] ++ adminsKeys;
fox = [ keys.hosts.fox ] ++ adminsKeys;
apex = [ keys.hosts.apex ] ++ adminsKeys;
raccoon = [ keys.hosts.raccoon ] ++ adminsKeys;
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
tent = [ keys.hosts.tent ] ++ adminsKeys;
# Only expose ceph keys to safe nodes and admins
@@ -27,8 +24,4 @@ in
"ceph-user.age".publicKeys = safe;
"munge-key.age".publicKeys = safe;
"wg-fox.age".publicKeys = fox;
"wg-apex.age".publicKeys = apex;
"wg-raccoon.age".publicKeys = raccoon;
}

View File

@@ -1,13 +1,11 @@
age-encryption.org/v1
-> ssh-ed25519 G5LX5w Zhbs+NM/SI49qQ0X8bBpWUWxYM0vUKCXNAnPpIE2NR0
CkBUmJ26EkwHztT8Pz0UGq2KZwN0Xz8iYQ9cEHL9OWQ
-> ssh-ed25519 cK5kHw 5KjUXJywRDp2A7l5ukTCS+WIAalxwP1f71ejGxwNrX4
JW8OLmfkULXo9AwYMGNyOgZ+nQ0MVc0PCM4kKPIo6V4
-> ssh-ed25519 CAWG4Q cVjY3R0ZHAfokA4kWlu5vOl2Gs7mdqRgRk4WSUOXAjg
IxEDvuximW99EqxmpW+Btpm0Zydmwg/u87bqnl26NYc
-> ssh-ed25519 xA739A hmuwZuxmJnuAjmU4X8yhPQ+hPWvN1G+ZS0pvD7fHamg
fnAPW6ZCrv5pSO4RQhhr8xz7ij7jAZJk0ApWluOXDng
-> ssh-ed25519 MSF3dg SSGLcWnum0Qo/0OnKDZVg9xAZMwGwVNYYmRJXxb4GU0
pdl6kATG7n2oMsoUboBfu+vDKurJcH1UvUa70rfMQkE
--- a2ZQAeAQlO9DWnegIAq6NpI1Po6f38l+hitZvq+zIW8
-> ssh-ed25519 G5LX5w HlQ4V8lBd3im5j8KHEuQZBTuztvPj1QoWdv6FL6qzGI
Jpt91X1UIIVFQt1X6Q//kALn+Cetp/LqBZZvTuhFthw
-> ssh-ed25519 CAWG4Q StnngJAcuAwUnTrXDR3nJ2KFN0jNdTqSz+/1TfmWkzA
CR4AQ6fqaJVY1mdUIX1gzaZwRs1sU8F8hHztnkN8vN0
-> ssh-ed25519 xA739A xya5A5t63Owx+VrGgUfV/lIP8b/xV1cerMpuZBLaDVM
w+pA583yUnFq2AvGBGzWbQIGQEY9WqW0CSLQ9v+SG0c
-> ssh-ed25519 MSF3dg aXkLxCyYdOwVopHHmpXEI6WlAIizKdJi4IO0KEdhS3s
WKXkTszZN66+QZdSDJ4D9q7xgYWMfliOLCubIF2Dqkc
--- uVWoU2lMkqQ/9Z0BqKRCeUpsKi8lwmHukT/FV8wYMbg

View File

@@ -1,13 +1,11 @@
age-encryption.org/v1
-> ssh-ed25519 G5LX5w VKM/Y6Wy0gmb2gc4Q00VzHQ4IAxfSyshuDoaAzlEkFM
vf18uoEN5ZLJ4HcJg85epaseh1CRL9/ncXtU2HpH+QE
-> ssh-ed25519 cK5kHw sMuG07kjlI6VjPjELOUPzkn+KT9Yq7BPf0zSATM2aGI
/eODwL8KwyVgFjBK2MJlbqjN7mEvXCSsjq9D96szrng
-> ssh-ed25519 CAWG4Q t3/Ty7yCqC5x8KQY4VaHSQ9Q3epqMpXoBDKyKx9+VzE
JwgUsqMd+1jFZvFp9/SIoowbhSMVEkKp03T69+OHjho
-> ssh-ed25519 xA739A 0ohmKK427+4vupivrtjXp0dDK8wT4XUA9rWgcsCGKgA
msbeQyz3pL8RLtAeXX5tsfyHyOXxhfYpqaLEKnRxpPQ
-> ssh-ed25519 MSF3dg H+6jAoP7/Dxp8C/7Bk1C4CT1hpkUhtbnTWWIxkO24Ec
SrMuUG93T5lUw3xINEen5EEKLXJizIGFhBO1fVroFHE
--- tIPnH9cxTV3m3qzvZB97Egz+raWwZJ182BXXKDu8f+o
-> ssh-ed25519 G5LX5w sg9SmahxBg35MDIxhrp4oHkaTaxsKoVQju2eNhCt0BM
CZ64dEGqz2tbkG8KtimZvLUEMrQpVVBJP7Fu46WTMgc
-> ssh-ed25519 CAWG4Q jzS1R14W1CWxdziMLG/yCGPLWSkiyE+9lqyCVe491ng
acJo/nhKq3pSPoFEPaFLN1fzHHbEzstNoLtohWAHKiM
-> ssh-ed25519 xA739A qeGJoLeSIQwLU2Yg+Gi2bikHJ3HscLfyo1msqL3JwHw
tTwaxRBKTl/SoyY/LnxR/j/5WvCNX5VeZLKi018YMrY
-> ssh-ed25519 MSF3dg Wym7Uyf1XvH1H6mNDERkO8opkMiN0zzXm2PjXftEOWs
Uw8ZwwKIB5UqgVuoSLE2QajNDJZkH7/Y3Nsy+WFl7Xs
--- 94hGVbYiCGZdMEJesCMLh7IZi+w5l/Kr1lZJHQgrc0o

View File

@@ -1,13 +1,12 @@
age-encryption.org/v1
-> ssh-ed25519 G5LX5w 1KfTmTRP3iSdcclf/FuIpFWpy1tgKs5ED+qSYWo7inY
RX6Q1nLFF/yiVLpkWrl0BI0PpLoBi753+y8l/AXjNE4
-> ssh-ed25519 cK5kHw TP7+OQpQSNuyArnUo1C97J3P3oB0YtzCEPeVvlzsYHE
Bsy5KPNHTVNHnF1sxOvlfJq3CNMVFaXdYkRG2vSj7qM
-> ssh-ed25519 CAWG4Q eQyzwNaH6CfaYIjs8abEuQxt6vxRXsGz69UletMUVDE
FDcynPO7xg4PWez5Z8gTg5LyE0Wgb3zT9i3Kon67QsU
-> ssh-ed25519 xA739A 2JuLai2fUu3dZBydS8cMrLrEUIUkz4NNaiupoBOtTwU
sdM3X+XRzysop7yqa76Z7FAwTHOj91STCtZvfIgCdB0
-> ssh-ed25519 MSF3dg fSPkiWnpInX1V5p3afPCoPotcGFoWFiOMPThtY927lc
8v7E/3l0xA2VWZPXzkN4NmnaA0KJutLMurn/ZXZmhxA
--- MQkyBx9hT4ILYXKoZT18PWny1QbDFymcZr63zjMN/qQ
-> ssh-ed25519 G5LX5w 5K0mzfJGvAB2LGmoQ9ZLbWooVEX6F4+fQdo1JUoB3FM
AKGa507bUrYjXFaMQ1MXTDBFYsdS6zbs+flmxYN0UNo
-> ssh-ed25519 CAWG4Q 8KzLc949on8iN1pK8q11OpCIeO71t6b0zxCLHhcQ6ns
uy7z6RdIuoUes+Uap3k5eoFFuu/DcSrEBwq4V4C/ygc
-> ssh-ed25519 xA739A SLx5cKo0fdAHj+cLpJ4FYTWTUTyDsCqKQOufDu3xnGo
VnS/WsiSaf6RpXuhgfij4pYu4p9hlJl1oXrfYY9rKlQ
-> ssh-ed25519 MSF3dg c5ZXvdNxNfZU3HeWsttuhy+UC5JxWN/IFuCuCGbksn4
vcKlIirf+VvERX71YpmwW6zp6ClhlG2PR4R8LIN7cQo
--- pJKICDaYAlxqNnvHIuzB3Yk7tv0ZNYflGTQD+Zk/8+4

Binary file not shown.

View File

@@ -1,14 +1,12 @@
age-encryption.org/v1
-> ssh-ed25519 G5LX5w SRJhNenoQXbT1FgX3TMPnVH5P6oe2eHot+M1YsEjsEk
hfTSLgKi98Eh7JK5o7x2POpTEtQlQCpEa3keUFYCuME
-> ssh-ed25519 cK5kHw z5TwWJTkvx7HztjXHJW/aCOtOfPrQaLP0gyIT7rXcyU
b4NCpHfasgvkLLr+6LcWUl60p59aSNnfp3bl2OFYXo0
-> ssh-ed25519 CAWG4Q 4VpS1/OnFe8nxcQbRTKNhjsh/ZQ5cbhSMXwK/jjQ+3o
WF9wvOkqVml4UcEzyzeumKuUwCwwr2zvKLMg+PCB8nk
-> ssh-ed25519 xA739A 67FhuJ070jBVMt/xbKHWhfri6iIm0FyaFvzQabsvFBM
1G5/913dDv/r/6p1x/c5YiUnZzrX/LvIj33KW+PN0KU
-> ssh-ed25519 MSF3dg Bj/yB4N2wkyHCHC22tcjjJAA4ebSamN0Z4UVX3ZnryI
6D/ZgTs+j+MGDAbPU5zyK0i9zN6tQy68IcOnQZ27mYg
--- 169erk3ICSYLs4FPEuXCn7QlekWhsmSn0Lr+/R14I5Q
-> ssh-ed25519 G5LX5w /RF8uZ/KahUqjEFILbF3+Jin+U0SQdoQChcc9RJ9axc
aEmPk++86nBR6d2BIa/oaUdyiLS6cH8TUoYJE3bxba4
-> ssh-ed25519 CAWG4Q qHyh9nQi8c3z/KHby9y5vhzN0Dwz0zca98ebjJmXrzs
ZbmwNzrSSQ3RvskE8SqcBa0vMy8pzm/HPGHLm5zuPGQ
-> ssh-ed25519 xA739A FlGbfS4bUxA3gVDzb3yPjp4hV8a7aiNBLUctnN3bGEY
3fI6SyVjVhh2M8uc/XV3blpdQMPMYi2qzaHNXvx0bvM
-> ssh-ed25519 MSF3dg 0Bs/aW0nNISS+93It75o6hKZWa7S+LF5bF5ApsJ2fQ8
y7o0KYDHEen13ndIxg/mYil3eMxxzvYF2pWqhMb+rBU
--- Iqo75G4+02Y9nc1OOkcEx+iQlKnGYCekAx76tRH53wA

Binary file not shown.

View File

@@ -1,14 +0,0 @@
age-encryption.org/v1
-> ssh-ed25519 cDBabA heyW9/cxgwFX9IexQIXjAQDWGQPNcMXcArQp2Rxsqx4
o9MQ7EH8PDDjsJdpH9F3Xq2zUoaDAJQlfFmYucSFs6Y
-> ssh-ed25519 cK5kHw Sza4pos7K3qW3omEeyidI/jszJNf9smemSZnUJfCIww
D6vazXki7hIYraIuSiGPS+FPbkFUwHhHWDf52OhEIMg
-> ssh-ed25519 CAWG4Q YexIHueOIMmIN8JIDyNUOKBkyz/k18HqV3hTXh48KlM
xh8UJzzWT6ByN+Dpn4JrMNsjGC/uc/v6LynwjBDz9NQ
-> ssh-ed25519 xA739A KySG3TXdqfCMUkVEDGa74B0op745s3XGYxFLyAXSQAc
5EI/yb5ctW9Qu18bHm3/sK97kwGcKzzmWvPSCWm89XA
-> ssh-ed25519 MSF3dg MNxnNj0fHmri8ophexXPNjRUBUWrzcuk5S1mucxUMTE
GVFWXtISEU8ZmlwL4nh4weAgfGrt2GHX0DTzbpS6zg8
--- UdrqkYG2ZApAuwdZeNhC50NP2rkD/Ol6y8nJa4RHx7Y

Binary file not shown.

View File

@@ -21,28 +21,17 @@ the detailed specifications:
## Access
To access the machine, request a SLURM session from [apex](/apex) using the `fox`
partition. If you need the machine for performance measurements, use an
exclusive reservation:
To access the machine, request a SLURM session from [hut](/hut) using the `fox`
partition:
apex% salloc -p fox --exclusive
hut% salloc -p fox
Otherwise, specify the CPUs that you need so other users can also use the node
at the same time:
Then connect via ssh:
apex% salloc -p fox -c 8
Then use srun to execute an interactive shell:
apex% srun --pty $SHELL
hut% ssh fox
fox%
Make sure you get all CPUs you expect:
fox% grep Cpus_allowed_list /proc/self/status
Cpus_allowed_list: 0-191
Follow [these steps](/access) if you don't have access to apex or fox.
Follow [these steps](/access) if you don't have access to hut or fox.
## CUDA
@@ -96,22 +85,13 @@ Then just run `nix develop` from the same directory:
Cuda compilation tools, release 12.4, V12.4.99
Build cuda_12.4.r12.4/compiler.33961263_0
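(The flake behind this `nix develop` shell is not included in the page; a minimal sketch of a CUDA dev shell, assuming nixpkgs' cudatoolkit package and an unfree-enabled nixpkgs, might look like:)

{
  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
  outputs = { self, nixpkgs }:
    let
      # CUDA is unfree, so it must be allowed explicitly.
      pkgs = import nixpkgs {
        system = "x86_64-linux";
        config.allowUnfree = true;
      };
    in {
      devShells.x86_64-linux.default = pkgs.mkShell {
        packages = [ pkgs.cudatoolkit ];
      };
    };
}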
## AMD uProf
The [AMD uProf](https://www.amd.com/en/developer/uprof.html) performance
analysis tool-suite is installed and ready to use.
See the [AMD uProf user guide](https://docs.amd.com/r/en-US/57368-uProf-user-guide)
([PDF backup for v5.1](https://jungle.bsc.es/pub/57368-uprof-user-guide.pdf))
for more details on how to use the tools. To use the GUI make sure that you
connect to fox using X11 forwarding.
## Filesystems
The machine has several file systems available.
- `/nfs/home`: The `/home` from apex via NFS, which is also shared with other
xeon machines. It has about 2 ms of latency, so not suitable for quick random
access.
- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
Don't abuse it.
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
capacity. Stores three redundant copies of every file.
- `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.