Disable flake LFS

Increase body to 64 MiB for Gitea LFS
Track PDFs with Git LFS
2025-09-17 12:40:06 +02:00 · 2025-09-17 12:19:56 +02:00 · 2025-09-17 12:06:27 +02:00 · 2025-09-17 11:53:58 +02:00 · 2025-09-17 11:45:14 +02:00 · 2025-09-03 15:34:05 +02:00
52 changed files with 1123 additions and 193 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.pdf filter=lfs diff=lfs merge=lfs -text
--- a/doc/Intel_Server_Board_S2600WF_TPS_2_6.pdf
+++ b/doc/Intel_Server_Board_S2600WF_TPS_2_6.pdf
--- a/doc/R1000WF_SystemIntegration_and_ServiceGuide_Rev2_4.pdf
+++ b/doc/R1000WF_SystemIntegration_and_ServiceGuide_Rev2_4.pdf
--- a/doc/SEL_TroubleshootingGuide.pdf
+++ b/doc/SEL_TroubleshootingGuide.pdf
--- a/doc/bsc-ssf.pdf
+++ b/doc/bsc-ssf.pdf
--- a/flake.nix
+++ b/flake.nix
@@ -5,6 +5,7 @@
    agenix.inputs.nixpkgs.follows = "nixpkgs";
    bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
    bscpkgs.inputs.nixpkgs.follows = "nixpkgs";
+    self.lfs = false;
  };

  outputs = { self, nixpkgs, agenix, bscpkgs, ... }:
--- a/keys.nix
+++ b/keys.nix
@@ -16,8 +16,7 @@ rec {
  };

  hostGroup = with hosts; rec {
-    untrusted  = [ fox ];
-    compute    = [ owl1 owl2 ];
+    compute    = [ owl1 owl2 fox ];
    playground = [ eudy koro weasel ];
    storage    = [ bay lake2 ];
    monitor    = [ hut ];
@@ -31,6 +30,7 @@ rec {
  admins = {
    "rarias@hut"  = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
    "rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
+    "rarias@fox"  = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDSbw3REAKECV7E2c/e2XJITudJQWq2qDSe2N1JHqHZd rarias@fox";
    root          = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
  };
 }
--- a/m/apex/configuration.nix
+++ b/m/apex/configuration.nix
@@ -5,7 +5,9 @@
    ../common/xeon.nix
    ../common/ssf/hosts.nix
    ../module/ceph.nix
+    ../module/slurm-server.nix
    ./nfs.nix
+    ./wireguard.nix
  ];

  # Don't install grub MBR for now
--- a/m/apex/nfs.nix
+++ b/m/apex/nfs.nix
@@ -8,6 +8,7 @@
    statdPort = 4000;
    exports = ''
      /home 10.0.40.0/24(rw,async,no_subtree_check,no_root_squash)
+      /home 10.106.0.0/24(rw,async,no_subtree_check,no_root_squash)
    '';
  };
  networking.firewall = {
@@ -27,6 +28,21 @@
      iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4001  -j nixos-fw-accept
      iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4002  -j nixos-fw-accept
      iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
+
+      # Accept NFS traffic from wg0
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 111   -j nixos-fw-accept
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 2049  -j nixos-fw-accept
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4000  -j nixos-fw-accept
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4001  -j nixos-fw-accept
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4002  -j nixos-fw-accept
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
+      # Same but UDP
+      iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 111   -j nixos-fw-accept
+      iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 2049  -j nixos-fw-accept
+      iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4000  -j nixos-fw-accept
+      iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4001  -j nixos-fw-accept
+      iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4002  -j nixos-fw-accept
+      iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
    '';
  };
 }
--- a/m/apex/wireguard.nix
+++ b/m/apex/wireguard.nix
@@ -0,0 +1,35 @@
+{ config, ... }:
+
+{
+  networking.firewall = {
+    allowedUDPPorts = [ 666 ];
+  };
+
+  age.secrets.wgApex.file = ../../secrets/wg-apex.age;
+
+  # Enable WireGuard
+  networking.wireguard.enable = true;
+  networking.wireguard.interfaces = {
+    # "wg0" is the network interface name. You can name the interface arbitrarily.
+    wg0 = {
+      ips = [ "10.106.0.30/24" ];
+      listenPort = 666;
+      privateKeyFile = config.age.secrets.wgApex.path;
+      # Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
+      peers = [
+        {
+          name = "Fox";
+          publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
+          allowedIPs = [ "10.106.0.0/24" ];
+          endpoint = "fox.ac.upc.edu:666";
+          # Send keepalives every 25 seconds. Important to keep NAT tables alive.
+          persistentKeepalive = 25;
+        }
+      ];
+    };
+  };
+
+  networking.hosts = {
+    "10.106.0.1" = [ "fox" ];
+  };
+}
--- a/m/common/base.nix
+++ b/m/common/base.nix
@@ -3,6 +3,7 @@
  # Includes the basic configuration for an Intel server.
  imports = [
    ./base/agenix.nix
+    ./base/always-power-on.nix
    ./base/august-shutdown.nix
    ./base/boot.nix
    ./base/env.nix
--- a/m/common/base/always-power-on.nix
+++ b/m/common/base/always-power-on.nix
@@ -0,0 +1,8 @@
+{
+  imports = [
+    ../../module/power-policy.nix
+  ];
+
+  # Turn on as soon as we have power
+  power.policy = "always-on";
+}
--- a/m/common/base/august-shutdown.nix
+++ b/m/common/base/august-shutdown.nix
@@ -1,12 +1,12 @@
 {
-  # Shutdown all machines on August 2nd at 11:00 AM, so we can protect the
+  # Shut down all machines on August 3rd at 22:00, so we can protect the
  # hardware from spurious electrical peaks on the yearly electrical cut for
  # manteinance that starts on August 4th.
  systemd.timers.august-shutdown = {
-    description = "Shutdown on August 2nd for maintenance";
+    description = "Shutdown on August 3rd for maintenance";
    wantedBy = [ "timers.target" ];
    timerConfig = {
-      OnCalendar = "*-08-02 11:00:00";
+      OnCalendar = "*-08-03 22:00:00";
      RandomizedDelaySec = "10min";
      Unit = "systemd-poweroff.service";
    };
--- a/m/common/base/env.nix
+++ b/m/common/base/env.nix
@@ -4,7 +4,7 @@
  environment.systemPackages = with pkgs; [
    vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
    nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
-    ncdu config.boot.kernelPackages.perf ldns pv
+    ncdu config.boot.kernelPackages.perf ldns pv git-lfs
    # From bsckgs overlay
    osumb
  ];
--- a/m/common/base/net.nix
+++ b/m/common/base/net.nix
@@ -14,7 +14,7 @@
    nftables.enable = lib.mkForce false;

    hosts = {
-      "84.88.53.236" = [ "apex" "ssfhead.bsc.es" "ssfhead" ];
+      "84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
      "84.88.51.152" = [ "raccoon" ];
      "84.88.51.142" = [ "raccoon-ipmi" ];
    };
--- a/m/common/base/users.nix
+++ b/m/common/base/users.nix
@@ -154,6 +154,20 @@
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
        ];
      };
+
+      csiringo = {
+        # Arbitrary UID but large so it doesn't collide with other users on ssfhead.
+        uid = 9653;
+        isNormalUser = true;
+        home = "/home/Computational/csiringo";
+        description = "Cesare Siringo";
+        group = "Computational";
+        hosts = [ "apex" "weasel" ];
+        hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
+        openssh.authorizedKeys.keys = [
+          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
+        ];
+      };
    };

    groups = {
--- a/m/fox/configuration.nix
+++ b/m/fox/configuration.nix
@@ -6,8 +6,14 @@
    ../common/xeon/console.nix
    ../module/emulation.nix
    ../module/nvidia.nix
+    ../module/slurm-client.nix
+    ./wireguard.nix
  ];

+  # Don't turn off in August, as UPC has different dates.
+  # Fox works fine on power cuts.
+  systemd.timers.august-shutdown.enable = false;
+
  # Select the this using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";

@@ -31,6 +37,18 @@

  services.openssh.settings.X11Forwarding = true;

+  services.fail2ban.enable = true;
+
+  # Use SSH tunnel to reach internal hosts
+  programs.ssh.extraConfig = ''
+    Host bscpm04.bsc.es gitlab-internal.bsc.es tent
+      ProxyJump raccoon
+    Host raccoon
+      ProxyJump apex
+      HostName 127.0.0.1
+      Port 22022
+  '';
+
  networking = {
    timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
    hostName = "fox";
@@ -61,6 +79,13 @@
  fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
  fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };

+  # Mount the NFS home
+  fileSystems."/nfs/home" = {
+    device = "10.106.0.30:/home";
+    fsType = "nfs";
+    options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
+  };
+
  # Make a /nvme{0,1}/$USER directory for each user.
  systemd.services.create-nvme-dirs = let
    # Take only normal users in fox
@@ -77,4 +102,20 @@
    wantedBy = [ "multi-user.target" ];
    serviceConfig.ExecStart = script;
  };
+
+  # Only allow SSH connections from users who have a SLURM allocation
+  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
+  security.pam.services.sshd.rules.account.slurm = {
+    control = "required";
+    enable = true;
+    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
+    args = [ "log_level=debug5" ];
+    order = 999999; # Make it last one
+  };
+
+  # Disable systemd session (pam_systemd.so) as it will conflict with the
+  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
+  # into the slurmstepd task and then into the systemd session, which is not
+  # what we want: in the systemd session it will linger even if all jobs are gone.
+  security.pam.services.sshd.startSession = lib.mkForce false;
 }
--- a/m/fox/wireguard.nix
+++ b/m/fox/wireguard.nix
@@ -0,0 +1,46 @@
+{ config, ... }:
+
+{
+  networking.firewall = {
+    allowedUDPPorts = [ 666 ];
+  };
+
+  age.secrets.wgFox.file = ../../secrets/wg-fox.age;
+
+  networking.wireguard.enable = true;
+  networking.wireguard.interfaces = {
+    # "wg0" is the network interface name. You can name the interface arbitrarily.
+    wg0 = {
+      # Determines the IP address and subnet of the server's end of the tunnel interface.
+      ips = [ "10.106.0.1/24" ];
+
+      # The port that WireGuard listens on. Must be accessible by the client.
+      listenPort = 666;
+
+      # Path to the private key file.
+      privateKeyFile = config.age.secrets.wgFox.path;
+      # Public key: VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=
+
+      peers = [
+        # List of allowed peers.
+        { 
+          name = "Apex";
+          publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
+          # List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
+          allowedIPs = [ "10.106.0.30/32" ];
+        }
+      ];
+    };
+  };
+
+  networking.hosts = {
+    "10.106.0.30" = [ "apex" ];
+  };
+
+  networking.firewall = {
+    extraCommands = ''
+      # Accept slurm connections to slurmd from apex (via wireguard)
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.30/32 -d 10.106.0.1/32 --dport 6818 -j nixos-fw-accept
+    '';
+  };
+}
--- a/m/hut/configuration.nix
+++ b/m/hut/configuration.nix
@@ -7,11 +7,9 @@
    ../module/ceph.nix
    ../module/debuginfod.nix
    ../module/emulation.nix
-    ../module/slurm-client.nix
    ./gitlab-runner.nix
    ./monitoring.nix
    ./nfs.nix
-    ./slurm-server.nix
    ./nix-serve.nix
    ./public-inbox.nix
    ./gitea.nix
--- a/m/hut/slurm-server.nix
+++ b/m/hut/slurm-server.nix
@@ -1,7 +0,0 @@
-{ ... }:
-
-{
-  services.slurm = {
-    server.enable = true;
-  };
-}
--- a/m/module/nvidia.nix
+++ b/m/module/nvidia.nix
@@ -1,4 +1,4 @@
-{ lib, config, ... }:
+{ lib, config, pkgs, ... }:
 {
  # Configure Nvidia driver to use with CUDA
  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
@@ -11,4 +11,10 @@
  # > requiredSystemFeatures = [ "cuda" ];
  programs.nix-required-mounts.enable = true;
  programs.nix-required-mounts.presets.nvidia-gpu.enable = true;
+  # They forgot to add the symlink
+  programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
+    config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
+  ];
+
+  environment.systemPackages = [ pkgs.cudainfo ];
 }
--- a/m/module/power-policy.nix
+++ b/m/module/power-policy.nix
@@ -0,0 +1,33 @@
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+  cfg = config.power.policy;
+in
+{
+  options = {
+    power.policy = mkOption {
+      type = types.nullOr (types.enum [ "always-on" "previous" "always-off" ]);
+      default = null;
+      description = "Set power policy to use via IPMI.";
+    };
+  };
+
+  config = mkIf (cfg != null) {
+    systemd.services."power-policy" = {
+      description = "Set power policy to use via IPMI";
+      wantedBy = [ "multi-user.target" ];
+      unitConfig = {
+        StartLimitBurst = "10";
+        StartLimitIntervalSec = "10m";
+      };
+      serviceConfig = {
+        ExecStart = "${pkgs.ipmitool}/bin/ipmitool chassis policy ${cfg}";
+        Type = "oneshot";
+        Restart = "on-failure";
+        RestartSec = "5s";
+      };
+    };
+  };
+}
--- a/m/module/slurm-client.nix
+++ b/m/module/slurm-client.nix
@@ -1,33 +1,10 @@
-{ config, pkgs, lib, ... }:
+{ lib, ... }:

-let
-  suspendProgram = pkgs.writeScript "suspend.sh" ''
-    #!/usr/bin/env bash
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Shutting down host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
-    done
-  '';
+{
+  imports = [
+    ./slurm-common.nix
+  ];

-  resumeProgram = pkgs.writeScript "resume.sh" ''
-    #!/usr/bin/env bash
-    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
-    set -x
-    export "PATH=/run/current-system/sw/bin:$PATH"
-    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
-    hosts=$(scontrol show hostnames $1)
-    for host in $hosts; do
-      echo Starting host: $host
-      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
-    done
-  '';
-
-in {
  systemd.services.slurmd.serviceConfig = {
    # Kill all processes in the control group on stop/restart. This will kill
    # all the jobs running, so ensure that we only upgrade when the nodes are
@@ -37,90 +14,5 @@ in {
    KillMode = lib.mkForce "control-group";
  };

-  services.slurm = {
-    client.enable = true;
-    controlMachine = "hut";
-    clusterName = "jungle";
-    nodeName = [
-      "owl[1,2]  Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
-      "hut       Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
-    ];
-
-    partitionName = [
-      "owl Nodes=owl[1-2]     Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
-    ];
-
-    # See slurm.conf(5) for more details about these options.
-    extraConfig = ''
-      # Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
-      # not with Intel MPI. For that use the compatibility shim libpmi.so
-      # setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
-      # library in SLURM (--mpi=pmix). See more details here:
-      # https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
-      MpiDefault=pmix
-
-      # When a node reboots return that node to the slurm queue as soon as it
-      # becomes operative again.
-      ReturnToService=2
-
-      # Track all processes by using a cgroup
-      ProctrackType=proctrack/cgroup
-
-      # Enable task/affinity to allow the jobs to run in a specified subset of
-      # the resources. Use the task/cgroup plugin to enable process containment.
-      TaskPlugin=task/affinity,task/cgroup
-
-      # Power off unused nodes until they are requested
-      SuspendProgram=${suspendProgram}
-      SuspendTimeout=60
-      ResumeProgram=${resumeProgram}
-      ResumeTimeout=300
-      SuspendExcNodes=hut
-
-      # Turn the nodes off after 1 hour of inactivity
-      SuspendTime=3600
-
-      # Reduce port range so we can allow only this range in the firewall
-      SrunPortRange=60000-61000
-
-      # Use cores as consumable resources. In SLURM terms, a core may have
-      # multiple hardware threads (or CPUs).
-      SelectType=select/cons_tres
-
-      # Ignore memory constraints and only use unused cores to share a node with
-      # other jobs.
-      SelectTypeParameters=CR_Core
-
-      # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
-      # This sets up the "extern" step into which ssh-launched processes will be
-      # adopted. Alloc runs the prolog at job allocation (salloc) rather than
-      # when a task runs (srun) so we can ssh early.
-      PrologFlags=Alloc,Contain,X11
-
-      # LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
-      # adopted by the external step, similar to tasks running in regular steps
-      # LaunchParameters=ulimit_pam_adopt
-      SlurmdDebug=debug5
-      #DebugFlags=Protocol,Cgroup
-    '';
-
-    extraCgroupConfig = ''
-      CgroupPlugin=cgroup/v2
-      #ConstrainCores=yes
-    '';
-  };
-
-  # Place the slurm config in /etc as this will be required by PAM
-  environment.etc.slurm.source = config.services.slurm.etcSlurm;
-
-  age.secrets.mungeKey = {
-    file = ../../secrets/munge-key.age;
-    owner = "munge";
-    group = "munge";
-  };
-
-  services.munge = {
-    enable = true;
-    password = config.age.secrets.mungeKey.path;
-  };
+  services.slurm.client.enable = true;
 }
--- a/m/module/slurm-common.nix
+++ b/m/module/slurm-common.nix
@@ -0,0 +1,115 @@
+{ config, pkgs, ... }:
+
+let
+  suspendProgram = pkgs.writeShellScript "suspend.sh" ''
+    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
+    set -x
+    export "PATH=/run/current-system/sw/bin:$PATH"
+    echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
+    hosts=$(scontrol show hostnames "$1") # quoted: a hostlist like owl[1-2] is also a shell glob
+    for host in $hosts; do # deliberately unquoted: split into one hostname per word
+      echo Shutting down host: $host
+      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
+    done
+  '';
+
+  resumeProgram = pkgs.writeShellScript "resume.sh" ''
+    exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
+    set -x
+    export "PATH=/run/current-system/sw/bin:$PATH"
+    echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log
+    hosts=$(scontrol show hostnames "$1") # quoted: a hostlist like owl[1-2] is also a shell glob
+    for host in $hosts; do # deliberately unquoted: split into one hostname per word
+      echo Starting host: $host
+      ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
+    done
+  '';
+
+in {
+  services.slurm = {
+    controlMachine = "apex";
+    clusterName = "jungle";
+    nodeName = [
+      "owl[1,2]  Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
+      "fox       Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
+    ];
+
+    partitionName = [
+      "owl Nodes=owl[1-2]     Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
+      "fox Nodes=fox          Default=NO  DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
+    ];
+
+    # See slurm.conf(5) for more details about these options.
+    extraConfig = ''
+      # Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
+      # not with Intel MPI. For that use the compatibility shim libpmi.so
+      # setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
+      # library in SLURM (--mpi=pmix). See more details here:
+      # https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
+      MpiDefault=pmix
+
+      # When a node reboots return that node to the slurm queue as soon as it
+      # becomes operative again.
+      ReturnToService=2
+
+      # Track all processes by using a cgroup
+      ProctrackType=proctrack/cgroup
+
+      # Enable task/affinity to allow the jobs to run in a specified subset of
+      # the resources. Use the task/cgroup plugin to enable process containment.
+      TaskPlugin=task/affinity,task/cgroup
+
+      # Power off unused nodes until they are requested
+      SuspendProgram=${suspendProgram}
+      SuspendTimeout=60
+      ResumeProgram=${resumeProgram}
+      ResumeTimeout=300
+      SuspendExcNodes=fox
+
+      # Turn the nodes off after 1 hour of inactivity
+      SuspendTime=3600
+
+      # Reduce port range so we can allow only this range in the firewall
+      SrunPortRange=60000-61000
+
+      # Use cores as consumable resources. In SLURM terms, a core may have
+      # multiple hardware threads (or CPUs).
+      SelectType=select/cons_tres
+
+      # Ignore memory constraints and only use unused cores to share a node with
+      # other jobs.
+      SelectTypeParameters=CR_Core
+
+      # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
+      # This sets up the "extern" step into which ssh-launched processes will be
+      # adopted. Alloc runs the prolog at job allocation (salloc) rather than
+      # when a task runs (srun) so we can ssh early.
+      PrologFlags=Alloc,Contain,X11
+
+      # LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
+      # adopted by the external step, similar to tasks running in regular steps
+      # LaunchParameters=ulimit_pam_adopt
+      SlurmdDebug=debug5
+      #DebugFlags=Protocol,Cgroup
+    '';
+
+    extraCgroupConfig = ''
+      CgroupPlugin=cgroup/v2
+      #ConstrainCores=yes
+    '';
+  };
+
+  # Place the slurm config in /etc as this will be required by PAM
+  environment.etc.slurm.source = config.services.slurm.etcSlurm;
+
+  age.secrets.mungeKey = {
+    file = ../../secrets/munge-key.age;
+    owner = "munge";
+    group = "munge";
+  };
+
+  services.munge = {
+    enable = true;
+    password = config.age.secrets.mungeKey.path;
+  };
+}
--- a/m/module/slurm-server.nix
+++ b/m/module/slurm-server.nix
@@ -0,0 +1,23 @@
+{ ... }:
+
+{
+  imports = [
+    ./slurm-common.nix
+  ];
+
+  services.slurm.server.enable = true;
+
+  networking.firewall = {
+    extraCommands = ''
+      # Accept slurm connections to controller from compute nodes
+      iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817 -j nixos-fw-accept
+      # Accept slurm connections from compute nodes for srun
+      iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
+
+      # Accept slurm connections to controller from fox (via wireguard)
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 6817 -j nixos-fw-accept
+      # Accept slurm connections from fox for srun (via wireguard)
+      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 60000:61000 -j nixos-fw-accept
+    '';
+  };
+}
--- a/m/raccoon/configuration.nix
+++ b/m/raccoon/configuration.nix
@@ -39,6 +39,7 @@
    };
    hosts = {
      "10.0.44.4" = [ "tent" ];
+      "84.88.53.236" = [ "apex" ];
    };
  };

--- a/m/tent/configuration.nix
+++ b/m/tent/configuration.nix
@@ -33,6 +33,9 @@
    nameservers = [ "84.88.52.35" "84.88.52.36" ];
    search = [ "bsc.es" "ac.upc.edu" ];
    defaultGateway = "10.0.44.1";
+    hosts = {
+      "84.88.53.236" = [ "apex" ];
+    };
  };

  services.p.enable = true;
--- a/m/tent/gitea.nix
+++ b/m/tent/gitea.nix
@@ -26,5 +26,7 @@
        SENDMAIL_ARGS = "--";
      };
    };
+
+    lfs.enable = true;
  };
 }
--- a/m/tent/nginx.nix
+++ b/m/tent/nginx.nix
@@ -39,6 +39,7 @@ in
          rewrite ^/git/(.*) /$1 break;
          proxy_pass http://127.0.0.1:3000;
          proxy_redirect http:// $scheme://;
+          client_max_body_size 64M;
        }
        location /cache {
          rewrite ^/cache/(.*) /$1 break;
--- a/m/weasel/configuration.nix
+++ b/m/weasel/configuration.nix
@@ -14,6 +14,10 @@
  # Users with sudo access
  users.groups.wheel.members = [ "abonerib" "anavarro" ];

+  # Run julia installed with juliaup using julia's own libraries:
+  # NIX_LD_LIBRARY_PATH=~/.julia/juliaup/${VERS}/lib/julia ~/.juliaup/bin/julia
+  programs.nix-ld.enable = true;
+
  networking = {
    hostName = "weasel";
    interfaces.eno1.ipv4.addresses = [ {
--- a/pkgs/cudainfo/Makefile
+++ b/pkgs/cudainfo/Makefile
@@ -0,0 +1,12 @@
+HOSTCXX  ?= g++
+NVCC     := nvcc -ccbin $(HOSTCXX)
+CXXFLAGS := -m64
+
+# Target rules
+all: cudainfo
+
+cudainfo: cudainfo.cpp
+	$(NVCC) $(CXXFLAGS) -o $@ $<
+
+clean:
+	rm -f cudainfo cudainfo.o
--- a/pkgs/cudainfo/cudainfo.cpp
+++ b/pkgs/cudainfo/cudainfo.cpp
@@ -0,0 +1,600 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
+
+// Shared Utilities (QA Testing)
+
+// std::system includes
+#include <memory>
+#include <iostream>
+
+#include <cuda_runtime.h>
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+    switch (error)
+    {
+        case cudaSuccess:
+            return "cudaSuccess";
+
+        case cudaErrorMissingConfiguration:
+            return "cudaErrorMissingConfiguration";
+
+        case cudaErrorMemoryAllocation:
+            return "cudaErrorMemoryAllocation";
+
+        case cudaErrorInitializationError:
+            return "cudaErrorInitializationError";
+
+        case cudaErrorLaunchFailure:
+            return "cudaErrorLaunchFailure";
+
+        case cudaErrorPriorLaunchFailure:
+            return "cudaErrorPriorLaunchFailure";
+
+        case cudaErrorLaunchTimeout:
+            return "cudaErrorLaunchTimeout";
+
+        case cudaErrorLaunchOutOfResources:
+            return "cudaErrorLaunchOutOfResources";
+
+        case cudaErrorInvalidDeviceFunction:
+            return "cudaErrorInvalidDeviceFunction";
+
+        case cudaErrorInvalidConfiguration:
+            return "cudaErrorInvalidConfiguration";
+
+        case cudaErrorInvalidDevice:
+            return "cudaErrorInvalidDevice";
+
+        case cudaErrorInvalidValue:
+            return "cudaErrorInvalidValue";
+
+        case cudaErrorInvalidPitchValue:
+            return "cudaErrorInvalidPitchValue";
+
+        case cudaErrorInvalidSymbol:
+            return "cudaErrorInvalidSymbol";
+
+        case cudaErrorMapBufferObjectFailed:
+            return "cudaErrorMapBufferObjectFailed";
+
+        case cudaErrorUnmapBufferObjectFailed:
+            return "cudaErrorUnmapBufferObjectFailed";
+
+        case cudaErrorInvalidHostPointer:
+            return "cudaErrorInvalidHostPointer";
+
+        case cudaErrorInvalidDevicePointer:
+            return "cudaErrorInvalidDevicePointer";
+
+        case cudaErrorInvalidTexture:
+            return "cudaErrorInvalidTexture";
+
+        case cudaErrorInvalidTextureBinding:
+            return "cudaErrorInvalidTextureBinding";
+
+        case cudaErrorInvalidChannelDescriptor:
+            return "cudaErrorInvalidChannelDescriptor";
+
+        case cudaErrorInvalidMemcpyDirection:
+            return "cudaErrorInvalidMemcpyDirection";
+
+        case cudaErrorAddressOfConstant:
+            return "cudaErrorAddressOfConstant";
+
+        case cudaErrorTextureFetchFailed:
+            return "cudaErrorTextureFetchFailed";
+
+        case cudaErrorTextureNotBound:
+            return "cudaErrorTextureNotBound";
+
+        case cudaErrorSynchronizationError:
+            return "cudaErrorSynchronizationError";
+
+        case cudaErrorInvalidFilterSetting:
+            return "cudaErrorInvalidFilterSetting";
+
+        case cudaErrorInvalidNormSetting:
+            return "cudaErrorInvalidNormSetting";
+
+        case cudaErrorMixedDeviceExecution:
+            return "cudaErrorMixedDeviceExecution";
+
+        case cudaErrorCudartUnloading:
+            return "cudaErrorCudartUnloading";
+
+        case cudaErrorUnknown:
+            return "cudaErrorUnknown";
+
+        case cudaErrorNotYetImplemented:
+            return "cudaErrorNotYetImplemented";
+
+        case cudaErrorMemoryValueTooLarge:
+            return "cudaErrorMemoryValueTooLarge";
+
+        case cudaErrorInvalidResourceHandle:
+            return "cudaErrorInvalidResourceHandle";
+
+        case cudaErrorNotReady:
+            return "cudaErrorNotReady";
+
+        case cudaErrorInsufficientDriver:
+            return "cudaErrorInsufficientDriver";
+
+        case cudaErrorSetOnActiveProcess:
+            return "cudaErrorSetOnActiveProcess";
+
+        case cudaErrorInvalidSurface:
+            return "cudaErrorInvalidSurface";
+
+        case cudaErrorNoDevice:
+            return "cudaErrorNoDevice";
+
+        case cudaErrorECCUncorrectable:
+            return "cudaErrorECCUncorrectable";
+
+        case cudaErrorSharedObjectSymbolNotFound:
+            return "cudaErrorSharedObjectSymbolNotFound";
+
+        case cudaErrorSharedObjectInitFailed:
+            return "cudaErrorSharedObjectInitFailed";
+
+        case cudaErrorUnsupportedLimit:
+            return "cudaErrorUnsupportedLimit";
+
+        case cudaErrorDuplicateVariableName:
+            return "cudaErrorDuplicateVariableName";
+
+        case cudaErrorDuplicateTextureName:
+            return "cudaErrorDuplicateTextureName";
+
+        case cudaErrorDuplicateSurfaceName:
+            return "cudaErrorDuplicateSurfaceName";
+
+        case cudaErrorDevicesUnavailable:
+            return "cudaErrorDevicesUnavailable";
+
+        case cudaErrorInvalidKernelImage:
+            return "cudaErrorInvalidKernelImage";
+
+        case cudaErrorNoKernelImageForDevice:
+            return "cudaErrorNoKernelImageForDevice";
+
+        case cudaErrorIncompatibleDriverContext:
+            return "cudaErrorIncompatibleDriverContext";
+
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return "cudaErrorPeerAccessAlreadyEnabled";
+
+        case cudaErrorPeerAccessNotEnabled:
+            return "cudaErrorPeerAccessNotEnabled";
+
+        case cudaErrorDeviceAlreadyInUse:
+            return "cudaErrorDeviceAlreadyInUse";
+
+        case cudaErrorProfilerDisabled:
+            return "cudaErrorProfilerDisabled";
+
+        case cudaErrorProfilerNotInitialized:
+            return "cudaErrorProfilerNotInitialized";
+
+        case cudaErrorProfilerAlreadyStarted:
+            return "cudaErrorProfilerAlreadyStarted";
+
+        case cudaErrorProfilerAlreadyStopped:
+            return "cudaErrorProfilerAlreadyStopped";
+
+        /* Since CUDA 4.0*/
+        case cudaErrorAssert:
+            return "cudaErrorAssert";
+
+        case cudaErrorTooManyPeers:
+            return "cudaErrorTooManyPeers";
+
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return "cudaErrorHostMemoryAlreadyRegistered";
+
+        case cudaErrorHostMemoryNotRegistered:
+            return "cudaErrorHostMemoryNotRegistered";
+
+        /* Since CUDA 5.0 */
+        case cudaErrorOperatingSystem:
+            return "cudaErrorOperatingSystem";
+
+        case cudaErrorPeerAccessUnsupported:
+            return "cudaErrorPeerAccessUnsupported";
+
+        case cudaErrorLaunchMaxDepthExceeded:
+            return "cudaErrorLaunchMaxDepthExceeded";
+
+        case cudaErrorLaunchFileScopedTex:
+            return "cudaErrorLaunchFileScopedTex";
+
+        case cudaErrorLaunchFileScopedSurf:
+            return "cudaErrorLaunchFileScopedSurf";
+
+        case cudaErrorSyncDepthExceeded:
+            return "cudaErrorSyncDepthExceeded";
+
+        case cudaErrorLaunchPendingCountExceeded:
+            return "cudaErrorLaunchPendingCountExceeded";
+
+        case cudaErrorNotPermitted:
+            return "cudaErrorNotPermitted";
+
+        case cudaErrorNotSupported:
+            return "cudaErrorNotSupported";
+
+        /* Since CUDA 6.0 */
+        case cudaErrorHardwareStackError:
+            return "cudaErrorHardwareStackError";
+
+        case cudaErrorIllegalInstruction:
+            return "cudaErrorIllegalInstruction";
+
+        case cudaErrorMisalignedAddress:
+            return "cudaErrorMisalignedAddress";
+
+        case cudaErrorInvalidAddressSpace:
+            return "cudaErrorInvalidAddressSpace";
+
+        case cudaErrorInvalidPc:
+            return "cudaErrorInvalidPc";
+
+        case cudaErrorIllegalAddress:
+            return "cudaErrorIllegalAddress";
+
+        /* Since CUDA 6.5*/
+        case cudaErrorInvalidPtx:
+            return "cudaErrorInvalidPtx";
+
+        case cudaErrorInvalidGraphicsContext:
+            return "cudaErrorInvalidGraphicsContext";
+
+        case cudaErrorStartupFailure:
+            return "cudaErrorStartupFailure";
+
+        case cudaErrorApiFailureBase:
+            return "cudaErrorApiFailureBase";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+/**
+ * Abort the process when a CUDA call reports a non-zero status.
+ *
+ * @param result  status value returned by the CUDA call; zero means success
+ * @param func    text of the call that produced the status (printed verbatim)
+ * @param file    source file of the call site
+ * @param line    source line of the call site
+ *
+ * On failure the error is written to stderr, the device is reset and the
+ * process exits with EXIT_FAILURE. On success the function returns normally.
+ */
+template< typename T >
+void check(T result, char const *const func, const char *const file, int const line)
+{
+    // Nothing to report when the status is zero.
+    if (!result)
+    {
+        return;
+    }
+
+    const unsigned int code = static_cast<unsigned int>(result);
+
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+            file, line, code, _cudaGetErrorEnum(result), func);
+
+    // Reset the CUDA device before terminating the process.
+    cudaDeviceReset();
+    exit(EXIT_FAILURE);
+}
+
+// File-scope pointers to the program arguments; main() stores the address of
+// its argc and its argv here on entry. Both stay NULL until then.
+int *pArgc = NULL;
+char **pArgv = NULL;
+
+#if CUDART_VERSION < 5000
+
+// CUDA-C includes
+#include <cuda.h>
+
+// Templated wrapper around the CUDA Driver API attribute query.
+//
+// Stores the value of `device_attribute` for `device` into `*attribute`.
+// If the driver call does not return CUDA_SUCCESS, the error code is printed
+// to stderr, the device is reset and the process exits with EXIT_FAILURE.
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
+{
+    const CUresult status = cuDeviceGetAttribute(attribute, device_attribute, device);
+
+    // Fast path: the attribute was read successfully.
+    if (status == CUDA_SUCCESS) {
+        return;
+    }
+
+    fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
+            status, __FILE__, __LINE__);
+
+    // cudaDeviceReset makes the driver clean up all state. It is not
+    // mandatory in normal operation but is good practice, and it is needed
+    // for correct operation under a profiler: the reset flushes all profile
+    // data before the application exits.
+    cudaDeviceReset();
+    exit(EXIT_FAILURE);
+}
+
+#endif /* CUDART_VERSION < 5000 */
+
+// Beginning of GPU Architecture definitions
+//
+// Map a compute capability (SM major.minor) to the number of CUDA cores per
+// streaming multiprocessor.
+//
+// @param major  SM major version
+// @param minor  SM minor version
+// @return       cores per SM for a known SM version; for an unknown version a
+//               notice is printed to stdout and the value of the last table
+//               entry is returned so callers can keep running.
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+    typedef struct {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    // The table is immutable, so build it once instead of on every call.
+    static const sSMtoCores nGpuArchCoresPerSM[] = {
+        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+        { 0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
+        { 0x60, 64 }, // Pascal Generation (SM 6.0) GP100 class
+        { 0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
+        { 0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
+        { 0x70, 64 }, // Volta Generation (SM 7.0) GV100 class
+        { 0x72, 64 }, // Volta Generation (SM 7.2) GV11b class
+        { 0x75, 64 }, // Turing Generation (SM 7.5) TU10x class
+        { 0x80, 64 }, // Ampere Generation (SM 8.0) GA100 class
+        { 0x86, 128}, // Ampere Generation (SM 8.6) GA10x class
+        { 0x87, 128}, // Ampere Generation (SM 8.7) GA10b class
+        { 0x89, 128}, // Ada Lovelace Generation (SM 8.9) AD10x class
+        { 0x90, 128}, // Hopper Generation (SM 9.0) GH100 class
+        {   -1, -1 }  // Sentinel: marks the end of the table
+    };
+
+    // Same 0xMm encoding as the table keys.
+    const int wantedSM = (major << 4) + minor;
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1) {
+        if (nGpuArchCoresPerSM[index].SM == wantedSM) {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // Unknown SM: `index` now points at the sentinel, so index-1 is the last
+    // real entry; fall back to it so the program can still run.
+    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+//
+// Enumerates the CUDA devices visible to the runtime and prints, per device,
+// its properties; then reports peer-to-peer reachability between the
+// P2P-capable devices and prints a one-line summary followed by
+// "Result = PASS".
+//
+// Exits with EXIT_FAILURE when the device count cannot be obtained (or when a
+// checkCudaErrors() call fails); returns 0 otherwise, including when no CUDA
+// device is present.
+////////////////////////////////////////////////////////////////////////////////
+int
+main(int argc, char **argv)
+{
+    pArgc = &argc;
+    pArgv = argv;
+
+    printf("%s Starting...\n\n", argv[0]);
+    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
+
+    int deviceCount = 0;
+    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+    if (error_id != cudaSuccess) {
+        printf("cudaGetDeviceCount failed: %s (%d)\n",
+               cudaGetErrorString(error_id), (int) error_id);
+        printf("Result = FAIL\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // This function call returns 0 if there are no CUDA capable devices.
+    if (deviceCount == 0)
+        printf("There are no available device(s) that support CUDA\n");
+    else
+        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
+
+    int dev, driverVersion = 0, runtimeVersion = 0;
+
+    for (dev = 0; dev < deviceCount; ++dev) {
+        cudaSetDevice(dev);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+
+        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
+
+        // Console log
+        cudaDriverGetVersion(&driverVersion);
+        cudaRuntimeGetVersion(&runtimeVersion);
+        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
+        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
+
+        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
+                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
+
+        // Look the core count up once so that the "undefined SM" notice is
+        // printed at most once per device.
+        const int coresPerSM = ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
+        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
+               deviceProp.multiProcessorCount,
+               coresPerSM,
+               coresPerSM * deviceProp.multiProcessorCount);
+        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
+
+
+#if CUDART_VERSION >= 5000
+        // This is supported in CUDA 5.0 (runtime API device properties)
+        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
+        printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);
+
+        if (deviceProp.l2CacheSize) {
+            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
+        }
+
+#else
+        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
+        int memoryClock;
+        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
+        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
+        int memBusWidth;
+        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
+        int L2CacheSize;
+        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
+
+        if (L2CacheSize) {
+            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
+        }
+
+#endif
+
+        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
+               deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
+               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
+        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
+               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
+        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
+               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
+
+
+        // size_t is not `unsigned long` on every platform (e.g. 64-bit
+        // Windows), so widen explicitly, as done for totalGlobalMem above.
+        printf("  Total amount of constant memory:               %llu bytes\n", (unsigned long long) deviceProp.totalConstMem);
+        printf("  Total amount of shared memory per block:       %llu bytes\n", (unsigned long long) deviceProp.sharedMemPerBlock);
+        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
+        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
+        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
+        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
+        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
+               deviceProp.maxThreadsDim[0],
+               deviceProp.maxThreadsDim[1],
+               deviceProp.maxThreadsDim[2]);
+        printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
+               deviceProp.maxGridSize[0],
+               deviceProp.maxGridSize[1],
+               deviceProp.maxGridSize[2]);
+        printf("  Maximum memory pitch:                          %llu bytes\n", (unsigned long long) deviceProp.memPitch);
+        printf("  Texture alignment:                             %llu bytes\n", (unsigned long long) deviceProp.textureAlignment);
+        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
+        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
+        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
+        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
+        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
+        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
+#endif
+        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
+        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
+
+        const char *sComputeMode[] = {
+            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
+            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
+            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
+            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
+            "Unknown",
+            NULL
+        };
+
+        // Index 4 is the "Unknown" entry; route any value outside the named
+        // modes there instead of reading past the end of the table.
+        const int unknownModeIdx = 4;
+        int modeIdx = deviceProp.computeMode;
+
+        if (modeIdx < 0 || modeIdx > unknownModeIdx) {
+            modeIdx = unknownModeIdx;
+        }
+
+        printf("  Compute Mode:\n");
+        printf("     < %s >\n", sComputeMode[modeIdx]);
+    }
+
+    // If there are 2 or more GPUs, query to determine whether RDMA is supported
+    if (deviceCount >= 2)
+    {
+        // The tables below are fixed-size; never scan more devices than fit.
+        const int maxP2PDevices = 64;
+        const int p2pScanCount = (deviceCount < maxP2PDevices) ? deviceCount : maxP2PDevices;
+
+        cudaDeviceProp prop[maxP2PDevices];
+        int gpuid[maxP2PDevices]; // indices of the GPUs that can support P2P
+        int gpu_p2p_count = 0;
+
+        for (int i=0; i < p2pScanCount; i++)
+        {
+            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
+
+            // Only boards based on Fermi or later can support P2P
+            if ((prop[i].major >= 2)
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+                // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
+                && prop[i].tccDriver
+#endif
+               )
+            {
+                // This is an array of P2P capable GPUs
+                gpuid[gpu_p2p_count++] = i;
+            }
+        }
+
+        // Show every ordered pair of distinct P2P-capable GPUs exactly once.
+        if (gpu_p2p_count >= 2)
+        {
+            int can_access_peer = 0;
+
+            for (int i = 0; i < gpu_p2p_count; i++)
+            {
+                for (int j = 0; j < gpu_p2p_count; j++)
+                {
+                    // A device is not its own peer: skip the self pair.
+                    if (gpuid[i] == gpuid[j])
+                    {
+                        continue;
+                    }
+
+                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
+                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
+                           prop[gpuid[j]].name, gpuid[j] ,
+                           can_access_peer ? "Yes" : "No");
+                }
+            }
+        }
+    }
+
+    // csv masterlog info
+    // *****************************
+    // exe and CUDA driver name
+    printf("\n");
+    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
+    char cTemp[128];
+
+    // All formatting below is bounded by the real size of cTemp.
+
+    // driver version
+    sProfileString += ", CUDA Driver Version = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(cTemp, sizeof(cTemp), "%d.%d", driverVersion/1000, (driverVersion%100)/10);
+#else
+    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion/1000, (driverVersion%100)/10);
+#endif
+    sProfileString +=  cTemp;
+
+    // Runtime version
+    sProfileString += ", CUDA Runtime Version = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
+#else
+    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
+#endif
+    sProfileString +=  cTemp;
+
+    // Device count
+    sProfileString += ", NumDevs = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(cTemp, sizeof(cTemp), "%d", deviceCount);
+#else
+    snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
+#endif
+    sProfileString += cTemp;
+
+    // Print Out all device Names
+    for (dev = 0; dev < deviceCount; ++dev)
+    {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        sprintf_s(cTemp, sizeof(cTemp), ", Device%d = ", dev);
+#else
+        snprintf(cTemp, sizeof(cTemp), ", Device%d = ", dev);
+#endif
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+        sProfileString += cTemp;
+        sProfileString += deviceProp.name;
+    }
+
+    sProfileString += "\n";
+    printf("%s", sProfileString.c_str());
+
+    printf("Result = PASS\n");
+
+    // finish
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+    cudaDeviceReset();
+    return 0;
+}
--- a/pkgs/cudainfo/default.nix
+++ b/pkgs/cudainfo/default.nix
@@ -0,0 +1,43 @@
+# Builds the `cudainfo` binary from the sources in this directory and exposes
+# `passthru.gpuCheck`, a derivation that runs the binary on a builder that
+# advertises the "cuda" system feature.
+{
+  stdenv
+, cudatoolkit
+, cudaPackages
+, autoAddDriverRunpath
+, strace
+}:
+
+stdenv.mkDerivation (finalAttrs: {
+  name = "cudainfo";
+  src = ./.;
+  buildInputs = [
+    cudatoolkit # Required for nvcc
+    cudaPackages.cuda_cudart.static # Required for -lcudart_static
+    autoAddDriverRunpath
+  ];
+  installPhase = ''
+    mkdir -p $out/bin
+    cp -a cudainfo $out/bin
+  '';
+  passthru.gpuCheck = stdenv.mkDerivation {
+    name = "cudainfo-test";
+    requiredSystemFeatures = [ "cuda" ];
+    dontBuild = true;
+    nativeCheckInputs = [
+      finalAttrs.finalPackage # The cudainfo package from above
+      strace # When it fails, it will show the trace
+    ];
+    dontUnpack = true;
+    doCheck = true;
+    # Run cudainfo once; on failure dump linkage information and a syscall
+    # trace, then fail the check explicitly so the result never depends on the
+    # exit status of the last diagnostic command.
+    checkPhase = ''
+      if ! cudainfo; then
+        set -x
+        cudainfo=$(command -v cudainfo)
+        ldd $cudainfo
+        readelf -d $cudainfo
+        strace -f $cudainfo
+        set +x
+        exit 1
+      fi
+    '';
+    installPhase = "touch $out";
+  };
+})
--- a/pkgs/overlay.nix
+++ b/pkgs/overlay.nix
@@ -52,4 +52,5 @@ final: prev:
  prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
  meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
  upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
+  cudainfo = prev.callPackage ./cudainfo/default.nix { };
 }
--- a/secrets/ceph-user.age
+++ b/secrets/ceph-user.age
--- a/secrets/gitea-runner-token.age
+++ b/secrets/gitea-runner-token.age
@@ -1,11 +1,13 @@
 age-encryption.org/v1
-> ssh-ed25519 HY2yRg d7+nvfAcdC3GjJxipXFrsfGGyP5jAY+gRWRV+4FVYAM
-CG7r0bRGgnUWcdfDnpe7HwZ3L/y7b5iuJuqvf15b3/Y
-> ssh-ed25519 CAWG4Q X0vITOErz4wkR3VQYOcVlnrkHtwe+ytdZz1Hcrs4vVs
-6IWYOhXLQ+BnML9YfLLHJYEO2CZ/uEc9IBqhoWvjDHI
-> ssh-ed25519 xA739A p5e/0AJtZ0+zbRvkB/usLuxusY8xXRx9Ksi/LQlcIHw
-M4S/qlzT9POyJx4gY9lmycstUcdwG2cinN4OlV22zzo
-> ssh-ed25519 MSF3dg Ydl7uBWzBx6sAaxbzC3x8qiaU3ysGqV4rUFLpHCEV30
-/1AUHBhCNOs9i7LJbmzwQDHsu+ybzYf6+coztKk5E3U
--- kYt15WxClpT7PXD1oFe9GqJU+OswjH7y9wIc8/GzZ7M
-<EFBFBD><EFBFBD>h<>ߓ<><DF93><EFBFBD>`<60><><EFBFBD>V4F<34><46>_k)^<5E>m$uj:ѳ<><D1B3><17><><EFBFBD>}<7D>Z]$U]<12>u<EFBFBD> <20>0<EFBFBD><30><EFBFBD>v8<76>?<3F>X<EFBFBD>P<EFBFBD>g%d<>#<23>d9{rAi<41><69>
+-> ssh-ed25519 HY2yRg gKGxsjHfpiRDQ6Tuvcx7pjKgrVUGweotuplLYwCGvik
+DSz9j/stVyB1lXpVP+kg+H+RDgSftREGFFLQZClC3kI
+-> ssh-ed25519 cK5kHw 17DpKekfNVy4V742QSd61r2w6iawtOJR7Ct3UflDXio
+hsqTEPCYjHKvndMWPl4GpG23CzjGgVrS+cLIymISJHU
+-> ssh-ed25519 CAWG4Q oK01d4pbBqEZVsymSiKijPvJo714xsMSRMbzkssJKiw
+hs0tVFkqtIHXg9jtC2iDgCtefFcWvGJkXB+HJUcqXQs
+-> ssh-ed25519 xA739A KxO+AawfLMERHwzt3YnZRwPFlCfGETma7fo8M+ZtsAY
+eSn0+/rhLQxNKt5xKubKck8Nxun2Sh3eJqBU/hwgzZM
+-> ssh-ed25519 MSF3dg OyaZBLB2kO8fU139lXbbC404gT7IzIWk+BMhYzabBDg
+/fiPFfBJcb+e40+fZbwCw7niF2hh+JxUPiKSiwUSOWg
+--- ycZyGX+Li+LsOuweF9OVPl8aoMaRgp/RdFbDrPszkUs
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD>YM<EFBFBD><EFBFBD>:E O<><4F>2<EFBFBD>r=<15>&4<><04>CQΣ<51><CEA3>hC<68><43><EFBFBD>cb<63>^Sy<53><79>%	<09><>x-vC`g<><15><><EFBFBD><EFBFBD>W^<5E><>wVG<0B><><EFBFBD>
--- a/secrets/gitlab-bsc-docker-token.age
+++ b/secrets/gitlab-bsc-docker-token.age
--- a/secrets/gitlab-runner-docker-token.age
+++ b/secrets/gitlab-runner-docker-token.age
--- a/secrets/gitlab-runner-shell-token.age
+++ b/secrets/gitlab-runner-shell-token.age
--- a/secrets/ipmi.yml.age
+++ b/secrets/ipmi.yml.age
--- a/secrets/jungle-robot-password.age
+++ b/secrets/jungle-robot-password.age
--- a/secrets/munge-key.age
+++ b/secrets/munge-key.age
--- a/secrets/nix-serve.age
+++ b/secrets/nix-serve.age
--- a/secrets/secrets.nix
+++ b/secrets/secrets.nix
@@ -2,6 +2,8 @@ let
  keys = import ../keys.nix;
  adminsKeys = builtins.attrValues keys.admins;
  hut = [ keys.hosts.hut ] ++ adminsKeys;
+  fox = [ keys.hosts.fox ] ++ adminsKeys;
+  apex = [ keys.hosts.apex ] ++ adminsKeys;
  mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
  tent = [ keys.hosts.tent ] ++ adminsKeys;
  # Only expose ceph keys to safe nodes and admins
@@ -24,4 +26,7 @@ in

  "ceph-user.age".publicKeys = safe;
  "munge-key.age".publicKeys = safe;
+
+  "wg-fox.age".publicKeys = fox;
+  "wg-apex.age".publicKeys = apex;
 }
--- a/secrets/tent-gitlab-runner-bsc-docker-token.age
+++ b/secrets/tent-gitlab-runner-bsc-docker-token.age
@@ -1,11 +1,13 @@
 age-encryption.org/v1
-> ssh-ed25519 G5LX5w HlQ4V8lBd3im5j8KHEuQZBTuztvPj1QoWdv6FL6qzGI
-Jpt91X1UIIVFQt1X6Q//kALn+Cetp/LqBZZvTuhFthw
-> ssh-ed25519 CAWG4Q StnngJAcuAwUnTrXDR3nJ2KFN0jNdTqSz+/1TfmWkzA
-CR4AQ6fqaJVY1mdUIX1gzaZwRs1sU8F8hHztnkN8vN0
-> ssh-ed25519 xA739A xya5A5t63Owx+VrGgUfV/lIP8b/xV1cerMpuZBLaDVM
-w+pA583yUnFq2AvGBGzWbQIGQEY9WqW0CSLQ9v+SG0c
-> ssh-ed25519 MSF3dg aXkLxCyYdOwVopHHmpXEI6WlAIizKdJi4IO0KEdhS3s
-WKXkTszZN66+QZdSDJ4D9q7xgYWMfliOLCubIF2Dqkc
--- uVWoU2lMkqQ/9Z0BqKRCeUpsKi8lwmHukT/FV8wYMbg
-<EFBFBD><EFBFBD>1G+<2B>6<EFBFBD><36>g[|x]2T<32>й<EFBFBD><D0B9><EFBFBD> <20>CKu)<29><><EFBFBD>]<5D><>8֓<38><D693><EFBFBD><EFBFBD>l<EFBFBD><6C>S<EFBFBD><53><EFBFBD>Q<EFBFBD><07><>x<EFBFBD><78><EFBFBD><EFBFBD>#7r<37>k{*<2A><>3ս~C<>b<EFBFBD><62><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڵ<EFBFBD>Np<1E><05>]J]h<>je+d%Е<>#<23>m<EFBFBD>?=6}<7D> 
+-> ssh-ed25519 G5LX5w Zhbs+NM/SI49qQ0X8bBpWUWxYM0vUKCXNAnPpIE2NR0
+CkBUmJ26EkwHztT8Pz0UGq2KZwN0Xz8iYQ9cEHL9OWQ
+-> ssh-ed25519 cK5kHw 5KjUXJywRDp2A7l5ukTCS+WIAalxwP1f71ejGxwNrX4
+JW8OLmfkULXo9AwYMGNyOgZ+nQ0MVc0PCM4kKPIo6V4
+-> ssh-ed25519 CAWG4Q cVjY3R0ZHAfokA4kWlu5vOl2Gs7mdqRgRk4WSUOXAjg
+IxEDvuximW99EqxmpW+Btpm0Zydmwg/u87bqnl26NYc
+-> ssh-ed25519 xA739A hmuwZuxmJnuAjmU4X8yhPQ+hPWvN1G+ZS0pvD7fHamg
+fnAPW6ZCrv5pSO4RQhhr8xz7ij7jAZJk0ApWluOXDng
+-> ssh-ed25519 MSF3dg SSGLcWnum0Qo/0OnKDZVg9xAZMwGwVNYYmRJXxb4GU0
+pdl6kATG7n2oMsoUboBfu+vDKurJcH1UvUa70rfMQkE
+--- a2ZQAeAQlO9DWnegIAq6NpI1Po6f38l+hitZvq+zIW8
+<EFBFBD>\ֺ"^<5E>DT<44>H<EFBFBD><48>3<EFBFBD><33><EFBFBD>_|.h<0E><><EFBFBD><EFBFBD><03>^<5E>n<14><0E><><EFBFBD><EFBFBD><1A>g<EFBFBD>S<EFBFBD>]_<><5F>?n<>z~2<>!<21>p7<70><37><<3C><14>ʨD?<3F>~<02>F<EFBFBD>$<24>`<60>q+<2B><><EFBFBD>SW<53>(+<2B><>P<EFBFBD>c<1E>u[<5B>m<EFBFBD>`O<>ܛ<EFBFBD>ϖT
--- a/secrets/tent-gitlab-runner-pm-docker-token.age
+++ b/secrets/tent-gitlab-runner-pm-docker-token.age
@@ -1,11 +1,13 @@
 age-encryption.org/v1
-> ssh-ed25519 G5LX5w sg9SmahxBg35MDIxhrp4oHkaTaxsKoVQju2eNhCt0BM
-CZ64dEGqz2tbkG8KtimZvLUEMrQpVVBJP7Fu46WTMgc
-> ssh-ed25519 CAWG4Q jzS1R14W1CWxdziMLG/yCGPLWSkiyE+9lqyCVe491ng
-acJo/nhKq3pSPoFEPaFLN1fzHHbEzstNoLtohWAHKiM
-> ssh-ed25519 xA739A qeGJoLeSIQwLU2Yg+Gi2bikHJ3HscLfyo1msqL3JwHw
-tTwaxRBKTl/SoyY/LnxR/j/5WvCNX5VeZLKi018YMrY
-> ssh-ed25519 MSF3dg Wym7Uyf1XvH1H6mNDERkO8opkMiN0zzXm2PjXftEOWs
-Uw8ZwwKIB5UqgVuoSLE2QajNDJZkH7/Y3Nsy+WFl7Xs
--- 94hGVbYiCGZdMEJesCMLh7IZi+w5l/Kr1lZJHQgrc0o
-j5j磛<6A><04><>J<EFBFBD><4A><EFBFBD>a<EFBFBD>]<5D>a%dr<64><72>FDT<44><54>^<5E><>Q<EFBFBD>s/<2F>kwB<77>$<24><>$<24><>H<EFBFBD>'<27><><EFBFBD><EFBFBD><EFBFBD>w<14><?^|<7C><07>h$<24>ؗ<EFBFBD>GI<47>ĕsT2RU<52><55>*/O<>7<EFBFBD><37><EFBFBD>G<EFBFBD>pͪ<70>4<EFBFBD><34><EFBFBD>M9<4D>j<><06>
+-> ssh-ed25519 G5LX5w VKM/Y6Wy0gmb2gc4Q00VzHQ4IAxfSyshuDoaAzlEkFM
+vf18uoEN5ZLJ4HcJg85epaseh1CRL9/ncXtU2HpH+QE
+-> ssh-ed25519 cK5kHw sMuG07kjlI6VjPjELOUPzkn+KT9Yq7BPf0zSATM2aGI
+/eODwL8KwyVgFjBK2MJlbqjN7mEvXCSsjq9D96szrng
+-> ssh-ed25519 CAWG4Q t3/Ty7yCqC5x8KQY4VaHSQ9Q3epqMpXoBDKyKx9+VzE
+JwgUsqMd+1jFZvFp9/SIoowbhSMVEkKp03T69+OHjho
+-> ssh-ed25519 xA739A 0ohmKK427+4vupivrtjXp0dDK8wT4XUA9rWgcsCGKgA
+msbeQyz3pL8RLtAeXX5tsfyHyOXxhfYpqaLEKnRxpPQ
+-> ssh-ed25519 MSF3dg H+6jAoP7/Dxp8C/7Bk1C4CT1hpkUhtbnTWWIxkO24Ec
+SrMuUG93T5lUw3xINEen5EEKLXJizIGFhBO1fVroFHE
+--- tIPnH9cxTV3m3qzvZB97Egz+raWwZJ182BXXKDu8f+o
+<EFBFBD><EFBFBD>f#<23>,|<7C>Ey.v<>DL<44>Ӻ<05>JPX<50><07><>`<60><><EFBFBD><EFBFBD>-#<23>F<EFBFBD>Ubs<62>(Q!?<3F><1A>#xJG?5<><35><EFBFBD><EFBFBD><EFBFBD>~<7E><>6MA<15>U<><55><EFBFBD>C<01><>M<>$+}W<>NϨG!<21><><EFBFBD><EFBFBD>a<EFBFBD><61><EFBFBD><EFBFBD>%<25>ǽ<EFBFBD>G
--- a/secrets/tent-gitlab-runner-pm-shell-token.age
+++ b/secrets/tent-gitlab-runner-pm-shell-token.age
@@ -1,12 +1,13 @@
 age-encryption.org/v1
-> ssh-ed25519 G5LX5w 5K0mzfJGvAB2LGmoQ9ZLbWooVEX6F4+fQdo1JUoB3FM
-AKGa507bUrYjXFaMQ1MXTDBFYsdS6zbs+flmxYN0UNo
-> ssh-ed25519 CAWG4Q 8KzLc949on8iN1pK8q11OpCIeO71t6b0zxCLHhcQ6ns
-uy7z6RdIuoUes+Uap3k5eoFFuu/DcSrEBwq4V4C/ygc
-> ssh-ed25519 xA739A SLx5cKo0fdAHj+cLpJ4FYTWTUTyDsCqKQOufDu3xnGo
-VnS/WsiSaf6RpXuhgfij4pYu4p9hlJl1oXrfYY9rKlQ
-> ssh-ed25519 MSF3dg c5ZXvdNxNfZU3HeWsttuhy+UC5JxWN/IFuCuCGbksn4
-vcKlIirf+VvERX71YpmwW6zp6ClhlG2PR4R8LIN7cQo
--- pJKICDaYAlxqNnvHIuzB3Yk7tv0ZNYflGTQD+Zk/8+4
-<EFBFBD>h/\J<>J
-<EFBFBD>0?<3F> <20>p<EFBFBD><70><EFBFBD>@܉7<DC89><37>3<EFBFBD><33><EFBFBD><EFBFBD>z<EFBFBD><7A><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>a<EFBFBD><61>'<27>,ka<6B>I<EFBFBD>XXOZ<4F>I\<5C><><EFBFBD><EFBFBD><EFBFBD>	<09>BP<42><50>/cUɿ~B<><42>S'Q<><51><EFBFBD><EFBFBD>f<06><><EFBFBD>er<65><72><EFBFBD><EFBFBD>^<5E><><EFBFBD><EFBFBD>8l<38><6C>V<EFBFBD>E<EFBFBD><45><EFBFBD>
+-> ssh-ed25519 G5LX5w 1KfTmTRP3iSdcclf/FuIpFWpy1tgKs5ED+qSYWo7inY
+RX6Q1nLFF/yiVLpkWrl0BI0PpLoBi753+y8l/AXjNE4
+-> ssh-ed25519 cK5kHw TP7+OQpQSNuyArnUo1C97J3P3oB0YtzCEPeVvlzsYHE
+Bsy5KPNHTVNHnF1sxOvlfJq3CNMVFaXdYkRG2vSj7qM
+-> ssh-ed25519 CAWG4Q eQyzwNaH6CfaYIjs8abEuQxt6vxRXsGz69UletMUVDE
+FDcynPO7xg4PWez5Z8gTg5LyE0Wgb3zT9i3Kon67QsU
+-> ssh-ed25519 xA739A 2JuLai2fUu3dZBydS8cMrLrEUIUkz4NNaiupoBOtTwU
+sdM3X+XRzysop7yqa76Z7FAwTHOj91STCtZvfIgCdB0
+-> ssh-ed25519 MSF3dg fSPkiWnpInX1V5p3afPCoPotcGFoWFiOMPThtY927lc
+8v7E/3l0xA2VWZPXzkN4NmnaA0KJutLMurn/ZXZmhxA
+--- MQkyBx9hT4ILYXKoZT18PWny1QbDFymcZr63zjMN/qQ
+-b<>#<23><>M.<16>@<40>t<EFBFBD><74><EFBFBD>ŵ}+ό#@<40><><EFBFBD><EFBFBD><EFBFBD>k<EFBFBD>y<EFBFBD><79><EFBFBD>?v<><76>n<1F><>T<EFBFBD>+<2B><><EFBFBD>[<5B>Q<EFBFBD> gA<67><41><EFBFBD>
--- a/secrets/vpn-dac-client-key.age
+++ b/secrets/vpn-dac-client-key.age
--- a/secrets/vpn-dac-login.age
+++ b/secrets/vpn-dac-login.age
@@ -1,12 +1,14 @@
 age-encryption.org/v1
-> ssh-ed25519 G5LX5w /RF8uZ/KahUqjEFILbF3+Jin+U0SQdoQChcc9RJ9axc
-aEmPk++86nBR6d2BIa/oaUdyiLS6cH8TUoYJE3bxba4
-> ssh-ed25519 CAWG4Q qHyh9nQi8c3z/KHby9y5vhzN0Dwz0zca98ebjJmXrzs
-ZbmwNzrSSQ3RvskE8SqcBa0vMy8pzm/HPGHLm5zuPGQ
-> ssh-ed25519 xA739A FlGbfS4bUxA3gVDzb3yPjp4hV8a7aiNBLUctnN3bGEY
-3fI6SyVjVhh2M8uc/XV3blpdQMPMYi2qzaHNXvx0bvM
-> ssh-ed25519 MSF3dg 0Bs/aW0nNISS+93It75o6hKZWa7S+LF5bF5ApsJ2fQ8
-y7o0KYDHEen13ndIxg/mYil3eMxxzvYF2pWqhMb+rBU
--- Iqo75G4+02Y9nc1OOkcEx+iQlKnGYCekAx76tRH53wA
-<10>
-<EFBFBD>X<EFBFBD><EFBFBD>%f<0C><><12>hX<0B><>R<>c<EFBFBD>+z<><7A>eg<65>& <20>d<EFBFBD><64><EFBFBD>ק<06><>A<EFBFBD><41><EFBFBD>чXM<58>1<EFBFBD>
+-> ssh-ed25519 G5LX5w SRJhNenoQXbT1FgX3TMPnVH5P6oe2eHot+M1YsEjsEk
+hfTSLgKi98Eh7JK5o7x2POpTEtQlQCpEa3keUFYCuME
+-> ssh-ed25519 cK5kHw z5TwWJTkvx7HztjXHJW/aCOtOfPrQaLP0gyIT7rXcyU
+b4NCpHfasgvkLLr+6LcWUl60p59aSNnfp3bl2OFYXo0
+-> ssh-ed25519 CAWG4Q 4VpS1/OnFe8nxcQbRTKNhjsh/ZQ5cbhSMXwK/jjQ+3o
+WF9wvOkqVml4UcEzyzeumKuUwCwwr2zvKLMg+PCB8nk
+-> ssh-ed25519 xA739A 67FhuJ070jBVMt/xbKHWhfri6iIm0FyaFvzQabsvFBM
+1G5/913dDv/r/6p1x/c5YiUnZzrX/LvIj33KW+PN0KU
+-> ssh-ed25519 MSF3dg Bj/yB4N2wkyHCHC22tcjjJAA4ebSamN0Z4UVX3ZnryI
+6D/ZgTs+j+MGDAbPU5zyK0i9zN6tQy68IcOnQZ27mYg
+--- 169erk3ICSYLs4FPEuXCn7QlekWhsmSn0Lr+/R14I5Q
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><05>ҽ3<D2BD>s<EFBFBD>
+w<EFBFBD><EFBFBD>4D<EFBFBD><EFBFBD>b.<2E><><EFBFBD>"|<7C><><EFBFBD>)"<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>;<3B>.<2E>ɫ7)<29>LeC<05>=S؟
--- a/secrets/wg-apex.age
+++ b/secrets/wg-apex.age
--- a/secrets/wg-fox.age
+++ b/secrets/wg-fox.age
@@ -0,0 +1,14 @@
+age-encryption.org/v1
+-> ssh-ed25519 cDBabA heyW9/cxgwFX9IexQIXjAQDWGQPNcMXcArQp2Rxsqx4
+o9MQ7EH8PDDjsJdpH9F3Xq2zUoaDAJQlfFmYucSFs6Y
+-> ssh-ed25519 cK5kHw Sza4pos7K3qW3omEeyidI/jszJNf9smemSZnUJfCIww
+D6vazXki7hIYraIuSiGPS+FPbkFUwHhHWDf52OhEIMg
+-> ssh-ed25519 CAWG4Q YexIHueOIMmIN8JIDyNUOKBkyz/k18HqV3hTXh48KlM
+xh8UJzzWT6ByN+Dpn4JrMNsjGC/uc/v6LynwjBDz9NQ
+-> ssh-ed25519 xA739A KySG3TXdqfCMUkVEDGa74B0op745s3XGYxFLyAXSQAc
+5EI/yb5ctW9Qu18bHm3/sK97kwGcKzzmWvPSCWm89XA
+-> ssh-ed25519 MSF3dg MNxnNj0fHmri8ophexXPNjRUBUWrzcuk5S1mucxUMTE
+GVFWXtISEU8ZmlwL4nh4weAgfGrt2GHX0DTzbpS6zg8
+--- UdrqkYG2ZApAuwdZeNhC50NP2rkD/Ol6y8nJa4RHx7Y
+<EFBFBD>ܻ<EFBFBD>m(<28><><EFBFBD>><3E>H<48>Y87<><37>G<0F>+*<12><><EFBFBD><EFBFBD>9V<>.<2E><><EFBFBD><EFBFBD><03><><EFBFBD>p<EFBFBD>Oo<4F>=+哇<>P0<50><30>{<7B>)<29><17><><EFBFBD><EFBFBD>><3E>z3P^
+u
--- a/web/content/fox/_index.md
+++ b/web/content/fox/_index.md
@@ -21,17 +21,28 @@ the detailed specifications:

 ## Access

-To access the machine, request a SLURM session from [hut](/hut) using the `fox`
-partition:
+To access the machine, request a SLURM session from [apex](/apex) using the `fox`
+partition. If you need the machine for performance measurements, use an
+exclusive reservation:

-    hut% salloc -p fox
+    apex% salloc -p fox --exclusive

-Then connect via ssh:
+Otherwise, specify the CPUs that you need so other users can also use the node
+at the same time:

-    hut% ssh fox
+    apex% salloc -p fox -c 8
+
+Then use srun to execute an interactive shell:
+
+    apex% srun --pty $SHELL
    fox%

-Follow [these steps](/access) if you don't have access to hut or fox.
+Make sure you get all CPUs you expect:
+
+    fox% grep Cpus_allowed_list /proc/self/status
+    Cpus_allowed_list:	0-191
+
+Follow [these steps](/access) if you don't have access to apex or fox.

 ## CUDA

@@ -89,9 +100,8 @@ Then just run `nix develop` from the same directory:

 The machine has several file systems available.

- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
-  Don't abuse.
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
-  capacity. Stores three redundant copies of every file.
+- `/nfs/home`: The `/home` from apex via NFS, which is also shared with other
+  xeon machines. It has about 2 ms of latency, so it is not suitable for quick
+  random access.
 - `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
 - `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
Author	SHA1	Message	Date
Rodrigo Arias Mallo	be8c150b08	Disable flake LFS	2025-09-17 12:40:06 +02:00
Rodrigo Arias Mallo	f9d4a70791	Increase body to 64 MiB for Gitea LFS	2025-09-17 12:19:56 +02:00
Rodrigo Arias Mallo	729e2d3833	Track PDFs with Git LFS	2025-09-17 12:06:27 +02:00
Rodrigo Arias Mallo	54ad962719	Add git-lfs client to all machines	2025-09-17 11:53:58 +02:00
Rodrigo Arias Mallo	8697fc0a18	Enable Git LFS in Gitea	2025-09-17 11:45:14 +02:00
Rodrigo Arias Mallo	d3b355f651	Add /nfs/home to fox documentation Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 15:34:05 +02:00
Rodrigo Arias Mallo	2ed881cd89	Mount home via NFS from apex in fox Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 15:34:02 +02:00
Rodrigo Arias Mallo	2a07df1d30	Allow access to NFS via wireguard subnet Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 15:33:47 +02:00
Rodrigo Arias Mallo	52380eae59	Use 10.106.0.0/24 subnet to avoid collisions The 106 byte is the code for 'j' (jungle) in ASCII: % printf j \| od -t d 0000000 106 0000001 Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:03:13 +02:00
Rodrigo Arias Mallo	2fe84c4cbc	Update fox documentation for SLURM and FS Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:03:09 +02:00
Rodrigo Arias Mallo	3b16b41be3	Revert "Remove pam_slurm_adopt from fox" This reverts commit `64a52801ed`. Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:03:06 +02:00
Rodrigo Arias Mallo	ee481deffb	Enable fail2ban in fox Protect fox against ssh bruteforce attacks: fox% sudo lastb \| head root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:25 - 11:25 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:24 - 11:24 (00:00) root ssh:notty 200.124.28.102 Mon Sep 1 11:24 - 11:24 (00:00) Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:03:02 +02:00
Rodrigo Arias Mallo	b1bad25008	Accept connections from apex to fox slurmd Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:03:00 +02:00
Rodrigo Arias Mallo	85f38e17a2	Accept fox connection to slurm controller Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:59 +02:00
Rodrigo Arias Mallo	08ab01b89c	Add fox machine to SLURM Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:57 +02:00
Rodrigo Arias Mallo	194a6fb7f6	Rekey secrets with trusted fox key Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:55 +02:00
Rodrigo Arias Mallo	365576778b	Trust fox for compute node secrets Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:52 +02:00
Rodrigo Arias Mallo	e7490858c6	Make apex host specific to each machine Allows direct contact via the VPN when accessing from fox, but use Internet when using the rest of the machines. Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:49 +02:00
Rodrigo Arias Mallo	7606030135	Add local host fox in apex Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:46 +02:00
Rodrigo Arias Mallo	e55590f59e	Enable wireguard in apex Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:43 +02:00
Rodrigo Arias Mallo	c3da39c392	Add wireguard server in fox Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-09-03 12:02:38 +02:00
Rodrigo Arias Mallo	d3889b3339	Use writeShellScript for suspend.sh and resume.sh Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-08-29 12:35:28 +02:00
Rodrigo Arias Mallo	28540d8cf3	Add firewall rules to slurm server Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-08-29 12:35:26 +02:00
Rodrigo Arias Mallo	f847621ceb	Remove hut from slurm Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-08-29 12:35:24 +02:00
Rodrigo Arias Mallo	12fe43f95f	Only configure apex as slurm server Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-08-29 12:35:22 +02:00
Rodrigo Arias Mallo	0e8329eef3	Split slurm configuration for client and server Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-08-29 12:35:20 +02:00
Rodrigo Arias Mallo	df3b21b570	Move slurm control server to apex Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-08-29 12:35:16 +02:00
Aleix Boné	78df61d24a	Fix typo in csiringo ssh key Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>	2025-08-27 17:44:20 +02:00
Aleix Boné	8e7da73151	Enable nix-ld in weasel Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>	2025-08-27 16:19:34 +02:00
Aleix Boné	a7e17e40dc	Add csiringo user with access to apex and weasel Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>	2025-08-27 16:02:26 +02:00
Rodrigo Arias Mallo	0e8bd22347	Access gitlab via raccoon in fox Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>	2025-08-27 15:27:38 +02:00
Rodrigo Arias Mallo	d948f8b752	Move StartLimit* options to unit section The StartLimitBurst and StartLimitIntervalSec options belong to the [Unit] section, otherwise they are ignored in [Service]: > Unknown key 'StartLimitIntervalSec' in section [Service], ignoring. When using [Unit], the limits are properly set: apex% systemctl show power-policy.service \| grep StartLimit StartLimitIntervalUSec=10min StartLimitBurst=10 StartLimitAction=none Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-24 14:32:46 +02:00
Rodrigo Arias Mallo	8f7787e217	Set power policy to always turn on In all machines, as soon as we recover the power, turn the machine back on. We cannot rely on the previous state as we will shut them down before the power is cut to prevent damage on the power supply monitoring circuit. Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es> Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-24 11:22:38 +02:00
Rodrigo Arias Mallo	30b9b23112	Add NixOS module to control power policy Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es> Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-24 11:22:36 +02:00
Rodrigo Arias Mallo	9a056737de	Move August shutdown to 3rd at 22h Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es> Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-24 11:22:33 +02:00
Rodrigo Arias Mallo	ac700d34a5	Disable automatic August shutdown for Fox The UPC has different dates for the yearly power cut, and Fox can recover properly from a power loss, so we don't need to have it turned off before the power cut. Simply disabling the timer is enough. Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es> Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-24 11:22:10 +02:00
Rodrigo Arias Mallo	9b681ab7ce	Add cudainfo program to test CUDA The cudainfo program checks that we can initialize the CUDA RT library and communicate with the driver. It can be used as standalone program or built with cudainfo.gpuCheck so it is executed inside the build sandbox to see if it also works fine. It uses the autoAddDriverRunpath hook to inject in the runpath the location of the library directory for CUDA libraries. Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-23 11:52:09 +02:00
Rodrigo Arias Mallo	9ce394bffd	Add missing symlink in cuda sandbox Reviewed-by: Aleix Boné <abonerib@bsc.es>	2025-07-23 11:51:47 +02:00