Compare commits

...

10 Commits

SHA1 Message Date
581efb4312 Reject SSH connections without SLURM allocation 2025-02-13 22:13:23 +01:00
c32c1bd03b Add users to fox 2025-02-12 16:46:56 +01:00
1ddc5b7248 Add dalvare1 user 2025-02-12 16:39:51 +01:00
8968deb4db Add fox page in jungle website 2025-02-12 16:39:33 +01:00
5a21baf2be Mount NVME disks in /nvme{0,1} 2025-02-12 15:49:55 +01:00
f4534e1e5a Exclude fox from being suspended by slurm 2025-02-12 15:02:18 +01:00
d6ed4b4521 Use IPMI host names instead of IP addresses 2025-02-12 12:35:46 +01:00
049ad4d062 Add fox IPMI monitoring
Use agenix to store the credentials safely.
2025-02-12 12:10:45 +01:00
07ab4018d8 Add new fox machine 2025-02-11 21:55:49 +01:00
a1135306ed Add new GitLab runner for gitlab.bsc.es
It uses docker based on alpine and the host nix store, so we can perform
builds but isolate them from the system.
2025-01-28 12:58:44 +01:00
24 changed files with 380 additions and 71 deletions

View File

@@ -25,6 +25,7 @@ in
     bay = mkConf "bay";
     lake2 = mkConf "lake2";
     raccoon = mkConf "raccoon";
+    fox = mkConf "fox";
   };

   packages.x86_64-linux = self.nixosConfigurations.hut.pkgs // {

View File

@@ -9,10 +9,11 @@ rec {
   koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
   bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
   lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
+  fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDa9lId4rB/EKGkkCCVOy0cuId2SYLs+8W8kx0kmpO1y fox";
 };

 hostGroup = with hosts; rec {
-  compute = [ owl1 owl2 ];
+  compute = [ owl1 owl2 fox ];
   playground = [ eudy koro ];
   storage = [ bay lake2 ];
   monitor = [ hut ];

View File

@@ -68,7 +68,7 @@
     home = "/home/Computational/anavarro";
     description = "Antoni Navarro";
     group = "Computational";
-    hosts = [ "hut" "raccoon" ];
+    hosts = [ "hut" "raccoon" "fox" ];
     hashedPassword = "$6$QdNDsuLehoZTYZlb$CDhCouYDPrhoiB7/seu7RF.Gqg4zMQz0n5sA4U1KDgHaZOxy2as9pbIGeF8tOHJKRoZajk5GiaZv0rZMn7Oq31";
     openssh.authorizedKeys.keys = [
       "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILWjRSlKgzBPZQhIeEtk6Lvws2XNcYwHcwPv4osSgst5 anavarro@ssfhead"
@@ -113,6 +113,19 @@
       "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"
     ];
   };
+
+  dalvare1 = {
+    uid = 2758;
+    isNormalUser = true;
+    home = "/home/Computational/dalvare1";
+    description = "David Álvarez";
+    group = "Computational";
+    hosts = [ "hut" "fox" ];
+    hashedPassword = "$6$mpyIsV3mdq.rK8$FvfZdRH5OcEkUt5PnIUijWyUYZvB1SgeqxpJ2p91TTe.3eQIDTcLEQ5rxeg.e5IEXAZHHQ/aMsR5kPEujEghx0";
+    openssh.authorizedKeys.keys = [
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGEfy6F4rF80r4Cpo2H5xaWqhuUZzUsVsILSKGJzt5jF dalvare1@ssfhead"
+    ];
+  };
 };

 groups = {

View File

@@ -34,37 +34,37 @@
 # Node Entry for node: mds01 (ID=72)
 10.0.40.40 bay mds01 mds01-eth0
 10.0.42.40 bay-ib mds01-ib0
-10.0.40.141 bay-ipmi mds01-ipmi0
+10.0.40.141 bay-ipmi mds01-ipmi0 mds01-ipmi

 # Node Entry for node: oss01 (ID=73)
 10.0.40.41 oss01 oss01-eth0
 10.0.42.41 oss01-ib0
-10.0.40.142 oss01-ipmi0
+10.0.40.142 oss01-ipmi0 oss01-ipmi

 # Node Entry for node: oss02 (ID=74)
 10.0.40.42 lake2 oss02 oss02-eth0
 10.0.42.42 lake2-ib oss02-ib0
-10.0.40.143 lake2-ipmi oss02-ipmi0
+10.0.40.143 lake2-ipmi oss02-ipmi0 oss02-ipmi

 # Node Entry for node: xeon01 (ID=15)
 10.0.40.1 owl1 xeon01 xeon01-eth0
 10.0.42.1 owl1-ib xeon01-ib0
-10.0.40.101 owl1-ipmi xeon01-ipmi0
+10.0.40.101 owl1-ipmi xeon01-ipmi0 xeon01-ipmi

 # Node Entry for node: xeon02 (ID=16)
 10.0.40.2 owl2 xeon02 xeon02-eth0
 10.0.42.2 owl2-ib xeon02-ib0
-10.0.40.102 owl2-ipmi xeon02-ipmi0
+10.0.40.102 owl2-ipmi xeon02-ipmi0 xeon02-ipmi

 # Node Entry for node: xeon03 (ID=17)
 10.0.40.3 xeon03 xeon03-eth0
 10.0.42.3 xeon03-ib0
-10.0.40.103 xeon03-ipmi0
+10.0.40.103 xeon03-ipmi0 xeon03-ipmi

 # Node Entry for node: xeon04 (ID=18)
 10.0.40.4 xeon04 xeon04-eth0
 10.0.42.4 xeon04-ib0
-10.0.40.104 xeon04-ipmi0
+10.0.40.104 xeon04-ipmi0 xeon04-ipmi

 # Node Entry for node: xeon05 (ID=19)
 10.0.40.5 koro xeon05 xeon05-eth0
@@ -74,17 +74,21 @@
 # Node Entry for node: xeon06 (ID=20)
 10.0.40.6 xeon06 xeon06-eth0
 10.0.42.6 xeon06-ib0
-10.0.40.106 xeon06-ipmi0
+10.0.40.106 xeon06-ipmi0 xeon06-ipmi

 # Node Entry for node: xeon07 (ID=21)
 10.0.40.7 hut xeon07 xeon07-eth0
 10.0.42.7 hut-ib xeon07-ib0
-10.0.40.107 hut-ipmi xeon07-ipmi0
+10.0.40.107 hut-ipmi xeon07-ipmi0 xeon07-ipmi

 # Node Entry for node: xeon08 (ID=22)
 10.0.40.8 eudy xeon08 xeon08-eth0
 10.0.42.8 eudy-ib xeon08-ib0
-10.0.40.108 eudy-ipmi xeon08-ipmi0
+10.0.40.108 eudy-ipmi xeon08-ipmi0 xeon08-ipmi
+
+# fox
+10.0.40.26 fox
+10.0.40.126 fox-ipmi
   '';
 };
}

m/fox/configuration.nix Normal file (75 lines added)
View File

@@ -0,0 +1,75 @@
{ lib, config, pkgs, ... }:

{
  imports = [
    ../common/xeon.nix
    ../module/ceph.nix
    ../module/emulation.nix
    ../module/slurm-client.nix
    ../module/slurm-firewall.nix
  ];

  # Select the disk using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";

  # No swap, there is plenty of RAM
  swapDevices = lib.mkForce [];

  boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
  boot.kernelModules = [ "kvm-amd" ];
  hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
  hardware.cpu.intel.updateMicrocode = lib.mkForce false;

  networking = {
    hostName = "fox";
    interfaces.enp1s0f0np0.ipv4.addresses = [ {
      address = "10.0.40.26";
      prefixLength = 24;
    } ];
  };

  # Configure Nvidia driver to use with CUDA
  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
  hardware.graphics.enable = true;
  nixpkgs.config.allowUnfree = true;
  nixpkgs.config.nvidia.acceptLicense = true;
  services.xserver.videoDrivers = [ "nvidia" ];

  # Mount NVME disks
  fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
  fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };

  # Make a /nvme{0,1}/$USER directory for each user.
  systemd.services.create-nvme-dirs = let
    # Take only normal users in fox
    users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
    commands = lib.concatLists (lib.mapAttrsToList
      (_: user: [
        "install -d -o ${user.name} -g ${user.group} -m 0755 /nvme{0,1}/${user.name}"
      ]) users);
    script = pkgs.writeShellScript "create-nvme-dirs.sh" (lib.concatLines commands);
  in {
    enable = true;
    wants = [ "local-fs.target" ];
    after = [ "local-fs.target" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig.ExecStart = script;
  };

  # Only allow SSH connections from users who have a SLURM allocation
  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
  security.pam.services.sshd.rules.account.slurm = {
    control = "required";
    enable = true;
    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
    args = [ "log_level=debug5" ];
    order = 999999; # Make it the last one
  };

  # Disable systemd session (pam_systemd.so) as it will conflict with the
  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
  # into the slurmstepd task and then into the systemd session, which is not
  # what we want, otherwise it will linger even if all jobs are gone.
  security.pam.services.sshd.startSession = lib.mkForce false;
}
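The intended effect is that an SSH connection to fox is only accepted once the user already has a job running there. Roughly (a hypothetical transcript; the exact denial message comes from pam_slurm_adopt):

    hut% ssh fox
    Access denied by pam_slurm_adopt: you have no active jobs on this node
    Connection closed by 10.0.40.26 port 22
    hut% salloc -p fox
    hut% ssh fox
    fox%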

View File

@@ -1,8 +1,9 @@
 { pkgs, lib, config, ... }:
 {
-  age.secrets.gitlabRunnerShellToken.file = ../../secrets/gitlab-runner-shell-token.age;
-  age.secrets.gitlabRunnerDockerToken.file = ../../secrets/gitlab-runner-docker-token.age;
+  age.secrets.gitlab-pm-shell.file = ../../secrets/gitlab-runner-shell-token.age;
+  age.secrets.gitlab-pm-docker.file = ../../secrets/gitlab-runner-docker-token.age;
+  age.secrets.gitlab-bsc-docker.file = ../../secrets/gitlab-bsc-docker-token.age;

   services.gitlab-runner = {
     enable = true;
@@ -28,10 +29,65 @@
     in {
       # For pm.bsc.es/gitlab
       gitlab-pm-shell = common-shell // {
-        authenticationTokenConfigFile = config.age.secrets.gitlabRunnerShellToken.path;
+        authenticationTokenConfigFile = config.age.secrets.gitlab-pm-shell.path;
       };
       gitlab-pm-docker = common-docker // {
-        authenticationTokenConfigFile = config.age.secrets.gitlabRunnerDockerToken.path;
+        authenticationTokenConfigFile = config.age.secrets.gitlab-pm-docker.path;
+      };
+      gitlab-bsc-docker = {
+        # gitlab.bsc.es still uses the old token mechanism
+        registrationConfigFile = config.age.secrets.gitlab-bsc-docker.path;
+        environmentVariables = {
+          https_proxy = "http://localhost:23080";
+          http_proxy = "http://localhost:23080";
+        };
+        # FIXME
+        registrationFlags = [
+          "--docker-network-mode host"
+        ];
+        executor = "docker";
+        dockerImage = "alpine";
+        dockerVolumes = [
+          "/nix/store:/nix/store:ro"
+          "/nix/var/nix/db:/nix/var/nix/db:ro"
+          "/nix/var/nix/daemon-socket:/nix/var/nix/daemon-socket:ro"
+        ];
+        dockerDisableCache = true;
+        preBuildScript = pkgs.writeScript "setup-container" ''
+          mkdir -p -m 0755 /nix/var/log/nix/drvs
+          mkdir -p -m 0755 /nix/var/nix/gcroots
+          mkdir -p -m 0755 /nix/var/nix/profiles
+          mkdir -p -m 0755 /nix/var/nix/temproots
+          mkdir -p -m 0755 /nix/var/nix/userpool
+          mkdir -p -m 1777 /nix/var/nix/gcroots/per-user
+          mkdir -p -m 1777 /nix/var/nix/profiles/per-user
+          mkdir -p -m 0755 /nix/var/nix/profiles/per-user/root
+          mkdir -p -m 0700 "$HOME/.nix-defexpr"
+          mkdir -p -m 0700 "$HOME/.ssh"
+          cat > "$HOME/.ssh/config" << EOF
+          Host bscpm03.bsc.es gitlab-internal.bsc.es
+            User git
+            ProxyCommand nc -X connect -x hut:23080 %h %p
+          Host amdlogin1.bsc.es armlogin1.bsc.es hualogin1.bsc.es glogin1.bsc.es glogin2.bsc.es fpgalogin1.bsc.es
+            ProxyCommand nc -X connect -x hut:23080 %h %p
+          EOF
+          cat >> "$HOME/.ssh/known_hosts" << EOF
+          bscpm03.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM2NuSUPsEhqz1j5b4Gqd+MWFnRqyqY57+xMvBUqHYUS
+          gitlab-internal.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF9arsAOSRB06hdy71oTvJHG2Mg8zfebADxpvc37lZo3
+          EOF
+          . ${pkgs.nix}/etc/profile.d/nix-daemon.sh
+          ${pkgs.nix}/bin/nix-channel --add https://nixos.org/channels/nixos-24.11 nixpkgs
+          ${pkgs.nix}/bin/nix-channel --update nixpkgs
+          ${pkgs.nix}/bin/nix-env -i ${lib.concatStringsSep " " (with pkgs; [ nix cacert git openssh netcat curl ])}
+        '';
+        environmentVariables = {
+          ENV = "/etc/profile";
+          USER = "root";
+          NIX_REMOTE = "daemon";
+          PATH = "/nix/var/nix/profiles/default/bin:/nix/var/nix/profiles/default/sbin:/bin:/sbin:/usr/bin:/usr/sbin";
+          NIX_SSL_CERT_FILE = "/nix/var/nix/profiles/default/etc/ssl/certs/ca-bundle.crt";
+        };
       };
     };
   };

View File

@@ -1,13 +0,0 @@
-modules:
-  default:
-    collectors:
-      - bmc
-      - ipmi
-      - chassis
-  lan:
-    collectors:
-      - ipmi
-      - chassis
-    user: ""
-    pass: ""

View File

@@ -12,6 +12,8 @@
     mode = "400";
   };

+  age.secrets.ipmiYml.file = ../../secrets/ipmi.yml.age;
+
   services.grafana = {
     enable = true;
     settings = {
@@ -73,8 +75,8 @@
       enable = true;
       group = "root";
       user = "root";
-      configFile = ./ipmi.yml;
-      #extraFlags = [ "--log.level=debug" ];
+      configFile = config.age.secrets.ipmiYml.path;
+      extraFlags = [ "--log.level=debug" ];
       listenAddress = "127.0.0.1";
     };
     node = {
@@ -206,7 +208,7 @@
       # Sets the "instance" label with the remote host we are querying
       source_labels = [ "__param_target" ];
       separator = ";";
-      regex = "(.*)";
+      regex = "(.*)-ipmi"; # Remove "-ipmi" at the end
       target_label = "instance";
       replacement = "\${1}";
       action = "replace";
@@ -248,6 +250,17 @@
         module = [ "raccoon" ];
       };
     }
+    {
+      job_name = "ipmi-fox";
+      metrics_path = "/ipmi";
+      static_configs = [
+        { targets = [ "127.0.0.1:9290" ]; }
+      ];
+      params = {
+        target = [ "fox-ipmi" ];
+        module = [ "fox" ];
+      };
+    }
   ];
 };
}

View File

@@ -1,15 +1,15 @@
 - targets:
-  - 10.0.40.101
-  - 10.0.40.102
-  - 10.0.40.103
-  - 10.0.40.104
-  - 10.0.40.105
-  - 10.0.40.106
-  - 10.0.40.107
-  - 10.0.40.108
+  - owl1-ipmi
+  - owl2-ipmi
+  - xeon03-ipmi
+  - xeon04-ipmi
+  - koro-ipmi
+  - xeon06-ipmi
+  - hut-ipmi
+  - eudy-ipmi
   # Storage
-  - 10.0.40.141
-  - 10.0.40.142
-  - 10.0.40.143
+  - bay-ipmi
+  - oss01-ipmi
+  - lake2-ipmi
   labels:
     job: ipmi-lan

View File

@@ -27,6 +27,22 @@ let
       done
   '';

+  prolog = pkgs.writeScript "prolog.sh" ''
+    #!/usr/bin/env bash
+    echo "hello from the prolog"
+    exit 0
+  '';
+
+  epilog = pkgs.writeScript "epilog.sh" ''
+    #!/usr/bin/env bash
+    echo "hello from the epilog"
+    exit 0
+  '';
+
 in {
   systemd.services.slurmd.serviceConfig = {
     # Kill all processes in the control group on stop/restart. This will kill
@@ -43,12 +59,14 @@ in {
     clusterName = "jungle";
     nodeName = [
       "owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
+      "fox Sockets=2 CoresPerSocket=96 ThreadsPerCore=2 Feature=fox"
       "hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
     ];

     partitionName = [
       "owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
+      "fox Nodes=fox Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
       "all Nodes=owl[1-2],hut Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
     ];
@@ -76,7 +94,7 @@
       SuspendTimeout=60
       ResumeProgram=${resumeProgram}
       ResumeTimeout=300
-      SuspendExcNodes=hut
+      SuspendExcNodes=hut,fox

       # Turn the nodes off after 1 hour of inactivity
       SuspendTime=3600
@@ -91,9 +109,29 @@
       # Ignore memory constraints and only use unused cores to share a node with
       # other jobs.
       SelectTypeParameters=CR_Core
+
+      # Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
+      # This sets up the "extern" step into which ssh-launched processes will be
+      # adopted. Alloc runs the prolog at job allocation (salloc) rather than
+      # when a task runs (srun) so we can ssh early.
+      PrologFlags=Alloc,Contain,X11
+
+      # LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
+      # adopted by the external step, similar to tasks running in regular steps
+      # LaunchParameters=ulimit_pam_adopt
+
+      SlurmdDebug=debug5
+      #DebugFlags=Protocol,Cgroup
+    '';
+
+    extraCgroupConfig = ''
+      CgroupPlugin=cgroup/v2
+      #ConstrainCores=yes
     '';
   };

+  # Place the slurm config in /etc as this will be required by PAM
+  environment.etc.slurm.source = config.services.slurm.etcSlurm;
+
   age.secrets.mungeKey = {
     file = ../../secrets/munge-key.age;
     owner = "munge";

View File

@@ -39,6 +39,18 @@ final: prev:
       # See https://bugs.schedmd.com/show_bug.cgi?id=19324
       ./slurm-rank-expansion.patch
     ];
+    # Install also the pam_slurm_adopt library to restrict users from accessing
+    # nodes with no job allocated.
+    postBuild = (old.postBuild or "") + ''
+      pushd contribs/pam_slurm_adopt
+      make "PAM_DIR=$out/lib/security"
+      popd
+    '';
+    postInstall = (old.postInstall or "") + ''
+      pushd contribs/pam_slurm_adopt
+      make "PAM_DIR=$out/lib/security" install
+      popd
+    '';
   });

 prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };

Binary file not shown.

View File

@@ -1,9 +1,9 @@
 age-encryption.org/v1
--> ssh-ed25519 HY2yRg DQdgCk16Yu524BsrWVf0krnwWzDM6SeaJCgQipOfwCA
-Ab9ocqra/UWJZI+QGMlxUhBu5AzqfjPgXl+ENIiHYGs
--> ssh-ed25519 CAWG4Q KF9rGCenb3nf+wyz2hyVs/EUEbsmUs5R+1fBxlCibC8
-7++Kxbr3FHVdVfnFdHYdAuR0Tgfd+sRcO6WRss6LhEw
--> ssh-ed25519 MSF3dg aUe4DhRsu4X8CFOEAnD/XM/o/0qHYSB522woCaAVh0I
-GRcs5cm2YqA/lGhUtbpboBaz7mfgiLaCr+agaB7vACU
---- 9Q7Ou+Pxq+3RZilCb2dKC/pCFjZEt4rp5KnTUUU7WJ8
+-> ssh-ed25519 HY2yRg eRVX5yndWDLg9hw7sY1Iu8pJFy47luHvdL+zZGK2u1s
+e1nXXiMW0ywkZYh2s6c7/quGMfBOJOaRhNQDjCD2Iyc
+-> ssh-ed25519 CAWG4Q gYG7GRxRpJ0/5Wz0Z0J2wfLfkMFNmcy81dQEewM7gUA
+lamdUdx+xOFWF1lmUM4x9TT0cJtKu9Sp7w9JHwm13u0
+-> ssh-ed25519 MSF3dg HEzfpR8alG6WPzhaEjAmmjOFoFcMSQUldx46dBsXri4
+OAD5H/zZGhfevYrFJzJrbNKPomKZDOS9Qx5tmTp78Jo
+--- A0sMSiNXWaEIgRXR0x6UAIaluuVH6Zlv4CJ9sI0NXOw
 (binary payload not shown)

View File

@@ -0,0 +1,9 @@
age-encryption.org/v1
-> ssh-ed25519 HY2yRg 4Xns3jybBuv8flzd+h3DArVBa/AlKjt1J9jAyJsasCE
uyVjJxh5i8aGgAgCpPl6zTYeIkf9mIwURof51IKWvwE
-> ssh-ed25519 CAWG4Q T2r6r1tyNgq1XlYXVtLJFfOfUnm6pSVlPwUqC1pkyRo
9yDoKU0EC34QMUXYnsJvhPCLm6oD9w7NlTi2sheoBqQ
-> ssh-ed25519 MSF3dg Bh9DekFTq+QMUEAonwcaIAJX4Js1O7cHjDniCD0gtm8
t/Ro0URLeDUWcvb7rlkG2s03PZ+9Rr3N4TIX03tXpVc
--- E5+/D4aK2ihKRR4YC5XOTmUbKgOqBR0Nk0gYvFOzXOI
(binary payload not shown)

View File

@@ -1,9 +1,10 @@
 age-encryption.org/v1
--> ssh-ed25519 HY2yRg 0sEIUEJBJQ0k0rBfHaOEbq1pNBqsPin4Xq85v0ds9jY
-4wzjLapoOcq53nT2K3hSGED4jTDXci25GLHkl/fL4EI
--> ssh-ed25519 CAWG4Q f68ZbJGwXuCZVnqhwbh+8dh0X/MCdjEd+sVtPyBu/hU
-u2TQreyWQvP6PGuwuUNKA/AL68560flqSlaItN3k41I
--> ssh-ed25519 MSF3dg HdrtRW2j7mfkLH0/4aJK5R0cWdjf56HYtEZgzHi9EAs
-A6MF6tXmSUq2RF2bpmav0GFTRERwluSZGh2snP/KqkA
---- drsezqi7J/g8gm6N10SkfeAWnYct99WUraB5djLJqpo
+-> ssh-ed25519 HY2yRg GdmdkW+BqqwBgu30b846jv3J7jtCM+a3rgOERuA050A
+FeGqM75jG9egesR+yyVKHm0/M+uBBp5Hclg4+qN0BR8
+-> ssh-ed25519 CAWG4Q a0wTWHgulQUYDAMZmXf3dOf6PdYgCqNtSylzWVVRNVM
+Bx+WSYaiY4ZwlSZJo2a1XPMQmbKOU7F0tKAqVRLBOPo
+-> ssh-ed25519 MSF3dg KccUvZZUbxbCrRWUWrX8KcHF6vQ5FV/BqUqI59G7dj4
+CFr7GXpZ9rPgy7HBfOyiYF9FnZUw6KcZwq9f7/0KaU8
+--- E0Rp6RR/8+o0jvB1lRdhnlabxvI6uu/IgL2ZpPXzTc8
 (binary payload not shown)

View File

@@ -1,9 +1,9 @@
 age-encryption.org/v1
--> ssh-ed25519 HY2yRg VY8s9s1zuHOv2axmIacwKg2ozsJnskHTQtslRZ3YI1M
-fKkJuydLOzF/ciPYSYu4ziSCozdl6sowvDMYZmxqmHY
--> ssh-ed25519 CAWG4Q 2ARFd/7RWQ/QOk47FnJFChaVBgoV4LE6EA+JHezkXgg
-MV4g4Llv8Qcd/wUgJyoNG5AXb6o3aFTrOYGC+lXlSzw
--> ssh-ed25519 MSF3dg SKoxWe8Mi8EkBjkESxStOCI5V4C0KYEXIOx7OdENgTA
-p/owKwQ4e4pcGV+hqej2AfPU5QaM2i8VfxhlkjCM4Z4
---- 0VWKU5CQiGbiOtQ2tsZZg88oZm1qcUDEnU5zDTtV+KU
+-> ssh-ed25519 HY2yRg xWRxJGWSzA5aplRYCYLB6aBwrUrQQJ2MtDYaD75V5nI
+J07XF3NQiaYKKKNRcNWi9MloJD2wXHd+2K7bo6lF+QU
+-> ssh-ed25519 CAWG4Q jNWymbyCczcm8RcaIEbFQBlOMALsuxTl4+pLUi0aR20
+z5NixlrRD+Y7Z/aFPs6hiDW4/lp8CBQCeJYpbuG9yYM
+-> ssh-ed25519 MSF3dg QsUQloEKN3k1G49FQnNR/Do6ILgGpjFcw3zu5kk1Ako
+IHwyFWUEWqCStNcFprnpBa8L5J6zKIsn+7HcgGRv3sM
+--- oUia0fsL6opeYWACyXtHAu/Ld+bUIt/7S1VszYTvwgU
 (binary payload not shown)

secrets/ipmi.yml.age Normal file (binary, not shown)

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -9,8 +9,10 @@ in
   "gitea-runner-token.age".publicKeys = hut;
   "gitlab-runner-docker-token.age".publicKeys = hut;
   "gitlab-runner-shell-token.age".publicKeys = hut;
+  "gitlab-bsc-docker-token.age".publicKeys = hut;
   "nix-serve.age".publicKeys = hut;
   "jungle-robot-password.age".publicKeys = hut;
+  "ipmi.yml.age".publicKeys = hut;

   "ceph-user.age".publicKeys = safe;
   "munge-key.age".publicKeys = safe;

View File

@@ -11,7 +11,7 @@ access to the login machine using a resource petition in the BSC intranet.
 Then, to request access to the machines we will need some information about you:

-1. Which machines you want access to (hut, owl1, owl2, eudy, koro...)
+1. Which machines you want access to ([hut](/hut), [fox](/fox), owl1, owl2, eudy, koro...)
 1. Your user name and user id (to match the NFS permissions)
 1. Your real name and surname (for identification purposes)
 1. The salted hash of your login password, generated with `mkpasswd -m sha-512`

web/content/fox/_index.md Normal file (97 lines added)
View File

@@ -0,0 +1,97 @@
---
title: "Fox"
description: "AMD Genoa 9684X with 2 NVIDIA RTX4000 GPUs"
date: 2025-02-12
---

![Fox](fox.jpg)

Picture by [Joanne Redwood](https://web.archive.org/web/20191109175146/https://www.inaturalist.org/photos/6568074),
[CC0](http://creativecommons.org/publicdomain/zero/1.0/deed.en).

The *fox* machine is a big GPU server that is configured to run heavy workloads.
It has two fast AMD CPUs with large caches and two reasonable NVIDIA GPUs. Here
are the detailed specifications:

- 2x AMD GENOA X 9684X DP/UP 96C/192T 2.55G 1,150M 400W SP5 3D V-cache
- 24x 32GB DDR5-4800 ECC RDIMM (total 768 GiB of RAM)
- 1x 2.5" SSD SATA3 MICRON 5400 MAX 480GB
- 2x 2.5" KIOXIA CM7-R 1.92TB NVMe GEN5 PCIe 5x4
- 2x NVIDIA RTX4000 ADA Gen 20GB GDDR6 PCIe 4.0
## Access

To access the machine, request a SLURM session from [hut](/hut) using the `fox`
partition:

    hut% salloc -p fox

Then connect via ssh:

    hut% ssh fox
    fox%

Follow [these steps](/access) if you don't have access to hut or fox.
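Non-interactive jobs can be submitted to the same partition with `sbatch` (a minimal sketch, assuming you submit from hut; adjust the time limit and resources to your needs):

    hut% cat job.sh
    #!/bin/bash
    #SBATCH --partition=fox
    #SBATCH --time=01:00:00
    hostname
    hut% sbatch job.sh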
## CUDA

To use CUDA, you can use the following `flake.nix` placed in a new directory to
load all the required dependencies:

```nix
{
  inputs.jungle.url = "jungle";

  outputs = { jungle, ... }: {
    devShell.x86_64-linux = let
      pkgs = jungle.nixosConfigurations.fox.pkgs;
    in pkgs.mkShell {
      name = "cuda-env-shell";
      buildInputs = with pkgs; [
        git gitRepo gnupg autoconf curl
        procps gnumake util-linux m4 gperf unzip

        # Cuda packages (more at https://search.nixos.org/packages)
        cudatoolkit linuxPackages.nvidia_x11
        cudaPackages.cuda_cudart.static
        cudaPackages.libcusparse

        libGLU libGL
        xorg.libXi xorg.libXmu freeglut
        xorg.libXext xorg.libX11 xorg.libXv xorg.libXrandr zlib
        ncurses5 stdenv.cc binutils
      ];
      shellHook = ''
        export CUDA_PATH=${pkgs.cudatoolkit}
        export LD_LIBRARY_PATH=/var/run/opengl-driver/lib
        export SMS=50
      '';
    };
  };
}
```

Then just run `nix develop` from the same directory:

    % mkdir cuda
    % cd cuda
    % vim flake.nix
    [...]
    % nix develop
    $ nvcc -V
    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2024 NVIDIA Corporation
    Built on Tue_Feb_27_16:19:38_PST_2024
    Cuda compilation tools, release 12.4, V12.4.99
    Build cuda_12.4.r12.4/compiler.33961263_0
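As a quick sanity check of the toolchain (a sketch, assuming the dev shell above and that the NVIDIA driver is loaded on fox), you can compile and run a trivial kernel, which should print one line per GPU thread:

    $ cat > hello.cu <<'EOF'
    #include <cstdio>
    // Minimal kernel: each thread prints its index
    __global__ void hello() { printf("hello from GPU thread %d\n", threadIdx.x); }
    int main() {
        hello<<<1, 4>>>();
        cudaDeviceSynchronize();  // flush device-side printf before exiting
        return 0;
    }
    EOF
    $ nvcc -o hello hello.cu
    $ ./hello
    hello from GPU thread 0
    hello from GPU thread 1
    hello from GPU thread 2
    hello from GPU thread 3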
## Filesystems

The machine has several file systems available.

- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
  Don't abuse it.
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
  capacity. Stores three redundant copies of every file.
- `/nvme{0,1}/$USER`: The two local NVMe disks, very fast and with large capacity.
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
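For example, to work on the first NVMe disk (the per-user directories are created automatically at boot by the `create-nvme-dirs` service on fox):

    fox% mkdir -p /nvme0/$USER/my-experiment
    fox% cd /nvme0/$USER/my-experiment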

web/content/fox/fox.jpg Normal file (binary image, 126 KiB, not shown)
