Compare commits
21 Commits
58ce0e4445
...
3590879c17
Author | SHA1 | Date | |
---|---|---|---|
3590879c17 | |||
31d03ee3ac | |||
4e92b14384 | |||
b90209b4bf | |||
785f7cfee8 | |||
edf744db8d | |||
b82894eaec | |||
1c47199891 | |||
8738bd4eeb | |||
7699783aac | |||
fee1d4da7e | |||
b77ce7fb56 | |||
b4a12625c5 | |||
302106ea9a | |||
96877de8d9 | |||
8878985be6 | |||
737578db34 | |||
88555e3f8c | |||
feb2060be7 | |||
00999434c2 | |||
29d58cc62d |
@ -25,6 +25,7 @@ in
|
||||
bay = mkConf "bay";
|
||||
lake2 = mkConf "lake2";
|
||||
raccoon = mkConf "raccoon";
|
||||
fox = mkConf "fox";
|
||||
};
|
||||
|
||||
packages.x86_64-linux = self.nixosConfigurations.hut.pkgs // {
|
||||
|
3
keys.nix
3
keys.nix
@ -9,10 +9,11 @@ rec {
|
||||
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
|
||||
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
|
||||
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
|
||||
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDa9lId4rB/EKGkkCCVOy0cuId2SYLs+8W8kx0kmpO1y fox";
|
||||
};
|
||||
|
||||
hostGroup = with hosts; rec {
|
||||
compute = [ owl1 owl2 ];
|
||||
compute = [ owl1 owl2 fox ];
|
||||
playground = [ eudy koro ];
|
||||
storage = [ bay lake2 ];
|
||||
monitor = [ hut ];
|
||||
|
@ -68,7 +68,7 @@
|
||||
home = "/home/Computational/anavarro";
|
||||
description = "Antoni Navarro";
|
||||
group = "Computational";
|
||||
hosts = [ "hut" "raccoon" ];
|
||||
hosts = [ "hut" "raccoon" "fox" ];
|
||||
hashedPassword = "$6$QdNDsuLehoZTYZlb$CDhCouYDPrhoiB7/seu7RF.Gqg4zMQz0n5sA4U1KDgHaZOxy2as9pbIGeF8tOHJKRoZajk5GiaZv0rZMn7Oq31";
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILWjRSlKgzBPZQhIeEtk6Lvws2XNcYwHcwPv4osSgst5 anavarro@ssfhead"
|
||||
@ -81,7 +81,7 @@
|
||||
home = "/home/Computational/abonerib";
|
||||
description = "Aleix Boné";
|
||||
group = "Computational";
|
||||
hosts = [ "owl1" "owl2" "hut" "raccoon" ];
|
||||
hosts = [ "owl1" "owl2" "hut" "raccoon" "fox" ];
|
||||
hashedPassword = "$6$V1EQWJr474whv7XJ$OfJ0wueM2l.dgiJiiah0Tip9ITcJ7S7qDvtSycsiQ43QBFyP4lU0e0HaXWps85nqB4TypttYR4hNLoz3bz662/";
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIFiqXqt88VuUfyANkZyLJNiuroIITaGlOOTMhVDKjf abonerib@bsc"
|
||||
@ -113,6 +113,32 @@
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"
|
||||
];
|
||||
};
|
||||
|
||||
dalvare1 = {
|
||||
uid = 2758;
|
||||
isNormalUser = true;
|
||||
home = "/home/Computational/dalvare1";
|
||||
description = "David Álvarez";
|
||||
group = "Computational";
|
||||
hosts = [ "hut" "fox" ];
|
||||
hashedPassword = "$6$mpyIsV3mdq.rK8$FvfZdRH5OcEkUt5PnIUijWyUYZvB1SgeqxpJ2p91TTe.3eQIDTcLEQ5rxeg.e5IEXAZHHQ/aMsR5kPEujEghx0";
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGEfy6F4rF80r4Cpo2H5xaWqhuUZzUsVsILSKGJzt5jF dalvare1@ssfhead"
|
||||
];
|
||||
};
|
||||
|
||||
varcila = {
|
||||
uid = 5650;
|
||||
isNormalUser = true;
|
||||
home = "/home/Computational/varcila";
|
||||
description = "Vincent Arcila";
|
||||
group = "Computational";
|
||||
hosts = [ "hut" "fox" ];
|
||||
hashedPassword = "$6$oB0Tcn99DcM4Ch$Vn1A0ulLTn/8B2oFPi9wWl/NOsJzaFAWjqekwcuC9sMC7cgxEVb.Nk5XSzQ2xzYcNe5MLtmzkVYnRS1CqP39Y0";
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGt0ESYxekBiHJQowmKpfdouw0hVm3N7tUMtAaeLejK vincent@varch"
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
groups = {
|
||||
|
@ -34,37 +34,37 @@
|
||||
# Node Entry for node: mds01 (ID=72)
|
||||
10.0.40.40 bay mds01 mds01-eth0
|
||||
10.0.42.40 bay-ib mds01-ib0
|
||||
10.0.40.141 bay-ipmi mds01-ipmi0
|
||||
10.0.40.141 bay-ipmi mds01-ipmi0 mds01-ipmi
|
||||
|
||||
# Node Entry for node: oss01 (ID=73)
|
||||
10.0.40.41 oss01 oss01-eth0
|
||||
10.0.42.41 oss01-ib0
|
||||
10.0.40.142 oss01-ipmi0
|
||||
10.0.40.142 oss01-ipmi0 oss01-ipmi
|
||||
|
||||
# Node Entry for node: oss02 (ID=74)
|
||||
10.0.40.42 lake2 oss02 oss02-eth0
|
||||
10.0.42.42 lake2-ib oss02-ib0
|
||||
10.0.40.143 lake2-ipmi oss02-ipmi0
|
||||
10.0.40.143 lake2-ipmi oss02-ipmi0 oss02-ipmi
|
||||
|
||||
# Node Entry for node: xeon01 (ID=15)
|
||||
10.0.40.1 owl1 xeon01 xeon01-eth0
|
||||
10.0.42.1 owl1-ib xeon01-ib0
|
||||
10.0.40.101 owl1-ipmi xeon01-ipmi0
|
||||
10.0.40.101 owl1-ipmi xeon01-ipmi0 xeon01-ipmi
|
||||
|
||||
# Node Entry for node: xeon02 (ID=16)
|
||||
10.0.40.2 owl2 xeon02 xeon02-eth0
|
||||
10.0.42.2 owl2-ib xeon02-ib0
|
||||
10.0.40.102 owl2-ipmi xeon02-ipmi0
|
||||
10.0.40.102 owl2-ipmi xeon02-ipmi0 xeon02-ipmi
|
||||
|
||||
# Node Entry for node: xeon03 (ID=17)
|
||||
10.0.40.3 xeon03 xeon03-eth0
|
||||
10.0.42.3 xeon03-ib0
|
||||
10.0.40.103 xeon03-ipmi0
|
||||
10.0.40.103 xeon03-ipmi0 xeon03-ipmi
|
||||
|
||||
# Node Entry for node: xeon04 (ID=18)
|
||||
10.0.40.4 xeon04 xeon04-eth0
|
||||
10.0.42.4 xeon04-ib0
|
||||
10.0.40.104 xeon04-ipmi0
|
||||
10.0.40.104 xeon04-ipmi0 xeon04-ipmi
|
||||
|
||||
# Node Entry for node: xeon05 (ID=19)
|
||||
10.0.40.5 koro xeon05 xeon05-eth0
|
||||
@ -74,17 +74,21 @@
|
||||
# Node Entry for node: xeon06 (ID=20)
|
||||
10.0.40.6 xeon06 xeon06-eth0
|
||||
10.0.42.6 xeon06-ib0
|
||||
10.0.40.106 xeon06-ipmi0
|
||||
10.0.40.106 xeon06-ipmi0 xeon06-ipmi
|
||||
|
||||
# Node Entry for node: xeon07 (ID=21)
|
||||
10.0.40.7 hut xeon07 xeon07-eth0
|
||||
10.0.42.7 hut-ib xeon07-ib0
|
||||
10.0.40.107 hut-ipmi xeon07-ipmi0
|
||||
10.0.40.107 hut-ipmi xeon07-ipmi0 xeon07-ipmi
|
||||
|
||||
# Node Entry for node: xeon08 (ID=22)
|
||||
10.0.40.8 eudy xeon08 xeon08-eth0
|
||||
10.0.42.8 eudy-ib xeon08-ib0
|
||||
10.0.40.108 eudy-ipmi xeon08-ipmi0
|
||||
10.0.40.108 eudy-ipmi xeon08-ipmi0 xeon08-ipmi
|
||||
|
||||
# fox
|
||||
10.0.40.26 fox
|
||||
10.0.40.126 fox-ipmi
|
||||
'';
|
||||
};
|
||||
}
|
||||
|
75
m/fox/configuration.nix
Normal file
75
m/fox/configuration.nix
Normal file
@ -0,0 +1,75 @@
|
||||
{ lib, config, pkgs, ... }:
|
||||
|
||||
{
|
||||
imports = [
|
||||
../common/xeon.nix
|
||||
../module/ceph.nix
|
||||
../module/emulation.nix
|
||||
../module/slurm-client.nix
|
||||
../module/slurm-firewall.nix
|
||||
];
|
||||
|
||||
# Select the disk using the ID to avoid mismatches
|
||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";
|
||||
|
||||
# No swap, there is plenty of RAM
|
||||
swapDevices = lib.mkForce [];
|
||||
|
||||
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
|
||||
boot.kernelModules = [ "kvm-amd" ];
|
||||
|
||||
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
|
||||
hardware.cpu.intel.updateMicrocode = lib.mkForce false;
|
||||
|
||||
networking = {
|
||||
hostName = "fox";
|
||||
interfaces.enp1s0f0np0.ipv4.addresses = [ {
|
||||
address = "10.0.40.26";
|
||||
prefixLength = 24;
|
||||
} ];
|
||||
};
|
||||
|
||||
# Configure Nvidia driver to use with CUDA
|
||||
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
||||
hardware.graphics.enable = true;
|
||||
nixpkgs.config.allowUnfree = true;
|
||||
nixpkgs.config.nvidia.acceptLicense = true;
|
||||
services.xserver.videoDrivers = [ "nvidia" ];
|
||||
|
||||
# Mount NVME disks
|
||||
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
|
||||
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
|
||||
|
||||
# Make a /nvme{0,1}/$USER directory for each user.
|
||||
systemd.services.create-nvme-dirs = let
|
||||
# Take only normal users in fox
|
||||
users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
|
||||
commands = lib.concatLists (lib.mapAttrsToList
|
||||
(_: user: [
|
||||
"install -d -o ${user.name} -g ${user.group} -m 0755 /nvme{0,1}/${user.name}"
|
||||
]) users);
|
||||
script = pkgs.writeShellScript "create-nvme-dirs.sh" (lib.concatLines commands);
|
||||
in {
|
||||
enable = true;
|
||||
wants = [ "local-fs.target" ];
|
||||
after = [ "local-fs.target" ];
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
serviceConfig.ExecStart = script;
|
||||
};
|
||||
|
||||
# Only allow SSH connections from users who have a SLURM allocation
|
||||
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
|
||||
security.pam.services.sshd.rules.account.slurm = {
|
||||
control = "required";
|
||||
enable = true;
|
||||
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
|
||||
args = [ "log_level=debug5" ];
|
||||
order = 999999; # Make it last one
|
||||
};
|
||||
|
||||
# Disable systemd session (pam_systemd.so) as it will conflict with the
|
||||
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
|
||||
# into the slurmstepd task and then into the systemd session, which is not
|
||||
# what we want, otherwise it will linger even if all jobs are gone.
|
||||
security.pam.services.sshd.startSession = lib.mkForce false;
|
||||
}
|
@ -1,8 +1,9 @@
|
||||
{ pkgs, lib, config, ... }:
|
||||
|
||||
{
|
||||
age.secrets.gitlabRunnerShellToken.file = ../../secrets/gitlab-runner-shell-token.age;
|
||||
age.secrets.gitlabRunnerDockerToken.file = ../../secrets/gitlab-runner-docker-token.age;
|
||||
age.secrets.gitlab-pm-shell.file = ../../secrets/gitlab-runner-shell-token.age;
|
||||
age.secrets.gitlab-pm-docker.file = ../../secrets/gitlab-runner-docker-token.age;
|
||||
age.secrets.gitlab-bsc-docker.file = ../../secrets/gitlab-bsc-docker-token.age;
|
||||
|
||||
services.gitlab-runner = {
|
||||
enable = true;
|
||||
@ -21,20 +22,88 @@
|
||||
"--docker-network-mode host"
|
||||
];
|
||||
environmentVariables = {
|
||||
https_proxy = "http://localhost:23080";
|
||||
http_proxy = "http://localhost:23080";
|
||||
https_proxy = "http://hut:23080";
|
||||
http_proxy = "http://hut:23080";
|
||||
};
|
||||
};
|
||||
in {
|
||||
# For pm.bsc.es/gitlab
|
||||
gitlab-pm-shell = common-shell // {
|
||||
authenticationTokenConfigFile = config.age.secrets.gitlabRunnerShellToken.path;
|
||||
authenticationTokenConfigFile = config.age.secrets.gitlab-pm-shell.path;
|
||||
};
|
||||
gitlab-pm-docker = common-docker // {
|
||||
authenticationTokenConfigFile = config.age.secrets.gitlabRunnerDockerToken.path;
|
||||
authenticationTokenConfigFile = config.age.secrets.gitlab-pm-docker.path;
|
||||
};
|
||||
|
||||
gitlab-bsc-docker = {
|
||||
# gitlab.bsc.es still uses the old token mechanism
|
||||
registrationConfigFile = config.age.secrets.gitlab-bsc-docker.path;
|
||||
tagList = [ "docker" "hut" ];
|
||||
environmentVariables = {
|
||||
# We cannot access the hut local interface from docker, so we connect
|
||||
# to hut directly via the ethernet one.
|
||||
https_proxy = "http://hut:23080";
|
||||
http_proxy = "http://hut:23080";
|
||||
};
|
||||
executor = "docker";
|
||||
dockerImage = "alpine";
|
||||
dockerVolumes = [
|
||||
"/nix/store:/nix/store:ro"
|
||||
"/nix/var/nix/db:/nix/var/nix/db:ro"
|
||||
"/nix/var/nix/daemon-socket:/nix/var/nix/daemon-socket:ro"
|
||||
];
|
||||
dockerExtraHosts = [
|
||||
# Required to pass the proxy via hut
|
||||
"hut:10.0.40.7"
|
||||
];
|
||||
dockerDisableCache = true;
|
||||
registrationFlags = [
|
||||
# Increase build log length to 64 MiB
|
||||
"--output-limit 65536"
|
||||
];
|
||||
preBuildScript = pkgs.writeScript "setup-container" ''
|
||||
mkdir -p -m 0755 /nix/var/log/nix/drvs
|
||||
mkdir -p -m 0755 /nix/var/nix/gcroots
|
||||
mkdir -p -m 0755 /nix/var/nix/profiles
|
||||
mkdir -p -m 0755 /nix/var/nix/temproots
|
||||
mkdir -p -m 0755 /nix/var/nix/userpool
|
||||
mkdir -p -m 1777 /nix/var/nix/gcroots/per-user
|
||||
mkdir -p -m 1777 /nix/var/nix/profiles/per-user
|
||||
mkdir -p -m 0755 /nix/var/nix/profiles/per-user/root
|
||||
mkdir -p -m 0700 "$HOME/.nix-defexpr"
|
||||
mkdir -p -m 0700 "$HOME/.ssh"
|
||||
cat > "$HOME/.ssh/config" << EOF
|
||||
Host bscpm04.bsc.es gitlab-internal.bsc.es
|
||||
User git
|
||||
ProxyCommand nc -X connect -x hut:23080 %h %p
|
||||
Host amdlogin1.bsc.es armlogin1.bsc.es hualogin1.bsc.es glogin1.bsc.es glogin2.bsc.es fpgalogin1.bsc.es
|
||||
ProxyCommand nc -X connect -x hut:23080 %h %p
|
||||
EOF
|
||||
cat >> "$HOME/.ssh/known_hosts" << EOF
|
||||
bscpm04.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPx4mC0etyyjYUT2Ztc/bs4ZXSbVMrogs1ZTP924PDgT
|
||||
gitlab-internal.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF9arsAOSRB06hdy71oTvJHG2Mg8zfebADxpvc37lZo3
|
||||
EOF
|
||||
. ${pkgs.nix}/etc/profile.d/nix-daemon.sh
|
||||
# Required to load SSL certificate paths
|
||||
. ${pkgs.cacert}/nix-support/setup-hook
|
||||
'';
|
||||
environmentVariables = {
|
||||
ENV = "/etc/profile";
|
||||
USER = "root";
|
||||
NIX_REMOTE = "daemon";
|
||||
PATH = "${config.system.path}/bin:/bin:/sbin:/usr/bin:/usr/sbin";
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
# DOCKER* chains are useless, override at FORWARD
|
||||
networking.firewall.extraCommands = ''
|
||||
# Allow docker to use our proxy
|
||||
iptables -I FORWARD 1 -p tcp -i docker0 -d hut --dport 23080 -j nixos-fw-accept
|
||||
# Block anything else coming from docker
|
||||
iptables -I FORWARD 2 -p all -i docker0 -j nixos-fw-log-refuse
|
||||
'';
|
||||
|
||||
#systemd.services.gitlab-runner.serviceConfig.Shell = "${pkgs.bash}/bin/bash";
|
||||
systemd.services.gitlab-runner.serviceConfig.DynamicUser = lib.mkForce false;
|
||||
|
@ -1,13 +0,0 @@
|
||||
modules:
|
||||
default:
|
||||
collectors:
|
||||
- bmc
|
||||
- ipmi
|
||||
- chassis
|
||||
|
||||
lan:
|
||||
collectors:
|
||||
- ipmi
|
||||
- chassis
|
||||
user: ""
|
||||
pass: ""
|
@ -12,6 +12,8 @@
|
||||
mode = "400";
|
||||
};
|
||||
|
||||
age.secrets.ipmiYml.file = ../../secrets/ipmi.yml.age;
|
||||
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
settings = {
|
||||
@ -73,7 +75,7 @@
|
||||
enable = true;
|
||||
group = "root";
|
||||
user = "root";
|
||||
configFile = ./ipmi.yml;
|
||||
configFile = config.age.secrets.ipmiYml.path;
|
||||
# extraFlags = [ "--log.level=debug" ];
|
||||
listenAddress = "127.0.0.1";
|
||||
};
|
||||
@ -206,7 +208,7 @@
|
||||
# Sets the "instance" label with the remote host we are querying
|
||||
source_labels = [ "__param_target" ];
|
||||
separator = ";";
|
||||
regex = "(.*)";
|
||||
regex = "(.*)-ipmi"; # Remove "-ipmi" at the end
|
||||
target_label = "instance";
|
||||
replacement = "\${1}";
|
||||
action = "replace";
|
||||
@ -248,6 +250,17 @@
|
||||
module = [ "raccoon" ];
|
||||
};
|
||||
}
|
||||
{
|
||||
job_name = "ipmi-fox";
|
||||
metrics_path = "/ipmi";
|
||||
static_configs = [
|
||||
{ targets = [ "127.0.0.1:9290" ]; }
|
||||
];
|
||||
params = {
|
||||
target = [ "fox-ipmi" ];
|
||||
module = [ "fox" ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
|
@ -12,6 +12,8 @@ let
|
||||
installPhase = ''
|
||||
cp -r public $out
|
||||
'';
|
||||
# Don't mess doc/
|
||||
dontFixup = true;
|
||||
};
|
||||
in
|
||||
{
|
||||
|
@ -1,15 +1,15 @@
|
||||
- targets:
|
||||
- 10.0.40.101
|
||||
- 10.0.40.102
|
||||
- 10.0.40.103
|
||||
- 10.0.40.104
|
||||
- 10.0.40.105
|
||||
- 10.0.40.106
|
||||
- 10.0.40.107
|
||||
- 10.0.40.108
|
||||
- owl1-ipmi
|
||||
- owl2-ipmi
|
||||
- xeon03-ipmi
|
||||
- xeon04-ipmi
|
||||
- koro-ipmi
|
||||
- xeon06-ipmi
|
||||
- hut-ipmi
|
||||
- eudy-ipmi
|
||||
# Storage
|
||||
- 10.0.40.141
|
||||
- 10.0.40.142
|
||||
- 10.0.40.143
|
||||
- bay-ipmi
|
||||
- oss01-ipmi
|
||||
- lake2-ipmi
|
||||
labels:
|
||||
job: ipmi-lan
|
||||
|
@ -43,12 +43,13 @@ in {
|
||||
clusterName = "jungle";
|
||||
nodeName = [
|
||||
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
|
||||
"fox Sockets=2 CoresPerSocket=96 ThreadsPerCore=1 Feature=fox"
|
||||
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
|
||||
];
|
||||
|
||||
partitionName = [
|
||||
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
||||
"all Nodes=owl[1-2],hut Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
||||
"fox Nodes=fox Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
||||
];
|
||||
|
||||
# See slurm.conf(5) for more details about these options.
|
||||
@ -76,7 +77,7 @@ in {
|
||||
SuspendTimeout=60
|
||||
ResumeProgram=${resumeProgram}
|
||||
ResumeTimeout=300
|
||||
SuspendExcNodes=hut
|
||||
SuspendExcNodes=hut,fox
|
||||
|
||||
# Turn the nodes off after 1 hour of inactivity
|
||||
SuspendTime=3600
|
||||
@ -91,9 +92,29 @@ in {
|
||||
# Ignore memory constraints and only use unused cores to share a node with
|
||||
# other jobs.
|
||||
SelectTypeParameters=CR_Core
|
||||
|
||||
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
|
||||
# This sets up the "extern" step into which ssh-launched processes will be
|
||||
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
|
||||
# when a task runs (srun) so we can ssh early.
|
||||
PrologFlags=Alloc,Contain,X11
|
||||
|
||||
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
|
||||
# adopted by the external step, similar to tasks running in regular steps
|
||||
# LaunchParameters=ulimit_pam_adopt
|
||||
SlurmdDebug=debug5
|
||||
#DebugFlags=Protocol,Cgroup
|
||||
'';
|
||||
|
||||
extraCgroupConfig = ''
|
||||
CgroupPlugin=cgroup/v2
|
||||
#ConstrainCores=yes
|
||||
'';
|
||||
};
|
||||
|
||||
# Place the slurm config in /etc as this will be required by PAM
|
||||
environment.etc.slurm.source = config.services.slurm.etcSlurm;
|
||||
|
||||
age.secrets.mungeKey = {
|
||||
file = ../../secrets/munge-key.age;
|
||||
owner = "munge";
|
||||
|
@ -39,6 +39,18 @@ final: prev:
|
||||
# See https://bugs.schedmd.com/show_bug.cgi?id=19324
|
||||
./slurm-rank-expansion.patch
|
||||
];
|
||||
# Install also the pam_slurm_adopt library to restrict users from accessing
|
||||
# nodes with no job allocated.
|
||||
postBuild = (old.postBuild or "") + ''
|
||||
pushd contribs/pam_slurm_adopt
|
||||
make "PAM_DIR=$out/lib/security"
|
||||
popd
|
||||
'';
|
||||
postInstall = (old.postInstall or "") + ''
|
||||
pushd contribs/pam_slurm_adopt
|
||||
make "PAM_DIR=$out/lib/security" install
|
||||
popd
|
||||
'';
|
||||
});
|
||||
|
||||
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
||||
|
Binary file not shown.
@ -1,9 +1,9 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 HY2yRg DQdgCk16Yu524BsrWVf0krnwWzDM6SeaJCgQipOfwCA
|
||||
Ab9ocqra/UWJZI+QGMlxUhBu5AzqfjPgXl+ENIiHYGs
|
||||
-> ssh-ed25519 CAWG4Q KF9rGCenb3nf+wyz2hyVs/EUEbsmUs5R+1fBxlCibC8
|
||||
7++Kxbr3FHVdVfnFdHYdAuR0Tgfd+sRcO6WRss6LhEw
|
||||
-> ssh-ed25519 MSF3dg aUe4DhRsu4X8CFOEAnD/XM/o/0qHYSB522woCaAVh0I
|
||||
GRcs5cm2YqA/lGhUtbpboBaz7mfgiLaCr+agaB7vACU
|
||||
--- 9Q7Ou+Pxq+3RZilCb2dKC/pCFjZEt4rp5KnTUUU7WJ8
|
||||
1¬Mw4‘Í ì:Hµ@Á/ägLtMÇ,߯¥ô*¡žzñNV5ˆm‚ÍNŽoÞáj1$÷TøG_³E{Œ%“‰1ǯ‘<>H£îAÛp™
|
||||
-> ssh-ed25519 HY2yRg eRVX5yndWDLg9hw7sY1Iu8pJFy47luHvdL+zZGK2u1s
|
||||
e1nXXiMW0ywkZYh2s6c7/quGMfBOJOaRhNQDjCD2Iyc
|
||||
-> ssh-ed25519 CAWG4Q gYG7GRxRpJ0/5Wz0Z0J2wfLfkMFNmcy81dQEewM7gUA
|
||||
lamdUdx+xOFWF1lmUM4x9TT0cJtKu9Sp7w9JHwm13u0
|
||||
-> ssh-ed25519 MSF3dg HEzfpR8alG6WPzhaEjAmmjOFoFcMSQUldx46dBsXri4
|
||||
OAD5H/zZGhfevYrFJzJrbNKPomKZDOS9Qx5tmTp78Jo
|
||||
--- A0sMSiNXWaEIgRXR0x6UAIaluuVH6Zlv4CJ9sI0NXOw
|
||||
ÿú6çphóÎÆ{Ñ>®F|ÅiÃvâæE}{ìruÎâÆ·‹Ý°ËÍ}^»‰>ñc6¥´j÷ ùgèGW<47>Ã:—J3ù|ø|†ZÑ
|
11
secrets/gitlab-bsc-docker-token.age
Normal file
11
secrets/gitlab-bsc-docker-token.age
Normal file
@ -0,0 +1,11 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 HY2yRg WSdjyQPzBJ4JbzQpGeq1AAYpWKoXmLI1ZtmNmM5QOzs
|
||||
qGDlDT31DQF1DdHen0+5+52DdsQlabJdA2pOB5O1I6g
|
||||
-> ssh-ed25519 CAWG4Q wioWMDxQjN+d4JdIbCwZg0DLQu1OH2mV6gukRprjuAs
|
||||
670fE61hidOEh20hHiQAhP0+CjDF0WMBNzgwkGT8Yqg
|
||||
-> ssh-ed25519 MSF3dg DN19uvAEtqq4708P6HpuX9i/o/qAvHX6dj69dCF2H1o
|
||||
4Lu9GnjiFLMeXJ2C7aVPJsCHCQVlhylNWJi896Av92s
|
||||
--- 7cKBwOYNOUZ2h3/kAY09aSMASZSxX7hZIT4kvlIiT6w
|
||||
³6—çà•äfQF5=¦bX+‡v e`Ï7/øªA~PÎÖѦ7<15>Ì
|
||||
´ÖA÷)·h³ù=oZ¸$é^´V0ñ/Ü…µr
|
||||
k¸uœbĶ:R‘<52>>^gŒõ¼ik_*%<0B>a7ùKGæ<47>ÐÖçâ&PI¶£n
|
@ -1,9 +1,10 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 HY2yRg 0sEIUEJBJQ0k0rBfHaOEbq1pNBqsPin4Xq85v0ds9jY
|
||||
4wzjLapoOcq53nT2K3hSGED4jTDXci25GLHkl/fL4EI
|
||||
-> ssh-ed25519 CAWG4Q f68ZbJGwXuCZVnqhwbh+8dh0X/MCdjEd+sVtPyBu/hU
|
||||
u2TQreyWQvP6PGuwuUNKA/AL68560flqSlaItN3k41I
|
||||
-> ssh-ed25519 MSF3dg HdrtRW2j7mfkLH0/4aJK5R0cWdjf56HYtEZgzHi9EAs
|
||||
A6MF6tXmSUq2RF2bpmav0GFTRERwluSZGh2snP/KqkA
|
||||
--- drsezqi7J/g8gm6N10SkfeAWnYct99WUraB5djLJqpo
|
||||
gÔ
(ìÐJ!M6¬É3e¸AÜæÃ?\1y÷eüFN\‘<>/MêòªN`K^€+"¤«Y^å>dÒH÷°‡¸†]P…ÓûJ‘`xôã»{Ú±ô„y°ÅÎøSˆéyPX{w‰Sï
ž^5X¶JPô;v‰
|
||||
-> ssh-ed25519 HY2yRg GdmdkW+BqqwBgu30b846jv3J7jtCM+a3rgOERuA050A
|
||||
FeGqM75jG9egesR+yyVKHm0/M+uBBp5Hclg4+qN0BR8
|
||||
-> ssh-ed25519 CAWG4Q a0wTWHgulQUYDAMZmXf3dOf6PdYgCqNtSylzWVVRNVM
|
||||
Bx+WSYaiY4ZwlSZJo2a1XPMQmbKOU7F0tKAqVRLBOPo
|
||||
-> ssh-ed25519 MSF3dg KccUvZZUbxbCrRWUWrX8KcHF6vQ5FV/BqUqI59G7dj4
|
||||
CFr7GXpZ9rPgy7HBfOyiYF9FnZUw6KcZwq9f7/0KaU8
|
||||
--- E0Rp6RR/8+o0jvB1lRdhnlabxvI6uu/IgL2ZpPXzTc8
|
||||
û#ã¶H÷$°F;Ñéù%›È6êË2†¢rfXŸ\Dn ÖшºÈ‰©x™Î>¥Ù&;÷c‘UŠI=›ÑMöÀª?Tœ¡Ç¸ÂÂ"px†Ó\s‚ÙãbFý<46>ù¹WD¼{Ë
|
||||
AW>?U©ÙÊçÐHÔ³
|
@ -1,9 +1,9 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 HY2yRg VY8s9s1zuHOv2axmIacwKg2ozsJnskHTQtslRZ3YI1M
|
||||
fKkJuydLOzF/ciPYSYu4ziSCozdl6sowvDMYZmxqmHY
|
||||
-> ssh-ed25519 CAWG4Q 2ARFd/7RWQ/QOk47FnJFChaVBgoV4LE6EA+JHezkXgg
|
||||
MV4g4Llv8Qcd/wUgJyoNG5AXb6o3aFTrOYGC+lXlSzw
|
||||
-> ssh-ed25519 MSF3dg SKoxWe8Mi8EkBjkESxStOCI5V4C0KYEXIOx7OdENgTA
|
||||
p/owKwQ4e4pcGV+hqej2AfPU5QaM2i8VfxhlkjCM4Z4
|
||||
--- 0VWKU5CQiGbiOtQ2tsZZg88oZm1qcUDEnU5zDTtV+KU
|
||||
ŸÖuµcl÷ª`Ÿ¡Mþ¸'Vk6Yè!Ó=¦LÀ¦yš-ž¬ÁO¢Az«Æ˜VEK¦<4B>‚R†_ÌqL|1V•[)²qœ©„Æ“Lç<4C>DyÌÉ0¹_áßåq)-T,ƪú_9û ”?å<>àûib†1
|
||||
-> ssh-ed25519 HY2yRg xWRxJGWSzA5aplRYCYLB6aBwrUrQQJ2MtDYaD75V5nI
|
||||
J07XF3NQiaYKKKNRcNWi9MloJD2wXHd+2K7bo6lF+QU
|
||||
-> ssh-ed25519 CAWG4Q jNWymbyCczcm8RcaIEbFQBlOMALsuxTl4+pLUi0aR20
|
||||
z5NixlrRD+Y7Z/aFPs6hiDW4/lp8CBQCeJYpbuG9yYM
|
||||
-> ssh-ed25519 MSF3dg QsUQloEKN3k1G49FQnNR/Do6ILgGpjFcw3zu5kk1Ako
|
||||
IHwyFWUEWqCStNcFprnpBa8L5J6zKIsn+7HcgGRv3sM
|
||||
--- oUia0fsL6opeYWACyXtHAu/Ld+bUIt/7S1VszYTvwgU
|
||||
™êVäœ*øtë2-Ÿ7·œ–Ž“§hÜ&‰éÍ¢_!Õ¿+”·±¯(‚ã¡nù¿ ¬í(Ëê÷/}òœäáCúNÍ·|ÇNèuÎ5‰Ã¹å‹šKÀìlÆ"ÃØklOX¨yº÷æØàù¤¹ø²Aíõe„È$
|
BIN
secrets/ipmi.yml.age
Normal file
BIN
secrets/ipmi.yml.age
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -9,8 +9,10 @@ in
|
||||
"gitea-runner-token.age".publicKeys = hut;
|
||||
"gitlab-runner-docker-token.age".publicKeys = hut;
|
||||
"gitlab-runner-shell-token.age".publicKeys = hut;
|
||||
"gitlab-bsc-docker-token.age".publicKeys = hut;
|
||||
"nix-serve.age".publicKeys = hut;
|
||||
"jungle-robot-password.age".publicKeys = hut;
|
||||
"ipmi.yml.age".publicKeys = hut;
|
||||
|
||||
"ceph-user.age".publicKeys = safe;
|
||||
"munge-key.age".publicKeys = safe;
|
||||
|
@ -11,7 +11,7 @@ access to the login machine using a resource petition in the BSC intranet.
|
||||
|
||||
Then, to request access to the machines we will need some information about you:
|
||||
|
||||
1. Which machines you want access to (hut, owl1, owl2, eudy, koro...)
|
||||
1. Which machines you want access to ([hut](/hut), [fox](/fox), owl1, owl2, eudy, koro...)
|
||||
1. Your user name and user id (to match the NFS permissions)
|
||||
1. Your real name and surname (for identification purposes)
|
||||
1. The salted hash of your login password, generated with `mkpasswd -m sha-512`
|
||||
|
10
web/content/doc/_index.md
Normal file
10
web/content/doc/_index.md
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
title: "Docs"
|
||||
description: "Documentation for users of jungle machines"
|
||||
date: 2023-09-15
|
||||
---
|
||||
|
||||
If this is the first time you use any of the jungle machines with NixOS, follow
|
||||
the [quick start guide](quickstart).
|
||||
|
||||
|
234
web/content/doc/quickstart.md
Normal file
234
web/content/doc/quickstart.md
Normal file
@ -0,0 +1,234 @@
|
||||
---
|
||||
title: "Quick start"
|
||||
date: 2023-09-15
|
||||
---
|
||||
|
||||
This documentation will guide you on how to build custom packages of software
|
||||
and use them in the jungle machines. It has been designed to reduce the friction
|
||||
from users coming from module systems.
|
||||
|
||||
You should be able to access the jungle machines, otherwise [request
|
||||
access](/access).
|
||||
|
||||
## Changes from other HPC machines
|
||||
|
||||
Users of other machines have been using the Lmod tool (module load ...) to add
|
||||
or remove programs from their environment, as well as manually building their
|
||||
own software for too many years.
|
||||
|
||||
While we cannot prevent users from continuing to use this tedious mechanism, we
|
||||
have designed the jungle machines to be much easier to operate by using the nix
|
||||
package manager.
|
||||
|
||||
### Freedom to install packages
|
||||
|
||||
When a user wanted to install a package, they were forced to either do it in their
|
||||
own directory, or request a system administrator to install it in a shared
|
||||
directory, so other users can also use that package.
|
||||
|
||||
This situation is gone: each user can install any package of software by
|
||||
themselves, without requiring any other authorization. When two users request
|
||||
the same package, the same copy will be provided.
|
||||
|
||||
A new package will be downloaded if it is available (someone already built it)
|
||||
or will be built from source on demand.
|
||||
|
||||
### No changes over time
|
||||
|
||||
All users retain the same versions of the packages they request until they
|
||||
decide to update them.
|
||||
|
||||
## Using nix to manage packages
|
||||
|
||||
In this chapter we show how to install packages and enter a development shell to
|
||||
build new programs from source. The examples are done from the hut machine,
|
||||
read [this page](/access) to request access.
|
||||
|
||||
### Installing binaries
|
||||
|
||||
To temporarily install new packages, use:
|
||||
|
||||
```text
|
||||
hut% nix shell jungle#gcc jungle#cowsay jungle#ovni
|
||||
```
|
||||
|
||||
Notice that the packages are described as two parts divided by the `#` symbol.
|
||||
The first part defines where to take the package from and the second part is
|
||||
the name of the package. For now we will use `jungle#<package>`. You can find
|
||||
many more packages here:
|
||||
|
||||
<https://search.nixos.org/packages>
|
||||
|
||||
You will now enter a new shell, where those requested package **binaries are
|
||||
available in $PATH**:
|
||||
|
||||
```text
|
||||
hut% cowsay hello world
|
||||
_____________
|
||||
< hello world >
|
||||
-------------
|
||||
\ ^__^
|
||||
\ (oo)\_______
|
||||
(__)\ )\/\
|
||||
||----w |
|
||||
|| ||
|
||||
|
||||
hut% ovniver
|
||||
LD_LIBRARY_PATH not set
|
||||
libovni: build v1.11.0 (a7103f8), dynamic v1.11.0 (a7103f8)
|
||||
|
||||
hut% gcc --version
|
||||
gcc (GCC) 13.3.0
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
This is free software; see the source for copying conditions. There is NO
|
||||
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
```
|
||||
|
||||
### Building programs
|
||||
|
||||
The above method only loads new binaries in the `$PATH`. If we try to build a
|
||||
program that includes headers or links with a library, it will fail to find
|
||||
them:
|
||||
|
||||
```text
|
||||
hut% cat test.c
|
||||
#include <ovni.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
ovni_version_check();
|
||||
return 0;
|
||||
}
|
||||
hut% gcc test.c -lovni -o test
|
||||
test.c:1:10: fatal error: ovni.h: No such file or directory
|
||||
1 | #include <ovni.h>
|
||||
| ^~~~~~~~
|
||||
compilation terminated.
|
||||
```
|
||||
|
||||
We could manually add the full path to the ovni include directory with `-I` and
|
||||
the libraries with `-L`, but there is a tool that already performs these steps
|
||||
automatically for us, `nix develop`.
|
||||
|
||||
Let's go back to our original shell first, where those packages are not
|
||||
available anymore:
|
||||
|
||||
```
|
||||
hut% ps
|
||||
PID TTY TIME CMD
|
||||
2356260 pts/1 00:00:01 zsh
|
||||
2457268 pts/1 00:00:00 zsh
|
||||
2457297 pts/1 00:00:00 ps
|
||||
hut% exit
|
||||
hut% ovniver
|
||||
ovniver: command not found
|
||||
```
|
||||
|
||||
### Creating a flake.nix
|
||||
|
||||
To define which packages we want, we will write a small file that lists them, a
|
||||
flake.nix file.
|
||||
|
||||
First, we will create a new directory where we are going to be working:
|
||||
|
||||
```
|
||||
hut% mkdir example
|
||||
hut% cd example
|
||||
```
|
||||
|
||||
Then place this flake.nix file:
|
||||
|
||||
```nix
|
||||
{
|
||||
inputs.jungle.url = "jungle";
|
||||
outputs = { self, jungle }:
|
||||
let
|
||||
pkgs = jungle.outputs.packages.x86_64-linux;
|
||||
in {
|
||||
devShells.x86_64-linux.default = pkgs.mkShell {
|
||||
pname = "devshell";
|
||||
buildInputs = with pkgs; [
|
||||
ovni gcc cowsay # more packages here...
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Now enter the shell with:
|
||||
|
||||
```
|
||||
hut% nix develop
|
||||
warning: creating lock file '/home/Computational/rarias/example/flake.lock':
|
||||
• Added input 'jungle':
|
||||
'path:/nix/store/27srv8haj6vv4ywrbmw0a8vds561m8rq-source?lastModified=1739479441&narHash=sha256-Kgjs8SO1w9NbPBu8ghwzCxYJ9kvWpoQOT%2BXwPvA9DcU%3D&rev=76396c0d67ef0cf32377d5c1894bb695293bca9d' (2025-02-13)
|
||||
• Added input 'jungle/agenix':
|
||||
'github:ryantm/agenix/f6291c5935fdc4e0bef208cfc0dcab7e3f7a1c41?narHash=sha256-b%2Buqzj%2BWa6xgMS9aNbX4I%2BsXeb5biPDi39VgvSFqFvU%3D' (2024-08-10)
|
||||
• Added input 'jungle/agenix/darwin':
|
||||
'github:lnl7/nix-darwin/4b9b83d5a92e8c1fbfd8eb27eda375908c11ec4d?narHash=sha256-gzGLZSiOhf155FW7262kdHo2YDeugp3VuIFb4/GGng0%3D' (2023-11-24)
|
||||
• Added input 'jungle/agenix/darwin/nixpkgs':
|
||||
follows 'jungle/agenix/nixpkgs'
|
||||
• Added input 'jungle/agenix/home-manager':
|
||||
'github:nix-community/home-manager/3bfaacf46133c037bb356193bd2f1765d9dc82c1?narHash=sha256-7ulcXOk63TIT2lVDSExj7XzFx09LpdSAPtvgtM7yQPE%3D' (2023-12-20)
|
||||
• Added input 'jungle/agenix/home-manager/nixpkgs':
|
||||
follows 'jungle/agenix/nixpkgs'
|
||||
• Added input 'jungle/agenix/nixpkgs':
|
||||
follows 'jungle/nixpkgs'
|
||||
• Added input 'jungle/agenix/systems':
|
||||
'github:nix-systems/default/da67096a3b9bf56a91d16901293e51ba5b49a27e?narHash=sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768%3D' (2023-04-09)
|
||||
• Added input 'jungle/bscpkgs':
|
||||
'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=6782fc6c5b5a29e84a7f2c2d1064f4bcb1288c0f' (2024-11-29)
|
||||
• Added input 'jungle/bscpkgs/nixpkgs':
|
||||
follows 'jungle/nixpkgs'
|
||||
• Added input 'jungle/nixpkgs':
|
||||
'github:NixOS/nixpkgs/9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc?narHash=sha256-i/UJ5I7HoqmFMwZEH6vAvBxOrjjOJNU739lnZnhUln8%3D' (2025-01-14)
|
||||
|
||||
hut$
|
||||
```
|
||||
|
||||
Notice that the long list of messages is Nix creating a new flake.lock file with the
|
||||
current state of the packages. Next invocations will use the same packages as
|
||||
described by the lock file.
|
||||
|
||||
### Building a program from nix develop
|
||||
|
||||
Now let's try again building our test program:
|
||||
|
||||
```text
|
||||
hut$ cat test.c
|
||||
#include <ovni.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
ovni_version_check();
|
||||
return 0;
|
||||
}
|
||||
hut$ gcc test.c -o test -lovni
|
||||
hut$ ldd test
|
||||
linux-vdso.so.1 (0x00007ffff7fc4000)
|
||||
libovni.so.1 => /nix/store/sqk972akjv0q8dchn8ccjln2llzyyfd0-ovni-1.11.0/lib/libovni.so.1 (0x00007ffff7fab000)
|
||||
libc.so.6 => /nix/store/nqb2ns2d1lahnd5ncwmn6k84qfd7vx2k-glibc-2.40-36/lib/libc.so.6 (0x00007ffff7db2000)
|
||||
/nix/store/nqb2ns2d1lahnd5ncwmn6k84qfd7vx2k-glibc-2.40-36/lib/ld-linux-x86-64.so.2 => /nix/store/nqb2ns2d1lahnd5ncwmn6k84qfd7vx2k-glibc-2.40-36/lib64/ld-linux-x86-64.so.2 (0x00007ffff7fc6000)
|
||||
hut$ ./test
|
||||
```
|
||||
|
||||
Now the ovni.h header and the libovni library are found and the program is
|
||||
successfully built, linked and executed.
|
||||
|
||||
You can add more packages as needed in your flake.nix:
|
||||
|
||||
```nix
|
||||
buildInputs = with pkgs; [
|
||||
ovni gcc cowsay # more packages here...
|
||||
];
|
||||
```
|
||||
|
||||
Make sure you exit the develop shell first, and then enter again with `nix
|
||||
develop`.
|
||||
|
||||
## Remember
|
||||
|
||||
- `nix shell` places binaries in the `$PATH`.
|
||||
- `nix develop` enters a development shell where both binaries and the libraries
|
||||
and includes are available so you can build new programs.
|
97
web/content/fox/_index.md
Normal file
97
web/content/fox/_index.md
Normal file
@ -0,0 +1,97 @@
|
||||
---
|
||||
title: "Fox"
|
||||
description: "AMD Genoa 9684X with 2 NVIDIA RTX4000 GPUs"
|
||||
date: 2025-02-12
|
||||
---
|
||||
|
||||

|
||||
|
||||
Picture by [Joanne Redwood](https://web.archive.org/web/20191109175146/https://www.inaturalist.org/photos/6568074),
|
||||
[CC0](http://creativecommons.org/publicdomain/zero/1.0/deed.en).
|
||||
|
||||
The *fox* machine is a big GPU server that is configured to run heavy workloads.
|
||||
It has two fast AMD CPUs with large cache and 2 reasonable NVIDIA GPUs. Here are
|
||||
the detailed specifications:
|
||||
|
||||
- 2x AMD GENOA X 9684X DP/UP 96C/192T 2.55G 1,150M 400W SP5 3D V-Cache
|
||||
- 24x 32GB DDR5-4800 ECC RDIMM (total 768 GiB of RAM)
|
||||
- 1x 2.5" SSD SATA3 MICRON 5400 MAX 480GB
|
||||
- 2x 2.5" KIOXIA CM7-R 1.92TB NVMe GEN5 PCIe 5x4
|
||||
- 2x NVIDIA RTX4000 ADA Gen 20GB GDDR6 PCIe 4.0
|
||||
|
||||
## Access
|
||||
|
||||
To access the machine, request a SLURM session from [hut](/hut) using the `fox`
|
||||
partition:
|
||||
|
||||
hut% salloc -p fox
|
||||
|
||||
Then connect via ssh:
|
||||
|
||||
hut% ssh fox
|
||||
fox%
|
||||
|
||||
Follow [these steps](/access) if you don't have access to hut or fox.
|
||||
|
||||
## CUDA
|
||||
|
||||
To use CUDA, you can use the following `flake.nix` placed in a new directory to
|
||||
load all the required dependencies:
|
||||
|
||||
```nix
|
||||
{
|
||||
inputs.jungle.url = "jungle";
|
||||
|
||||
outputs = { jungle, ... }: {
|
||||
devShell.x86_64-linux = let
|
||||
pkgs = jungle.nixosConfigurations.fox.pkgs;
|
||||
in pkgs.mkShell {
|
||||
name = "cuda-env-shell";
|
||||
buildInputs = with pkgs; [
|
||||
git gitRepo gnupg autoconf curl
|
||||
procps gnumake util-linux m4 gperf unzip
|
||||
|
||||
# Cuda packages (more at https://search.nixos.org/packages)
|
||||
cudatoolkit linuxPackages.nvidia_x11
|
||||
cudaPackages.cuda_cudart.static
|
||||
cudaPackages.libcusparse
|
||||
|
||||
libGLU libGL
|
||||
xorg.libXi xorg.libXmu freeglut
|
||||
xorg.libXext xorg.libX11 xorg.libXv xorg.libXrandr zlib
|
||||
ncurses5 stdenv.cc binutils
|
||||
];
|
||||
shellHook = ''
|
||||
export CUDA_PATH=${pkgs.cudatoolkit}
|
||||
export LD_LIBRARY_PATH=/var/run/opengl-driver/lib
|
||||
export SMS=50
|
||||
'';
|
||||
};
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
Then just run `nix develop` from the same directory:
|
||||
|
||||
% mkdir cuda
|
||||
% cd cuda
|
||||
% vim flake.nix
|
||||
[...]
|
||||
% nix develop
|
||||
$ nvcc -V
|
||||
nvcc: NVIDIA (R) Cuda compiler driver
|
||||
Copyright (c) 2005-2024 NVIDIA Corporation
|
||||
Built on Tue_Feb_27_16:19:38_PST_2024
|
||||
Cuda compilation tools, release 12.4, V12.4.99
|
||||
Build cuda_12.4.r12.4/compiler.33961263_0
|
||||
|
||||
## Filesystems
|
||||
|
||||
The machine has several file systems available.
|
||||
|
||||
- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
|
||||
Don't abuse it.
|
||||
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
|
||||
capacity. Stores three redundant copies of every file.
|
||||
- `/nvme{0,1}/$USER`: The two local NVMe disks, very fast and large capacity.
|
||||
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
|
BIN
web/content/fox/fox.jpg
Normal file
BIN
web/content/fox/fox.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 126 KiB |
@ -3,32 +3,38 @@ languageCode = 'en-us'
|
||||
title = 'The jungle'
|
||||
theme = 'PaperMod'
|
||||
|
||||
[[menu.main]]
|
||||
identifier = "doc"
|
||||
name = "Docs"
|
||||
url = "/doc/"
|
||||
weight = 10
|
||||
|
||||
[[menu.main]]
|
||||
identifier = "grafana"
|
||||
name = "Grafana"
|
||||
url = "/grafana/"
|
||||
weight = 10
|
||||
weight = 20
|
||||
|
||||
[[menu.main]]
|
||||
identifier = "Git"
|
||||
name = "Git"
|
||||
url = "/git/"
|
||||
weight = 20
|
||||
weight = 30
|
||||
|
||||
[[menu.main]]
|
||||
identifier = "Lists"
|
||||
name = "Lists"
|
||||
url = "/lists/"
|
||||
weight = 30
|
||||
weight = 40
|
||||
|
||||
[[menu.main]]
|
||||
identifier = "Paste"
|
||||
name = "Paste"
|
||||
url = "/paste/"
|
||||
weight = 40
|
||||
weight = 50
|
||||
|
||||
[[menu.main]]
|
||||
identifier = "Posts"
|
||||
name = "Posts"
|
||||
url = "/posts/"
|
||||
weight = 50
|
||||
weight = 60
|
||||
|
Loading…
x
Reference in New Issue
Block a user