10 Commits

11 changed files with 469 additions and 9 deletions

View File

@@ -68,7 +68,7 @@
home = "/home/Computational/anavarro";
description = "Antoni Navarro";
group = "Computational";
hosts = [ "hut" "raccoon" ];
hosts = [ "hut" "raccoon" "fox" ];
hashedPassword = "$6$QdNDsuLehoZTYZlb$CDhCouYDPrhoiB7/seu7RF.Gqg4zMQz0n5sA4U1KDgHaZOxy2as9pbIGeF8tOHJKRoZajk5GiaZv0rZMn7Oq31";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILWjRSlKgzBPZQhIeEtk6Lvws2XNcYwHcwPv4osSgst5 anavarro@ssfhead"
@@ -81,7 +81,7 @@
home = "/home/Computational/abonerib";
description = "Aleix Boné";
group = "Computational";
hosts = [ "owl1" "owl2" "hut" "raccoon" ];
hosts = [ "owl1" "owl2" "hut" "raccoon" "fox" ];
hashedPassword = "$6$V1EQWJr474whv7XJ$OfJ0wueM2l.dgiJiiah0Tip9ITcJ7S7qDvtSycsiQ43QBFyP4lU0e0HaXWps85nqB4TypttYR4hNLoz3bz662/";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIFiqXqt88VuUfyANkZyLJNiuroIITaGlOOTMhVDKjf abonerib@bsc"
@@ -113,6 +113,32 @@
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"
];
};
dalvare1 = {
uid = 2758;
isNormalUser = true;
home = "/home/Computational/dalvare1";
description = "David Álvarez";
group = "Computational";
hosts = [ "hut" "fox" ];
hashedPassword = "$6$mpyIsV3mdq.rK8$FvfZdRH5OcEkUt5PnIUijWyUYZvB1SgeqxpJ2p91TTe.3eQIDTcLEQ5rxeg.e5IEXAZHHQ/aMsR5kPEujEghx0";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGEfy6F4rF80r4Cpo2H5xaWqhuUZzUsVsILSKGJzt5jF dalvare1@ssfhead"
];
};
varcila = {
uid = 5650;
isNormalUser = true;
home = "/home/Computational/varcila";
description = "Vincent Arcila";
group = "Computational";
hosts = [ "hut" "fox" ];
hashedPassword = "$6$oB0Tcn99DcM4Ch$Vn1A0ulLTn/8B2oFPi9wWl/NOsJzaFAWjqekwcuC9sMC7cgxEVb.Nk5XSzQ2xzYcNe5MLtmzkVYnRS1CqP39Y0";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGt0ESYxekBiHJQowmKpfdouw0hVm3N7tUMtAaeLejK vincent@varch"
];
};
};
groups = {

View File

@@ -35,4 +35,41 @@
nixpkgs.config.allowUnfree = true;
nixpkgs.config.nvidia.acceptLicense = true;
services.xserver.videoDrivers = [ "nvidia" ];
# Mount NVME disks
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
# Make a /nvme{0,1}/$USER directory for each user.
systemd.services.create-nvme-dirs = let
# Take only normal users in fox
users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
commands = lib.concatLists (lib.mapAttrsToList
(_: user: [
"install -d -o ${user.name} -g ${user.group} -m 0755 /nvme{0,1}/${user.name}"
]) users);
script = pkgs.writeShellScript "create-nvme-dirs.sh" (lib.concatLines commands);
in {
enable = true;
wants = [ "local-fs.target" ];
after = [ "local-fs.target" ];
wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = script;
};
# Only allow SSH connections from users who have a SLURM allocation
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
security.pam.services.sshd.rules.account.slurm = {
control = "required";
enable = true;
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
args = [ "log_level=debug5" ];
order = 999999; # Make it last one
};
# Disable systemd session (pam_systemd.so) as it will conflict with the
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
# into the slurmstepd task and then into the systemd session, which is not
# what we want, otherwise it will linger even if all jobs are gone.
security.pam.services.sshd.startSession = lib.mkForce false;
}

View File

@@ -12,6 +12,8 @@ let
installPhase = ''
cp -r public $out
'';
# Don't mess doc/
dontFixup = true;
};
in
{

View File

@@ -27,6 +27,22 @@ let
done
'';
prolog = pkgs.writeScript "prolog.sh" ''
#!/usr/bin/env bash
echo "hello from the prolog"
exit 0
'';
epilog = pkgs.writeScript "epilog.sh" ''
#!/usr/bin/env bash
echo "hello from the epilog"
exit 0
'';
in {
systemd.services.slurmd.serviceConfig = {
# Kill all processes in the control group on stop/restart. This will kill
@@ -43,7 +59,7 @@ in {
clusterName = "jungle";
nodeName = [
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
"fox Sockets=2 CoresPerSocket=96 ThreadsPerCore=2 Feature=fox"
"fox Sockets=2 CoresPerSocket=96 ThreadsPerCore=1 Feature=fox"
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
];
@@ -93,9 +109,29 @@ in {
# Ignore memory constraints and only use unused cores to share a node with
# other jobs.
SelectTypeParameters=CR_Core
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
# This sets up the "extern" step into which ssh-launched processes will be
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
# when a task runs (srun) so we can ssh early.
PrologFlags=Alloc,Contain,X11
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
# adopted by the external step, similar to tasks running in regular steps
# LaunchParameters=ulimit_pam_adopt
SlurmdDebug=debug5
#DebugFlags=Protocol,Cgroup
'';
extraCgroupConfig = ''
CgroupPlugin=cgroup/v2
#ConstrainCores=yes
'';
};
# Place the slurm config in /etc as this will be required by PAM
environment.etc.slurm.source = config.services.slurm.etcSlurm;
age.secrets.mungeKey = {
file = ../../secrets/munge-key.age;
owner = "munge";

View File

@@ -39,6 +39,18 @@ final: prev:
# See https://bugs.schedmd.com/show_bug.cgi?id=19324
./slurm-rank-expansion.patch
];
# Install also the pam_slurm_adopt library to restrict users from accessing
# nodes with no job allocated.
postBuild = (old.postBuild or "") + ''
pushd contribs/pam_slurm_adopt
make "PAM_DIR=$out/lib/security"
popd
'';
postInstall = (old.postInstall or "") + ''
pushd contribs/pam_slurm_adopt
make "PAM_DIR=$out/lib/security" install
popd
'';
});
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };

View File

@@ -11,7 +11,7 @@ access to the login machine using a resource petition in the BSC intranet.
Then, to request access to the machines we will need some information about you:
1. Which machines you want access to (hut, owl1, owl2, eudy, koro...)
1. Which machines you want access to ([hut](/hut), [fox](/fox), owl1, owl2, eudy, koro...)
1. Your user name and user id (to match the NFS permissions)
1. Your real name and surname (for identification purposes)
1. The salted hash of your login password, generated with `mkpasswd -m sha-512`

10
web/content/doc/_index.md Normal file
View File

@@ -0,0 +1,10 @@
---
title: "Docs"
description: "Documentation for users of jungle machines"
date: 2023-09-15
---
If this is the first time you use any of the jungle machines with NixOS, follow
the [quick start guide](quickstart).

View File

@@ -0,0 +1,234 @@
---
title: "Quick start"
date: 2023-09-15
---
This documentation will guide you on how to build custom packages of software
and use them in the jungle machines. It has been designed to reduce the friction
from users coming from module systems.
You should be able to access the jungle machines, otherwise [request
access](/access).
## Changes from other HPC machines
Users of other machines have been using the Lmod tool (module load ...) to add
or remove programs from their environment, as well as manually building their
own software for too many years.
While we cannot prevent users from continuing to use this tedious mechanism, we
have designed the jungle machines to be much easier to operate by using the nix
package manager.
### Freedom to install packages
When a user wanted to install a package, it was forced to either do it on its
own directory, or request a system administrator to install it in a shared
directory, so other users can also use that package.
This situation is gone, each user can install any package of software by
themselves, without requiring any other authorization. When two users request
the same package, the same copy will be provided.
A new package will be downloaded if it is available (someone already built it)
or will be built from source on demand.
### No changes over time
All users retain the same versions of the packages they request until they
decide to update them.
## Using nix to manage packages
In this chapter we show how to install packages and enter a development shell to
build new programs from source. The examples are done from the hut machine,
read [this page](/access) to request access.
### Installing binaries
To temporarily install new packages, use:
```text
hut% nix shell jungle#gcc jungle#cowsay jungle#ovni
```
Notice that the packages are described as two parts divided by the `#` symbol.
The first part defines where to take the package from and the second part is
the name of the package. For now we will use `jungle#<package>`. You can find
many more packages here:
<https://search.nixos.org/packages>
You will now enter a new shell, where those requested package **binaries are
available in $PATH**:
```text
hut% cowsay hello world
_____________
< hello world >
-------------
\ ^__^
\ (oo)\_______
(__)\ )\/\
||----w |
|| ||
hut% ovniver
LD_LIBRARY_PATH not set
libovni: build v1.11.0 (a7103f8), dynamic v1.11.0 (a7103f8)
hut% gcc --version
gcc (GCC) 13.3.0
Copyright (C) 2023 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
```
### Building programs
The above method only loads new binaries in the `$PATH`. If we try to build a
program that includes headers or links with a library, it will fail to find
them:
```text
hut$ cat test.c
#include <ovni.h>
int main()
{
ovni_version_check();
return 0;
}
hut% gcc test.c -lovni -o test
test.c:1:10: fatal error: ovni.h: No such file or directory
1 | #include <ovni.h>
| ^~~~~~~~
compilation terminated.
```
We could manually add the full path to the ovni include directory with `-I` and
the libraries with `-L`, but there is a tool that already perform these steps
automatically for us, `nix develop`.
Let's go back to our original shell first, where those packages are not
available anymore:
```
hut% ps
PID TTY TIME CMD
2356260 pts/1 00:00:01 zsh
2457268 pts/1 00:00:00 zsh
2457297 pts/1 00:00:00 ps
hut% exit
hut% ovniver
ovniver: command not found
```
### Creating a flake.nix
To define which packages we want, we will write a small file that list them, a
flake.nix file.
First, we will create a new directory where we are going to be working:
```
hut% mkdir example
hut% cd exmple
```
Then place this flake.nix file:
```nix
{
inputs.jungle.url = "jungle";
outputs = { self, jungle }:
let
pkgs = jungle.outputs.packages.x86_64-linux;
in {
devShells.x86_64-linux.default = pkgs.mkShell {
pname = "devshell";
buildInputs = with pkgs; [
ovni gcc cowsay # more packages here...
];
};
};
}
```
Now enter the shell with:
```
hut% nix develop
warning: creating lock file '/home/Computational/rarias/example/flake.lock':
• Added input 'jungle':
'path:/nix/store/27srv8haj6vv4ywrbmw0a8vds561m8rq-source?lastModified=1739479441&narHash=sha256-Kgjs8SO1w9NbPBu8ghwzCxYJ9kvWpoQOT%2BXwPvA9DcU%3D&rev=76396c0d67ef0cf32377d5c1894bb695293bca9d' (2025-02-13)
• Added input 'jungle/agenix':
'github:ryantm/agenix/f6291c5935fdc4e0bef208cfc0dcab7e3f7a1c41?narHash=sha256-b%2Buqzj%2BWa6xgMS9aNbX4I%2BsXeb5biPDi39VgvSFqFvU%3D' (2024-08-10)
• Added input 'jungle/agenix/darwin':
'github:lnl7/nix-darwin/4b9b83d5a92e8c1fbfd8eb27eda375908c11ec4d?narHash=sha256-gzGLZSiOhf155FW7262kdHo2YDeugp3VuIFb4/GGng0%3D' (2023-11-24)
• Added input 'jungle/agenix/darwin/nixpkgs':
follows 'jungle/agenix/nixpkgs'
• Added input 'jungle/agenix/home-manager':
'github:nix-community/home-manager/3bfaacf46133c037bb356193bd2f1765d9dc82c1?narHash=sha256-7ulcXOk63TIT2lVDSExj7XzFx09LpdSAPtvgtM7yQPE%3D' (2023-12-20)
• Added input 'jungle/agenix/home-manager/nixpkgs':
follows 'jungle/agenix/nixpkgs'
• Added input 'jungle/agenix/nixpkgs':
follows 'jungle/nixpkgs'
• Added input 'jungle/agenix/systems':
'github:nix-systems/default/da67096a3b9bf56a91d16901293e51ba5b49a27e?narHash=sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768%3D' (2023-04-09)
• Added input 'jungle/bscpkgs':
'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=6782fc6c5b5a29e84a7f2c2d1064f4bcb1288c0f' (2024-11-29)
• Added input 'jungle/bscpkgs/nixpkgs':
follows 'jungle/nixpkgs'
• Added input 'jungle/nixpkgs':
'github:NixOS/nixpkgs/9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc?narHash=sha256-i/UJ5I7HoqmFMwZEH6vAvBxOrjjOJNU739lnZnhUln8%3D' (2025-01-14)
hut$
```
Notice that long list of messages is Nix creating a new flake.lock file with the
current state of the packages. Next invocations will use the same packages as
described by the lock file.
### Building a program from nix develop
Now let's try again building our test program:
```text
hut$ cat test.c
#include <ovni.h>
int main()
{
ovni_version_check();
return 0;
}
hut$ gcc test.c -o test -lovni
hut$ ldd test
linux-vdso.so.1 (0x00007ffff7fc4000)
libovni.so.1 => /nix/store/sqk972akjv0q8dchn8ccjln2llzyyfd0-ovni-1.11.0/lib/libovni.so.1 (0x00007ffff7fab000)
libc.so.6 => /nix/store/nqb2ns2d1lahnd5ncwmn6k84qfd7vx2k-glibc-2.40-36/lib/libc.so.6 (0x00007ffff7db2000)
/nix/store/nqb2ns2d1lahnd5ncwmn6k84qfd7vx2k-glibc-2.40-36/lib/ld-linux-x86-64.so.2 => /nix/store/nqb2ns2d1lahnd5ncwmn6k84qfd7vx2k-glibc-2.40-36/lib64/ld-linux-x86-64.so.2 (0x00007ffff7fc6000)
hut$ ./test
```
Now the ovni.h header and the libovni library are found and the program is
successfully built, linked and executed.
You can add more packages as needed in your flake.nix:
```nix
buildInputs = with pkgs; [
ovni gcc cowsay # more packages here...
];
```
Make sure you exit the develop shell first, and then enter again with `nix
develop`.
## Remember
- `nix shell` places binaries in the `$PATH`.
- `nix develop` enters a development shell where both binaries and the libraries
and includes are available so you can build new programs.

97
web/content/fox/_index.md Normal file
View File

@@ -0,0 +1,97 @@
---
title: "Fox"
description: "AMD Genoa 9684X with 2 NVIDIA RTX4000 GPUs"
date: 2025-02-12
---
![Fox](fox.jpg)
Picture by [Joanne Redwood](https://web.archive.org/web/20191109175146/https://www.inaturalist.org/photos/6568074),
[CC0](http://creativecommons.org/publicdomain/zero/1.0/deed.en).
The *fox* machine is a big GPU server that is configured to run heavy workloads.
It has two fast AMD CPUs with large cache and 2 reasonable NVIDIA GPUs. Here are
the detailed specifications:
- 2x AMD GENOA X 9684X DP/UP 96C/192T 2.55G 1,150M 400W SP5 3D V-cach
- 24x 32GB DDR5-4800 ECC RDIMM (total 768 GiB of RAM)
- 1x 2.5" SSD SATA3 MICRON 5400 MAX 480GB
- 2x 2.5" KIOXIA CM7-R 1.92TB NVMe GEN5 PCIe 5x4
- 2x NVIDIA RTX4000 ADA Gen 20GB GDDR6 PCIe 4.0
## Access
To access the machine, request a SLURM session from [hut](/hut) using the `fox`
partition:
hut% salloc -p fox
Then connect via ssh:
hut% ssh fox
fox%
Follow [these steps](/access) if you don't have access to hut or fox.
## CUDA
To use CUDA, you can use the following `flake.nix` placed in a new directory to
load all the required dependencies:
```nix
{
inputs.jungle.url = "jungle";
outputs = { jungle, ... }: {
devShell.x86_64-linux = let
pkgs = jungle.nixosConfigurations.fox.pkgs;
in pkgs.mkShell {
name = "cuda-env-shell";
buildInputs = with pkgs; [
git gitRepo gnupg autoconf curl
procps gnumake util-linux m4 gperf unzip
# Cuda packages (more at https://search.nixos.org/packages)
cudatoolkit linuxPackages.nvidia_x11
cudaPackages.cuda_cudart.static
cudaPackages.libcusparse
libGLU libGL
xorg.libXi xorg.libXmu freeglut
xorg.libXext xorg.libX11 xorg.libXv xorg.libXrandr zlib
ncurses5 stdenv.cc binutils
];
shellHook = ''
export CUDA_PATH=${pkgs.cudatoolkit}
export LD_LIBRARY_PATH=/var/run/opengl-driver/lib
export SMS=50
'';
};
};
}
```
Then just run `nix develop` from the same directory:
% mkdir cuda
% cd cuda
% vim flake.nix
[...]
% nix develop
$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Feb_27_16:19:38_PST_2024
Cuda compilation tools, release 12.4, V12.4.99
Build cuda_12.4.r12.4/compiler.33961263_0
## Filesystems
The machine has several file systems available.
- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
Don't abuse.
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
capacity. Stores three redundant copies of every file.
- `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.

BIN
web/content/fox/fox.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

View File

@@ -3,32 +3,38 @@ languageCode = 'en-us'
title = 'The jungle'
theme = 'PaperMod'
[[menu.main]]
identifier = "doc"
name = "Docs"
url = "/doc/"
weight = 10
[[menu.main]]
identifier = "grafana"
name = "Grafana"
url = "/grafana/"
weight = 10
weight = 20
[[menu.main]]
identifier = "Git"
name = "Git"
url = "/git/"
weight = 20
weight = 30
[[menu.main]]
identifier = "Lists"
name = "Lists"
url = "/lists/"
weight = 30
weight = 40
[[menu.main]]
identifier = "Paste"
name = "Paste"
url = "/paste/"
weight = 40
weight = 50
[[menu.main]]
identifier = "Posts"
name = "Posts"
url = "/posts/"
weight = 50
weight = 60