Compare commits

...

491 Commits

Author SHA1 Message Date
43a1b25c23
Set strictDeps=true on our top level packages 2025-10-07 16:27:46 +02:00
44cc60fcd8 Update license year range to 2025
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:32 +02:00
ca48ce556c Update gitlab CI after merge
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:30 +02:00
e8ac9dfb64 Upgrade README after bscpkgs merge
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:28 +02:00
188ba6df0a Remove bscpkgs input
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:26 +02:00
b1a37ae1fe Enable unfree packages in nixpkgs config
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:24 +02:00
63822bb054 Move the rest of packages to main overlay
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:23 +02:00
b94a1493d5 Merge flake.nix with bscpkgs
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:21 +02:00
826d6a28ef Move slurm to pkgs/
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:19 +02:00
ae6b0ae161 Move MPICH to pkgs/mpich and set as default
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-07 16:07:01 +02:00
01986c376b Merge remote-tracking branch 'bscpkgs/master' into merge-bscpkgs 2025-10-03 13:47:04 +02:00
e42058f08b Allow access to hut from fox
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-02 17:03:21 +02:00
f3bfe89f27 Fetch website from its own git repository
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-02 15:45:21 +02:00
ee6f981006 Add script to trim the repository
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-02 15:44:56 +02:00
92ee4a09d7 Rename test to tests and tests to testList
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
Tested-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:53:09 +02:00
34f4b6aa37 Move bsc-ci test into let
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:33 +02:00
2f2d6cbea8 Rework bsc-ci
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:31 +02:00
69b09b6dda Add riscv64 cross compilation to bsc-ci and hydra
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:29 +02:00
a737d725ed Put helper attrs of ompss2 drv to passthru
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:28 +02:00
6c1d1f3b2b Remove gcc from tampi *buildInputs
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:26 +02:00
f338ef47d5 Fix strictDeps ovni
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:25 +02:00
239e84c40c Fix strictDeps osu
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:23 +02:00
ed820e79f8 Fix strictDeps mercurium
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:22 +02:00
afeb415c98 Fix strictDeps tampi
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:20 +02:00
256b24b97b Fix strictDeps sonar
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:18 +02:00
492f73b600 Fix strictDeps nanos6
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:17 +02:00
76ddd85afe Fix strictDeps paraver
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:15 +02:00
7affb8ef4b Fix strictDeps ompss2
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:13 +02:00
4ba823e5b7 Fix strictDeps intel 2023
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:11 +02:00
51eecde59e Fix strictDeps bench6
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:45:08 +02:00
9eb5c486ba Fix strictDeps bigotes
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 15:43:58 +02:00
5df49dcfab Add gitea CI configuration
Builds the .#bsc-ci.all target on each PR. Causes all packages to be
built in hut, populating the nix cache.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
Tested-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-10-01 14:59:25 +02:00
b040bebd1d Add acinca user
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-10-01 12:27:43 +02:00
f69629d2da Restart slurmd on failure
A failure to reach the control node can cause slurmd to fail and the
unit remains in the failed state until it is manually restarted. Instead,
try to restart the service every 30 seconds, forever:

    owl1% systemctl show slurmd | grep -E 'Restart=|RestartUSec='
    Restart=on-failure
    RestartUSec=30s
    owl1% pgrep slurmd
    5903
    owl1% sudo kill -SEGV 5903
    owl1% pgrep slurmd
    6137

Fixes: rarias/jungle#177
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-30 17:20:39 +02:00
0668f0db74 Lower connect timeout when using hut substituter
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-09-29 18:44:48 +02:00
5fcd57a061 Use hut substituter in all nodes
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-09-29 18:44:38 +02:00
ad1544759f Remove machine access for user csiringo
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-09-29 18:23:24 +02:00
e1c950a530 Mount apex /home via NFS in raccoon
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-26 12:28:53 +02:00
f9632c37f8 Remove extra SSH jump configuration
We now have direct visibility among nodes so we don't need any extra
SSH configuration to reach them.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-26 12:28:51 +02:00
1f0cb4ae76 Add raccoon peer to wireguard
It routes traffic from fox, apex and the compute nodes so that we can
reach the git servers and tent.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-26 12:28:48 +02:00
d49d078bed Add raccoon host key
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-26 12:28:46 +02:00
e98fdb89ab Restrict fox peer to a single IP
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-26 12:28:43 +02:00
6afe05b5fd Use lowercase peer hostnames
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-26 12:28:25 +02:00
7d5aebf882 Share a public folder for documents
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:59:40 +02:00
94cbfd38a6 Fix AMDuProfPcm so it finds libnuma.so
We change the search procedure so it detects NixOS from /etc/os-release
and uses "libnuma.so" when calling dlopen, instead of hardcoding a full
path to /usr. The full path of libnuma is stored in the runpath, so
dlopen can find it.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
Tested-by: Vincent Arcila <vincent.arcila@bsc.es>
2025-09-19 10:54:36 +02:00
4da7780472 Add amd_hsmp module in fox for AMD uProf
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:24 +02:00
a6dfc267fd Fix hidden dependencies for AMDuProfSys
It tries to dlopen libcrypt.so.1 and libstdc++.so.6, so we make sure
they are available by adding them to the runpath.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:19 +02:00
d6126501ba Disable NMI watchdog in fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:17 +02:00
ac0deb47b6 Fix amd-uprof dependencies with patchelf
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:15 +02:00
f7d676de77 Fix hrtimer new interface
The hrtimer_init() is now done via hrtimer_setup() with the callback
function as argument.

See: https://lwn.net/Articles/996598/
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:09 +02:00
cf1db201b2 Use CFLAGS_MODULE instead of EXTRA_CFLAGS
Fixes the build in Linux 6.15.6, as it was not able to find the include
files.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:07 +02:00
e6e4846529 Add AMD uProf module and enable it in fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:54:05 +02:00
084d556c56 Add AMD uProf package and driver
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-09-19 10:53:49 +02:00
ff0fc18d0a Mount home via NFS from apex in fox
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 15:34:02 +02:00
19c7e32678 Allow access to NFS via wireguard subnet
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 15:33:47 +02:00
017c19e7d0 Use 10.106.0.0/24 subnet to avoid collisions
The 106 byte is the code for 'j' (jungle) in ASCII:

	% printf j | od -t d
	0000000         106
	0000001

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:03:13 +02:00
a36eff8749 Revert "Remove pam_slurm_adopt from fox"
This reverts commit 1eac0fcad8211195499bc566e6c70312b31af700.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:03:06 +02:00
df17b11458 Enable fail2ban in fox
Protect fox against ssh bruteforce attacks:

fox% sudo lastb | head
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:25 - 11:25  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:24 - 11:24  (00:00)
root     ssh:notty    200.124.28.102   Mon Sep  1 11:24 - 11:24  (00:00)

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:03:02 +02:00
0dc7b7eb3d Accept connections from apex to fox slurmd
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:03:00 +02:00
dff6eaf587 Accept fox connection to slurm controller
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:59 +02:00
4b6b67b587 Add fox machine to SLURM
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:57 +02:00
20e7d244d1 Rekey secrets with trusted fox key
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:55 +02:00
c5d3b8e7f0 Trust fox for compute node secrets
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:52 +02:00
6bbfb0d124 Make apex host specific to each machine
Allows direct contact via the VPN when accessing from fox, but use
Internet when using the rest of the machines.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:49 +02:00
46d03d5ca7 Add local host fox in apex
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:46 +02:00
e366e6ce87 Enable wireguard in apex
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:43 +02:00
e415f70bbb Add wireguard server in fox
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-09-03 12:02:38 +02:00
200c727bbf Use writeShellScript for suspend.sh and resume.sh
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-08-29 12:35:28 +02:00
7413021440 Add firewall rules to slurm server
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-08-29 12:35:26 +02:00
20b4805335 Remove hut from slurm
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-08-29 12:35:24 +02:00
f7dff9deab Only configure apex as slurm server
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-08-29 12:35:22 +02:00
f569933732 Split slurm configuration for client and server
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-08-29 12:35:20 +02:00
ee895d2e4f Move slurm control server to apex
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-08-29 12:35:16 +02:00
5ee8623af2 Fix typo in csiringo ssh key
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-08-27 17:44:20 +02:00
a0e4b209b0 Enable nix-ld in weasel
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-08-27 16:19:34 +02:00
ce25867421 Add csiringo user with access to apex and weasel
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-08-27 16:02:26 +02:00
f89bba35a6 Access gitlab via raccoon in fox
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-08-27 15:27:38 +02:00
d591721a61 Move StartLimit* options to unit section
The StartLimitBurst and StartLimitIntervalSec options belong to the
[Unit] section, otherwise they are ignored in [Service]:

> Unknown key 'StartLimitIntervalSec' in section [Service], ignoring.

When using [Unit], the limits are properly set:

  apex% systemctl show power-policy.service | grep StartLimit
  StartLimitIntervalUSec=10min
  StartLimitBurst=10
  StartLimitAction=none

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 14:32:46 +02:00
343b4f155e Set power policy to always turn on
In all machines, as soon as we recover the power, turn the machine back
on. We cannot rely on the previous state as we will shut them down
before the power is cut to prevent damage on the power supply
monitoring circuit.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:38 +02:00
39a211a846 Add NixOS module to control power policy
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:36 +02:00
142985c505 Move August shutdown to 3rd at 22h
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:33 +02:00
3f3dc2d037 Disable automatic August shutdown for Fox
The UPC has different dates for the yearly power cut, and Fox can
recover properly from a power loss, so we don't need to have it turned
off before the power cut. Simply disabling the timer is enough.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-24 11:22:10 +02:00
3269d763aa Add cudainfo program to test CUDA
The cudainfo program checks that we can initialize the CUDA RT library
and communicate with the driver. It can be used as standalone program or
built with cudainfo.gpuCheck so it is executed inside the build sandbox
to see if it also works fine. It uses the autoAddDriverRunpath hook to
inject in the runpath the location of the library directory for CUDA
libraries.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-23 11:52:09 +02:00
f2d8ee8552 Add missing symlink in cuda sandbox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-23 11:51:47 +02:00
8d984a0672 Enable cuda systemFeature in raccoon and fox
This allows running derivations which depend on cuda runtime without
breaking the sandbox. We only need to add `requiredSystemFeatures = [ "cuda" ];`
to the derivation.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-07-22 17:07:13 +02:00
f3733418b2 Move shared nvidia settings to a separate module
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-07-22 17:06:45 +02:00
ce8b05b142 Replace xeon07 by hut in ssh config
The xeon07 machine has been renamed to hut.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-07-21 18:10:08 +02:00
4a5787e0c6 Enable automatic Nix GC in raccoon
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-21 17:58:26 +02:00
6c11093033 Select proprietary NVIDIA driver in raccoon
The NVIDIA GTX 960 from 2016 has the Maxwell architecture, and NixOS
suggests using the proprietary driver for older than Turing:

> It is suggested to use the open source kernel modules on Turing or
> later GPUs (RTX series, GTX 16xx), and the closed source modules
> otherwise.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-21 17:58:21 +02:00
750504744f Enable open source NVidia driver in fox
It is recommended for newer versions.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-18 09:57:38 +02:00
c26ec1b6f1 Remove option allowUnfree from fox and raccoon
It is already set to true for all machines.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-18 09:57:21 +02:00
2ef32f773c Ban another scanner trying to connect via SSH
It is constantly spamming our logs:

apex# journalctl | grep 'Connection closed by 84.88.52.176' | wc -l
2255

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-18 09:51:49 +02:00
fc9fcd602a Update weasel IPMI hostname for monitoring
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-18 09:51:21 +02:00
0e37ab5fe1 Remove merged MPICH patch
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-07-16 13:07:12 +02:00
a1b387e454 Remove package ix as it is gone
Fails with: "error: ix has been removed from Nixpkgs, as the ix.io
pastebin has been offline since Dec. 2023".

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-07-16 13:07:06 +02:00
380abe9957 flake.lock: Update
Flake lock file updates:

• Updated input 'agenix':
    'github:ryantm/agenix/f6291c5935fdc4e0bef208cfc0dcab7e3f7a1c41?narHash=sha256-b%2Buqzj%2BWa6xgMS9aNbX4I%2BsXeb5biPDi39VgvSFqFvU%3D' (2024-08-10)
  → 'github:ryantm/agenix/531beac616433bac6f9e2a19feb8e99a22a66baf?narHash=sha256-9P1FziAwl5%2B3edkfFcr5HeGtQUtrSdk/MksX39GieoA%3D' (2025-06-17)
• Updated input 'agenix/darwin':
    'github:lnl7/nix-darwin/4b9b83d5a92e8c1fbfd8eb27eda375908c11ec4d?narHash=sha256-gzGLZSiOhf155FW7262kdHo2YDeugp3VuIFb4/GGng0%3D' (2023-11-24)
  → 'github:lnl7/nix-darwin/43975d782b418ebf4969e9ccba82466728c2851b?narHash=sha256-dyN%2BteG9G82G%2Bm%2BPX/aSAagkC%2BvUv0SgUw3XkPhQodQ%3D' (2025-04-12)
• Updated input 'agenix/home-manager':
    'github:nix-community/home-manager/3bfaacf46133c037bb356193bd2f1765d9dc82c1?narHash=sha256-7ulcXOk63TIT2lVDSExj7XzFx09LpdSAPtvgtM7yQPE%3D' (2023-12-20)
  → 'github:nix-community/home-manager/abfad3d2958c9e6300a883bd443512c55dfeb1be?narHash=sha256-YZCh2o9Ua1n9uCvrvi5pRxtuVNml8X2a03qIFfRKpFs%3D' (2025-04-24)
• Updated input 'bscpkgs':
    'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=6782fc6c5b5a29e84a7f2c2d1064f4bcb1288c0f' (2024-11-29)
  → 'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=9d1944c658929b6f98b3f3803fead4d1b91c4405' (2025-06-11)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc?narHash=sha256-i/UJ5I7HoqmFMwZEH6vAvBxOrjjOJNU739lnZnhUln8%3D' (2025-01-14)
  → 'github:NixOS/nixpkgs/dfcd5b901dbab46c9c6e80b265648481aafb01f8?narHash=sha256-Kt1UIPi7kZqkSc5HVj6UY5YLHHEzPBkgpNUByuyxtlw%3D' (2025-07-13)

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-07-16 13:07:01 +02:00
37c12783bb Upgrade nixpkgs to nixos 25.05
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-07-16 13:06:40 +02:00
7379e84e79 Silently ban OpenVAS BSC scanner from apex
It is spamming our logs with refused connection lines:

apex% sudo journalctl -b0 | grep 'refused connection.*SRC=192.168.8.16' | wc -l
13945

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 17:40:41 +02:00
b802f88df9 Rotate anavarro password and SSH key
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 17:24:41 +02:00
bd94c4ad00 Add weasel machine configuration
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 17:24:38 +02:00
570c6e175d Remove extra flush commands on firewall stop
They are not needed as they are already flushed when the firewall
starts or stops.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:45 +02:00
96661dd0d4 Prevent accidental use of nftables
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:42 +02:00
28db7799ea Add proxy configuration for internal hosts
Access internal hosts via apex proxy. From the compute nodes we first
open an SSH connection to apex, and then tunnel it through the HTTP
proxy with netcat.

This way we allow reaching internal GitLab repositories without
requiring the user to have credentials in the remote host, while we can
use multiple remotes to provide redundancy.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:36 +02:00
508059c99e Remove unused blackbox configuration modules
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:30 +02:00
b9f9cc7d7a Use IPv4 in blackbox probes
Otherwise they simply fail as IPv6 doesn't work.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:26 +02:00
eae0c7cb59 Make NFS mount async to improve latency
Don't wait to flush writes, as we don't care about consistency on a
crash:

> This option allows the NFS server to violate the NFS protocol and
> reply to requests before any changes made by that request have been
> committed to stable storage (e.g. disc drive).
>
> Using this option usually improves performance, but at the cost that
> an unclean server restart (i.e. a crash) can cause data to be lost or
> corrupted.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:20 +02:00
2280635cd6 Disable root_squash from NFS
Allows root to read files in the NFS export, so we can directly run
`nixos-rebuild switch` from /home.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:16 +02:00
16ada09600 Remove SSH proxy to access BSC clusters
We now have direct connection to them.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:13 +02:00
0d291d715c Add users to apex machine
They need to be able to login to apex to access any other machine from
the SSF rack.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:09 +02:00
66001f76f7 Remove proxy from hut HTTP probes
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:04 +02:00
1e3b85067d Remove proxy configuration from environment
All machines have now direct connection with the outside world.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:18:00 +02:00
36ee1f3adc Add storcli utility to apex
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:17:57 +02:00
25e9c071b0 Add new configuration for apex
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-15 11:17:43 +02:00
80cee2dbd0 Add pmartin1 user with access to fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-03 11:16:43 +02:00
ee92934c74 Add access to fox for rpenacob user
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 16:58:53 +02:00
db0f3fed91 Revert "Only allow Vincent to access fox for now"
This reverts commit e9e3704b677baed1649583f25e4e1bc050a9534e.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 16:58:49 +02:00
adeaa0484d Add all terminfo files in environment
Fixes problems with the kitty terminal when opening vim or kakoune.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-07-02 16:02:45 +02:00
815810830e Monitor Fox BMC with ICMP probes too
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:51:22 +02:00
7a52e1907c Restrict DAC VPN to fox-ipmi machine only
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:51:19 +02:00
22a2e1b9e8 Monitor fox via VPN
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:51:16 +02:00
f29461ae32 Add OpenVPN service to connect to fox BMC
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:51:13 +02:00
208197f099 Add ac.upc.edu as name search server
Allows referring to fox.ac.upc.edu directly as fox.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:51:09 +02:00
479ca1b671 Disable kptr_restrict in fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:08:42 +02:00
40529fbdcb Disable NUMA balancing in fox
See: https://www.kernel.org/doc/html/latest/admin-guide/sysctl/kernel.html#numa-balancing

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:08:02 +02:00
9b0d3fb21e Load amd_uncore module in fox
Needed for L3 events in perf.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:07:58 +02:00
d8444131d8 Enable SSH X11 forwarding
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-07-02 15:07:54 +02:00
af540456a6 Disable registration in Gitea
Get rid of all the spam accounts they are trying to register.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:18 +02:00
42d6734da8 Enable msmtp configuration in tent
Allows gitea to send notifications via email.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:15 +02:00
071a8084a0 Add GitLab runner with debian docker for PM
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:13 +02:00
24a0c58592 Monitor nix-daemon in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:11 +02:00
810a6dfcec Move nix-daemon exporter to modules
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:09 +02:00
47ad89dee1 Add p service for pastes
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:07 +02:00
8af1b259f5 Enable public-inbox service in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:06 +02:00
560003d4fd Enable gitea in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:04 +02:00
68ff45075c Add bsc.es to resolve domain names
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:02 +02:00
fc68d16197 Monitor AXLE machine too
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:36:00 +02:00
f6ec1293f4 Use IPv4 for blackbox exporter
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:59 +02:00
4feeff978c Add public html files to tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:57 +02:00
7b19292912 Add docker GitLab runner for BSC GitLab
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:55 +02:00
0627db0eb9 Add GitLab shell runner in tent for PM
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:54 +02:00
ae2f6dde41 Enable jungle robot emails for Grafana in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:52 +02:00
3bf70656dc Add tent key for nix-serve
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:50 +02:00
1cf989d727 Remove jungle nix cache from tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:48 +02:00
19f734e622 Enable nix cache
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:47 +02:00
d6e3d9626c Serve Grafana from subpath
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:45 +02:00
9c32e42dcc Add nginx server in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:43 +02:00
61e6d3232b Add monitoring in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-18 15:35:00 +02:00
d0fd8cde46 Disable nix garbage collector in tent
Reviewed-by: Aleix Boné <abonerib@bsc.es>
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-06-11 16:05:05 +02:00
5223ea53f6 Rekey secrets with tent keys
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:20 +02:00
253426ce00 Add tent host key and admin keys
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:16 +02:00
df67b6cd26 Create directories in /vault/home for tent users
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:12 +02:00
766da21097 Add software RAID in tent using 3 disks
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:10 +02:00
18461c0d59 Add access to tent to all hut users too
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:06 +02:00
028b151c78 Add hut SSH configuration from outside SSF LAN
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:04 +02:00
7176b066bb Don't use proxy in base preset
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:04:00 +02:00
c3c3614f63 Add tent machine from xeon04
We moved the tent machine to the server room in the BSC building and is
now directly connected to the raccoon via NAT.

Fixes: rarias/jungle#106
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:03:54 +02:00
e13288fc29 Create specific SSF rack configuration
Allow xeon machines to optionally inherit SSF configuration such as the
NFS mount point and the network configuration.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 16:03:49 +02:00
e9e3704b67 Only allow Vincent to access fox for now
Needed to run benchmarks without interference.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 12:08:57 +02:00
7d3c7342ae Use performance governor in fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 12:08:55 +02:00
8f80ed2cce Add hut as nix cache in fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 12:08:51 +02:00
d00f996f59 Use extra- for substituters and trusted-public-keys
From the nix manual:

> A configuration setting usually overrides any previous value. However,
> for settings that take a list of items, you can prefix the name of the
> setting by extra- to append to the previous value.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-06-11 11:27:37 +02:00
e40fd24f26 Use DHCP for Ethernet in fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 10:24:53 +02:00
83efd6c876 Use UPC time servers as others are blocked
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-11 10:24:47 +02:00
f0c4206ab8 Create tracing group and add arocanon in raccoon
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 11:09:41 +02:00
8b43a6ffb6 Extend perf support in raccoon
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 11:09:30 +02:00
2bca10b0e4 Enable nixdebuginfod in raccoon
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:50:01 +02:00
eec3e27d66 Make raccoon use performance governor
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:35 +02:00
e51ef52721 Enable binfmt emulation in raccoon
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:33 +02:00
9dc67d402f Disable nix garbage collector in raccoon
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:31 +02:00
62ec4e014a Add dbautist user to raccoon machine
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:28 +02:00
4d03842f7c Add node exporter monitoring in raccoon
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:26 +02:00
8fedc5518e Allow X11 forwarding via SSH
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:23 +02:00
43dc336638 Enable linger for user rarias
Allows services to run without a login session.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:45:19 +02:00
2b08fcd21a Only proxy SSH git remotes via hut in xeon
Other machines like raccoon have direct access.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-06-03 10:44:31 +02:00
557618d43f Add machine map file
Documents the location, board and serial numbers so we can track the
machines if they move around. Some information is unknown.

Using the Nix language to encode the machines location and properties
allows us to later use that information in the configuration of the
machines themselves.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 14:55:58 +02:00
e8ac6cf0f3 Remove fox monitoring via IPMI
We will need to setup an VPN to be able to access fox in its new
location, so for now we simply remove the IPMI monitoring.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:53 +02:00
f8fc391cae Monitor fox, gateway and UPC anella via ICMP
Fox should reply once the machine is connected to the UPC network.
Monitoring also the gateway and UPC anella allows us to estimate if the
whole network is down or just fox.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:51 +02:00
6c1afa3fd8 Update configuration for UPC network
The fox machine will be placed in the UPC network, so we update the
configuration with the new IP and gateway. We won't be able to reach hut
directly so we also remove the host entry and proxy.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:48 +02:00
008584b465 Disable home via NFS in fox
It won't be accessible anymore as we won't be in the same LAN.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:46 +02:00
a22c862192 Rekey all secrets
Fox is no longer able to use munge or ceph, so we remove the key and
rekey them.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:44 +02:00
cd0c070439 Rotate fox SSH host key
Prevent decrypting old secrets by reading the git history.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:42 +02:00
201ff64b25 Distrust fox SSH key
We no longer will share secrets with fox until we can regain our trust.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:38 +02:00
9bee145e25 Remove Ceph module from fox
It will no longer be accessible from the UPC.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:36 +02:00
4528b7c2a6 Remove fox from SLURM
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:20 +02:00
1eac0fcad8 Remove pam_slurm_adopt from fox
We no longer will be able to use SLURM from jungle.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-06-02 11:26:02 +02:00
dd15f9c943 Add UPC temperature sensor monitoring
These sensors are part of their air quality measurements, which just
happen to be very close to our server room.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-05-29 13:01:37 +02:00
4048b3327a Add meteocat exporter
Allows us to track ambient temperature changes and estimate the
temperature delta between the server room and exterior temperature.
We should be able to predict when we would need to stop the machines due
to excessive temperature as summer approaches.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-05-29 13:01:29 +02:00
f4229e34f6 Add custom nix-daemon exporter
Allows us to see which derivations are being built in realtime. It is a
bit of a hack, but it seems to work. We simply look at the environment
of the child processes of nix-daemon (usually bash) and then look for
the $name variable which should hold the current derivation being
built. Needs root to be able to read the environ file of the different
nix-daemon processes as they are owned by the nixbld* users.

See: https://discourse.nixos.org/t/query-ongoing-builds/23486
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-05-29 12:57:07 +02:00
5208a3483b Set keep-outputs to true in all machines
From the documentation of keep-outputs, setting it to true would prevent
the GC from removing build time dependencies:

If true, the garbage collector will keep the outputs of non-garbage
derivations. If false (default), outputs will be deleted unless they are
GC roots themselves (or reachable from other roots).

In general, outputs must be registered as roots separately. However,
even if the output of a derivation is registered as a root, the
collector will still delete store paths that are used only at build time
(e.g., the C compiler, or source tarballs downloaded from the network).
To prevent it from doing so, set this option to true.

See: https://nix.dev/manual/nix/2.24/command-ref/conf-file.html#conf-keep-outputs
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2025-04-22 17:27:37 +02:00
92eacfad20 Add raccoon node exporter monitoring
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-22 14:50:08 +02:00
80309d107b Increase data retention to 5 years
Now that we have more space, we can extend the retention time to 5 years
to hold the monitoring metrics. For a year we have:

	# du -sh /var/lib/prometheus2
	13G     /var/lib/prometheus2

So we can expect it to increase to about 65 GiB. In the future we may
want to reduce some acquisition frequency.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-22 14:50:03 +02:00
d0f151595f Don't forward any docker traffic
Access to the 23080 local port will be done by applying the INPUT rules,
which pass through nixos-fw.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-15 14:16:15 +02:00
93f8d3aa89 Allow traffic from docker to enter port 23080
Before:

  hut% sudo docker run -it --rm alpine /bin/ash -xc 'true | nc -w 3 -v 10.0.40.7 23080'
  + true
  + nc -w 3 -v 10.0.40.7 23080
  nc: 10.0.40.7 (10.0.40.7:23080): Operation timed out

After:

  hut% sudo docker run -it --rm alpine /bin/ash -xc 'true | nc -w 3 -v 10.0.40.7 23080'
  + true
  + nc -w 3 -v 10.0.40.7 23080
  10.0.40.7 (10.0.40.7:23080) open

Fixes: rarias/jungle#94
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-15 14:16:10 +02:00
d84645f3e1 Add bscpm04.bsc.es SSH host and public key
Allows fetching repositories from hut and other machines in jungle
without the need to do any extra configuration.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-15 14:15:45 +02:00
55b71d6901 Use hut nix cache in owl1, owl2 and raccoon
For owl1 and owl2 directly connect to hut via LAN with HTTP, but for
raccoon pass via the proxy using jungle.bsc.es with HTTPS. There is no
risk of tampering as packages are signed.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-04-15 14:08:17 +02:00
89c65ea578 Clean all iptables rules on stop
Prevents the "iptables: Chain already exists." error by making sure that
we don't leave any chain on start. The ideal solution is to use
iptables-restore instead, which will do the right job. But this needs to
be changed in NixOS entirely.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-15 14:08:14 +02:00
129273e8d8 Make nginx listen on all interfaces
Needed for local hosts to contact the nix cache via HTTP directly.
We also allow the incoming traffic on port 80.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-15 14:08:07 +02:00
fdac196c6c Fix nginx /cache regex
`nix-serve` does not handle duplicates in the path:
```
hut$ curl http://127.0.0.1:5000/nix-cache-info
StoreDir: /nix/store
WantMassQuery: 1
Priority: 30
hut$ curl http://127.0.0.1:5000//nix-cache-info
File not found.
```

This meant that the cache was not accessible via:
`curl https://jungle.bsc.es/cache/nix-cache-info` but
`curl https://jungle.bsc.es/cachenix-cache-info` worked.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2025-04-15 14:08:04 +02:00
3f4b4fb810 Add new GitLab runner for gitlab.bsc.es
It uses docker based on alpine and the host nix store, so we can perform
builds but isolate them from the system.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:41:18 +02:00
2c7211ffa3 Remove SLURM partition all
We no longer have homogeneous nodes so it doesn't make much sense to
allocate a mix of them.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:27 +02:00
18f25307ab Add varcila user to hut and fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:25 +02:00
7c55d10ceb Adjust fox slurm config after disabling SMT
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:23 +02:00
5c549faaa8 Add abonerib user to fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:21 +02:00
9fd35a9ce4 Don't move doc in web output
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:19 +02:00
5487a93972 Reject SSH connections without SLURM allocation
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:15 +02:00
fe16ea373f Add users to fox
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:13 +02:00
163434af09 Add dalvare1 user
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:11 +02:00
71164400d4 Mount NVME disks in /nvme{0,1}
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:06 +02:00
f887dacdea Exclude fox from being suspended by slurm
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:04 +02:00
4f5c8dbbaf Use IPMI host names instead of IP addresses
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:15:01 +02:00
14b192b1d9 Add fox IPMI monitoring
Use agenix to store the credentials safely.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:14:59 +02:00
2b04812320 Add new fox machine
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-04-08 17:14:42 +02:00
2f6f6ba703 Update PM GitLab tokens to new URL
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 15:43:13 +01:00
371b0c7e76 Fix MPICH build by fetching upstream patches too
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 15:43:13 +01:00
ae34eacf4a flake.lock: Update
Flake lock file updates:

• Updated input 'agenix':
    'github:ryantm/agenix/de96bd907d5fbc3b14fc33ad37d1b9a3cb15edc6' (2024-07-09)
  → 'github:ryantm/agenix/f6291c5935fdc4e0bef208cfc0dcab7e3f7a1c41' (2024-08-10)
• Updated input 'bscpkgs':
    'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=de89197a4a7b162db7df9d41c9d07759d87c5709' (2024-04-24)
  → 'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=6782fc6c5b5a29e84a7f2c2d1064f4bcb1288c0f' (2024-11-29)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/693bc46d169f5af9c992095736e82c3488bf7dbb' (2024-07-14)
  → 'github:NixOS/nixpkgs/9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc' (2025-01-14)

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 15:43:13 +01:00
dab6f08d89 Set nixpkgs to track nixos-24.11
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 15:43:13 +01:00
8190523c30 Add script to monitor GPFS
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 15:43:07 +01:00
d335d69ba6 Add BSC machines to ssh config
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:51 +01:00
cec49eb5fc Collect statistics from logged users
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:48 +01:00
22db38c98f Add custom GPFS exporter for MN5
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:46 +01:00
0d4eebbb59 Remove exception to fetch task endpoint
It causes the request to go to the website rather than the Gitea
service.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:43 +01:00
025f6a0c0c Use SSD for boot, then switch to NVME
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:40 +01:00
abc74c5445 Use NVME as root
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:37 +01:00
6942f09f69 Keep host header for Grafana requests
This was breaking requests due to CSRF check.

See: https://github.com/grafana/grafana/issues/45117#issuecomment-1033842787
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:32 +01:00
56f6855af7 Ignore logging requests from the gitea runner
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:28 +01:00
81c822e68e Log the client IP not the proxy
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:22 +01:00
53e80b1f19 Ignore misc directory
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:19 +01:00
21feb01e7b Create paste directories in /ceph/p
Ensure that all hut users have a paste directory in /ceph/p owned by
themselves. We need to wait for the ceph mount point to create them, so
we use a systemd service that waits for the remote-fs.target.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:16 +01:00
9ea7b2b475 Add p command to paste files
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:10 +01:00
fce4d89e1d Use nginx to serve website and other services
Instead of using multiple tunnels to forward all our services to the VM
that serves jungle.bsc.es, just use nginx to redirect the traffic from
hut. This allows adding custom rules for paths that are not possible
otherwise.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:23:07 +01:00
6b282375f8 Mount the NVME disk in /nvme
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2025-01-16 14:22:58 +01:00
260986b9f2 Delay nix-gc until /home is mounted
Prevents starting the garbage collector before the remote FS are
mounted, in particular /home. Otherwise, all the gcroots which have
symlinks in /home will be considered stale and they will be removed.

See: rarias/jungle#79
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-09-20 09:45:30 +02:00
15afbe94bd Add dbautist user with access to hut
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-09-20 09:42:02 +02:00
efd35a9cd1 Set the serial console to ttyS1 in raccoon
Apparently the ttyS0 console doesn't exist but ttyS1 does:

  raccoon% sudo stty -F /dev/ttyS0
  stty: /dev/ttyS0: Input/output error
  raccoon% sudo stty -F /dev/ttyS1
  speed 9600 baud; line = 0;
  -brkint -imaxbel

The dmesg line agrees:

  00:03: ttyS1 at I/O 0x2f8 (irq = 3, base_baud = 115200) is a 16550A

The console configuration is then moved from base to xeon to allow
changing it for the raccoon machine.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:56 +02:00
50ad1d637c Remove setLdLibraryPath and driSupport options
They have been removed from NixOS. The "hardware.opengl" group is now
renamed to "hardware.graphics".

See: 98cef4c273
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:53 +02:00
c299d53146 Add documentation section about GRUB chain loading
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:47 +02:00
152b71e718 Add 10 min shutdown jitter to avoid spikes
The shutdown timer will fire at slightly different times for the
different nodes, so we slowly decrease the power consumption.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:44 +02:00
0911d5b92a Don't mount the nix store in owl nodes
Initially we planned to run jobs in those nodes by sharing the same nix
store from hut. However, these nodes are now used to build packages
which are not available in hut. Users also ssh to the nodes, which
doesn't mount the hut store, so it doesn't make much sense to keep
mounting it.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:42 +02:00
5ddae068af Emulate other architectures in owl nodes too
Allows cross-compilation of packages for RISC-V that are known to try to
run RISC-V programs in the host.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:39 +02:00
d17be714ec Program shutdown for August 2nd for all machines
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:36 +02:00
28ce15d74d Enable debuginfod daemon in owl nodes
WARNING: This will introduce noise, as the daemon wakes up from time to
time to check for new packages.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:30 +02:00
504f9bb570 Set gitea and grafana log level to warn
Prevents filling the journal logs with information messages.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:27 +02:00
f158cb63e8 Set default SLURM job time limit to one hour
Prevents endless jobs from being left forever, while allowing users to
request a larger time limit.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:24 +02:00
8860f76cad Allow other jobs to run in unused cores
The current select mechanism was using the memory too as a consumable
resource, which by default only sets 1 MiB per node. As each job already
requests 1 MiB, it prevents other jobs from running.

As we are not really concerned with memory usage, we only use the unused
cores in the select criteria.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:22 +02:00
b86798cd69 Use authentication tokens for PM GitLab runner
Starting with GitLab 16, there is a new mechanism to authenticate the
runners via authentication tokens, so use it instead.  Older tokens and
runners are also removed, as they are no longer used.

With the new way of managing tokens, both the tags and the locked state
are managed from the GitLab web page.

See: https://docs.gitlab.com/ee/ci/runners/new_creation_workflow.html
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:16 +02:00
7ed74931cf flake.lock: Update
Flake lock file updates:

• Updated input 'agenix':
    'github:ryantm/agenix/1381a759b205dff7a6818733118d02253340fd5e' (2024-04-02)
  → 'github:ryantm/agenix/de96bd907d5fbc3b14fc33ad37d1b9a3cb15edc6' (2024-07-09)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/6143fc5eeb9c4f00163267708e26191d1e918932' (2024-04-21)
  → 'github:NixOS/nixpkgs/693bc46d169f5af9c992095736e82c3488bf7dbb' (2024-07-14)

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:13 +02:00
6e9d33b483 Allow ptrace to any process of the same user
Allows users to attach GDB to their own processes, without requiring
running the program with GDB from the start. It is only available in
compute nodes, the storage nodes continue with the restricted settings.

Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:09 +02:00
58abaefbc4 Add abonerib user to hut, raccoon, owl1 and owl2
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:07 +02:00
5ea7827a8a Grant rpenacob access to owl1 and owl2 nodes
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:05 +02:00
b17e4a13f9 Access private repositories via hut SSH proxy
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:36:03 +02:00
9c4e60c2c2 Set the default proxy to point to hut
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:35:56 +02:00
e7376917bd Allow incoming traffic to hut proxy
Reviewed-by: Aleix Boné <abonerib@bsc.es>
2024-09-12 08:35:23 +02:00
130e191d37 eudy: koro: fcs: Fix fcs unprotected cpuid all
smp_processor_id() was called in a preemptible context, which could
invalidate the returned value. However, this was not harmful, because
fcs threads in nosv are pinned.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2024-07-17 11:40:20 +02:00
349f69e30a Add support for armv7 emulation in hut
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-17 11:12:48 +02:00
59ab6405c5 Monitor raccoon machine via IPMI
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-17 11:12:32 +02:00
a0dab66aa5 Move vlopez user to jungleUsers for koro host
Access to other machines can be easily added into the "hosts" attribute
without the need to replicate the configuration.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-16 12:35:39 +02:00
525cad4117 Add raccoon motd file
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-16 12:35:38 +02:00
24ee74d614 Split xeon specific configuration from base
To accommodate the raccoon knights workstation, some of the configuration
pulled by m/common/main.nix has to be removed. To solve it, the xeon
specific parts are placed into m/common/xeon.nix and only the common
configuration is at m/common/base.nix.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-16 12:35:37 +02:00
15b4b28d2c Control user access to each machine
The users.jungleUsers configuration option behaves like the users.users
option, but defines the list attribute `hosts` for each user, which
filters users so that each user can only access those hosts.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-16 12:35:34 +02:00
b1ce302e4b Add PostgreSQL DB for performance test results
The database will hold the performance results of the execution of the
benchmarks. We follow the same setup on knights3 for now.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-07-16 12:35:24 +02:00
b8b85f55cd Enable Grafana email alerts
Allows sending Grafana alerts via email too, so we have a redundant
mechanism in case Slack fails to deliver them.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-05-31 15:57:38 +02:00
1189626a6f Enable mail notification in Gitea
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-05-31 10:56:49 +02:00
dbd95dd7b8 Add msmtp to send notifications via email
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-05-31 10:56:20 +02:00
81b680a7d2 Allow Ceph traffic to lake2 2024-05-02 17:43:48 +02:00
ba60e121df Collect Gitea metrics in Prometheus
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-05-02 17:32:25 +02:00
432e6c8521 Add Gitea service
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-05-02 17:31:51 +02:00
c8160122b3 Add firewall rules for Ceph and monitoring
The firewall was blocking the monitoring traffic from hut and the Ceph
traffic among OSDs. The rules only allow connecting from the specific
host that they are supposed to be coming from.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-04-25 13:25:11 +02:00
3863fc25a5 Add workaround for MPICH 4.2.0
See: https://github.com/pmodels/mpich/issues/6946

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-04-25 13:25:08 +02:00
2b26cd2f46 Fix SLURM bug in rank integer sign expansion
See: https://bugs.schedmd.com/show_bug.cgi?id=19324

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-04-25 13:25:05 +02:00
30f2079f0b Merge pmix outputs for MPICH
MPICH expects headers and libraries to be present in the same directory.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-04-25 13:25:03 +02:00
366436b6d3 Remove nixseparatedebuginfod input
It has been integrated in nixpkgs, so is no longer required.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-04-25 13:24:58 +02:00
9f1cd02144 flake.lock: Update
Flake lock file updates:

• Updated input 'agenix':
    'github:ryantm/agenix/daf42cb35b2dc614d1551e37f96406e4c4a2d3e4' (2023-10-08)
  → 'github:ryantm/agenix/1381a759b205dff7a6818733118d02253340fd5e' (2024-04-02)
• Updated input 'agenix/darwin':
    'github:lnl7/nix-darwin/87b9d090ad39b25b2400029c64825fc2a8868943' (2023-01-09)
  → 'github:lnl7/nix-darwin/4b9b83d5a92e8c1fbfd8eb27eda375908c11ec4d' (2023-11-24)
• Updated input 'agenix/home-manager':
    'github:nix-community/home-manager/32d3e39c491e2f91152c84f8ad8b003420eab0a1' (2023-04-22)
  → 'github:nix-community/home-manager/3bfaacf46133c037bb356193bd2f1765d9dc82c1' (2023-12-20)
• Added input 'agenix/systems':
    'github:nix-systems/default/da67096a3b9bf56a91d16901293e51ba5b49a27e' (2023-04-09)
• Updated input 'bscpkgs':
    'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=e148de50d68b3eeafc3389b331cf042075971c4b' (2023-11-22)
  → 'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=de89197a4a7b162db7df9d41c9d07759d87c5709' (2024-04-24)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/e4ad989506ec7d71f7302cc3067abd82730a4beb' (2023-11-19)
  → 'github:NixOS/nixpkgs/6143fc5eeb9c4f00163267708e26191d1e918932' (2024-04-21)
• Updated input 'nixseparatedebuginfod':
    'github:symphorien/nixseparatedebuginfod/232591f5274501b76dbcd83076a57760237fcd64' (2023-11-05)
  → 'github:symphorien/nixseparatedebuginfod/98d79461660f595637fa710d59a654f242b4c3f7' (2024-03-07)
• Removed input 'nixseparatedebuginfod'
• Removed input 'nixseparatedebuginfod/flake-utils'
• Removed input 'nixseparatedebuginfod/flake-utils/systems'
• Removed input 'nixseparatedebuginfod/nixpkgs'

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-04-25 13:24:29 +02:00
82ccae1315 Use google.com probe instead of bsc.es
The main website of the BSC is failing every day around 3:00 AM for
almost one hour, so it is not a very good target. Instead, google.com is
used which should be more reliable. The same robots.txt path is fetched,
as it is smaller than the main page.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-03-05 16:52:21 +01:00
1df80460d2 Add another HTTPS probe for bsc.es
As all other HTTPS probes pass through the opsproxy01.bsc.es proxy, we
cannot detect a problem in our proxy or in the BSC one. Adding another
target like bsc.es that doesn't use the ops proxy allows us to discern
where the problem lies.

Instead of monitoring https://www.bsc.es/ directly, which will trigger
the whole Drupal server and take a whole second, we just fetch robots.txt
so the overhead on the server is minimal (and returns in less than 10 ms).

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2024-02-13 12:26:56 +01:00
7f17fe8874 Move slurm client in a separate module
Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2024-02-13 11:11:17 +01:00
5880a6e5f6 Enable public-inbox at jungle.bsc.es/lists
The public-inbox service fetches emails from the sourcehut mailing lists
and displays them on the web. The idea is to reduce the dependency on
external services and add a secondary storage for the mailing lists in
case sourcehut goes down or changes the current free plans.

The service is available in https://jungle.bsc.es/lists/ and is open to
the public. It currently mirrors the bscpkgs and jungle mailing list.

We also edited the CSS to improve the readability and have larger fonts
by default.

The service for public-inbox produced by NixOS is not well configured to
fetch emails from an IMAP mail server, so we also manually edit the
service file to enable the network.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-12-15 11:18:08 +01:00
ecbb45d6ac Monitor https://pm.bsc.es/gitlab/ too
The GitLab instance is in the /gitlab endpoint and may fail
independently of https://pm.bsc.es/.

Cc: Víctor López <victor.lopez@bsc.es>
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-12-05 09:56:28 +01:00
c564d945d4 Enable nixseparatedebuginfod module
The module is only enabled on Hut and Eudy because we noticed activity
on the debuginfod service even if no debug session was active.

Reviewed-by: Rodrigo Arias Mallo <rodrigo.arias@bsc.es>
2023-12-04 11:04:52 +01:00
ed887b0412 Use tmpfs in /tmp
The /tmp directory was using the SSD disk which is not erased across
boots. Nix will use /tmp to perform the builds, so we want it to be as
fast as possible. In general, all the machines have enough space to
handle large builds like LLVM.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-28 12:25:50 +01:00
fe1d3fbb80 Enable runners for pm.bsc.es/gitlab too
The old runners for the PM gitlab were disabled in configuration in the
last outage, but they remained working until we reboot the node. With
this change we enable the runners for both PM and gitlab.bsc.es.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-24 14:45:23 +01:00
5234ca32fd Remove complete ceph package from hut
Only the ceph-client is needed.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-24 12:58:54 +01:00
cfe0c0e6e6 Fix warning in slurm exporter using vendorHash
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-24 12:58:50 +01:00
7afe7344ac Remove old Ceph package overlay
The Ceph package is now integrated in upstream nixpkgs.

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-24 12:58:47 +01:00
bd83ca53ab flake.lock: Update
Flake lock file updates:

• Updated input 'agenix':
    'github:ryantm/agenix/d8c973fd228949736dedf61b7f8cc1ece3236792' (2023-07-24)
  → 'github:ryantm/agenix/daf42cb35b2dc614d1551e37f96406e4c4a2d3e4' (2023-10-08)
• Updated input 'bscpkgs':
    'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=f605f8e5e4a1f392589f1ea2b9ffe2074f72a538' (2023-10-31)
  → 'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=e148de50d68b3eeafc3389b331cf042075971c4b' (2023-11-22)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/e56990880811a451abd32515698c712788be5720' (2023-09-02)
  → 'github:NixOS/nixpkgs/e4ad989506ec7d71f7302cc3067abd82730a4beb' (2023-11-19)

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-24 12:57:44 +01:00
0d9c99a24e BSC packages are no longer in bsc attribute
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-09 13:40:48 +01:00
db98b1f698 flake.lock: Update
Flake lock file updates:

• Updated input 'bscpkgs':
    'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=3a4062ac04be6263c64a481420d8e768c2521b80' (2023-09-14)
  → 'git+https://git.sr.ht/~rodarima/bscpkgs?ref=refs/heads/master&rev=f605f8e5e4a1f392589f1ea2b9ffe2074f72a538' (2023-10-31)

Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-09 13:40:48 +01:00
84c4b6b81c Switch bscpkgs URL to sourcehut
Reviewed-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-11-09 13:40:48 +01:00
19e195b894 Monitor anella instead of gw.bsc.es
The target gw.bsc.es doesn't reply to our ICMP probes from hut. However,
the anella hop in the tracepath is a good candidate to identify cuts
between the login and the provider and between the provider and external
hosts like Google or Cloudflare DNS.

Reviewed-By: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-10-27 12:46:08 +02:00
54c2bd119f Add ICMP probes
These probes check if we can reach several targets via ICMP, which is
not proxied, so they can be used to see if ICMP forwarding is working in
the login node.

In particular, we test if we can reach the Google (8.8.8.8) and
Cloudflare (1.1.1.1) DNS servers, the BSC gateway which responds to ping
only from the intranet and the login node (ssfhead).

Reviewed-By: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-10-25 17:13:03 +02:00
e5d85c1b38 Enable proxy for Grafana too
The alerts need to contact the slack endpoint, so we add the proxy
environment variables to the grafana systemd service.

Reviewed-By: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-10-25 16:55:56 +02:00
f1486b84c1 Make blackbox exporter use the proxy
By default it was trying to reach the targets using the default gateway,
but since the electrical cut of 2023-10-20, the login node has not
enabled forwarding again. So better if we don't rely on it.

Reviewed-By: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
2023-10-25 16:55:24 +02:00
472f4b0334 Don't log SLURM connection attempts from ssfhead 2023-10-06 15:22:04 +02:00
425dca3e00 Add docker runner too 2023-10-06 15:17:07 +02:00
e4080cf931 Monitor gitlab.bsc.es too 2023-10-06 15:17:07 +02:00
fc9285f89d Monitor PM webpage via blackbox 2023-10-06 15:17:07 +02:00
fbe238f5b6 Temporarily disable pm runners 2023-10-06 15:17:07 +02:00
9874da566d Add runner for gitlab.bsc.es 2023-10-06 15:17:07 +02:00
ebc5c4d84f Allow anonymous access to grafana 2023-09-22 10:51:30 +02:00
8634a9e133 Remove user/group when using DynamicUsers 2023-09-22 10:13:06 +02:00
0ce79ed79e Set the SLURM_CONF variable 2023-09-21 22:22:00 +02:00
5f492ee1d7 Enable slurm-exporter service 2023-09-21 21:40:02 +02:00
9071a4de8b Add prometheus-slurm-exporter package 2023-09-21 21:34:18 +02:00
3040a803b2 Mount the hut nix store for SLURM jobs 2023-09-20 19:38:43 +02:00
70a9e855cf Enable direnv integration 2023-09-20 09:32:58 +02:00
aa64e9ef24 Remove bscpkgs from the registry and nixPath
This is done to prevent accidental evaluations where the nixpkgs input
of bscpkgs is still pointing to a different version that the one
specified in the jungle flake. Instead use jungle#bscpkgs.X to get a
package from bscpkgs.
2023-09-15 12:00:33 +02:00
ba2b74fd5a Add bscpkgs and nixpkgs top level attributes
Allows the evaluation of packages of the intermediate overlays.
2023-09-15 12:00:33 +02:00
1ae5d9e25e Use hut packages as the default package set
Allows the user to directly access nixpkgs and bscpkgs from the top
level as `nix build jungle#htop` and `nix build jungle#bsc.ovni`.
2023-09-15 12:00:28 +02:00
ff98ba47c4 Don't fetch registry flakes from the net 2023-09-15 12:00:28 +02:00
599b23ef52 flake.lock: Update
Flake lock file updates:

• Updated input 'bscpkgs':
    'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=6122fef92701701e1a0622550ac0fc5c2beb5906' (2023-09-07)
  → 'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=3a4062ac04be6263c64a481420d8e768c2521b80' (2023-09-14)
2023-09-15 11:50:47 +02:00
8dbee06d1d Revert "Update slurm to 23.02.05.1"
This reverts commit 7bfd786c01c36131cd00b90fc6a9503fd1226578.
2023-09-14 15:46:18 +02:00
d522113cb9 Open ports in firewall of compute nodes 2023-09-14 15:45:43 +02:00
7bfd786c01 Update slurm to 23.02.05.1 2023-09-13 17:44:24 +02:00
5a5f4672cd Monitor storage nodes via IPMI too 2023-09-13 15:57:13 +02:00
2646ad4b70 Enable fstrim service 2023-09-12 16:39:45 +02:00
b120a7ca85 Serve the nix store from hut 2023-09-12 12:19:43 +02:00
2a0254b684 Add encrypted munge key with agenix 2023-09-08 19:05:45 +02:00
e3e6e7662d Remove unused large port hole in firewall 2023-09-08 18:22:48 +02:00
868f825e26 Make exporters listen in localhost only 2023-09-08 18:13:04 +02:00
f231dc81f1 Allow only some ports for srun 2023-09-08 17:51:37 +02:00
a758eef354 Block ssfhead from reaching our slurm daemon 2023-09-08 17:36:28 +02:00
9c9c41fb57 Poweroff idle slurm nodes after 1 hour 2023-09-08 16:49:53 +02:00
1a1708f16f Add IB and IPMI node host names 2023-09-08 13:21:37 +02:00
efe1b7e399 flake.lock: Update
Flake lock file updates:

• Updated input 'bscpkgs':
    'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=ee24b910a1cb95bd222e253da43238e843816f2f' (2023-09-01)
  → 'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=6122fef92701701e1a0622550ac0fc5c2beb5906' (2023-09-07)
2023-09-07 11:13:45 +02:00
eb9876aff6 Unlock ovni gitlab runners 2023-09-05 16:59:45 +02:00
8d31c552f5 flake.lock: Update
Flake lock file updates:

• Updated input 'bscpkgs':
    'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=18d64c352c10f9ce74aabddeba5a5db02b74ec27' (2023-08-31)
  → 'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs/heads/master&rev=ee24b910a1cb95bd222e253da43238e843816f2f' (2023-09-01)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/d680ded26da5cf104dd2735a51e88d2d8f487b4d' (2023-08-19)
  → 'github:NixOS/nixpkgs/e56990880811a451abd32515698c712788be5720' (2023-09-02)
2023-09-05 15:03:26 +02:00
68f4d54dd1 Add agenix to all nodes 2023-09-04 22:10:43 +02:00
2042d58b72 Add agenix module to ceph 2023-09-04 22:07:07 +02:00
2c8c90e6e4 Remove old secrets 2023-09-04 22:04:32 +02:00
208dcb7dde Mount /ceph in owl1 and owl2 2023-09-04 22:00:36 +02:00
e2f82a6383 Warn about the owl2 omnipath device 2023-09-04 22:00:17 +02:00
d704816de9 Clean owl2 configuration 2023-09-04 21:59:56 +02:00
74ec4eb22a Move the ceph client config to an external module 2023-09-04 21:59:04 +02:00
0a5f9b55f5 Reorganize secrets and ssh keys
The agenix tools needs to read the secrets from a standalone file, but
we also need the same information for the SSH keys.
2023-09-04 21:36:31 +02:00
900de39e2f Add anavarro user 2023-09-04 16:00:01 +02:00
1e466d07df Set zsh inc_append_history option 2023-09-03 16:57:53 +02:00
13807c5e8f Set zsh shell for rarias 2023-09-03 16:46:27 +02:00
d8d6d6d421 Enable zsh and fix key bindings 2023-09-03 16:42:04 +02:00
a242ddd39c Keep a log over time with the config commits 2023-09-03 00:02:14 +02:00
a2c5fe1f5e Configure bscpkgs.nixpkgs to follow nixpkgs 2023-09-02 23:37:59 +02:00
2c52ef9ff0 Store nixos config in /etc/nixos/config.rev 2023-09-02 23:37:11 +02:00
acb91695ac Enable binary emulation for other architectures 2023-08-31 17:27:08 +02:00
9d93760e6f Enable watchdog 2023-08-30 16:32:17 +02:00
aad67b9d99 Enable all osd on boot in lake2 2023-08-30 16:32:17 +02:00
e1d406023d Scrape lake2 too 2023-08-29 12:33:26 +02:00
db6bb90af8 Also enable monitoring in lake2 2023-08-29 12:29:41 +02:00
1266c8f04e Scrape metrics from bay 2023-08-29 11:58:00 +02:00
2b7823788c Add monitoring in the bay node 2023-08-29 11:53:32 +02:00
86eacdd3e5 Add fio tool 2023-08-29 11:27:50 +02:00
4fa074f893 Add ceph tools in hut too 2023-08-28 17:58:21 +02:00
a260a1bc1b Switch ceph logs to journal 2023-08-28 17:58:08 +02:00
8912d2b9bc Update ceph to 18.2.0 in overlay 2023-08-25 18:20:21 +02:00
b4015ded86 Move pkgs overlay to overlay.nix 2023-08-25 18:12:00 +02:00
0f54d63a46 Enable ceph osd daemons in lake2 2023-08-25 14:54:51 +02:00
6c656182f1 Add the lake2 hostname to the hosts 2023-08-25 14:44:35 +02:00
be4187de3c Use the sda for lake2 2023-08-25 13:40:10 +02:00
0b22a1b8a4 Remove netboot module 2023-08-25 13:39:01 +02:00
f18f1937ae Disable pixiecore in hut for now 2023-08-25 13:21:00 +02:00
4b78ec9134 Add PXE helper 2023-08-25 12:05:33 +02:00
6c0c26b3aa Enable netboot again for PXE 2023-08-24 19:08:23 +02:00
fb1744306d Specify the disk by path 2023-08-24 15:27:37 +02:00
394c7ecd7b Prepare lake2 config after bootstrap
The disk ID is different under NixOS.
2023-08-24 13:54:53 +02:00
3276f54e86 Add lake2 bootstrap config 2023-08-24 12:30:46 +02:00
4c806b8ae9 Add section to enable serial console 2023-08-24 12:29:44 +02:00
832866cbfa Add agenix to PATH in hut 2023-08-23 17:42:50 +02:00
9fc393bb6a Store ceph secret key in age
This allows a node to mount the ceph FS without any extra ceph
configuration in /etc/ceph.
2023-08-23 17:26:44 +02:00
d81d9d58e1 Add rarias key for secrets 2023-08-23 17:15:26 +02:00
d54dcc8d8f Add ceph metrics to prometheus 2023-08-22 16:33:55 +02:00
a5fae4a289 Mount the ceph filesystem in hut 2023-08-22 16:15:46 +02:00
a355926cf0 Add ceph config in bay 2023-08-22 15:58:48 +02:00
d7a4420205 Add the bay host name 2023-08-22 15:56:09 +02:00
0b55ce3d02 Remove netboot and fixes 2023-08-22 12:12:15 +02:00
0ce574800e Add bay node 2023-08-22 12:12:15 +02:00
a7e09e55df Update flake 2023-08-22 11:28:54 +02:00
1622b3e7fc Monitor power from other nodes via LAN 2023-08-22 11:28:54 +02:00
3424cac761 Increase prometheus retention time to one year 2023-08-22 11:28:54 +02:00
f98af9aeef Don't set all_proxy 2023-08-22 11:28:54 +02:00
8c14b75e44 Update nixpkgs to fix docker problem 2023-07-28 14:24:51 +02:00
e497e1b88b Allow access to devices for node_exporter 2023-07-28 13:55:35 +02:00
07411beb49 GRUB version no longer needed 2023-07-27 17:22:20 +02:00
e8bab9928d Upgrade flake: nixpkgs, bscpkgs and agenix 2023-07-27 17:19:17 +02:00
544d5a3d69 Kill slurmd remaining processes on upgrade 2023-07-27 14:49:20 +02:00
312f2cb368 koro: Add vlopez user 2023-07-21 13:00:43 +02:00
45ac6e95e9 Add koro node 2023-07-21 13:00:08 +02:00
e6bb6e735d eudy: Add fcsv3 and intermediate versions for testing 2023-07-21 11:27:51 +02:00
cfbfcdbe8c eudy: Enable memory overcommit 2023-07-21 11:27:51 +02:00
c31bfd6b4d eudy: disable all cpu mitigations 2023-07-21 11:27:51 +02:00
d20fa359d9 Enable NTP using the BSC time server 2023-06-30 14:02:15 +02:00
9be15fdad2 Add the ssfhead node as gateway 2023-06-30 14:01:35 +02:00
13e365002c Use our host names first by default 2023-06-23 16:22:18 +02:00
a38072762f Add DNS tools to resolve hosts 2023-06-23 16:15:45 +02:00
adf1ff29a7 Lower perf_event_paranoid to -1 2023-06-23 16:01:27 +02:00
1ec8d7a625 Set perf paranoid to 0 by default 2023-06-21 16:24:19 +02:00
f78f4f5822 Add perf to packages 2023-06-21 15:41:06 +02:00
67a57cb3e5 Allow srun to specify the cpu binding
The task/affinity plugin needs to be selected.
2023-06-21 13:16:23 +02:00
85896f8546 Move authorized keys to users.nix 2023-06-20 14:08:34 +02:00
5e728773c3 Add rpenacob user 2023-06-20 12:54:26 +02:00
0a06cf564b Add osumb to the system packages 2023-06-16 19:22:41 +02:00
db26b2ae37 flake.lock: Update
Flake lock file updates:

• Updated input 'bscpkgs':
    'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs%2fheads%2fmaster&rev=c775ee4d6f76aded05b08ae13924c302f18f9b2c' (2023-04-26)
  → 'git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git?ref=refs%2fheads%2fmaster&rev=cbe9af5d042e9d5585fe2acef65a1347c68b2fbd' (2023-06-16)
2023-06-16 18:33:54 +02:00
f7d00dec25 Set mpi to mpich by default in bscpkgs 2023-06-16 18:26:51 +02:00
2053ec82b7 Add missing parameter to extend 2023-06-16 18:26:51 +02:00
f2434a17c2 Use explicit order in overlays 2023-06-16 18:26:51 +02:00
1f7045fcfe Replace mpi inside bsc attribute 2023-06-16 18:26:51 +02:00
0c4a1efa27 Add mpich overlay 2023-06-16 18:26:51 +02:00
530958496b Add comments in slurm config 2023-06-16 18:26:50 +02:00
df378a2933 Add eudy host key to known hosts 2023-06-16 17:29:48 +02:00
2a0fe5a137 Rename xeon08 to eudy
From Eudyptula, a little penguin.
2023-06-16 17:16:05 +02:00
dfbeafa2b2 Update rebuild script for all nodes 2023-06-16 12:13:07 +02:00
7d4281a5c1 Add ssh host keys 2023-06-16 12:01:12 +02:00
dfea0be2d9 Set the name of the slurm cluster to jungle 2023-06-16 12:00:54 +02:00
df91da8c34 Change owl hostnames 2023-06-16 11:42:39 +02:00
30c21155af Add owl and all partition 2023-06-16 11:34:00 +02:00
a43016ebee Simplify flake and expose host pkgs
The configuration of the machines is now moved to m/
2023-06-16 11:31:31 +02:00
801bb4ba3c Rename xeon07 to hut 2023-06-14 17:28:40 +02:00
a9d740e95a Remove profiles older than 30 days with gc 2023-06-14 17:28:39 +02:00
08eaf312f2 Add ncdu to system packages 2023-06-14 17:28:39 +02:00
0b57bbc6e3 Move arocanon user from xeon08 to common 2023-06-14 16:22:43 +02:00
6558a6ab77 xeon08: Add config for kernel non-voluntary preemption 2023-06-14 16:17:33 +02:00
0d196af473 xeon08: Add perf 2023-06-14 15:42:20 +02:00
d35becb663 xeon08: Enable lttng lockdep tracepoints 2023-06-14 15:42:20 +02:00
5421eab09a xeon08: Add lttng module and tools 2023-06-14 15:42:20 +02:00
1c7de2f7c9 Serve grafana in https://jungle.bsc.es/grafana 2023-05-31 18:12:14 +02:00
c7692995f4 Add tree command 2023-05-31 18:11:34 +02:00
0af185afd8 Add file to system packages 2023-05-31 18:11:34 +02:00
470b3d2512 Add gnumake to system packages 2023-05-31 18:11:34 +02:00
1bf6747b3a Add cmake to system packages 2023-05-31 18:11:34 +02:00
59bf51dfde Add ix to common packages 2023-05-31 18:11:34 +02:00
b72d9936a2 Improve documentation 2023-05-26 11:38:27 +02:00
5ebb57deff Add gitignore 2023-05-26 11:38:27 +02:00
5b82a72647 Set intel_pstate=passive and disable frequency boost 2023-05-26 11:38:26 +02:00
a5c7205481 Add xeon08 basic config 2023-05-26 11:38:26 +02:00
fd1b467a60 Add nixos-config.nix to easily enable nix repl 2023-05-26 11:29:59 +02:00
882161b21e Automatically resume restarted nodes in SLURM 2023-05-18 12:48:04 +02:00
5e8ff50c98 Allow public dashboards in grafana 2023-05-09 18:53:31 +02:00
cdb0688ec1 Add hal ssh key 2023-05-09 18:37:38 +02:00
ebb5e94416 Increase the number of CPUs to 56 for nOS-V docker 2023-05-02 17:47:57 +02:00
89049d0b1f Allow 5 concurrent builds in the gitlab-runner 2023-05-02 17:38:10 +02:00
6d16772d07 Simplify bash prompt 2023-04-28 18:15:04 +02:00
e37f9e2b0f Rollback to bash as default shell
Zsh doesn't behave properly, it needs further configuration.
2023-04-28 17:59:19 +02:00
9767238c76 Use pmix by default in slurm 2023-04-28 17:07:48 +02:00
a5a0fd9b6f Increase locked memory to 1 GiB 2023-04-28 12:34:51 +02:00
be69070f61 Use the latest kernel 2023-04-28 11:51:38 +02:00
53f6dcec8d Disable osnoise and hwlat tracer for now
Reuse nix cache to avoid rebuilding the kernel.
2023-04-28 11:19:47 +02:00
87c4521de3 Update nixpkgs to nixos-unstable 2023-04-28 11:18:37 +02:00
461d6d2f34 Update nixpkgs 2023-04-28 11:13:46 +02:00
ef2ffa61c3 Update ib interface name in xeon02
It seems to be plugged in another PCI port
2023-04-27 18:29:32 +02:00
c0b23ad450 Add steps in install documentation 2023-04-27 17:30:53 +02:00
f12ba9f8b0 Add minimal netboot module to build kexec image 2023-04-27 16:36:15 +02:00
a211e9ebee Add xeon02 configuration 2023-04-27 16:28:12 +02:00
5dbbb27c43 Refactor slurm configuration into compute/control 2023-04-27 16:27:04 +02:00
69bb2128db Lock flakes and add inputs 2023-04-27 13:52:59 +02:00
de7cae6208 Test flakes 2023-04-26 14:27:02 +02:00
de4ac8cbd6 Enable slurm in xeon01 2023-04-26 14:10:36 +02:00
e1dcad50d0 Use xeon07 as control machine 2023-04-26 14:10:36 +02:00
0120be66fb Remove xeon07 overlay to load upstream slurm 2023-04-26 14:10:36 +02:00
6cb079a44e Add script to rebuild configuration 2023-04-26 14:09:23 +02:00
a5449067a7 Add configuration for xeon01 2023-04-26 11:44:00 +00:00
1009736d81 Load overlays from /config 2023-04-26 11:44:00 +00:00
a94765e8ae Move net.nix to common 2023-04-26 11:44:00 +00:00
9630b23ce2 Remove host specific network options from net.nix 2023-04-26 11:44:00 +00:00
ed158ee87f Move ssh.nix to common 2023-04-26 11:44:00 +00:00
480dd95d9b Move overlays.nix to common 2023-04-26 11:44:00 +00:00
f7b18098b1 Move users.nix to common 2023-04-26 11:44:00 +00:00
c580254dde Move common options from configuration.nix 2023-04-26 11:44:00 +00:00
7e6c395ff8 Move the remaining hw config to common 2023-04-26 11:44:00 +00:00
6978677cb5 Move boot config to common/boot.nix 2023-04-26 11:44:00 +00:00
f5b4580dae Move filesystems config to common/fs.nix 2023-04-26 11:44:00 +00:00
035becd018 Use partition labels for / and swap 2023-04-26 11:44:00 +00:00
a7fb69ab92 Move fs.nix to common 2023-04-26 11:44:00 +00:00
733eb93f23 Move boot.nix to common 2023-04-26 11:44:00 +00:00
b60e821eaa Move disk selection to configuration.nix 2023-04-26 11:44:00 +00:00
f43d549294 Add common directory 2023-04-26 11:44:00 +00:00
848efdcb2d Move xeon07 configuration to a directory 2023-04-18 16:09:23 +02:00
0f7a0c3ac2 Add smartctl monitoring 2023-04-18 16:03:46 +02:00
40d0a16736 Allow wheel users to build derivations 2023-04-14 10:14:17 +02:00
59b8ba0e76 Use bscpkgs master 2023-04-11 21:22:00 +02:00
b5153009ea Run the garbage collector once a week 2023-04-11 21:21:22 +02:00
93a37b8353 Set EDITOR and add nix-diff 2023-04-11 20:36:54 +02:00
0ca649b715 Add nos-v gitlab runner 2023-04-11 12:59:21 +02:00
1b5e227095 Disable debug from gitlab runner 2023-04-11 12:58:24 +02:00
9310a7b0b9 Add gitlab-runner secrets using agenix 2023-04-11 12:47:52 +02:00
40b9beb86b Disable ethernet specific useDHCP
Is already configured by default for all interfaces.
2023-04-06 13:58:55 +02:00
72f9659430 Enable IPoIB and set the infiniband IP 2023-04-06 13:58:24 +02:00
8fe301203c Export nix store over nfs 2023-04-06 13:57:32 +02:00
a813ea6561 Enable gitlab runner monitoring 2023-04-06 13:56:52 +02:00
5d8b4e96b2 Add agenix tool 2023-04-05 17:04:42 +02:00
60ff89b7cc Add monitoring services 2023-04-05 17:00:01 +02:00
e6c35604bb Add some tools and use relaxed for build sandbox 2023-04-05 16:59:09 +02:00
d0dfba5c03 Remove commented docker settings 2023-04-05 16:56:27 +02:00
ccee2339a3 Add mio key 2023-04-05 16:56:05 +02:00
df371c950f Setup slurm and gitlab-runner 2023-04-03 12:51:44 +02:00
52eed708f0 Add initial configuration 2023-03-31 18:27:25 +02:00
154 changed files with 6004 additions and 270 deletions

15
.gitea/workflows/ci.yaml Normal file
View File

@ -0,0 +1,15 @@
name: CI
on:
push:
branches:
- master
pull_request:
branches:
- master
jobs:
build:all:
runs-on: native
steps:
- uses: https://gitea.com/ScMi1/checkout@v1.4
- run: nix build -L --no-link --print-out-paths .#bsc-ci.all

4
.gitignore vendored
View File

@ -1,3 +1,3 @@
source
result
**.swp
/result
/misc

View File

@ -3,4 +3,4 @@ build:bsc-ci.all:
tags:
- nix
script:
- nix build -L "jungle#bsc-ci.all" --override-input bscpkgs . -v --show-trace
- nix build -L --no-link --print-out-paths .#bsc-ci.all

View File

@ -1,4 +1,4 @@
Copyright (c) 2020-2021 Barcelona Supercomputing Center
Copyright (c) 2020-2025 Barcelona Supercomputing Center
Copyright (c) 2003-2020 Eelco Dolstra and the Nixpkgs/NixOS contributors
Permission is hereby granted, free of charge, to any person obtaining

View File

@ -1 +1,9 @@
Nix overlay with BSC packages.
# Jungle
This repository provides two components that can be used independently:
- A Nix overlay with packages used at BSC (formerly known as bscpkgs). Access
them directly with `nix shell .#<pkgname>`.
- NixOS configurations for jungle machines. Use `nixos-rebuild switch --flake .`
to upgrade the current machine.

176
doc/install.md Normal file
View File

@ -0,0 +1,176 @@
# Installing NixOS in a new node
This article shows the steps to install NixOS in a node following the
configuration of the repo.
## Enable the serial console
By default, the nodes have the serial console disabled in the GRUB and also boot
without the serial enabled.
To enable the serial console in the GRUB, set in /etc/default/grub the following
lines:
```
GRUB_TERMINAL="console serial"
GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=1"
```
To boot Linux with the serial enabled, so you can see the boot log and login via
serial set:
```
GRUB_CMDLINE_LINUX="console=ttyS0,115200n8 console=tty0"
```
Then update the grub config:
```
# grub2-mkconfig -o /boot/grub2/grub.cfg
```
And reboot.
## Prepare the disk
Create a main partition and label it `nixos` following [the manual][1].
[1]: https://nixos.org/manual/nixos/stable/index.html#sec-installation-manual-partitioning
```
# disk=/dev/sdX
# parted $disk -- mklabel msdos
# parted $disk -- mkpart primary 1MB -8GB
# parted $disk -- mkpart primary linux-swap -8GB 100%
# parted $disk -- set 1 boot on
```
Then create an ext4 filesystem, labeled `nixos`, where the system will be
installed. **Ensure that no other partition has the same label.**
```
# mkfs.ext4 -L nixos "${disk}1"
# mkswap -L swap "${disk}2"
# mount ${disk}1 /mnt
# lsblk -f $disk
NAME FSTYPE LABEL UUID MOUNTPOINT
sdX
`-sdX1 ext4 nixos 10d73b75-809c-4fa3-b99d-4fab2f0d0d8e /mnt
```
## Prepare nix and nixos-install
Mount the nix store from the hut node in read-only /nix.
```
# mkdir /nix
# mount -o ro hut:/nix /nix
```
Get the nix binary and nixos-install tool from hut:
```
# ssh hut 'readlink -f $(which nix)'
/nix/store/0sxbaj71c4c4n43qhdxm31f56gjalksw-nix-2.13.3/bin/nix
# ssh hut 'readlink -f $(which nixos-install)'
/nix/store/9yq8ps06ysr2pfiwiij39ny56yk3pdcs-nixos-install/bin/nixos-install
```
And add them to the PATH:
```
# export PATH=$PATH:/nix/store/0sxbaj71c4c4n43qhdxm31f56gjalksw-nix-2.13.3/bin
# export PATH=$PATH:/nix/store/9yq8ps06ysr2pfiwiij39ny56yk3pdcs-nixos-install/bin/
# nix --version
nix (Nix) 2.13.3
```
## Adapt owl configuration
Clone owl repo:
```
$ git clone git@bscpm03.bsc.es:rarias/owl.git
$ cd owl
```
Edit the configuration to your needs.
## Install from another Linux OS
Install nixOS into the storage drive.
```
# nixos-install --root /mnt --flake .#xeon0X
```
At this point, the nixOS grub has been installed into the nixos device, which
is not the default boot device. To keep both the old Linux and NixOS grubs, add
an entry into the old Linux grub to jump into the new grub.
```
# echo "
menuentry 'NixOS' {
insmod chain
search --no-floppy --label nixos --set root
configfile /boot/grub/grub.cfg
} " >> /etc/grub.d/40_custom
```
Rebuild grub config.
```
# grub2-mkconfig -o /boot/grub/grub.cfg
```
To boot into NixOS manually, reboot and select NixOS in the grub menu to boot
into NixOS.
To temporarily boot into NixOS only on the next reboot run:
```
# grub2-reboot 'NixOS'
```
To permanently boot into NixOS as the default boot OS, edit `/etc/default/grub/`:
```
GRUB_DEFAULT='NixOS'
```
And update grub.
```
# grub2-mkconfig -o /boot/grub/grub.cfg
```
## Build the nixos kexec image
```
# nix build .#nixosConfigurations.xeon02.config.system.build.kexecTree -v
```
## Chain NixOS in same disk with other systems
To install NixOS on a partition along another system which controls the GRUB,
first disable the grub device, so the GRUB is not installed in the disk by
NixOS (only the /boot files will be generated):
```
boot.loader.grub.device = "nodev";
```
Then add the following entry to the old GRUB configuration:
```
menuentry 'NixOS' {
insmod chain
search --no-floppy --label nixos --set root
configfile /boot/grub/grub.cfg
}
```
The partition with NixOS must have the label "nixos" for it to be found. New
system configuration entries will be stored in the GRUB configuration managed
by NixOS, so there is no need to change the old GRUB settings.

46
doc/trim.sh Executable file
View File

@ -0,0 +1,46 @@
#!/bin/sh
# Trims the jungle repository by moving the website to its own repository and
# removing it from jungle. It also removes big pdf files and kernel
# configurations so the jungle repository is small.
set -e
if [ -e oldjungle -o -e newjungle -o -e website ]; then
echo "remove oldjungle/, newjungle/ and website/ first"
exit 1
fi
# Clone the old jungle repo
git clone gitea@tent:rarias/jungle.git oldjungle
# First split the website into a new repository
mkdir website && git -C website init -b master
git-filter-repo \
--path web \
--subdirectory-filter web \
--source oldjungle \
--target website
# Then remove the website, pdf files and big kernel configs
mkdir newjungle && git -C newjungle init -b master
git-filter-repo \
--invert-paths \
--path web \
--path-glob 'doc*.pdf' \
--path-glob '**/kernel/configs/lockdep' \
--path-glob '**/kernel/configs/defconfig' \
--source oldjungle \
--target newjungle
set -x
du -sh oldjungle newjungle website
# 57M oldjungle
# 2,3M newjungle
# 6,4M website
du -sh --exclude=.git oldjungle newjungle website
# 30M oldjungle
# 700K newjungle
# 3,5M website

93
flake.lock generated
View File

@ -1,22 +1,107 @@
{
"nodes": {
"agenix": {
"inputs": {
"darwin": "darwin",
"home-manager": "home-manager",
"nixpkgs": [
"nixpkgs"
],
"systems": "systems"
},
"locked": {
"lastModified": 1750173260,
"narHash": "sha256-9P1FziAwl5+3edkfFcr5HeGtQUtrSdk/MksX39GieoA=",
"owner": "ryantm",
"repo": "agenix",
"rev": "531beac616433bac6f9e2a19feb8e99a22a66baf",
"type": "github"
},
"original": {
"owner": "ryantm",
"repo": "agenix",
"type": "github"
}
},
"darwin": {
"inputs": {
"nixpkgs": [
"agenix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1744478979,
"narHash": "sha256-dyN+teG9G82G+m+PX/aSAagkC+vUv0SgUw3XkPhQodQ=",
"owner": "lnl7",
"repo": "nix-darwin",
"rev": "43975d782b418ebf4969e9ccba82466728c2851b",
"type": "github"
},
"original": {
"owner": "lnl7",
"ref": "master",
"repo": "nix-darwin",
"type": "github"
}
},
"home-manager": {
"inputs": {
"nixpkgs": [
"agenix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1745494811,
"narHash": "sha256-YZCh2o9Ua1n9uCvrvi5pRxtuVNml8X2a03qIFfRKpFs=",
"owner": "nix-community",
"repo": "home-manager",
"rev": "abfad3d2958c9e6300a883bd443512c55dfeb1be",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "home-manager",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1752436162,
"narHash": "sha256-Kt1UIPi7kZqkSc5HVj6UY5YLHHEzPBkgpNUByuyxtlw=",
"path": "/nix/store/zk8v61cpk1wprp9ld5ayc1g5fq4pdkwv-source",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "dfcd5b901dbab46c9c6e80b265648481aafb01f8",
"type": "path"
"type": "github"
},
"original": {
"id": "nixpkgs",
"type": "indirect"
"owner": "NixOS",
"ref": "nixos-25.05",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"agenix": "agenix",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",

View File

@ -1,26 +1,52 @@
{
inputs.nixpkgs.url = "nixpkgs";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
agenix.url = "github:ryantm/agenix";
agenix.inputs.nixpkgs.follows = "nixpkgs";
};
outputs = { self, nixpkgs, ...}:
let
# For now we only support x86
system = "x86_64-linux";
pkgs = import nixpkgs {
inherit system;
overlays = [ self.overlays.default ];
};
in
{
bscOverlay = import ./overlay.nix;
overlays.default = self.bscOverlay;
# full nixpkgs with our overlay applied
legacyPackages.${system} = pkgs;
hydraJobs = {
inherit (self.legacyPackages.${system}.bsc-ci) test pkgs;
};
# propagate nixpkgs lib, so we can do bscpkgs.lib
inherit (nixpkgs) lib;
outputs = { self, nixpkgs, agenix, ... }:
let
mkConf = name: nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
specialArgs = { inherit nixpkgs agenix; theFlake = self; };
modules = [ "${self.outPath}/m/${name}/configuration.nix" ];
};
# For now we only support x86
system = "x86_64-linux";
pkgs = import nixpkgs {
inherit system;
overlays = [ self.overlays.default ];
config.allowUnfree = true;
};
in
{
nixosConfigurations = {
hut = mkConf "hut";
tent = mkConf "tent";
owl1 = mkConf "owl1";
owl2 = mkConf "owl2";
eudy = mkConf "eudy";
koro = mkConf "koro";
bay = mkConf "bay";
lake2 = mkConf "lake2";
raccoon = mkConf "raccoon";
fox = mkConf "fox";
apex = mkConf "apex";
weasel = mkConf "weasel";
};
bscOverlay = import ./overlay.nix;
overlays.default = self.bscOverlay;
# full nixpkgs with our overlay applied
legacyPackages.${system} = pkgs;
hydraJobs = {
inherit (self.legacyPackages.${system}.bsc-ci) tests pkgs cross;
};
# propagate nixpkgs lib, so we can do bscpkgs.lib
inherit (nixpkgs) lib;
};
}

37
keys.nix Normal file
View File

@ -0,0 +1,37 @@
# As agenix needs to parse the secrets from a standalone .nix file, we describe
# here all the public keys
rec {
hosts = {
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1 hut";
owl1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv owl1";
owl2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK owl2";
eudy = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG eudy";
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
apex = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBvUFjSfoxXnKwXhEFXx5ckRKJ0oewJ82mRitSMNMKjh apex";
weasel = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLJrQ8BF6KcweQV8pLkSbFT+tbDxSG9qxrdQE65zJZp weasel";
raccoon = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGNQttFvL0dNEyy7klIhLoK4xXOeM2/K9R7lPMTG3qvK raccoon";
};
hostGroup = with hosts; rec {
compute = [ owl1 owl2 fox raccoon ];
playground = [ eudy koro weasel ];
storage = [ bay lake2 ];
monitor = [ hut ];
login = [ apex ];
system = storage ++ monitor ++ login;
safe = system ++ compute;
all = safe ++ playground;
};
admins = {
"rarias@hut" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
"rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
"rarias@fox" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDSbw3REAKECV7E2c/e2XJITudJQWq2qDSe2N1JHqHZd rarias@fox";
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
};
}

69
m/apex/configuration.nix Normal file
View File

@ -0,0 +1,69 @@
{ lib, config, pkgs, ... }:
{
imports = [
../common/xeon.nix
../common/ssf/hosts.nix
../module/ceph.nix
../module/hut-substituter.nix
../module/slurm-server.nix
./nfs.nix
./wireguard.nix
];
# Don't install grub MBR for now
boot.loader.grub.device = "nodev";
boot.initrd.kernelModules = [
"megaraid_sas" # For HW RAID
];
environment.systemPackages = with pkgs; [
storcli # To manage HW RAID
];
fileSystems."/home" = {
device = "/dev/disk/by-label/home";
fsType = "ext4";
};
# No swap, there is plenty of RAM
swapDevices = lib.mkForce [];
networking = {
hostName = "apex";
defaultGateway = "84.88.53.233";
nameservers = [ "8.8.8.8" ];
# Public facing interface
interfaces.eno1.ipv4.addresses = [ {
address = "84.88.53.236";
prefixLength = 29;
} ];
# Internal LAN to our Ethernet switch
interfaces.eno2.ipv4.addresses = [ {
address = "10.0.40.30";
prefixLength = 24;
} ];
# Infiniband over Omnipath switch (disconnected for now)
# interfaces.ibp5s0 = {};
nat = {
enable = true;
internalInterfaces = [ "eno2" ];
externalInterface = "eno1";
};
};
networking.firewall = {
extraCommands = ''
# Blackhole BSC vulnerability scanner (OpenVAS) as it is spamming our
# logs. Insert as first position so we also protect SSH.
iptables -I nixos-fw 1 -p tcp -s 192.168.8.16 -j nixos-fw-refuse
# Same with opsmonweb01.bsc.es which seems to be trying to access via SSH
iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
'';
};
}

48
m/apex/nfs.nix Normal file
View File

@ -0,0 +1,48 @@
{ ... }:
{
services.nfs.server = {
enable = true;
lockdPort = 4001;
mountdPort = 4002;
statdPort = 4000;
exports = ''
/home 10.0.40.0/24(rw,async,no_subtree_check,no_root_squash)
/home 10.106.0.0/24(rw,async,no_subtree_check,no_root_squash)
'';
};
networking.firewall = {
# Check with `rpcinfo -p`
extraCommands = ''
# Accept NFS traffic from compute nodes but not from the outside
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 111 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 2049 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 4000 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
# Same but UDP
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 111 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 2049 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4000 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
# Accept NFS traffic from wg0
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
# Same but UDP
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
'';
};
}

42
m/apex/wireguard.nix Normal file
View File

@ -0,0 +1,42 @@
{ config, ... }:
{
  # WireGuard tunnel for apex, connecting it to fox and raccoon.
  networking = {
    # Open the WireGuard listen port.
    firewall.allowedUDPPorts = [ 666 ];

    # "wg0" is the network interface name. You can name the interface arbitrarily.
    wireguard = {
      enable = true;
      interfaces.wg0 = {
        ips = [ "10.106.0.30/24" ];
        listenPort = 666;
        # Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
        privateKeyFile = config.age.secrets.wgApex.path;
        peers = [
          {
            name = "fox";
            publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
            allowedIPs = [ "10.106.0.1/32" ];
            endpoint = "fox.ac.upc.edu:666";
            # Send keepalives every 25 seconds. Important to keep NAT tables alive.
            persistentKeepalive = 25;
          }
          {
            name = "raccoon";
            publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
            allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
          }
        ];
      };
    };

    # Static names for the hosts reachable through the tunnel.
    hosts = {
      "10.106.0.1" = [ "fox" ];
      "10.106.0.236" = [ "raccoon" ];
      "10.0.44.4" = [ "tent" ];
    };
  };

  # Private key for wg0, decrypted by agenix at activation time.
  age.secrets.wgApex.file = ../../secrets/wg-apex.age;
}

108
m/bay/configuration.nix Normal file
View File

@ -0,0 +1,108 @@
{ config, pkgs, lib, ... }:
{
  # Configuration for "bay", the Ceph storage node of the SSF rack.
  # Runs the monitor, manager, MDS and OSD daemons of the cluster.
  imports = [
    ../common/ssf.nix
    ../module/hut-substituter.nix
    ../module/monitoring.nix
  ];

  # Select the disk using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53562d";

  boot.kernel.sysctl = {
    # Restrict ptrace to child processes (overrides the relaxed "0" set in base).
    "kernel.yama.ptrace_scope" = lib.mkForce "1";
  };

  # Ceph CLI tools for administration.
  environment.systemPackages = with pkgs; [
    ceph
  ];

  networking = {
    hostName = "bay";
    # Ethernet interface.
    interfaces.eno1.ipv4.addresses = [ {
      address = "10.0.40.40";
      prefixLength = 24;
    } ];
    # InfiniBand (IPoIB) interface.
    interfaces.ibp5s0.ipv4.addresses = [ {
      address = "10.0.42.40";
      prefixLength = 24;
    } ];
    firewall = {
      extraCommands = ''
        # Accept all incoming TCP traffic from lake2
        iptables -A nixos-fw -p tcp -s lake2 -j nixos-fw-accept
        # Accept monitoring requests from hut
        iptables -A nixos-fw -p tcp -s hut -m multiport --dport 9283,9002 -j nixos-fw-accept
        # Accept all Ceph traffic from the local network
        iptables -A nixos-fw -p tcp -s 10.0.40.0/24 -m multiport --dport 3300,6789,6800:7568 -j nixos-fw-accept
      '';
    };
  };

  services.ceph = {
    enable = true;
    global = {
      fsid = "9c8d06e0-485f-4aaf-b16b-06d6daf1232b";
      monHost = "10.0.40.40";
      monInitialMembers = "bay";
      # NOTE(review): this is a host address with a /24 suffix; the network
      # would conventionally be written 10.0.40.0/24 — confirm intentional.
      clusterNetwork = "10.0.40.40/24"; # Use Ethernet only
    };
    extraConfig = {
      # Only log to stderr so it appears in the journal
      "log_file" = "/dev/null";
      "mon_cluster_log_file" = "/dev/null";
      "log_to_stderr" = "true";
      "err_to_stderr" = "true";
      "log_to_file" = "false";
    };
    mds = {
      enable = true;
      daemons = [ "mds0" "mds1" ];
      extraConfig = {
        "host" = "bay";
      };
    };
    mgr = {
      enable = true;
      daemons = [ "bay" ];
    };
    mon = {
      enable = true;
      daemons = [ "bay" ];
    };
    osd = {
      enable = true;
      # One daemon per NVME disk
      daemons = [ "0" "1" "2" "3" ];
      extraConfig = {
        "osd crush chooseleaf type" = "0";
        "osd journal size" = "10000";
        "osd pool default min size" = "2";
        "osd pool default pg num" = "200";
        "osd pool default pgp num" = "200";
        "osd pool default size" = "3";
      };
    };
  };

  # Missing service for volumes, see:
  # https://www.reddit.com/r/ceph/comments/14otjyo/comment/jrd69vt/
  systemd.services.ceph-volume = {
    enable = true;
    description = "Ceph Volume activation";
    unitConfig = {
      Type = "oneshot";
      After = "local-fs.target";
      Wants = "local-fs.target";
    };
    path = [ pkgs.ceph pkgs.util-linux pkgs.lvm2 pkgs.cryptsetup ];
    serviceConfig = {
      KillMode = "none";
      Environment = "CEPH_VOLUME_TIMEOUT=10000";
      # Activate all LVM-backed OSDs without per-OSD systemd units.
      ExecStart = "/bin/sh -c 'timeout $CEPH_VOLUME_TIMEOUT ${pkgs.ceph}/bin/ceph-volume lvm activate --all --no-systemd'";
      TimeoutSec = "0";
    };
    wantedBy = [ "multi-user.target" ];
  };
}

21
m/common/base.nix Normal file
View File

@ -0,0 +1,21 @@
{
  # All machines should include this profile.
  # Includes the basic configuration for an Intel server.
  # Each imported file configures one orthogonal concern of the base system.
  imports = [
    ./base/agenix.nix
    ./base/always-power-on.nix
    ./base/august-shutdown.nix
    ./base/boot.nix
    ./base/env.nix
    ./base/fs.nix
    ./base/hw.nix
    ./base/net.nix
    ./base/nix.nix
    ./base/ntp.nix
    ./base/rev.nix
    ./base/ssh.nix
    ./base/users.nix
    ./base/watchdog.nix
    ./base/zsh.nix
  ];
}

9
m/common/base/agenix.nix Normal file
View File

@ -0,0 +1,9 @@
{ agenix, pkgs, ... }:
{
  # Load the agenix NixOS module so the `age.secrets.*` options are available.
  imports = [ agenix.nixosModules.default ];
  # Provide the agenix CLI to manage secrets. Select the package for the
  # host platform instead of hard-coding x86_64-linux.
  environment.systemPackages = [
    agenix.packages.${pkgs.system}.default
  ];
}

View File

@ -0,0 +1,8 @@
{
  imports = [
    ../../module/power-policy.nix
  ];
  # Turn on as soon as we have power, so machines come back by themselves
  # after an outage without manual IPMI intervention.
  power.policy = "always-on";
}

View File

@ -0,0 +1,14 @@
{
  # Shutdown all machines on August 3rd at 22:00, so we can protect the
  # hardware from spurious electrical peaks on the yearly electrical cut for
  # maintenance that starts on August 4th.
  systemd.timers.august-shutdown = {
    description = "Shutdown on August 3rd for maintenance";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "*-08-03 22:00:00";
      # Spread the poweroff over 10 minutes so machines don't drop at once.
      RandomizedDelaySec = "10min";
      Unit = "systemd-poweroff.service";
    };
  };
}

37
m/common/base/boot.nix Normal file
View File

@ -0,0 +1,37 @@
{ lib, pkgs, ... }:
{
  # Use the GRUB 2 boot loader.
  boot.loader.grub.enable = true;
  # Enable GRUB2 serial console
  boot.loader.grub.extraConfig = ''
    serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
    terminal_input --append serial
    terminal_output --append serial
  '';
  boot.kernel.sysctl = {
    # Allow unprivileged use of perf counters; mkDefault so hosts can override.
    "kernel.perf_event_paranoid" = lib.mkDefault "-1";
    # Allow ptracing (i.e. attach with GDB) any process of the same user, see:
    # https://www.kernel.org/doc/Documentation/security/Yama.txt
    "kernel.yama.ptrace_scope" = "0";
  };
  boot.kernelPackages = pkgs.linuxPackages_latest;
  # Kept for reference: tracers for OS-noise measurements can be enabled
  # by uncommenting this patch set.
  #boot.kernelPatches = lib.singleton {
  #  name = "osnoise-tracer";
  #  patch = null;
  #  extraStructuredConfig = with lib.kernel; {
  #    OSNOISE_TRACER = yes;
  #    HWLAT_TRACER = yes;
  #  };
  #};
  boot.initrd.availableKernelModules = [ "ahci" "xhci_pci" "ehci_pci" "nvme" "usbhid" "sd_mod" ];
  boot.initrd.kernelModules = [ ];
  boot.kernelModules = [ "kvm-intel" ];
  boot.extraModulePackages = [ ];
}

37
m/common/base/env.nix Normal file
View File

@ -0,0 +1,37 @@
{ pkgs, config, ... }:
{
  # Common command-line tools available on every machine.
  environment.systemPackages = with pkgs; [
    vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
    nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
    ncdu config.boot.kernelPackages.perf ldns pv
    # From bscpkgs overlay
    osumb
  ];
  programs.direnv.enable = true;
  # Increase limits
  security.pam.loginLimits = [
    {
      domain = "*";
      type = "-";
      item = "memlock";
      value = "1048576"; # 1 GiB of mem locked
    }
  ];
  environment.enableAllTerminfo = true;
  environment.variables = {
    EDITOR = "vim";
    VISUAL = "vim";
  };
  # Minimal prompt: hostname followed by $ (or # for root).
  programs.bash.promptInit = ''
    PS1="\h\\$ "
  '';
  time.timeZone = "Europe/Madrid";
  i18n.defaultLocale = "en_DK.UTF-8";
}

24
m/common/base/fs.nix Normal file
View File

@ -0,0 +1,24 @@
{ ... }:
{
  # Root filesystem lives on the disk labeled "nixos".
  fileSystems."/" = {
    device = "/dev/disk/by-label/nixos";
    fsType = "ext4";
  };

  # Trim unused blocks weekly
  services.fstrim.enable = true;

  # Swap on the partition labeled "swap".
  swapDevices = [
    { device = "/dev/disk/by-label/swap"; }
  ];

  # Tracing: mount tracefs so the kernel tracing interface is reachable.
  fileSystems."/sys/kernel/tracing" = {
    device = "none";
    fsType = "tracefs";
  };

  # Mount a tmpfs into /tmp
  boot.tmp.useTmpfs = true;
}

14
m/common/base/hw.nix Normal file
View File

@ -0,0 +1,14 @@
# Do not modify this file! It was generated by nixos-generate-config
# and may be overwritten by future invocations. Please make changes
# to /etc/nixos/configuration.nix instead.
{ config, lib, pkgs, modulesPath, ... }:
{
  imports =
    [ (modulesPath + "/installer/scan/not-detected.nix")
    ];
  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
  # Defaults only (mkDefault) so machines can pick another governor.
  powerManagement.cpuFreqGovernor = lib.mkDefault "powersave";
  hardware.cpu.intel.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
}

23
m/common/base/net.nix Normal file
View File

@ -0,0 +1,23 @@
# Removed the unused `pkgs` module argument.
{ lib, ... }:
{
  networking = {
    # The cluster is IPv4-only.
    enableIPv6 = false;
    # Addresses are configured statically per machine.
    useDHCP = false;
    firewall = {
      enable = true;
      # Only SSH is open by default; machines open extra ports themselves.
      allowedTCPPorts = [ 22 ];
    };
    # Make sure we use iptables
    nftables.enable = lib.mkForce false;
    # Static name resolution for infrastructure hosts.
    hosts = {
      "84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
      "84.88.51.142" = [ "raccoon-ipmi" ];
      "192.168.11.12" = [ "bscpm04.bsc.es" ];
      "192.168.11.15" = [ "gitlab-internal.bsc.es" ];
    };
  };
}

59
m/common/base/nix.nix Normal file
View File

@ -0,0 +1,59 @@
{ pkgs, nixpkgs, theFlake, ... }:
{
  # Apply the project overlay to every machine.
  nixpkgs.overlays = [
    (import ../../../overlay.nix)
  ];
  nixpkgs.config.allowUnfree = true;
  nix = {
    # Pin the legacy nixPath and the flake registry to the same inputs used
    # to build this configuration, so `nix repl`/`nix-shell` match the system.
    nixPath = [
      "nixpkgs=${nixpkgs}"
      "jungle=${theFlake.outPath}"
    ];
    registry = {
      nixpkgs.flake = nixpkgs;
      jungle.flake = theFlake;
    };
    settings = {
      experimental-features = [ "nix-command" "flakes" ];
      sandbox = "relaxed";
      trusted-users = [ "@wheel" ];
      # Disable the global online registry by replacing it with an empty one.
      flake-registry = pkgs.writeText "global-registry.json"
        ''{"flakes":[],"version":2}'';
      keep-outputs = true;
    };
    gc = {
      automatic = true;
      dates = "weekly";
      options = "--delete-older-than 30d";
    };
  };
  # The nix-gc.service can begin its execution *before* /home is mounted,
  # causing it to remove all gcroots considering them as stale, as it cannot
  # access the symlink. To prevent this problem, we force the service to wait
  # until /home is mounted as well as other remote FS like /ceph.
  systemd.services.nix-gc = {
    # Start remote-fs.target if not already being started and fail if it fails
    # to start. It will also be stopped if the remote-fs.target fails after
    # starting successfully.
    bindsTo = [ "remote-fs.target" ];
    # Wait until remote-fs.target fully starts before starting this one.
    after = [ "remote-fs.target"];
    # Ensure we can access a remote path inside /home
    unitConfig.ConditionPathExists = "/home/Computational";
  };
  # This value determines the NixOS release from which the default
  # settings for stateful data, like file locations and database versions
  # on your system were taken. It's perfectly fine and recommended to leave
  # this value at the release version of the first install of this system.
  # Before changing this value read the documentation for this option
  # (e.g. man configuration.nix or on https://nixos.org/nixos/options.html).
  system.stateVersion = "22.11"; # Did you read the comment?
}

9
m/common/base/ntp.nix Normal file
View File

@ -0,0 +1,9 @@
# Removed the unused `pkgs` module argument.
{ ... }:
{
  # Keep the system clock in sync over NTP.
  services.ntp.enable = true;
  # Use the NTP server at BSC, as we don't have direct access
  # to the outside world
  networking.timeServers = [ "84.88.52.36" ];
}

21
m/common/base/rev.nix Normal file
View File

@ -0,0 +1,21 @@
{ theFlake, ... }:
let
  # Prevent building a configuration without revision: a flake only carries
  # `rev` when the Git tree is clean, so a dirty tree aborts the build here.
  rev =
    if theFlake ? rev
    then theFlake.rev
    else throw "Refusing to build from a dirty Git tree!";
in
{
  # Save the commit of the config in /etc/configrev
  environment.etc.configrev.text = rev + "\n";

  # Keep a log with the config over time
  system.activationScripts.configRevLog.text = ''
    BOOTED=$(cat /run/booted-system/etc/configrev 2>/dev/null || echo unknown)
    CURRENT=$(cat /run/current-system/etc/configrev 2>/dev/null || echo unknown)
    NEXT=${rev}
    DATENOW=$(date --iso-8601=seconds)
    echo "$DATENOW booted=$BOOTED current=$CURRENT next=$NEXT" >> /var/configrev.log
  '';

  system.configurationRevision = rev;
}

18
m/common/base/ssh.nix Normal file
View File

@ -0,0 +1,18 @@
{ lib, ... }:
let
  # Cluster-wide host public keys, declared in keys.nix.
  keys = import ../../../keys.nix;
  # Turn the attrset of host -> key into the knownHosts option shape.
  hostsKeys = lib.mapAttrs (name: value: { publicKey = value; }) keys.hosts;
in
{
  # Enable the OpenSSH daemon.
  services.openssh.enable = true;
  # Pin the host keys of our machines plus the external git/login servers,
  # so SSH never prompts to trust them interactively.
  programs.ssh.knownHosts = hostsKeys // {
    "gitlab-internal.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF9arsAOSRB06hdy71oTvJHG2Mg8zfebADxpvc37lZo3";
    "bscpm03.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM2NuSUPsEhqz1j5b4Gqd+MWFnRqyqY57+xMvBUqHYUS";
    "bscpm04.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPx4mC0etyyjYUT2Ztc/bs4ZXSbVMrogs1ZTP924PDgT";
    "glogin1.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFsHsZGCrzpd4QDVn5xoDOtrNBkb0ylxKGlyBt6l9qCz";
    "glogin2.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFsHsZGCrzpd4QDVn5xoDOtrNBkb0ylxKGlyBt6l9qCz";
  };
}

190
m/common/base/users.nix Normal file
View File

@ -0,0 +1,190 @@
{ pkgs, ... }:
{
  # Cluster user accounts. Users under `users.users` exist on every machine;
  # users under `users.jungleUsers` carry a `hosts` list and are only created
  # on the machines named there (see module/jungle-users.nix).
  imports = [
    ../../module/jungle-users.nix
  ];
  users = {
    # All accounts are declared here; no imperative user management.
    mutableUsers = false;
    users = {
      # Generate hashedPassword with `mkpasswd -m sha-512`
      root.openssh.authorizedKeys.keys = [
        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKBOf4r4lzQfyO0bx5BaREePREw8Zw5+xYgZhXwOZoBO ram@hop"
        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINa0tvnNgwkc5xOwd6xTtaIdFi5jv0j2FrE7jl5MTLoE ram@mio"
        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF3zeB5KSimMBAjvzsp1GCkepVaquVZGPYwRIzyzaCba aleix@bsc"
        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut"
      ];
      rarias = {
        uid = 1880;
        isNormalUser = true;
        # Keep user services running after logout.
        linger = true;
        home = "/home/Computational/rarias";
        description = "Rodrigo Arias";
        group = "Computational";
        extraGroups = [ "wheel" ];
        hashedPassword = "$6$u06tkCy13enReBsb$xiI.twRvvTfH4jdS3s68NZ7U9PSbGKs5.LXU/UgoawSwNWhZo2hRAjNL5qG0/lAckzcho2LjD0r3NfVPvthY6/";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKBOf4r4lzQfyO0bx5BaREePREw8Zw5+xYgZhXwOZoBO ram@hop"
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINa0tvnNgwkc5xOwd6xTtaIdFi5jv0j2FrE7jl5MTLoE ram@mio"
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGYcXIxe0poOEGLpk8NjiRozls7fMRX0N3j3Ar94U+Gl rarias@hal"
        ];
        shell = pkgs.zsh;
      };
      arocanon = {
        uid = 1042;
        isNormalUser = true;
        home = "/home/Computational/arocanon";
        description = "Aleix Roca";
        group = "Computational";
        extraGroups = [ "wheel" "tracing" ];
        hashedPassword = "$6$hliZiW4tULC/tH7p$pqZarwJkNZ7vS0G5llWQKx08UFG9DxDYgad7jplMD8WkZh5k58i4dfPoWtnEShfjTO6JHiIin05ny5lmSXzGM/";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF3zeB5KSimMBAjvzsp1GCkepVaquVZGPYwRIzyzaCba aleix@bsc"
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGdphWxLAEekicZ/WBrvP7phMyxKSSuLAZBovNX+hZXQ aleix@kerneland"
        ];
      };
    };
    # Accounts restricted to the machines listed in their `hosts` attribute.
    jungleUsers = {
      rpenacob = {
        uid = 2761;
        isNormalUser = true;
        home = "/home/Computational/rpenacob";
        description = "Raúl Peñacoba";
        group = "Computational";
        hosts = [ "apex" "owl1" "owl2" "hut" "tent" "fox" ];
        hashedPassword = "$6$TZm3bDIFyPrMhj1E$uEDXoYYd1z2Wd5mMPfh3DZAjP7ztVjJ4ezIcn82C0ImqafPA.AnTmcVftHEzLB3tbe2O4SxDyPSDEQgJ4GOtj/";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFYfXg37mauGeurqsLpedgA2XQ9d4Nm0ZGo/hI1f7wwH rpenacob@bsc"
        ];
      };
      anavarro = {
        uid = 1037;
        isNormalUser = true;
        home = "/home/Computational/anavarro";
        description = "Antoni Navarro";
        group = "Computational";
        hosts = [ "apex" "hut" "tent" "raccoon" "fox" "weasel" ];
        hashedPassword = "$6$EgturvVYXlKgP43g$gTN78LLHIhaF8hsrCXD.O6mKnZSASWSJmCyndTX8QBWT6wTlUhcWVAKz65lFJPXjlJA4u7G1ydYQ0GG6Wk07b1";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMsbM21uepnJwPrRe6jYFz8zrZ6AYMtSEvvt4c9spmFP toni@delltoni"
        ];
      };
      abonerib = {
        uid = 4541;
        isNormalUser = true;
        home = "/home/Computational/abonerib";
        description = "Aleix Boné";
        group = "Computational";
        hosts = [ "apex" "owl1" "owl2" "hut" "tent" "raccoon" "fox" "weasel" ];
        hashedPassword = "$6$V1EQWJr474whv7XJ$OfJ0wueM2l.dgiJiiah0Tip9ITcJ7S7qDvtSycsiQ43QBFyP4lU0e0HaXWps85nqB4TypttYR4hNLoz3bz662/";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIFiqXqt88VuUfyANkZyLJNiuroIITaGlOOTMhVDKjf abonerib@bsc"
        ];
      };
      vlopez = {
        uid = 4334;
        isNormalUser = true;
        home = "/home/Computational/vlopez";
        description = "Victor López";
        group = "Computational";
        hosts = [ "apex" "koro" ];
        hashedPassword = "$6$0ZBkgIYE/renVqtt$1uWlJsb0FEezRVNoETTzZMx4X2SvWiOsKvi0ppWCRqI66S6TqMBXBdP4fcQyvRRBt0e4Z7opZIvvITBsEtO0f0";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGMwlUZRf9jfG666Qa5Sb+KtEhXqkiMlBV2su3x/dXHq victor@arch"
        ];
      };
      dbautist = {
        uid = 5649;
        isNormalUser = true;
        home = "/home/Computational/dbautist";
        description = "Dylan Bautista Cases";
        group = "Computational";
        hosts = [ "apex" "hut" "tent" "raccoon" ];
        hashedPassword = "$6$a2lpzMRVkG9nSgIm$12G6.ka0sFX1YimqJkBAjbvhRKZ.Hl090B27pdbnQOW0wzyxVWySWhyDDCILjQELky.HKYl9gqOeVXW49nW7q/";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"
        ];
      };
      dalvare1 = {
        uid = 2758;
        isNormalUser = true;
        home = "/home/Computational/dalvare1";
        description = "David Álvarez";
        group = "Computational";
        hosts = [ "apex" "hut" "tent" "fox" ];
        hashedPassword = "$6$mpyIsV3mdq.rK8$FvfZdRH5OcEkUt5PnIUijWyUYZvB1SgeqxpJ2p91TTe.3eQIDTcLEQ5rxeg.e5IEXAZHHQ/aMsR5kPEujEghx0";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGEfy6F4rF80r4Cpo2H5xaWqhuUZzUsVsILSKGJzt5jF dalvare1@ssfhead"
        ];
      };
      varcila = {
        uid = 5650;
        isNormalUser = true;
        home = "/home/Computational/varcila";
        description = "Vincent Arcila";
        group = "Computational";
        hosts = [ "apex" "hut" "tent" "fox" ];
        hashedPassword = "$6$oB0Tcn99DcM4Ch$Vn1A0ulLTn/8B2oFPi9wWl/NOsJzaFAWjqekwcuC9sMC7cgxEVb.Nk5XSzQ2xzYcNe5MLtmzkVYnRS1CqP39Y0";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGt0ESYxekBiHJQowmKpfdouw0hVm3N7tUMtAaeLejK vincent@varch"
        ];
      };
      pmartin1 = {
        # Arbitrary UID but large so it doesn't collide with other users on ssfhead.
        uid = 9652;
        isNormalUser = true;
        home = "/home/Computational/pmartin1";
        description = "Pedro J. Martinez-Ferrer";
        group = "Computational";
        hosts = [ "fox" ];
        hashedPassword = "$6$nIgDMGnt4YIZl3G.$.JQ2jXLtDPRKsbsJfJAXdSvjDIzRrg7tNNjPkLPq3KJQhMjfDXRUvzagUHUU2TrE2hHM8/6uq8ex0UdxQ0ysl.";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
        ];
      };
      csiringo = {
        uid = 9653;
        isNormalUser = true;
        home = "/home/Computational/csiringo";
        description = "Cesare Siringo";
        group = "Computational";
        # No hosts yet: account is declared but not created anywhere.
        hosts = [ ];
        hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
        ];
      };
      acinca = {
        uid = 9654;
        isNormalUser = true;
        home = "/home/Computational/acinca";
        description = "Arnau Cinca";
        group = "Computational";
        hosts = [ "apex" "hut" "fox" "owl1" "owl2" ];
        hashedPassword = "$6$S6PUeRpdzYlidxzI$szyvWejQ4hEN76yBYhp1diVO5ew1FFg.cz4lKiXt2Idy4XdpifwrFTCIzLTs5dvYlR62m7ekA5MrhcVxR5F/q/";
        openssh.authorizedKeys.keys = [
          "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFmMqKqPg4uocNOr3O41kLbZMOMJn3m2ZdN1JvTR96z3 bsccns@arnau-bsc"
        ];
      };
    };
    groups = {
      # Fixed GID so it matches the NFS-exported home directories.
      Computational = { gid = 564; };
      # Members may read kernel tracing data (see eudy/kernel/perf.nix).
      tracing = { };
    };
  };
}

View File

@ -0,0 +1,9 @@
{ ... }:
{
  # The boards have a BMC watchdog controlled by IPMI; loading this module
  # exposes it as the system watchdog device.
  boot.kernelModules = [ "ipmi_watchdog" ];
  # Enable systemd watchdog with 30 s interval: systemd pets the watchdog,
  # so a hung kernel/PID 1 gets the machine reset by the BMC.
  systemd.watchdog.runtimeTime = "30s";
}

91
m/common/base/zsh.nix Normal file
View File

@ -0,0 +1,91 @@
{ pkgs, ... }:
{
  # Zsh as a first-class shell: completions, history search and sane keymaps.
  environment.systemPackages = with pkgs; [
    zsh-completions
    nix-zsh-completions
  ];
  programs.zsh = {
    enable = true;
    histSize = 1000000;
    shellInit = ''
      # Disable new user prompt
      if [ ! -e ~/.zshrc ]; then
        touch ~/.zshrc
      fi
    '';
    promptInit = ''
      # Note that to manually override this in ~/.zshrc you should run `prompt off`
      # before setting your PS1 and etc. Otherwise this will likely to interact with
      # your ~/.zshrc configuration in unexpected ways as the default prompt sets
      # a lot of different prompt variables.
      autoload -U promptinit && promptinit && prompt default && setopt prompt_sp
    '';
    # Taken from Ulli Kehrle config:
    # https://git.hrnz.li/Ulli/nixos/src/commit/2e203b8d8d671f4e3ced0f1744a51d5c6ee19846/profiles/shell.nix#L199-L205
    interactiveShellInit = ''
      source "${pkgs.zsh-history-substring-search}/share/zsh-history-substring-search/zsh-history-substring-search.zsh"
      # Save history immediately, but only load it when the shell starts
      setopt inc_append_history
      # dircolors doesn't support alacritty:
      # https://lists.gnu.org/archive/html/bug-coreutils/2019-05/msg00029.html
      export LS_COLORS='rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.swp=00;90:*.tmp=00;90:*.dpkg-dist=00;90:*.dpkg-old=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:'
      # From Arch Linux and GRML
      bindkey "^R" history-incremental-pattern-search-backward
      bindkey "^S" history-incremental-pattern-search-forward
      # Auto rehash for new binaries
      zstyle ':completion:*' rehash true
      # show a nice menu with the matches
      zstyle ':completion:*' menu yes select
      bindkey '^[OA' history-substring-search-up # Up
      bindkey '^[[A' history-substring-search-up # Up
      bindkey '^[OB' history-substring-search-down # Down
      bindkey '^[[B' history-substring-search-down # Down
      bindkey '\e[1~' beginning-of-line # Home
      bindkey '\e[7~' beginning-of-line # Home
      bindkey '\e[H' beginning-of-line # Home
      bindkey '\eOH' beginning-of-line # Home
      bindkey '\e[4~' end-of-line # End
      bindkey '\e[8~' end-of-line # End
      bindkey '\e[F' end-of-line # End (fixed: the sequence had a stray trailing space)
      bindkey '\eOF' end-of-line # End
      bindkey '^?' backward-delete-char # Backspace
      bindkey '\e[3~' delete-char # Del
      # bindkey '\e[3;5~' delete-char # sometimes Del, sometimes C-Del
      bindkey '\e[2~' overwrite-mode # Ins
      bindkey '^H' backward-kill-word # C-Backspace
      bindkey '5~' kill-word # C-Del
      bindkey '^[[3;5~' kill-word # C-Del
      bindkey '^[[3^' kill-word # C-Del
      bindkey "^[[1;5H" backward-kill-line # C-Home
      bindkey "^[[7^" backward-kill-line # C-Home
      bindkey "^[[1;5F" kill-line # C-End
      bindkey "^[[8^" kill-line # C-End
      bindkey '^[[1;5C' forward-word # C-Right
      bindkey '^[0c' forward-word # C-Right
      bindkey '^[[5C' forward-word # C-Right
      bindkey '^[[1;5D' backward-word # C-Left
      bindkey '^[0d' backward-word # C-Left
      bindkey '^[[5D' backward-word # C-Left
    '';
  };
}

10
m/common/ssf.nix Normal file
View File

@ -0,0 +1,10 @@
{
  # Provides the base system for a xeon node in the SSF rack.
  imports = [
    ./xeon.nix
    ./ssf/fs.nix           # NFS home mount
    ./ssf/hosts.nix        # rack-local host names
    ./ssf/hosts-remote.nix # hosts reachable outside the rack
    ./ssf/net.nix          # gateway, DNS, slurm firewall rules
  ];
}

8
m/common/ssf/fs.nix Normal file
View File

@ -0,0 +1,8 @@
{
  # Mount the home via NFS (exported by apex at 10.0.40.30).
  fileSystems."/home" = {
    device = "10.0.40.30:/home";
    fsType = "nfs";
    # NOTE(review): rsize/wsize of 1024 bytes is unusually small and will
    # limit throughput — confirm it is intentional. "nofail" keeps the node
    # booting even when the server is down.
    options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
  };
}

View File

@ -0,0 +1,9 @@
# Removed the unused `pkgs` module argument.
{ ... }:
{
  networking.hosts = {
    # Remote hosts visible from compute nodes
    "10.106.0.236" = [ "raccoon" ];
    "10.0.44.4" = [ "tent" ];
  };
}

23
m/common/ssf/hosts.nix Normal file
View File

@ -0,0 +1,23 @@
# Static name table for the SSF rack. One row per machine: Ethernet,
# InfiniBand (-ib) and BMC (-ipmi) addresses.
# Removed the unused `pkgs` module argument.
{ ... }:
{
  networking.hosts = {
    # Login
    "10.0.40.30" = [ "apex" ];
    # Storage
    "10.0.40.40" = [ "bay" ];    "10.0.42.40" = [ "bay-ib" ];    "10.0.40.141" = [ "bay-ipmi" ];
    "10.0.40.41" = [ "oss01" ];  "10.0.42.41" = [ "oss01-ib0" ]; "10.0.40.142" = [ "oss01-ipmi" ];
    "10.0.40.42" = [ "lake2" ];  "10.0.42.42" = [ "lake2-ib" ];  "10.0.40.143" = [ "lake2-ipmi" ];
    # Xeon compute
    "10.0.40.1" = [ "owl1" ];    "10.0.42.1" = [ "owl1-ib" ];    "10.0.40.101" = [ "owl1-ipmi" ];
    "10.0.40.2" = [ "owl2" ];    "10.0.42.2" = [ "owl2-ib" ];    "10.0.40.102" = [ "owl2-ipmi" ];
    "10.0.40.3" = [ "xeon03" ];  "10.0.42.3" = [ "xeon03-ib" ];  "10.0.40.103" = [ "xeon03-ipmi" ];
    #"10.0.40.4" = [ "tent" ];   "10.0.42.4" = [ "tent-ib" ];    "10.0.40.104" = [ "tent-ipmi" ];
    "10.0.40.5" = [ "koro" ];    "10.0.42.5" = [ "koro-ib" ];    "10.0.40.105" = [ "koro-ipmi" ];
    "10.0.40.6" = [ "weasel" ];  "10.0.42.6" = [ "weasel-ib" ];  "10.0.40.106" = [ "weasel-ipmi" ];
    "10.0.40.7" = [ "hut" ];     "10.0.42.7" = [ "hut-ib" ];     "10.0.40.107" = [ "hut-ipmi" ];
    "10.0.40.8" = [ "eudy" ];    "10.0.42.8" = [ "eudy-ib" ];    "10.0.40.108" = [ "eudy-ipmi" ];
  };
}

23
m/common/ssf/net.nix Normal file
View File

@ -0,0 +1,23 @@
{ pkgs, ... }:
{
  # Infiniband (IPoIB)
  environment.systemPackages = [ pkgs.rdma-core ];
  boot.kernelModules = [ "ib_umad" "ib_ipoib" ];
  networking = {
    # Route through the login node.
    defaultGateway = "10.0.40.30";
    nameservers = ["8.8.8.8"];
    firewall = {
      extraCommands = ''
        # Prevent ssfhead from contacting our slurmd daemon
        iptables -A nixos-fw -p tcp -s ssfhead --dport 6817:6819 -j nixos-fw-refuse
        # But accept traffic to slurm ports from any other node in the subnet
        iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817:6819 -j nixos-fw-accept
        # We also need to open the srun port range
        iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
      '';
    };
  };
}

7
m/common/xeon.nix Normal file
View File

@ -0,0 +1,7 @@
{
  # Provides the base system for a xeon node, not necessarily in the SSF rack.
  imports = [
    ./base.nix
    ./xeon/console.nix # serial console over IPMI
  ];
}

14
m/common/xeon/console.nix Normal file
View File

@ -0,0 +1,14 @@
{
  # Keep a login prompt on the first serial port and restart it if it dies,
  # so the IPMI serial-over-LAN console always works.
  systemd.services."serial-getty@ttyS0" = {
    enable = true;
    wantedBy = [ "getty.target" ];
    serviceConfig.Restart = "always";
  };

  # Send the kernel console to both the first VT and the serial port.
  boot.kernelParams = [ "console=tty1" "console=ttyS0,115200" ];
}

38
m/eudy/configuration.nix Normal file
View File

@ -0,0 +1,38 @@
{ config, pkgs, lib, modulesPath, ... }:
{
  # Configuration for "eudy", a kernel-development node running a custom
  # FCS kernel with LTTng/perf tracing support.
  imports = [
    ../common/ssf.nix
    #(modulesPath + "/installer/netboot/netboot-minimal.nix")
    ./kernel/kernel.nix
    ./cpufreq.nix
    ./fs.nix
    ./users.nix
    ../module/hut-substituter.nix
    ../module/debuginfod.nix
  ];

  # Select this using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53564b";

  # disable automatic garbage collector
  nix.gc.automatic = lib.mkForce false;

  # members of the tracing group can use the lttng-provided kernel events
  # without root permissions
  users.groups.tracing.members = [ "arocanon" ];

  # set up both ethernet and infiniband ips
  networking = {
    hostName = "eudy";
    interfaces.eno1.ipv4.addresses = [ {
      address = "10.0.40.8";
      prefixLength = 24;
    } ];
    interfaces.ibp5s0.ipv4.addresses = [ {
      address = "10.0.42.8";
      prefixLength = 24;
    } ];
  };
}

40
m/eudy/cpufreq.nix Normal file
View File

@ -0,0 +1,40 @@
{ lib, ... }:
{
  # Disable frequency boost by default. Use the intel_pstate driver instead of
  # acpi_cpufreq driver because the acpi_cpufreq driver does not read the
  # complete range of P-States [1]. Use the intel_pstate passive mode [2] to
  # disable HWP, which allows a core to "select P-states by itself". Also, this
  # disables intel governors, which confusingly, have the same names as the
  # generic ones but behave differently [3].
  # Essentially, we use the generic governors, but use the intel driver to read
  # the P-state list.
  # [1] - https://www.kernel.org/doc/html/latest/admin-guide/pm/intel_pstate.html#intel-pstate-vs-acpi-cpufreq
  # [2] - https://www.kernel.org/doc/html/latest/admin-guide/pm/intel_pstate.html#passive-mode
  # [3] - https://www.kernel.org/doc/html/latest/admin-guide/pm/intel_pstate.html#active-mode
  # https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html

  # set intel_pstate to passive mode
  boot.kernelParams = [
    "intel_pstate=passive"
  ];

  # Disable frequency boost (runs at every activation/boot).
  system.activationScripts = {
    disableFrequencyBoost.text = ''
      echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
    '';
  };

  # Kept for reference: alternative approach disabling intel_pstate entirely.
  ## disable intel_pstate
  #boot.kernelParams = [
  #  "intel_pstate=disable"
  #];
  ## Disable frequency boost
  #system.activationScripts = {
  #  disableFrequencyBoost.text = ''
  #    echo 0 > /sys/devices/system/cpu/cpufreq/boost
  #  '';
  #};
}

13
m/eudy/fs.nix Normal file
View File

@ -0,0 +1,13 @@
{ ... }:
{
  fileSystems = {
    # Nix store on the Optane drive; required early in boot.
    "/nix" = {
      device = "/dev/disk/by-label/optane";
      fsType = "ext4";
      neededForBoot = true;
    };
    # Extra data disk (holds kernel sources, see kernel/kernel.nix).
    "/mnt/data" = {
      device = "/dev/disk/by-label/data";
      fsType = "ext4";
    };
  };
}

70
m/eudy/kernel/kernel.nix Normal file
View File

@ -0,0 +1,70 @@
{ pkgs, lib, ... }:
let
  # Kept for reference: older ways of building the FCS kernel from a local
  # tree or specific commits, with and without lockdep.
  #fcs-devel = pkgs.linuxPackages_custom {
  #  version = "6.2.8";
  #  src = /mnt/data/kernel/fcs/kernel/src;
  #  configfile = /mnt/data/kernel/fcs/kernel/configs/defconfig;
  #};
  #fcsv1 = fcs-kernel "bc11660676d3d68ce2459b9fb5d5e654e3f413be" false;
  #fcsv2 = fcs-kernel "db0f2eca0cd57a58bf456d7d2c7d5d8fdb25dfb1" false;
  #fcsv1-lockdep = fcs-kernel "bc11660676d3d68ce2459b9fb5d5e654e3f413be" true;
  #fcsv2-lockdep = fcs-kernel "db0f2eca0cd57a58bf456d7d2c7d5d8fdb25dfb1" true;
  #fcs-kernel = gitCommit: lockdep: pkgs.linuxPackages_custom {
  #  version = "6.2.8";
  #  src = builtins.fetchGit {
  #    url = "git@bscpm03.bsc.es:ompss-kernel/linux.git";
  #    rev = gitCommit;
  #    ref = "fcs";
  #  };
  #  configfile = if lockdep then ./configs/lockdep else ./configs/defconfig;
  #};

  # The kernel actually booted; switch this binding to change kernels.
  kernel = nixos-fcs;

  # Overridable builder for the FCS kernel: pin a commit/branch and
  # optionally enable lock statistics or full preemption.
  nixos-fcs-kernel = lib.makeOverridable ({gitCommit, lockStat ? false, preempt ? false, branch ? "fcs"}: pkgs.linuxPackagesFor (pkgs.buildLinux rec {
    version = "6.2.8";
    src = builtins.fetchGit {
      url = "git@bscpm03.bsc.es:ompss-kernel/linux.git";
      rev = gitCommit;
      ref = branch;
    };
    structuredExtraConfig = with lib.kernel; {
      # add general custom kernel options here
    } // lib.optionalAttrs lockStat {
      LOCK_STAT = yes;
    } // lib.optionalAttrs preempt {
      PREEMPT = lib.mkForce yes;
      PREEMPT_VOLUNTARY = lib.mkForce no;
    };
    kernelPatches = [];
    extraMeta.branch = lib.versions.majorMinor version;
  }));

  nixos-fcs = nixos-fcs-kernel {gitCommit = "8a09822dfcc8f0626b209d6d2aec8b5da459dfee";};
  # Variants kept available for manual selection via `kernel` above.
  nixos-fcs-lockstat = nixos-fcs.override {
    lockStat = true;
  };
  nixos-fcs-lockstat-preempt = nixos-fcs.override {
    lockStat = true;
    preempt = true;
  };
  latest = pkgs.linuxPackages_latest;
in {
  imports = [
    ./lttng.nix
    ./perf.nix
  ];

  boot.kernelPackages = lib.mkForce kernel;

  # disable all cpu mitigations
  boot.kernelParams = [
    "mitigations=off"
  ];

  # enable memory overcommit, needed to build a taglibc system using nix after
  # increasing the openblas memory footprint
  boot.kernel.sysctl."vm.overcommit_memory" = 1;
}

43
m/eudy/kernel/lttng.nix Normal file
View File

@ -0,0 +1,43 @@
{ config, pkgs, lib, ... }:
let
  # The lttng btrfs probe crashes at compile time because of an undefined
  # function. This disables the btrfs tracepoints to avoid the issue.
  # Also enable lockdep tracepoints, this is disabled by default because it
  # does not work well on architectures other than x86_64 (i think that arm) as
  # I was told on the mailing list.
  lttng-modules-fixed = config.boot.kernelPackages.lttng-modules.overrideAttrs (finalAttrs: previousAttrs: {
    patchPhase = (lib.optionalString (previousAttrs ? patchPhase) previousAttrs.patchPhase) + ''
      # disable btrfs
      substituteInPlace src/probes/Kbuild \
        --replace " obj-\$(CONFIG_LTTNG) += lttng-probe-btrfs.o" " #obj-\$(CONFIG_LTTNG) += lttng-probe-btrfs.o"
      # enable lockdep tracepoints
      substituteInPlace src/probes/Kbuild \
        --replace "#ifneq (\$(CONFIG_LOCKDEP),)" "ifneq (\$(CONFIG_LOCKDEP),)" \
        --replace "# obj-\$(CONFIG_LTTNG) += lttng-probe-lock.o" " obj-\$(CONFIG_LTTNG) += lttng-probe-lock.o" \
        --replace "#endif # CONFIG_LOCKDEP" "endif # CONFIG_LOCKDEP"
    '';
  });
in {
  # add the lttng tools and modules to the system environment
  boot.extraModulePackages = [ lttng-modules-fixed ];
  environment.systemPackages = with pkgs; [
    lttng-tools lttng-ust babeltrace
  ];

  # start the lttng root daemon to manage kernel events
  systemd.services.lttng-sessiond = {
    wantedBy = [ "multi-user.target" ];
    description = "LTTng session daemon for the root user";
    serviceConfig = {
      User = "root";
      ExecStart = ''
        ${pkgs.lttng-tools}/bin/lttng-sessiond
      '';
    };
  };
}

22
m/eudy/kernel/perf.nix Normal file
View File

@ -0,0 +1,22 @@
{ config, pkgs, lib, ... }:
{
  # add the perf tool built for the configured kernel
  environment.systemPackages = with pkgs; [
    config.boot.kernelPackages.perf
  ];
  # allow non-root users to read tracing data from the kernel
  boot.kernel.sysctl."kernel.perf_event_paranoid" = -2;
  # expose kernel pointers (e.g. in kallsyms) so perf can resolve symbols
  boot.kernel.sysctl."kernel.kptr_restrict" = 0;
  # specify additional options to the tracefs directory to allow members of the
  # tracing group to access tracefs.
  fileSystems."/sys/kernel/tracing" = {
    options = [
      "mode=755"
      "gid=tracing"
    ];
  };
}

11
m/eudy/users.nix Normal file
View File

@ -0,0 +1,11 @@
{ ... }:
{
  # Grant user arocanon passwordless sudo for every command.
  security.sudo.extraRules = [
    {
      users = [ "arocanon" ];
      commands = [
        {
          command = "ALL";
          # "SETENV" # Adding the following could be a good idea
          options = [ "NOPASSWD" ];
        }
      ];
    }
  ];
}

112
m/fox/configuration.nix Normal file
View File

@ -0,0 +1,112 @@
{ lib, config, pkgs, ... }:
{
  imports = [
    ../common/base.nix
    ../common/xeon/console.nix
    ../module/amd-uprof.nix
    ../module/emulation.nix
    ../module/nvidia.nix
    ../module/slurm-client.nix
    ../module/hut-substituter.nix
    ./wireguard.nix
  ];
  # Don't turn off on August as UPC has different dates.
  # Fox works fine on power cuts.
  systemd.timers.august-shutdown.enable = false;
  # Select the disk using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";
  # No swap, there is plenty of RAM
  swapDevices = lib.mkForce [];
  boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
  boot.kernelModules = [ "kvm-amd" "amd_uncore" "amd_hsmp" ];
  hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
  # AMD machine: never install the Intel microcode updates here
  hardware.cpu.intel.updateMicrocode = lib.mkForce false;
  # Use performance for benchmarks
  powerManagement.cpuFreqGovernor = "performance";
  services.amd-uprof.enable = true;
  # Disable NUMA balancing
  boot.kernel.sysctl."kernel.numa_balancing" = 0;
  # Expose kernel addresses
  boot.kernel.sysctl."kernel.kptr_restrict" = 0;
  # Disable NMI watchdog to save one hw counter (for AMD uProf)
  boot.kernel.sysctl."kernel.nmi_watchdog" = 0;
  services.openssh.settings.X11Forwarding = true;
  services.fail2ban.enable = true;
  networking = {
    timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
    hostName = "fox";
    # UPC network (may change over time, use DHCP)
    # Public IP configuration:
    # - Hostname: fox.ac.upc.edu
    # - IP: 147.83.30.141
    # - Gateway: 147.83.30.130
    # - NetMask: 255.255.255.192
    # Private IP configuration for BMC:
    # - Hostname: fox-ipmi.ac.upc.edu
    # - IP: 147.83.35.27
    # - Gateway: 147.83.35.2
    # - NetMask: 255.255.255.0
    interfaces.enp1s0f0np0.useDHCP = true;
  };
  # Recommended for new graphics cards
  hardware.nvidia.open = true;
  # Mount NVME disks
  fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
  fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
  # Mount the NFS home
  fileSystems."/nfs/home" = {
    device = "10.106.0.30:/home";
    fsType = "nfs";
    options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
  };
  # Make a /nvme{0,1}/$USER directory for each user.
  systemd.services.create-nvme-dirs = let
    # Take only normal users in fox
    users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
    commands = lib.concatLists (lib.mapAttrsToList
      (_: user: [
        "install -d -o ${user.name} -g ${user.group} -m 0755 /nvme{0,1}/${user.name}"
      ]) users);
    script = pkgs.writeShellScript "create-nvme-dirs.sh" (lib.concatLines commands);
  in {
    enable = true;
    wants = [ "local-fs.target" ];
    after = [ "local-fs.target" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig.ExecStart = script;
  };
  # Only allow SSH connections from users who have a SLURM allocation
  # See: https://slurm.schedmd.com/pam_slurm_adopt.html
  security.pam.services.sshd.rules.account.slurm = {
    control = "required";
    enable = true;
    modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
    args = [ "log_level=debug5" ];
    order = 999999; # Make it last one
  };
  # Disable systemd session (pam_systemd.so) as it will conflict with the
  # pam_slurm_adopt.so module. What happens is that the shell is first adopted
  # into the slurmstepd task and then into the systemd session, which is not
  # what we want, otherwise it will linger even if all jobs are gone.
  security.pam.services.sshd.startSession = lib.mkForce false;
}

54
m/fox/wireguard.nix Normal file
View File

@ -0,0 +1,54 @@
{ config, ... }:
{
  networking.firewall = {
    # WireGuard listen port (see listenPort below).
    allowedUDPPorts = [ 666 ];
  };
  # WireGuard private key for fox (agenix secret).
  age.secrets.wgFox.file = ../../secrets/wg-fox.age;
  networking.wireguard.enable = true;
  networking.wireguard.interfaces = {
    # "wg0" is the network interface name. You can name the interface arbitrarily.
    wg0 = {
      # Determines the IP address and subnet of the server's end of the tunnel interface.
      ips = [ "10.106.0.1/24" ];
      # The port that WireGuard listens to. Must be accessible by the client.
      listenPort = 666;
      # Path to the private key file.
      privateKeyFile = config.age.secrets.wgFox.path;
      # Public key: VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=
      peers = [
        # List of allowed peers.
        {
          name = "apex";
          publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
          # List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
          allowedIPs = [ "10.106.0.30/32" "10.0.40.7/32" ];
        }
        {
          name = "raccoon";
          publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
          allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
        }
      ];
    };
  };
  # Static names for hosts reached through the tunnel.
  networking.hosts = {
    "10.106.0.30" = [ "apex" ];
    "10.0.40.7" = [ "hut" ];
    "10.106.0.236" = [ "raccoon" ];
    "10.0.44.4" = [ "tent" ];
  };
  # NOTE: this merges with the networking.firewall definition above (Nix merges
  # repeated attribute-set definitions).
  networking.firewall = {
    extraCommands = ''
      # Accept slurm connections to slurmd from apex (via wireguard)
      iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.30/32 -d 10.106.0.1/32 --dport 6818 -j nixos-fw-accept
    '';
  };
}

14
m/hut/blackbox.yml Normal file
View File

@ -0,0 +1,14 @@
# Probe modules for the Prometheus blackbox exporter.
# Referenced from the "blackbox-http" and "blackbox-icmp" scrape jobs.
modules:
  # HTTP GET probe; succeeds on any 2xx response.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      follow_redirects: true
      preferred_ip_protocol: "ip4"
      valid_status_codes: [] # Defaults to 2xx
      method: GET
  # ICMP echo (ping) probe over IPv4.
  icmp:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"

67
m/hut/configuration.nix Normal file
View File

@ -0,0 +1,67 @@
{ config, pkgs, lib, ... }:
{
  imports = [
    ../common/ssf.nix
    ../module/ceph.nix
    ../module/debuginfod.nix
    ../module/emulation.nix
    ./gitlab-runner.nix
    ./monitoring.nix
    ./nfs.nix
    ./nix-serve.nix
    ./public-inbox.nix
    ./gitea.nix
    ./msmtp.nix
    ./postgresql.nix
    ./nginx.nix
    ./p.nix
    #./pxe.nix
  ];
  # Select the disk using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53567f";
  fileSystems = {
    "/" = lib.mkForce {
      device = "/dev/disk/by-label/nvme";
      fsType = "ext4";
      neededForBoot = true;
      options = [ "noatime" ];
    };
    "/boot" = lib.mkForce {
      device = "/dev/disk/by-label/nixos-boot";
      fsType = "ext4";
      neededForBoot = true;
    };
  };
  networking = {
    hostName = "hut";
    # Ethernet interface
    interfaces.eno1.ipv4.addresses = [ {
      address = "10.0.40.7";
      prefixLength = 24;
    } ];
    # InfiniBand interface (ibp5s0)
    interfaces.ibp5s0.ipv4.addresses = [ {
      address = "10.0.42.7";
      prefixLength = 24;
    } ];
    firewall = {
      extraCommands = ''
        # Accept all proxy traffic from compute nodes but not the login
        iptables -A nixos-fw -p tcp -s 10.0.40.30 --dport 23080 -j nixos-fw-log-refuse
        iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 23080 -j nixos-fw-accept
      '';
      # Flush all rules and chains on stop so it won't break on start
      extraStopCommands = ''
        iptables -F
        iptables -X
      '';
    };
  };
  # Allow proxy to bind to the ethernet interface
  services.openssh.settings.GatewayPorts = "clientspecified";
}

63
m/hut/gitea.nix Normal file
View File

@ -0,0 +1,63 @@
{ config, lib, ... }:
{
  # Token used by the actions runner to register against this Gitea.
  age.secrets.giteaRunnerToken.file = ../../secrets/gitea-runner-token.age;
  services.gitea = {
    enable = true;
    appName = "Gitea in the jungle";
    settings = {
      server = {
        # Served behind nginx under /git.
        ROOT_URL = "https://jungle.bsc.es/git/";
        LOCAL_ROOT_URL = "https://jungle.bsc.es/git/";
        LANDING_PAGE = "explore";
      };
      metrics.ENABLED = true;
      service = {
        # New registrations require manual approval.
        REGISTER_MANUAL_CONFIRM = true;
        ENABLE_NOTIFY_MAIL = true;
      };
      log.LEVEL = "Warn";
      # Deliver mail through the local sendmail wrapper (msmtp).
      mailer = {
        ENABLED = true;
        FROM = "jungle-robot@bsc.es";
        PROTOCOL = "sendmail";
        SENDMAIL_PATH = "/run/wrappers/bin/sendmail";
        SENDMAIL_ARGS = "--";
      };
    };
  };
  services.gitea-actions-runner.instances = {
    runrun = {
      enable = true;
      name = "runrun";
      url = "https://jungle.bsc.es/git/";
      tokenFile = config.age.secrets.giteaRunnerToken.path;
      # Jobs run directly on the host.
      labels = [ "native:host" ];
      settings.runner.capacity = 8;
    };
  };
  systemd.services.gitea-runner-runrun = {
    path = [ "/run/current-system/sw" ];
    serviceConfig = {
      # DynamicUser doesn't work well with SSH
      DynamicUser = lib.mkForce false;
      User = "gitea-runner";
      Group = "gitea-runner";
    };
  };
  # Static account for the runner, with docker access.
  users.users.gitea-runner = {
    isSystemUser = true;
    home = "/var/lib/gitea-runner";
    description = "Gitea Runner";
    group = "gitea-runner";
    extraGroups = [ "docker" ];
    createHome = true;
  };
  users.groups.gitea-runner = {};
}

126
m/hut/gitlab-runner.nix Normal file
View File

@ -0,0 +1,126 @@
{ pkgs, lib, config, ... }:
{
  # Runner registration/authentication tokens (agenix secrets).
  age.secrets.gitlab-pm-shell.file = ../../secrets/gitlab-runner-shell-token.age;
  age.secrets.gitlab-pm-docker.file = ../../secrets/gitlab-runner-docker-token.age;
  age.secrets.gitlab-bsc-docker.file = ../../secrets/gitlab-bsc-docker-token.age;
  services.gitlab-runner = {
    enable = true;
    settings.concurrent = 5;
    services = let
      # Shared settings for shell-executor runners.
      common-shell = {
        executor = "shell";
        environmentVariables = {
          SHELL = "${pkgs.bash}/bin/bash";
        };
      };
      # Shared settings for docker-executor runners; jobs reach the outside
      # world through the HTTP proxy on hut.
      common-docker = {
        executor = "docker";
        dockerImage = "debian:stable";
        registrationFlags = [
          "--docker-network-mode host"
        ];
        environmentVariables = {
          https_proxy = "http://hut:23080";
          http_proxy = "http://hut:23080";
        };
      };
    in {
      # For pm.bsc.es/gitlab
      gitlab-pm-shell = common-shell // {
        authenticationTokenConfigFile = config.age.secrets.gitlab-pm-shell.path;
      };
      gitlab-pm-docker = common-docker // {
        authenticationTokenConfigFile = config.age.secrets.gitlab-pm-docker.path;
      };
      gitlab-bsc-docker = {
        # gitlab.bsc.es still uses the old token mechanism
        registrationConfigFile = config.age.secrets.gitlab-bsc-docker.path;
        tagList = [ "docker" "hut" ];
        environmentVariables = {
          # We cannot access the hut local interface from docker, so we connect
          # to hut directly via the ethernet one.
          https_proxy = "http://hut:23080";
          http_proxy = "http://hut:23080";
        };
        executor = "docker";
        dockerImage = "alpine";
        # Expose the host nix store (read-only) and the daemon socket so jobs
        # can build through the host nix-daemon.
        dockerVolumes = [
          "/nix/store:/nix/store:ro"
          "/nix/var/nix/db:/nix/var/nix/db:ro"
          "/nix/var/nix/daemon-socket:/nix/var/nix/daemon-socket:ro"
        ];
        dockerExtraHosts = [
          # Required to pass the proxy via hut
          "hut:10.0.40.7"
        ];
        dockerDisableCache = true;
        registrationFlags = [
          # Increase build log length to 64 MiB
          "--output-limit 65536"
        ];
        # Prepare each container: create the nix store support directories,
        # write the SSH proxy configuration and load the nix daemon profile.
        preBuildScript = pkgs.writeScript "setup-container" ''
          mkdir -p -m 0755 /nix/var/log/nix/drvs
          mkdir -p -m 0755 /nix/var/nix/gcroots
          mkdir -p -m 0755 /nix/var/nix/profiles
          mkdir -p -m 0755 /nix/var/nix/temproots
          mkdir -p -m 0755 /nix/var/nix/userpool
          mkdir -p -m 1777 /nix/var/nix/gcroots/per-user
          mkdir -p -m 1777 /nix/var/nix/profiles/per-user
          mkdir -p -m 0755 /nix/var/nix/profiles/per-user/root
          mkdir -p -m 0700 "$HOME/.nix-defexpr"
          mkdir -p -m 0700 "$HOME/.ssh"
          cat > "$HOME/.ssh/config" << EOF
          Host bscpm04.bsc.es gitlab-internal.bsc.es
          User git
          ProxyCommand nc -X connect -x hut:23080 %h %p
          Host amdlogin1.bsc.es armlogin1.bsc.es hualogin1.bsc.es glogin1.bsc.es glogin2.bsc.es fpgalogin1.bsc.es
          ProxyCommand nc -X connect -x hut:23080 %h %p
          EOF
          cat >> "$HOME/.ssh/known_hosts" << EOF
          bscpm04.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPx4mC0etyyjYUT2Ztc/bs4ZXSbVMrogs1ZTP924PDgT
          gitlab-internal.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF9arsAOSRB06hdy71oTvJHG2Mg8zfebADxpvc37lZo3
          EOF
          . ${pkgs.nix}/etc/profile.d/nix-daemon.sh
          # Required to load SSL certificate paths
          . ${pkgs.cacert}/nix-support/setup-hook
        '';
        # NOTE: merges with the proxy environmentVariables above (Nix merges
        # repeated attribute-set definitions within one set).
        environmentVariables = {
          ENV = "/etc/profile";
          USER = "root";
          NIX_REMOTE = "daemon";
          PATH = "${config.system.path}/bin:/bin:/sbin:/usr/bin:/usr/sbin";
        };
      };
    };
  };
  # DOCKER* chains are useless, override at FORWARD and nixos-fw
  networking.firewall.extraCommands = ''
    # Don't forward any traffic from docker
    iptables -I FORWARD 1 -p all -i docker0 -j nixos-fw-log-refuse
    # Allow incoming traffic from docker to 23080
    iptables -A nixos-fw -p tcp -i docker0 -d hut --dport 23080 -j ACCEPT
  '';
  #systemd.services.gitlab-runner.serviceConfig.Shell = "${pkgs.bash}/bin/bash";
  # Run the runner under a static user; DynamicUser is disabled for it.
  systemd.services.gitlab-runner.serviceConfig.DynamicUser = lib.mkForce false;
  systemd.services.gitlab-runner.serviceConfig.User = "gitlab-runner";
  systemd.services.gitlab-runner.serviceConfig.Group = "gitlab-runner";
  systemd.services.gitlab-runner.serviceConfig.ExecStart = lib.mkForce
    ''${pkgs.gitlab-runner}/bin/gitlab-runner run --config ''${HOME}/.gitlab-runner/config.toml --listen-address "127.0.0.1:9252" --working-directory ''${HOME}'';
  users.users.gitlab-runner = {
    uid = config.ids.uids.gitlab-runner;
    #isNormalUser = true;
    home = "/var/lib/gitlab-runner";
    description = "Gitlab Runner";
    group = "gitlab-runner";
    extraGroups = [ "docker" ];
    createHome = true;
  };
  users.groups.gitlab-runner.gid = config.ids.gids.gitlab-runner;
}

31
m/hut/gpfs-probe.nix Normal file
View File

@ -0,0 +1,31 @@
{ pkgs, config, lib, ... }:
let
  # Copy of the probe script placed in the nix store and made executable.
  gpfs-probe-script = pkgs.runCommand "gpfs-probe.sh" { }
  ''
    cp ${./gpfs-probe.sh} $out;
    chmod +x $out
  ''
  ;
in
{
  # Use a new user to handle the SSH keys
  users.groups.ssh-robot = { };
  users.users.ssh-robot = {
    description = "SSH Robot";
    isNormalUser = true;
    home = "/var/lib/ssh-robot";
  };
  systemd.services.gpfs-probe = {
    description = "Daemon to report GPFS latency via SSH";
    path = [ pkgs.openssh pkgs.netcat ];
    after = [ "network.target" ];
    wantedBy = [ "default.target" ];
    serviceConfig = {
      Type = "simple";
      # socat forks one execution of the probe script per TCP connection on
      # port 9966; the script replies with a raw HTTP response (Prometheus
      # text exposition).
      ExecStart = "${pkgs.socat}/bin/socat TCP4-LISTEN:9966,fork EXEC:${gpfs-probe-script}";
      User = "ssh-robot";
      Group = "ssh-robot";
    };
  };
}

18
m/hut/gpfs-probe.sh Executable file
View File

@ -0,0 +1,18 @@
#!/bin/sh
# Probe GPFS latency: create (and remove) N files on GPFS over SSH and report
# the elapsed seconds as a Prometheus metric in a raw HTTP response. One
# instance runs per connection, spawned by socat (see gpfs-probe.nix).
N=500
# "command time -f %e" prints the elapsed seconds on stderr, captured through
# the 2>&1 below. Both ssh and the remote command are bounded by timeouts so a
# hung GPFS cannot block the probe.
t=$(timeout 5 ssh bsc015557@glogin2.bsc.es "timeout 3 command time -f %e touch /gpfs/projects/bsc15/bsc015557/gpfs.{1..$N} 2>&1; rm -f /gpfs/projects/bsc15/bsc015557/gpfs.{1..$N}")
# Report the maximum (5 s) when the probe failed or printed anything that is
# not a plain number (the 2>&1 above would also capture SSH warnings), so the
# output is always a valid Prometheus exposition.
case $t in
  ''|*[!0-9.]*) t="5.00" ;;
esac
cat <<EOF
HTTP/1.1 200 OK
Content-Type: text/plain; version=0.0.4; charset=utf-8; escaping=values

# HELP gpfs_touch_latency Time to create $N files.
# TYPE gpfs_touch_latency gauge
gpfs_touch_latency $t
EOF

272
m/hut/monitoring.nix Normal file
View File

@ -0,0 +1,272 @@
{ config, lib, ... }:
{
  imports = [
    ../module/slurm-exporter.nix
    ../module/meteocat-exporter.nix
    ../module/upc-qaire-exporter.nix
    ./gpfs-probe.nix
    ../module/nix-daemon-exporter.nix
  ];
  # SMTP password for the jungle-robot account, readable only by grafana.
  age.secrets.grafanaJungleRobotPassword = {
    file = ../../secrets/jungle-robot-password.age;
    owner = "grafana";
    mode = "400";
  };
  # Credentials file for the IPMI exporter.
  age.secrets.ipmiYml.file = ../../secrets/ipmi.yml.age;
  services.grafana = {
    enable = true;
    settings = {
      server = {
        # Served behind nginx at https://jungle.bsc.es/grafana
        domain = "jungle.bsc.es";
        root_url = "%(protocol)s://%(domain)s/grafana";
        serve_from_sub_path = true;
        http_port = 2342;
        http_addr = "127.0.0.1";
      };
      smtp = {
        enabled = true;
        from_address = "jungle-robot@bsc.es";
        user = "jungle-robot";
        # Read the password from a file, which is only readable by grafana user
        # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
        password = "$__file{${config.age.secrets.grafanaJungleRobotPassword.path}}";
        host = "mail.bsc.es:465";
        startTLS_policy = "NoStartTLS";
      };
      feature_toggles.publicDashboards = true;
      "auth.anonymous".enabled = true;
      log.level = "warn";
    };
  };
  # Make grafana alerts also use the proxy
  systemd.services.grafana.environment = config.networking.proxy.envVars;
  services.prometheus = {
    enable = true;
    port = 9001;
    retentionTime = "5y";
    listenAddress = "127.0.0.1";
  };
  # The IPMI exporter needs direct access to the local IPMI device.
  systemd.services.prometheus-ipmi-exporter.serviceConfig.DynamicUser = lib.mkForce false;
  systemd.services.prometheus-ipmi-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
  # We need access to the devices to monitor the disk space
  systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
  systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
  # Expose docker daemon metrics (scraped below as 127.0.0.1:9323).
  virtualisation.docker.daemon.settings = {
    metrics-addr = "127.0.0.1:9323";
  };
  # Required to allow the smartctl exporter to read the nvme0 character device,
  # see the commit message on:
  # https://github.com/NixOS/nixpkgs/commit/12c26aca1fd55ab99f831bedc865a626eee39f80
  services.udev.extraRules = ''
    SUBSYSTEM=="nvme", KERNEL=="nvme[0-9]*", GROUP="disk"
  '';
  # NOTE: merges with the services.prometheus definition above (Nix merges
  # repeated attribute-set definitions).
  services.prometheus = {
    exporters = {
      ipmi = {
        enable = true;
        group = "root";
        user = "root";
        configFile = config.age.secrets.ipmiYml.path;
        # extraFlags = [ "--log.level=debug" ];
        listenAddress = "127.0.0.1";
      };
      node = {
        enable = true;
        enabledCollectors = [ "systemd" "logind" ];
        port = 9002;
        listenAddress = "127.0.0.1";
      };
      smartctl = {
        enable = true;
        listenAddress = "127.0.0.1";
      };
      blackbox = {
        enable = true;
        listenAddress = "127.0.0.1";
        configFile = ./blackbox.yml;
      };
    };
    scrapeConfigs = [
      # All local exporters running on hut (xeon07).
      {
        job_name = "xeon07";
        static_configs = [{
          targets = [
            "127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
            "127.0.0.1:${toString config.services.prometheus.exporters.ipmi.port}"
            "127.0.0.1:9323"
            "127.0.0.1:9252"
            "127.0.0.1:${toString config.services.prometheus.exporters.smartctl.port}"
            "127.0.0.1:9341" # Slurm exporter
            "127.0.0.1:9966" # GPFS custom exporter
            "127.0.0.1:9999" # Nix-daemon custom exporter
            "127.0.0.1:9929" # Meteocat custom exporter
            "127.0.0.1:9928" # UPC Qaire custom exporter
            "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}"
          ];
        }];
      }
      {
        job_name = "ceph";
        static_configs = [{
          targets = [
            "10.0.40.40:9283" # Ceph statistics
            "10.0.40.40:9002" # Node exporter
            "10.0.40.42:9002" # Node exporter
          ];
        }];
      }
      {
        job_name = "blackbox-http";
        metrics_path = "/probe";
        params = { module = [ "http_2xx" ]; };
        static_configs = [{
          targets = [
            "https://www.google.com/robots.txt"
            "https://pm.bsc.es/"
            "https://pm.bsc.es/gitlab/"
            "https://jungle.bsc.es/"
            "https://gitlab.bsc.es/"
          ];
        }];
        relabel_configs = [
          {
            # Takes the address and sets it in the "target=<xyz>" URL parameter
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            # Sets the "instance" label with the remote host we are querying
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            # Shows the host target address instead of the blackbox address
            target_label = "__address__";
            replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
          }
        ];
      }
      {
        job_name = "blackbox-icmp";
        metrics_path = "/probe";
        params = { module = [ "icmp" ]; };
        static_configs = [{
          targets = [
            "1.1.1.1"
            "8.8.8.8"
            "ssfhead"
            "anella-bsc.cesca.cat"
            "upc-anella.cesca.cat"
            "fox.ac.upc.edu"
            "arenys5.ac.upc.edu"
          ];
        }];
        relabel_configs = [
          {
            # Takes the address and sets it in the "target=<xyz>" URL parameter
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            # Sets the "instance" label with the remote host we are querying
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            # Shows the host target address instead of the blackbox address
            target_label = "__address__";
            replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
          }
        ];
      }
      {
        job_name = "gitea";
        static_configs = [{ targets = [ "127.0.0.1:3000" ]; }];
      }
      {
        # Scrape the IPMI info of the hosts remotely via LAN
        job_name = "ipmi-lan";
        scrape_interval = "1m";
        scrape_timeout = "30s";
        metrics_path = "/ipmi";
        scheme = "http";
        relabel_configs = [
          {
            # Takes the address and sets it in the "target=<xyz>" URL parameter
            source_labels = [ "__address__" ];
            separator = ";";
            regex = "(.*)(:80)?";
            target_label = "__param_target";
            replacement = "\${1}";
            action = "replace";
          }
          {
            # Sets the "instance" label with the remote host we are querying
            source_labels = [ "__param_target" ];
            separator = ";";
            regex = "(.*)-ipmi"; # Remove "-ipmi" at the end
            target_label = "instance";
            replacement = "\${1}";
            action = "replace";
          }
          {
            # Sets the fixed "module=lan" URL param
            separator = ";";
            regex = "(.*)";
            target_label = "__param_module";
            replacement = "lan";
            action = "replace";
          }
          {
            # Sets the target to query as the localhost IPMI exporter
            separator = ";";
            regex = ".*";
            target_label = "__address__";
            replacement = "127.0.0.1:9290";
            action = "replace";
          }
        ];
        # Load the list of targets from another file
        file_sd_configs = [
          {
            files = [ "${./targets.yml}" ];
            refresh_interval = "30s";
          }
        ];
      }
      {
        job_name = "ipmi-raccoon";
        metrics_path = "/ipmi";
        static_configs = [
          { targets = [ "127.0.0.1:9291" ]; }
        ];
        params = {
          target = [ "84.88.51.142" ];
          module = [ "raccoon" ];
        };
      }
      {
        job_name = "raccoon";
        static_configs = [
          {
            targets = [ "127.0.0.1:19002" ]; # Node exporter
          }
        ];
      }
    ];
  };
}

24
m/hut/msmtp.nix Normal file
View File

@ -0,0 +1,24 @@
{ config, lib, ... }:
{
  # SMTP password for the jungle-robot account; group-readable by gitea so its
  # sendmail invocations can authenticate too.
  age.secrets.jungleRobotPassword = {
    file = ../../secrets/jungle-robot-password.age;
    group = "gitea";
    mode = "440";
  };
  # System-wide sendmail via msmtp, relaying through the BSC mail server.
  programs.msmtp = {
    enable = true;
    accounts = {
      default = {
        auth = true;
        # Implicit TLS (no STARTTLS) on port 465.
        tls = true;
        tls_starttls = false;
        port = 465;
        host = "mail.bsc.es";
        user = "jungle-robot";
        passwordeval = "cat ${config.age.secrets.jungleRobotPassword.path}";
        from = "jungle-robot@bsc.es";
      };
    };
  };
}

9
m/hut/nfs.nix Normal file
View File

@ -0,0 +1,9 @@
{ ... }:
{
  # Export the nix store read-only over NFS to the local compute network.
  services.nfs.server = {
    enable = true;
    exports = ''
      /nix 10.0.40.0/24(ro,sync,no_subtree_check,root_squash)
    '';
  };
  # NFSv4 TCP port.
  networking.firewall.allowedTCPPorts = [ 2049 ];
}

76
m/hut/nginx.nix Normal file
View File

@ -0,0 +1,76 @@
{ theFlake, pkgs, ... }:
let
  # Static website built with hugo from its own git repository.
  website = pkgs.stdenv.mkDerivation {
    name = "jungle-web";
    src = pkgs.fetchgit {
      url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
      rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
      hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
    };
    buildInputs = [ pkgs.hugo ];
    buildPhase = ''
      rm -rf public/
      hugo
    '';
    installPhase = ''
      cp -r public $out
    '';
    # Don't mess doc/
    dontFixup = true;
  };
in
{
  networking.firewall.allowedTCPPorts = [ 80 ];
  # Front web server: serves the static site and proxies gitea (/git),
  # nix-serve (/cache), public-inbox (/lists) and grafana (/grafana), plus
  # per-user pages (/~user) and pastes (/p) from ceph.
  services.nginx = {
    enable = true;
    virtualHosts."jungle.bsc.es" = {
      root = "${website}";
      listen = [
        {
          addr = "0.0.0.0";
          port = 80;
        }
      ];
      extraConfig = ''
        set_real_ip_from 127.0.0.1;
        set_real_ip_from 84.88.52.107;
        real_ip_recursive on;
        real_ip_header X-Forwarded-For;
        location /git {
          rewrite ^/git$ / break;
          rewrite ^/git/(.*) /$1 break;
          proxy_pass http://127.0.0.1:3000;
          proxy_redirect http:// $scheme://;
        }
        location /cache {
          rewrite ^/cache/(.*) /$1 break;
          proxy_pass http://127.0.0.1:5000;
          proxy_redirect http:// $scheme://;
        }
        location /lists {
          proxy_pass http://127.0.0.1:8081;
          proxy_redirect http:// $scheme://;
        }
        location /grafana {
          proxy_pass http://127.0.0.1:2342;
          proxy_redirect http:// $scheme://;
          proxy_set_header Host $host;
          # Websockets
          proxy_http_version 1.1;
          proxy_set_header Upgrade $http_upgrade;
          proxy_set_header Connection "upgrade";
        }
        location ~ ^/~(.+?)(/.*)?$ {
          alias /ceph/home/$1/public_html$2;
          index index.html index.htm;
          autoindex on;
          absolute_redirect off;
        }
        location /p/ {
          alias /ceph/p/;
        }
      '';
    };
  };
}

16
m/hut/nix-serve.nix Normal file
View File

@ -0,0 +1,16 @@
{ config, ... }:
{
  # Binary cache signing key (agenix secret).
  age.secrets.nixServe.file = ../../secrets/nix-serve.age;
  services.nix-serve = {
    enable = true;
    # Only listen locally, as we serve it via ssh
    bindAddress = "127.0.0.1";
    port = 5000;
    secretKeyFile = config.age.secrets.nixServe.path;
    # Public key:
    # jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=
  };
}

43
m/hut/p.nix Normal file
View File

@ -0,0 +1,43 @@
{ pkgs, lib, config, ... }:
let
  # "p" paste helper: reads stdin, stores it world-readable under
  # /ceph/p/$USER with an optional extension argument, and prints the
  # resulting public URL.
  p = pkgs.writeShellScriptBin "p" ''
    set -e
    cd /ceph
    pastedir="p/$USER"
    mkdir -p "$pastedir"
    ext="txt"
    if [ -n "$1" ]; then
      ext="$1"
    fi
    out=$(mktemp "$pastedir/XXXXXXXX.$ext")
    cat > "$out"
    chmod go+r "$out"
    echo "https://jungle.bsc.es/$out"
  '';
in
{
  environment.systemPackages = with pkgs; [ p ];
  # Make sure we have a directory per user. We cannot use the nice
  # systemd-tmpfiles-setup.service service because this is a remote FS, and it
  # may not be mounted when it runs.
  systemd.services.create-paste-dirs = let
    # Take only normal users in hut
    users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
    commands = lib.concatLists (lib.mapAttrsToList
      (_: user: [
        "install -d -o ${user.name} -g ${user.group} -m 0755 /ceph/p/${user.name}"
      ]) users);
    script = pkgs.writeShellScript "create-paste-dirs.sh" (lib.concatLines commands);
  in {
    enable = true;
    wants = [ "remote-fs.target" ];
    after = [ "remote-fs.target" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig.ExecStart = script;
  };
}

19
m/hut/postgresql.nix Normal file
View File

@ -0,0 +1,19 @@
{ ... }:
{
  # PostgreSQL hosting the performance tests database, with local-socket
  # access for the listed users.
  services.postgresql = {
    enable = true;
    ensureDatabases = [ "perftestsdb" ];
    ensureUsers = [
      { name = "anavarro"; ensureClauses.superuser = true; }
      { name = "rarias"; ensureClauses.superuser = true; }
      { name = "grafana"; }
    ];
    # "trust" allows passwordless access over the local unix socket only.
    # NOTE(review): any local process can connect as these roles via the
    # socket; acceptable only on a trusted host.
    authentication = ''
      #type database DBuser auth-method
      local perftestsdb rarias trust
      local perftestsdb anavarro trust
      local perftestsdb grafana trust
    '';
  };
}

79
m/hut/public-inbox.css Normal file
View File

@ -0,0 +1,79 @@
/*
 * CC0-1.0 <https://creativecommons.org/publicdomain/zero/1.0/legalcode>
 * Adapted from a dark color scheme using 216 web-safe colors, inspired
 * somewhat by the default color scheme in mutt. This variant uses a light
 * page background; note some .hl highlight colors below may still assume a
 * dark background.
 * https://en.wikipedia.org/wiki/Light-on-dark_color_scheme
 */
* {
  font-size: 14px;
  font-family: monospace;
}
pre {
  white-space: pre-wrap;
  padding: 10px;
  background: #f5f5f5;
}
hr {
  margin: 30px 0;
}
body {
  max-width: 120ex; /* 120 columns wide */
  margin: 50px auto;
}
/*
 * Underlined links add visual noise which make them hard-to-read.
 * Use colors to make them stand out, instead.
 */
a:link {
  color: #007;
  text-decoration: none;
}
a:visited {
  color:#504;
}
a:hover {
  text-decoration: underline;
}
/* quoted text in emails gets a different color */
*.q { color:gray }
/*
 * these may be used with cgit <https://git.zx2c4.com/cgit/>, too.
 * (cgit uses <div>, public-inbox uses <span>)
 */
*.add { color:darkgreen } /* diff post-image lines */
*.del { color:darkred } /* diff pre-image lines */
*.head { color:black } /* diff header (metainformation) */
*.hunk { color:gray } /* diff hunk-header */
/*
 * highlight 3.x colors (tested 3.18) for displaying blobs.
 * This doesn't use most of the colors available, as I find too
 * many colors overwhelming, so the default is commented out.
 */
.hl.num { color:#f30 } /* number */
.hl.esc { color:#f0f } /* escape character */
.hl.str { color:#f30 } /* string */
.hl.ppc { color:#f0f } /* preprocessor */
.hl.pps { color:#f30 } /* preprocessor string */
.hl.slc { color:#09f } /* single-line comment */
.hl.com { color:#09f } /* multi-line comment */
/* .hl.opt { color:#ccc } */ /* operator */
/* .hl.ipl { color:#ccc } */ /* interpolation */
/* keyword groups kw[a-z] */
.hl.kwa { color:#ff0 }
.hl.kwb { color:#0f0 }
.hl.kwc { color:#ff0 }
/* .hl.kwd { color:#ccc } */
/* line-number (unused by public-inbox) */
/* .hl.lin { color:#ccc } */

47
m/hut/public-inbox.nix Normal file
View File

@ -0,0 +1,47 @@
{ lib, ... }:
{
  services.public-inbox = {
    enable = true;
    http = {
      enable = true;
      port = 8081;
      # Served behind nginx at /lists.
      mounts = [ "/lists" ];
    };
    settings.publicinbox = {
      css = [ "${./public-inbox.css}" ];
      # List all inboxes in the WWW listing.
      wwwlisting = "all";
    };
    inboxes = {
      bscpkgs = {
        url = "https://jungle.bsc.es/lists/bscpkgs";
        address = [ "~rodarima/bscpkgs@lists.sr.ht" ];
        watch = [ "imaps://jungle-robot%40gmx.com@imap.gmx.com/INBOX" ];
        description = "Patches for bscpkgs";
        listid = "~rodarima/bscpkgs.lists.sr.ht";
      };
      jungle = {
        url = "https://jungle.bsc.es/lists/jungle";
        address = [ "~rodarima/jungle@lists.sr.ht" ];
        watch = [ "imaps://jungle-robot%40gmx.com@imap.gmx.com/INBOX" ];
        description = "Patches for jungle";
        listid = "~rodarima/jungle.lists.sr.ht";
      };
    };
  };
  # We need access to the network for the watch service, as we will fetch the
  # emails directly from the IMAP server.
  systemd.services.public-inbox-watch.serviceConfig = {
    PrivateNetwork = lib.mkForce false;
    RestrictAddressFamilies = lib.mkForce [ "AF_UNIX" "AF_INET" "AF_INET6" ];
    KillSignal = "SIGKILL"; # Avoid slow shutdown
    # Required for chmod(..., 02750) on directories by git, from
    # systemd.exec(8):
    # > Note that this restricts marking of any type of file system object with
    # > these bits, including both regular files and directories (where the SGID
    # > is a different meaning than for files, see documentation).
    RestrictSUIDSGID = lib.mkForce false;
  };
}

35
m/hut/pxe.nix Normal file
View File

@ -0,0 +1,35 @@
{ theFlake, pkgs, ... }:
# This module describes a script that can launch the pixiecore daemon to serve a
# NixOS image via PXE to a node to directly boot from there, without requiring a
# working disk.
let
  # The host config must have the netboot-minimal.nix module too
  host = theFlake.nixosConfigurations.lake2;
  sys = host.config.system;
  build = sys.build;
  kernel = "${build.kernel}/bzImage";
  initrd = "${build.netbootRamdisk}/initrd";
  init = "${build.toplevel}/init";
  # Helper that serves the kernel/initrd of the host above over PXE.
  script = pkgs.writeShellScriptBin "pixiecore-helper" ''
    #!/usr/bin/env bash -x
    ${pkgs.pixiecore}/bin/pixiecore \
      boot ${kernel} ${initrd} --cmdline "init=${init} loglevel=4" \
      --debug --dhcp-no-bind --port 64172 --status-port 64172 "$@"
  '';
in
{
  ## We need a DHCP server to provide the IP
  #services.dnsmasq = {
  #  enable = true;
  #  settings = {
  #    domain-needed = true;
  #    dhcp-range = [ "192.168.0.2,192.168.0.254" ];
  #  };
  #};
  environment.systemPackages = [ script ];
}

15
m/hut/targets.yml Normal file
View File

@ -0,0 +1,15 @@
# BMC hostnames scraped remotely over the LAN by the "ipmi-lan" Prometheus
# job (loaded via file_sd_configs in monitoring.nix); the "-ipmi" suffix is
# stripped by a relabel rule to form the instance label.
- targets:
  - owl1-ipmi
  - owl2-ipmi
  - xeon03-ipmi
  - xeon04-ipmi
  - koro-ipmi
  - weasel-ipmi
  - hut-ipmi
  - eudy-ipmi
  # Storage
  - bay-ipmi
  - oss01-ipmi
  - lake2-ipmi
  labels:
    job: ipmi-lan
35
m/koro/configuration.nix Normal file
View File

@ -0,0 +1,35 @@
{ config, pkgs, lib, modulesPath, ... }:
{
  imports = [
    ../common/ssf.nix
    #(modulesPath + "/installer/netboot/netboot-minimal.nix")
    ../eudy/cpufreq.nix
    ../eudy/users.nix
    ./kernel.nix
  ];
  # Select this using the ID to avoid mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d5376d2";
  # disable automatic garbage collector
  nix.gc.automatic = lib.mkForce false;
  # members of the tracing group can use the lttng-provided kernel events
  # without root permissions
  users.groups.tracing.members = [ "arocanon" "vlopez" ];
  # set up both ethernet and infiniband ips
  networking = {
    hostName = "koro";
    interfaces.eno1.ipv4.addresses = [ {
      address = "10.0.40.5";
      prefixLength = 24;
    } ];
    interfaces.ibp5s0.ipv4.addresses = [ {
      address = "10.0.42.5";
      prefixLength = 24;
    } ];
  };
}

70
m/koro/kernel.nix Normal file
View File

@ -0,0 +1,70 @@
{ pkgs, lib, ... }:
let
  #fcs-devel = pkgs.linuxPackages_custom {
  #  version = "6.2.8";
  #  src = /mnt/data/kernel/fcs/kernel/src;
  #  configfile = /mnt/data/kernel/fcs/kernel/configs/defconfig;
  #};
  #fcsv1 = fcs-kernel "bc11660676d3d68ce2459b9fb5d5e654e3f413be" false;
  #fcsv2 = fcs-kernel "db0f2eca0cd57a58bf456d7d2c7d5d8fdb25dfb1" false;
  #fcsv1-lockdep = fcs-kernel "bc11660676d3d68ce2459b9fb5d5e654e3f413be" true;
  #fcsv2-lockdep = fcs-kernel "db0f2eca0cd57a58bf456d7d2c7d5d8fdb25dfb1" true;
  #fcs-kernel = gitCommit: lockdep: pkgs.linuxPackages_custom {
  #  version = "6.2.8";
  #  src = builtins.fetchGit {
  #    url = "git@bscpm03.bsc.es:ompss-kernel/linux.git";
  #    rev = gitCommit;
  #    ref = "fcs";
  #  };
  #  configfile = if lockdep then ./configs/lockdep else ./configs/defconfig;
  #};
  # Kernel flavour actually used by this machine (see variants below).
  kernel = nixos-fcs;
  # Build the FCS kernel from the ompss-kernel repository; lock statistics
  # and full preemption can be toggled through the override arguments.
  nixos-fcs-kernel = lib.makeOverridable ({gitCommit, lockStat ? false, preempt ? false, branch ? "fcs"}: pkgs.linuxPackagesFor (pkgs.buildLinux rec {
    version = "6.2.8";
    src = builtins.fetchGit {
      url = "git@bscpm03.bsc.es:ompss-kernel/linux.git";
      rev = gitCommit;
      ref = branch;
    };
    structuredExtraConfig = with lib.kernel; {
      # add general custom kernel options here
    } // lib.optionalAttrs lockStat {
      LOCK_STAT = yes;
    } // lib.optionalAttrs preempt {
      PREEMPT = lib.mkForce yes;
      PREEMPT_VOLUNTARY = lib.mkForce no;
    };
    kernelPatches = [];
    extraMeta.branch = lib.versions.majorMinor version;
  }));
  nixos-fcs = nixos-fcs-kernel {gitCommit = "8a09822dfcc8f0626b209d6d2aec8b5da459dfee";};
  # Alternative variants kept for experiments.
  nixos-fcs-lockstat = nixos-fcs.override {
    lockStat = true;
  };
  nixos-fcs-lockstat-preempt = nixos-fcs.override {
    lockStat = true;
    preempt = true;
  };
  latest = pkgs.linuxPackages_latest;
in {
  imports = [
    ../eudy/kernel/lttng.nix
    ../eudy/kernel/perf.nix
  ];
  boot.kernelPackages = lib.mkForce kernel;
  # disable all cpu mitigations
  boot.kernelParams = [
    "mitigations=off"
  ];
  # enable memory overcommit, needed to build a taglibc system using nix after
  # increasing the openblas memory footprint
  boot.kernel.sysctl."vm.overcommit_memory" = 1;
}

84
m/lake2/configuration.nix Normal file
View File

@ -0,0 +1,84 @@
{ config, pkgs, lib, modulesPath, ... }:
# Ceph OSD node "lake2": contributes four NVMe-backed OSD daemons to the
# storage cluster whose monitor ("bay") is at 10.0.40.40, and is reachable
# for monitoring from hut.
{
imports = [
../common/ssf.nix
../module/monitoring.nix
../module/hut-substituter.nix
];
# Install the bootloader on the disk addressed by stable ID so a device-name
# reshuffle cannot pick the wrong disk.
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
boot.kernel.sysctl = {
# Restrict ptrace to descendants; mkForce wins over a laxer value set by an
# imported module.
"kernel.yama.ptrace_scope" = lib.mkForce "1";
};
# Ceph CLI tools for administration.
environment.systemPackages = with pkgs; [
ceph
];
services.ceph = {
enable = true;
global = {
fsid = "9c8d06e0-485f-4aaf-b16b-06d6daf1232b";
monHost = "10.0.40.40";
monInitialMembers = "bay";
clusterNetwork = "10.0.40.40/24"; # Use Ethernet only
};
osd = {
enable = true;
# One daemon per NVME disk
daemons = [ "4" "5" "6" "7" ];
extraConfig = {
"osd crush chooseleaf type" = "0";
"osd journal size" = "10000";
"osd pool default min size" = "2";
"osd pool default pg num" = "200";
"osd pool default pgp num" = "200";
"osd pool default size" = "3";
};
};
};
networking = {
hostName = "lake2";
# Ethernet (10.0.40.x) and OmniPath (10.0.42.x) addresses.
interfaces.eno1.ipv4.addresses = [ {
address = "10.0.40.42";
prefixLength = 24;
} ];
interfaces.ibp5s0.ipv4.addresses = [ {
address = "10.0.42.42";
prefixLength = 24;
} ];
firewall = {
extraCommands = ''
# Accept all incoming TCP traffic from bay
iptables -A nixos-fw -p tcp -s bay -j nixos-fw-accept
# Accept monitoring requests from hut
iptables -A nixos-fw -p tcp -s hut --dport 9002 -j nixos-fw-accept
# Accept all Ceph traffic from the local network
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 -m multiport --dport 3300,6789,6800:7568 -j nixos-fw-accept
'';
};
};
# Missing service for volumes, see:
# https://www.reddit.com/r/ceph/comments/14otjyo/comment/jrd69vt/
systemd.services.ceph-volume = {
enable = true;
description = "Ceph Volume activation";
unitConfig = {
Type = "oneshot";
After = "local-fs.target";
Wants = "local-fs.target";
};
path = [ pkgs.ceph pkgs.util-linux pkgs.lvm2 pkgs.cryptsetup ];
serviceConfig = {
# Don't kill the activation helpers when the unit stops.
KillMode = "none";
Environment = "CEPH_VOLUME_TIMEOUT=10000";
ExecStart = "/bin/sh -c 'timeout $CEPH_VOLUME_TIMEOUT ${pkgs.ceph}/bin/ceph-volume lvm activate --all --no-systemd'";
TimeoutSec = "0";
};
wantedBy = [ "multi-user.target" ];
};
}

70
m/map.nix Normal file
View File

@ -0,0 +1,70 @@
{
# Physical rack map of the machines, keyed by room/rack, then by host.
# In physical order from top to bottom (see note below)
ssf = {
# Switches for Ethernet and OmniPath
switch-C6-S1A-05 = { pos=42; size=1; model="Dell S3048-ON"; };
switch-opa = { pos=41; size=1; };
# SSF login
apex = { pos=39; size=2; label="SSFHEAD"; board="R2208WTTYSR"; contact="rodrigo.arias@bsc.es"; };
# Storage
bay = { pos=38; size=1; label="MDS01"; board="S2600WT2R"; sn="BQWL64850303"; contact="rodrigo.arias@bsc.es"; };
lake1 = { pos=37; size=1; label="OSS01"; board="S2600WT2R"; sn="BQWL64850234"; contact="rodrigo.arias@bsc.es"; };
lake2 = { pos=36; size=1; label="OSS02"; board="S2600WT2R"; sn="BQWL64850266"; contact="rodrigo.arias@bsc.es"; };
# Compute xeon
owl1 = { pos=35; size=1; label="SSF-XEON01"; board="S2600WTTR"; sn="BQWL64954172"; contact="rodrigo.arias@bsc.es"; };
owl2 = { pos=34; size=1; label="SSF-XEON02"; board="S2600WTTR"; sn="BQWL64756560"; contact="rodrigo.arias@bsc.es"; };
xeon03 = { pos=33; size=1; label="SSF-XEON03"; board="S2600WTTR"; sn="BQWL64750826"; contact="rodrigo.arias@bsc.es"; };
# Slot 32 empty (between xeon03 at 33 and koro at 31)
koro = { pos=31; size=1; label="SSF-XEON05"; board="S2600WTTR"; sn="BQWL64954293"; contact="rodrigo.arias@bsc.es"; };
weasel = { pos=30; size=1; label="SSF-XEON06"; board="S2600WTTR"; sn="BQWL64750846"; contact="antoni.navarro@bsc.es"; };
hut = { pos=29; size=1; label="SSF-XEON07"; board="S2600WTTR"; sn="BQWL64751184"; contact="rodrigo.arias@bsc.es"; };
eudy = { pos=28; size=1; label="SSF-XEON08"; board="S2600WTTR"; sn="BQWL64756586"; contact="aleix.rocanonell@bsc.es"; };
# 16 KNL nodes, 4 per chassis
knl01_04 = { pos=26; size=2; label="KNL01..KNL04"; board="HNS7200APX"; };
knl05_08 = { pos=24; size=2; label="KNL05..KNL08"; board="HNS7200APX"; };
knl09_12 = { pos=22; size=2; label="KNL09..KNL12"; board="HNS7200APX"; };
knl13_16 = { pos=20; size=2; label="KNL13..KNL16"; board="HNS7200APX"; };
# Slot 19 empty
# EPI (hw team, guessed order)
epi01 = { pos=18; size=1; contact="joan.cabre@bsc.es"; };
epi02 = { pos=17; size=1; contact="joan.cabre@bsc.es"; };
epi03 = { pos=16; size=1; contact="joan.cabre@bsc.es"; };
anon = { pos=14; size=2; }; # Unlabeled machine. Operative
# These are old and decommissioned (off)
power8 = { pos=12; size=2; label="BSCPOWER8N3"; decommissioned=true; };
powern1 = { pos=8; size=4; label="BSCPOWERN1"; decommissioned=true; };
gustafson = { pos=7; size=1; label="gustafson"; decommissioned=true; };
odap01 = { pos=3; size=4; label="ODAP01"; decommissioned=true; };
amhdal = { pos=2; size=1; label="AMHDAL"; decommissioned=true; }; # sic
moore = { pos=1; size=1; label="moore (earth)"; decommissioned=true; };
};
bsc2218 = {
raccoon = { board="W2600CR"; sn="QSIP22500829"; contact="rodrigo.arias@bsc.es"; };
tent = { label="SSF-XEON04"; board="S2600WTTR"; sn="BQWL64751229"; contact="rodrigo.arias@bsc.es"; };
};
upc = {
fox = { board="H13DSG-O-CPU"; sn="UM24CS600392"; prod="AS-4125GS-TNRT"; prod_sn="E508839X5103339"; contact="rodrigo.arias@bsc.es"; };
};
# NOTE: Position is specified in "U" units (44.45 mm) and starts at 1 from the
# bottom. Example:
#
# | ... | - [pos+size] <--- Label in chassis
# +--------+
# | node | - [pos+1]
# | 2U | - [pos]
# +--------+
# | ... | - [pos-1]
#
# NOTE: The board and sn refers to the FRU information (Board Product and
# Board Serial) via `ipmitool fru print 0`.
}

49
m/module/amd-uprof.nix Normal file
View File

@ -0,0 +1,49 @@
{ config, lib, pkgs, ... }:
let
  cfg = config.services.amd-uprof;
in
{
  options.services.amd-uprof = {
    # mkEnableOption yields a bool option, default false, with the
    # description "Whether to enable AMD uProf." — same as before.
    enable = lib.mkEnableOption "AMD uProf";
  };

  # Everything below only takes effect when the service is enabled.
  config = lib.mkIf cfg.enable {
    # Build the out-of-tree driver against the kernel selected by this
    # configuration and load it at boot.
    boot.extraModulePackages = [ config.boot.kernelPackages.amd-uprof-driver ];
    boot.kernelModules = [ "AMDPowerProfiler" ];

    # Userspace profiling tools in $PATH.
    environment.systemPackages = [ pkgs.amd-uprof ];

    # The AMDPowerProfiler module neither creates its /dev node nor emits
    # uevents, so udev rules cannot create the device automatically. Run a
    # oneshot unit that creates it after the modules are loaded instead.
    systemd.services.amd-uprof-device = {
      description = "Create /dev/AMDPowerProfiler device";
      after = [ "systemd-modules-load.service" ];
      wantedBy = [ "multi-user.target" ];
      # Only act when the module is loaded and the device is still missing.
      unitConfig.ConditionPathExists = [
        "/proc/AMDPowerProfiler/device"
        "!/dev/AMDPowerProfiler"
      ];
      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = true;
        ExecStart = pkgs.writeShellScript "add-amd-uprof-dev.sh" ''
          mknod /dev/AMDPowerProfiler -m 666 c $(< /proc/AMDPowerProfiler/device) 0
        '';
        ExecStop = pkgs.writeShellScript "remove-amd-uprof-dev.sh" ''
          rm -f /dev/AMDPowerProfiler
        '';
      };
    };
  };
}

24
m/module/ceph.nix Normal file
View File

@ -0,0 +1,24 @@
{ config, pkgs, ... }:
# Mounts the /ceph filesystem at boot
{
environment.systemPackages = with pkgs; [
ceph-client
fio # For benchmarks
];
# We need the ceph module loaded as the mount.ceph binary fails to run the
# modprobe command.
boot.kernelModules = [ "ceph" ];
# CephX keyring for the mount, decrypted by agenix at activation time.
age.secrets.cephUser.file = ../../secrets/ceph-user.age;
fileSystems."/ceph" = {
fsType = "ceph";
# New-style mount device: <user>@<fsid>.<fs-name>=<path within the fs>.
device = "user@9c8d06e0-485f-4aaf-b16b-06d6daf1232b.cephfs=/";
options = [
"mon_addr=10.0.40.40"
"secretfile=${config.age.secrets.cephUser.path}"
];
};
}

3
m/module/debuginfod.nix Normal file
View File

@ -0,0 +1,3 @@
# Serve debug symbols for store paths on demand via nixseparatedebuginfod.
{
  services = {
    nixseparatedebuginfod = {
      enable = true;
    };
  };
}

3
m/module/emulation.nix Normal file
View File

@ -0,0 +1,3 @@
# Transparently run binaries for foreign CPU architectures through
# binfmt_misc-registered QEMU emulators.
{
  boot.binfmt.emulatedSystems = [
    "armv7l-linux"
    "aarch64-linux"
    "powerpc64le-linux"
    "riscv64-linux"
  ];
}

View File

@ -0,0 +1,13 @@
{ config, ... }:
# Use the hut node as an extra binary cache (substituter) for Nix builds.
{
nix.settings =
# Don't add hut as a cache to itself
assert config.networking.hostName != "hut";
{
extra-substituters = [ "http://hut/cache" ];
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
# Set a low timeout in case hut is down
connect-timeout = 3; # seconds
};
}

24
m/module/jungle-users.nix Normal file
View File

@ -0,0 +1,24 @@
{ config, lib, ... }:
with lib;
# users.jungleUsers: like users.users, plus a `hosts` list that restricts on
# which nodes (by networking.hostName) each account is created.
{
  options = {
    users.jungleUsers = mkOption {
      type = types.attrsOf (types.anything // { check = (x: x ? "hosts"); });
      description = ''
        Same as users.users but with the extra `hosts` attribute, which controls
        access to the nodes by `networking.hostName`.
      '';
    };
  };

  config = let
    host = config.networking.hostName;
    # A user belongs on this node only if the node appears in its `hosts`.
    hasAccess = userConf: builtins.elem host userConf.hosts;
    # Drop the extra `hosts` attribute so the value fits users.users.
    stripHosts = userConf: builtins.removeAttrs userConf [ "hosts" ];
  in {
    users.users =
      mapAttrs (_: stripHosts)
        (filterAttrs (_: hasAccess) config.users.jungleUsers);
  };
}

View File

@ -0,0 +1,17 @@
{ config, lib, pkgs, ... }:
# Runs the meteocat Prometheus exporter as a sandboxed systemd service.
with lib;
{
systemd.services."prometheus-meteocat-exporter" = {
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
# mkDefault everywhere so a host can override any of these knobs.
Restart = mkDefault "always";
PrivateTmp = mkDefault true;
WorkingDirectory = mkDefault "/tmp";
# Run under an ephemeral unprivileged user.
DynamicUser = mkDefault true;
ExecStart = "${pkgs.meteocat-exporter}/bin/meteocat-exporter";
};
};
}

25
m/module/monitoring.nix Normal file
View File

@ -0,0 +1,25 @@
{ config, lib, ... }:
# Common monitoring setup: Prometheus node and smartctl exporters.
{
# We need access to the devices to monitor the disk space
systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
# Required to allow the smartctl exporter to read the nvme0 character device,
# see the commit message on:
# https://github.com/NixOS/nixpkgs/commit/12c26aca1fd55ab99f831bedc865a626eee39f80
services.udev.extraRules = ''
SUBSYSTEM=="nvme", KERNEL=="nvme[0-9]*", GROUP="disk"
'';
services.prometheus = {
exporters = {
node = {
enable = true;
enabledCollectors = [ "systemd" ];
# Scraped by the central Prometheus on this port.
port = 9002;
};
smartctl.enable = true;
};
};
}

26
m/module/nix-daemon-builds.sh Executable file
View File

@ -0,0 +1,26 @@
#!/bin/sh
# Emit an HTTP response in the Prometheus text format with one
# nix_daemon_build sample per derivation currently being built by the nix
# daemon. Builders are the 2nd-level children of the oldest nix-daemon
# process; the derivation name is taken from the "name=..." entry of each
# builder's environment (reading /proc/<pid>/environ requires root).

# Locate nix daemon pid (oldest matching process)
nd=$(pgrep -o nix-daemon)
# Locate children of nix-daemon
pids1=$(tr ' ' '\n' < "/proc/$nd/task/$nd/children")
# For each child, locate 2nd level children (the builder processes)
pids2=$(echo "$pids1" | xargs -I @ /bin/sh -c 'cat /proc/@/task/*/children' | tr ' ' '\n')
cat <<EOF
HTTP/1.1 200 OK
Content-Type: text/plain; version=0.0.4; charset=utf-8; escaping=values
# HELP nix_daemon_build Nix daemon derivation build state.
# TYPE nix_daemon_build gauge
EOF
for pid in $pids2; do
	# Extract the derivation name, keeping only characters that are safe
	# inside a Prometheus label value. Errors (e.g. the process already
	# exited) are deliberately ignored: we just skip that builder.
	name=$(cat "/proc/$pid/environ" 2>/dev/null | tr '\0' '\n' | rg "^name=(.+)" - --replace '$1' | tr -dc ' [:alnum:]_\-\.')
	user=$(ps -o uname= -p "$pid")
	if [ -n "$name" ] && [ -n "$user" ]; then
		printf 'nix_daemon_build{user="%s",name="%s"} 1\n' "$user" "$name"
	fi
done

View File

@ -0,0 +1,23 @@
{ pkgs, config, lib, ... }:
# Exposes the nix-daemon-builds.sh metrics script over TCP port 9999 using
# socat, which spawns one instance of the script per incoming connection.
let
# Copy the script into the store and make it executable.
script = pkgs.runCommand "nix-daemon-exporter.sh" { }
''
cp ${./nix-daemon-builds.sh} $out;
chmod +x $out
''
;
in
{
systemd.services.nix-daemon-exporter = {
description = "Daemon to export nix-daemon metrics";
# The script needs pgrep (procps) and rg (ripgrep) at runtime.
path = [ pkgs.procps pkgs.ripgrep ];
wantedBy = [ "default.target" ];
serviceConfig = {
Type = "simple";
ExecStart = "${pkgs.socat}/bin/socat TCP4-LISTEN:9999,fork EXEC:${script}";
# Needed root to read the environment, potentially unsafe
User = "root";
Group = "root";
};
};
}

20
m/module/nvidia.nix Normal file
View File

@ -0,0 +1,20 @@
{ lib, config, pkgs, ... }:
# Common NVIDIA driver setup for CUDA-capable nodes.
{
# Configure Nvidia driver to use with CUDA
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
# Force each host to choose open vs. proprietary kernel modules explicitly;
# evaluation aborts if the host forgets to set it.
hardware.nvidia.open = lib.mkDefault (builtins.abort "hardware.nvidia.open not set");
hardware.graphics.enable = true;
nixpkgs.config.nvidia.acceptLicense = true;
services.xserver.videoDrivers = [ "nvidia" ];
# enable support for derivations which require nvidia-gpu to be available
# > requiredSystemFeatures = [ "cuda" ];
programs.nix-required-mounts.enable = true;
programs.nix-required-mounts.presets.nvidia-gpu.enable = true;
# They forgot to add the symlink
programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
];
environment.systemPackages = [ pkgs.cudainfo ];
}

68
m/module/p.nix Normal file
View File

@ -0,0 +1,68 @@
{ config, lib, pkgs, ... }:
# "p" paste service: a small shell command that stores stdin under
# ${path}/$USER and prints the resulting public URL.
let
cfg = config.services.p;
in
{
options = {
services.p = {
enable = lib.mkOption {
type = lib.types.bool;
default = false;
description = "Whether to enable the p service.";
};
path = lib.mkOption {
type = lib.types.str;
default = "/var/lib/p";
description = "Where to save the pasted files on disk.";
};
url = lib.mkOption {
type = lib.types.str;
default = "https://jungle.bsc.es/p";
description = "URL prefix for the printed file.";
};
};
};
config = lib.mkIf cfg.enable {
# The `p` command: reads stdin, writes it to a random file with the given
# extension (default "txt") and prints its URL.
environment.systemPackages = let
p = pkgs.writeShellScriptBin "p" ''
set -e
pastedir="${cfg.path}/$USER"
cd "$pastedir"
ext="txt"
if [ -n "$1" ]; then
ext="$1"
fi
out=$(mktemp "XXXXXXXX.$ext")
cat > "$out"
chmod go+r "$out"
echo "${cfg.url}/$USER/$out"
'';
in [ p ];
# Oneshot setup unit that creates one paste directory per normal user.
systemd.services.p = let
# Take only normal users
users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
# Create a directory for each user
commands = lib.concatLists (lib.mapAttrsToList (_: user: [
"install -d -o ${user.name} -g ${user.group} -m 0755 ${cfg.path}/${user.name}"
]) users);
in {
description = "P service setup";
requires = [ "network-online.target" ];
#wants = [ "remote-fs.target" ];
#after = [ "remote-fs.target" ];
wantedBy = [ "multi-user.target" ];
serviceConfig = {
ExecStart = pkgs.writeShellScript "p-init.sh" (''
install -d -o root -g root -m 0755 ${cfg.path}
'' + (lib.concatLines commands));
};
};
};
}

33
m/module/power-policy.nix Normal file
View File

@ -0,0 +1,33 @@
{ config, lib, pkgs, ... }:
# Sets the chassis power restore policy (what the machine does when AC power
# returns) through IPMI at boot.
with lib;
let
cfg = config.power.policy;
in
{
options = {
power.policy = mkOption {
type = types.nullOr (types.enum [ "always-on" "previous" "always-off" ]);
default = null;
description = "Set power policy to use via IPMI.";
};
};
# Do nothing unless a policy was chosen.
config = mkIf (cfg != null) {
systemd.services."power-policy" = {
description = "Set power policy to use via IPMI";
wantedBy = [ "multi-user.target" ];
unitConfig = {
# Retry a few times but give up within 10 minutes.
StartLimitBurst = "10";
StartLimitIntervalSec = "10m";
};
serviceConfig = {
ExecStart = "${pkgs.ipmitool}/bin/ipmitool chassis policy ${cfg}";
Type = "oneshot";
Restart = "on-failure";
RestartSec = "5s";
};
};
};
}

24
m/module/slurm-client.nix Normal file
View File

@ -0,0 +1,24 @@
{ lib, ... }:
# SLURM compute-node (slurmd) configuration on top of the common settings.
{
imports = [
./slurm-common.nix
];
systemd.services.slurmd.serviceConfig = {
# Kill all processes in the control group on stop/restart. This will kill
# all the jobs running, so ensure that we only upgrade when the nodes are
# not in use. See:
# https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
KillMode = lib.mkForce "control-group";
# If slurmd fails to contact the control server it will fail, causing the
# node to remain out of service until manually restarted. Always try to
# restart it.
Restart = "always";
RestartSec = "30s";
};
services.slurm.client.enable = true;
}

115
m/module/slurm-common.nix Normal file
View File

@ -0,0 +1,115 @@
{ config, pkgs, ... }:
# SLURM settings shared by the controller and the compute nodes: cluster
# topology, power saving via IPMI, and munge authentication.
let
# Power down idle nodes through their IPMI interface. SLURM passes a host
# range as $1; both scripts log to /var/log/power_save.log.
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Shutting down host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
done
'';
resumeProgram = pkgs.writeShellScript "resume.sh" ''
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
set -x
export "PATH=/run/current-system/sw/bin:$PATH"
echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log
hosts=$(scontrol show hostnames $1)
for host in $hosts; do
echo Starting host: $host
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
done
'';
in {
services.slurm = {
controlMachine = "apex";
clusterName = "jungle";
nodeName = [
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
];
partitionName = [
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
"fox Nodes=fox Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
];
# See slurm.conf(5) for more details about these options.
extraConfig = ''
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
# not with Intel MPI. For that use the compatibility shim libpmi.so
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
# library in SLURM (--mpi=pmix). See more details here:
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
MpiDefault=pmix
# When a node reboots return that node to the slurm queue as soon as it
# becomes operative again.
ReturnToService=2
# Track all processes by using a cgroup
ProctrackType=proctrack/cgroup
# Enable task/affinity to allow the jobs to run in a specified subset of
# the resources. Use the task/cgroup plugin to enable process containment.
TaskPlugin=task/affinity,task/cgroup
# Power off unused nodes until they are requested
SuspendProgram=${suspendProgram}
SuspendTimeout=60
ResumeProgram=${resumeProgram}
ResumeTimeout=300
SuspendExcNodes=fox
# Turn the nodes off after 1 hour of inactivity
SuspendTime=3600
# Reduce port range so we can allow only this range in the firewall
SrunPortRange=60000-61000
# Use cores as consumable resources. In SLURM terms, a core may have
# multiple hardware threads (or CPUs).
SelectType=select/cons_tres
# Ignore memory constraints and only use unused cores to share a node with
# other jobs.
SelectTypeParameters=CR_Core
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
# This sets up the "extern" step into which ssh-launched processes will be
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
# when a task runs (srun) so we can ssh early.
PrologFlags=Alloc,Contain,X11
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
# adopted by the external step, similar to tasks running in regular steps
# LaunchParameters=ulimit_pam_adopt
SlurmdDebug=debug5
#DebugFlags=Protocol,Cgroup
'';
extraCgroupConfig = ''
CgroupPlugin=cgroup/v2
#ConstrainCores=yes
'';
};
# Place the slurm config in /etc as this will be required by PAM
environment.etc.slurm.source = config.services.slurm.etcSlurm;
# Shared munge key for cluster-wide authentication, decrypted by agenix.
age.secrets.mungeKey = {
file = ../../secrets/munge-key.age;
owner = "munge";
group = "munge";
};
services.munge = {
enable = true;
password = config.age.secrets.mungeKey.path;
};
}

View File

@ -0,0 +1,28 @@
{ config, lib, pkgs, ... }:
# Runs the Prometheus SLURM exporter bound to localhost only.
# See also: https://github.com/NixOS/nixpkgs/pull/112010
# And: https://github.com/NixOS/nixpkgs/pull/115839
with lib;
{
systemd.services."prometheus-slurm-exporter" = {
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
# mkDefault everywhere so hosts can override individual knobs.
Restart = mkDefault "always";
PrivateTmp = mkDefault true;
WorkingDirectory = mkDefault "/tmp";
DynamicUser = mkDefault true;
ExecStart = ''
${pkgs.prometheus-slurm-exporter}/bin/prometheus-slurm-exporter --listen-address "127.0.0.1:9341"
'';
Environment = [
# The exporter shells out to the SLURM CLI tools.
"PATH=${pkgs.slurm}/bin"
# We need to specify the slurm config to be able to talk to the slurmd
# daemon.
"SLURM_CONF=${config.services.slurm.etcSlurm}/slurm.conf"
];
};
};
}

View File

@ -0,0 +1,8 @@
{ ... }:
# Firewall holes needed on SLURM compute nodes.
{
networking.firewall = {
# Required for PMIx in SLURM, we should find a better way
allowedTCPPortRanges = [ { from=1024; to=65535; } ];
};
}

View File

@ -0,0 +1,19 @@
{ ... }:
# Make the hut node's /nix/store visible to SLURM jobs on this node so jobs
# can run binaries built on hut without copying them.
{
# Mount the hut nix store via NFS
fileSystems."/mnt/hut-nix-store" = {
device = "hut:/nix/store";
fsType = "nfs";
options = [ "ro" ];
};
systemd.services.slurmd.serviceConfig = {
# When running a job, bind the hut store in /nix/store so the paths are
# available too.
# FIXME: This doesn't keep the programs in /run/current-system/sw/bin
# available in the store. Ideally they should be merged but the overlay FS
# doesn't work when the underlying directories change.
BindReadOnlyPaths = "/mnt/hut-nix-store:/nix/store";
};
}

23
m/module/slurm-server.nix Normal file
View File

@ -0,0 +1,23 @@
{ ... }:
# SLURM controller (slurmctld) configuration on top of the common settings.
{
imports = [
./slurm-common.nix
];
services.slurm.server.enable = true;
networking.firewall = {
extraCommands = ''
# Accept slurm connections to controller from compute nodes
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817 -j nixos-fw-accept
# Accept slurm connections from compute nodes for srun
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
# Accept slurm connections to controller from fox (via wireguard)
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 6817 -j nixos-fw-accept
# Accept slurm connections from fox for srun (via wireguard)
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 60000:61000 -j nixos-fw-accept
'';
};
}

View File

@ -0,0 +1,17 @@
{ config, lib, pkgs, ... }:
# Runs the upc-qaire Prometheus exporter as a sandboxed systemd service.
with lib;
{
systemd.services."prometheus-upc-qaire-exporter" = {
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
# mkDefault everywhere so a host can override any of these knobs.
Restart = mkDefault "always";
PrivateTmp = mkDefault true;
WorkingDirectory = mkDefault "/tmp";
# Run under an ephemeral unprivileged user.
DynamicUser = mkDefault true;
ExecStart = "${pkgs.upc-qaire-exporter}/bin/upc-qaire-exporter";
};
};
}

35
m/module/vpn-dac.nix Normal file
View File

@ -0,0 +1,35 @@
{config, ...}:
# OpenVPN client for the UPC DAC network; only the route to fox-ipmi is
# accepted from the server. Login and client key come from agenix secrets,
# the CA and client certificates are public and kept in the repo.
{
age.secrets.vpn-dac-login.file = ../../secrets/vpn-dac-login.age;
age.secrets.vpn-dac-client-key.file = ../../secrets/vpn-dac-client-key.age;
services.openvpn.servers = {
# systemctl status openvpn-dac.service
dac = {
config = ''
client
dev tun
proto tcp
remote vpn.ac.upc.edu 1194
remote vpn.ac.upc.edu 80
resolv-retry infinite
nobind
persist-key
persist-tun
ca ${./vpn-dac/ca.crt}
cert ${./vpn-dac/client.crt}
# Only key needs to be secret
key ${config.age.secrets.vpn-dac-client-key.path}
remote-cert-tls server
comp-lzo
verb 3
auth-user-pass ${config.age.secrets.vpn-dac-login.path}
reneg-sec 0
# Only route fox-ipmi
pull-filter ignore "route "
route 147.83.35.27 255.255.255.255
'';
};
};
}

31
m/module/vpn-dac/ca.crt Normal file
View File

@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFUjCCBDqgAwIBAgIJAJH118PApk5hMA0GCSqGSIb3DQEBCwUAMIHLMQswCQYD
VQQGEwJFUzESMBAGA1UECBMJQmFyY2Vsb25hMRIwEAYDVQQHEwlCYXJjZWxvbmEx
LTArBgNVBAoTJFVuaXZlcnNpdGF0IFBvbGl0ZWNuaWNhIGRlIENhdGFsdW55YTEk
MCIGA1UECxMbQXJxdWl0ZWN0dXJhIGRlIENvbXB1dGFkb3JzMRAwDgYDVQQDEwdM
Q0FDIENBMQ0wCwYDVQQpEwRMQ0FDMR4wHAYJKoZIhvcNAQkBFg9sY2FjQGFjLnVw
Yy5lZHUwHhcNMTYwMTEyMTI0NDIxWhcNNDYwMTEyMTI0NDIxWjCByzELMAkGA1UE
BhMCRVMxEjAQBgNVBAgTCUJhcmNlbG9uYTESMBAGA1UEBxMJQmFyY2Vsb25hMS0w
KwYDVQQKEyRVbml2ZXJzaXRhdCBQb2xpdGVjbmljYSBkZSBDYXRhbHVueWExJDAi
BgNVBAsTG0FycXVpdGVjdHVyYSBkZSBDb21wdXRhZG9yczEQMA4GA1UEAxMHTENB
QyBDQTENMAsGA1UEKRMETENBQzEeMBwGCSqGSIb3DQEJARYPbGNhY0BhYy51cGMu
ZWR1MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA0CteSeof7Xwi51kC
F0nQ4E9iR5Lq7wtfRuVPn6JJcIxJJ6+F9gr4R/HIHTztW4XAzReE36DYfexupx3D
6UgQIkMLlVyGqRbulNF+RnCx20GosF7Dm4RGBVvOxBP1PGjYq/A+XhaaDAFd0cOF
LMNkzuYP7PF0bnBEaHnxmN8bPmuyDyas7fK9AAc3scyWT2jSBPbOVFvCJwPg8MH9
V/h+hKwL/7hRt1MVfVv2qyIuKwTki8mUt0RcVbP7oJoRY5K1+R52phIz/GL/b4Fx
L6MKXlQxLi8vzP4QZXgCMyV7oFNdU3VqCEXBA11YIRvsOZ4QS19otIk/ZWU5x+HH
LAIJ7wIDAQABo4IBNTCCATEwHQYDVR0OBBYEFNyezX1cH1N4QR14ebBpljqmtE7q
MIIBAAYDVR0jBIH4MIH1gBTcns19XB9TeEEdeHmwaZY6prRO6qGB0aSBzjCByzEL
MAkGA1UEBhMCRVMxEjAQBgNVBAgTCUJhcmNlbG9uYTESMBAGA1UEBxMJQmFyY2Vs
b25hMS0wKwYDVQQKEyRVbml2ZXJzaXRhdCBQb2xpdGVjbmljYSBkZSBDYXRhbHVu
eWExJDAiBgNVBAsTG0FycXVpdGVjdHVyYSBkZSBDb21wdXRhZG9yczEQMA4GA1UE
AxMHTENBQyBDQTENMAsGA1UEKRMETENBQzEeMBwGCSqGSIb3DQEJARYPbGNhY0Bh
Yy51cGMuZWR1ggkAkfXXw8CmTmEwDAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsF
AAOCAQEAUAmOvVXIQrR+aZVO0bOTeugKBHB75eTIZSIHIn2oDUvDbAP5GXIJ56A1
6mZXxemSMY8/9k+pRcwJhfat3IgvAN159XSqf9kRv0NHgc3FWUI1Qv/BsAn0vJO/
oK0dbmbbRWqt86qNrCN+cUfz5aovvxN73jFfnvfDQFBk/8enj9wXxYfokjjLPR1Q
+oTkH8dY68qf71oaUB9MndppPEPSz0K1S6h1XxvJoSu9MVSXOQHiq1cdZdxRazI3
4f7q9sTCL+khwDAuZxAYzlEYxFFa/NN8PWU6xPw6V+t/aDhOiXUPJQB/O/K7mw3Z
TQQx5NqM7B5jjak5fauR3/oRD8XXsA==
-----END CERTIFICATE-----

100
m/module/vpn-dac/client.crt Normal file
View File

@ -0,0 +1,100 @@
Certificate:
Data:
Version: 3 (0x2)
Serial Number: 2 (0x2)
Signature Algorithm: sha256WithRSAEncryption
Issuer: C=ES, ST=Barcelona, L=Barcelona, O=Universitat Politecnica de Catalunya, OU=Arquitectura de Computadors, CN=LCAC CA/name=LCAC/emailAddress=lcac@ac.upc.edu
Validity
Not Before: Jan 12 12:45:41 2016 GMT
Not After : Jan 12 12:45:41 2046 GMT
Subject: C=ES, ST=Barcelona, L=Barcelona, O=Universitat Politecnica de Catalunya, OU=Arquitectura de Computadors, CN=client/name=LCAC/emailAddress=lcac@ac.upc.edu
Subject Public Key Info:
Public Key Algorithm: rsaEncryption
Public-Key: (2048 bit)
Modulus:
00:97:99:fa:7a:0e:4d:e2:1d:a5:b1:a8:14:18:64:
c7:66:bf:de:99:1d:92:3b:86:82:4d:95:39:f7:a6:
56:49:97:14:4f:e3:37:00:6c:f4:d0:1d:56:79:e7:
19:b5:dd:36:15:8e:1d:57:7b:59:29:d2:11:bf:58:
48:e0:f7:41:3d:16:64:8d:a2:0b:4a:ac:fa:c6:83:
dc:10:2a:2c:d9:97:48:ee:11:2a:bc:4b:60:dd:b9:
2e:8f:45:ca:87:0b:38:65:1c:f8:a2:1d:f9:50:aa:
6e:60:f9:48:df:57:12:23:e1:e7:0c:81:5c:9f:c5:
b2:e6:99:99:95:30:6d:57:36:06:8c:fd:fb:f9:4f:
60:d2:3c:ba:ae:28:56:2f:da:58:5c:e8:c5:7b:ec:
76:d9:28:6e:fb:8c:07:f9:d7:23:c3:72:76:3c:fa:
dc:20:67:8f:cc:16:e0:91:07:d5:68:f9:20:4d:7d:
5c:2d:02:04:16:76:52:f3:53:be:a3:dc:0d:d5:fb:
6b:55:29:f3:52:35:c8:7d:99:d1:4a:94:be:b1:8e:
fd:85:18:25:eb:41:e9:56:da:af:62:84:20:0a:00:
17:94:92:94:91:6a:f8:54:37:17:ee:1e:bb:fb:93:
71:91:d9:e4:e9:b8:3b:18:7d:6d:7d:4c:ce:58:55:
f9:41
Exponent: 65537 (0x10001)
X509v3 extensions:
X509v3 Basic Constraints:
CA:FALSE
Netscape Comment:
Easy-RSA Generated Certificate
X509v3 Subject Key Identifier:
1B:88:06:D5:33:1D:5C:48:46:B5:DE:78:89:36:96:91:3A:74:43:18
X509v3 Authority Key Identifier:
keyid:DC:9E:CD:7D:5C:1F:53:78:41:1D:78:79:B0:69:96:3A:A6:B4:4E:EA
DirName:/C=ES/ST=Barcelona/L=Barcelona/O=Universitat Politecnica de Catalunya/OU=Arquitectura de Computadors/CN=LCAC CA/name=LCAC/emailAddress=lcac@ac.upc.edu
serial:91:F5:D7:C3:C0:A6:4E:61
X509v3 Extended Key Usage:
TLS Web Client Authentication
X509v3 Key Usage:
Digital Signature
X509v3 Subject Alternative Name:
DNS:client
Signature Algorithm: sha256WithRSAEncryption
42:e8:50:b2:e7:88:75:86:0b:bb:29:e3:aa:c6:0e:4c:e8:ea:
3d:0c:02:31:7f:3b:80:0c:3f:80:af:45:d6:62:27:a0:0e:e7:
26:09:12:97:95:f8:d9:9b:89:b5:ef:56:64:f1:de:82:74:e0:
31:0a:cc:90:0a:bd:50:b8:54:95:0a:ae:3b:40:df:76:b6:d1:
01:2e:f3:96:9f:52:d4:e9:14:6d:b7:14:9d:45:99:33:36:2a:
01:0b:15:1a:ed:55:dc:64:83:65:1a:06:42:d9:c7:dc:97:d4:
02:81:c2:58:2b:ea:e4:b7:ae:84:3a:e4:3f:f1:2e:fa:ec:f3:
40:5d:b8:6a:d5:5e:e1:e8:2f:e2:2f:48:a4:38:a1:4f:22:e3:
4f:66:94:aa:02:78:9a:2b:7a:5d:aa:aa:51:a5:e3:d0:91:e9:
1d:f9:08:ed:8b:51:c9:a6:af:46:85:b5:1c:ed:12:a1:28:33:
75:36:00:d8:5c:14:65:96:c0:28:7d:47:50:a4:89:5f:b0:72:
1a:4b:13:17:26:0f:f0:b8:65:3c:e9:96:36:f9:bf:90:59:33:
87:1f:01:03:25:f8:f0:3a:9b:33:02:d0:0a:43:b5:0a:cf:62:
a1:45:38:37:07:9d:9c:94:0b:31:c6:3c:34:b7:fc:5a:0c:e4:
bf:23:f6:7d
-----BEGIN CERTIFICATE-----
MIIFqjCCBJKgAwIBAgIBAjANBgkqhkiG9w0BAQsFADCByzELMAkGA1UEBhMCRVMx
EjAQBgNVBAgTCUJhcmNlbG9uYTESMBAGA1UEBxMJQmFyY2Vsb25hMS0wKwYDVQQK
EyRVbml2ZXJzaXRhdCBQb2xpdGVjbmljYSBkZSBDYXRhbHVueWExJDAiBgNVBAsT
G0FycXVpdGVjdHVyYSBkZSBDb21wdXRhZG9yczEQMA4GA1UEAxMHTENBQyBDQTEN
MAsGA1UEKRMETENBQzEeMBwGCSqGSIb3DQEJARYPbGNhY0BhYy51cGMuZWR1MB4X
DTE2MDExMjEyNDU0MVoXDTQ2MDExMjEyNDU0MVowgcoxCzAJBgNVBAYTAkVTMRIw
EAYDVQQIEwlCYXJjZWxvbmExEjAQBgNVBAcTCUJhcmNlbG9uYTEtMCsGA1UEChMk
VW5pdmVyc2l0YXQgUG9saXRlY25pY2EgZGUgQ2F0YWx1bnlhMSQwIgYDVQQLExtB
cnF1aXRlY3R1cmEgZGUgQ29tcHV0YWRvcnMxDzANBgNVBAMTBmNsaWVudDENMAsG
A1UEKRMETENBQzEeMBwGCSqGSIb3DQEJARYPbGNhY0BhYy51cGMuZWR1MIIBIjAN
BgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAl5n6eg5N4h2lsagUGGTHZr/emR2S
O4aCTZU596ZWSZcUT+M3AGz00B1WeecZtd02FY4dV3tZKdIRv1hI4PdBPRZkjaIL
Sqz6xoPcECos2ZdI7hEqvEtg3bkuj0XKhws4ZRz4oh35UKpuYPlI31cSI+HnDIFc
n8Wy5pmZlTBtVzYGjP37+U9g0jy6rihWL9pYXOjFe+x22Shu+4wH+dcjw3J2PPrc
IGePzBbgkQfVaPkgTX1cLQIEFnZS81O+o9wN1ftrVSnzUjXIfZnRSpS+sY79hRgl
60HpVtqvYoQgCgAXlJKUkWr4VDcX7h67+5Nxkdnk6bg7GH1tfUzOWFX5QQIDAQAB
o4IBljCCAZIwCQYDVR0TBAIwADAtBglghkgBhvhCAQ0EIBYeRWFzeS1SU0EgR2Vu
ZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQWBBQbiAbVMx1cSEa13niJNpaROnRD
GDCCAQAGA1UdIwSB+DCB9YAU3J7NfVwfU3hBHXh5sGmWOqa0TuqhgdGkgc4wgcsx
CzAJBgNVBAYTAkVTMRIwEAYDVQQIEwlCYXJjZWxvbmExEjAQBgNVBAcTCUJhcmNl
bG9uYTEtMCsGA1UEChMkVW5pdmVyc2l0YXQgUG9saXRlY25pY2EgZGUgQ2F0YWx1
bnlhMSQwIgYDVQQLExtBcnF1aXRlY3R1cmEgZGUgQ29tcHV0YWRvcnMxEDAOBgNV
BAMTB0xDQUMgQ0ExDTALBgNVBCkTBExDQUMxHjAcBgkqhkiG9w0BCQEWD2xjYWNA
YWMudXBjLmVkdYIJAJH118PApk5hMBMGA1UdJQQMMAoGCCsGAQUFBwMCMAsGA1Ud
DwQEAwIHgDARBgNVHREECjAIggZjbGllbnQwDQYJKoZIhvcNAQELBQADggEBAELo
ULLniHWGC7sp46rGDkzo6j0MAjF/O4AMP4CvRdZiJ6AO5yYJEpeV+NmbibXvVmTx
3oJ04DEKzJAKvVC4VJUKrjtA33a20QEu85afUtTpFG23FJ1FmTM2KgELFRrtVdxk
g2UaBkLZx9yX1AKBwlgr6uS3roQ65D/xLvrs80BduGrVXuHoL+IvSKQ4oU8i409m
lKoCeJorel2qqlGl49CR6R35CO2LUcmmr0aFtRztEqEoM3U2ANhcFGWWwCh9R1Ck
iV+wchpLExcmD/C4ZTzpljb5v5BZM4cfAQMl+PA6mzMC0ApDtQrPYqFFODcHnZyU
CzHGPDS3/FoM5L8j9n0=
-----END CERTIFICATE-----

28
m/owl1/configuration.nix Normal file
View File

@ -0,0 +1,28 @@
{ config, pkgs, ... }:
# Compute node "owl1": SLURM client with Ceph, emulation and debuginfod.
{
imports = [
../common/ssf.nix
../module/ceph.nix
../module/emulation.nix
../module/slurm-client.nix
../module/slurm-firewall.nix
../module/debuginfod.nix
../module/hut-substituter.nix
];
# Select the boot disk by its stable ID to avoid device-name mismatches
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53566c";
networking = {
hostName = "owl1";
# Ethernet (10.0.40.x) and OmniPath (10.0.42.x) addresses.
interfaces.eno1.ipv4.addresses = [ {
address = "10.0.40.1";
prefixLength = 24;
} ];
interfaces.ibp5s0.ipv4.addresses = [ {
address = "10.0.42.1";
prefixLength = 24;
} ];
};
}

29
m/owl2/configuration.nix Normal file
View File

@ -0,0 +1,29 @@
{ config, pkgs, ... }:
# Compute node "owl2": SLURM client with Ceph, emulation and debuginfod.
{
imports = [
../common/ssf.nix
../module/ceph.nix
../module/emulation.nix
../module/slurm-client.nix
../module/slurm-firewall.nix
../module/debuginfod.nix
../module/hut-substituter.nix
];
# Select the boot disk by its stable ID to avoid device-name mismatches
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d535629";
networking = {
hostName = "owl2";
interfaces.eno1.ipv4.addresses = [ {
address = "10.0.40.2";
prefixLength = 24;
} ];
# Watch out! The OmniPath device is not in the same place here:
interfaces.ibp129s0.ipv4.addresses = [ {
address = "10.0.42.2";
prefixLength = 24;
} ];
};
}

View File

@ -0,0 +1,98 @@
{ config, pkgs, lib, modulesPath, ... }:
# Node "raccoon": office machine that NATs the 10.0.44.0/24 network and joins
# the cluster over WireGuard (see ./wireguard.nix).
{
imports = [
../common/base.nix
../common/ssf/hosts.nix
../module/emulation.nix
../module/debuginfod.nix
../module/nvidia.nix
../eudy/kernel/perf.nix
./wireguard.nix
../module/hut-substituter.nix
];
# Don't install Grub on the disk yet
boot.loader.grub.device = "nodev";
# Enable serial console
boot.kernelParams = [
"console=tty1"
"console=ttyS1,115200"
];
networking = {
hostName = "raccoon";
# Only BSC DNSs seem to be reachable from the office VLAN
nameservers = [ "84.88.52.35" "84.88.52.36" ];
defaultGateway = "84.88.51.129";
interfaces.eno0.ipv4.addresses = [ {
address = "84.88.51.152";
prefixLength = 25;
} ];
# Internal leg used to NAT the tent network.
interfaces.enp5s0f1.ipv4.addresses = [ {
address = "10.0.44.1";
prefixLength = 24;
} ];
nat = {
enable = true;
internalInterfaces = [ "enp5s0f1" ];
externalInterface = "eno0";
};
hosts = {
"10.0.44.4" = [ "tent" ];
"84.88.53.236" = [ "apex" ];
};
};
# Mount the NFS home
# NOTE(review): 10.106.0.30 is apex over WireGuard; rsize/wsize look
# unusually small — confirm they are intentional.
fileSystems."/nfs/home" = {
device = "10.106.0.30:/home";
fsType = "nfs";
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
};
# Enable performance governor
powerManagement.cpuFreqGovernor = "performance";
hardware.nvidia.open = false; # Maxwell is older than Turing architecture
services.openssh.settings.X11Forwarding = true;
# Node exporter only on loopback; scraped through the tunnel/proxy.
services.prometheus.exporters.node = {
enable = true;
enabledCollectors = [ "systemd" ];
port = 9002;
listenAddress = "127.0.0.1";
};
users.motd = ''
DO YOU BRING FEEDS?
'';
}

48
m/raccoon/wireguard.nix Normal file
View File

@ -0,0 +1,48 @@
# WireGuard configuration for raccoon: interface wg0 (10.106.0.236/24) peers
# with fox and apex, and NAT is extended so WireGuard peers can reach the
# outside world through eno0.
#
# The unused `pkgs` argument was dropped; only `config` is referenced (for
# the agenix secret path).
{ config, ... }:
{
  # Merged with the NAT settings in configuration.nix by the module system:
  # wg0 is added to the internal interfaces behind the eno0 uplink.
  networking.nat = {
    enable = true;
    enableIPv6 = false;
    externalInterface = "eno0";
    internalInterfaces = [ "wg0" ];
  };
  # Open the WireGuard listen port.
  networking.firewall = {
    allowedUDPPorts = [ 666 ];
  };
  # Private key is provisioned via agenix.
  age.secrets.wgRaccoon.file = ../../secrets/wg-raccoon.age;
  # Enable WireGuard
  networking.wireguard.enable = true;
  networking.wireguard.interfaces = {
    wg0 = {
      ips = [ "10.106.0.236/24" ];
      listenPort = 666;
      privateKeyFile = config.age.secrets.wgRaccoon.path;
      # Public key: QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=
      peers = [
        {
          name = "fox";
          publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
          allowedIPs = [ "10.106.0.1/32" ];
          endpoint = "fox.ac.upc.edu:666";
          persistentKeepalive = 25;
        }
        {
          name = "apex";
          # Also routes the 10.0.40.0/24 (ssf) network through apex.
          publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
          allowedIPs = [ "10.106.0.30/32" "10.0.40.0/24" ];
          endpoint = "ssfhead.bsc.es:666";
          persistentKeepalive = 25;
        }
      ];
    };
  };
  # Convenience names for the peers' tunnel addresses.
  networking.hosts = {
    "10.106.0.1" = [ "fox.wg" ];
    "10.106.0.30" = [ "apex.wg" ];
  };
}

14
m/tent/blackbox.yml Normal file
View File

@ -0,0 +1,14 @@
# Probe modules for the Prometheus blackbox exporter, referenced by the
# "module" URL parameter in the scrape configs of m/tent/monitoring.nix.
modules:
  # HTTP GET probe; succeeds on any 2xx status, following redirects.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      preferred_ip_protocol: "ip4"
      follow_redirects: true
      valid_status_codes: [] # Defaults to 2xx
      method: GET
  # ICMP (ping) reachability probe.
  icmp:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"

85
m/tent/configuration.nix Normal file
View File

@ -0,0 +1,85 @@
# Host configuration for "tent": service node on the 10.0.44.0/24 network
# behind raccoon, running the monitoring stack, nginx, gitea, the gitlab
# runners and the nix cache (see the imported per-service modules).
{ config, pkgs, lib, ... }:
{
  imports = [
    ../common/xeon.nix
    ../common/ssf/hosts.nix
    ../module/emulation.nix
    ../module/debuginfod.nix
    ./monitoring.nix
    ./nginx.nix
    ./nix-serve.nix
    ./gitlab-runner.nix
    ./gitea.nix
    ../hut/public-inbox.nix
    ../hut/msmtp.nix
    ../module/p.nix
    ../module/vpn-dac.nix
    ../module/hut-substituter.nix
  ];
  # Select the boot disk by its stable ID to avoid device-name mismatches
  boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d537675";
  networking = {
    hostName = "tent";
    interfaces.eno1.ipv4.addresses = [
      {
        address = "10.0.44.4";
        prefixLength = 24;
      }
    ];
    # Only BSC DNSs seem to be reachable from the office VLAN
    nameservers = [ "84.88.52.35" "84.88.52.36" ];
    search = [ "bsc.es" "ac.upc.edu" ];
    # Default route via raccoon, which NATs this network.
    defaultGateway = "10.0.44.1";
    hosts = {
      "84.88.53.236" = [ "apex" ];
      "10.0.44.1" = [ "raccoon" ];
    };
  };
  services.p.enable = true;
  # Node exporter on loopback only; scraped by the local Prometheus.
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "systemd" ];
    port = 9002;
    listenAddress = "127.0.0.1";
  };
  # Software RAID holding the vault filesystem (see fileSystems."/vault").
  boot.swraid = {
    enable = true;
    mdadmConf = ''
      DEVICE partitions
      ARRAY /dev/md0 metadata=1.2 UUID=496db1e2:056a92aa:a544543f:40db379d
      MAILADDR root
    '';
  };
  fileSystems."/vault" = {
    device = "/dev/disk/by-label/vault";
    fsType = "ext4";
  };
  # Make a /vault/home/$USER directory for each normal user, owned by the
  # user and world-traversable (0711) so nginx can serve ~user pages.
  systemd.services.create-vault-dirs = let
    # Take only normal users in tent
    users = lib.filterAttrs (_: v: v.isNormalUser) config.users.users;
    commands = lib.concatLists (lib.mapAttrsToList
      (_: user: [
        "install -d -o ${user.name} -g ${user.group} -m 0711 /vault/home/${user.name}"
      ]) users);
    script = pkgs.writeShellScript "create-vault-dirs.sh" (lib.concatLines commands);
  in {
    enable = true;
    # Run once /vault is mounted.
    wants = [ "local-fs.target" ];
    after = [ "local-fs.target" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig.ExecStart = script;
  };
  # disable automatic garbage collector
  nix.gc.automatic = lib.mkForce false;
}

30
m/tent/gitea.nix Normal file
View File

@ -0,0 +1,30 @@
# Gitea forge, reverse-proxied by nginx at https://jungle.bsc.es/git/
# (see ./nginx.nix). Outgoing mail goes through the system sendmail wrapper.
#
# The unused `config` and `lib` module arguments were removed.
{ ... }:
{
  services.gitea = {
    enable = true;
    appName = "Gitea in the jungle";
    settings = {
      server = {
        ROOT_URL = "https://jungle.bsc.es/git/";
        LOCAL_ROOT_URL = "https://jungle.bsc.es/git/";
        LANDING_PAGE = "explore";
      };
      # Expose /metrics for Prometheus.
      metrics.ENABLED = true;
      service = {
        # Accounts are created manually; self-registration is closed.
        DISABLE_REGISTRATION = true;
        REGISTER_MANUAL_CONFIRM = true;
        ENABLE_NOTIFY_MAIL = true;
      };
      log.LEVEL = "Warn";
      # Deliver notification mail via the local sendmail wrapper.
      mailer = {
        ENABLED = true;
        FROM = "jungle-robot@bsc.es";
        PROTOCOL = "sendmail";
        SENDMAIL_PATH = "/run/wrappers/bin/sendmail";
        SENDMAIL_ARGS = "--";
      };
    };
  };
}

93
m/tent/gitlab-runner.nix Normal file
View File

@ -0,0 +1,93 @@
# GitLab CI runners on tent: a shell runner and a docker runner for
# gitlab.pm.bsc.es, plus a docker runner for gitlab.bsc.es that exposes the
# host Nix store read-only inside build containers so jobs can use Nix via
# the host daemon.
{ pkgs, lib, config, ... }:
{
  # Runner tokens are provisioned via agenix secrets.
  age.secrets.tent-gitlab-runner-pm-shell.file = ../../secrets/tent-gitlab-runner-pm-shell-token.age;
  age.secrets.tent-gitlab-runner-pm-docker.file = ../../secrets/tent-gitlab-runner-pm-docker-token.age;
  age.secrets.tent-gitlab-runner-bsc-docker.file = ../../secrets/tent-gitlab-runner-bsc-docker-token.age;
  services.gitlab-runner = let sec = config.age.secrets; in {
    enable = true;
    settings.concurrent = 5;
    services = {
      # For gitlab.pm.bsc.es
      gitlab-pm-shell = {
        executor = "shell";
        environmentVariables = {
          SHELL = "${pkgs.bash}/bin/bash";
        };
        authenticationTokenConfigFile = sec.tent-gitlab-runner-pm-shell.path;
        preGetSourcesScript = pkgs.writeScript "setup" ''
          echo "This is the preGetSources script running, brace for impact"
          env
        '';
      };
      gitlab-pm-docker = {
        authenticationTokenConfigFile = sec.tent-gitlab-runner-pm-docker.path;
        executor = "docker";
        dockerImage = "debian:stable";
      };
      # For gitlab.bsc.es
      gitlab-bsc-docker = {
        # gitlab.bsc.es still uses the old token mechanism
        registrationConfigFile = sec.tent-gitlab-runner-bsc-docker.path;
        tagList = [ "docker" "tent" "nix" ];
        executor = "docker";
        dockerImage = "alpine";
        # Share the host Nix store and daemon socket (all read-only) with
        # the build containers.
        dockerVolumes = [
          "/nix/store:/nix/store:ro"
          "/nix/var/nix/db:/nix/var/nix/db:ro"
          "/nix/var/nix/daemon-socket:/nix/var/nix/daemon-socket:ro"
        ];
        dockerDisableCache = true;
        registrationFlags = [
          # Increase build log length to 64 MiB
          "--output-limit 65536"
        ];
        # Container setup: create the writable Nix state directories, trust
        # the SSH host keys of the Git servers, then load the nix-daemon
        # profile and the CA certificate setup hook.
        preBuildScript = pkgs.writeScript "setup-container" ''
          mkdir -p -m 0755 /nix/var/log/nix/drvs
          mkdir -p -m 0755 /nix/var/nix/gcroots
          mkdir -p -m 0755 /nix/var/nix/profiles
          mkdir -p -m 0755 /nix/var/nix/temproots
          mkdir -p -m 0755 /nix/var/nix/userpool
          mkdir -p -m 1777 /nix/var/nix/gcroots/per-user
          mkdir -p -m 1777 /nix/var/nix/profiles/per-user
          mkdir -p -m 0755 /nix/var/nix/profiles/per-user/root
          mkdir -p -m 0700 "$HOME/.nix-defexpr"
          mkdir -p -m 0700 "$HOME/.ssh"
          cat >> "$HOME/.ssh/known_hosts" << EOF
          bscpm04.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPx4mC0etyyjYUT2Ztc/bs4ZXSbVMrogs1ZTP924PDgT
          gitlab-internal.bsc.es ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF9arsAOSRB06hdy71oTvJHG2Mg8zfebADxpvc37lZo3
          EOF
          . ${pkgs.nix}/etc/profile.d/nix-daemon.sh
          # Required to load SSL certificate paths
          . ${pkgs.cacert}/nix-support/setup-hook
        '';
        environmentVariables = {
          ENV = "/etc/profile";
          USER = "root";
          NIX_REMOTE = "daemon";
          PATH = "${config.system.path}/bin:/bin:/sbin:/usr/bin:/usr/sbin";
        };
      };
    };
  };
  # Run the runner as a static user (not DynamicUser) so its state under
  # $HOME/.gitlab-runner persists across restarts. Note that ''${HOME}
  # escapes Nix interpolation, leaving a literal ${HOME} in the unit file,
  # while ${pkgs.gitlab-runner} is interpolated by Nix.
  systemd.services.gitlab-runner.serviceConfig = {
    DynamicUser = lib.mkForce false;
    User = "gitlab-runner";
    Group = "gitlab-runner";
    ExecStart = lib.mkForce
      ''${pkgs.gitlab-runner}/bin/gitlab-runner run --config ''${HOME}/.gitlab-runner/config.toml --listen-address "127.0.0.1:9252" --working-directory ''${HOME}'';
  };
  # Static user with docker access; UID/GID come from the NixOS registry.
  users.users.gitlab-runner = {
    uid = config.ids.uids.gitlab-runner;
    home = "/var/lib/gitlab-runner";
    description = "Gitlab Runner";
    group = "gitlab-runner";
    extraGroups = [ "docker" ];
    createHome = true;
  };
  users.groups.gitlab-runner.gid = config.ids.gids.gitlab-runner;
}

217
m/tent/monitoring.nix Normal file
View File

@ -0,0 +1,217 @@
# Monitoring stack for tent: Grafana (behind nginx at /grafana), Prometheus
# with node/IPMI/blackbox exporters, and the custom meteocat, UPC qaire and
# nix-daemon exporters provided by the imported modules.
{ config, lib, pkgs, ... }:
{
  imports = [
    ../module/meteocat-exporter.nix
    ../module/upc-qaire-exporter.nix
    ../module/nix-daemon-exporter.nix
  ];
  # SMTP password for the jungle-robot account, readable only by grafana.
  age.secrets.grafanaJungleRobotPassword = {
    file = ../../secrets/jungle-robot-password.age;
    owner = "grafana";
    mode = "400";
  };
  services.grafana = {
    enable = true;
    settings = {
      server = {
        # Served under https://jungle.bsc.es/grafana via the nginx proxy.
        domain = "jungle.bsc.es";
        root_url = "%(protocol)s://%(domain)s/grafana";
        serve_from_sub_path = true;
        http_port = 2342;
        http_addr = "127.0.0.1";
      };
      smtp = {
        enabled = true;
        from_address = "jungle-robot@bsc.es";
        user = "jungle-robot";
        # Read the password from a file, which is only readable by grafana user
        # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
        password = "$__file{${config.age.secrets.grafanaJungleRobotPassword.path}}";
        host = "mail.bsc.es:465";
        startTLS_policy = "NoStartTLS";
      };
      feature_toggles.publicDashboards = true;
      "auth.anonymous".enabled = true;
      log.level = "warn";
    };
  };
  # Prometheus core settings; exporters and scrape jobs are defined in the
  # second services.prometheus attrset below (merged by the module system).
  services.prometheus = {
    enable = true;
    port = 9001;
    retentionTime = "5y";
    listenAddress = "127.0.0.1";
  };
  # We need access to the devices to monitor the disk space
  systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false;
  systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only";
  # Credentials for IPMI exporter
  age.secrets.ipmiYml = {
    file = ../../secrets/ipmi.yml.age;
    owner = "ipmi-exporter";
  };
  # Create an IPMI group and assign the ipmi0 device
  users.groups.ipmi = {};
  services.udev.extraRules = ''
    SUBSYSTEM=="ipmi", KERNEL=="ipmi0", GROUP="ipmi", MODE="0660"
  '';
  # Add a new ipmi-exporter user that can read the ipmi0 device
  users.users.ipmi-exporter = {
    isSystemUser = true;
    group = "ipmi";
  };
  # Disable dynamic user so we have the ipmi-exporter user available for the credentials
  systemd.services.prometheus-ipmi-exporter.serviceConfig = {
    DynamicUser = lib.mkForce false;
    PrivateDevices = lib.mkForce false;
    User = lib.mkForce "ipmi-exporter";
    Group = lib.mkForce "ipmi";
    RestrictNamespaces = lib.mkForce false;
    # Fake uid to 0 (via unshare --map-user) so it shuts up
    ExecStart = let
      cfg = config.services.prometheus.exporters.ipmi;
    in lib.mkForce (lib.concatStringsSep " " ([
      "${pkgs.util-linux}/bin/unshare --map-user 0"
      "${pkgs.prometheus-ipmi-exporter}/bin/ipmi_exporter"
      "--web.listen-address ${cfg.listenAddress}:${toString cfg.port}"
      "--config.file ${lib.escapeShellArg cfg.configFile}"
    ] ++ cfg.extraFlags));
  };
  # Exporters and scrape jobs; all exporters listen on loopback only.
  services.prometheus = {
    exporters = {
      ipmi = {
        enable = true;
        configFile = config.age.secrets.ipmiYml.path;
        #extraFlags = [ "--log.level=debug" ];
        listenAddress = "127.0.0.1";
      };
      node = {
        enable = true;
        enabledCollectors = [ "logind" ];
        port = 9002;
        listenAddress = "127.0.0.1";
      };
      # Probe modules are defined in ./blackbox.yml (http_2xx and icmp).
      blackbox = {
        enable = true;
        listenAddress = "127.0.0.1";
        configFile = ./blackbox.yml;
      };
    };
    scrapeConfigs = [
      # Exporters running on this host.
      {
        job_name = "local";
        static_configs = [{
          targets = [
            "127.0.0.1:9002" # Node exporter
            #"127.0.0.1:9115" # Blackbox exporter
            "127.0.0.1:9290" # IPMI exporter for local node
            "127.0.0.1:9928" # UPC Qaire custom exporter
            "127.0.0.1:9929" # Meteocat custom exporter
            "127.0.0.1:9999" # Nix-daemon custom exporter
          ];
        }];
      }
      # HTTP reachability of external and internal websites.
      {
        job_name = "blackbox-http";
        metrics_path = "/probe";
        params = { module = [ "http_2xx" ]; };
        static_configs = [{
          targets = [
            "https://www.google.com/robots.txt"
            "https://pm.bsc.es/"
            "https://pm.bsc.es/gitlab/"
            "https://jungle.bsc.es/"
            "https://gitlab.bsc.es/"
          ];
        }];
        relabel_configs = [
          {
            # Takes the address and sets it in the "target=<xyz>" URL parameter
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            # Sets the "instance" label with the remote host we are querying
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            # Shows the host target address instead of the blackbox address
            target_label = "__address__";
            replacement = "127.0.0.1:9115";
          }
        ];
      }
      # ICMP (ping) reachability of hosts we care about.
      {
        job_name = "blackbox-icmp";
        metrics_path = "/probe";
        params = { module = [ "icmp" ]; };
        static_configs = [{
          targets = [
            "1.1.1.1"
            "8.8.8.8"
            "ssfhead"
            "raccoon"
            "anella-bsc.cesca.cat"
            "upc-anella.cesca.cat"
            "fox.ac.upc.edu"
            "fox-ipmi.ac.upc.edu"
            "arenys5.ac.upc.edu"
            "arenys0-2.ac.upc.edu"
            "epi01.bsc.es"
            "axle.bsc.es"
          ];
        }];
        relabel_configs = [
          {
            # Takes the address and sets it in the "target=<xyz>" URL parameter
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            # Sets the "instance" label with the remote host we are querying
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            # Shows the host target address instead of the blackbox address
            target_label = "__address__";
            replacement = "127.0.0.1:9115";
          }
        ];
      }
      # Remote IPMI BMCs, queried through the local IPMI exporter.
      {
        job_name = "ipmi-raccoon";
        metrics_path = "/ipmi";
        static_configs = [
          { targets = [ "127.0.0.1:9290" ]; }
        ];
        params = {
          target = [ "raccoon-ipmi" ];
          module = [ "raccoon" ];
        };
      }
      {
        job_name = "ipmi-fox";
        metrics_path = "/ipmi";
        static_configs = [
          { targets = [ "127.0.0.1:9290" ]; }
        ];
        params = {
          target = [ "fox-ipmi.ac.upc.edu" ];
          module = [ "fox" ];
        };
      }
    ];
  };
}

79
m/tent/nginx.nix Normal file
View File

@ -0,0 +1,79 @@
# Nginx on tent: serves the static jungle website at the root of
# jungle.bsc.es and reverse-proxies the local services (gitea, nix cache,
# public-inbox, grafana) under sub-paths.
#
# The `theFlake` argument was a leftover from when the website was built
# from the flake source tree; it is unused now that the site is fetched
# from its own repository, so it has been removed.
{ pkgs, ... }:
let
  # Static website built with hugo, pinned to a fixed revision of its own
  # git repository.
  website = pkgs.stdenv.mkDerivation {
    name = "jungle-web";
    src = pkgs.fetchgit {
      url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
      rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
      hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
    };
    buildInputs = [ pkgs.hugo ];
    buildPhase = ''
      rm -rf public/
      hugo
    '';
    installPhase = ''
      cp -r public $out
    '';
    # Don't mess doc/
    dontFixup = true;
  };
in
{
  networking.firewall.allowedTCPPorts = [ 80 ];
  services.nginx = {
    enable = true;
    virtualHosts."jungle.bsc.es" = {
      root = "${website}";
      listen = [
        {
          addr = "0.0.0.0";
          port = 80;
        }
      ];
      extraConfig = ''
        # Trust X-Forwarded-For from the local host and the frontend proxy.
        set_real_ip_from 127.0.0.1;
        set_real_ip_from 84.88.52.107;
        real_ip_recursive on;
        real_ip_header X-Forwarded-For;
        # Gitea
        location /git {
          rewrite ^/git$ / break;
          rewrite ^/git/(.*) /$1 break;
          proxy_pass http://127.0.0.1:3000;
          proxy_redirect http:// $scheme://;
        }
        # Nix binary cache (nix-serve)
        location /cache {
          rewrite ^/cache/(.*) /$1 break;
          proxy_pass http://127.0.0.1:5000;
          proxy_redirect http:// $scheme://;
        }
        # Mailing list archives (public-inbox)
        location /lists {
          proxy_pass http://127.0.0.1:8081;
          proxy_redirect http:// $scheme://;
        }
        location /grafana {
          proxy_pass http://127.0.0.1:2342;
          proxy_redirect http:// $scheme://;
          proxy_set_header Host $host;
          # Websockets
          proxy_http_version 1.1;
          proxy_set_header Upgrade $http_upgrade;
          proxy_set_header Connection "upgrade";
        }
        # Per-user pages from /vault/home/<user>/public_html
        location ~ ^/~(.+?)(/.*)?$ {
          alias /vault/home/$1/public_html$2;
          index index.html index.htm;
          autoindex on;
          absolute_redirect off;
        }
        location /p/ {
          alias /var/lib/p/;
        }
        location /pub/ {
          alias /vault/pub/;
        }
      '';
    };
  };
}

Some files were not shown because too many files have changed in this diff Show More