forked from rarias/jungle
Compare commits
53 Commits
lake2-ipoi
...
shared-nix
| Author | SHA1 | Date | |
|---|---|---|---|
| e065cde376 | |||
| 3bb0b550aa | |||
| e4cbcab81c | |||
| 77b41a90e2 | |||
| 1fc6891dc6 | |||
| 8c11c7460a | |||
| e6014511f5 | |||
| 320c58ce48 | |||
| d145ee9b2c | |||
| 140178d58e | |||
| d48f3b989a | |||
| 653d411b9e | |||
| 51c57dbc41 | |||
| 33cd40160e | |||
| a1e8cfea47 | |||
| 5d72ee3da3 | |||
| fdc6445d47 | |||
| e88805947e | |||
| aaefddc44a | |||
| d9d249411d | |||
| c07f75c6bb | |||
| 8d449ba20c | |||
| 10ca572aec | |||
| 75b0f48715 | |||
| 19a451db77 | |||
| ec9be9bb62 | |||
| 7ddd1977f3 | |||
| 7050c505b5 | |||
| 033a1fe97b | |||
| 77cb3c494e | |||
| 6db5772ac4 | |||
| 3e347e673c | |||
| dca274d020 | |||
| c33909f32f | |||
| 64e856e8b9 | |||
| 02f40a8217 | |||
| 77d43b6da9 | |||
| ab55aac5ff | |||
| 9b5bfbb7a3 | |||
| a69a71d1b0 | |||
| 98374bd303 | |||
| 3b6be8a2fc | |||
| 2bb366b9ac | |||
| 2d16709648 | |||
| 9344daa31c | |||
| 80c98041b5 | |||
| 3418e57907 | |||
| 6848b58e39 | |||
| 13a70411aa | |||
| f9c77b433a | |||
| 9d487845f6 | |||
| 3c99c2a662 | |||
| 7d09108c9f |
19
flake.lock
generated
19
flake.lock
generated
@@ -23,12 +23,17 @@
|
||||
}
|
||||
},
|
||||
"bscpkgs": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1690560045,
|
||||
"narHash": "sha256-39ZP+FIzlWoN3c43hReBYpStg4RLYw/z7TdxCQmOvTM=",
|
||||
"lastModified": 1694708510,
|
||||
"narHash": "sha256-72bvRBhq8Q8V6ibsR9lyBE92V2EC6C6Ek3J5cOM79So=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "b4a20d7c3af854b39682484adfd1c7979319f439",
|
||||
"revCount": 841,
|
||||
"rev": "3a4062ac04be6263c64a481420d8e768c2521b80",
|
||||
"revCount": 862,
|
||||
"type": "git",
|
||||
"url": "https://pm.bsc.es/gitlab/rarias/bscpkgs.git"
|
||||
},
|
||||
@@ -82,11 +87,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1692447944,
|
||||
"narHash": "sha256-fkJGNjEmTPvqBs215EQU4r9ivecV5Qge5cF/QDLVn3U=",
|
||||
"lastModified": 1693663421,
|
||||
"narHash": "sha256-ImMIlWE/idjcZAfxKK8sQA7A1Gi/O58u5/CJA+mxvl8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d680ded26da5cf104dd2735a51e88d2d8f487b4d",
|
||||
"rev": "e56990880811a451abd32515698c712788be5720",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
agenix.url = "github:ryantm/agenix";
|
||||
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
||||
bscpkgs.url = "git+https://pm.bsc.es/gitlab/rarias/bscpkgs.git";
|
||||
bscpkgs.inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, agenix, bscpkgs, ... }:
|
||||
@@ -25,6 +26,9 @@ in
|
||||
lake2 = mkConf "lake2";
|
||||
};
|
||||
|
||||
packages.x86_64-linux.hut = self.nixosConfigurations.hut.pkgs;
|
||||
packages.x86_64-linux = self.nixosConfigurations.hut.pkgs // {
|
||||
bscpkgs = bscpkgs.packages.x86_64-linux;
|
||||
nixpkgs = nixpkgs.legacyPackages.x86_64-linux;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
29
keys.nix
Normal file
29
keys.nix
Normal file
@@ -0,0 +1,29 @@
|
||||
# As agenix needs to parse the secrets from a standalone .nix file, we describe
|
||||
# here all the public keys
|
||||
rec {
|
||||
hosts = {
|
||||
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1 hut";
|
||||
owl1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv owl1";
|
||||
owl2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK owl2";
|
||||
eudy = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG eudy";
|
||||
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
|
||||
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
|
||||
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
|
||||
};
|
||||
|
||||
hostGroup = with hosts; rec {
|
||||
compute = [ owl1 owl2 ];
|
||||
playground = [ eudy koro ];
|
||||
storage = [ bay lake2 ];
|
||||
monitor = [ hut ];
|
||||
|
||||
system = storage ++ monitor;
|
||||
safe = system ++ compute;
|
||||
all = safe ++ playground;
|
||||
};
|
||||
|
||||
admins = {
|
||||
rarias = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
|
||||
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
|
||||
};
|
||||
}
|
||||
9
m/common/agenix.nix
Normal file
9
m/common/agenix.nix
Normal file
@@ -0,0 +1,9 @@
|
||||
{ agenix, ... }:
|
||||
|
||||
{
|
||||
imports = [ agenix.nixosModules.default ];
|
||||
|
||||
environment.systemPackages = [
|
||||
agenix.packages.x86_64-linux.default
|
||||
];
|
||||
}
|
||||
@@ -6,6 +6,9 @@
|
||||
fsType = "ext4";
|
||||
};
|
||||
|
||||
# Trim unused blocks weekly
|
||||
services.fstrim.enable = true;
|
||||
|
||||
swapDevices =
|
||||
[ { device = "/dev/disk/by-label/swap"; }
|
||||
];
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
{
|
||||
imports = [
|
||||
./agenix.nix
|
||||
./boot.nix
|
||||
./fs.nix
|
||||
./hw.nix
|
||||
@@ -10,6 +11,9 @@
|
||||
./slurm.nix
|
||||
./ssh.nix
|
||||
./users.nix
|
||||
./watchdog.nix
|
||||
./rev.nix
|
||||
./zsh.nix
|
||||
];
|
||||
|
||||
nixpkgs.overlays = [
|
||||
@@ -17,21 +21,22 @@
|
||||
(import ../../pkgs/overlay.nix)
|
||||
];
|
||||
|
||||
nix.nixPath = [
|
||||
"nixpkgs=${nixpkgs}"
|
||||
"bscpkgs=${bscpkgs}"
|
||||
"jungle=${theFlake.outPath}"
|
||||
];
|
||||
|
||||
nix.registry.nixpkgs.flake = nixpkgs;
|
||||
nix.registry.bscpkgs.flake = bscpkgs;
|
||||
nix.registry.jungle.flake = theFlake;
|
||||
|
||||
system.configurationRevision =
|
||||
if theFlake ? rev
|
||||
then theFlake.rev
|
||||
else throw ("Refusing to build from a dirty Git tree!");
|
||||
|
||||
nix.nixPath = [
|
||||
"nixpkgs=${nixpkgs}"
|
||||
"jungle=${theFlake.outPath}"
|
||||
];
|
||||
|
||||
nix.settings.flake-registry =
|
||||
pkgs.writeText "global-registry.json" ''{"flakes":[],"version":2}'';
|
||||
|
||||
nix.registry.nixpkgs.flake = nixpkgs;
|
||||
nix.registry.jungle.flake = theFlake;
|
||||
|
||||
environment.systemPackages = with pkgs; [
|
||||
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
||||
nix-diff ipmitool freeipmi ethtool lm_sensors ix cmake gnumake file tree
|
||||
@@ -40,6 +45,8 @@
|
||||
bsc.osumb
|
||||
];
|
||||
|
||||
programs.direnv.enable = true;
|
||||
|
||||
systemd.services."serial-getty@ttyS0" = {
|
||||
enable = true;
|
||||
wantedBy = [ "getty.target" ];
|
||||
@@ -71,9 +78,6 @@
|
||||
nix.gc.dates = "weekly";
|
||||
nix.gc.options = "--delete-older-than 30d";
|
||||
|
||||
programs.zsh.enable = true;
|
||||
programs.zsh.histSize = 100000;
|
||||
|
||||
programs.bash.promptInit = ''
|
||||
PS1="\h\\$ "
|
||||
'';
|
||||
|
||||
@@ -21,9 +21,14 @@
|
||||
firewall = {
|
||||
enable = true;
|
||||
allowedTCPPorts = [ 22 ];
|
||||
|
||||
# FIXME: For slurmd as it requests the compute nodes to connect to us
|
||||
allowedTCPPortRanges = [ { from=1024; to=65535; } ];
|
||||
extraCommands = ''
|
||||
# Prevent ssfhead from contacting our slurmd daemon
|
||||
iptables -A nixos-fw -p tcp -s ssfhead --dport 6817:6819 -j nixos-fw-log-refuse
|
||||
# But accept traffic to slurm ports from any other node in the subnet
|
||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817:6819 -j nixos-fw-accept
|
||||
# We also need to open the srun port range
|
||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
|
||||
'';
|
||||
};
|
||||
|
||||
extraHosts = ''
|
||||
@@ -32,8 +37,8 @@
|
||||
|
||||
# Node Entry for node: mds01 (ID=72)
|
||||
10.0.40.40 bay mds01 mds01-eth0
|
||||
10.0.42.40 mds01-ib0
|
||||
10.0.40.141 mds01-ipmi0
|
||||
10.0.42.40 bay-ib mds01-ib0
|
||||
10.0.40.141 bay-ipmi mds01-ipmi0
|
||||
|
||||
# Node Entry for node: oss01 (ID=73)
|
||||
10.0.40.41 oss01 oss01-eth0
|
||||
@@ -42,18 +47,18 @@
|
||||
|
||||
# Node Entry for node: oss02 (ID=74)
|
||||
10.0.40.42 lake2 oss02 oss02-eth0
|
||||
10.0.42.42 oss02-ib0
|
||||
10.0.40.143 oss02-ipmi0
|
||||
10.0.42.42 lake2-ib oss02-ib0
|
||||
10.0.40.143 lake2-ipmi oss02-ipmi0
|
||||
|
||||
# Node Entry for node: xeon01 (ID=15)
|
||||
10.0.40.1 owl1 xeon01 xeon01-eth0
|
||||
10.0.42.1 xeon01-ib0
|
||||
10.0.40.101 xeon01-ipmi0
|
||||
10.0.42.1 owl1-ib xeon01-ib0
|
||||
10.0.40.101 owl1-ipmi xeon01-ipmi0
|
||||
|
||||
# Node Entry for node: xeon02 (ID=16)
|
||||
10.0.40.2 owl2 xeon02 xeon02-eth0
|
||||
10.0.42.2 xeon02-ib0
|
||||
10.0.40.102 xeon02-ipmi0
|
||||
10.0.42.2 owl2-ib xeon02-ib0
|
||||
10.0.40.102 owl2-ipmi xeon02-ipmi0
|
||||
|
||||
# Node Entry for node: xeon03 (ID=17)
|
||||
10.0.40.3 xeon03 xeon03-eth0
|
||||
@@ -67,8 +72,8 @@
|
||||
|
||||
# Node Entry for node: xeon05 (ID=19)
|
||||
10.0.40.5 koro xeon05 xeon05-eth0
|
||||
10.0.42.5 xeon05-ib0
|
||||
10.0.40.105 xeon05-ipmi0
|
||||
10.0.42.5 koro-ib xeon05-ib0
|
||||
10.0.40.105 koro-ipmi xeon05-ipmi0
|
||||
|
||||
# Node Entry for node: xeon06 (ID=20)
|
||||
10.0.40.6 xeon06 xeon06-eth0
|
||||
@@ -77,13 +82,13 @@
|
||||
|
||||
# Node Entry for node: xeon07 (ID=21)
|
||||
10.0.40.7 hut xeon07 xeon07-eth0
|
||||
10.0.42.7 xeon07-ib0
|
||||
10.0.40.107 xeon07-ipmi0
|
||||
10.0.42.7 hut-ib xeon07-ib0
|
||||
10.0.40.107 hut-ipmi xeon07-ipmi0
|
||||
|
||||
# Node Entry for node: xeon08 (ID=22)
|
||||
10.0.40.8 eudy xeon08 xeon08-eth0
|
||||
10.0.42.8 xeon08-ib0
|
||||
10.0.40.108 xeon08-ipmi0
|
||||
10.0.42.8 eudy-ib xeon08-ib0
|
||||
10.0.40.108 eudy-ipmi xeon08-ipmi0
|
||||
'';
|
||||
};
|
||||
}
|
||||
|
||||
18
m/common/rev.nix
Normal file
18
m/common/rev.nix
Normal file
@@ -0,0 +1,18 @@
|
||||
{ theFlake, ... }:
|
||||
|
||||
let
|
||||
rev = if theFlake ? rev then theFlake.rev
|
||||
else throw ("Refusing to build from a dirty Git tree!");
|
||||
in {
|
||||
# Save the commit of the config in /etc/configrev
|
||||
environment.etc.configrev.text = rev + "\n";
|
||||
|
||||
# Keep a log with the config over time
|
||||
system.activationScripts.configRevLog.text = ''
|
||||
BOOTED=$(cat /run/booted-system/etc/configrev 2>/dev/null || echo unknown)
|
||||
CURRENT=$(cat /run/current-system/etc/configrev 2>/dev/null || echo unknown)
|
||||
NEXT=${rev}
|
||||
DATENOW=$(date --iso-8601=seconds)
|
||||
echo "$DATENOW booted=$BOOTED current=$CURRENT next=$NEXT" >> /var/configrev.log
|
||||
'';
|
||||
}
|
||||
@@ -1,6 +1,33 @@
|
||||
{ lib, ... }:
|
||||
{ config, pkgs, lib, ... }:
|
||||
|
||||
{
|
||||
let
|
||||
# Power-saving hook referenced by SuspendProgram in the SLURM extraConfig.
# $1 is the node list given by SLURM; it is expanded to individual host names
# with `scontrol show hostnames`, and each host is powered off through IPMI
# at the address "<host>-ipmi". All output is appended to
# /var/log/power_save.log.
suspendProgram = pkgs.writeScript "suspend.sh" ''
  #!/usr/bin/env bash
  exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
  set -x
  export "PATH=/run/current-system/sw/bin:$PATH"
  echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
  hosts=$(scontrol show hostnames $1)
  for host in $hosts; do
    echo Shutting down host: $host
    ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
  done
'';

# Counterpart of suspendProgram, referenced by ResumeProgram in the SLURM
# extraConfig: powers the requested hosts back on through IPMI. The log line
# says "Resume" so that power-on and power-off events can be told apart in
# /var/log/power_save.log.
resumeProgram = pkgs.writeScript "resume.sh" ''
  #!/usr/bin/env bash
  exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
  set -x
  export "PATH=/run/current-system/sw/bin:$PATH"
  echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log
  hosts=$(scontrol show hostnames $1)
  for host in $hosts; do
    echo Starting host: $host
    ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
  done
'';
|
||||
|
||||
in {
|
||||
systemd.services.slurmd.serviceConfig = {
|
||||
# Kill all processes in the control group on stop/restart. This will kill
|
||||
# all the jobs running, so ensure that we only upgrade when the nodes are
|
||||
@@ -9,6 +36,7 @@
|
||||
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
|
||||
KillMode = lib.mkForce "control-group";
|
||||
};
|
||||
|
||||
services.slurm = {
|
||||
client.enable = true;
|
||||
controlMachine = "hut";
|
||||
@@ -18,6 +46,11 @@
|
||||
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
|
||||
];
|
||||
|
||||
partitionName = [
|
||||
"owl Nodes=owl[1-2] Default=YES MaxTime=INFINITE State=UP"
|
||||
"all Nodes=owl[1-2],hut Default=NO MaxTime=INFINITE State=UP"
|
||||
];
|
||||
|
||||
# See slurm.conf(5) for more details about these options.
|
||||
extraConfig = ''
|
||||
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
|
||||
@@ -37,6 +70,30 @@
|
||||
# Enable task/affinity to allow the jobs to run in a specified subset of
|
||||
# the resources. Use the task/cgroup plugin to enable process containment.
|
||||
TaskPlugin=task/affinity,task/cgroup
|
||||
|
||||
# Power off unused nodes until they are requested
|
||||
SuspendProgram=${suspendProgram}
|
||||
SuspendTimeout=60
|
||||
ResumeProgram=${resumeProgram}
|
||||
ResumeTimeout=300
|
||||
SuspendExcNodes=hut
|
||||
|
||||
# Turn the nodes off after 1 hour of inactivity
|
||||
SuspendTime=3600
|
||||
|
||||
# Reduce port range so we can allow only this range in the firewall
|
||||
SrunPortRange=60000-61000
|
||||
'';
|
||||
};
|
||||
|
||||
age.secrets.mungeKey = {
|
||||
file = ../../secrets/munge-key.age;
|
||||
owner = "munge";
|
||||
group = "munge";
|
||||
};
|
||||
|
||||
services.munge = {
|
||||
enable = true;
|
||||
password = config.age.secrets.mungeKey.path;
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
{ ... }:
|
||||
{ lib, ... }:
|
||||
|
||||
let
|
||||
keys = import ../../keys.nix;
|
||||
hostsKeys = lib.mapAttrs (name: value: { publicKey = value; }) keys.hosts;
|
||||
in
|
||||
{
|
||||
# Enable the OpenSSH daemon.
|
||||
services.openssh.enable = true;
|
||||
@@ -11,13 +15,7 @@
|
||||
ProxyCommand nc -X connect -x localhost:23080 %h %p
|
||||
'';
|
||||
|
||||
programs.ssh.knownHosts = {
|
||||
"hut".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1";
|
||||
"owl1".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv";
|
||||
"owl2".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK";
|
||||
"eudy".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG";
|
||||
"koro".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67";
|
||||
|
||||
programs.ssh.knownHosts = hostsKeys // {
|
||||
"gitlab-internal.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIF9arsAOSRB06hdy71oTvJHG2Mg8zfebADxpvc37lZo3";
|
||||
"bscpm03.bsc.es".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM2NuSUPsEhqz1j5b4Gqd+MWFnRqyqY57+xMvBUqHYUS";
|
||||
};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
{ ... }:
|
||||
{ pkgs, ... }:
|
||||
|
||||
{
|
||||
users = {
|
||||
@@ -26,6 +26,7 @@
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINa0tvnNgwkc5xOwd6xTtaIdFi5jv0j2FrE7jl5MTLoE ram@mio"
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGYcXIxe0poOEGLpk8NjiRozls7fMRX0N3j3Ar94U+Gl rarias@hal"
|
||||
];
|
||||
shell = pkgs.zsh;
|
||||
};
|
||||
|
||||
arocanon = {
|
||||
@@ -53,6 +54,18 @@
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFYfXg37mauGeurqsLpedgA2XQ9d4Nm0ZGo/hI1f7wwH rpenacob@bsc"
|
||||
];
|
||||
};
|
||||
|
||||
anavarro = {
|
||||
uid = 1037;
|
||||
isNormalUser = true;
|
||||
home = "/home/Computational/anavarro";
|
||||
description = "Antoni Navarro";
|
||||
group = "Computational";
|
||||
hashedPassword = "$6$QdNDsuLehoZTYZlb$CDhCouYDPrhoiB7/seu7RF.Gqg4zMQz0n5sA4U1KDgHaZOxy2as9pbIGeF8tOHJKRoZajk5GiaZv0rZMn7Oq31";
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILWjRSlKgzBPZQhIeEtk6Lvws2XNcYwHcwPv4osSgst5 anavarro@ssfhead"
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
groups = {
|
||||
|
||||
9
m/common/watchdog.nix
Normal file
9
m/common/watchdog.nix
Normal file
@@ -0,0 +1,9 @@
|
||||
{ ... }:
|
||||
|
||||
{
|
||||
# The boards have a BMC watchdog controlled by IPMI
|
||||
boot.kernelModules = [ "ipmi_watchdog" ];
|
||||
|
||||
# Enable systemd watchdog with 30 s interval
|
||||
systemd.watchdog.runtimeTime = "30s";
|
||||
}
|
||||
91
m/common/zsh.nix
Normal file
91
m/common/zsh.nix
Normal file
@@ -0,0 +1,91 @@
|
||||
{ pkgs, ... }:
|
||||
|
||||
{
|
||||
environment.systemPackages = with pkgs; [
|
||||
zsh-completions
|
||||
nix-zsh-completions
|
||||
];
|
||||
|
||||
programs.zsh = {
|
||||
enable = true;
|
||||
histSize = 1000000;
|
||||
|
||||
shellInit = ''
|
||||
# Disable new user prompt
|
||||
if [ ! -e ~/.zshrc ]; then
|
||||
touch ~/.zshrc
|
||||
fi
|
||||
'';
|
||||
|
||||
promptInit = ''
|
||||
# Note that to manually override this in ~/.zshrc you should run `prompt off`
|
||||
# before setting your PS1 and etc. Otherwise this will likely to interact with
|
||||
# your ~/.zshrc configuration in unexpected ways as the default prompt sets
|
||||
# a lot of different prompt variables.
|
||||
autoload -U promptinit && promptinit && prompt default && setopt prompt_sp
|
||||
'';
|
||||
|
||||
# Interactive zsh setup: history substring search, incremental history
# saving, a fixed LS_COLORS value and key bindings for common terminals.
# Taken from Ulli Kehrle config:
# https://git.hrnz.li/Ulli/nixos/src/commit/2e203b8d8d671f4e3ced0f1744a51d5c6ee19846/profiles/shell.nix#L199-L205
interactiveShellInit = ''
  source "${pkgs.zsh-history-substring-search}/share/zsh-history-substring-search/zsh-history-substring-search.zsh"

  # Save history immediately, but only load it when the shell starts
  setopt inc_append_history

  # dircolors doesn't support alacritty:
  # https://lists.gnu.org/archive/html/bug-coreutils/2019-05/msg00029.html
  export LS_COLORS='rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=00:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.avif=01;35:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:*~=00;90:*#=00;90:*.bak=00;90:*.old=00;90:*.orig=00;90:*.part=00;90:*.rej=00;90:*.swp=00;90:*.tmp=00;90:*.dpkg-dist=00;90:*.dpkg-old=00;90:*.ucf-dist=00;90:*.ucf-new=00;90:*.ucf-old=00;90:*.rpmnew=00;90:*.rpmorig=00;90:*.rpmsave=00;90:';

  # From Arch Linux and GRML
  bindkey "^R" history-incremental-pattern-search-backward
  bindkey "^S" history-incremental-pattern-search-forward

  # Auto rehash for new binaries
  zstyle ':completion:*' rehash true
  # show a nice menu with the matches
  zstyle ':completion:*' menu yes select

  bindkey '^[OA' history-substring-search-up # Up
  bindkey '^[[A' history-substring-search-up # Up

  bindkey '^[OB' history-substring-search-down # Down
  bindkey '^[[B' history-substring-search-down # Down

  bindkey '\e[1~' beginning-of-line # Home
  bindkey '\e[7~' beginning-of-line # Home
  bindkey '\e[H' beginning-of-line # Home
  bindkey '\eOH' beginning-of-line # Home

  bindkey '\e[4~' end-of-line # End
  bindkey '\e[8~' end-of-line # End
  # The key sequence must not contain a trailing space, otherwise the plain
  # ESC [ F sent by the End key is never matched.
  bindkey '\e[F' end-of-line # End
  bindkey '\eOF' end-of-line # End

  bindkey '^?' backward-delete-char # Backspace
  bindkey '\e[3~' delete-char # Del
  # bindkey '\e[3;5~' delete-char # sometimes Del, sometimes C-Del
  bindkey '\e[2~' overwrite-mode # Ins

  bindkey '^H' backward-kill-word # C-Backspace

  bindkey '5~' kill-word # C-Del
  bindkey '^[[3;5~' kill-word # C-Del
  bindkey '^[[3^' kill-word # C-Del

  bindkey "^[[1;5H" backward-kill-line # C-Home
  bindkey "^[[7^" backward-kill-line # C-Home

  bindkey "^[[1;5F" kill-line # C-End
  bindkey "^[[8^" kill-line # C-End

  # NOTE(review): '^[0c' and '^[0d' below use the digit zero; presumably the
  # letter O ('^[Oc' / '^[Od') was intended -- confirm against the terminals
  # in use before changing.
  bindkey '^[[1;5C' forward-word # C-Right
  bindkey '^[0c' forward-word # C-Right
  bindkey '^[[5C' forward-word # C-Right

  bindkey '^[[1;5D' backward-word # C-Left
  bindkey '^[0d' backward-word # C-Left
  bindkey '^[[5D' backward-word # C-Left
'';
|
||||
};
|
||||
}
|
||||
@@ -1,21 +1,19 @@
|
||||
{ config, pkgs, agenix, ... }:
|
||||
{ config, pkgs, ... }:
|
||||
|
||||
{
|
||||
imports = [
|
||||
../common/main.nix
|
||||
|
||||
../module/ceph.nix
|
||||
./gitlab-runner.nix
|
||||
./monitoring.nix
|
||||
./nfs.nix
|
||||
./slurm-daemon.nix
|
||||
./ceph.nix
|
||||
./nix-serve.nix
|
||||
#./pxe.nix
|
||||
agenix.nixosModules.default
|
||||
];
|
||||
|
||||
environment.systemPackages = [
|
||||
agenix.packages.x86_64-linux.default
|
||||
];
|
||||
boot.binfmt.emulatedSystems = [ "aarch64-linux" "powerpc64le-linux" "riscv64-linux" ];
|
||||
|
||||
# Select the disk using the ID to avoid mismatches
|
||||
boot.loader.grub.device = "/dev/disk/by-id/ata-INTEL_SSDSC2BB240G7_PHDV6462004Y240AGN";
|
||||
|
||||
@@ -1,33 +1,40 @@
|
||||
{ pkgs, lib, config, ... }:
|
||||
|
||||
{
|
||||
age.secrets."secrets/ovni-token".file = ./secrets/ovni-token.age;
|
||||
age.secrets."secrets/nosv-token".file = ./secrets/nosv-token.age;
|
||||
age.secrets.ovniToken.file = ../../secrets/ovni-token.age;
|
||||
age.secrets.nosvToken.file = ../../secrets/nosv-token.age;
|
||||
|
||||
services.gitlab-runner = {
|
||||
enable = true;
|
||||
settings.concurrent = 5;
|
||||
services = {
|
||||
ovni-shell = {
|
||||
registrationConfigFile = config.age.secrets."secrets/ovni-token".path;
|
||||
registrationConfigFile = config.age.secrets.ovniToken.path;
|
||||
executor = "shell";
|
||||
tagList = [ "nix" "xeon" ];
|
||||
registrationFlags = [
|
||||
# Using space doesn't work, and causes it to misread the next flag
|
||||
"--locked='false'"
|
||||
];
|
||||
environmentVariables = {
|
||||
SHELL = "${pkgs.bash}/bin/bash";
|
||||
};
|
||||
};
|
||||
ovni-docker = {
|
||||
registrationConfigFile = config.age.secrets."secrets/ovni-token".path;
|
||||
registrationConfigFile = config.age.secrets.ovniToken.path;
|
||||
dockerImage = "debian:stable";
|
||||
tagList = [ "docker" "xeon" ];
|
||||
registrationFlags = [ "--docker-network-mode host" ];
|
||||
registrationFlags = [
|
||||
"--locked='false'"
|
||||
"--docker-network-mode host"
|
||||
];
|
||||
environmentVariables = {
|
||||
https_proxy = "http://localhost:23080";
|
||||
http_proxy = "http://localhost:23080";
|
||||
};
|
||||
};
|
||||
nosv-docker = {
|
||||
registrationConfigFile = config.age.secrets."secrets/nosv-token".path;
|
||||
registrationConfigFile = config.age.secrets.nosvToken.path;
|
||||
dockerImage = "debian:stable";
|
||||
tagList = [ "docker" "xeon" ];
|
||||
registrationFlags = [
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
enable = true;
|
||||
port = 9001;
|
||||
retentionTime = "1y";
|
||||
listenAddress = "127.0.0.1";
|
||||
};
|
||||
|
||||
systemd.services.prometheus-ipmi-exporter.serviceConfig.DynamicUser = lib.mkForce false;
|
||||
@@ -48,13 +49,18 @@
|
||||
user = "root";
|
||||
configFile = ./ipmi.yml;
|
||||
#extraFlags = [ "--log.level=debug" ];
|
||||
listenAddress = "127.0.0.1";
|
||||
};
|
||||
node = {
|
||||
enable = true;
|
||||
enabledCollectors = [ "systemd" ];
|
||||
port = 9002;
|
||||
listenAddress = "127.0.0.1";
|
||||
};
|
||||
smartctl = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
};
|
||||
smartctl.enable = true;
|
||||
};
|
||||
|
||||
scrapeConfigs = [
|
||||
|
||||
16
m/hut/nix-serve.nix
Normal file
16
m/hut/nix-serve.nix
Normal file
@@ -0,0 +1,16 @@
|
||||
{ config, ... }:
|
||||
|
||||
{
|
||||
age.secrets.nixServe.file = ../../secrets/nix-serve.age;
|
||||
|
||||
services.nix-serve = {
|
||||
enable = true;
|
||||
# Only listen locally, as we serve it via ssh
|
||||
bindAddress = "127.0.0.1";
|
||||
port = 5000;
|
||||
|
||||
secretKeyFile = config.age.secrets.nixServe.path;
|
||||
# Public key:
|
||||
# jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=
|
||||
};
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
let
|
||||
rarias = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
|
||||
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb";
|
||||
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1";
|
||||
default = [ rarias root hut ];
|
||||
in
|
||||
{
|
||||
"secrets/ovni-token.age".publicKeys = default;
|
||||
"secrets/nosv-token.age".publicKeys = default;
|
||||
"secrets/ceph-user.age".publicKeys = default;
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 CAWG4Q 35Ak+Mep9k5KnDLF1ywDbMD4l4mRFg6D0et19tqXxAw
|
||||
Wgr+CX4rzrPmUszSidtLAVSvgD80F2dqtd92hGZIFwo
|
||||
-> ssh-ed25519 MSF3dg OVFvpkAyWTowtxsafstX31H/hJpNZmnOCbvqMIN0+AQ
|
||||
VxjRcQmp+BadEh2y0PB96EeizIl3tTQpVu0CWHmsc1s
|
||||
-> ssh-ed25519 HY2yRg MJSQIpre9m0XnojgXuKQ/+hVBZNrZNGZqplwhqicpjI
|
||||
CLkE52iqpoqSnbzisNjQgxTfNqKeaRl5ntcw1d+ZDyQ
|
||||
-> m$8`De%~-grease '85p}`by
|
||||
52zMpprONcawWDDtzHdWNwFoYXErPUnVjhSONbUBpDlqAmJmD1LcAnsU
|
||||
--- 0vZOPyXQIMMGTwgFfvm8Sn8O7vjrsjGUEy5m/BASCyc
|
||||
<EFBFBD>|<04><><EFBFBD>)<29><><EFBFBD><EFBFBD><EFBFBD>*_<>D<EFBFBD>US`<06><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>r <20>s<EFBFBD><73>N<EFBFBD><4E>[^e+A1<><31>G.<2E>#<23><><EFBFBD>m<EFBFBD><6D>W<57> <20>5<0C><><EFBFBD><EFBFBD>(
|
||||
Binary file not shown.
Binary file not shown.
@@ -3,9 +3,5 @@
|
||||
{
|
||||
services.slurm = {
|
||||
server.enable = true;
|
||||
partitionName = [
|
||||
"owl Nodes=owl[1-2] Default=YES MaxTime=INFINITE State=UP"
|
||||
"all Nodes=owl[1-2],hut Default=NO MaxTime=INFINITE State=UP"
|
||||
];
|
||||
};
|
||||
}
|
||||
|
||||
@@ -7,5 +7,9 @@
|
||||
- 10.0.40.106
|
||||
- 10.0.40.107
|
||||
- 10.0.40.108
|
||||
# Storage
|
||||
- 10.0.40.141
|
||||
- 10.0.40.142
|
||||
- 10.0.40.143
|
||||
labels:
|
||||
job: ipmi-lan
|
||||
|
||||
@@ -50,4 +50,24 @@
|
||||
prefixLength = 24;
|
||||
} ];
|
||||
};
|
||||
|
||||
# Missing service for volumes, see:
|
||||
# https://www.reddit.com/r/ceph/comments/14otjyo/comment/jrd69vt/
|
||||
systemd.services.ceph-volume = {
|
||||
enable = true;
|
||||
description = "Ceph Volume activation";
|
||||
unitConfig = {
|
||||
Type = "oneshot";
|
||||
After = "local-fs.target";
|
||||
Wants = "local-fs.target";
|
||||
};
|
||||
path = [ pkgs.ceph pkgs.util-linux pkgs.lvm2 pkgs.cryptsetup ];
|
||||
serviceConfig = {
|
||||
KillMode = "none";
|
||||
Environment = "CEPH_VOLUME_TIMEOUT=10000";
|
||||
ExecStart = "/bin/sh -c 'timeout $CEPH_VOLUME_TIMEOUT ${pkgs.ceph}/bin/ceph-volume lvm activate --all --no-systemd'";
|
||||
TimeoutSec = "0";
|
||||
};
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
{ config, pkgs, ... }:
|
||||
|
||||
# Mounts the /ceph filesystem at boot
|
||||
{
|
||||
environment.systemPackages = with pkgs; [
|
||||
ceph
|
||||
@@ -11,14 +12,14 @@
|
||||
# modprobe command.
|
||||
boot.kernelModules = [ "ceph" ];
|
||||
|
||||
age.secrets."secrets/ceph-user".file = ./secrets/ceph-user.age;
|
||||
age.secrets.cephUser.file = ../../secrets/ceph-user.age;
|
||||
|
||||
fileSystems."/ceph" = {
|
||||
fsType = "ceph";
|
||||
device = "user@9c8d06e0-485f-4aaf-b16b-06d6daf1232b.cephfs=/";
|
||||
options = [
|
||||
"mon_addr=10.0.40.40"
|
||||
"secretfile=${config.age.secrets."secrets/ceph-user".path}"
|
||||
"secretfile=${config.age.secrets.cephUser.path}"
|
||||
];
|
||||
};
|
||||
}
|
||||
69
m/module/shared-nix-store.nix
Normal file
69
m/module/shared-nix-store.nix
Normal file
@@ -0,0 +1,69 @@
|
||||
{ ... }:
|
||||
{
|
||||
# Don't make the nix store read-only, as this would prevent the overlay FS
|
||||
# from being able to mount it.
|
||||
boot.readOnlyNixStore = false;
|
||||
|
||||
# The nix-daemon.socket has an unnecessary dependency on the /nix/store
|
||||
# mount point. But that mount point won't be provided until the network is
|
||||
# ready. However, the network-address-eno1.service has a dependency on
|
||||
# sockets.target, causing a cycle.
|
||||
# One solution is to make the nix-daemon.socket depend only on the socket
|
||||
# path (which is already covered by ConditionPathIsReadWrite =
|
||||
# /nix/var/nix/daemon-socket), instead of on the /nix/store.
|
||||
#
|
||||
# Using systemd.sockets.nix-daemon.unitConfig.RequiresMountsFor =
|
||||
# "/nix/var/nix/daemon-socket" doesn't work, as the mount options get
|
||||
# added by systemd when the override config is merged with the one that Nix
|
||||
# provides:
|
||||
#
|
||||
# owl2% sudo systemctl show nix-daemon.socket | grep RequiresMountsFor
|
||||
# RequiresMountsFor=/nix/store /nix/var/nix/daemon-socket/socket /nix/var/nix/daemon-socket
|
||||
#
|
||||
# To fix this, the Nix package is patched to only depend on /nix/var instead.
|
||||
# See ../../pkgs/overlay.nix for details.
|
||||
|
||||
# Mount the hut nix store via NFS in read-only mode.
|
||||
fileSystems."/mnt/hut-nix-store" = {
|
||||
device = "hut:/nix/store";
|
||||
fsType = "nfs";
|
||||
options = [ "ro" ];
|
||||
};
|
||||
|
||||
# A workdir is also needed, so setup a permanent dir using tmpfiles.
|
||||
systemd.tmpfiles.rules = [
|
||||
"d /mnt/nix-work 0700 root root -"
|
||||
];
|
||||
|
||||
# Mount an overlay in /nix/store using as lower layer the NFS store and upper
|
||||
# layer the disk nix store. The destination is still the nix store in
|
||||
# /nix/store (confusing). We need rw access, as the daemon needs to write the
|
||||
# lock files to build derivations locally. Use a systemd mount unit directly
|
||||
# so we can specify the LazyUnmount option and we avoid having it mounted
|
||||
# in the stage1 before systemd.
|
||||
systemd.mounts = [
|
||||
{
|
||||
what = "overlay";
|
||||
type = "overlay";
|
||||
where = "/nix/store";
|
||||
# We need the local-fs.target to be ready, so the network interfaces can
|
||||
# be configured and the network.target is reached. So make this a netdev
|
||||
# mount.
|
||||
options = "_netdev,lowerdir=/mnt/hut-nix-store,upperdir=/nix/store,workdir=/mnt/nix-work";
|
||||
description = "Overlay /nix/store mount";
|
||||
mountConfig = {
|
||||
LazyUnmount = true;
|
||||
};
|
||||
|
||||
# Run the unit after remote-fs-pre.target but before the remote-fs.target
|
||||
after = [ "remote-fs-pre.target"];
|
||||
before = [ "umount.target" "remote-fs.target" ];
|
||||
# Install by using wantedBy over remote-fs.target
|
||||
wantedBy = [ "remote-fs.target" ];
|
||||
unitConfig = {
|
||||
# We need to wait for the NFS mount
|
||||
RequiresMountsFor = "/nix/store /mnt/hut-nix-store";
|
||||
};
|
||||
}
|
||||
];
|
||||
}
|
||||
8
m/module/slurm-firewall.nix
Normal file
8
m/module/slurm-firewall.nix
Normal file
@@ -0,0 +1,8 @@
|
||||
{ ... }:
|
||||
|
||||
{
|
||||
networking.firewall = {
|
||||
# Required for PMIx in SLURM, we should find a better way
|
||||
allowedTCPPortRanges = [ { from=1024; to=65535; } ];
|
||||
};
|
||||
}
|
||||
@@ -1,7 +1,12 @@
|
||||
{ config, pkgs, ... }:
|
||||
|
||||
{
|
||||
imports = [ ../common/main.nix ];
|
||||
imports = [
|
||||
../common/main.nix
|
||||
../module/ceph.nix
|
||||
../module/slurm-firewall.nix
|
||||
../module/shared-nix-store.nix
|
||||
];
|
||||
|
||||
# Select the disk using the ID to avoid mismatches
|
||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53566c";
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
{ config, pkgs, modulesPath, lib, ... }:
|
||||
{ config, pkgs, ... }:
|
||||
|
||||
{
|
||||
imports = [
|
||||
#(modulesPath + "/installer/netboot/netboot-minimal.nix")
|
||||
../common/main.nix
|
||||
../module/ceph.nix
|
||||
../module/slurm-firewall.nix
|
||||
../module/shared-nix-store.nix
|
||||
];
|
||||
|
||||
# Select the disk using the ID to avoid mismatches
|
||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d535629";
|
||||
#programs.ssh.forwardX11 = false;
|
||||
#programs.ssh.setXAuthLocation = lib.mkForce true;
|
||||
|
||||
networking = {
|
||||
hostName = "owl2";
|
||||
@@ -17,6 +17,7 @@
|
||||
address = "10.0.40.2";
|
||||
prefixLength = 24;
|
||||
} ];
|
||||
# Watch out! The OmniPath device is not in the same place here:
|
||||
interfaces.ibp129s0.ipv4.addresses = [ {
|
||||
address = "10.0.42.2";
|
||||
prefixLength = 24;
|
||||
|
||||
11
pkgs/nix-socket.patch
Normal file
11
pkgs/nix-socket.patch
Normal file
@@ -0,0 +1,11 @@
|
||||
--- a/misc/systemd/nix-daemon.socket.in 1970-01-01 01:00:01.000000000 +0100
|
||||
+++ b/misc/systemd/nix-daemon.socket.in 2023-09-18 17:53:32.351760208 +0200
|
||||
@@ -1,7 +1,7 @@
|
||||
[Unit]
|
||||
Description=Nix Daemon Socket
|
||||
Before=multi-user.target
|
||||
-RequiresMountsFor=@storedir@
|
||||
+RequiresMountsFor=@localstatedir@
|
||||
ConditionPathIsReadWrite=@localstatedir@/nix/daemon-socket
|
||||
|
||||
[Socket]
|
||||
@@ -32,4 +32,8 @@ final: prev:
|
||||
lua = prev.lua5_4;
|
||||
fmt = prev.fmt_8;
|
||||
}) ceph ceph-client;
|
||||
|
||||
nix = prev.nix.overrideAttrs (old: {
|
||||
patches = old.patches ++ [ ./nix-socket.patch ];
|
||||
});
|
||||
}
|
||||
|
||||
21
secrets/ceph-user.age
Normal file
21
secrets/ceph-user.age
Normal file
@@ -0,0 +1,21 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 AY8zKw J00a6ZOhkupkhLU5WQ0kD05HEF4KKsSs2hwjHKbnnHU
|
||||
J14VoNOCqLpScVO7OLXbqTcLI4tcVUHt5cqY/XQmbGs
|
||||
-> ssh-ed25519 sgAamA k8R/bSUdvVmlBI6yHPi5NBQPBGM36lPJwsir8DFGgxE
|
||||
4ZKC3gYvic6AVrNGgNjwztbUzhxP8ViX5O3wFo9wlrk
|
||||
-> ssh-ed25519 HY2yRg 966xf2fTnA6Wq0uYXbXZQOManqITJcCbQS9LZCGEOh4
|
||||
Qg5echQSrzqeDqvaMx+5fqi8XyTjAeCsY/UFJX6YnDs
|
||||
-> ssh-ed25519 tcumPQ e0U2okrGIoUpLfPYjIRx1V92rE3hZW13nJef+l3kBQg
|
||||
LejAUKBl+tPhwocCF00ZHTzFISnwX8og8GvemiMIcyo
|
||||
-> ssh-ed25519 JJ1LWg QkzTsPq9Gdh+FNz/a4bDb9LQOreFyxeTC51UNd1fsj0
|
||||
ayrlKenETfQzH1Z9drVEWqszQebicGVJve0/pCnxAE8
|
||||
-> ssh-ed25519 CAWG4Q lJLW9+dxvyoD4hYzeXeE/4rzJ6HIeEQOB1+fbhV3xw0
|
||||
T2RrVCtTuQvya9HiJB7txk3QGrntpsMX9Tt1cyXoW5E
|
||||
-> ssh-ed25519 MSF3dg JOZkFb2CfqWKvZIz7lYxXWgv8iEVDkQF8hInDMZvknc
|
||||
MHDWxjUw4dNiC1h4MrU9uKKcI3rwkxABm0+5FYMZkok
|
||||
-> ~8m;7f-grease
|
||||
lDIullfC98RhpTZ4Mk87Td+VtPmwPdgz+iIilpKugUkmV5r4Uqd7yE+5ArA6ekr/
|
||||
G/X4EA
|
||||
--- Cz4sv9ZunBcVdZCozdTh1zlg1zIASjk2MjYeYfcN9eA
|
||||
<EFBFBD>N <09>$[H<><48>Q<EFBFBD><51><EFBFBD>
|
||||
d<EFBFBD><EFBFBD><EFBFBD>'<27><><EFBFBD>7<EFBFBD><1F>Ͳ)<29><><EFBFBD><17>x9y<39><79><EFBFBD>E<04><><EFBFBD>M7^<5E>[<5B>M<EFBFBD>+<2B>&<26><><EFBFBD><0E>$8tM<74>в
|
||||
BIN
secrets/munge-key.age
Normal file
BIN
secrets/munge-key.age
Normal file
Binary file not shown.
12
secrets/nix-serve.age
Normal file
12
secrets/nix-serve.age
Normal file
@@ -0,0 +1,12 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 HY2yRg d144D+VvxhYgKtH//uD2qNuVnYX6bh74YqkyM3ZjBwU
|
||||
0IeVmFAf4U8Sm0d01O6ZwJ1V2jl/mSMl4wF0MP5LrIg
|
||||
-> ssh-ed25519 CAWG4Q H4nKxue/Cj/3KUF5A+/ygHMjjArwgx3SIWwXcqFtyUo
|
||||
4k5NJkLUrueLYiPkr2LAwQLWmuaOIsDmV/86ravpleU
|
||||
-> ssh-ed25519 MSF3dg HpgUAFHLPs4w0cdJHqTwf8lySkTeV9O9NnBf49ClDHs
|
||||
foPIUUgAYe1YSDy6+aMfjN7xv9xud9fDmhRlIztHoEo
|
||||
-> vLkF\<-grease
|
||||
3GRT+W8gYSpjl/a6Ix9+g9UJnTpl1ZH/oucfR801vfE8y77DV2Jxz/XJwzxYxKG5
|
||||
YEhiTGMNbXw/V7E5aVSz6Bdc
|
||||
--- GtiHKCZdHByq9j0BSLd544PhbEwTN138E8TFdxipeiA
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>G$S<><53><EFBFBD>RA<52><41><EFBFBD>Th]n<>8<EFBFBD>,<2C>H<EFBFBD>s<EFBFBD><73><EFBFBD>=p<><70><EFBFBD>'<27><><EFBFBD>+<2B>j<><6A><EFBFBD><EFBFBD>9<EFBFBD>)<29>:<3A>)<15><><EFBFBD>Y<EFBFBD><59><EFBFBD>8<EFBFBD>I<EFBFBD><49>8:ol<6F><6C><EFBFBD><1F><><EFBFBD>Z<EFBFBD><5A>3<>PM<50>F;<3B>rY<72><59><EFBFBD><EFBFBD><1F>$<24><>y<EFBFBD>L<>ٜ<EFBFBD>Μ<1B><>U<EFBFBD>s16Ǿ<>L<EFBFBD>b<EFBFBD><62><EFBFBD>
|
||||
11
secrets/nosv-token.age
Normal file
11
secrets/nosv-token.age
Normal file
@@ -0,0 +1,11 @@
|
||||
age-encryption.org/v1
|
||||
-> ssh-ed25519 HY2yRg hrdS7Dl/j+u3XVfM79ZJpZSlre9TcD7DTQ+EEAT6kEE
|
||||
avUO96P1h7w2BYWgrQ7GpUgdaCV9AZL7eOTTcF9gfro
|
||||
-> ssh-ed25519 CAWG4Q A5raRY1CAgFYZgoQ92GMyNejYNdHx/7Y6uTS+EjLPWA
|
||||
FRFqT2Jz7qRcybaxkQTKHGl797LVXoHpYG4RZSrX/70
|
||||
-> ssh-ed25519 MSF3dg D+R80Bg7W9AuiOMAqtGFZQl994dRBIegYRLmmTaeZ3o
|
||||
BHvZsugRiuZ91b4jk91h30o3eF3hadSnVCwxXge95T8
|
||||
-> BT/El`a-grease W{nq|Vm )bld 2Nl}4 N$#JGB4t
|
||||
oLG+0S1aGfO/ohCfgGmhDhwwLi4H
|
||||
--- 2I5C+FvBG/K1ZHh7C5QD39feTSLoFGwcTeZAmeILNsI
|
||||
<EFBFBD><EFBFBD>W<EFBFBD>o<> <14><>d;<3B><>C<EFBFBD>.<2E><>_(<28>u
|
||||
BIN
secrets/ovni-token.age
Normal file
BIN
secrets/ovni-token.age
Normal file
Binary file not shown.
15
secrets/secrets.nix
Normal file
15
secrets/secrets.nix
Normal file
@@ -0,0 +1,15 @@
|
||||
let
|
||||
keys = import ../keys.nix;
|
||||
adminsKeys = builtins.attrValues keys.admins;
|
||||
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
||||
# Only expose ceph keys to safe nodes and admins
|
||||
safe = keys.hostGroup.safe ++ adminsKeys;
|
||||
in
|
||||
{
|
||||
"ovni-token.age".publicKeys = hut;
|
||||
"nosv-token.age".publicKeys = hut;
|
||||
"nix-serve.age".publicKeys = hut;
|
||||
|
||||
"ceph-user.age".publicKeys = safe;
|
||||
"munge-key.age".publicKeys = safe;
|
||||
}
|
||||
@@ -17,6 +17,6 @@ Then, to request access to the machines we will need some information about you:
|
||||
1. The salted hash of your login password, generated with `mkpasswd -m sha-512`
|
||||
1. An SSH public key of type Ed25519 (can be generated with `ssh-keygen -t ed25519`)
|
||||
|
||||
You can send us both an email at <rodrigo.arias@bsc.es> and
|
||||
<aleix.rocanonell@bsc.es> with the details, or directly open a merge request in
|
||||
the [jungle repository](https://pm.bsc.es/gitlab/rarias/jungle/).
|
||||
Send an email to <jungle@bsc.es> with the details, or directly open a
|
||||
merge request in the [jungle
|
||||
repository](https://pm.bsc.es/gitlab/rarias/jungle/).
|
||||
|
||||
71
web/content/posts/2023-09-12/_index.md
Normal file
71
web/content/posts/2023-09-12/_index.md
Normal file
@@ -0,0 +1,71 @@
|
||||
---
|
||||
title: "Update 2023-09-12"
|
||||
author: "Rodrigo Arias Mallo"
|
||||
date: 2023-09-12
|
||||
---
|
||||
|
||||
This is a summary of notable changes introduced in the jungle cluster in the
|
||||
last few months.
|
||||
|
||||
### New Ceph filesystem available
|
||||
|
||||
We have installed the latest [Ceph filesystem][1] (18.2.0) which stores three
|
||||
redundant copies of the data so a failure in one disk doesn't cause data loss.
|
||||
It is mounted in /ceph and available for use in the owl1, owl2 and hut
|
||||
nodes. For now it provides 2.8 TiB of space and it is expected to
|
||||
increase when the last storage node is installed.
|
||||
|
||||
[1]: https://en.wikipedia.org/wiki/Ceph_(software)
|
||||
|
||||
The throughput is limited by the 1 Gigabit Ethernet speed, but should be
|
||||
reasonably fast for most workloads. Here is a test with dd which reaches the
|
||||
network limit:
|
||||
|
||||
```txt
|
||||
hut% dd if=/dev/urandom of=/ceph/rarias/urandom bs=1M count=1024
|
||||
1024+0 records in
|
||||
1024+0 records out
|
||||
1073741824 bytes (1,1 GB, 1,0 GiB) copied, 8,98544 s, 119 MB/s
|
||||
```
|
||||
|
||||
### SLURM power save
|
||||
|
||||
The SLURM daemon has been configured to power down the nodes after one hour of
|
||||
idling. When a new job is allocated to a node that is powered off, it is
|
||||
automatically turned on and as soon as it becomes available it will execute the
|
||||
job. Here is an example with two nodes that boot and execute a simple job that
|
||||
shows the date.
|
||||
|
||||
```txt
|
||||
hut% date; srun -N 2 date
|
||||
2023-09-12T17:36:09 CEST
|
||||
2023-09-12T17:38:26 CEST
|
||||
2023-09-12T17:38:18 CEST
|
||||
```
|
||||
|
||||
You can expect a similar delay (around 2-3 min) while the nodes are starting.
|
||||
Notice that while the nodes are kept on, the delay is not noticeable:
|
||||
|
||||
```txt
|
||||
hut% date; srun -N 2 date
|
||||
2023-09-12T17:40:04 CEST
|
||||
2023-09-12T17:40:04 CEST
|
||||
2023-09-12T17:40:04 CEST
|
||||
```
|
||||
|
||||
### Power and temperature monitoring
|
||||
|
||||
In the cluster, we monitor the temperature and the power draw of all nodes. This
|
||||
allows us to understand which machines are not being used and turn them off to
|
||||
save energy that otherwise would be wasted. Here is an example where some nodes
|
||||
are powered off to save energy:
|
||||
|
||||

|
||||
|
||||
We also configured the nodes to work at low CPU frequencies, so the temperature
|
||||
is kept low to increase the lifespan of the node components. Towards these
|
||||
goals, we have configured two alerts that trigger when the CPUs of a node
|
||||
exceed the limit temperature of 80 °C or when the power draw exceeds 350 W.
|
||||
|
||||
By keeping the power consumption and temperatures controlled, we can safely
|
||||
incorporate more machines that will only be used on demand.
|
||||
BIN
web/content/posts/2023-09-12/power.png
Normal file
BIN
web/content/posts/2023-09-12/power.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 58 KiB |
Reference in New Issue
Block a user