1 Commit

Author SHA1 Message Date
e8eb47c9b8 Add web post update for 2025 2025-09-26 14:53:37 +02:00
15 changed files with 40 additions and 105 deletions

View File

@@ -1,46 +0,0 @@
#!/bin/sh
# Trims the jungle repository by moving the website to its own repository and
# removing it from jungle. It also removes big pdf files and kernel
# configurations so the jungle repository is small.
set -e
if [ -e oldjungle -o -e newjungle -o -e website ]; then
echo "remove oldjungle/, newjungle/ and website/ first"
exit 1
fi
# Clone the old jungle repo
git clone gitea@tent:rarias/jungle.git oldjungle
# First split the website into a new repository
mkdir website && git -C website init -b master
git-filter-repo \
--path web \
--subdirectory-filter web \
--source oldjungle \
--target website
# Then remove the website, pdf files and big kernel configs
mkdir newjungle && git -C newjungle init -b master
git-filter-repo \
--invert-paths \
--path web \
--path-glob 'doc*.pdf' \
--path-glob '**/kernel/configs/lockdep' \
--path-glob '**/kernel/configs/defconfig' \
--source oldjungle \
--target newjungle
set -x
du -sh oldjungle newjungle website
# 57M oldjungle
# 2,3M newjungle
# 6,4M website
du -sh --exclude=.git oldjungle newjungle website
# 30M oldjungle
# 700K newjungle
# 3,5M website
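The script already prints the resulting sizes with du; a few read-only checks (a sketch, not part of this commit) can confirm the rewrite before the split repositories are published anywhere:

# Inspect the results of the split (assumes the script above has just finished).
git -C website log --oneline | head -n 5      # history rewritten from the web/ subtree
git -C newjungle log --oneline | head -n 5    # trimmed jungle history
git -C newjungle count-objects -vH            # object count and on-disk size after filtering
git -C newjungle log --all --oneline -- web   # should print nothing: web/ was removed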

View File

@@ -5,7 +5,6 @@
     ../common/xeon.nix
     ../common/ssf/hosts.nix
     ../module/ceph.nix
-    ../module/hut-substituter.nix
     ../module/slurm-server.nix
     ./nfs.nix
     ./wireguard.nix
@@ -66,4 +65,10 @@
       iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
     '';
   };
+  # Use tent for cache
+  nix.settings = {
+    extra-substituters = [ "https://jungle.bsc.es/cache" ];
+    extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
+  };
 }
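Before a host relies on the substituter added above, the cache can be probed by hand; a minimal sketch (not part of this commit), assuming the host can reach jungle.bsc.es and has the new nix CLI enabled:

# Every nix binary cache answers on /nix-cache-info with its store path and priority.
curl -s https://jungle.bsc.es/cache/nix-cache-info
# Query the same cache through nix itself; --store accepts an HTTP(S) binary cache URL.
nix store ping --store https://jungle.bsc.es/cache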

View File

@@ -3,7 +3,6 @@
 {
   imports = [
     ../common/ssf.nix
-    ../module/hut-substituter.nix
     ../module/monitoring.nix
   ];

View File

@@ -156,30 +156,18 @@
     };
     csiringo = {
-      # Arbitrary UID but large so it doesn't collide with other users on ssfhead.
       uid = 9653;
       isNormalUser = true;
       home = "/home/Computational/csiringo";
       description = "Cesare Siringo";
       group = "Computational";
-      hosts = [ ];
+      hosts = [ "apex" "weasel" ];
       hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
       openssh.authorizedKeys.keys = [
         "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
       ];
     };
-    acinca = {
-      uid = 9654;
-      isNormalUser = true;
-      home = "/home/Computational/acinca";
-      description = "Arnau Cinca";
-      group = "Computational";
-      hosts = [ "apex" "hut" "fox" "owl1" "owl2" ];
-      hashedPassword = "$6$S6PUeRpdzYlidxzI$szyvWejQ4hEN76yBYhp1diVO5ew1FFg.cz4lKiXt2Idy4XdpifwrFTCIzLTs5dvYlR62m7ekA5MrhcVxR5F/q/";
-      openssh.authorizedKeys.keys = [
-        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFmMqKqPg4uocNOr3O41kLbZMOMJn3m2ZdN1JvTR96z3 bsccns@arnau-bsc"
-      ];
-    };
   };
   groups = {

View File

@@ -9,7 +9,6 @@
     ./cpufreq.nix
     ./fs.nix
     ./users.nix
-    ../module/hut-substituter.nix
     ../module/debuginfod.nix
   ];

View File

@@ -8,7 +8,6 @@
     ../module/emulation.nix
     ../module/nvidia.nix
     ../module/slurm-client.nix
-    ../module/hut-substituter.nix
     ./wireguard.nix
   ];
@@ -63,6 +62,12 @@
     interfaces.enp1s0f0np0.useDHCP = true;
   };
+  # Use hut for cache
+  nix.settings = {
+    extra-substituters = [ "https://jungle.bsc.es/cache" ];
+    extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
+  };
   # Recommended for new graphics cards
   hardware.nvidia.open = true;

View File

@@ -2,13 +2,10 @@
 let
   website = pkgs.stdenv.mkDerivation {
     name = "jungle-web";
-    src = pkgs.fetchgit {
-      url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
-      rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
-      hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
-    };
+    src = theFlake;
     buildInputs = [ pkgs.hugo ];
     buildPhase = ''
+      cd web
       rm -rf public/
       hugo
     '';

View File

@@ -4,7 +4,6 @@
   imports = [
     ../common/ssf.nix
     ../module/monitoring.nix
-    ../module/hut-substituter.nix
   ];
   boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";

View File

@@ -6,8 +6,5 @@
 {
   extra-substituters = [ "http://hut/cache" ];
   extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
-  # Set a low timeout in case hut is down
-  connect-timeout = 3; # seconds
 };
 }

View File

@@ -12,12 +12,6 @@
       # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
       # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
       KillMode = lib.mkForce "control-group";
-      # If slurmd fails to contact the control server it will fail, causing the
-      # node to remain out of service until manually restarted. Always try to
-      # restart it.
-      Restart = "always";
-      RestartSec = "30s";
     };
   services.slurm.client.enable = true;
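With the automatic restart gone, a slurmd that exits after losing contact with the controller stays out of service until someone intervenes; a minimal recovery sketch (not part of this commit), assuming the node is named fox and managed by systemd:

# On the affected node: bring the daemon back up.
systemctl restart slurmd
# From the SLURM controller: clear the node's DOWN state so it can accept jobs again.
scontrol update NodeName=fox State=RESUME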

View File

@@ -9,7 +9,6 @@
     ../module/nvidia.nix
     ../eudy/kernel/perf.nix
     ./wireguard.nix
-    ../module/hut-substituter.nix
   ];
   # Don't install Grub on the disk yet
@@ -52,6 +51,11 @@
     options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
   };
+  nix.settings = {
+    extra-substituters = [ "https://jungle.bsc.es/cache" ];
+    extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
+  };
   # Enable performance governor
   powerManagement.cpuFreqGovernor = "performance";

View File

@@ -15,7 +15,6 @@
     ../hut/msmtp.nix
     ../module/p.nix
     ../module/vpn-dac.nix
-    ../module/hut-substituter.nix
   ];
   # Select the this using the ID to avoid mismatches

View File

@@ -2,13 +2,10 @@
 let
   website = pkgs.stdenv.mkDerivation {
     name = "jungle-web";
-    src = pkgs.fetchgit {
-      url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
-      rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
-      hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
-    };
+    src = theFlake;
     buildInputs = [ pkgs.hugo ];
     buildPhase = ''
+      cd web
       rm -rf public/
       hugo
     '';

View File

@@ -3,7 +3,6 @@
 {
   imports = [
     ../common/ssf.nix
-    ../module/hut-substituter.nix
   ];
   # Select this using the ID to avoid mismatches

View File

@@ -13,37 +13,36 @@ NixOS 25.05).
 We have a new [fox machine](/fox), with two AMD Genoa 9684X CPUs and two NVIDIA
 RTX4000 GPUs. During the last months we have been doing some tests and it seems
 that most of the components work well. We have configured CUDA to use the NVIDIA
-GPUs, as well as AMD uProf to trace performance and energy counters from the
+GPUs as well as AMD uProf to trace performance and energy counters from the
 CPUs.
 ### Upgraded login node: apex
 We have upgraded the operating system on the login node to NixOS, which now runs
-Linux 6.15.6. During the upgrade, we have detected a problem with the storage
-disks. The `/` and `/home` partitions sit on a
-[RAID 5](https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5),
-transparently handled by a RAID hardware controller which starts its own
-firmware before passing the control to the BIOS to continue the boot sequence. A
-problem during the startup of the firmware prevented the node to even reach the
-BIOS screen.
-After a long debugging session, we detected that the flash memory that stores
-the firmware of the hardware controller was likely to be the issue, since
+Linux 6.15.6. During the upgrade, we have detected a problem with the RAID
+controller that caused a catastrophic failure that prevented the BIOS from
+starting.
+The `/` and `/home` partitions sit on a RAID 5 governed by a RAID hardware
+controller, however it was unable to boot properly before handling
+the control over to the BIOS. After a long debugging session, we detected that
+the flash memory that stores the firmware of the hardware controller was likely
+to be the issue, as
 [memory cells](https://en.wikipedia.org/wiki/Flash_memory#Principles_of_operation)
-may lose charge over time and can end up corrupting the content. We flashed
+may lose charge over time and can end up corrupting the content. So we flashed
 the latest firmware so the memory cells are charged again with the new bits and
 that fixed the problem. Hopefully we will be able to use it for some more years.
-The SLURM server has been moved to apex which allows users to also submit jobs
-to fox.
+The SLURM server has been moved to apex, so now you can allocate your jobs from
+there, including the new fox machine.
-### Migrated machines to BSC building
+### Translated machines to BSC building
-The server room had a temperature issue that had been affecting our machines
-since the end of February of 2025. As the summer approached, the temperature
-exceeded the safe limits for our hardware, so we had to shutdown the cluster.
+The server room had a temperature issue that affected our machines since the end
+of February of 2025. As the summer approached, the temperature exceeded the safe
+limits for our hardware, so we had to shutdown the cluster.
 ![Room temperature](temp.png)
-Since then, we have moved the cluster to BSC premises, where it now rests at a
+Since then, we have moved the cluster to BSC premises, where now rests at a
 stable temperature, so hopefully we won't have more unscheduled downtime.
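As a concrete example of the workflow described in the post, jobs can now be launched from the apex login node; a short sketch, assuming fox is registered under that node name in the SLURM configuration:

# Run a quick command on fox through SLURM.
srun -N 1 -w fox hostname
# Submit a batch job pinned to fox; --wrap avoids writing a job script.
sbatch -w fox --wrap 'hostname'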