Compare commits: old-master...e8eb47c9b8

1 commit: e8eb47c9b8

doc/trim.sh: 46 deletions
doc/trim.sh (deleted)
@@ -1,46 +0,0 @@
-#!/bin/sh
-
-# Trims the jungle repository by moving the website to its own repository and
-# removing it from jungle. It also removes big pdf files and kernel
-# configurations so the jungle repository is small.
-
-set -e
-
-if [ -e oldjungle -o -e newjungle -o -e website ]; then
-    echo "remove oldjungle/, newjungle/ and website/ first"
-    exit 1
-fi
-
-# Clone the old jungle repo
-git clone gitea@tent:rarias/jungle.git oldjungle
-
-# First split the website into a new repository
-mkdir website && git -C website init -b master
-git-filter-repo \
-    --path web \
-    --subdirectory-filter web \
-    --source oldjungle \
-    --target website
-
-# Then remove the website, pdf files and big kernel configs
-mkdir newjungle && git -C newjungle init -b master
-git-filter-repo \
-    --invert-paths \
-    --path web \
-    --path-glob 'doc*.pdf' \
-    --path-glob '**/kernel/configs/lockdep' \
-    --path-glob '**/kernel/configs/defconfig' \
-    --source oldjungle \
-    --target newjungle
-
-set -x
-
-du -sh oldjungle newjungle website
-# 57M   oldjungle
-# 2,3M  newjungle
-# 6,4M  website
-
-du -sh --exclude=.git oldjungle newjungle website
-# 30M   oldjungle
-# 700K  newjungle
-# 3,5M  website
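
The script above only produces the filtered repositories locally under newjungle/ and website/. A minimal sketch of how they might then be published, assuming the same Gitea host; the SSH remote for the website is an assumption derived from the fetchgit URL that appears later in this diff:

    # Sketch only: push the filtered repositories to their remotes.
    # The website remote URL is assumed, not part of this commit.
    git -C website remote add origin gitea@tent:rarias/jungle-website.git
    git -C website push -u origin master
    git -C newjungle remote add origin gitea@tent:rarias/jungle.git
    # git-filter-repo rewrites history, so replacing the old branch needs a force push.
    git -C newjungle push --force -u origin master
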
@@ -5,7 +5,6 @@
     ../common/xeon.nix
     ../common/ssf/hosts.nix
     ../module/ceph.nix
-    ../module/hut-substituter.nix
     ../module/slurm-server.nix
     ./nfs.nix
     ./wireguard.nix
@@ -66,4 +65,10 @@
       iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
     '';
   };
+
+  # Use tent for cache
+  nix.settings = {
+    extra-substituters = [ "https://jungle.bsc.es/cache" ];
+    extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
+  };
 }
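
This hunk, like several below, drops the shared hut-substituter import and points the host directly at the public binary cache. As an illustrative check (not part of the commit) that the cache URL used in the hunk answers:

    # Every Nix binary cache serves a small nix-cache-info file at its root.
    curl -s https://jungle.bsc.es/cache/nix-cache-info
    # A healthy cache replies with something like: StoreDir: /nix/store
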
@@ -3,7 +3,6 @@
 {
   imports = [
     ../common/ssf.nix
-    ../module/hut-substituter.nix
     ../module/monitoring.nix
   ];
 
@@ -156,30 +156,18 @@
     };
 
     csiringo = {
+      # Arbitrary UID but large so it doesn't collide with other users on ssfhead.
       uid = 9653;
       isNormalUser = true;
       home = "/home/Computational/csiringo";
       description = "Cesare Siringo";
       group = "Computational";
-      hosts = [ ];
+      hosts = [ "apex" "weasel" ];
       hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
       openssh.authorizedKeys.keys = [
         "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
       ];
     };
-
-    acinca = {
-      uid = 9654;
-      isNormalUser = true;
-      home = "/home/Computational/acinca";
-      description = "Arnau Cinca";
-      group = "Computational";
-      hosts = [ "apex" "hut" "fox" "owl1" "owl2" ];
-      hashedPassword = "$6$S6PUeRpdzYlidxzI$szyvWejQ4hEN76yBYhp1diVO5ew1FFg.cz4lKiXt2Idy4XdpifwrFTCIzLTs5dvYlR62m7ekA5MrhcVxR5F/q/";
-      openssh.authorizedKeys.keys = [
-        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFmMqKqPg4uocNOr3O41kLbZMOMJn3m2ZdN1JvTR96z3 bsccns@arnau-bsc"
-      ];
-    };
   };
 
   groups = {
@@ -9,7 +9,6 @@
     ./cpufreq.nix
     ./fs.nix
     ./users.nix
-    ../module/hut-substituter.nix
     ../module/debuginfod.nix
   ];
 
@@ -8,7 +8,6 @@
     ../module/emulation.nix
     ../module/nvidia.nix
     ../module/slurm-client.nix
-    ../module/hut-substituter.nix
     ./wireguard.nix
   ];
 
@@ -63,6 +62,12 @@
     interfaces.enp1s0f0np0.useDHCP = true;
   };
 
+  # Use hut for cache
+  nix.settings = {
+    extra-substituters = [ "https://jungle.bsc.es/cache" ];
+    extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
+  };
+
   # Recommended for new graphics cards
   hardware.nvidia.open = true;
 
@@ -2,13 +2,10 @@
 let
   website = pkgs.stdenv.mkDerivation {
     name = "jungle-web";
-    src = pkgs.fetchgit {
-      url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
-      rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
-      hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
-    };
+    src = theFlake;
     buildInputs = [ pkgs.hugo ];
     buildPhase = ''
+      cd web
       rm -rf public/
       hugo
     '';
@@ -4,7 +4,6 @@
   imports = [
     ../common/ssf.nix
     ../module/monitoring.nix
-    ../module/hut-substituter.nix
   ];
 
   boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
@@ -6,8 +6,5 @@
 {
   extra-substituters = [ "http://hut/cache" ];
   extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
-
-  # Set a low timeout in case hut is down
-  connect-timeout = 3; # seconds
 };
 }
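
The connect-timeout entry removed here is an ordinary nix.conf setting, so the same behaviour can still be requested for a single invocation; a hedged example, where the flake reference is only a placeholder:

    # Give up quickly on unreachable substituters for this invocation only.
    nix build --option connect-timeout 3 nixpkgs#hello
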
@@ -12,12 +12,6 @@
     # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
     # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
     KillMode = lib.mkForce "control-group";
-
-    # If slurmd fails to contact the control server it will fail, causing the
-    # node to remain out of service until manually restarted. Always try to
-    # restart it.
-    Restart = "always";
-    RestartSec = "30s";
   };
 
   services.slurm.client.enable = true;
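
The dropped Restart and RestartSec entries map to plain systemd unit options; to see what the generated unit ends up with after a change like this, one could run (illustrative, assuming the unit is named slurmd.service as in NixOS):

    # Show the effective restart policy of the slurmd unit.
    systemctl show slurmd.service -p Restart -p RestartSec
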
@@ -9,7 +9,6 @@
     ../module/nvidia.nix
     ../eudy/kernel/perf.nix
     ./wireguard.nix
-    ../module/hut-substituter.nix
   ];
 
   # Don't install Grub on the disk yet
@@ -52,6 +51,11 @@
     options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
   };
 
+  nix.settings = {
+    extra-substituters = [ "https://jungle.bsc.es/cache" ];
+    extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
+  };
+
   # Enable performance governor
   powerManagement.cpuFreqGovernor = "performance";
 
@@ -15,7 +15,6 @@
     ../hut/msmtp.nix
     ../module/p.nix
     ../module/vpn-dac.nix
-    ../module/hut-substituter.nix
   ];
 
   # Select the this using the ID to avoid mismatches
@@ -2,13 +2,10 @@
 let
   website = pkgs.stdenv.mkDerivation {
     name = "jungle-web";
-    src = pkgs.fetchgit {
-      url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
-      rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
-      hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
-    };
+    src = theFlake;
     buildInputs = [ pkgs.hugo ];
     buildPhase = ''
+      cd web
       rm -rf public/
       hugo
     '';
@@ -3,7 +3,6 @@
 {
   imports = [
     ../common/ssf.nix
-    ../module/hut-substituter.nix
   ];
 
   # Select this using the ID to avoid mismatches
@@ -13,37 +13,36 @@ NixOS 25.05).
 We have a new [fox machine](/fox), with two AMD Genoa 9684X CPUs and two NVIDIA
 RTX4000 GPUs. During the last months we have been doing some tests and it seems
 that most of the components work well. We have configured CUDA to use the NVIDIA
-GPUs, as well as AMD uProf to trace performance and energy counters from the
+GPUs as well as AMD uProf to trace performance and energy counters from the
 CPUs.
 
 ### Upgraded login node: apex
 
 We have upgraded the operating system on the login node to NixOS, which now runs
-Linux 6.15.6. During the upgrade, we have detected a problem with the storage
-disks. The `/` and `/home` partitions sit on a
-[RAID 5](https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5),
-transparently handled by a RAID hardware controller which starts its own
-firmware before passing the control to the BIOS to continue the boot sequence. A
-problem during the startup of the firmware prevented the node to even reach the
-BIOS screen.
+Linux 6.15.6. During the upgrade, we have detected a problem with the RAID
+controller that caused a catastrophic failure that prevented the BIOS from
+starting.
 
-After a long debugging session, we detected that the flash memory that stores
-the firmware of the hardware controller was likely to be the issue, since
+The `/` and `/home` partitions sit on a RAID 5 governed by a RAID hardware
+controller, however it was unable to boot properly before handling
+the control over to the BIOS. After a long debugging session, we detected that
+the flash memory that stores the firmware of the hardware controller was likely
+to be the issue, as
 [memory cells](https://en.wikipedia.org/wiki/Flash_memory#Principles_of_operation)
-may lose charge over time and can end up corrupting the content. We flashed
+may lose charge over time and can end up corrupting the content. So we flashed
 the latest firmware so the memory cells are charged again with the new bits and
 that fixed the problem. Hopefully we will be able to use it for some more years.
 
-The SLURM server has been moved to apex which allows users to also submit jobs
-to fox.
+The SLURM server has been moved to apex, so now you can allocate your jobs from
+there, including the new fox machine.
 
-### Migrated machines to BSC building
+### Translated machines to BSC building
 
-The server room had a temperature issue that had been affecting our machines
-since the end of February of 2025. As the summer approached, the temperature
-exceeded the safe limits for our hardware, so we had to shutdown the cluster.
+The server room had a temperature issue that affected our machines since the end
+of February of 2025. As the summer approached, the temperature exceeded the safe
+limits for our hardware, so we had to shutdown the cluster.
 
 
 
-Since then, we have moved the cluster to BSC premises, where it now rests at a
+Since then, we have moved the cluster to BSC premises, where now rests at a
 stable temperature, so hopefully we won't have more unscheduled downtime.
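
Since the post says jobs for fox can now be submitted from the apex login node, a hedged example of what that could look like (the node name is taken from the post; the exact partition layout is not shown in this diff):

    # Run a trivial command on the fox node through SLURM (illustration only).
    srun -w fox hostname
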