forked from rarias/jungle
Compare commits
12 Commits
26cd5f768d
...
c4a63b8ffd
| Author | SHA1 | Date | |
|---|---|---|---|
|
c4a63b8ffd
|
|||
|
47da32d2cb
|
|||
|
a0a425b013
|
|||
|
6db0d7e1ef
|
|||
|
3275646804
|
|||
|
b29403db13
|
|||
|
8e2d703492
|
|||
|
db6a3faa44
|
|||
| 163d19bd05 | |||
| 360f67cfab | |||
| a402bc880c | |||
| c441178910 |
@@ -5,6 +5,7 @@
|
||||
../common/xeon.nix
|
||||
../common/ssf/hosts.nix
|
||||
../module/ceph.nix
|
||||
../module/hut-substituter.nix
|
||||
../module/slurm-server.nix
|
||||
./nfs.nix
|
||||
./wireguard.nix
|
||||
@@ -65,10 +66,4 @@
|
||||
iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
|
||||
'';
|
||||
};
|
||||
|
||||
# Use tent for cache
|
||||
nix.settings = {
|
||||
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||
};
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
{
|
||||
imports = [
|
||||
../common/ssf.nix
|
||||
../module/hut-substituter.nix
|
||||
../module/monitoring.nix
|
||||
];
|
||||
|
||||
|
||||
@@ -11,11 +11,13 @@
|
||||
./base/hw.nix
|
||||
./base/net.nix
|
||||
./base/nix.nix
|
||||
./base/nosv.nix
|
||||
./base/ntp.nix
|
||||
./base/rev.nix
|
||||
./base/ssh.nix
|
||||
./base/users.nix
|
||||
./base/watchdog.nix
|
||||
./base/zsh.nix
|
||||
./base/fish.nix
|
||||
];
|
||||
}
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
||||
nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
|
||||
ncdu config.boot.kernelPackages.perf ldns pv
|
||||
nix-output-monitor
|
||||
nixfmt-rfc-style
|
||||
# From bscpkgs overlay
|
||||
osumb
|
||||
];
|
||||
|
||||
4
m/common/base/fish.nix
Normal file
4
m/common/base/fish.nix
Normal file
@@ -0,0 +1,4 @@
|
||||
{ ... }:
|
||||
{
|
||||
programs.fish.enable = true;
|
||||
}
|
||||
9
m/common/base/nosv.nix
Normal file
9
m/common/base/nosv.nix
Normal file
@@ -0,0 +1,9 @@
|
||||
{ ... }:
|
||||
{
|
||||
nix.settings.system-features = [ "nosv" ];
|
||||
programs.nix-required-mounts.enable = true;
|
||||
programs.nix-required-mounts.allowedPatterns.nosv.paths = [
|
||||
"/sys/devices/system/cpu"
|
||||
"/sys/devices/system/node"
|
||||
];
|
||||
}
|
||||
@@ -87,6 +87,12 @@
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIFiqXqt88VuUfyANkZyLJNiuroIITaGlOOTMhVDKjf abonerib@bsc"
|
||||
];
|
||||
shell = pkgs.fish;
|
||||
packages = with pkgs; [
|
||||
starship
|
||||
jujutsu
|
||||
neovim
|
||||
];
|
||||
};
|
||||
|
||||
vlopez = {
|
||||
@@ -162,7 +168,7 @@
|
||||
home = "/home/Computational/csiringo";
|
||||
description = "Cesare Siringo";
|
||||
group = "Computational";
|
||||
hosts = [ "apex" "weasel" ];
|
||||
hosts = [ ];
|
||||
hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
|
||||
openssh.authorizedKeys.keys = [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
./cpufreq.nix
|
||||
./fs.nix
|
||||
./users.nix
|
||||
../module/hut-substituter.nix
|
||||
../module/debuginfod.nix
|
||||
];
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
../module/emulation.nix
|
||||
../module/nvidia.nix
|
||||
../module/slurm-client.nix
|
||||
../module/hut-substituter.nix
|
||||
./wireguard.nix
|
||||
];
|
||||
|
||||
@@ -62,12 +63,6 @@
|
||||
interfaces.enp1s0f0np0.useDHCP = true;
|
||||
};
|
||||
|
||||
# Use hut for cache
|
||||
nix.settings = {
|
||||
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||
};
|
||||
|
||||
# Recommended for new graphics cards
|
||||
hardware.nvidia.open = true;
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
imports = [
|
||||
../common/ssf.nix
|
||||
../module/monitoring.nix
|
||||
../module/hut-substituter.nix
|
||||
];
|
||||
|
||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
|
||||
|
||||
@@ -6,5 +6,8 @@
|
||||
{
|
||||
extra-substituters = [ "http://hut/cache" ];
|
||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||
|
||||
# Set a low timeout in case hut is down
|
||||
connect-timeout = 3; # seconds
|
||||
};
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
../module/nvidia.nix
|
||||
../eudy/kernel/perf.nix
|
||||
./wireguard.nix
|
||||
../module/hut-substituter.nix
|
||||
];
|
||||
|
||||
# Don't install Grub on the disk yet
|
||||
@@ -51,11 +52,6 @@
|
||||
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
|
||||
};
|
||||
|
||||
nix.settings = {
|
||||
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||
};
|
||||
|
||||
# Enable performance governor
|
||||
powerManagement.cpuFreqGovernor = "performance";
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
../hut/msmtp.nix
|
||||
../module/p.nix
|
||||
../module/vpn-dac.nix
|
||||
../module/hut-substituter.nix
|
||||
];
|
||||
|
||||
# Select this using the ID to avoid mismatches
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
{
|
||||
imports = [
|
||||
../common/ssf.nix
|
||||
../module/hut-substituter.nix
|
||||
./virtualization.nix
|
||||
];
|
||||
|
||||
# Select this using the ID to avoid mismatches
|
||||
@@ -29,4 +31,5 @@
|
||||
prefixLength = 24;
|
||||
} ];
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
40
m/weasel/virtualization.nix
Normal file
40
m/weasel/virtualization.nix
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
lib,
|
||||
pkgs,
|
||||
config,
|
||||
...
|
||||
}:
|
||||
|
||||
{
|
||||
# Enable common container config files in /etc/containers
|
||||
virtualisation.containers.enable = true;
|
||||
virtualisation = {
|
||||
podman = {
|
||||
enable = true;
|
||||
|
||||
# Required for containers under podman-compose to be able to talk to each other.
|
||||
defaultNetwork.settings.dns_enabled = true;
|
||||
};
|
||||
};
|
||||
|
||||
# We cannot use /home since nfs does not support fileattrs needed by podman
|
||||
systemd.tmpfiles.settings = {
|
||||
"podman-users" = lib.mapAttrs' (
|
||||
name: value:
|
||||
lib.nameValuePair ("/var/lib/podman-users/" + name) {
|
||||
d = {
|
||||
group = value.group;
|
||||
mode = value.homeMode;
|
||||
user = name;
|
||||
};
|
||||
}
|
||||
) (lib.filterAttrs (_: x: x.isNormalUser) config.users.users);
|
||||
};
|
||||
|
||||
# Useful other development tools
|
||||
environment.systemPackages = with pkgs; [
|
||||
dive # look into docker image layers
|
||||
podman-tui # status of containers in the terminal
|
||||
podman-compose # start group of containers for dev
|
||||
];
|
||||
}
|
||||
49
web/content/posts/2025-09-26/_index.md
Normal file
49
web/content/posts/2025-09-26/_index.md
Normal file
@@ -0,0 +1,49 @@
|
||||
---
|
||||
title: "Update 2025-09-26"
|
||||
author: "Rodrigo Arias Mallo"
|
||||
date: 2025-09-26
|
||||
---
|
||||
|
||||
This is a summary of notable changes introduced in the last two years. We
|
||||
continue to keep all machines updated to the latest NixOS release (currently
|
||||
NixOS 25.05).
|
||||
|
||||
### New compute node: fox
|
||||
|
||||
We have a new [fox machine](/fox), with two AMD Genoa 9684X CPUs and two NVIDIA
|
||||
RTX4000 GPUs. Over the last few months we have been doing some tests and it seems
|
||||
that most of the components work well. We have configured CUDA to use the NVIDIA
|
||||
GPUs, as well as AMD uProf to trace performance and energy counters from the
|
||||
CPUs.
|
||||
|
||||
### Upgraded login node: apex
|
||||
|
||||
We have upgraded the operating system on the login node to NixOS, which now runs
|
||||
Linux 6.15.6. During the upgrade, we have detected a problem with the storage
|
||||
disks. The `/` and `/home` partitions sit on a
|
||||
[RAID 5](https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5),
|
||||
transparently handled by a RAID hardware controller which starts its own
|
||||
firmware before passing the control to the BIOS to continue the boot sequence. A
|
||||
problem during the startup of the firmware prevented the node from even reaching the
|
||||
BIOS screen.
|
||||
|
||||
After a long debugging session, we detected that the flash memory that stores
|
||||
the firmware of the hardware controller was likely to be the issue, since
|
||||
[memory cells](https://en.wikipedia.org/wiki/Flash_memory#Principles_of_operation)
|
||||
may lose charge over time and can end up corrupting the content. We flashed
|
||||
the latest firmware so the memory cells are charged again with the new bits and
|
||||
that fixed the problem. Hopefully we will be able to use it for some more years.
|
||||
|
||||
The SLURM server has been moved to apex which allows users to also submit jobs
|
||||
to fox.
|
||||
|
||||
### Migrated machines to BSC building
|
||||
|
||||
The server room had a temperature issue that had been affecting our machines
|
||||
since the end of February of 2025. As the summer approached, the temperature
|
||||
exceeded the safe limits for our hardware, so we had to shut down the cluster.
|
||||
|
||||

|
||||
|
||||
Since then, we have moved the cluster to BSC premises, where it now rests at a
|
||||
stable temperature, so hopefully we won't have more unscheduled downtime.
|
||||
BIN
web/content/posts/2025-09-26/temp.png
Normal file
BIN
web/content/posts/2025-09-26/temp.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 97 KiB |
Reference in New Issue
Block a user