Compare commits
5 Commits
old-master
...
gitea-lfs
| Author | SHA1 | Date | |
|---|---|---|---|
| be8c150b08 | |||
| f9d4a70791 | |||
| 729e2d3833 | |||
| 54ad962719 | |||
| 8697fc0a18 |
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
doc/bsc-ssf.pdf
BIN
doc/bsc-ssf.pdf
Binary file not shown.
46
doc/trim.sh
46
doc/trim.sh
@@ -1,46 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
# Trims the jungle repository by moving the website to its own repository and
|
|
||||||
# removing it from jungle. It also removes big pdf files and kernel
|
|
||||||
# configurations so the jungle repository is small.
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [ -e oldjungle -o -e newjungle -o -e website ]; then
|
|
||||||
echo "remove oldjungle/, newjungle/ and website/ first"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone the old jungle repo
|
|
||||||
git clone gitea@tent:rarias/jungle.git oldjungle
|
|
||||||
|
|
||||||
# First split the website into a new repository
|
|
||||||
mkdir website && git -C website init -b master
|
|
||||||
git-filter-repo \
|
|
||||||
--path web \
|
|
||||||
--subdirectory-filter web \
|
|
||||||
--source oldjungle \
|
|
||||||
--target website
|
|
||||||
|
|
||||||
# Then remove the website, pdf files and big kernel configs
|
|
||||||
mkdir newjungle && git -C newjungle init -b master
|
|
||||||
git-filter-repo \
|
|
||||||
--invert-paths \
|
|
||||||
--path web \
|
|
||||||
--path-glob 'doc*.pdf' \
|
|
||||||
--path-glob '**/kernel/configs/lockdep' \
|
|
||||||
--path-glob '**/kernel/configs/defconfig' \
|
|
||||||
--source oldjungle \
|
|
||||||
--target newjungle
|
|
||||||
|
|
||||||
set -x
|
|
||||||
|
|
||||||
du -sh oldjungle newjungle website
|
|
||||||
# 57M oldjungle
|
|
||||||
# 2,3M newjungle
|
|
||||||
# 6,4M website
|
|
||||||
|
|
||||||
du -sh --exclude=.git oldjungle newjungle website
|
|
||||||
# 30M oldjungle
|
|
||||||
# 700K newjungle
|
|
||||||
# 3,5M website
|
|
||||||
@@ -5,6 +5,7 @@
|
|||||||
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
||||||
bscpkgs.inputs.nixpkgs.follows = "nixpkgs";
|
bscpkgs.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
self.lfs = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
outputs = { self, nixpkgs, agenix, bscpkgs, ... }:
|
outputs = { self, nixpkgs, agenix, bscpkgs, ... }:
|
||||||
|
|||||||
25
keys.nix
25
keys.nix
@@ -2,22 +2,21 @@
|
|||||||
# here all the public keys
|
# here all the public keys
|
||||||
rec {
|
rec {
|
||||||
hosts = {
|
hosts = {
|
||||||
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1 hut";
|
hut = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICO7jIp6JRnRWTMDsTB/aiaICJCl4x8qmKMPSs4lCqP1 hut";
|
||||||
owl1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv owl1";
|
owl1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMqMEXO0ApVsBA6yjmb0xP2kWyoPDIWxBB0Q3+QbHVhv owl1";
|
||||||
owl2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK owl2";
|
owl2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHurEYpQzNHqWYF6B9Pd7W8UPgF3BxEg0BvSbsA7BAdK owl2";
|
||||||
eudy = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG eudy";
|
eudy = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIL+WYPRRvZupqLAG0USKmd/juEPmisyyJaP8hAgYwXsG eudy";
|
||||||
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
|
koro = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIImiTFDbxyUYPumvm8C4mEnHfuvtBY1H8undtd6oDd67 koro";
|
||||||
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
|
bay = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICvGBzpRQKuQYHdlUQeAk6jmdbkrhmdLwTBqf3el7IgU bay";
|
||||||
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
|
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
|
||||||
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
|
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
|
||||||
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
|
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
|
||||||
apex = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBvUFjSfoxXnKwXhEFXx5ckRKJ0oewJ82mRitSMNMKjh apex";
|
apex = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBvUFjSfoxXnKwXhEFXx5ckRKJ0oewJ82mRitSMNMKjh apex";
|
||||||
weasel = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLJrQ8BF6KcweQV8pLkSbFT+tbDxSG9qxrdQE65zJZp weasel";
|
weasel = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLJrQ8BF6KcweQV8pLkSbFT+tbDxSG9qxrdQE65zJZp weasel";
|
||||||
raccoon = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGNQttFvL0dNEyy7klIhLoK4xXOeM2/K9R7lPMTG3qvK raccoon";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
hostGroup = with hosts; rec {
|
hostGroup = with hosts; rec {
|
||||||
compute = [ owl1 owl2 fox raccoon ];
|
compute = [ owl1 owl2 fox ];
|
||||||
playground = [ eudy koro weasel ];
|
playground = [ eudy koro weasel ];
|
||||||
storage = [ bay lake2 ];
|
storage = [ bay lake2 ];
|
||||||
monitor = [ hut ];
|
monitor = [ hut ];
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
../common/xeon.nix
|
../common/xeon.nix
|
||||||
../common/ssf/hosts.nix
|
../common/ssf/hosts.nix
|
||||||
../module/ceph.nix
|
../module/ceph.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
../module/slurm-server.nix
|
../module/slurm-server.nix
|
||||||
./nfs.nix
|
./nfs.nix
|
||||||
./wireguard.nix
|
./wireguard.nix
|
||||||
@@ -57,6 +56,17 @@
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Use SSH tunnel to reach internal hosts
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
Host bscpm04.bsc.es gitlab-internal.bsc.es knights3.bsc.es
|
||||||
|
ProxyCommand nc -X connect -x localhost:23080 %h %p
|
||||||
|
Host raccoon
|
||||||
|
HostName knights3.bsc.es
|
||||||
|
ProxyCommand nc -X connect -x localhost:23080 %h %p
|
||||||
|
Host tent
|
||||||
|
ProxyJump raccoon
|
||||||
|
'';
|
||||||
|
|
||||||
networking.firewall = {
|
networking.firewall = {
|
||||||
extraCommands = ''
|
extraCommands = ''
|
||||||
# Blackhole BSC vulnerability scanner (OpenVAS) as it is spamming our
|
# Blackhole BSC vulnerability scanner (OpenVAS) as it is spamming our
|
||||||
@@ -66,4 +76,10 @@
|
|||||||
iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
|
iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Use tent for cache
|
||||||
|
nix.settings = {
|
||||||
|
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||||
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,25 +18,18 @@
|
|||||||
# Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
|
# Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
|
||||||
peers = [
|
peers = [
|
||||||
{
|
{
|
||||||
name = "fox";
|
name = "Fox";
|
||||||
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
|
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
|
||||||
allowedIPs = [ "10.106.0.1/32" ];
|
allowedIPs = [ "10.106.0.0/24" ];
|
||||||
endpoint = "fox.ac.upc.edu:666";
|
endpoint = "fox.ac.upc.edu:666";
|
||||||
# Send keepalives every 25 seconds. Important to keep NAT tables alive.
|
# Send keepalives every 25 seconds. Important to keep NAT tables alive.
|
||||||
persistentKeepalive = 25;
|
persistentKeepalive = 25;
|
||||||
}
|
}
|
||||||
{
|
|
||||||
name = "raccoon";
|
|
||||||
publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
|
|
||||||
allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
|
|
||||||
}
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hosts = {
|
networking.hosts = {
|
||||||
"10.106.0.1" = [ "fox" ];
|
"10.106.0.1" = [ "fox" ];
|
||||||
"10.106.0.236" = [ "raccoon" ];
|
|
||||||
"10.0.44.4" = [ "tent" ];
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/ssf.nix
|
../common/ssf.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
../module/monitoring.nix
|
../module/monitoring.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
||||||
nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
|
nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
|
||||||
ncdu config.boot.kernelPackages.perf ldns pv
|
ncdu config.boot.kernelPackages.perf ldns pv git-lfs
|
||||||
# From bscpkgs overlay
|
# From bscpkgs overlay
|
||||||
osumb
|
osumb
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -15,9 +15,8 @@
|
|||||||
|
|
||||||
hosts = {
|
hosts = {
|
||||||
"84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
|
"84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
|
||||||
|
"84.88.51.152" = [ "raccoon" ];
|
||||||
"84.88.51.142" = [ "raccoon-ipmi" ];
|
"84.88.51.142" = [ "raccoon-ipmi" ];
|
||||||
"192.168.11.12" = [ "bscpm04.bsc.es" ];
|
|
||||||
"192.168.11.15" = [ "gitlab-internal.bsc.es" ];
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -156,30 +156,18 @@
|
|||||||
};
|
};
|
||||||
|
|
||||||
csiringo = {
|
csiringo = {
|
||||||
|
# Arbitrary UID but large so it doesn't collide with other users on ssfhead.
|
||||||
uid = 9653;
|
uid = 9653;
|
||||||
isNormalUser = true;
|
isNormalUser = true;
|
||||||
home = "/home/Computational/csiringo";
|
home = "/home/Computational/csiringo";
|
||||||
description = "Cesare Siringo";
|
description = "Cesare Siringo";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ ];
|
hosts = [ "apex" "weasel" ];
|
||||||
hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
|
hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
acinca = {
|
|
||||||
uid = 9654;
|
|
||||||
isNormalUser = true;
|
|
||||||
home = "/home/Computational/acinca";
|
|
||||||
description = "Arnau Cinca";
|
|
||||||
group = "Computational";
|
|
||||||
hosts = [ "apex" "hut" "fox" "owl1" "owl2" ];
|
|
||||||
hashedPassword = "$6$S6PUeRpdzYlidxzI$szyvWejQ4hEN76yBYhp1diVO5ew1FFg.cz4lKiXt2Idy4XdpifwrFTCIzLTs5dvYlR62m7ekA5MrhcVxR5F/q/";
|
|
||||||
openssh.authorizedKeys.keys = [
|
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFmMqKqPg4uocNOr3O41kLbZMOMJn3m2ZdN1JvTR96z3 bsccns@arnau-bsc"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
groups = {
|
groups = {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
./xeon.nix
|
./xeon.nix
|
||||||
./ssf/fs.nix
|
./ssf/fs.nix
|
||||||
./ssf/hosts.nix
|
./ssf/hosts.nix
|
||||||
./ssf/hosts-remote.nix
|
|
||||||
./ssf/net.nix
|
./ssf/net.nix
|
||||||
|
./ssf/ssh.nix
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.hosts = {
|
|
||||||
# Remote hosts visible from compute nodes
|
|
||||||
"10.106.0.236" = [ "raccoon" ];
|
|
||||||
"10.0.44.4" = [ "tent" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
16
m/common/ssf/ssh.nix
Normal file
16
m/common/ssf/ssh.nix
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
# Use SSH tunnel to apex to reach internal hosts
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
Host tent
|
||||||
|
ProxyJump raccoon
|
||||||
|
|
||||||
|
# Access raccoon via the HTTP proxy
|
||||||
|
Host raccoon knights3.bsc.es
|
||||||
|
HostName knights3.bsc.es
|
||||||
|
ProxyCommand=ssh apex 'nc -X connect -x localhost:23080 %h %p'
|
||||||
|
|
||||||
|
# Make sure we can reach gitlab even if we don't have SSH access to raccoon
|
||||||
|
Host bscpm04.bsc.es gitlab-internal.bsc.es
|
||||||
|
ProxyCommand=ssh apex 'nc -X connect -x localhost:23080 %h %p'
|
||||||
|
'';
|
||||||
|
}
|
||||||
@@ -9,7 +9,6 @@
|
|||||||
./cpufreq.nix
|
./cpufreq.nix
|
||||||
./fs.nix
|
./fs.nix
|
||||||
./users.nix
|
./users.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -4,11 +4,9 @@
|
|||||||
imports = [
|
imports = [
|
||||||
../common/base.nix
|
../common/base.nix
|
||||||
../common/xeon/console.nix
|
../common/xeon/console.nix
|
||||||
../module/amd-uprof.nix
|
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/nvidia.nix
|
../module/nvidia.nix
|
||||||
../module/slurm-client.nix
|
../module/slurm-client.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
./wireguard.nix
|
./wireguard.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
@@ -23,7 +21,7 @@
|
|||||||
swapDevices = lib.mkForce [];
|
swapDevices = lib.mkForce [];
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
|
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
|
||||||
boot.kernelModules = [ "kvm-amd" "amd_uncore" "amd_hsmp" ];
|
boot.kernelModules = [ "kvm-amd" "amd_uncore" ];
|
||||||
|
|
||||||
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
|
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
|
||||||
hardware.cpu.intel.updateMicrocode = lib.mkForce false;
|
hardware.cpu.intel.updateMicrocode = lib.mkForce false;
|
||||||
@@ -31,21 +29,26 @@
|
|||||||
# Use performance for benchmarks
|
# Use performance for benchmarks
|
||||||
powerManagement.cpuFreqGovernor = "performance";
|
powerManagement.cpuFreqGovernor = "performance";
|
||||||
|
|
||||||
services.amd-uprof.enable = true;
|
|
||||||
|
|
||||||
# Disable NUMA balancing
|
# Disable NUMA balancing
|
||||||
boot.kernel.sysctl."kernel.numa_balancing" = 0;
|
boot.kernel.sysctl."kernel.numa_balancing" = 0;
|
||||||
|
|
||||||
# Expose kernel addresses
|
# Expose kernel addresses
|
||||||
boot.kernel.sysctl."kernel.kptr_restrict" = 0;
|
boot.kernel.sysctl."kernel.kptr_restrict" = 0;
|
||||||
|
|
||||||
# Disable NMI watchdog to save one hw counter (for AMD uProf)
|
|
||||||
boot.kernel.sysctl."kernel.nmi_watchdog" = 0;
|
|
||||||
|
|
||||||
services.openssh.settings.X11Forwarding = true;
|
services.openssh.settings.X11Forwarding = true;
|
||||||
|
|
||||||
services.fail2ban.enable = true;
|
services.fail2ban.enable = true;
|
||||||
|
|
||||||
|
# Use SSH tunnel to reach internal hosts
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
Host bscpm04.bsc.es gitlab-internal.bsc.es tent
|
||||||
|
ProxyJump raccoon
|
||||||
|
Host raccoon
|
||||||
|
ProxyJump apex
|
||||||
|
HostName 127.0.0.1
|
||||||
|
Port 22022
|
||||||
|
'';
|
||||||
|
|
||||||
networking = {
|
networking = {
|
||||||
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
|
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
|
||||||
hostName = "fox";
|
hostName = "fox";
|
||||||
@@ -63,6 +66,12 @@
|
|||||||
interfaces.enp1s0f0np0.useDHCP = true;
|
interfaces.enp1s0f0np0.useDHCP = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Use hut for cache
|
||||||
|
nix.settings = {
|
||||||
|
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||||
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
|
};
|
||||||
|
|
||||||
# Recommended for new graphics cards
|
# Recommended for new graphics cards
|
||||||
hardware.nvidia.open = true;
|
hardware.nvidia.open = true;
|
||||||
|
|
||||||
|
|||||||
@@ -24,24 +24,17 @@
|
|||||||
peers = [
|
peers = [
|
||||||
# List of allowed peers.
|
# List of allowed peers.
|
||||||
{
|
{
|
||||||
name = "apex";
|
name = "Apex";
|
||||||
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
|
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
|
||||||
# List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
|
# List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
|
||||||
allowedIPs = [ "10.106.0.30/32" ];
|
allowedIPs = [ "10.106.0.30/32" ];
|
||||||
}
|
}
|
||||||
{
|
|
||||||
name = "raccoon";
|
|
||||||
publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
|
|
||||||
allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
|
|
||||||
}
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hosts = {
|
networking.hosts = {
|
||||||
"10.106.0.30" = [ "apex" ];
|
"10.106.0.30" = [ "apex" ];
|
||||||
"10.106.0.236" = [ "raccoon" ];
|
|
||||||
"10.0.44.4" = [ "tent" ];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.firewall = {
|
networking.firewall = {
|
||||||
|
|||||||
@@ -2,13 +2,10 @@
|
|||||||
let
|
let
|
||||||
website = pkgs.stdenv.mkDerivation {
|
website = pkgs.stdenv.mkDerivation {
|
||||||
name = "jungle-web";
|
name = "jungle-web";
|
||||||
src = pkgs.fetchgit {
|
src = theFlake;
|
||||||
url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
|
|
||||||
rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
|
|
||||||
hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
|
|
||||||
};
|
|
||||||
buildInputs = [ pkgs.hugo ];
|
buildInputs = [ pkgs.hugo ];
|
||||||
buildPhase = ''
|
buildPhase = ''
|
||||||
|
cd web
|
||||||
rm -rf public/
|
rm -rf public/
|
||||||
hugo
|
hugo
|
||||||
'';
|
'';
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
imports = [
|
imports = [
|
||||||
../common/ssf.nix
|
../common/ssf.nix
|
||||||
../module/monitoring.nix
|
../module/monitoring.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
|
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
options = {
|
|
||||||
services.amd-uprof = {
|
|
||||||
enable = lib.mkOption {
|
|
||||||
type = lib.types.bool;
|
|
||||||
default = false;
|
|
||||||
description = "Whether to enable AMD uProf.";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# Only setup amd-uprof if enabled
|
|
||||||
config = lib.mkIf config.services.amd-uprof.enable {
|
|
||||||
|
|
||||||
# First make sure that we add the module to the list of available modules
|
|
||||||
# in the kernel matching the same kernel version of this configuration.
|
|
||||||
boot.extraModulePackages = with config.boot.kernelPackages; [ amd-uprof-driver ];
|
|
||||||
boot.kernelModules = [ "AMDPowerProfiler" ];
|
|
||||||
|
|
||||||
# Make the userspace tools available in $PATH.
|
|
||||||
environment.systemPackages = with pkgs; [ amd-uprof ];
|
|
||||||
|
|
||||||
# The AMDPowerProfiler module doesn't create the /dev device nor does it emit
|
|
||||||
# any uevents, so we cannot use udev rules to automatically create the
|
|
||||||
# device. Instead, we run a systemd unit that does it after loading the
|
|
||||||
# modules.
|
|
||||||
systemd.services.amd-uprof-device = {
|
|
||||||
description = "Create /dev/AMDPowerProfiler device";
|
|
||||||
after = [ "systemd-modules-load.service" ];
|
|
||||||
wantedBy = [ "multi-user.target" ];
|
|
||||||
unitConfig.ConditionPathExists = [
|
|
||||||
"/proc/AMDPowerProfiler/device"
|
|
||||||
"!/dev/AMDPowerProfiler"
|
|
||||||
];
|
|
||||||
serviceConfig = {
|
|
||||||
Type = "oneshot";
|
|
||||||
RemainAfterExit = true;
|
|
||||||
ExecStart = pkgs.writeShellScript "add-amd-uprof-dev.sh" ''
|
|
||||||
mknod /dev/AMDPowerProfiler -m 666 c $(< /proc/AMDPowerProfiler/device) 0
|
|
||||||
'';
|
|
||||||
ExecStop = pkgs.writeShellScript "remove-amd-uprof-dev.sh" ''
|
|
||||||
rm -f /dev/AMDPowerProfiler
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -6,8 +6,5 @@
|
|||||||
{
|
{
|
||||||
extra-substituters = [ "http://hut/cache" ];
|
extra-substituters = [ "http://hut/cache" ];
|
||||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
|
|
||||||
# Set a low timeout in case hut is down
|
|
||||||
connect-timeout = 3; # seconds
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,12 +12,6 @@
|
|||||||
# https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
|
# https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
|
||||||
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
|
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
|
||||||
KillMode = lib.mkForce "control-group";
|
KillMode = lib.mkForce "control-group";
|
||||||
|
|
||||||
# If slurmd fails to contact the control server it will fail, causing the
|
|
||||||
# node to remain out of service until manually restarted. Always try to
|
|
||||||
# restart it.
|
|
||||||
Restart = "always";
|
|
||||||
RestartSec = "30s";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
services.slurm.client.enable = true;
|
services.slurm.client.enable = true;
|
||||||
|
|||||||
8
m/module/ssh-hut-extern.nix
Normal file
8
m/module/ssh-hut-extern.nix
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
Host apex ssfhead
|
||||||
|
HostName ssflogin.bsc.es
|
||||||
|
Host hut
|
||||||
|
ProxyJump apex
|
||||||
|
'';
|
||||||
|
}
|
||||||
@@ -3,13 +3,11 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/base.nix
|
../common/base.nix
|
||||||
../common/ssf/hosts.nix
|
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
|
../module/ssh-hut-extern.nix
|
||||||
../module/nvidia.nix
|
../module/nvidia.nix
|
||||||
../eudy/kernel/perf.nix
|
../eudy/kernel/perf.nix
|
||||||
./wireguard.nix
|
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
# Don't install Grub on the disk yet
|
# Don't install Grub on the disk yet
|
||||||
@@ -45,11 +43,9 @@
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
# Mount the NFS home
|
nix.settings = {
|
||||||
fileSystems."/nfs/home" = {
|
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||||
device = "10.106.0.30:/home";
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
fsType = "nfs";
|
|
||||||
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
# Enable performance governor
|
# Enable performance governor
|
||||||
|
|||||||
@@ -1,48 +0,0 @@
|
|||||||
{ config, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.nat = {
|
|
||||||
enable = true;
|
|
||||||
enableIPv6 = false;
|
|
||||||
externalInterface = "eno0";
|
|
||||||
internalInterfaces = [ "wg0" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.firewall = {
|
|
||||||
allowedUDPPorts = [ 666 ];
|
|
||||||
};
|
|
||||||
|
|
||||||
age.secrets.wgRaccoon.file = ../../secrets/wg-raccoon.age;
|
|
||||||
|
|
||||||
# Enable WireGuard
|
|
||||||
networking.wireguard.enable = true;
|
|
||||||
networking.wireguard.interfaces = {
|
|
||||||
wg0 = {
|
|
||||||
ips = [ "10.106.0.236/24" ];
|
|
||||||
listenPort = 666;
|
|
||||||
privateKeyFile = config.age.secrets.wgRaccoon.path;
|
|
||||||
# Public key: QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=
|
|
||||||
peers = [
|
|
||||||
{
|
|
||||||
name = "fox";
|
|
||||||
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
|
|
||||||
allowedIPs = [ "10.106.0.1/32" ];
|
|
||||||
endpoint = "fox.ac.upc.edu:666";
|
|
||||||
persistentKeepalive = 25;
|
|
||||||
}
|
|
||||||
{
|
|
||||||
name = "apex";
|
|
||||||
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
|
|
||||||
allowedIPs = [ "10.106.0.30/32" "10.0.40.0/24" ];
|
|
||||||
endpoint = "ssfhead.bsc.es:666";
|
|
||||||
persistentKeepalive = 25;
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hosts = {
|
|
||||||
"10.106.0.1" = [ "fox.wg" ];
|
|
||||||
"10.106.0.30" = [ "apex.wg" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -3,9 +3,9 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/xeon.nix
|
../common/xeon.nix
|
||||||
../common/ssf/hosts.nix
|
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
|
../module/ssh-hut-extern.nix
|
||||||
./monitoring.nix
|
./monitoring.nix
|
||||||
./nginx.nix
|
./nginx.nix
|
||||||
./nix-serve.nix
|
./nix-serve.nix
|
||||||
@@ -15,7 +15,6 @@
|
|||||||
../hut/msmtp.nix
|
../hut/msmtp.nix
|
||||||
../module/p.nix
|
../module/p.nix
|
||||||
../module/vpn-dac.nix
|
../module/vpn-dac.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
# Select this using the ID to avoid mismatches
|
# Select this using the ID to avoid mismatches
|
||||||
@@ -36,7 +35,6 @@
|
|||||||
defaultGateway = "10.0.44.1";
|
defaultGateway = "10.0.44.1";
|
||||||
hosts = {
|
hosts = {
|
||||||
"84.88.53.236" = [ "apex" ];
|
"84.88.53.236" = [ "apex" ];
|
||||||
"10.0.44.1" = [ "raccoon" ];
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -26,5 +26,7 @@
|
|||||||
SENDMAIL_ARGS = "--";
|
SENDMAIL_ARGS = "--";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
lfs.enable = true;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,13 +2,10 @@
|
|||||||
let
|
let
|
||||||
website = pkgs.stdenv.mkDerivation {
|
website = pkgs.stdenv.mkDerivation {
|
||||||
name = "jungle-web";
|
name = "jungle-web";
|
||||||
src = pkgs.fetchgit {
|
src = theFlake;
|
||||||
url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
|
|
||||||
rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
|
|
||||||
hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
|
|
||||||
};
|
|
||||||
buildInputs = [ pkgs.hugo ];
|
buildInputs = [ pkgs.hugo ];
|
||||||
buildPhase = ''
|
buildPhase = ''
|
||||||
|
cd web
|
||||||
rm -rf public/
|
rm -rf public/
|
||||||
hugo
|
hugo
|
||||||
'';
|
'';
|
||||||
@@ -42,6 +39,7 @@ in
|
|||||||
rewrite ^/git/(.*) /$1 break;
|
rewrite ^/git/(.*) /$1 break;
|
||||||
proxy_pass http://127.0.0.1:3000;
|
proxy_pass http://127.0.0.1:3000;
|
||||||
proxy_redirect http:// $scheme://;
|
proxy_redirect http:// $scheme://;
|
||||||
|
client_max_body_size 64M;
|
||||||
}
|
}
|
||||||
location /cache {
|
location /cache {
|
||||||
rewrite ^/cache/(.*) /$1 break;
|
rewrite ^/cache/(.*) /$1 break;
|
||||||
@@ -70,9 +68,6 @@ in
|
|||||||
location /p/ {
|
location /p/ {
|
||||||
alias /var/lib/p/;
|
alias /var/lib/p/;
|
||||||
}
|
}
|
||||||
location /pub/ {
|
|
||||||
alias /vault/pub/;
|
|
||||||
}
|
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/ssf.nix
|
../common/ssf.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
# Select this using the ID to avoid mismatches
|
# Select this using the ID to avoid mismatches
|
||||||
|
|||||||
@@ -1,89 +0,0 @@
|
|||||||
{ stdenv
|
|
||||||
, lib
|
|
||||||
, curl
|
|
||||||
, cacert
|
|
||||||
, runCommandLocal
|
|
||||||
, autoPatchelfHook
|
|
||||||
, elfutils
|
|
||||||
, glib
|
|
||||||
, libGL
|
|
||||||
, ncurses5
|
|
||||||
, xorg
|
|
||||||
, zlib
|
|
||||||
, libxkbcommon
|
|
||||||
, freetype
|
|
||||||
, fontconfig
|
|
||||||
, libGLU
|
|
||||||
, dbus
|
|
||||||
, rocmPackages
|
|
||||||
, libxcrypt-legacy
|
|
||||||
, numactl
|
|
||||||
, radare2
|
|
||||||
}:
|
|
||||||
|
|
||||||
let
|
|
||||||
version = "5.1.701";
|
|
||||||
tarball = "AMDuProf_Linux_x64_${version}.tar.bz2";
|
|
||||||
|
|
||||||
# NOTE: Remember to update the radare2 patch below if AMDuProfPcm changes.
|
|
||||||
uprofSrc = runCommandLocal tarball {
|
|
||||||
nativeBuildInputs = [ curl ];
|
|
||||||
outputHash = "sha256-j9gxcBcIg6Zhc5FglUXf/VV9bKSo+PAKeootbN7ggYk=";
|
|
||||||
SSL_CERT_FILE="${cacert}/etc/ssl/certs/ca-bundle.crt";
|
|
||||||
} ''
|
|
||||||
curl \
|
|
||||||
-o $out \
|
|
||||||
'https://download.amd.com/developer/eula/uprof/uprof-5-1/${tarball}' \
|
|
||||||
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0' \
|
|
||||||
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
|
|
||||||
-H 'Accept-Language: en-US,en;q=0.5' \
|
|
||||||
-H 'Accept-Encoding: gzip, deflate, br, zstd' \
|
|
||||||
-H 'Referer: https://www.amd.com/' 2>&1 | tr '\r' '\n'
|
|
||||||
'';
|
|
||||||
|
|
||||||
in
|
|
||||||
stdenv.mkDerivation {
|
|
||||||
pname = "AMD-uProf";
|
|
||||||
inherit version;
|
|
||||||
src = uprofSrc;
|
|
||||||
dontStrip = true;
|
|
||||||
phases = [ "installPhase" "fixupPhase" ];
|
|
||||||
nativeBuildInputs = [ autoPatchelfHook radare2 ];
|
|
||||||
buildInputs = [
|
|
||||||
stdenv.cc.cc.lib
|
|
||||||
ncurses5
|
|
||||||
elfutils
|
|
||||||
glib
|
|
||||||
libGL
|
|
||||||
libGLU
|
|
||||||
libxcrypt-legacy
|
|
||||||
xorg.libX11
|
|
||||||
xorg.libXext
|
|
||||||
xorg.libXi
|
|
||||||
xorg.libXmu
|
|
||||||
xorg.libxcb
|
|
||||||
xorg.xcbutilwm
|
|
||||||
xorg.xcbutilrenderutil
|
|
||||||
xorg.xcbutilkeysyms
|
|
||||||
xorg.xcbutilimage
|
|
||||||
fontconfig.lib
|
|
||||||
libxkbcommon
|
|
||||||
zlib
|
|
||||||
freetype
|
|
||||||
dbus
|
|
||||||
rocmPackages.rocprofiler
|
|
||||||
numactl
|
|
||||||
];
|
|
||||||
installPhase = ''
|
|
||||||
set -x
|
|
||||||
mkdir -p $out
|
|
||||||
tar -x -v -C $out --strip-components=1 -f $src
|
|
||||||
rm $out/bin/AMDPowerProfilerDriverSource.tar.gz
|
|
||||||
patchelf --replace-needed libroctracer64.so.1 libroctracer64.so $out/bin/ProfileAgents/x64/libAMDGpuAgent.so
|
|
||||||
patchelf --add-needed libcrypt.so.1 --add-needed libstdc++.so.6 $out/bin/AMDuProfSys
|
|
||||||
echo "16334a51fcc48668307ad94e20482ca4 $out/bin/AMDuProfPcm" | md5sum -c -
|
|
||||||
radare2 -w -q -i ${./libnuma.r2} $out/bin/AMDuProfPcm
|
|
||||||
patchelf --add-needed libnuma.so $out/bin/AMDuProfPcm
|
|
||||||
set +x
|
|
||||||
'';
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
{ stdenv
|
|
||||||
, lib
|
|
||||||
, amd-uprof
|
|
||||||
, kernel
|
|
||||||
, runCommandLocal
|
|
||||||
}:
|
|
||||||
|
|
||||||
let
|
|
||||||
version = amd-uprof.version;
|
|
||||||
tarball = amd-uprof.src;
|
|
||||||
in stdenv.mkDerivation {
|
|
||||||
pname = "AMDPowerProfilerDriver";
|
|
||||||
inherit version;
|
|
||||||
src = runCommandLocal "AMDPowerProfilerDriverSource.tar.gz" { } ''
|
|
||||||
set -x
|
|
||||||
tar -x -f ${tarball} AMDuProf_Linux_x64_${version}/bin/AMDPowerProfilerDriverSource.tar.gz
|
|
||||||
mv AMDuProf_Linux_x64_${version}/bin/AMDPowerProfilerDriverSource.tar.gz $out
|
|
||||||
set +x
|
|
||||||
'';
|
|
||||||
hardeningDisable = [ "pic" "format" ];
|
|
||||||
nativeBuildInputs = kernel.moduleBuildDependencies;
|
|
||||||
patches = [ ./makefile.patch ./hrtimer.patch ];
|
|
||||||
makeFlags = [
|
|
||||||
"KERNEL_VERSION=${kernel.modDirVersion}"
|
|
||||||
"KERNEL_DIR=${kernel.dev}/lib/modules/${kernel.modDirVersion}/build"
|
|
||||||
"INSTALL_MOD_PATH=$(out)"
|
|
||||||
];
|
|
||||||
meta = {
|
|
||||||
description = "AMD Power Profiler Driver";
|
|
||||||
homepage = "https://www.amd.com/es/developer/uprof.html";
|
|
||||||
platforms = lib.platforms.linux;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
--- a/src/PmcTimerConfig.c 2025-09-04 12:17:16.771707049 +0200
|
|
||||||
+++ b/src/PmcTimerConfig.c 2025-09-04 12:17:04.878515468 +0200
|
|
||||||
@@ -99,7 +99,7 @@ static void PmcInitTimer(void* pInfo)
|
|
||||||
|
|
||||||
DRVPRINT("pTimerConfig(%p)", pTimerConfig);
|
|
||||||
|
|
||||||
- hrtimer_init(&pTimerConfig->m_hrTimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
+ hrtimer_setup(&pTimerConfig->m_hrTimer, PmcTimerCallback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
}
|
|
||||||
|
|
||||||
int PmcSetupTimer(ClientContext* pClientCtx)
|
|
||||||
@@ -157,7 +157,6 @@ int PmcSetupTimer(ClientContext* pClient
|
|
||||||
{
|
|
||||||
/* Interval in ms */
|
|
||||||
pTimerConfig->m_time = ktime_set(interval / 1000, interval * 1000000);
|
|
||||||
- pTimerConfig->m_hrTimer.function = PmcTimerCallback;
|
|
||||||
|
|
||||||
DRVPRINT("retVal(%d) m_time(%lld)", retVal, (long long int) pTimerConfig->m_time);
|
|
||||||
}
|
|
||||||
--- a/src/PwrProfTimer.c 2025-09-04 12:18:08.750544327 +0200
|
|
||||||
+++ b/src/PwrProfTimer.c 2025-09-04 12:18:28.557863382 +0200
|
|
||||||
@@ -573,8 +573,7 @@ void InitHrTimer(uint32 cpu)
|
|
||||||
pCoreClientData = &per_cpu(g_coreClientData, cpu);
|
|
||||||
|
|
||||||
// initialize HR timer
|
|
||||||
- hrtimer_init(&pCoreClientData->m_hrTimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
- pCoreClientData->m_hrTimer.function = &HrTimerCallback;
|
|
||||||
+ hrtimer_setup(&pCoreClientData->m_hrTimer, &HrTimerCallback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
|
|
||||||
return;
|
|
||||||
} // InitHrTimer
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
# Patch arguments to call sym std::string::find(char const*, unsigned long, unsigned long)
|
|
||||||
# so it matches NixOS:
|
|
||||||
#
|
|
||||||
# Change OS name to NixOS
|
|
||||||
wz NixOS @ 0x00550a43
|
|
||||||
# And set the length to 5 characters
|
|
||||||
wa mov ecx, 5 @0x00517930
|
|
||||||
#
|
|
||||||
# Then change the argument to dlopen() so it only uses libnuma.so
|
|
||||||
wz libnuma.so @ 0x00562940
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
--- a/Makefile 2025-06-19 20:36:49.346693267 +0200
|
|
||||||
+++ b/Makefile 2025-06-19 20:42:29.778088660 +0200
|
|
||||||
@@ -27,7 +27,7 @@ MODULE_VERSION=$(shell cat AMDPowerProfi
|
|
||||||
MODULE_NAME_KO=$(MODULE_NAME).ko
|
|
||||||
|
|
||||||
# check is module inserted
|
|
||||||
-MODPROBE_OUTPUT=$(shell lsmod | grep $(MODULE_NAME))
|
|
||||||
+#MODPROBE_OUTPUT=$(shell lsmod | grep $(MODULE_NAME))
|
|
||||||
|
|
||||||
# check pcore dkms status
|
|
||||||
PCORE_DKMS_STATUS=$(shell dkms status | grep $(MODULE_NAME) | grep $(MODULE_VERSION))
|
|
||||||
@@ -50,7 +50,7 @@ endif
|
|
||||||
# “-Wno-missing-attributes” is added for GCC version >= 9.0 and kernel version <= 5.00
|
|
||||||
G_VERSION=9
|
|
||||||
K_VERSION=5
|
|
||||||
-KERNEL_MAJOR_VERSION=$(shell uname -r | cut -f1 -d.)
|
|
||||||
+KERNEL_MAJOR_VERSION=$(shell echo "$(KERNEL_VERSION)" | cut -f1 -d.)
|
|
||||||
GCCVERSION = $(shell gcc -dumpversion | cut -f1 -d.)
|
|
||||||
ifeq ($(G_VERSION),$(firstword $(sort $(GCCVERSION) $(G_VERSION))))
|
|
||||||
ifeq ($(K_VERSION),$(lastword $(sort $(KERNEL_MAJOR_VERSION) $(K_VERSION))))
|
|
||||||
@@ -66,17 +66,7 @@ ${MODULE_NAME}-objs := src/PmcDataBuffe
|
|
||||||
|
|
||||||
# make
|
|
||||||
all:
|
|
||||||
- @chmod a+x ./AMDPPcert.sh
|
|
||||||
- @./AMDPPcert.sh 0 1; echo $$? > $(PWD)/sign_status;
|
|
||||||
- @SIGSTATUS1=`cat $(PWD)/sign_status | tr -d '\n'`; \
|
|
||||||
- if [ $$SIGSTATUS1 -eq 1 ]; then \
|
|
||||||
- exit 1; \
|
|
||||||
- fi
|
|
||||||
- @make -C /lib/modules/$(KERNEL_VERSION)/build M=$(PWD) $(MAKE_OPTS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" modules
|
|
||||||
- @SIGSTATUS3=`cat $(PWD)/sign_status | tr -d '\n'`; \
|
|
||||||
- if [ $$SIGSTATUS3 -eq 0 ]; then \
|
|
||||||
- ./AMDPPcert.sh 1 $(MODULE_NAME_KO); \
|
|
||||||
- fi
|
|
||||||
+ make -C $(KERNEL_DIR) M=$(PWD) $(MAKE_OPTS) CFLAGS_MODULE="$(EXTRA_CFLAGS)" modules
|
|
||||||
|
|
||||||
# make clean
|
|
||||||
clean:
|
|
||||||
@@ -84,23 +74,9 @@ clean:
|
|
||||||
|
|
||||||
# make install
|
|
||||||
install:
|
|
||||||
- @mkdir -p /lib/modules/`uname -r`/kernel/drivers/extra
|
|
||||||
- @rm -f /lib/modules/`uname -r`/kernel/drivers/extra/$(MODULE_NAME_KO)
|
|
||||||
- @cp $(MODULE_NAME_KO) /lib/modules/`uname -r`/kernel/drivers/extra/
|
|
||||||
- @depmod -a
|
|
||||||
- @if [ ! -z "$(MODPROBE_OUTPUT)" ]; then \
|
|
||||||
- echo "Uninstalling AMDPowerProfiler Linux kernel module.";\
|
|
||||||
- rmmod $(MODULE_NAME);\
|
|
||||||
- fi
|
|
||||||
- @modprobe $(MODULE_NAME) 2> $(PWD)/sign_status1; \
|
|
||||||
- cat $(PWD)/sign_status1 | grep "Key was rejected by service"; \
|
|
||||||
- echo $$? > $(PWD)/sign_status; SIGSTATUS1=`cat $(PWD)/sign_status | tr -d '\n'`; \
|
|
||||||
- if [ $$SIGSTATUS1 -eq 0 ]; then \
|
|
||||||
- echo "ERROR: Secure Boot enabled, correct key is not yet enrolled in BIOS key table"; \
|
|
||||||
- exit 1; \
|
|
||||||
- else \
|
|
||||||
- cat $(PWD)/sign_status1; \
|
|
||||||
- fi
|
|
||||||
+ mkdir -p $(INSTALL_MOD_PATH)/lib/modules/$(KERNEL_VERSION)/kernel/drivers/extra/
|
|
||||||
+ cp -a $(MODULE_NAME_KO) $(INSTALL_MOD_PATH)/lib/modules/$(KERNEL_VERSION)/kernel/drivers/extra/
|
|
||||||
+
|
|
||||||
# make dkms
|
|
||||||
dkms:
|
|
||||||
@chmod a+x ./AMDPPcert.sh
|
|
||||||
@@ -53,15 +53,4 @@ final: prev:
|
|||||||
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
||||||
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
||||||
cudainfo = prev.callPackage ./cudainfo/default.nix { };
|
cudainfo = prev.callPackage ./cudainfo/default.nix { };
|
||||||
|
|
||||||
amd-uprof = prev.callPackage ./amd-uprof/default.nix { };
|
|
||||||
|
|
||||||
# FIXME: Extend this to all linuxPackages variants. Open problem, see:
|
|
||||||
# https://discourse.nixos.org/t/whats-the-right-way-to-make-a-custom-kernel-module-available/4636
|
|
||||||
linuxPackages = prev.linuxPackages.extend (_final: _prev: {
|
|
||||||
amd-uprof-driver = _prev.callPackage ./amd-uprof/driver.nix { };
|
|
||||||
});
|
|
||||||
linuxPackages_latest = prev.linuxPackages_latest.extend(_final: _prev: {
|
|
||||||
amd-uprof-driver = _prev.callPackage ./amd-uprof/driver.nix { };
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ let
|
|||||||
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
||||||
fox = [ keys.hosts.fox ] ++ adminsKeys;
|
fox = [ keys.hosts.fox ] ++ adminsKeys;
|
||||||
apex = [ keys.hosts.apex ] ++ adminsKeys;
|
apex = [ keys.hosts.apex ] ++ adminsKeys;
|
||||||
raccoon = [ keys.hosts.raccoon ] ++ adminsKeys;
|
|
||||||
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
||||||
tent = [ keys.hosts.tent ] ++ adminsKeys;
|
tent = [ keys.hosts.tent ] ++ adminsKeys;
|
||||||
# Only expose ceph keys to safe nodes and admins
|
# Only expose ceph keys to safe nodes and admins
|
||||||
@@ -30,5 +29,4 @@ in
|
|||||||
|
|
||||||
"wg-fox.age".publicKeys = fox;
|
"wg-fox.age".publicKeys = fox;
|
||||||
"wg-apex.age".publicKeys = apex;
|
"wg-apex.age".publicKeys = apex;
|
||||||
"wg-raccoon.age".publicKeys = raccoon;
|
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
@@ -96,16 +96,6 @@ Then just run `nix develop` from the same directory:
|
|||||||
Cuda compilation tools, release 12.4, V12.4.99
|
Cuda compilation tools, release 12.4, V12.4.99
|
||||||
Build cuda_12.4.r12.4/compiler.33961263_0
|
Build cuda_12.4.r12.4/compiler.33961263_0
|
||||||
|
|
||||||
## AMD uProf
|
|
||||||
|
|
||||||
The [AMD uProf](https://www.amd.com/en/developer/uprof.html) performance
|
|
||||||
analysis tool-suite is installed and ready to use.
|
|
||||||
|
|
||||||
See the [AMD uProf user guide](https://docs.amd.com/r/en-US/57368-uProf-user-guide)
|
|
||||||
([PDF backup for v5.1](https://jungle.bsc.es/pub/57368-uprof-user-guide.pdf))
|
|
||||||
for more details on how to use the tools. To use the GUI, make sure that you
|
|
||||||
connect to fox using X11 forwarding.
|
|
||||||
|
|
||||||
## Filesystems
|
## Filesystems
|
||||||
|
|
||||||
The machine has several file systems available.
|
The machine has several file systems available.
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Update 2025-09-26"
|
|
||||||
author: "Rodrigo Arias Mallo"
|
|
||||||
date: 2025-09-26
|
|
||||||
---
|
|
||||||
|
|
||||||
This is a summary of notable changes introduced in the last two years. We
|
|
||||||
continue to keep all machines updated to the latest NixOS release (currently
|
|
||||||
NixOS 25.05).
|
|
||||||
|
|
||||||
### New compute node: fox
|
|
||||||
|
|
||||||
We have a new [fox machine](/fox), with two AMD Genoa 9684X CPUs and two NVIDIA
|
|
||||||
RTX4000 GPUs. Over the last few months we have been running some tests and it seems
|
|
||||||
that most of the components work well. We have configured CUDA to use the NVIDIA
|
|
||||||
GPUs, as well as AMD uProf to trace performance and energy counters from the
|
|
||||||
CPUs.
|
|
||||||
|
|
||||||
### Upgraded login node: apex
|
|
||||||
|
|
||||||
We have upgraded the operating system on the login node to NixOS, which now runs
|
|
||||||
Linux 6.15.6. During the upgrade, we have detected a problem with the storage
|
|
||||||
disks. The `/` and `/home` partitions sit on a
|
|
||||||
[RAID 5](https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5),
|
|
||||||
transparently handled by a RAID hardware controller which starts its own
|
|
||||||
firmware before passing the control to the BIOS to continue the boot sequence. A
|
|
||||||
problem during the startup of the firmware prevented the node from even reaching the
|
|
||||||
BIOS screen.
|
|
||||||
|
|
||||||
After a long debugging session, we detected that the flash memory that stores
|
|
||||||
the firmware of the hardware controller was likely to be the issue, since
|
|
||||||
[memory cells](https://en.wikipedia.org/wiki/Flash_memory#Principles_of_operation)
|
|
||||||
may lose charge over time and can end up corrupting the content. We flashed
|
|
||||||
the latest firmware so the memory cells are charged again with the new bits and
|
|
||||||
that fixed the problem. Hopefully we will be able to use it for some more years.
|
|
||||||
|
|
||||||
The SLURM server has been moved to apex which allows users to also submit jobs
|
|
||||||
to fox.
|
|
||||||
|
|
||||||
### Migrated machines to BSC building
|
|
||||||
|
|
||||||
The server room had a temperature issue that had been affecting our machines
|
|
||||||
since the end of February of 2025. As the summer approached, the temperature
|
|
||||||
exceeded the safe limits for our hardware, so we had to shut down the cluster.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
Since then, we have moved the cluster to BSC premises, where it now rests at a
|
|
||||||
stable temperature, so hopefully we won't have more unscheduled downtime.
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 97 KiB |
Reference in New Issue
Block a user