Compare commits
3 Commits
old-master
...
monitor-gp
| Author | SHA1 | Date | |
|---|---|---|---|
| e3b9c08748 | |||
| c94e6fa497 | |||
| ba8e6e1888 |
46
doc/trim.sh
46
doc/trim.sh
@@ -1,46 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
# Trims the jungle repository by moving the website to its own repository and
|
|
||||||
# removing it from jungle. It also removes big pdf files and kernel
|
|
||||||
# configurations so the jungle repository is small.
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [ -e oldjungle -o -e newjungle -o -e website ]; then
|
|
||||||
echo "remove oldjungle/, newjungle/ and website/ first"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone the old jungle repo
|
|
||||||
git clone gitea@tent:rarias/jungle.git oldjungle
|
|
||||||
|
|
||||||
# First split the website into a new repository
|
|
||||||
mkdir website && git -C website init -b master
|
|
||||||
git-filter-repo \
|
|
||||||
--path web \
|
|
||||||
--subdirectory-filter web \
|
|
||||||
--source oldjungle \
|
|
||||||
--target website
|
|
||||||
|
|
||||||
# Then remove the website, pdf files and big kernel configs
|
|
||||||
mkdir newjungle && git -C newjungle init -b master
|
|
||||||
git-filter-repo \
|
|
||||||
--invert-paths \
|
|
||||||
--path web \
|
|
||||||
--path-glob 'doc*.pdf' \
|
|
||||||
--path-glob '**/kernel/configs/lockdep' \
|
|
||||||
--path-glob '**/kernel/configs/defconfig' \
|
|
||||||
--source oldjungle \
|
|
||||||
--target newjungle
|
|
||||||
|
|
||||||
set -x
|
|
||||||
|
|
||||||
du -sh oldjungle newjungle website
|
|
||||||
# 57M oldjungle
|
|
||||||
# 2,3M newjungle
|
|
||||||
# 6,4M website
|
|
||||||
|
|
||||||
du -sh --exclude=.git oldjungle newjungle website
|
|
||||||
# 30M oldjungle
|
|
||||||
# 700K newjungle
|
|
||||||
# 3,5M website
|
|
||||||
34
flake.lock
generated
34
flake.lock
generated
@@ -10,11 +10,11 @@
|
|||||||
"systems": "systems"
|
"systems": "systems"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1750173260,
|
"lastModified": 1723293904,
|
||||||
"narHash": "sha256-9P1FziAwl5+3edkfFcr5HeGtQUtrSdk/MksX39GieoA=",
|
"narHash": "sha256-b+uqzj+Wa6xgMS9aNbX4I+sXeb5biPDi39VgvSFqFvU=",
|
||||||
"owner": "ryantm",
|
"owner": "ryantm",
|
||||||
"repo": "agenix",
|
"repo": "agenix",
|
||||||
"rev": "531beac616433bac6f9e2a19feb8e99a22a66baf",
|
"rev": "f6291c5935fdc4e0bef208cfc0dcab7e3f7a1c41",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -30,11 +30,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1749650500,
|
"lastModified": 1732868163,
|
||||||
"narHash": "sha256-2MHfVPV6RA7qPSCtXh4+KK0F0UjN+J4z8//+n6NK7Xs=",
|
"narHash": "sha256-qck4h298AgcNI6BnGhEwl26MTLXjumuJVr+9kak7uPo=",
|
||||||
"ref": "refs/heads/master",
|
"ref": "refs/heads/master",
|
||||||
"rev": "9d1944c658929b6f98b3f3803fead4d1b91c4405",
|
"rev": "6782fc6c5b5a29e84a7f2c2d1064f4bcb1288c0f",
|
||||||
"revCount": 961,
|
"revCount": 952,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.sr.ht/~rodarima/bscpkgs"
|
"url": "https://git.sr.ht/~rodarima/bscpkgs"
|
||||||
},
|
},
|
||||||
@@ -51,11 +51,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1744478979,
|
"lastModified": 1700795494,
|
||||||
"narHash": "sha256-dyN+teG9G82G+m+PX/aSAagkC+vUv0SgUw3XkPhQodQ=",
|
"narHash": "sha256-gzGLZSiOhf155FW7262kdHo2YDeugp3VuIFb4/GGng0=",
|
||||||
"owner": "lnl7",
|
"owner": "lnl7",
|
||||||
"repo": "nix-darwin",
|
"repo": "nix-darwin",
|
||||||
"rev": "43975d782b418ebf4969e9ccba82466728c2851b",
|
"rev": "4b9b83d5a92e8c1fbfd8eb27eda375908c11ec4d",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -73,11 +73,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1745494811,
|
"lastModified": 1703113217,
|
||||||
"narHash": "sha256-YZCh2o9Ua1n9uCvrvi5pRxtuVNml8X2a03qIFfRKpFs=",
|
"narHash": "sha256-7ulcXOk63TIT2lVDSExj7XzFx09LpdSAPtvgtM7yQPE=",
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "home-manager",
|
"repo": "home-manager",
|
||||||
"rev": "abfad3d2958c9e6300a883bd443512c55dfeb1be",
|
"rev": "3bfaacf46133c037bb356193bd2f1765d9dc82c1",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -88,16 +88,16 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1752436162,
|
"lastModified": 1736867362,
|
||||||
"narHash": "sha256-Kt1UIPi7kZqkSc5HVj6UY5YLHHEzPBkgpNUByuyxtlw=",
|
"narHash": "sha256-i/UJ5I7HoqmFMwZEH6vAvBxOrjjOJNU739lnZnhUln8=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "dfcd5b901dbab46c9c6e80b265648481aafb01f8",
|
"rev": "9c6b49aeac36e2ed73a8c472f1546f6d9cf1addc",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"ref": "nixos-25.05",
|
"ref": "nixos-24.11",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
inputs = {
|
inputs = {
|
||||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
|
nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
|
||||||
agenix.url = "github:ryantm/agenix";
|
agenix.url = "github:ryantm/agenix";
|
||||||
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
agenix.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
bscpkgs.url = "git+https://git.sr.ht/~rodarima/bscpkgs";
|
||||||
@@ -27,8 +27,6 @@ in
|
|||||||
lake2 = mkConf "lake2";
|
lake2 = mkConf "lake2";
|
||||||
raccoon = mkConf "raccoon";
|
raccoon = mkConf "raccoon";
|
||||||
fox = mkConf "fox";
|
fox = mkConf "fox";
|
||||||
apex = mkConf "apex";
|
|
||||||
weasel = mkConf "weasel";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
packages.x86_64-linux = self.nixosConfigurations.hut.pkgs // {
|
packages.x86_64-linux = self.nixosConfigurations.hut.pkgs // {
|
||||||
|
|||||||
12
keys.nix
12
keys.nix
@@ -11,19 +11,16 @@ rec {
|
|||||||
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
|
lake2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINo66//S1yatpQHE/BuYD/Gfq64TY7ZN5XOGXmNchiO0 lake2";
|
||||||
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
|
fox = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDwItIk5uOJcQEVPoy/CVGRzfmE1ojrdDcI06FrU4NFT fox";
|
||||||
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
|
tent = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFAtTpHtdYoelbknD/IcfBlThwLKJv/dSmylOgpg3FRM tent";
|
||||||
apex = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBvUFjSfoxXnKwXhEFXx5ckRKJ0oewJ82mRitSMNMKjh apex";
|
|
||||||
weasel = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLJrQ8BF6KcweQV8pLkSbFT+tbDxSG9qxrdQE65zJZp weasel";
|
|
||||||
raccoon = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGNQttFvL0dNEyy7klIhLoK4xXOeM2/K9R7lPMTG3qvK raccoon";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
hostGroup = with hosts; rec {
|
hostGroup = with hosts; rec {
|
||||||
compute = [ owl1 owl2 fox raccoon ];
|
untrusted = [ fox ];
|
||||||
playground = [ eudy koro weasel ];
|
compute = [ owl1 owl2 ];
|
||||||
|
playground = [ eudy koro ];
|
||||||
storage = [ bay lake2 ];
|
storage = [ bay lake2 ];
|
||||||
monitor = [ hut ];
|
monitor = [ hut ];
|
||||||
login = [ apex ];
|
|
||||||
|
|
||||||
system = storage ++ monitor ++ login;
|
system = storage ++ monitor;
|
||||||
safe = system ++ compute;
|
safe = system ++ compute;
|
||||||
all = safe ++ playground;
|
all = safe ++ playground;
|
||||||
};
|
};
|
||||||
@@ -31,7 +28,6 @@ rec {
|
|||||||
admins = {
|
admins = {
|
||||||
"rarias@hut" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
|
"rarias@hut" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE1oZTPtlEXdGt0Ak+upeCIiBdaDQtcmuWoTUCVuSVIR rarias@hut";
|
||||||
"rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
|
"rarias@tent" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIwlWSBTZi74WTz5xn6gBvTmCoVltmtIAeM3RMmkh4QZ rarias@tent";
|
||||||
"rarias@fox" = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDSbw3REAKECV7E2c/e2XJITudJQWq2qDSe2N1JHqHZd rarias@fox";
|
|
||||||
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
|
root = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIII/1TNArcwA6D47mgW4TArwlxQRpwmIGiZDysah40Gb root@hut";
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,69 +0,0 @@
|
|||||||
{ lib, config, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
../common/xeon.nix
|
|
||||||
../common/ssf/hosts.nix
|
|
||||||
../module/ceph.nix
|
|
||||||
../module/hut-substituter.nix
|
|
||||||
../module/slurm-server.nix
|
|
||||||
./nfs.nix
|
|
||||||
./wireguard.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
# Don't install grub MBR for now
|
|
||||||
boot.loader.grub.device = "nodev";
|
|
||||||
|
|
||||||
boot.initrd.kernelModules = [
|
|
||||||
"megaraid_sas" # For HW RAID
|
|
||||||
];
|
|
||||||
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
storcli # To manage HW RAID
|
|
||||||
];
|
|
||||||
|
|
||||||
fileSystems."/home" = {
|
|
||||||
device = "/dev/disk/by-label/home";
|
|
||||||
fsType = "ext4";
|
|
||||||
};
|
|
||||||
|
|
||||||
# No swap, there is plenty of RAM
|
|
||||||
swapDevices = lib.mkForce [];
|
|
||||||
|
|
||||||
networking = {
|
|
||||||
hostName = "apex";
|
|
||||||
defaultGateway = "84.88.53.233";
|
|
||||||
nameservers = [ "8.8.8.8" ];
|
|
||||||
|
|
||||||
# Public facing interface
|
|
||||||
interfaces.eno1.ipv4.addresses = [ {
|
|
||||||
address = "84.88.53.236";
|
|
||||||
prefixLength = 29;
|
|
||||||
} ];
|
|
||||||
|
|
||||||
# Internal LAN to our Ethernet switch
|
|
||||||
interfaces.eno2.ipv4.addresses = [ {
|
|
||||||
address = "10.0.40.30";
|
|
||||||
prefixLength = 24;
|
|
||||||
} ];
|
|
||||||
|
|
||||||
# Infiniband over Omnipath switch (disconnected for now)
|
|
||||||
# interfaces.ibp5s0 = {};
|
|
||||||
|
|
||||||
nat = {
|
|
||||||
enable = true;
|
|
||||||
internalInterfaces = [ "eno2" ];
|
|
||||||
externalInterface = "eno1";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.firewall = {
|
|
||||||
extraCommands = ''
|
|
||||||
# Blackhole BSC vulnerability scanner (OpenVAS) as it is spamming our
|
|
||||||
# logs. Insert as first position so we also protect SSH.
|
|
||||||
iptables -I nixos-fw 1 -p tcp -s 192.168.8.16 -j nixos-fw-refuse
|
|
||||||
# Same with opsmonweb01.bsc.es which seems to be trying to access via SSH
|
|
||||||
iptables -I nixos-fw 2 -p tcp -s 84.88.52.176 -j nixos-fw-refuse
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
services.nfs.server = {
|
|
||||||
enable = true;
|
|
||||||
lockdPort = 4001;
|
|
||||||
mountdPort = 4002;
|
|
||||||
statdPort = 4000;
|
|
||||||
exports = ''
|
|
||||||
/home 10.0.40.0/24(rw,async,no_subtree_check,no_root_squash)
|
|
||||||
/home 10.106.0.0/24(rw,async,no_subtree_check,no_root_squash)
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
networking.firewall = {
|
|
||||||
# Check with `rpcinfo -p`
|
|
||||||
extraCommands = ''
|
|
||||||
# Accept NFS traffic from compute nodes but not from the outside
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 111 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 2049 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 4000 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
|
|
||||||
# Same but UDP
|
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 111 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 2049 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4000 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4001 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 4002 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -s 10.0.40.0/24 --dport 20048 -j nixos-fw-accept
|
|
||||||
|
|
||||||
# Accept NFS traffic from wg0
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
|
|
||||||
# Same but UDP
|
|
||||||
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 111 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 2049 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4000 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4001 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 4002 -j nixos-fw-accept
|
|
||||||
iptables -A nixos-fw -p udp -i wg0 -s 10.106.0.0/24 --dport 20048 -j nixos-fw-accept
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
{ config, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.firewall = {
|
|
||||||
allowedUDPPorts = [ 666 ];
|
|
||||||
};
|
|
||||||
|
|
||||||
age.secrets.wgApex.file = ../../secrets/wg-apex.age;
|
|
||||||
|
|
||||||
# Enable WireGuard
|
|
||||||
networking.wireguard.enable = true;
|
|
||||||
networking.wireguard.interfaces = {
|
|
||||||
# "wg0" is the network interface name. You can name the interface arbitrarily.
|
|
||||||
wg0 = {
|
|
||||||
ips = [ "10.106.0.30/24" ];
|
|
||||||
listenPort = 666;
|
|
||||||
privateKeyFile = config.age.secrets.wgApex.path;
|
|
||||||
# Public key: VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=
|
|
||||||
peers = [
|
|
||||||
{
|
|
||||||
name = "fox";
|
|
||||||
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
|
|
||||||
allowedIPs = [ "10.106.0.1/32" ];
|
|
||||||
endpoint = "fox.ac.upc.edu:666";
|
|
||||||
# Send keepalives every 25 seconds. Important to keep NAT tables alive.
|
|
||||||
persistentKeepalive = 25;
|
|
||||||
}
|
|
||||||
{
|
|
||||||
name = "raccoon";
|
|
||||||
publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
|
|
||||||
allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hosts = {
|
|
||||||
"10.106.0.1" = [ "fox" ];
|
|
||||||
"10.106.0.236" = [ "raccoon" ];
|
|
||||||
"10.0.44.4" = [ "tent" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -3,7 +3,6 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/ssf.nix
|
../common/ssf.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
../module/monitoring.nix
|
../module/monitoring.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
# Includes the basic configuration for an Intel server.
|
# Includes the basic configuration for an Intel server.
|
||||||
imports = [
|
imports = [
|
||||||
./base/agenix.nix
|
./base/agenix.nix
|
||||||
./base/always-power-on.nix
|
|
||||||
./base/august-shutdown.nix
|
./base/august-shutdown.nix
|
||||||
./base/boot.nix
|
./base/boot.nix
|
||||||
./base/env.nix
|
./base/env.nix
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
{
|
|
||||||
imports = [
|
|
||||||
../../module/power-policy.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
# Turn on as soon as we have power
|
|
||||||
power.policy = "always-on";
|
|
||||||
}
|
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
# Shutdown all machines on August 3rd at 22:00, so we can protect the
|
# Shutdown all machines on August 2nd at 11:00 AM, so we can protect the
|
||||||
# hardware from spurious electrical peaks on the yearly electrical cut for
|
# hardware from spurious electrical peaks on the yearly electrical cut for
|
||||||
# maintenance that starts on August 4th.
|
# maintenance that starts on August 4th.
|
||||||
systemd.timers.august-shutdown = {
|
systemd.timers.august-shutdown = {
|
||||||
description = "Shutdown on August 3rd for maintenance";
|
description = "Shutdown on August 2nd for maintenance";
|
||||||
wantedBy = [ "timers.target" ];
|
wantedBy = [ "timers.target" ];
|
||||||
timerConfig = {
|
timerConfig = {
|
||||||
OnCalendar = "*-08-03 22:00:00";
|
OnCalendar = "*-08-02 11:00:00";
|
||||||
RandomizedDelaySec = "10min";
|
RandomizedDelaySec = "10min";
|
||||||
Unit = "systemd-poweroff.service";
|
Unit = "systemd-poweroff.service";
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,8 +3,8 @@
|
|||||||
{
|
{
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
vim wget git htop tmux pciutils tcpdump ripgrep nix-index nixos-option
|
||||||
nix-diff ipmitool freeipmi ethtool lm_sensors cmake gnumake file tree
|
nix-diff ipmitool freeipmi ethtool lm_sensors ix cmake gnumake file tree
|
||||||
ncdu config.boot.kernelPackages.perf ldns pv
|
ncdu config.boot.kernelPackages.perf ldns
|
||||||
# From bscpkgs overlay
|
# From bscpkgs overlay
|
||||||
osumb
|
osumb
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
{ pkgs, lib, ... }:
|
{ pkgs, ... }:
|
||||||
|
|
||||||
{
|
{
|
||||||
networking = {
|
networking = {
|
||||||
@@ -10,14 +10,10 @@
|
|||||||
allowedTCPPorts = [ 22 ];
|
allowedTCPPorts = [ 22 ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Make sure we use iptables
|
|
||||||
nftables.enable = lib.mkForce false;
|
|
||||||
|
|
||||||
hosts = {
|
hosts = {
|
||||||
"84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
|
"84.88.53.236" = [ "ssfhead.bsc.es" "ssfhead" ];
|
||||||
|
"84.88.51.152" = [ "raccoon" ];
|
||||||
"84.88.51.142" = [ "raccoon-ipmi" ];
|
"84.88.51.142" = [ "raccoon-ipmi" ];
|
||||||
"192.168.11.12" = [ "bscpm04.bsc.es" ];
|
|
||||||
"192.168.11.15" = [ "gitlab-internal.bsc.es" ];
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,8 +6,6 @@
|
|||||||
(import ../../../pkgs/overlay.nix)
|
(import ../../../pkgs/overlay.nix)
|
||||||
];
|
];
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
|
|
||||||
nix = {
|
nix = {
|
||||||
nixPath = [
|
nixPath = [
|
||||||
"nixpkgs=${nixpkgs}"
|
"nixpkgs=${nixpkgs}"
|
||||||
|
|||||||
@@ -56,7 +56,7 @@
|
|||||||
home = "/home/Computational/rpenacob";
|
home = "/home/Computational/rpenacob";
|
||||||
description = "Raúl Peñacoba";
|
description = "Raúl Peñacoba";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "owl1" "owl2" "hut" "tent" "fox" ];
|
hosts = [ "owl1" "owl2" "hut" "tent" "fox" ];
|
||||||
hashedPassword = "$6$TZm3bDIFyPrMhj1E$uEDXoYYd1z2Wd5mMPfh3DZAjP7ztVjJ4ezIcn82C0ImqafPA.AnTmcVftHEzLB3tbe2O4SxDyPSDEQgJ4GOtj/";
|
hashedPassword = "$6$TZm3bDIFyPrMhj1E$uEDXoYYd1z2Wd5mMPfh3DZAjP7ztVjJ4ezIcn82C0ImqafPA.AnTmcVftHEzLB3tbe2O4SxDyPSDEQgJ4GOtj/";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFYfXg37mauGeurqsLpedgA2XQ9d4Nm0ZGo/hI1f7wwH rpenacob@bsc"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFYfXg37mauGeurqsLpedgA2XQ9d4Nm0ZGo/hI1f7wwH rpenacob@bsc"
|
||||||
@@ -69,10 +69,10 @@
|
|||||||
home = "/home/Computational/anavarro";
|
home = "/home/Computational/anavarro";
|
||||||
description = "Antoni Navarro";
|
description = "Antoni Navarro";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "hut" "tent" "raccoon" "fox" "weasel" ];
|
hosts = [ "hut" "tent" "raccoon" "fox" ];
|
||||||
hashedPassword = "$6$EgturvVYXlKgP43g$gTN78LLHIhaF8hsrCXD.O6mKnZSASWSJmCyndTX8QBWT6wTlUhcWVAKz65lFJPXjlJA4u7G1ydYQ0GG6Wk07b1";
|
hashedPassword = "$6$QdNDsuLehoZTYZlb$CDhCouYDPrhoiB7/seu7RF.Gqg4zMQz0n5sA4U1KDgHaZOxy2as9pbIGeF8tOHJKRoZajk5GiaZv0rZMn7Oq31";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMsbM21uepnJwPrRe6jYFz8zrZ6AYMtSEvvt4c9spmFP toni@delltoni"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILWjRSlKgzBPZQhIeEtk6Lvws2XNcYwHcwPv4osSgst5 anavarro@ssfhead"
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -82,7 +82,7 @@
|
|||||||
home = "/home/Computational/abonerib";
|
home = "/home/Computational/abonerib";
|
||||||
description = "Aleix Boné";
|
description = "Aleix Boné";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "owl1" "owl2" "hut" "tent" "raccoon" "fox" "weasel" ];
|
hosts = [ "owl1" "owl2" "hut" "tent" "raccoon" "fox" ];
|
||||||
hashedPassword = "$6$V1EQWJr474whv7XJ$OfJ0wueM2l.dgiJiiah0Tip9ITcJ7S7qDvtSycsiQ43QBFyP4lU0e0HaXWps85nqB4TypttYR4hNLoz3bz662/";
|
hashedPassword = "$6$V1EQWJr474whv7XJ$OfJ0wueM2l.dgiJiiah0Tip9ITcJ7S7qDvtSycsiQ43QBFyP4lU0e0HaXWps85nqB4TypttYR4hNLoz3bz662/";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIFiqXqt88VuUfyANkZyLJNiuroIITaGlOOTMhVDKjf abonerib@bsc"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIFiqXqt88VuUfyANkZyLJNiuroIITaGlOOTMhVDKjf abonerib@bsc"
|
||||||
@@ -95,7 +95,7 @@
|
|||||||
home = "/home/Computational/vlopez";
|
home = "/home/Computational/vlopez";
|
||||||
description = "Victor López";
|
description = "Victor López";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "koro" ];
|
hosts = [ "koro" ];
|
||||||
hashedPassword = "$6$0ZBkgIYE/renVqtt$1uWlJsb0FEezRVNoETTzZMx4X2SvWiOsKvi0ppWCRqI66S6TqMBXBdP4fcQyvRRBt0e4Z7opZIvvITBsEtO0f0";
|
hashedPassword = "$6$0ZBkgIYE/renVqtt$1uWlJsb0FEezRVNoETTzZMx4X2SvWiOsKvi0ppWCRqI66S6TqMBXBdP4fcQyvRRBt0e4Z7opZIvvITBsEtO0f0";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGMwlUZRf9jfG666Qa5Sb+KtEhXqkiMlBV2su3x/dXHq victor@arch"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGMwlUZRf9jfG666Qa5Sb+KtEhXqkiMlBV2su3x/dXHq victor@arch"
|
||||||
@@ -108,7 +108,7 @@
|
|||||||
home = "/home/Computational/dbautist";
|
home = "/home/Computational/dbautist";
|
||||||
description = "Dylan Bautista Cases";
|
description = "Dylan Bautista Cases";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "hut" "tent" "raccoon" ];
|
hosts = [ "hut" "tent" "raccoon" ];
|
||||||
hashedPassword = "$6$a2lpzMRVkG9nSgIm$12G6.ka0sFX1YimqJkBAjbvhRKZ.Hl090B27pdbnQOW0wzyxVWySWhyDDCILjQELky.HKYl9gqOeVXW49nW7q/";
|
hashedPassword = "$6$a2lpzMRVkG9nSgIm$12G6.ka0sFX1YimqJkBAjbvhRKZ.Hl090B27pdbnQOW0wzyxVWySWhyDDCILjQELky.HKYl9gqOeVXW49nW7q/";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAb+EQBoS98zrCwnGKkHKwMLdYABMTqv7q9E0+T0QmkS dbautist@bsc-848818791"
|
||||||
@@ -121,7 +121,7 @@
|
|||||||
home = "/home/Computational/dalvare1";
|
home = "/home/Computational/dalvare1";
|
||||||
description = "David Álvarez";
|
description = "David Álvarez";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "hut" "tent" "fox" ];
|
hosts = [ "hut" "tent" "fox" ];
|
||||||
hashedPassword = "$6$mpyIsV3mdq.rK8$FvfZdRH5OcEkUt5PnIUijWyUYZvB1SgeqxpJ2p91TTe.3eQIDTcLEQ5rxeg.e5IEXAZHHQ/aMsR5kPEujEghx0";
|
hashedPassword = "$6$mpyIsV3mdq.rK8$FvfZdRH5OcEkUt5PnIUijWyUYZvB1SgeqxpJ2p91TTe.3eQIDTcLEQ5rxeg.e5IEXAZHHQ/aMsR5kPEujEghx0";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGEfy6F4rF80r4Cpo2H5xaWqhuUZzUsVsILSKGJzt5jF dalvare1@ssfhead"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGEfy6F4rF80r4Cpo2H5xaWqhuUZzUsVsILSKGJzt5jF dalvare1@ssfhead"
|
||||||
@@ -134,7 +134,7 @@
|
|||||||
home = "/home/Computational/varcila";
|
home = "/home/Computational/varcila";
|
||||||
description = "Vincent Arcila";
|
description = "Vincent Arcila";
|
||||||
group = "Computational";
|
group = "Computational";
|
||||||
hosts = [ "apex" "hut" "tent" "fox" ];
|
hosts = [ "hut" "tent" "fox" ];
|
||||||
hashedPassword = "$6$oB0Tcn99DcM4Ch$Vn1A0ulLTn/8B2oFPi9wWl/NOsJzaFAWjqekwcuC9sMC7cgxEVb.Nk5XSzQ2xzYcNe5MLtmzkVYnRS1CqP39Y0";
|
hashedPassword = "$6$oB0Tcn99DcM4Ch$Vn1A0ulLTn/8B2oFPi9wWl/NOsJzaFAWjqekwcuC9sMC7cgxEVb.Nk5XSzQ2xzYcNe5MLtmzkVYnRS1CqP39Y0";
|
||||||
openssh.authorizedKeys.keys = [
|
openssh.authorizedKeys.keys = [
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGt0ESYxekBiHJQowmKpfdouw0hVm3N7tUMtAaeLejK vincent@varch"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGt0ESYxekBiHJQowmKpfdouw0hVm3N7tUMtAaeLejK vincent@varch"
|
||||||
@@ -154,32 +154,6 @@
|
|||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIV5LEAII5rfe1hYqDYIIrhb1gOw7RcS1p2mhOTqG+zc pedro@pedro-ThinkPad-P14s-Gen-2a"
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
csiringo = {
|
|
||||||
uid = 9653;
|
|
||||||
isNormalUser = true;
|
|
||||||
home = "/home/Computational/csiringo";
|
|
||||||
description = "Cesare Siringo";
|
|
||||||
group = "Computational";
|
|
||||||
hosts = [ ];
|
|
||||||
hashedPassword = "$6$0IsZlju8jFukLlAw$VKm0FUXbS.mVmPm3rcJeizTNU4IM5Nmmy21BvzFL.cQwvlGwFI1YWRQm6gsbd4nbg47mPDvYkr/ar0SlgF6GO1";
|
|
||||||
openssh.authorizedKeys.keys = [
|
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHA65zvvG50iuFEMf+guRwZB65jlGXfGLF4HO+THFaed csiringo@bsc.es"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
acinca = {
|
|
||||||
uid = 9654;
|
|
||||||
isNormalUser = true;
|
|
||||||
home = "/home/Computational/acinca";
|
|
||||||
description = "Arnau Cinca";
|
|
||||||
group = "Computational";
|
|
||||||
hosts = [ "apex" "hut" "fox" "owl1" "owl2" ];
|
|
||||||
hashedPassword = "$6$S6PUeRpdzYlidxzI$szyvWejQ4hEN76yBYhp1diVO5ew1FFg.cz4lKiXt2Idy4XdpifwrFTCIzLTs5dvYlR62m7ekA5MrhcVxR5F/q/";
|
|
||||||
openssh.authorizedKeys.keys = [
|
|
||||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFmMqKqPg4uocNOr3O41kLbZMOMJn3m2ZdN1JvTR96z3 bsccns@arnau-bsc"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
groups = {
|
groups = {
|
||||||
|
|||||||
@@ -3,8 +3,7 @@
|
|||||||
imports = [
|
imports = [
|
||||||
./xeon.nix
|
./xeon.nix
|
||||||
./ssf/fs.nix
|
./ssf/fs.nix
|
||||||
./ssf/hosts.nix
|
|
||||||
./ssf/hosts-remote.nix
|
|
||||||
./ssf/net.nix
|
./ssf/net.nix
|
||||||
|
./ssf/ssh.nix
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.hosts = {
|
|
||||||
# Remote hosts visible from compute nodes
|
|
||||||
"10.106.0.236" = [ "raccoon" ];
|
|
||||||
"10.0.44.4" = [ "tent" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.hosts = {
|
|
||||||
# Login
|
|
||||||
"10.0.40.30" = [ "apex" ];
|
|
||||||
|
|
||||||
# Storage
|
|
||||||
"10.0.40.40" = [ "bay" ]; "10.0.42.40" = [ "bay-ib" ]; "10.0.40.141" = [ "bay-ipmi" ];
|
|
||||||
"10.0.40.41" = [ "oss01" ]; "10.0.42.41" = [ "oss01-ib0" ]; "10.0.40.142" = [ "oss01-ipmi" ];
|
|
||||||
"10.0.40.42" = [ "lake2" ]; "10.0.42.42" = [ "lake2-ib" ]; "10.0.40.143" = [ "lake2-ipmi" ];
|
|
||||||
|
|
||||||
# Xeon compute
|
|
||||||
"10.0.40.1" = [ "owl1" ]; "10.0.42.1" = [ "owl1-ib" ]; "10.0.40.101" = [ "owl1-ipmi" ];
|
|
||||||
"10.0.40.2" = [ "owl2" ]; "10.0.42.2" = [ "owl2-ib" ]; "10.0.40.102" = [ "owl2-ipmi" ];
|
|
||||||
"10.0.40.3" = [ "xeon03" ]; "10.0.42.3" = [ "xeon03-ib" ]; "10.0.40.103" = [ "xeon03-ipmi" ];
|
|
||||||
#"10.0.40.4" = [ "tent" ]; "10.0.42.4" = [ "tent-ib" ]; "10.0.40.104" = [ "tent-ipmi" ];
|
|
||||||
"10.0.40.5" = [ "koro" ]; "10.0.42.5" = [ "koro-ib" ]; "10.0.40.105" = [ "koro-ipmi" ];
|
|
||||||
"10.0.40.6" = [ "weasel" ]; "10.0.42.6" = [ "weasel-ib" ]; "10.0.40.106" = [ "weasel-ipmi" ];
|
|
||||||
"10.0.40.7" = [ "hut" ]; "10.0.42.7" = [ "hut-ib" ]; "10.0.40.107" = [ "hut-ipmi" ];
|
|
||||||
"10.0.40.8" = [ "eudy" ]; "10.0.42.8" = [ "eudy-ib" ]; "10.0.40.108" = [ "eudy-ipmi" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -9,6 +9,14 @@
|
|||||||
defaultGateway = "10.0.40.30";
|
defaultGateway = "10.0.40.30";
|
||||||
nameservers = ["8.8.8.8"];
|
nameservers = ["8.8.8.8"];
|
||||||
|
|
||||||
|
proxy = {
|
||||||
|
default = "http://hut:23080/";
|
||||||
|
noProxy = "127.0.0.1,localhost,internal.domain,10.0.40.40,hut";
|
||||||
|
# Don't set all_proxy as go complains and breaks the gitlab runner, see:
|
||||||
|
# https://github.com/golang/go/issues/16715
|
||||||
|
allProxy = null;
|
||||||
|
};
|
||||||
|
|
||||||
firewall = {
|
firewall = {
|
||||||
extraCommands = ''
|
extraCommands = ''
|
||||||
# Prevent ssfhead from contacting our slurmd daemon
|
# Prevent ssfhead from contacting our slurmd daemon
|
||||||
@@ -19,5 +27,64 @@
|
|||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
|
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
extraHosts = ''
|
||||||
|
10.0.40.30 ssfhead
|
||||||
|
|
||||||
|
# Node Entry for node: mds01 (ID=72)
|
||||||
|
10.0.40.40 bay mds01 mds01-eth0
|
||||||
|
10.0.42.40 bay-ib mds01-ib0
|
||||||
|
10.0.40.141 bay-ipmi mds01-ipmi0 mds01-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: oss01 (ID=73)
|
||||||
|
10.0.40.41 oss01 oss01-eth0
|
||||||
|
10.0.42.41 oss01-ib0
|
||||||
|
10.0.40.142 oss01-ipmi0 oss01-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: oss02 (ID=74)
|
||||||
|
10.0.40.42 lake2 oss02 oss02-eth0
|
||||||
|
10.0.42.42 lake2-ib oss02-ib0
|
||||||
|
10.0.40.143 lake2-ipmi oss02-ipmi0 oss02-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon01 (ID=15)
|
||||||
|
10.0.40.1 owl1 xeon01 xeon01-eth0
|
||||||
|
10.0.42.1 owl1-ib xeon01-ib0
|
||||||
|
10.0.40.101 owl1-ipmi xeon01-ipmi0 xeon01-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon02 (ID=16)
|
||||||
|
10.0.40.2 owl2 xeon02 xeon02-eth0
|
||||||
|
10.0.42.2 owl2-ib xeon02-ib0
|
||||||
|
10.0.40.102 owl2-ipmi xeon02-ipmi0 xeon02-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon03 (ID=17)
|
||||||
|
10.0.40.3 xeon03 xeon03-eth0
|
||||||
|
10.0.42.3 xeon03-ib0
|
||||||
|
10.0.40.103 xeon03-ipmi0 xeon03-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon04 (ID=18)
|
||||||
|
10.0.40.4 xeon04 xeon04-eth0
|
||||||
|
10.0.42.4 xeon04-ib0
|
||||||
|
10.0.40.104 xeon04-ipmi0 xeon04-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon05 (ID=19)
|
||||||
|
10.0.40.5 koro xeon05 xeon05-eth0
|
||||||
|
10.0.42.5 koro-ib xeon05-ib0
|
||||||
|
10.0.40.105 koro-ipmi xeon05-ipmi0
|
||||||
|
|
||||||
|
# Node Entry for node: xeon06 (ID=20)
|
||||||
|
10.0.40.6 xeon06 xeon06-eth0
|
||||||
|
10.0.42.6 xeon06-ib0
|
||||||
|
10.0.40.106 xeon06-ipmi0 xeon06-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon07 (ID=21)
|
||||||
|
10.0.40.7 hut xeon07 xeon07-eth0
|
||||||
|
10.0.42.7 hut-ib xeon07-ib0
|
||||||
|
10.0.40.107 hut-ipmi xeon07-ipmi0 xeon07-ipmi
|
||||||
|
|
||||||
|
# Node Entry for node: xeon08 (ID=22)
|
||||||
|
10.0.40.8 eudy xeon08 xeon08-eth0
|
||||||
|
10.0.42.8 eudy-ib xeon08-ib0
|
||||||
|
10.0.40.108 eudy-ipmi xeon08-ipmi0 xeon08-ipmi
|
||||||
|
'';
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
8
m/common/ssf/ssh.nix
Normal file
8
m/common/ssf/ssh.nix
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
# Connect to intranet git hosts via proxy
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
# Connect to BSC machines via hut proxy too
|
||||||
|
Host amdlogin1.bsc.es armlogin1.bsc.es hualogin1.bsc.es glogin1.bsc.es glogin2.bsc.es fpgalogin1.bsc.es
|
||||||
|
ProxyCommand nc -X connect -x hut:23080 %h %p
|
||||||
|
'';
|
||||||
|
}
|
||||||
@@ -9,7 +9,6 @@
|
|||||||
./cpufreq.nix
|
./cpufreq.nix
|
||||||
./fs.nix
|
./fs.nix
|
||||||
./users.nix
|
./users.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -4,18 +4,9 @@
|
|||||||
imports = [
|
imports = [
|
||||||
../common/base.nix
|
../common/base.nix
|
||||||
../common/xeon/console.nix
|
../common/xeon/console.nix
|
||||||
../module/amd-uprof.nix
|
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/nvidia.nix
|
|
||||||
../module/slurm-client.nix
|
|
||||||
../module/hut-substituter.nix
|
|
||||||
./wireguard.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
# Don't turn off in August as UPC has different dates.
|
|
||||||
# Fox works fine on power cuts.
|
|
||||||
systemd.timers.august-shutdown.enable = false;
|
|
||||||
|
|
||||||
# Select the disk using the ID to avoid mismatches
|
# Select the disk using the ID to avoid mismatches
|
||||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";
|
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x500a07514b0c1103";
|
||||||
|
|
||||||
@@ -23,7 +14,7 @@
|
|||||||
swapDevices = lib.mkForce [];
|
swapDevices = lib.mkForce [];
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
|
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usbhid" "usb_storage" "sd_mod" ];
|
||||||
boot.kernelModules = [ "kvm-amd" "amd_uncore" "amd_hsmp" ];
|
boot.kernelModules = [ "kvm-amd" "amd_uncore" ];
|
||||||
|
|
||||||
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
|
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
|
||||||
hardware.cpu.intel.updateMicrocode = lib.mkForce false;
|
hardware.cpu.intel.updateMicrocode = lib.mkForce false;
|
||||||
@@ -31,21 +22,14 @@
|
|||||||
# Use performance for benchmarks
|
# Use performance for benchmarks
|
||||||
powerManagement.cpuFreqGovernor = "performance";
|
powerManagement.cpuFreqGovernor = "performance";
|
||||||
|
|
||||||
services.amd-uprof.enable = true;
|
|
||||||
|
|
||||||
# Disable NUMA balancing
|
# Disable NUMA balancing
|
||||||
boot.kernel.sysctl."kernel.numa_balancing" = 0;
|
boot.kernel.sysctl."kernel.numa_balancing" = 0;
|
||||||
|
|
||||||
# Expose kernel addresses
|
# Expose kernel addresses
|
||||||
boot.kernel.sysctl."kernel.kptr_restrict" = 0;
|
boot.kernel.sysctl."kernel.kptr_restrict" = 0;
|
||||||
|
|
||||||
# Disable NMI watchdog to save one hw counter (for AMD uProf)
|
|
||||||
boot.kernel.sysctl."kernel.nmi_watchdog" = 0;
|
|
||||||
|
|
||||||
services.openssh.settings.X11Forwarding = true;
|
services.openssh.settings.X11Forwarding = true;
|
||||||
|
|
||||||
services.fail2ban.enable = true;
|
|
||||||
|
|
||||||
networking = {
|
networking = {
|
||||||
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
|
timeServers = [ "ntp1.upc.edu" "ntp2.upc.edu" ];
|
||||||
hostName = "fox";
|
hostName = "fox";
|
||||||
@@ -63,20 +47,23 @@
|
|||||||
interfaces.enp1s0f0np0.useDHCP = true;
|
interfaces.enp1s0f0np0.useDHCP = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
# Recommended for new graphics cards
|
# Use hut for cache
|
||||||
hardware.nvidia.open = true;
|
nix.settings = {
|
||||||
|
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||||
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Configure Nvidia driver to use with CUDA
|
||||||
|
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
||||||
|
hardware.graphics.enable = true;
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
nixpkgs.config.nvidia.acceptLicense = true;
|
||||||
|
services.xserver.videoDrivers = [ "nvidia" ];
|
||||||
|
|
||||||
# Mount NVME disks
|
# Mount NVME disks
|
||||||
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
|
fileSystems."/nvme0" = { device = "/dev/disk/by-label/nvme0"; fsType = "ext4"; };
|
||||||
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
|
fileSystems."/nvme1" = { device = "/dev/disk/by-label/nvme1"; fsType = "ext4"; };
|
||||||
|
|
||||||
# Mount the NFS home
|
|
||||||
fileSystems."/nfs/home" = {
|
|
||||||
device = "10.106.0.30:/home";
|
|
||||||
fsType = "nfs";
|
|
||||||
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
# Make a /nvme{0,1}/$USER directory for each user.
|
# Make a /nvme{0,1}/$USER directory for each user.
|
||||||
systemd.services.create-nvme-dirs = let
|
systemd.services.create-nvme-dirs = let
|
||||||
# Take only normal users in fox
|
# Take only normal users in fox
|
||||||
@@ -93,20 +80,4 @@
|
|||||||
wantedBy = [ "multi-user.target" ];
|
wantedBy = [ "multi-user.target" ];
|
||||||
serviceConfig.ExecStart = script;
|
serviceConfig.ExecStart = script;
|
||||||
};
|
};
|
||||||
|
|
||||||
# Only allow SSH connections from users who have a SLURM allocation
|
|
||||||
# See: https://slurm.schedmd.com/pam_slurm_adopt.html
|
|
||||||
security.pam.services.sshd.rules.account.slurm = {
|
|
||||||
control = "required";
|
|
||||||
enable = true;
|
|
||||||
modulePath = "${pkgs.slurm}/lib/security/pam_slurm_adopt.so";
|
|
||||||
args = [ "log_level=debug5" ];
|
|
||||||
order = 999999; # Make it last one
|
|
||||||
};
|
|
||||||
|
|
||||||
# Disable systemd session (pam_systemd.so) as it will conflict with the
|
|
||||||
# pam_slurm_adopt.so module. What happens is that the shell is first adopted
|
|
||||||
# into the slurmstepd task and then into the systemd session, which is not
|
|
||||||
# what we want, otherwise it will linger even if all jobs are gone.
|
|
||||||
security.pam.services.sshd.startSession = lib.mkForce false;
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,53 +0,0 @@
|
|||||||
{ config, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.firewall = {
|
|
||||||
allowedUDPPorts = [ 666 ];
|
|
||||||
};
|
|
||||||
|
|
||||||
age.secrets.wgFox.file = ../../secrets/wg-fox.age;
|
|
||||||
|
|
||||||
networking.wireguard.enable = true;
|
|
||||||
networking.wireguard.interfaces = {
|
|
||||||
# "wg0" is the network interface name. You can name the interface arbitrarily.
|
|
||||||
wg0 = {
|
|
||||||
# Determines the IP address and subnet of the server's end of the tunnel interface.
|
|
||||||
ips = [ "10.106.0.1/24" ];
|
|
||||||
|
|
||||||
# The port that WireGuard listens to. Must be accessible by the client.
|
|
||||||
listenPort = 666;
|
|
||||||
|
|
||||||
# Path to the private key file.
|
|
||||||
privateKeyFile = config.age.secrets.wgFox.path;
|
|
||||||
# Public key: VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=
|
|
||||||
|
|
||||||
peers = [
|
|
||||||
# List of allowed peers.
|
|
||||||
{
|
|
||||||
name = "apex";
|
|
||||||
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
|
|
||||||
# List of IPs assigned to this peer within the tunnel subnet. Used to configure routing.
|
|
||||||
allowedIPs = [ "10.106.0.30/32" ];
|
|
||||||
}
|
|
||||||
{
|
|
||||||
name = "raccoon";
|
|
||||||
publicKey = "QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=";
|
|
||||||
allowedIPs = [ "10.106.0.236/32" "192.168.0.0/16" "10.0.44.0/24" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hosts = {
|
|
||||||
"10.106.0.30" = [ "apex" ];
|
|
||||||
"10.106.0.236" = [ "raccoon" ];
|
|
||||||
"10.0.44.4" = [ "tent" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.firewall = {
|
|
||||||
extraCommands = ''
|
|
||||||
# Accept slurm connections to slurmd from apex (via wireguard)
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.30/32 -d 10.106.0.1/32 --dport 6818 -j nixos-fw-accept
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -3,12 +3,160 @@ modules:
|
|||||||
prober: http
|
prober: http
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
http:
|
http:
|
||||||
|
proxy_url: "http://127.0.0.1:23080"
|
||||||
|
skip_resolve_phase_with_proxy: true
|
||||||
follow_redirects: true
|
follow_redirects: true
|
||||||
preferred_ip_protocol: "ip4"
|
|
||||||
valid_status_codes: [] # Defaults to 2xx
|
valid_status_codes: [] # Defaults to 2xx
|
||||||
method: GET
|
method: GET
|
||||||
|
http_with_proxy:
|
||||||
|
prober: http
|
||||||
|
http:
|
||||||
|
proxy_url: "http://127.0.0.1:3128"
|
||||||
|
skip_resolve_phase_with_proxy: true
|
||||||
|
http_with_proxy_and_headers:
|
||||||
|
prober: http
|
||||||
|
http:
|
||||||
|
proxy_url: "http://127.0.0.1:3128"
|
||||||
|
proxy_connect_header:
|
||||||
|
Proxy-Authorization:
|
||||||
|
- Bearer token
|
||||||
|
http_post_2xx:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
method: POST
|
||||||
|
headers:
|
||||||
|
Content-Type: application/json
|
||||||
|
body: '{}'
|
||||||
|
http_post_body_file:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
method: POST
|
||||||
|
body_file: "/files/body.txt"
|
||||||
|
http_basic_auth_example:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
method: POST
|
||||||
|
headers:
|
||||||
|
Host: "login.example.com"
|
||||||
|
basic_auth:
|
||||||
|
username: "username"
|
||||||
|
password: "mysecret"
|
||||||
|
http_2xx_oauth_client_credentials:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
valid_http_versions: ["HTTP/1.1", "HTTP/2"]
|
||||||
|
follow_redirects: true
|
||||||
|
preferred_ip_protocol: "ip4"
|
||||||
|
valid_status_codes:
|
||||||
|
- 200
|
||||||
|
- 201
|
||||||
|
oauth2:
|
||||||
|
client_id: "client_id"
|
||||||
|
client_secret: "client_secret"
|
||||||
|
token_url: "https://api.example.com/token"
|
||||||
|
endpoint_params:
|
||||||
|
grant_type: "client_credentials"
|
||||||
|
http_custom_ca_example:
|
||||||
|
prober: http
|
||||||
|
http:
|
||||||
|
method: GET
|
||||||
|
tls_config:
|
||||||
|
ca_file: "/certs/my_cert.crt"
|
||||||
|
http_gzip:
|
||||||
|
prober: http
|
||||||
|
http:
|
||||||
|
method: GET
|
||||||
|
compression: gzip
|
||||||
|
http_gzip_with_accept_encoding:
|
||||||
|
prober: http
|
||||||
|
http:
|
||||||
|
method: GET
|
||||||
|
compression: gzip
|
||||||
|
headers:
|
||||||
|
Accept-Encoding: gzip
|
||||||
|
tls_connect:
|
||||||
|
prober: tcp
|
||||||
|
timeout: 5s
|
||||||
|
tcp:
|
||||||
|
tls: true
|
||||||
|
tcp_connect_example:
|
||||||
|
prober: tcp
|
||||||
|
timeout: 5s
|
||||||
|
imap_starttls:
|
||||||
|
prober: tcp
|
||||||
|
timeout: 5s
|
||||||
|
tcp:
|
||||||
|
query_response:
|
||||||
|
- expect: "OK.*STARTTLS"
|
||||||
|
- send: ". STARTTLS"
|
||||||
|
- expect: "OK"
|
||||||
|
- starttls: true
|
||||||
|
- send: ". capability"
|
||||||
|
- expect: "CAPABILITY IMAP4rev1"
|
||||||
|
smtp_starttls:
|
||||||
|
prober: tcp
|
||||||
|
timeout: 5s
|
||||||
|
tcp:
|
||||||
|
query_response:
|
||||||
|
- expect: "^220 ([^ ]+) ESMTP (.+)$"
|
||||||
|
- send: "EHLO prober\r"
|
||||||
|
- expect: "^250-STARTTLS"
|
||||||
|
- send: "STARTTLS\r"
|
||||||
|
- expect: "^220"
|
||||||
|
- starttls: true
|
||||||
|
- send: "EHLO prober\r"
|
||||||
|
- expect: "^250-AUTH"
|
||||||
|
- send: "QUIT\r"
|
||||||
|
irc_banner_example:
|
||||||
|
prober: tcp
|
||||||
|
timeout: 5s
|
||||||
|
tcp:
|
||||||
|
query_response:
|
||||||
|
- send: "NICK prober"
|
||||||
|
- send: "USER prober prober prober :prober"
|
||||||
|
- expect: "PING :([^ ]+)"
|
||||||
|
send: "PONG ${1}"
|
||||||
|
- expect: "^:[^ ]+ 001"
|
||||||
icmp:
|
icmp:
|
||||||
prober: icmp
|
prober: icmp
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
icmp:
|
icmp:
|
||||||
preferred_ip_protocol: "ip4"
|
preferred_ip_protocol: "ip4"
|
||||||
|
dns_udp_example:
|
||||||
|
prober: dns
|
||||||
|
timeout: 5s
|
||||||
|
dns:
|
||||||
|
query_name: "www.prometheus.io"
|
||||||
|
query_type: "A"
|
||||||
|
valid_rcodes:
|
||||||
|
- NOERROR
|
||||||
|
validate_answer_rrs:
|
||||||
|
fail_if_matches_regexp:
|
||||||
|
- ".*127.0.0.1"
|
||||||
|
fail_if_all_match_regexp:
|
||||||
|
- ".*127.0.0.1"
|
||||||
|
fail_if_not_matches_regexp:
|
||||||
|
- "www.prometheus.io.\t300\tIN\tA\t127.0.0.1"
|
||||||
|
fail_if_none_matches_regexp:
|
||||||
|
- "127.0.0.1"
|
||||||
|
validate_authority_rrs:
|
||||||
|
fail_if_matches_regexp:
|
||||||
|
- ".*127.0.0.1"
|
||||||
|
validate_additional_rrs:
|
||||||
|
fail_if_matches_regexp:
|
||||||
|
- ".*127.0.0.1"
|
||||||
|
dns_soa:
|
||||||
|
prober: dns
|
||||||
|
dns:
|
||||||
|
query_name: "prometheus.io"
|
||||||
|
query_type: "SOA"
|
||||||
|
dns_tcp_example:
|
||||||
|
prober: dns
|
||||||
|
dns:
|
||||||
|
transport_protocol: "tcp" # defaults to "udp"
|
||||||
|
preferred_ip_protocol: "ip4" # defaults to "ip6"
|
||||||
|
query_name: "www.prometheus.io"
|
||||||
|
|||||||
@@ -7,9 +7,11 @@
|
|||||||
../module/ceph.nix
|
../module/ceph.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
|
../module/slurm-client.nix
|
||||||
./gitlab-runner.nix
|
./gitlab-runner.nix
|
||||||
./monitoring.nix
|
./monitoring.nix
|
||||||
./nfs.nix
|
./nfs.nix
|
||||||
|
./slurm-server.nix
|
||||||
./nix-serve.nix
|
./nix-serve.nix
|
||||||
./public-inbox.nix
|
./public-inbox.nix
|
||||||
./gitea.nix
|
./gitea.nix
|
||||||
|
|||||||
@@ -2,10 +2,20 @@
|
|||||||
|
|
||||||
N=500
|
N=500
|
||||||
|
|
||||||
t=$(timeout 5 ssh bsc015557@glogin2.bsc.es "timeout 3 command time -f %e touch /gpfs/projects/bsc15/bsc015557/gpfs.{1..$N} 2>&1; rm -f /gpfs/projects/bsc15/bsc015557/gpfs.{1..$N}")
|
t_proj=$(timeout 5 ssh bsc015557@glogin2.bsc.es "timeout 3 command time -f %e touch /gpfs/projects/bsc15/bsc015557/gpfs.{1..$N} 2>&1; rm -f /gpfs/projects/bsc15/bsc015557/gpfs.{1..$N}")
|
||||||
|
t_scratch=$(timeout 5 ssh bsc015557@glogin2.bsc.es "timeout 3 command time -f %e touch /gpfs/scratch/bsc15/rodrigo/probe/gpfs.{1..$N} 2>&1; rm -f /gpfs/scratch/bsc15/rodrigo/probe/gpfs.{1..$N}")
|
||||||
|
t_home=$(timeout 5 ssh bsc015557@glogin2.bsc.es "timeout 3 command time -f %e touch /home/bsc/bsc015557/.gpfs/{1..$N} 2>&1; rm -f /home/bsc/bsc015557/.gpfs/{1..$N}")
|
||||||
|
|
||||||
if [ -z "$t" ]; then
|
if [ -z "$t_proj" ]; then
|
||||||
t="5.00"
|
t_proj="5.00"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$t_scratch" ]; then
|
||||||
|
t_scratch="5.00"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$t_home" ]; then
|
||||||
|
t_home="5.00"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
@@ -14,5 +24,7 @@ Content-Type: text/plain; version=0.0.4; charset=utf-8; escaping=values
|
|||||||
|
|
||||||
# HELP gpfs_touch_latency Time to create $N files.
|
# HELP gpfs_touch_latency Time to create $N files.
|
||||||
# TYPE gpfs_touch_latency gauge
|
# TYPE gpfs_touch_latency gauge
|
||||||
gpfs_touch_latency $t
|
gpfs_touch_latency{partition="projects"} $t_proj
|
||||||
|
gpfs_touch_latency{partition="home"} $t_home
|
||||||
|
gpfs_touch_latency{partition="scratch"} $t_scratch
|
||||||
EOF
|
EOF
|
||||||
|
|||||||
@@ -267,6 +267,14 @@
|
|||||||
}
|
}
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
job_name = "tent";
|
||||||
|
static_configs = [
|
||||||
|
{
|
||||||
|
targets = [ "127.0.0.1:29002" ]; # Node exporter
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,13 +2,10 @@
|
|||||||
let
|
let
|
||||||
website = pkgs.stdenv.mkDerivation {
|
website = pkgs.stdenv.mkDerivation {
|
||||||
name = "jungle-web";
|
name = "jungle-web";
|
||||||
src = pkgs.fetchgit {
|
src = theFlake;
|
||||||
url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
|
|
||||||
rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
|
|
||||||
hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
|
|
||||||
};
|
|
||||||
buildInputs = [ pkgs.hugo ];
|
buildInputs = [ pkgs.hugo ];
|
||||||
buildPhase = ''
|
buildPhase = ''
|
||||||
|
cd web
|
||||||
rm -rf public/
|
rm -rf public/
|
||||||
hugo
|
hugo
|
||||||
'';
|
'';
|
||||||
|
|||||||
7
m/hut/slurm-server.nix
Normal file
7
m/hut/slurm-server.nix
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{ ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
services.slurm = {
|
||||||
|
server.enable = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -4,7 +4,7 @@
|
|||||||
- xeon03-ipmi
|
- xeon03-ipmi
|
||||||
- xeon04-ipmi
|
- xeon04-ipmi
|
||||||
- koro-ipmi
|
- koro-ipmi
|
||||||
- weasel-ipmi
|
- xeon06-ipmi
|
||||||
- hut-ipmi
|
- hut-ipmi
|
||||||
- eudy-ipmi
|
- eudy-ipmi
|
||||||
# Storage
|
# Storage
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
imports = [
|
imports = [
|
||||||
../common/ssf.nix
|
../common/ssf.nix
|
||||||
../module/monitoring.nix
|
../module/monitoring.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
|
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d53563a";
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
switch-opa = { pos=41; size=1; };
|
switch-opa = { pos=41; size=1; };
|
||||||
|
|
||||||
# SSF login
|
# SSF login
|
||||||
apex = { pos=39; size=2; label="SSFHEAD"; board="R2208WTTYSR"; contact="rodrigo.arias@bsc.es"; };
|
ssfhead = { pos=39; size=2; label="SSFHEAD"; board="R2208WTTYSR"; contact="operations@bsc.es"; };
|
||||||
|
|
||||||
# Storage
|
# Storage
|
||||||
bay = { pos=38; size=1; label="MDS01"; board="S2600WT2R"; sn="BQWL64850303"; contact="rodrigo.arias@bsc.es"; };
|
bay = { pos=38; size=1; label="MDS01"; board="S2600WT2R"; sn="BQWL64850303"; contact="rodrigo.arias@bsc.es"; };
|
||||||
@@ -19,7 +19,7 @@
|
|||||||
xeon03 = { pos=33; size=1; label="SSF-XEON03"; board="S2600WTTR"; sn="BQWL64750826"; contact="rodrigo.arias@bsc.es"; };
|
xeon03 = { pos=33; size=1; label="SSF-XEON03"; board="S2600WTTR"; sn="BQWL64750826"; contact="rodrigo.arias@bsc.es"; };
|
||||||
# Slot 34 empty
|
# Slot 34 empty
|
||||||
koro = { pos=31; size=1; label="SSF-XEON05"; board="S2600WTTR"; sn="BQWL64954293"; contact="rodrigo.arias@bsc.es"; };
|
koro = { pos=31; size=1; label="SSF-XEON05"; board="S2600WTTR"; sn="BQWL64954293"; contact="rodrigo.arias@bsc.es"; };
|
||||||
weasel = { pos=30; size=1; label="SSF-XEON06"; board="S2600WTTR"; sn="BQWL64750846"; contact="antoni.navarro@bsc.es"; };
|
xeon06 = { pos=30; size=1; label="SSF-XEON06"; board="S2600WTTR"; sn="BQWL64750846"; contact="antoni.navarro@bsc.es"; };
|
||||||
hut = { pos=29; size=1; label="SSF-XEON07"; board="S2600WTTR"; sn="BQWL64751184"; contact="rodrigo.arias@bsc.es"; };
|
hut = { pos=29; size=1; label="SSF-XEON07"; board="S2600WTTR"; sn="BQWL64751184"; contact="rodrigo.arias@bsc.es"; };
|
||||||
eudy = { pos=28; size=1; label="SSF-XEON08"; board="S2600WTTR"; sn="BQWL64756586"; contact="aleix.rocanonell@bsc.es"; };
|
eudy = { pos=28; size=1; label="SSF-XEON08"; board="S2600WTTR"; sn="BQWL64756586"; contact="aleix.rocanonell@bsc.es"; };
|
||||||
|
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
options = {
|
|
||||||
services.amd-uprof = {
|
|
||||||
enable = lib.mkOption {
|
|
||||||
type = lib.types.bool;
|
|
||||||
default = false;
|
|
||||||
description = "Whether to enable AMD uProf.";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# Only setup amd-uprof if enabled
|
|
||||||
config = lib.mkIf config.services.amd-uprof.enable {
|
|
||||||
|
|
||||||
# First make sure that we add the module to the list of available modules
|
|
||||||
# in the kernel matching the same kernel version of this configuration.
|
|
||||||
boot.extraModulePackages = with config.boot.kernelPackages; [ amd-uprof-driver ];
|
|
||||||
boot.kernelModules = [ "AMDPowerProfiler" ];
|
|
||||||
|
|
||||||
# Make the userspace tools available in $PATH.
|
|
||||||
environment.systemPackages = with pkgs; [ amd-uprof ];
|
|
||||||
|
|
||||||
# The AMDPowerProfiler module doesn't create the /dev device, nor does it emit
|
|
||||||
# any uevents, so we cannot use udev rules to automatically create the
|
|
||||||
# device. Instead, we run a systemd unit that does it after loading the
|
|
||||||
# modules.
|
|
||||||
systemd.services.amd-uprof-device = {
|
|
||||||
description = "Create /dev/AMDPowerProfiler device";
|
|
||||||
after = [ "systemd-modules-load.service" ];
|
|
||||||
wantedBy = [ "multi-user.target" ];
|
|
||||||
unitConfig.ConditionPathExists = [
|
|
||||||
"/proc/AMDPowerProfiler/device"
|
|
||||||
"!/dev/AMDPowerProfiler"
|
|
||||||
];
|
|
||||||
serviceConfig = {
|
|
||||||
Type = "oneshot";
|
|
||||||
RemainAfterExit = true;
|
|
||||||
ExecStart = pkgs.writeShellScript "add-amd-uprof-dev.sh" ''
|
|
||||||
mknod /dev/AMDPowerProfiler -m 666 c $(< /proc/AMDPowerProfiler/device) 0
|
|
||||||
'';
|
|
||||||
ExecStop = pkgs.writeShellScript "remove-amd-uprof-dev.sh" ''
|
|
||||||
rm -f /dev/AMDPowerProfiler
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -6,8 +6,5 @@
|
|||||||
{
|
{
|
||||||
extra-substituters = [ "http://hut/cache" ];
|
extra-substituters = [ "http://hut/cache" ];
|
||||||
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
|
|
||||||
# Set a low timeout in case hut is down
|
|
||||||
connect-timeout = 3; # seconds
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,20 +0,0 @@
|
|||||||
{ lib, config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
# Configure Nvidia driver to use with CUDA
|
|
||||||
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
|
||||||
hardware.nvidia.open = lib.mkDefault (builtins.abort "hardware.nvidia.open not set");
|
|
||||||
hardware.graphics.enable = true;
|
|
||||||
nixpkgs.config.nvidia.acceptLicense = true;
|
|
||||||
services.xserver.videoDrivers = [ "nvidia" ];
|
|
||||||
|
|
||||||
# enable support for derivations which require nvidia-gpu to be available
|
|
||||||
# > requiredSystemFeatures = [ "cuda" ];
|
|
||||||
programs.nix-required-mounts.enable = true;
|
|
||||||
programs.nix-required-mounts.presets.nvidia-gpu.enable = true;
|
|
||||||
# They forgot to add the symlink
|
|
||||||
programs.nix-required-mounts.allowedPatterns.nvidia-gpu.paths = [
|
|
||||||
config.systemd.tmpfiles.settings.graphics-driver."/run/opengl-driver"."L+".argument
|
|
||||||
];
|
|
||||||
|
|
||||||
environment.systemPackages = [ pkgs.cudainfo ];
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
with lib;
|
|
||||||
|
|
||||||
let
|
|
||||||
cfg = config.power.policy;
|
|
||||||
in
|
|
||||||
{
|
|
||||||
options = {
|
|
||||||
power.policy = mkOption {
|
|
||||||
type = types.nullOr (types.enum [ "always-on" "previous" "always-off" ]);
|
|
||||||
default = null;
|
|
||||||
description = "Set power policy to use via IPMI.";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
config = mkIf (cfg != null) {
|
|
||||||
systemd.services."power-policy" = {
|
|
||||||
description = "Set power policy to use via IPMI";
|
|
||||||
wantedBy = [ "multi-user.target" ];
|
|
||||||
unitConfig = {
|
|
||||||
StartLimitBurst = "10";
|
|
||||||
StartLimitIntervalSec = "10m";
|
|
||||||
};
|
|
||||||
serviceConfig = {
|
|
||||||
ExecStart = "${pkgs.ipmitool}/bin/ipmitool chassis policy ${cfg}";
|
|
||||||
Type = "oneshot";
|
|
||||||
Restart = "on-failure";
|
|
||||||
RestartSec = "5s";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,10 +1,33 @@
|
|||||||
{ lib, ... }:
|
{ config, pkgs, lib, ... }:
|
||||||
|
|
||||||
{
|
let
|
||||||
imports = [
|
suspendProgram = pkgs.writeScript "suspend.sh" ''
|
||||||
./slurm-common.nix
|
#!/usr/bin/env bash
|
||||||
];
|
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
||||||
|
set -x
|
||||||
|
export "PATH=/run/current-system/sw/bin:$PATH"
|
||||||
|
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
||||||
|
hosts=$(scontrol show hostnames $1)
|
||||||
|
for host in $hosts; do
|
||||||
|
echo Shutting down host: $host
|
||||||
|
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
|
||||||
|
done
|
||||||
|
'';
|
||||||
|
|
||||||
|
resumeProgram = pkgs.writeScript "resume.sh" ''
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
||||||
|
set -x
|
||||||
|
export "PATH=/run/current-system/sw/bin:$PATH"
|
||||||
|
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
||||||
|
hosts=$(scontrol show hostnames $1)
|
||||||
|
for host in $hosts; do
|
||||||
|
echo Starting host: $host
|
||||||
|
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
|
||||||
|
done
|
||||||
|
'';
|
||||||
|
|
||||||
|
in {
|
||||||
systemd.services.slurmd.serviceConfig = {
|
systemd.services.slurmd.serviceConfig = {
|
||||||
# Kill all processes in the control group on stop/restart. This will kill
|
# Kill all processes in the control group on stop/restart. This will kill
|
||||||
# all the jobs running, so ensure that we only upgrade when the nodes are
|
# all the jobs running, so ensure that we only upgrade when the nodes are
|
||||||
@@ -12,13 +35,92 @@
|
|||||||
# https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
|
# https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb
|
||||||
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
|
# https://bugs.schedmd.com/show_bug.cgi?id=2095#c24
|
||||||
KillMode = lib.mkForce "control-group";
|
KillMode = lib.mkForce "control-group";
|
||||||
|
|
||||||
# If slurmd fails to contact the control server it will fail, causing the
|
|
||||||
# node to remain out of service until manually restarted. Always try to
|
|
||||||
# restart it.
|
|
||||||
Restart = "always";
|
|
||||||
RestartSec = "30s";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
services.slurm.client.enable = true;
|
services.slurm = {
|
||||||
|
client.enable = true;
|
||||||
|
controlMachine = "hut";
|
||||||
|
clusterName = "jungle";
|
||||||
|
nodeName = [
|
||||||
|
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
|
||||||
|
"hut Sockets=2 CoresPerSocket=14 ThreadsPerCore=2"
|
||||||
|
];
|
||||||
|
|
||||||
|
partitionName = [
|
||||||
|
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
||||||
|
];
|
||||||
|
|
||||||
|
# See slurm.conf(5) for more details about these options.
|
||||||
|
extraConfig = ''
|
||||||
|
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
|
||||||
|
# not with Intel MPI. For that use the compatibility shim libpmi.so
|
||||||
|
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
|
||||||
|
# library in SLURM (--mpi=pmix). See more details here:
|
||||||
|
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
|
||||||
|
MpiDefault=pmix
|
||||||
|
|
||||||
|
# When a node reboots return that node to the slurm queue as soon as it
|
||||||
|
# becomes operative again.
|
||||||
|
ReturnToService=2
|
||||||
|
|
||||||
|
# Track all processes by using a cgroup
|
||||||
|
ProctrackType=proctrack/cgroup
|
||||||
|
|
||||||
|
# Enable task/affinity to allow the jobs to run in a specified subset of
|
||||||
|
# the resources. Use the task/cgroup plugin to enable process containment.
|
||||||
|
TaskPlugin=task/affinity,task/cgroup
|
||||||
|
|
||||||
|
# Power off unused nodes until they are requested
|
||||||
|
SuspendProgram=${suspendProgram}
|
||||||
|
SuspendTimeout=60
|
||||||
|
ResumeProgram=${resumeProgram}
|
||||||
|
ResumeTimeout=300
|
||||||
|
SuspendExcNodes=hut
|
||||||
|
|
||||||
|
# Turn the nodes off after 1 hour of inactivity
|
||||||
|
SuspendTime=3600
|
||||||
|
|
||||||
|
# Reduce port range so we can allow only this range in the firewall
|
||||||
|
SrunPortRange=60000-61000
|
||||||
|
|
||||||
|
# Use cores as consumable resources. In SLURM terms, a core may have
|
||||||
|
# multiple hardware threads (or CPUs).
|
||||||
|
SelectType=select/cons_tres
|
||||||
|
|
||||||
|
# Ignore memory constraints and only use unused cores to share a node with
|
||||||
|
# other jobs.
|
||||||
|
SelectTypeParameters=CR_Core
|
||||||
|
|
||||||
|
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
|
||||||
|
# This sets up the "extern" step into which ssh-launched processes will be
|
||||||
|
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
|
||||||
|
# when a task runs (srun) so we can ssh early.
|
||||||
|
PrologFlags=Alloc,Contain,X11
|
||||||
|
|
||||||
|
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
|
||||||
|
# adopted by the external step, similar to tasks running in regular steps
|
||||||
|
# LaunchParameters=ulimit_pam_adopt
|
||||||
|
SlurmdDebug=debug5
|
||||||
|
#DebugFlags=Protocol,Cgroup
|
||||||
|
'';
|
||||||
|
|
||||||
|
extraCgroupConfig = ''
|
||||||
|
CgroupPlugin=cgroup/v2
|
||||||
|
#ConstrainCores=yes
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
# Place the slurm config in /etc as this will be required by PAM
|
||||||
|
environment.etc.slurm.source = config.services.slurm.etcSlurm;
|
||||||
|
|
||||||
|
age.secrets.mungeKey = {
|
||||||
|
file = ../../secrets/munge-key.age;
|
||||||
|
owner = "munge";
|
||||||
|
group = "munge";
|
||||||
|
};
|
||||||
|
|
||||||
|
services.munge = {
|
||||||
|
enable = true;
|
||||||
|
password = config.age.secrets.mungeKey.path;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,115 +0,0 @@
|
|||||||
{ config, pkgs, ... }:
|
|
||||||
|
|
||||||
let
|
|
||||||
suspendProgram = pkgs.writeShellScript "suspend.sh" ''
|
|
||||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
|
||||||
set -x
|
|
||||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
|
||||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
|
||||||
hosts=$(scontrol show hostnames $1)
|
|
||||||
for host in $hosts; do
|
|
||||||
echo Shutting down host: $host
|
|
||||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power off
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
resumeProgram = pkgs.writeShellScript "resume.sh" ''
|
|
||||||
exec 1>>/var/log/power_save.log 2>>/var/log/power_save.log
|
|
||||||
set -x
|
|
||||||
export "PATH=/run/current-system/sw/bin:$PATH"
|
|
||||||
echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log
|
|
||||||
hosts=$(scontrol show hostnames $1)
|
|
||||||
for host in $hosts; do
|
|
||||||
echo Starting host: $host
|
|
||||||
ipmitool -I lanplus -H ''${host}-ipmi -P "" -U "" chassis power on
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
in {
|
|
||||||
services.slurm = {
|
|
||||||
controlMachine = "apex";
|
|
||||||
clusterName = "jungle";
|
|
||||||
nodeName = [
|
|
||||||
"owl[1,2] Sockets=2 CoresPerSocket=14 ThreadsPerCore=2 Feature=owl"
|
|
||||||
"fox Sockets=8 CoresPerSocket=24 ThreadsPerCore=1"
|
|
||||||
];
|
|
||||||
|
|
||||||
partitionName = [
|
|
||||||
"owl Nodes=owl[1-2] Default=YES DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
|
||||||
"fox Nodes=fox Default=NO DefaultTime=01:00:00 MaxTime=INFINITE State=UP"
|
|
||||||
];
|
|
||||||
|
|
||||||
# See slurm.conf(5) for more details about these options.
|
|
||||||
extraConfig = ''
|
|
||||||
# Use PMIx for MPI by default. It works okay with MPICH and OpenMPI, but
|
|
||||||
# not with Intel MPI. For that use the compatibility shim libpmi.so
|
|
||||||
# setting I_MPI_PMI_LIBRARY=$pmix/lib/libpmi.so while maintaining the PMIx
|
|
||||||
# library in SLURM (--mpi=pmix). See more details here:
|
|
||||||
# https://pm.bsc.es/gitlab/rarias/jungle/-/issues/16
|
|
||||||
MpiDefault=pmix
|
|
||||||
|
|
||||||
# When a node reboots return that node to the slurm queue as soon as it
|
|
||||||
# becomes operative again.
|
|
||||||
ReturnToService=2
|
|
||||||
|
|
||||||
# Track all processes by using a cgroup
|
|
||||||
ProctrackType=proctrack/cgroup
|
|
||||||
|
|
||||||
# Enable task/affinity to allow the jobs to run in a specified subset of
|
|
||||||
# the resources. Use the task/cgroup plugin to enable process containment.
|
|
||||||
TaskPlugin=task/affinity,task/cgroup
|
|
||||||
|
|
||||||
# Power off unused nodes until they are requested
|
|
||||||
SuspendProgram=${suspendProgram}
|
|
||||||
SuspendTimeout=60
|
|
||||||
ResumeProgram=${resumeProgram}
|
|
||||||
ResumeTimeout=300
|
|
||||||
SuspendExcNodes=fox
|
|
||||||
|
|
||||||
# Turn the nodes off after 1 hour of inactivity
|
|
||||||
SuspendTime=3600
|
|
||||||
|
|
||||||
# Reduce port range so we can allow only this range in the firewall
|
|
||||||
SrunPortRange=60000-61000
|
|
||||||
|
|
||||||
# Use cores as consumable resources. In SLURM terms, a core may have
|
|
||||||
# multiple hardware threads (or CPUs).
|
|
||||||
SelectType=select/cons_tres
|
|
||||||
|
|
||||||
# Ignore memory constraints and only use unused cores to share a node with
|
|
||||||
# other jobs.
|
|
||||||
SelectTypeParameters=CR_Core
|
|
||||||
|
|
||||||
# Required for pam_slurm_adopt, see https://slurm.schedmd.com/pam_slurm_adopt.html
|
|
||||||
# This sets up the "extern" step into which ssh-launched processes will be
|
|
||||||
# adopted. Alloc runs the prolog at job allocation (salloc) rather than
|
|
||||||
# when a task runs (srun) so we can ssh early.
|
|
||||||
PrologFlags=Alloc,Contain,X11
|
|
||||||
|
|
||||||
# LaunchParameters=ulimit_pam_adopt will set RLIMIT_RSS in processes
|
|
||||||
# adopted by the external step, similar to tasks running in regular steps
|
|
||||||
# LaunchParameters=ulimit_pam_adopt
|
|
||||||
SlurmdDebug=debug5
|
|
||||||
#DebugFlags=Protocol,Cgroup
|
|
||||||
'';
|
|
||||||
|
|
||||||
extraCgroupConfig = ''
|
|
||||||
CgroupPlugin=cgroup/v2
|
|
||||||
#ConstrainCores=yes
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
|
|
||||||
# Place the slurm config in /etc as this will be required by PAM
|
|
||||||
environment.etc.slurm.source = config.services.slurm.etcSlurm;
|
|
||||||
|
|
||||||
age.secrets.mungeKey = {
|
|
||||||
file = ../../secrets/munge-key.age;
|
|
||||||
owner = "munge";
|
|
||||||
group = "munge";
|
|
||||||
};
|
|
||||||
|
|
||||||
services.munge = {
|
|
||||||
enable = true;
|
|
||||||
password = config.age.secrets.mungeKey.path;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./slurm-common.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
services.slurm.server.enable = true;
|
|
||||||
|
|
||||||
networking.firewall = {
|
|
||||||
extraCommands = ''
|
|
||||||
# Accept slurm connections to controller from compute nodes
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 6817 -j nixos-fw-accept
|
|
||||||
# Accept slurm connections from compute nodes for srun
|
|
||||||
iptables -A nixos-fw -p tcp -s 10.0.40.0/24 --dport 60000:61000 -j nixos-fw-accept
|
|
||||||
|
|
||||||
# Accept slurm connections to controller from fox (via wireguard)
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 6817 -j nixos-fw-accept
|
|
||||||
# Accept slurm connections from fox for srun (via wireguard)
|
|
||||||
iptables -A nixos-fw -p tcp -i wg0 -s 10.106.0.1/32 --dport 60000:61000 -j nixos-fw-accept
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
}
|
|
||||||
9
m/module/ssh-hut-extern.nix
Normal file
9
m/module/ssh-hut-extern.nix
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
programs.ssh.extraConfig = ''
|
||||||
|
Host ssfhead
|
||||||
|
HostName ssflogin.bsc.es
|
||||||
|
Host hut
|
||||||
|
ProxyJump ssfhead
|
||||||
|
HostName xeon07
|
||||||
|
'';
|
||||||
|
}
|
||||||
@@ -3,13 +3,10 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/base.nix
|
../common/base.nix
|
||||||
../common/ssf/hosts.nix
|
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
../module/nvidia.nix
|
../module/ssh-hut-extern.nix
|
||||||
../eudy/kernel/perf.nix
|
../eudy/kernel/perf.nix
|
||||||
./wireguard.nix
|
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
# Don't install Grub on the disk yet
|
# Don't install Grub on the disk yet
|
||||||
@@ -41,21 +38,26 @@
|
|||||||
};
|
};
|
||||||
hosts = {
|
hosts = {
|
||||||
"10.0.44.4" = [ "tent" ];
|
"10.0.44.4" = [ "tent" ];
|
||||||
"84.88.53.236" = [ "apex" ];
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
# Mount the NFS home
|
nix.settings = {
|
||||||
fileSystems."/nfs/home" = {
|
extra-substituters = [ "https://jungle.bsc.es/cache" ];
|
||||||
device = "10.106.0.30:/home";
|
extra-trusted-public-keys = [ "jungle.bsc.es:pEc7MlAT0HEwLQYPtpkPLwRsGf80ZI26aj29zMw/HH0=" ];
|
||||||
fsType = "nfs";
|
|
||||||
options = [ "nfsvers=3" "rsize=1024" "wsize=1024" "cto" "nofail" ];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
# Enable performance governor
|
# Enable performance governor
|
||||||
powerManagement.cpuFreqGovernor = "performance";
|
powerManagement.cpuFreqGovernor = "performance";
|
||||||
|
|
||||||
hardware.nvidia.open = false; # Maxwell is older than Turing architecture
|
# Configure Nvidia driver to use with CUDA
|
||||||
|
hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.production;
|
||||||
|
hardware.graphics.enable = true;
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
nixpkgs.config.nvidia.acceptLicense = true;
|
||||||
|
services.xserver.videoDrivers = [ "nvidia" ];
|
||||||
|
|
||||||
|
# Disable garbage collection for now
|
||||||
|
nix.gc.automatic = lib.mkForce false;
|
||||||
|
|
||||||
services.openssh.settings.X11Forwarding = true;
|
services.openssh.settings.X11Forwarding = true;
|
||||||
|
|
||||||
|
|||||||
@@ -1,48 +0,0 @@
|
|||||||
{ config, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
networking.nat = {
|
|
||||||
enable = true;
|
|
||||||
enableIPv6 = false;
|
|
||||||
externalInterface = "eno0";
|
|
||||||
internalInterfaces = [ "wg0" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.firewall = {
|
|
||||||
allowedUDPPorts = [ 666 ];
|
|
||||||
};
|
|
||||||
|
|
||||||
age.secrets.wgRaccoon.file = ../../secrets/wg-raccoon.age;
|
|
||||||
|
|
||||||
# Enable WireGuard
|
|
||||||
networking.wireguard.enable = true;
|
|
||||||
networking.wireguard.interfaces = {
|
|
||||||
wg0 = {
|
|
||||||
ips = [ "10.106.0.236/24" ];
|
|
||||||
listenPort = 666;
|
|
||||||
privateKeyFile = config.age.secrets.wgRaccoon.path;
|
|
||||||
# Public key: QUfnGXSMEgu2bviglsaSdCjidB51oEDBFpnSFcKGfDI=
|
|
||||||
peers = [
|
|
||||||
{
|
|
||||||
name = "fox";
|
|
||||||
publicKey = "VfMPBQLQTKeyXJSwv8wBhc6OV0j2qAxUpX3kLHunK2Y=";
|
|
||||||
allowedIPs = [ "10.106.0.1/32" ];
|
|
||||||
endpoint = "fox.ac.upc.edu:666";
|
|
||||||
persistentKeepalive = 25;
|
|
||||||
}
|
|
||||||
{
|
|
||||||
name = "apex";
|
|
||||||
publicKey = "VwhcN8vSOzdJEotQTpmPHBC52x3Hbv1lkFIyKubrnUA=";
|
|
||||||
allowedIPs = [ "10.106.0.30/32" "10.0.40.0/24" ];
|
|
||||||
endpoint = "ssfhead.bsc.es:666";
|
|
||||||
persistentKeepalive = 25;
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hosts = {
|
|
||||||
"10.106.0.1" = [ "fox.wg" ];
|
|
||||||
"10.106.0.30" = [ "apex.wg" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -3,9 +3,9 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../common/xeon.nix
|
../common/xeon.nix
|
||||||
../common/ssf/hosts.nix
|
|
||||||
../module/emulation.nix
|
../module/emulation.nix
|
||||||
../module/debuginfod.nix
|
../module/debuginfod.nix
|
||||||
|
../module/ssh-hut-extern.nix
|
||||||
./monitoring.nix
|
./monitoring.nix
|
||||||
./nginx.nix
|
./nginx.nix
|
||||||
./nix-serve.nix
|
./nix-serve.nix
|
||||||
@@ -15,7 +15,6 @@
|
|||||||
../hut/msmtp.nix
|
../hut/msmtp.nix
|
||||||
../module/p.nix
|
../module/p.nix
|
||||||
../module/vpn-dac.nix
|
../module/vpn-dac.nix
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
];
|
||||||
|
|
||||||
# Select the this using the ID to avoid mismatches
|
# Select the this using the ID to avoid mismatches
|
||||||
@@ -34,10 +33,6 @@
|
|||||||
nameservers = [ "84.88.52.35" "84.88.52.36" ];
|
nameservers = [ "84.88.52.35" "84.88.52.36" ];
|
||||||
search = [ "bsc.es" "ac.upc.edu" ];
|
search = [ "bsc.es" "ac.upc.edu" ];
|
||||||
defaultGateway = "10.0.44.1";
|
defaultGateway = "10.0.44.1";
|
||||||
hosts = {
|
|
||||||
"84.88.53.236" = [ "apex" ];
|
|
||||||
"10.0.44.1" = [ "raccoon" ];
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
services.p.enable = true;
|
services.p.enable = true;
|
||||||
|
|||||||
@@ -2,13 +2,10 @@
|
|||||||
let
|
let
|
||||||
website = pkgs.stdenv.mkDerivation {
|
website = pkgs.stdenv.mkDerivation {
|
||||||
name = "jungle-web";
|
name = "jungle-web";
|
||||||
src = pkgs.fetchgit {
|
src = theFlake;
|
||||||
url = "https://jungle.bsc.es/git/rarias/jungle-website.git";
|
|
||||||
rev = "739bf0175a7f05380fe7ad7023ff1d60db1710e1";
|
|
||||||
hash = "sha256-ea5DzhYTzZ9TmqD+x95rdNdLbxPnBluqlYH2NmBYmc4=";
|
|
||||||
};
|
|
||||||
buildInputs = [ pkgs.hugo ];
|
buildInputs = [ pkgs.hugo ];
|
||||||
buildPhase = ''
|
buildPhase = ''
|
||||||
|
cd web
|
||||||
rm -rf public/
|
rm -rf public/
|
||||||
hugo
|
hugo
|
||||||
'';
|
'';
|
||||||
@@ -70,9 +67,6 @@ in
|
|||||||
location /p/ {
|
location /p/ {
|
||||||
alias /var/lib/p/;
|
alias /var/lib/p/;
|
||||||
}
|
}
|
||||||
location /pub/ {
|
|
||||||
alias /vault/pub/;
|
|
||||||
}
|
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,33 +0,0 @@
|
|||||||
{ lib, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
../common/ssf.nix
|
|
||||||
../module/hut-substituter.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
# Select this using the ID to avoid mismatches
|
|
||||||
boot.loader.grub.device = "/dev/disk/by-id/wwn-0x55cd2e414d5356ca";
|
|
||||||
|
|
||||||
# No swap, there is plenty of RAM
|
|
||||||
swapDevices = lib.mkForce [];
|
|
||||||
|
|
||||||
# Users with sudo access
|
|
||||||
users.groups.wheel.members = [ "abonerib" "anavarro" ];
|
|
||||||
|
|
||||||
# Run julia installed with juliaup using julia's own libraries:
|
|
||||||
# NIX_LD_LIBRARY_PATH=~/.julia/juliaup/${VERS}/lib/julia ~/.juliaup/bin/julia
|
|
||||||
programs.nix-ld.enable = true;
|
|
||||||
|
|
||||||
networking = {
|
|
||||||
hostName = "weasel";
|
|
||||||
interfaces.eno1.ipv4.addresses = [ {
|
|
||||||
address = "10.0.40.6";
|
|
||||||
prefixLength = 24;
|
|
||||||
} ];
|
|
||||||
interfaces.ibp5s0.ipv4.addresses = [ {
|
|
||||||
address = "10.0.42.6";
|
|
||||||
prefixLength = 24;
|
|
||||||
} ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,89 +0,0 @@
|
|||||||
{ stdenv
|
|
||||||
, lib
|
|
||||||
, curl
|
|
||||||
, cacert
|
|
||||||
, runCommandLocal
|
|
||||||
, autoPatchelfHook
|
|
||||||
, elfutils
|
|
||||||
, glib
|
|
||||||
, libGL
|
|
||||||
, ncurses5
|
|
||||||
, xorg
|
|
||||||
, zlib
|
|
||||||
, libxkbcommon
|
|
||||||
, freetype
|
|
||||||
, fontconfig
|
|
||||||
, libGLU
|
|
||||||
, dbus
|
|
||||||
, rocmPackages
|
|
||||||
, libxcrypt-legacy
|
|
||||||
, numactl
|
|
||||||
, radare2
|
|
||||||
}:
|
|
||||||
|
|
||||||
let
|
|
||||||
version = "5.1.701";
|
|
||||||
tarball = "AMDuProf_Linux_x64_${version}.tar.bz2";
|
|
||||||
|
|
||||||
# NOTE: Remember to update the radare2 patch below if AMDuProfPcm changes.
|
|
||||||
uprofSrc = runCommandLocal tarball {
|
|
||||||
nativeBuildInputs = [ curl ];
|
|
||||||
outputHash = "sha256-j9gxcBcIg6Zhc5FglUXf/VV9bKSo+PAKeootbN7ggYk=";
|
|
||||||
SSL_CERT_FILE="${cacert}/etc/ssl/certs/ca-bundle.crt";
|
|
||||||
} ''
|
|
||||||
curl \
|
|
||||||
-o $out \
|
|
||||||
'https://download.amd.com/developer/eula/uprof/uprof-5-1/${tarball}' \
|
|
||||||
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0' \
|
|
||||||
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
|
|
||||||
-H 'Accept-Language: en-US,en;q=0.5' \
|
|
||||||
-H 'Accept-Encoding: gzip, deflate, br, zstd' \
|
|
||||||
-H 'Referer: https://www.amd.com/' 2>&1 | tr '\r' '\n'
|
|
||||||
'';
|
|
||||||
|
|
||||||
in
|
|
||||||
stdenv.mkDerivation {
|
|
||||||
pname = "AMD-uProf";
|
|
||||||
inherit version;
|
|
||||||
src = uprofSrc;
|
|
||||||
dontStrip = true;
|
|
||||||
phases = [ "installPhase" "fixupPhase" ];
|
|
||||||
nativeBuildInputs = [ autoPatchelfHook radare2 ];
|
|
||||||
buildInputs = [
|
|
||||||
stdenv.cc.cc.lib
|
|
||||||
ncurses5
|
|
||||||
elfutils
|
|
||||||
glib
|
|
||||||
libGL
|
|
||||||
libGLU
|
|
||||||
libxcrypt-legacy
|
|
||||||
xorg.libX11
|
|
||||||
xorg.libXext
|
|
||||||
xorg.libXi
|
|
||||||
xorg.libXmu
|
|
||||||
xorg.libxcb
|
|
||||||
xorg.xcbutilwm
|
|
||||||
xorg.xcbutilrenderutil
|
|
||||||
xorg.xcbutilkeysyms
|
|
||||||
xorg.xcbutilimage
|
|
||||||
fontconfig.lib
|
|
||||||
libxkbcommon
|
|
||||||
zlib
|
|
||||||
freetype
|
|
||||||
dbus
|
|
||||||
rocmPackages.rocprofiler
|
|
||||||
numactl
|
|
||||||
];
|
|
||||||
installPhase = ''
|
|
||||||
set -x
|
|
||||||
mkdir -p $out
|
|
||||||
tar -x -v -C $out --strip-components=1 -f $src
|
|
||||||
rm $out/bin/AMDPowerProfilerDriverSource.tar.gz
|
|
||||||
patchelf --replace-needed libroctracer64.so.1 libroctracer64.so $out/bin/ProfileAgents/x64/libAMDGpuAgent.so
|
|
||||||
patchelf --add-needed libcrypt.so.1 --add-needed libstdc++.so.6 $out/bin/AMDuProfSys
|
|
||||||
echo "16334a51fcc48668307ad94e20482ca4 $out/bin/AMDuProfPcm" | md5sum -c -
|
|
||||||
radare2 -w -q -i ${./libnuma.r2} $out/bin/AMDuProfPcm
|
|
||||||
patchelf --add-needed libnuma.so $out/bin/AMDuProfPcm
|
|
||||||
set +x
|
|
||||||
'';
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
{ stdenv
|
|
||||||
, lib
|
|
||||||
, amd-uprof
|
|
||||||
, kernel
|
|
||||||
, runCommandLocal
|
|
||||||
}:
|
|
||||||
|
|
||||||
let
|
|
||||||
version = amd-uprof.version;
|
|
||||||
tarball = amd-uprof.src;
|
|
||||||
in stdenv.mkDerivation {
|
|
||||||
pname = "AMDPowerProfilerDriver";
|
|
||||||
inherit version;
|
|
||||||
src = runCommandLocal "AMDPowerProfilerDriverSource.tar.gz" { } ''
|
|
||||||
set -x
|
|
||||||
tar -x -f ${tarball} AMDuProf_Linux_x64_${version}/bin/AMDPowerProfilerDriverSource.tar.gz
|
|
||||||
mv AMDuProf_Linux_x64_${version}/bin/AMDPowerProfilerDriverSource.tar.gz $out
|
|
||||||
set +x
|
|
||||||
'';
|
|
||||||
hardeningDisable = [ "pic" "format" ];
|
|
||||||
nativeBuildInputs = kernel.moduleBuildDependencies;
|
|
||||||
patches = [ ./makefile.patch ./hrtimer.patch ];
|
|
||||||
makeFlags = [
|
|
||||||
"KERNEL_VERSION=${kernel.modDirVersion}"
|
|
||||||
"KERNEL_DIR=${kernel.dev}/lib/modules/${kernel.modDirVersion}/build"
|
|
||||||
"INSTALL_MOD_PATH=$(out)"
|
|
||||||
];
|
|
||||||
meta = {
|
|
||||||
description = "AMD Power Profiler Driver";
|
|
||||||
homepage = "https://www.amd.com/es/developer/uprof.html";
|
|
||||||
platforms = lib.platforms.linux;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
--- a/src/PmcTimerConfig.c 2025-09-04 12:17:16.771707049 +0200
|
|
||||||
+++ b/src/PmcTimerConfig.c 2025-09-04 12:17:04.878515468 +0200
|
|
||||||
@@ -99,7 +99,7 @@ static void PmcInitTimer(void* pInfo)
|
|
||||||
|
|
||||||
DRVPRINT("pTimerConfig(%p)", pTimerConfig);
|
|
||||||
|
|
||||||
- hrtimer_init(&pTimerConfig->m_hrTimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
+ hrtimer_setup(&pTimerConfig->m_hrTimer, PmcTimerCallback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
}
|
|
||||||
|
|
||||||
int PmcSetupTimer(ClientContext* pClientCtx)
|
|
||||||
@@ -157,7 +157,6 @@ int PmcSetupTimer(ClientContext* pClient
|
|
||||||
{
|
|
||||||
/* Interval in ms */
|
|
||||||
pTimerConfig->m_time = ktime_set(interval / 1000, interval * 1000000);
|
|
||||||
- pTimerConfig->m_hrTimer.function = PmcTimerCallback;
|
|
||||||
|
|
||||||
DRVPRINT("retVal(%d) m_time(%lld)", retVal, (long long int) pTimerConfig->m_time);
|
|
||||||
}
|
|
||||||
--- a/src/PwrProfTimer.c 2025-09-04 12:18:08.750544327 +0200
|
|
||||||
+++ b/src/PwrProfTimer.c 2025-09-04 12:18:28.557863382 +0200
|
|
||||||
@@ -573,8 +573,7 @@ void InitHrTimer(uint32 cpu)
|
|
||||||
pCoreClientData = &per_cpu(g_coreClientData, cpu);
|
|
||||||
|
|
||||||
// initialize HR timer
|
|
||||||
- hrtimer_init(&pCoreClientData->m_hrTimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
- pCoreClientData->m_hrTimer.function = &HrTimerCallback;
|
|
||||||
+ hrtimer_setup(&pCoreClientData->m_hrTimer, &HrTimerCallback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
|
|
||||||
return;
|
|
||||||
} // InitHrTimer
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
# Patch arguments to call sym std::string::find(char const*, unsigned long, unsigned long)
|
|
||||||
# so it matches NixOS:
|
|
||||||
#
|
|
||||||
# Change OS name to NixOS
|
|
||||||
wz NixOS @ 0x00550a43
|
|
||||||
# And set the length to 5 characters
|
|
||||||
wa mov ecx, 5 @0x00517930
|
|
||||||
#
|
|
||||||
# Then change the argument to dlopen() so it only uses libnuma.so
|
|
||||||
wz libnuma.so @ 0x00562940
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
--- a/Makefile 2025-06-19 20:36:49.346693267 +0200
|
|
||||||
+++ b/Makefile 2025-06-19 20:42:29.778088660 +0200
|
|
||||||
@@ -27,7 +27,7 @@ MODULE_VERSION=$(shell cat AMDPowerProfi
|
|
||||||
MODULE_NAME_KO=$(MODULE_NAME).ko
|
|
||||||
|
|
||||||
# check is module inserted
|
|
||||||
-MODPROBE_OUTPUT=$(shell lsmod | grep $(MODULE_NAME))
|
|
||||||
+#MODPROBE_OUTPUT=$(shell lsmod | grep $(MODULE_NAME))
|
|
||||||
|
|
||||||
# check pcore dkms status
|
|
||||||
PCORE_DKMS_STATUS=$(shell dkms status | grep $(MODULE_NAME) | grep $(MODULE_VERSION))
|
|
||||||
@@ -50,7 +50,7 @@ endif
|
|
||||||
# “-Wno-missing-attributes” is added for GCC version >= 9.0 and kernel version <= 5.00
|
|
||||||
G_VERSION=9
|
|
||||||
K_VERSION=5
|
|
||||||
-KERNEL_MAJOR_VERSION=$(shell uname -r | cut -f1 -d.)
|
|
||||||
+KERNEL_MAJOR_VERSION=$(shell echo "$(KERNEL_VERSION)" | cut -f1 -d.)
|
|
||||||
GCCVERSION = $(shell gcc -dumpversion | cut -f1 -d.)
|
|
||||||
ifeq ($(G_VERSION),$(firstword $(sort $(GCCVERSION) $(G_VERSION))))
|
|
||||||
ifeq ($(K_VERSION),$(lastword $(sort $(KERNEL_MAJOR_VERSION) $(K_VERSION))))
|
|
||||||
@@ -66,17 +66,7 @@ ${MODULE_NAME}-objs := src/PmcDataBuffe
|
|
||||||
|
|
||||||
# make
|
|
||||||
all:
|
|
||||||
- @chmod a+x ./AMDPPcert.sh
|
|
||||||
- @./AMDPPcert.sh 0 1; echo $$? > $(PWD)/sign_status;
|
|
||||||
- @SIGSTATUS1=`cat $(PWD)/sign_status | tr -d '\n'`; \
|
|
||||||
- if [ $$SIGSTATUS1 -eq 1 ]; then \
|
|
||||||
- exit 1; \
|
|
||||||
- fi
|
|
||||||
- @make -C /lib/modules/$(KERNEL_VERSION)/build M=$(PWD) $(MAKE_OPTS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" modules
|
|
||||||
- @SIGSTATUS3=`cat $(PWD)/sign_status | tr -d '\n'`; \
|
|
||||||
- if [ $$SIGSTATUS3 -eq 0 ]; then \
|
|
||||||
- ./AMDPPcert.sh 1 $(MODULE_NAME_KO); \
|
|
||||||
- fi
|
|
||||||
+ make -C $(KERNEL_DIR) M=$(PWD) $(MAKE_OPTS) CFLAGS_MODULE="$(EXTRA_CFLAGS)" modules
|
|
||||||
|
|
||||||
# make clean
|
|
||||||
clean:
|
|
||||||
@@ -84,23 +74,9 @@ clean:
|
|
||||||
|
|
||||||
# make install
|
|
||||||
install:
|
|
||||||
- @mkdir -p /lib/modules/`uname -r`/kernel/drivers/extra
|
|
||||||
- @rm -f /lib/modules/`uname -r`/kernel/drivers/extra/$(MODULE_NAME_KO)
|
|
||||||
- @cp $(MODULE_NAME_KO) /lib/modules/`uname -r`/kernel/drivers/extra/
|
|
||||||
- @depmod -a
|
|
||||||
- @if [ ! -z "$(MODPROBE_OUTPUT)" ]; then \
|
|
||||||
- echo "Uninstalling AMDPowerProfiler Linux kernel module.";\
|
|
||||||
- rmmod $(MODULE_NAME);\
|
|
||||||
- fi
|
|
||||||
- @modprobe $(MODULE_NAME) 2> $(PWD)/sign_status1; \
|
|
||||||
- cat $(PWD)/sign_status1 | grep "Key was rejected by service"; \
|
|
||||||
- echo $$? > $(PWD)/sign_status; SIGSTATUS1=`cat $(PWD)/sign_status | tr -d '\n'`; \
|
|
||||||
- if [ $$SIGSTATUS1 -eq 0 ]; then \
|
|
||||||
- echo "ERROR: Secure Boot enabled, correct key is not yet enrolled in BIOS key table"; \
|
|
||||||
- exit 1; \
|
|
||||||
- else \
|
|
||||||
- cat $(PWD)/sign_status1; \
|
|
||||||
- fi
|
|
||||||
+ mkdir -p $(INSTALL_MOD_PATH)/lib/modules/$(KERNEL_VERSION)/kernel/drivers/extra/
|
|
||||||
+ cp -a $(MODULE_NAME_KO) $(INSTALL_MOD_PATH)/lib/modules/$(KERNEL_VERSION)/kernel/drivers/extra/
|
|
||||||
+
|
|
||||||
# make dkms
|
|
||||||
dkms:
|
|
||||||
@chmod a+x ./AMDPPcert.sh
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
HOSTCXX ?= g++
|
|
||||||
NVCC := nvcc -ccbin $(HOSTCXX)
|
|
||||||
CXXFLAGS := -m64
|
|
||||||
|
|
||||||
# Target rules
|
|
||||||
all: cudainfo
|
|
||||||
|
|
||||||
cudainfo: cudainfo.cpp
|
|
||||||
$(NVCC) $(CXXFLAGS) -o $@ $<
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -f cudainfo cudainfo.o
|
|
||||||
@@ -1,600 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
|
||||||
*
|
|
||||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
||||||
* with this source code for terms and conditions that govern your use of
|
|
||||||
* this software. Any use, reproduction, disclosure, or distribution of
|
|
||||||
* this software and related documentation outside the terms of the EULA
|
|
||||||
* is strictly prohibited.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
|
|
||||||
|
|
||||||
// Shared Utilities (QA Testing)
|
|
||||||
|
|
||||||
// std::system includes
|
|
||||||
#include <memory>
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
|
|
||||||
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
|
|
||||||
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
|
|
||||||
|
|
||||||
// CUDA Runtime error messages
|
|
||||||
#ifdef __DRIVER_TYPES_H__
|
|
||||||
static const char *_cudaGetErrorEnum(cudaError_t error)
|
|
||||||
{
|
|
||||||
switch (error)
|
|
||||||
{
|
|
||||||
case cudaSuccess:
|
|
||||||
return "cudaSuccess";
|
|
||||||
|
|
||||||
case cudaErrorMissingConfiguration:
|
|
||||||
return "cudaErrorMissingConfiguration";
|
|
||||||
|
|
||||||
case cudaErrorMemoryAllocation:
|
|
||||||
return "cudaErrorMemoryAllocation";
|
|
||||||
|
|
||||||
case cudaErrorInitializationError:
|
|
||||||
return "cudaErrorInitializationError";
|
|
||||||
|
|
||||||
case cudaErrorLaunchFailure:
|
|
||||||
return "cudaErrorLaunchFailure";
|
|
||||||
|
|
||||||
case cudaErrorPriorLaunchFailure:
|
|
||||||
return "cudaErrorPriorLaunchFailure";
|
|
||||||
|
|
||||||
case cudaErrorLaunchTimeout:
|
|
||||||
return "cudaErrorLaunchTimeout";
|
|
||||||
|
|
||||||
case cudaErrorLaunchOutOfResources:
|
|
||||||
return "cudaErrorLaunchOutOfResources";
|
|
||||||
|
|
||||||
case cudaErrorInvalidDeviceFunction:
|
|
||||||
return "cudaErrorInvalidDeviceFunction";
|
|
||||||
|
|
||||||
case cudaErrorInvalidConfiguration:
|
|
||||||
return "cudaErrorInvalidConfiguration";
|
|
||||||
|
|
||||||
case cudaErrorInvalidDevice:
|
|
||||||
return "cudaErrorInvalidDevice";
|
|
||||||
|
|
||||||
case cudaErrorInvalidValue:
|
|
||||||
return "cudaErrorInvalidValue";
|
|
||||||
|
|
||||||
case cudaErrorInvalidPitchValue:
|
|
||||||
return "cudaErrorInvalidPitchValue";
|
|
||||||
|
|
||||||
case cudaErrorInvalidSymbol:
|
|
||||||
return "cudaErrorInvalidSymbol";
|
|
||||||
|
|
||||||
case cudaErrorMapBufferObjectFailed:
|
|
||||||
return "cudaErrorMapBufferObjectFailed";
|
|
||||||
|
|
||||||
case cudaErrorUnmapBufferObjectFailed:
|
|
||||||
return "cudaErrorUnmapBufferObjectFailed";
|
|
||||||
|
|
||||||
case cudaErrorInvalidHostPointer:
|
|
||||||
return "cudaErrorInvalidHostPointer";
|
|
||||||
|
|
||||||
case cudaErrorInvalidDevicePointer:
|
|
||||||
return "cudaErrorInvalidDevicePointer";
|
|
||||||
|
|
||||||
case cudaErrorInvalidTexture:
|
|
||||||
return "cudaErrorInvalidTexture";
|
|
||||||
|
|
||||||
case cudaErrorInvalidTextureBinding:
|
|
||||||
return "cudaErrorInvalidTextureBinding";
|
|
||||||
|
|
||||||
case cudaErrorInvalidChannelDescriptor:
|
|
||||||
return "cudaErrorInvalidChannelDescriptor";
|
|
||||||
|
|
||||||
case cudaErrorInvalidMemcpyDirection:
|
|
||||||
return "cudaErrorInvalidMemcpyDirection";
|
|
||||||
|
|
||||||
case cudaErrorAddressOfConstant:
|
|
||||||
return "cudaErrorAddressOfConstant";
|
|
||||||
|
|
||||||
case cudaErrorTextureFetchFailed:
|
|
||||||
return "cudaErrorTextureFetchFailed";
|
|
||||||
|
|
||||||
case cudaErrorTextureNotBound:
|
|
||||||
return "cudaErrorTextureNotBound";
|
|
||||||
|
|
||||||
case cudaErrorSynchronizationError:
|
|
||||||
return "cudaErrorSynchronizationError";
|
|
||||||
|
|
||||||
case cudaErrorInvalidFilterSetting:
|
|
||||||
return "cudaErrorInvalidFilterSetting";
|
|
||||||
|
|
||||||
case cudaErrorInvalidNormSetting:
|
|
||||||
return "cudaErrorInvalidNormSetting";
|
|
||||||
|
|
||||||
case cudaErrorMixedDeviceExecution:
|
|
||||||
return "cudaErrorMixedDeviceExecution";
|
|
||||||
|
|
||||||
case cudaErrorCudartUnloading:
|
|
||||||
return "cudaErrorCudartUnloading";
|
|
||||||
|
|
||||||
case cudaErrorUnknown:
|
|
||||||
return "cudaErrorUnknown";
|
|
||||||
|
|
||||||
case cudaErrorNotYetImplemented:
|
|
||||||
return "cudaErrorNotYetImplemented";
|
|
||||||
|
|
||||||
case cudaErrorMemoryValueTooLarge:
|
|
||||||
return "cudaErrorMemoryValueTooLarge";
|
|
||||||
|
|
||||||
case cudaErrorInvalidResourceHandle:
|
|
||||||
return "cudaErrorInvalidResourceHandle";
|
|
||||||
|
|
||||||
case cudaErrorNotReady:
|
|
||||||
return "cudaErrorNotReady";
|
|
||||||
|
|
||||||
case cudaErrorInsufficientDriver:
|
|
||||||
return "cudaErrorInsufficientDriver";
|
|
||||||
|
|
||||||
case cudaErrorSetOnActiveProcess:
|
|
||||||
return "cudaErrorSetOnActiveProcess";
|
|
||||||
|
|
||||||
case cudaErrorInvalidSurface:
|
|
||||||
return "cudaErrorInvalidSurface";
|
|
||||||
|
|
||||||
case cudaErrorNoDevice:
|
|
||||||
return "cudaErrorNoDevice";
|
|
||||||
|
|
||||||
case cudaErrorECCUncorrectable:
|
|
||||||
return "cudaErrorECCUncorrectable";
|
|
||||||
|
|
||||||
case cudaErrorSharedObjectSymbolNotFound:
|
|
||||||
return "cudaErrorSharedObjectSymbolNotFound";
|
|
||||||
|
|
||||||
case cudaErrorSharedObjectInitFailed:
|
|
||||||
return "cudaErrorSharedObjectInitFailed";
|
|
||||||
|
|
||||||
case cudaErrorUnsupportedLimit:
|
|
||||||
return "cudaErrorUnsupportedLimit";
|
|
||||||
|
|
||||||
case cudaErrorDuplicateVariableName:
|
|
||||||
return "cudaErrorDuplicateVariableName";
|
|
||||||
|
|
||||||
case cudaErrorDuplicateTextureName:
|
|
||||||
return "cudaErrorDuplicateTextureName";
|
|
||||||
|
|
||||||
case cudaErrorDuplicateSurfaceName:
|
|
||||||
return "cudaErrorDuplicateSurfaceName";
|
|
||||||
|
|
||||||
case cudaErrorDevicesUnavailable:
|
|
||||||
return "cudaErrorDevicesUnavailable";
|
|
||||||
|
|
||||||
case cudaErrorInvalidKernelImage:
|
|
||||||
return "cudaErrorInvalidKernelImage";
|
|
||||||
|
|
||||||
case cudaErrorNoKernelImageForDevice:
|
|
||||||
return "cudaErrorNoKernelImageForDevice";
|
|
||||||
|
|
||||||
case cudaErrorIncompatibleDriverContext:
|
|
||||||
return "cudaErrorIncompatibleDriverContext";
|
|
||||||
|
|
||||||
case cudaErrorPeerAccessAlreadyEnabled:
|
|
||||||
return "cudaErrorPeerAccessAlreadyEnabled";
|
|
||||||
|
|
||||||
case cudaErrorPeerAccessNotEnabled:
|
|
||||||
return "cudaErrorPeerAccessNotEnabled";
|
|
||||||
|
|
||||||
case cudaErrorDeviceAlreadyInUse:
|
|
||||||
return "cudaErrorDeviceAlreadyInUse";
|
|
||||||
|
|
||||||
case cudaErrorProfilerDisabled:
|
|
||||||
return "cudaErrorProfilerDisabled";
|
|
||||||
|
|
||||||
case cudaErrorProfilerNotInitialized:
|
|
||||||
return "cudaErrorProfilerNotInitialized";
|
|
||||||
|
|
||||||
case cudaErrorProfilerAlreadyStarted:
|
|
||||||
return "cudaErrorProfilerAlreadyStarted";
|
|
||||||
|
|
||||||
case cudaErrorProfilerAlreadyStopped:
|
|
||||||
return "cudaErrorProfilerAlreadyStopped";
|
|
||||||
|
|
||||||
/* Since CUDA 4.0*/
|
|
||||||
case cudaErrorAssert:
|
|
||||||
return "cudaErrorAssert";
|
|
||||||
|
|
||||||
case cudaErrorTooManyPeers:
|
|
||||||
return "cudaErrorTooManyPeers";
|
|
||||||
|
|
||||||
case cudaErrorHostMemoryAlreadyRegistered:
|
|
||||||
return "cudaErrorHostMemoryAlreadyRegistered";
|
|
||||||
|
|
||||||
case cudaErrorHostMemoryNotRegistered:
|
|
||||||
return "cudaErrorHostMemoryNotRegistered";
|
|
||||||
|
|
||||||
/* Since CUDA 5.0 */
|
|
||||||
case cudaErrorOperatingSystem:
|
|
||||||
return "cudaErrorOperatingSystem";
|
|
||||||
|
|
||||||
case cudaErrorPeerAccessUnsupported:
|
|
||||||
return "cudaErrorPeerAccessUnsupported";
|
|
||||||
|
|
||||||
case cudaErrorLaunchMaxDepthExceeded:
|
|
||||||
return "cudaErrorLaunchMaxDepthExceeded";
|
|
||||||
|
|
||||||
case cudaErrorLaunchFileScopedTex:
|
|
||||||
return "cudaErrorLaunchFileScopedTex";
|
|
||||||
|
|
||||||
case cudaErrorLaunchFileScopedSurf:
|
|
||||||
return "cudaErrorLaunchFileScopedSurf";
|
|
||||||
|
|
||||||
case cudaErrorSyncDepthExceeded:
|
|
||||||
return "cudaErrorSyncDepthExceeded";
|
|
||||||
|
|
||||||
case cudaErrorLaunchPendingCountExceeded:
|
|
||||||
return "cudaErrorLaunchPendingCountExceeded";
|
|
||||||
|
|
||||||
case cudaErrorNotPermitted:
|
|
||||||
return "cudaErrorNotPermitted";
|
|
||||||
|
|
||||||
case cudaErrorNotSupported:
|
|
||||||
return "cudaErrorNotSupported";
|
|
||||||
|
|
||||||
/* Since CUDA 6.0 */
|
|
||||||
case cudaErrorHardwareStackError:
|
|
||||||
return "cudaErrorHardwareStackError";
|
|
||||||
|
|
||||||
case cudaErrorIllegalInstruction:
|
|
||||||
return "cudaErrorIllegalInstruction";
|
|
||||||
|
|
||||||
case cudaErrorMisalignedAddress:
|
|
||||||
return "cudaErrorMisalignedAddress";
|
|
||||||
|
|
||||||
case cudaErrorInvalidAddressSpace:
|
|
||||||
return "cudaErrorInvalidAddressSpace";
|
|
||||||
|
|
||||||
case cudaErrorInvalidPc:
|
|
||||||
return "cudaErrorInvalidPc";
|
|
||||||
|
|
||||||
case cudaErrorIllegalAddress:
|
|
||||||
return "cudaErrorIllegalAddress";
|
|
||||||
|
|
||||||
/* Since CUDA 6.5*/
|
|
||||||
case cudaErrorInvalidPtx:
|
|
||||||
return "cudaErrorInvalidPtx";
|
|
||||||
|
|
||||||
case cudaErrorInvalidGraphicsContext:
|
|
||||||
return "cudaErrorInvalidGraphicsContext";
|
|
||||||
|
|
||||||
case cudaErrorStartupFailure:
|
|
||||||
return "cudaErrorStartupFailure";
|
|
||||||
|
|
||||||
case cudaErrorApiFailureBase:
|
|
||||||
return "cudaErrorApiFailureBase";
|
|
||||||
}
|
|
||||||
|
|
||||||
return "<unknown>";
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template< typename T >
|
|
||||||
void check(T result, char const *const func, const char *const file, int const line)
|
|
||||||
{
|
|
||||||
if (result)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
|
|
||||||
file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
|
|
||||||
cudaDeviceReset();
|
|
||||||
// Make sure we call CUDA Device Reset before exiting
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int *pArgc = NULL;
|
|
||||||
char **pArgv = NULL;
|
|
||||||
|
|
||||||
#if CUDART_VERSION < 5000
|
|
||||||
|
|
||||||
// CUDA-C includes
|
|
||||||
#include <cuda.h>
|
|
||||||
|
|
||||||
// This function wraps the CUDA Driver API into a template function
|
|
||||||
template <class T>
|
|
||||||
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
|
|
||||||
{
|
|
||||||
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
|
|
||||||
|
|
||||||
if (CUDA_SUCCESS != error) {
|
|
||||||
fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
|
|
||||||
error, __FILE__, __LINE__);
|
|
||||||
|
|
||||||
// cudaDeviceReset causes the driver to clean up all state. While
|
|
||||||
// not mandatory in normal operation, it is good practice. It is also
|
|
||||||
// needed to ensure correct operation when the application is being
|
|
||||||
// profiled. Calling cudaDeviceReset causes all profile data to be
|
|
||||||
// flushed before the application exits
|
|
||||||
cudaDeviceReset();
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* CUDART_VERSION < 5000 */
|
|
||||||
|
|
||||||
// Beginning of GPU Architecture definitions
|
|
||||||
// Returns the number of CUDA cores per streaming multiprocessor (SM) for the
// given compute capability.
//
//   major, minor  Compute capability of the device (e.g. 8 and 6 for SM 8.6).
//
// If the compute capability is not in the table, a notice is printed to
// stdout and the core count of the last table entry is returned so callers
// always get a usable value.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // Maps an SM version to its number of cores per SM.
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM major version, and m = SM minor version
        int Cores; // CUDA cores per multiprocessor
    } sSMtoCores;

    // Terminated by a { -1, -1 } sentinel. The entry just before the sentinel
    // doubles as the fallback for unknown (usually newer) architectures.
    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x32, 192 }, // Kepler Generation (SM 3.2) GK10x class
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { 0x37, 192 }, // Kepler Generation (SM 3.7) GK21x class
        { 0x50, 128 }, // Maxwell Generation (SM 5.0) GM10x class
        { 0x52, 128 }, // Maxwell Generation (SM 5.2) GM20x class
        { 0x53, 128 }, // Maxwell Generation (SM 5.3)
        { 0x60,  64 }, // Pascal Generation (SM 6.0)
        { 0x61, 128 }, // Pascal Generation (SM 6.1)
        { 0x62, 128 }, // Pascal Generation (SM 6.2)
        { 0x70,  64 }, // Volta Generation (SM 7.0)
        { 0x72,  64 }, // Volta Generation (SM 7.2)
        { 0x75,  64 }, // Turing Generation (SM 7.5)
        { 0x80,  64 }, // Ampere Generation (SM 8.0)
        { 0x86, 128 }, // Ampere Generation (SM 8.6)
        { 0x87, 128 }, // Ampere Generation (SM 8.7)
        { 0x89, 128 }, // Ada Generation (SM 8.9)
        { 0x90, 128 }, // Hopper Generation (SM 9.0)
        {   -1,  -1 }
    };

    const int smVersion = (major << 4) + minor;
    int index = 0;

    for (; nGpuArchCoresPerSM[index].SM != -1; ++index) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    // `index` now points at the sentinel, so index-1 is the last real entry.
    // If we don't find the values, we default to that entry to run properly.
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
    return nGpuArchCoresPerSM[index-1].Cores;
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Program main
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
int
|
|
||||||
main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
pArgc = &argc;
|
|
||||||
pArgv = argv;
|
|
||||||
|
|
||||||
printf("%s Starting...\n\n", argv[0]);
|
|
||||||
printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
|
|
||||||
|
|
||||||
int deviceCount = 0;
|
|
||||||
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
|
||||||
|
|
||||||
if (error_id != cudaSuccess) {
|
|
||||||
printf("cudaGetDeviceCount failed: %s (%d)\n",
|
|
||||||
cudaGetErrorString(error_id), (int) error_id);
|
|
||||||
printf("Result = FAIL\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This function call returns 0 if there are no CUDA capable devices.
|
|
||||||
if (deviceCount == 0)
|
|
||||||
printf("There are no available device(s) that support CUDA\n");
|
|
||||||
else
|
|
||||||
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
|
|
||||||
|
|
||||||
int dev, driverVersion = 0, runtimeVersion = 0;
|
|
||||||
|
|
||||||
for (dev = 0; dev < deviceCount; ++dev) {
|
|
||||||
cudaSetDevice(dev);
|
|
||||||
cudaDeviceProp deviceProp;
|
|
||||||
cudaGetDeviceProperties(&deviceProp, dev);
|
|
||||||
|
|
||||||
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
|
|
||||||
|
|
||||||
// Console log
|
|
||||||
cudaDriverGetVersion(&driverVersion);
|
|
||||||
cudaRuntimeGetVersion(&runtimeVersion);
|
|
||||||
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
|
|
||||||
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
|
|
||||||
|
|
||||||
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
|
|
||||||
(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
|
|
||||||
|
|
||||||
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
|
|
||||||
deviceProp.multiProcessorCount,
|
|
||||||
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
|
||||||
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
|
|
||||||
printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
|
|
||||||
|
|
||||||
|
|
||||||
#if CUDART_VERSION >= 5000
|
|
||||||
// This is supported in CUDA 5.0 (runtime API device properties)
|
|
||||||
printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
|
|
||||||
printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
|
|
||||||
|
|
||||||
if (deviceProp.l2CacheSize) {
|
|
||||||
printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
// This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
|
|
||||||
int memoryClock;
|
|
||||||
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
|
|
||||||
printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
|
|
||||||
int memBusWidth;
|
|
||||||
getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
|
|
||||||
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
|
|
||||||
int L2CacheSize;
|
|
||||||
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
|
|
||||||
|
|
||||||
if (L2CacheSize) {
|
|
||||||
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
|
|
||||||
deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
|
|
||||||
deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
|
|
||||||
printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
|
|
||||||
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
|
|
||||||
printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
|
|
||||||
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
|
|
||||||
|
|
||||||
|
|
||||||
printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
|
|
||||||
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
|
|
||||||
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
|
|
||||||
printf(" Warp size: %d\n", deviceProp.warpSize);
|
|
||||||
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
|
|
||||||
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
|
|
||||||
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
|
|
||||||
deviceProp.maxThreadsDim[0],
|
|
||||||
deviceProp.maxThreadsDim[1],
|
|
||||||
deviceProp.maxThreadsDim[2]);
|
|
||||||
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
|
|
||||||
deviceProp.maxGridSize[0],
|
|
||||||
deviceProp.maxGridSize[1],
|
|
||||||
deviceProp.maxGridSize[2]);
|
|
||||||
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
|
|
||||||
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
|
|
||||||
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
|
|
||||||
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
|
|
||||||
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
|
|
||||||
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
|
|
||||||
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
|
|
||||||
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
|
|
||||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
|
||||||
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
|
|
||||||
#endif
|
|
||||||
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
|
|
||||||
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
|
|
||||||
|
|
||||||
const char *sComputeMode[] = {
|
|
||||||
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
|
|
||||||
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
|
|
||||||
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
|
|
||||||
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
|
|
||||||
"Unknown",
|
|
||||||
NULL
|
|
||||||
};
|
|
||||||
printf(" Compute Mode:\n");
|
|
||||||
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If there are 2 or more GPUs, query to determine whether RDMA is supported
|
|
||||||
if (deviceCount >= 2)
|
|
||||||
{
|
|
||||||
cudaDeviceProp prop[64];
|
|
||||||
int gpuid[64]; // we want to find the first two GPU's that can support P2P
|
|
||||||
int gpu_p2p_count = 0;
|
|
||||||
|
|
||||||
for (int i=0; i < deviceCount; i++)
|
|
||||||
{
|
|
||||||
checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
|
|
||||||
|
|
||||||
// Only boards based on Fermi or later can support P2P
|
|
||||||
if ((prop[i].major >= 2)
|
|
||||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
|
||||||
// on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to supprot this
|
|
||||||
&& prop[i].tccDriver
|
|
||||||
#endif
|
|
||||||
)
|
|
||||||
{
|
|
||||||
// This is an array of P2P capable GPUs
|
|
||||||
gpuid[gpu_p2p_count++] = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Show all the combinations of support P2P GPUs
|
|
||||||
int can_access_peer_0_1, can_access_peer_1_0;
|
|
||||||
|
|
||||||
if (gpu_p2p_count >= 2)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < gpu_p2p_count-1; i++)
|
|
||||||
{
|
|
||||||
for (int j = 1; j < gpu_p2p_count; j++)
|
|
||||||
{
|
|
||||||
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j]));
|
|
||||||
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
|
|
||||||
prop[gpuid[j]].name, gpuid[j] ,
|
|
||||||
can_access_peer_0_1 ? "Yes" : "No");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int j = 1; j < gpu_p2p_count; j++)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < gpu_p2p_count-1; i++)
|
|
||||||
{
|
|
||||||
checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i]));
|
|
||||||
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j],
|
|
||||||
prop[gpuid[i]].name, gpuid[i] ,
|
|
||||||
can_access_peer_1_0 ? "Yes" : "No");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// csv masterlog info
|
|
||||||
// *****************************
|
|
||||||
// exe and CUDA driver name
|
|
||||||
printf("\n");
|
|
||||||
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
|
|
||||||
char cTemp[128];
|
|
||||||
|
|
||||||
// driver version
|
|
||||||
sProfileString += ", CUDA Driver Version = ";
|
|
||||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
|
||||||
sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
|
||||||
#else
|
|
||||||
sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
|
|
||||||
#endif
|
|
||||||
sProfileString += cTemp;
|
|
||||||
|
|
||||||
// Runtime version
|
|
||||||
sProfileString += ", CUDA Runtime Version = ";
|
|
||||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
|
||||||
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
|
||||||
#else
|
|
||||||
sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
|
|
||||||
#endif
|
|
||||||
sProfileString += cTemp;
|
|
||||||
|
|
||||||
// Device count
|
|
||||||
sProfileString += ", NumDevs = ";
|
|
||||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
|
||||||
sprintf_s(cTemp, 10, "%d", deviceCount);
|
|
||||||
#else
|
|
||||||
sprintf(cTemp, "%d", deviceCount);
|
|
||||||
#endif
|
|
||||||
sProfileString += cTemp;
|
|
||||||
|
|
||||||
// Print Out all device Names
|
|
||||||
for (dev = 0; dev < deviceCount; ++dev)
|
|
||||||
{
|
|
||||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
|
||||||
sprintf_s(cTemp, 13, ", Device%d = ", dev);
|
|
||||||
#else
|
|
||||||
sprintf(cTemp, ", Device%d = ", dev);
|
|
||||||
#endif
|
|
||||||
cudaDeviceProp deviceProp;
|
|
||||||
cudaGetDeviceProperties(&deviceProp, dev);
|
|
||||||
sProfileString += cTemp;
|
|
||||||
sProfileString += deviceProp.name;
|
|
||||||
}
|
|
||||||
|
|
||||||
sProfileString += "\n";
|
|
||||||
printf("%s", sProfileString.c_str());
|
|
||||||
|
|
||||||
printf("Result = PASS\n");
|
|
||||||
|
|
||||||
// finish
|
|
||||||
// cudaDeviceReset causes the driver to clean up all state. While
|
|
||||||
// not mandatory in normal operation, it is good practice. It is also
|
|
||||||
// needed to ensure correct operation when the application is being
|
|
||||||
// profiled. Calling cudaDeviceReset causes all profile data to be
|
|
||||||
// flushed before the application exits
|
|
||||||
cudaDeviceReset();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,43 +0,0 @@
|
|||||||
{
  stdenv
, cudatoolkit
, cudaPackages
, autoAddDriverRunpath
, strace
}:

# Builds the cudainfo program from the sources in this directory and installs
# the resulting binary in $out/bin. The passthru.gpuCheck derivation runs the
# installed program on a builder that advertises the "cuda" system feature.
stdenv.mkDerivation (finalAttrs: {
  name = "cudainfo";
  src = ./.;
  buildInputs = [
    cudatoolkit # Required for nvcc
    cudaPackages.cuda_cudart.static # Required for -lcudart_static
    autoAddDriverRunpath
  ];
  installPhase = ''
    mkdir -p $out/bin
    cp -a cudainfo $out/bin
  '';
  passthru.gpuCheck = stdenv.mkDerivation {
    name = "cudainfo-test";
    requiredSystemFeatures = [ "cuda" ];
    dontBuild = true;
    nativeCheckInputs = [
      finalAttrs.finalPackage # The cudainfo package from above
      strace # When it fails, it will show the trace
    ];
    dontUnpack = true;
    doCheck = true;
    checkPhase = ''
      if ! cudainfo; then
        set -x
        cudainfo=$(command -v cudainfo)
        # The diagnostics are best effort: keep going if one of them fails so
        # that all of them end up in the build log.
        ldd "$cudainfo" || true
        readelf -d "$cudainfo" || true
        strace -f "$cudainfo" || true
        set +x
        # Fail explicitly instead of relying on the exit status of the
        # diagnostic commands above.
        echo "cudainfo failed, see the diagnostics above" >&2
        exit 1
      fi
    '';
    installPhase = "touch $out";
  };
})
|
|
||||||
36
pkgs/mpich-fix-hwtopo.patch
Normal file
36
pkgs/mpich-fix-hwtopo.patch
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
diff --git a/src/util/mpir_hwtopo.c b/src/util/mpir_hwtopo.c
|
||||||
|
index 33e88bc..ee3641c 100644
|
||||||
|
--- a/src/util/mpir_hwtopo.c
|
||||||
|
+++ b/src/util/mpir_hwtopo.c
|
||||||
|
@@ -200,18 +200,6 @@ int MPII_hwtopo_init(void)
|
||||||
|
#ifdef HAVE_HWLOC
|
||||||
|
bindset = hwloc_bitmap_alloc();
|
||||||
|
hwloc_topology_init(&hwloc_topology);
|
||||||
|
- char *xmlfile = MPIR_pmi_get_jobattr("PMI_hwloc_xmlfile");
|
||||||
|
- if (xmlfile != NULL) {
|
||||||
|
- int rc;
|
||||||
|
- rc = hwloc_topology_set_xml(hwloc_topology, xmlfile);
|
||||||
|
- if (rc == 0) {
|
||||||
|
- /* To have hwloc still actually call OS-specific hooks, the
|
||||||
|
- * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
|
||||||
|
- * file is really the underlying system. */
|
||||||
|
- hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
|
||||||
|
- }
|
||||||
|
- MPL_free(xmlfile);
|
||||||
|
- }
|
||||||
|
|
||||||
|
hwloc_topology_set_io_types_filter(hwloc_topology, HWLOC_TYPE_FILTER_KEEP_ALL);
|
||||||
|
if (!hwloc_topology_load(hwloc_topology))
|
||||||
|
|
||||||
|
--- a/src/mpi/init/local_proc_attrs.c
|
||||||
|
+++ b/src/mpi/init/local_proc_attrs.c
|
||||||
|
@@ -79,10 +79,6 @@ int MPII_init_local_proc_attrs(int *p_thread_required)
|
||||||
|
/* Set the number of tag bits. The device may override this value. */
|
||||||
|
MPIR_Process.tag_bits = MPIR_TAG_BITS_DEFAULT;
|
||||||
|
|
||||||
|
- char *requested_kinds = MPIR_pmi_get_jobattr("PMI_mpi_memory_alloc_kinds");
|
||||||
|
- MPIR_get_supported_memory_kinds(requested_kinds, &MPIR_Process.memory_alloc_kinds);
|
||||||
|
- MPL_free(requested_kinds);
|
||||||
|
-
|
||||||
|
return mpi_errno;
|
||||||
|
}
|
||||||
@@ -11,6 +11,10 @@ final: prev:
|
|||||||
paths = [ pmix.dev pmix.out ];
|
paths = [ pmix.dev pmix.out ];
|
||||||
};
|
};
|
||||||
in prev.mpich.overrideAttrs (old: {
|
in prev.mpich.overrideAttrs (old: {
|
||||||
|
patches = (old.patches or []) ++ [
|
||||||
|
# See https://github.com/pmodels/mpich/issues/6946
|
||||||
|
./mpich-fix-hwtopo.patch
|
||||||
|
];
|
||||||
buildInput = old.buildInputs ++ [
|
buildInput = old.buildInputs ++ [
|
||||||
libfabric
|
libfabric
|
||||||
pmixAll
|
pmixAll
|
||||||
@@ -52,16 +56,4 @@ final: prev:
|
|||||||
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
prometheus-slurm-exporter = prev.callPackage ./slurm-exporter.nix { };
|
||||||
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
meteocat-exporter = prev.callPackage ./meteocat-exporter/default.nix { };
|
||||||
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
upc-qaire-exporter = prev.callPackage ./upc-qaire-exporter/default.nix { };
|
||||||
cudainfo = prev.callPackage ./cudainfo/default.nix { };
|
|
||||||
|
|
||||||
amd-uprof = prev.callPackage ./amd-uprof/default.nix { };
|
|
||||||
|
|
||||||
# FIXME: Extend this to all linuxPackages variants. Open problem, see:
|
|
||||||
# https://discourse.nixos.org/t/whats-the-right-way-to-make-a-custom-kernel-module-available/4636
|
|
||||||
linuxPackages = prev.linuxPackages.extend (_final: _prev: {
|
|
||||||
amd-uprof-driver = _prev.callPackage ./amd-uprof/driver.nix { };
|
|
||||||
});
|
|
||||||
linuxPackages_latest = prev.linuxPackages_latest.extend(_final: _prev: {
|
|
||||||
amd-uprof-driver = _prev.callPackage ./amd-uprof/driver.nix { };
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,25 +1,19 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 AY8zKw /gmhFOFqOs8IobAImvQVKeM5Y6k0FpuR61/Cu5drVVI
|
-> ssh-ed25519 AY8zKw xeyzSqfio6SMS9SqywR+7II80D12Oha9T5zOgAIABSQ
|
||||||
g9FXJg2oIoien0zJ70FWHwSTM8SBwbpS188S3Swj7EM
|
ST26VaF2G1xv9l7d3jWKG32ssOivfwx+p9jLLV7ZFnU
|
||||||
-> ssh-ed25519 sgAamA opPjlWPhSiI0Rd5l7kd204S5FXFLcQcQftyKb7MDmnU
|
-> ssh-ed25519 sgAamA HrRx+x7NjXKVDaealWFo+Q8zMAdzoj6nTBxw0KMi3jE
|
||||||
3XrRDVnglCP+vBwvfd1rP5gHttsGDHyXwbf10a8/kKY
|
nlcEVTDTe1mPeS16/t9GYRnSSkm5EjpeiBZPIC/2f8U
|
||||||
-> ssh-ed25519 HY2yRg QKZbubM76C3tobPoyCFDRclA9Pzb2fC7s4WOoIgdORc
|
-> ssh-ed25519 HY2yRg NDp5vUeX35rDV78DFQi9fsc71pQNVE8YQ1StCp+YjTg
|
||||||
K5kckU0KhQFTE6SikJXFJgM41Tco5+VqOsaG0qLrY1Q
|
MdUAWHd1k6Jed2pp7Wct/DgF6ShqXFwNxPaXeBOLAcs
|
||||||
-> ssh-ed25519 fw2Xhg +ohqts8dLFjvdHxrGHcOGxU0dm+V3N//giljHkobpDM
|
-> ssh-ed25519 tcumPQ d0zVVB8t7W9KUapOsnsrvpAj7LgM9zS0yCv8SQnF0g8
|
||||||
jR/UzGrfS9lrJ/VeolKLxfzeJAf2fIB2pdIn/6ukqNk
|
aAPaWRTEBEQgmCkRG69NuWZ/lEva7vH+L8ifQSE0Z1I
|
||||||
-> ssh-ed25519 tcumPQ 3DPkDPIQQSVtXSLzIRETsIyXQ0k1o18Evn6vf+l/6R8
|
-> ssh-ed25519 JJ1LWg 4l8GZNdGOSbqKvmKq1q1aPvjeQIwpgbJj4DBYBse7x4
|
||||||
bLXF62OmJjnOT1vvgq3+AcOKKSG5NonrK5EqCVc0Mwo
|
rNhTiZlwzyOiCLzYRSzJ5AHebbv94dOgl1UyNmDJD8A
|
||||||
-> ssh-ed25519 JJ1LWg 2Wefc7eLolMU5InEmCNTq21Mf71mI0a2N1HgDrlHvy4
|
-> ssh-ed25519 CAWG4Q vGhwJDLJIAU8BpV6GP8Dnz2pvTAMufY4v4nvrr2O9yw
|
||||||
qXFW9CQBnrzubZ0mzS0Io2WGRrwGBkmeYndBTcZn/fM
|
hNZZFDYUMPQNM5+qcc5arIgqQw0PXuqq1WWDTpE+EHo
|
||||||
-> ssh-ed25519 cDBabA oiH36AoIt/fFFYgnoxtH7OoetP+2/wjtn8qo3RJDSHc
|
-> ssh-ed25519 xA739A 8eEi9S5dMWPVR4fKVZdV5eHBOJVf2Ap+3qHSYtYHYgc
|
||||||
qKmkxy1aZGP4ZwC0iH7n7hiJ0+rFQYvjQb5O1a1Z0r4
|
GcgzvJiqsNyZTVk12Z0FEnqB4LgfQ1xjKQwXdto1Hjs
|
||||||
-> ssh-ed25519 cK5kHw bX3RtO5StMejUYWAaA37fjHA5nO7Xs1vWDQk3yOjs2o
|
-> ssh-ed25519 MSF3dg oUY9IjDR6hi1qbrCV5z5IcYj85cMppxO94iqkD60Eww
|
||||||
Egxmcf8FKAd+E5hMLmhV1yQsCo5rJyUazf1szOvpTAM
|
cBzFGrhh+kWjIi0llw2RqACU1pa7XT9kqWkSeAY8VGI
|
||||||
-> ssh-ed25519 CAWG4Q oKqqRDJH0w8lsoQBQk0w8PO+z5gFNmSaGBUSumvDp1I
|
--- q7AaMOj7ZaS+Mf6trWK56o/1q/c2urrQBPAqk4PtATA
|
||||||
m1zWp9MfViAmtpbJhqOHraIokDaPKb0DvvO4vAGCTWI
|
~<7E><>k-/<2F>Xw<58><77><1D>V<EFBFBD>(<18><>Z<EFBFBD>d\<5C><>t<EFBFBD>'q<><71><EFBFBD>3<EFBFBD><33>R<EFBFBD>a\yFW<46>
|
||||||
-> ssh-ed25519 xA739A G26kPOz6sbFATs+KAr7gbDvji13eA1smFusQAOJXMwA
|
|
||||||
Sppvz7A103kZoNxoGsd6eXeCvVh7mBE2MRwLFj9O1dY
|
|
||||||
-> ssh-ed25519 MSF3dg 55ekNcp+inbUd+GQ/VZ7BoBASaJ8YDqF74CVXy1PUxQ
|
|
||||||
aTHLLAbzQPWWld/OT3BKebc6FcmsqMTaWCPBGm1UHic
|
|
||||||
--- mVkAMnI9XQhS3fMiFuuXP/yLR9wEG9+Rr8pA4Uc0avY
|
|
||||||
<04>DU <20><>s<EFBFBD><73><EFBFBD><EFBFBD>j<EFBFBD><6A>M<EFBFBD><4D>$<24>[<5B>M<EFBFBD><4D><EFBFBD><03>[_<>K7s<37>ju<>v<EFBFBD>D<EFBFBD>4<EFBFBD>g<EFBFBD><67>܄3<>Gn<47><6E><EFBFBD> ɽ<>P<EFBFBD>7~rZs<><73>
|
|
||||||
@@ -1,13 +1,11 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 HY2yRg gKGxsjHfpiRDQ6Tuvcx7pjKgrVUGweotuplLYwCGvik
|
-> ssh-ed25519 HY2yRg WUMWvyagPalsy7u1RaEFAwJvFowso1/quNBo+nAkxhQ
|
||||||
DSz9j/stVyB1lXpVP+kg+H+RDgSftREGFFLQZClC3kI
|
OHcebB7koPKhy58A6qngEVNWckkWChyEK3dwgy8EL5o
|
||||||
-> ssh-ed25519 cK5kHw 17DpKekfNVy4V742QSd61r2w6iawtOJR7Ct3UflDXio
|
-> ssh-ed25519 CAWG4Q Yx/HLIryUNE2BaqTl84FrNRy4XLCY2TRkRgbA9k3qU4
|
||||||
hsqTEPCYjHKvndMWPl4GpG23CzjGgVrS+cLIymISJHU
|
LZljfuLS5yMVVK6N57iC6cKEaFP6Hh2OkvWJjuFg8q0
|
||||||
-> ssh-ed25519 CAWG4Q oK01d4pbBqEZVsymSiKijPvJo714xsMSRMbzkssJKiw
|
-> ssh-ed25519 xA739A DOXjPRttSWz51Sr7KfjgKfAtaIYMo3foB1Ywqw9HYDY
|
||||||
hs0tVFkqtIHXg9jtC2iDgCtefFcWvGJkXB+HJUcqXQs
|
CA5puXK/1HDOitA2XHBI3OdKmZ7BzHst4DyuWGMC6hE
|
||||||
-> ssh-ed25519 xA739A KxO+AawfLMERHwzt3YnZRwPFlCfGETma7fo8M+ZtsAY
|
-> ssh-ed25519 MSF3dg +2LetdIiIZUk7wtHNS1tYsLo4ypwqZ9gpg77RQrnzHU
|
||||||
eSn0+/rhLQxNKt5xKubKck8Nxun2Sh3eJqBU/hwgzZM
|
yIUu8BVbF3dhUx3531RR50/cJQd9gd8VfKUQzEeT/iQ
|
||||||
-> ssh-ed25519 MSF3dg OyaZBLB2kO8fU139lXbbC404gT7IzIWk+BMhYzabBDg
|
--- oY/wQ+RjZO2CmKZtbQ0yOVZ5fv2+AlvvkRu1UDfCNAA
|
||||||
/fiPFfBJcb+e40+fZbwCw7niF2hh+JxUPiKSiwUSOWg
|
_8`G<>=C7@x&<26><>\Ft<46>)<29><><EFBFBD><EFBFBD><EFBFBD>cPe<50><65>%<25>ֽ[zX-0<>[<11><><EFBFBD>ɲ<><C9B2>tz<74><7A>;%<25><><EFBFBD><EFBFBD><EFBFBD>~<7E>H0<48>*XD<58>;<3B><>
|
||||||
--- ycZyGX+Li+LsOuweF9OVPl8aoMaRgp/RdFbDrPszkUs
|
|
||||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>YM<EFBFBD><EFBFBD>:E O<><4F>2<EFBFBD>r=<15>&4<><04>CQΣ<51><CEA3>hC<68><43><EFBFBD>cb<63>^Sy<53><79>% <09><>x-vC`g<><15><><EFBFBD><EFBFBD>W^<5E><>wVG<0B><><EFBFBD>
|
|
||||||
Binary file not shown.
@@ -1,13 +1,11 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 HY2yRg U2KQWviZIVNemm9e8h7H+eOzoYNxXgLLS3hsZLMAuGk
|
-> ssh-ed25519 HY2yRg 6C5Cv7ILdBrpMkCTT/insUY0kyQWbfgU500Ai8ePOXY
|
||||||
6n5dH1McNzk3rscP4v2pqZYDWtUFMd15rZsEd/mqIFM
|
tMw6ehFrsq2dvDEXkLOJwrNZfI28trlr9uy3xW/fzpA
|
||||||
-> ssh-ed25519 cK5kHw Ebrj/cpz1cFWAYAV9OxgyyH85OEMUnfUIV66p7jaoFY
|
-> ssh-ed25519 CAWG4Q x/j+364IYURgt7fhIPBzabbWMEg08nX8MRrJM/1Q6RU
|
||||||
6J7hWqODtS/fIF4BpxhxbrxZq5vbolvbLqRKqazT02M
|
AL5Ut2rDr3UXcQXMZJ53ZMf5wMHmT83whx0ntJfW/WU
|
||||||
-> ssh-ed25519 CAWG4Q mXqoQH9ycHF7u0y8mazCgynHxNLxTnrmQHke+2a5QCc
|
-> ssh-ed25519 xA739A QjXftBsoGV1rVeHSKcsjp+HMpRVsaHOeeGdDcF6ZWg4
|
||||||
mq6PdSF+KOqthuXwzTCsOQsi5KG0z1wHUck+bSTyOBY
|
ovVoYPaPn3liGPAxHWY37CBIUFjAXurv6jMWs2He3HQ
|
||||||
-> ssh-ed25519 xA739A TADeswueqDEroZWLjMw3RDNwVQ2xRD+JUMVZENovn0M
|
-> ssh-ed25519 MSF3dg FG0CQOj9fRlneW5QrWiy5ksRpicUwHqX9QMpZWhDImw
|
||||||
KFlnSjVFbjc+ZsbY8Ed7edC5B01TJGzd/dSryiLArPc
|
L20n1vZRepsRPT4xM6TO6PcI/MJxw4mBLUF0EPv9Uhs
|
||||||
-> ssh-ed25519 MSF3dg Pq+ZD8AqJGDHDbd4PO1ngNFST8+6C2ghZkO/knKzzEc
|
--- DEi7iuzkniq0JPatJ5f2KhrhxWid7ojHpvNfUCGxFtk
|
||||||
wyiL/u38hdQMokmfTsBrY7CtYwc+31FG4EDaqVEn31U
|
<EFBFBD><EFBFBD>% n<><6E>!;^Q<>rqG<71>:<3A>jC.8l<38>|<7C><>o<EFBFBD><1E><>$LYy<59>N<EFBFBD>b<EFBFBD><1E><>:<14>{<7B><><EFBFBD>fާxTS\<5C>t<04>U<EFBFBD><55>\F<>)%<25><><EFBFBD>KL<4B>㙇p<E39987>:><3E><><EFBFBD><EFBFBD>&<1B>)<0B>Q<EFBFBD>1<>H܃V<DC83>Sޑ<53>n<>
|
||||||
--- 1z4cOipayh0zYkvasEVEvGreajegE/dqBV7b6E7aFh0
|
|
||||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>R<EFBFBD>@<40>/i<>I'<27><><EFBFBD>Nx<4E>r"<1D>`<1E>O<EFBFBD><4F><EFBFBD>y<><79>8<EFBFBD><38> \/<2F><>I<19><17>D<EFBFBD>`<60>ߓ<EFBFBD><DF93><EFBFBD><1E><04>uy<75><79><EFBFBD>:9Lt<4C><1D><><EFBFBD>؋<EFBFBD><D88B><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>AU<41><55><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>`<60>;<3B>q8<71>GLU#<23>i<EFBFBD>y<EFBFBD><79>i<03>ڜ
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -2,9 +2,6 @@ let
|
|||||||
keys = import ../keys.nix;
|
keys = import ../keys.nix;
|
||||||
adminsKeys = builtins.attrValues keys.admins;
|
adminsKeys = builtins.attrValues keys.admins;
|
||||||
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
hut = [ keys.hosts.hut ] ++ adminsKeys;
|
||||||
fox = [ keys.hosts.fox ] ++ adminsKeys;
|
|
||||||
apex = [ keys.hosts.apex ] ++ adminsKeys;
|
|
||||||
raccoon = [ keys.hosts.raccoon ] ++ adminsKeys;
|
|
||||||
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
mon = [ keys.hosts.hut keys.hosts.tent ] ++ adminsKeys;
|
||||||
tent = [ keys.hosts.tent ] ++ adminsKeys;
|
tent = [ keys.hosts.tent ] ++ adminsKeys;
|
||||||
# Only expose ceph keys to safe nodes and admins
|
# Only expose ceph keys to safe nodes and admins
|
||||||
@@ -27,8 +24,4 @@ in
|
|||||||
|
|
||||||
"ceph-user.age".publicKeys = safe;
|
"ceph-user.age".publicKeys = safe;
|
||||||
"munge-key.age".publicKeys = safe;
|
"munge-key.age".publicKeys = safe;
|
||||||
|
|
||||||
"wg-fox.age".publicKeys = fox;
|
|
||||||
"wg-apex.age".publicKeys = apex;
|
|
||||||
"wg-raccoon.age".publicKeys = raccoon;
|
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -1,13 +1,13 @@
|
|||||||
age-encryption.org/v1
|
age-encryption.org/v1
|
||||||
-> ssh-ed25519 G5LX5w 1KfTmTRP3iSdcclf/FuIpFWpy1tgKs5ED+qSYWo7inY
|
-> ssh-ed25519 G5LX5w V9bHLoGuY4stRwbzVS9Qa0L9yoY+UoCoXc+dJJQW/Ag
|
||||||
RX6Q1nLFF/yiVLpkWrl0BI0PpLoBi753+y8l/AXjNE4
|
2ut9GfdJ3KBCqZRaloZCQsl8MLfaZAZxqj6JtPJzu2k
|
||||||
-> ssh-ed25519 cK5kHw TP7+OQpQSNuyArnUo1C97J3P3oB0YtzCEPeVvlzsYHE
|
-> ssh-ed25519 CAWG4Q OAqnIfMECpKglZ7aF9tv/PQinG1Ou2+IEZ+nf4dtQjg
|
||||||
Bsy5KPNHTVNHnF1sxOvlfJq3CNMVFaXdYkRG2vSj7qM
|
dANdMLe4iI0d6Xd/dIMpZK+mgw2+VmJFQScHaIxD7WI
|
||||||
-> ssh-ed25519 CAWG4Q eQyzwNaH6CfaYIjs8abEuQxt6vxRXsGz69UletMUVDE
|
-> ssh-ed25519 xA739A nVNF4Y6VSa5PP6FFBJpVmoFYYseoFx5F2wJU+Pwk+Xk
|
||||||
FDcynPO7xg4PWez5Z8gTg5LyE0Wgb3zT9i3Kon67QsU
|
A5CiuTSNlX9Y76qhYgblBdJl3zPhtjWho2oL5/sIKu0
|
||||||
-> ssh-ed25519 xA739A 2JuLai2fUu3dZBydS8cMrLrEUIUkz4NNaiupoBOtTwU
|
-> ssh-ed25519 MSF3dg /WMsGnBGzquIMyw06gHKpSS4OUxheulT59kxi+/pxxU
|
||||||
sdM3X+XRzysop7yqa76Z7FAwTHOj91STCtZvfIgCdB0
|
ppwcv7RLzUbQUM7j0Tb9rRVT9XyPMhqYr2fr4S0nTJY
|
||||||
-> ssh-ed25519 MSF3dg fSPkiWnpInX1V5p3afPCoPotcGFoWFiOMPThtY927lc
|
--- zOe0Ko0oxArbmxePMPDVAT0pDju7IeOAih7sNrDcoVs
|
||||||
8v7E/3l0xA2VWZPXzkN4NmnaA0KJutLMurn/ZXZmhxA
|
i<EFBFBD>k<EFBFBD>A
|
||||||
--- MQkyBx9hT4ILYXKoZT18PWny1QbDFymcZr63zjMN/qQ
|
hODV<44>w!<21><0C><>E݈<45><DD88>+<2B><>`<60><><EFBFBD><EFBFBD>C<><43>5<EFBFBD>L<EFBFBD>A<EFBFBD>t<1A>M^<01>E<<1B>HI<48>_<EFBFBD>nn<6E><6E><EFBFBD>o<EFBFBD>?<3F>j-<05>
|
||||||
-b<>#<23><>M.<16>@<40>t<EFBFBD><74><EFBFBD>ŵ}+ό#@<40><><EFBFBD><EFBFBD><EFBFBD>k<EFBFBD>y<EFBFBD><79><EFBFBD>?v<><76>n<1F><>T<EFBFBD>+<2B><><EFBFBD>[<5B>Q<EFBFBD> gA<67><41><EFBFBD>
|
A<1B>nԔί<1B>>Z<><5A>z<EFBFBD><7A><EFBFBD>dT<64><54>b"<22>(@<40><>{_ځC
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,14 +0,0 @@
|
|||||||
age-encryption.org/v1
|
|
||||||
-> ssh-ed25519 cDBabA heyW9/cxgwFX9IexQIXjAQDWGQPNcMXcArQp2Rxsqx4
|
|
||||||
o9MQ7EH8PDDjsJdpH9F3Xq2zUoaDAJQlfFmYucSFs6Y
|
|
||||||
-> ssh-ed25519 cK5kHw Sza4pos7K3qW3omEeyidI/jszJNf9smemSZnUJfCIww
|
|
||||||
D6vazXki7hIYraIuSiGPS+FPbkFUwHhHWDf52OhEIMg
|
|
||||||
-> ssh-ed25519 CAWG4Q YexIHueOIMmIN8JIDyNUOKBkyz/k18HqV3hTXh48KlM
|
|
||||||
xh8UJzzWT6ByN+Dpn4JrMNsjGC/uc/v6LynwjBDz9NQ
|
|
||||||
-> ssh-ed25519 xA739A KySG3TXdqfCMUkVEDGa74B0op745s3XGYxFLyAXSQAc
|
|
||||||
5EI/yb5ctW9Qu18bHm3/sK97kwGcKzzmWvPSCWm89XA
|
|
||||||
-> ssh-ed25519 MSF3dg MNxnNj0fHmri8ophexXPNjRUBUWrzcuk5S1mucxUMTE
|
|
||||||
GVFWXtISEU8ZmlwL4nh4weAgfGrt2GHX0DTzbpS6zg8
|
|
||||||
--- UdrqkYG2ZApAuwdZeNhC50NP2rkD/Ol6y8nJa4RHx7Y
|
|
||||||
<EFBFBD>ܻ<EFBFBD>m(<28><><EFBFBD>><3E>H<48>Y87<><37>G<0F>+*<12><><EFBFBD><EFBFBD>9V<>.<2E><><EFBFBD><EFBFBD><03><><EFBFBD>p<EFBFBD>Oo<4F>=+哇<>P0<50><30>{<7B>)<29><17><><EFBFBD><EFBFBD>><3E>z3P^
|
|
||||||
u
|
|
||||||
Binary file not shown.
@@ -21,28 +21,17 @@ the detailed specifications:
|
|||||||
|
|
||||||
## Access
|
## Access
|
||||||
|
|
||||||
To access the machine, request a SLURM session from [apex](/apex) using the `fox`
|
To access the machine, request a SLURM session from [hut](/hut) using the `fox`
|
||||||
partition. If you need the machine for performance measurements, use an
|
partition:
|
||||||
exclusive reservation:
|
|
||||||
|
|
||||||
apex% salloc -p fox --exclusive
|
hut% salloc -p fox
|
||||||
|
|
||||||
Otherwise, specify the CPUs that you need so other users can also use the node
|
Then connect via ssh:
|
||||||
at the same time:
|
|
||||||
|
|
||||||
apex% salloc -p fox -c 8
|
hut% ssh fox
|
||||||
|
|
||||||
Then use srun to execute an interactive shell:
|
|
||||||
|
|
||||||
apex% srun --pty $SHELL
|
|
||||||
fox%
|
fox%
|
||||||
|
|
||||||
Make sure you get all CPUs you expect:
|
Follow [these steps](/access) if you don't have access to hut or fox.
|
||||||
|
|
||||||
fox% grep Cpus_allowed_list /proc/self/status
|
|
||||||
Cpus_allowed_list: 0-191
|
|
||||||
|
|
||||||
Follow [these steps](/access) if you don't have access to apex or fox.
|
|
||||||
|
|
||||||
## CUDA
|
## CUDA
|
||||||
|
|
||||||
@@ -96,22 +85,13 @@ Then just run `nix develop` from the same directory:
|
|||||||
Cuda compilation tools, release 12.4, V12.4.99
|
Cuda compilation tools, release 12.4, V12.4.99
|
||||||
Build cuda_12.4.r12.4/compiler.33961263_0
|
Build cuda_12.4.r12.4/compiler.33961263_0
|
||||||
|
|
||||||
## AMD uProf
|
|
||||||
|
|
||||||
The [AMD uProf](https://www.amd.com/en/developer/uprof.html) performance
|
|
||||||
analysis tool-suite is installed and ready to use.
|
|
||||||
|
|
||||||
See the [AMD uProf user guide](https://docs.amd.com/r/en-US/57368-uProf-user-guide)
|
|
||||||
([PDF backup for v5.1](https://jungle.bsc.es/pub/57368-uprof-user-guide.pdf))
|
|
||||||
for more details on how to use the tools. To use the GUI make sure that you
|
|
||||||
connect to fox using X11 forwarding.
|
|
||||||
|
|
||||||
## Filesystems
|
## Filesystems
|
||||||
|
|
||||||
The machine has several file systems available.
|
The machine has several file systems available.
|
||||||
|
|
||||||
- `/nfs/home`: The `/home` from apex via NFS, which is also shared with other
|
- `$HOME`: Mounted via NFS across all nodes. It is slow and has low capacity.
|
||||||
xeon machines. It has about 2 ms of latency, so it is not suitable for quick random
|
Don't abuse it.
|
||||||
access.
|
- `/ceph/home/$USER`: Shared Ceph file system across jungle nodes. Slow but high
|
||||||
|
capacity. Stores three redundant copies of every file.
|
||||||
- `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
|
- `/nvme{0,1}/$USER`: The two local NVME disks, very fast and large capacity.
|
||||||
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
|
- `/tmp`: tmpfs, fast but not backed by a disk. Will be erased on reboot.
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Update 2025-09-26"
|
|
||||||
author: "Rodrigo Arias Mallo"
|
|
||||||
date: 2025-09-26
|
|
||||||
---
|
|
||||||
|
|
||||||
This is a summary of notable changes introduced in the last two years. We
|
|
||||||
continue to keep all machines updated to the latest NixOS release (currently
|
|
||||||
NixOS 25.05).
|
|
||||||
|
|
||||||
### New compute node: fox
|
|
||||||
|
|
||||||
We have a new [fox machine](/fox), with two AMD Genoa 9684X CPUs and two NVIDIA
|
|
||||||
RTX4000 GPUs. Over the last few months we have been running some tests and it seems
|
|
||||||
that most of the components work well. We have configured CUDA to use the NVIDIA
|
|
||||||
GPUs, as well as AMD uProf to trace performance and energy counters from the
|
|
||||||
CPUs.
|
|
||||||
|
|
||||||
### Upgraded login node: apex
|
|
||||||
|
|
||||||
We have upgraded the operating system on the login node to NixOS, which now runs
|
|
||||||
Linux 6.15.6. During the upgrade, we have detected a problem with the storage
|
|
||||||
disks. The `/` and `/home` partitions sit on a
|
|
||||||
[RAID 5](https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5),
|
|
||||||
transparently handled by a RAID hardware controller which starts its own
|
|
||||||
firmware before passing the control to the BIOS to continue the boot sequence. A
|
|
||||||
problem during the startup of the firmware prevented the node from even reaching the
|
|
||||||
BIOS screen.
|
|
||||||
|
|
||||||
After a long debugging session, we detected that the flash memory that stores
|
|
||||||
the firmware of the hardware controller was likely to be the issue, since
|
|
||||||
[memory cells](https://en.wikipedia.org/wiki/Flash_memory#Principles_of_operation)
|
|
||||||
may lose charge over time and can end up corrupting the content. We flashed
|
|
||||||
the latest firmware so the memory cells are charged again with the new bits and
|
|
||||||
that fixed the problem. Hopefully we will be able to use it for some more years.
|
|
||||||
|
|
||||||
The SLURM server has been moved to apex, which allows users to also submit jobs
|
|
||||||
to fox.
|
|
||||||
|
|
||||||
### Migrated machines to BSC building
|
|
||||||
|
|
||||||
The server room had a temperature issue that had been affecting our machines
|
|
||||||
since the end of February of 2025. As the summer approached, the temperature
|
|
||||||
exceeded the safe limits for our hardware, so we had to shut down the cluster.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
Since then, we have moved the cluster to BSC premises, where it now rests at a
|
|
||||||
stable temperature, so hopefully we won't have more unscheduled downtime.
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 97 KiB |
Reference in New Issue
Block a user