Lower the system noise in owl nodes #24

Open
opened 2023-06-30 15:13:36 +02:00 by rarias · 3 comments
rarias commented 2023-06-30 15:13:36 +02:00 (Migrated from pm.bsc.es)

We have some noisy processes that are stealing the CPU:

owl2$ uptime
 15:11:18  up 8 days 21:37,  1 user,  load average: 0,00, 0,00, 0,00
owl2$ ps -A --sort -time | head
    PID TTY          TIME CMD
  71059 ?        00:13:26 kworker/0:0-rcu_gp
   1623 ?        00:08:49 systemd-oomd
  74028 ?        00:03:44 kworker/0:1-events
     16 ?        00:01:44 rcu_preempt
   1628 ?        00:01:17 systemd-timesyn
   1848 ?        00:01:14 nsncd
    375 ?        00:00:55 kworker/3:1-events
      1 ?        00:00:52 systemd
    363 ?        00:00:21 kcompactd0
We have some noisy processes that are stealing the CPU: ``` owl2$ uptime 15:11:18 up 8 days 21:37, 1 user, load average: 0,00, 0,00, 0,00 owl2$ ps -A --sort -time | head PID TTY TIME CMD 71059 ? 00:13:26 kworker/0:0-rcu_gp 1623 ? 00:08:49 systemd-oomd 74028 ? 00:03:44 kworker/0:1-events 16 ? 00:01:44 rcu_preempt 1628 ? 00:01:17 systemd-timesyn 1848 ? 00:01:14 nsncd 375 ? 00:00:55 kworker/3:1-events 1 ? 00:00:52 systemd 363 ? 00:00:21 kcompactd0 ```
rarias commented 2023-06-30 15:13:50 +02:00 (Migrated from pm.bsc.es)

changed the description

changed the description
rarias commented 2023-06-30 15:18:40 +02:00 (Migrated from pm.bsc.es)

The systemd-oomd daemon can be disabled, as we don't need it for these nodes.

nsncd seems to be a local name-service daemon, which is trying to resolve the NTP server:

[pid  1855] connect(6, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("8.8.8.8")}, 16) = -1 ENETUNREACH (Network is unreachable)
[pid  1855] close(6)                    = 0
[pid  1855] socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, IPPROTO_IP) = 6
[pid  1855] setsockopt(6, SOL_IP, IP_RECVERR, [1], 4) = 0
[pid  1855] connect(6, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("8.8.8.8")}, 16) = -1 ENETUNREACH (Network is unreachable)
[pid  1855] close(6)                    = 0
[pid  1855] write(4, "\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 24) = 24
[pid  1855] shutdown(4, SHUT_RDWR)      = 0
[pid  1855] close(4)                    = 0
[pid  1855] sched_yield()               = 0
[pid  1855] sched_yield()               = 0
[pid  1855] sched_yield()               = 0
[pid  1855] sched_yield()               = 0
[pid  1855] futex(0x5607b61dd5d8, FUTEX_WAIT_BITSET_PRIVATE, 4294967295, NULL, FUTEX_BITSET_MATCH_ANY <unfinished ...>
[pid  1860] <... accept4 resumed>{sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC) = 5
[pid  1860] futex(0x5607b61dcee8, FUTEX_WAKE_PRIVATE, 1) = 1
[pid  1860] accept4(3,  <unfinished ...>
[pid  1852] <... futex resumed>)        = 0
[pid  1852] recvfrom(5, "\2\0\0\0\16\0\0\0\25\0\0\0003.nixos.pool.ntp.org"..., 4096, 0, NULL, NULL) = 33
[pid  1852] newfstatat(AT_FDCWD, "/etc/nsswitch.conf", {st_mode=S_IFREG|0444, st_size=213, ...}, 0) = 0
[pid  1852] newfstatat(AT_FDCWD, "/etc/resolv.conf", {st_mode=S_IFREG|0644, st_size=59, ...}, 0) = 0
[pid  1852] rt_sigprocmask(SIG_BLOCK, [HUP USR1 USR2 PIPE ALRM CHLD TSTP URG VTALRM PROF WINCH IO], [], 8) = 0
[pid  1852] openat(AT_FDCWD, "/run/systemd/machines/3.nixos.pool.ntp.org", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
[pid  1852] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid  1852] openat(AT_FDCWD, "/etc/hosts", O_RDONLY|O_CLOEXEC) = 6
[pid  1852] newfstatat(6, "", {st_mode=S_IFREG|0444, st_size=1798, ...}, AT_EMPTY_PATH) = 0
[pid  1852] lseek(6, 0, SEEK_SET)       = 0
[pid  1852] read(6, "127.0.0.1 localhost\n\n127.0.0.2 o"..., 4096) = 1798
[pid  1852] read(6, "", 4096)           = 0
[pid  1852] close(6)                    = 0
[pid  1852] rt_sigprocmask(SIG_BLOCK, [HUP USR1 USR2 PIPE ALRM CHLD TSTP URG VTALRM PROF WINCH IO], [], 8) = 0
[pid  1852] uname({sysname="Linux", nodename="owl2", ...}) = 0
[pid  1852] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid  1852] socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, IPPROTO_IP) = 6
[pid  1852] setsockopt(6, SOL_IP, IP_RECVERR, [1], 4) = 0
[pid  1852] connect(6, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("8.8.8.8")}, 16) = -1 ENETUNREACH (Network is unreachable)

This will be addressed by !14, but we can consider switching to PTP (see #14) if that lowers the CPU noise. The same applies to the timesync daemon.

As for the kworker rcu_gp and events threads, I'm not sure what they are doing.

The systemd-oomd daemon can be disabled, as we don't need it for these nodes. The nsncd seems to be a local DNS server, which is trying to resolve the NTP server: ``` [pid 1855] connect(6, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("8.8.8.8")}, 16) = -1 ENETUNREACH (Network is unreachable) [pid 1855] close(6) = 0 [pid 1855] socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, IPPROTO_IP) = 6 [pid 1855] setsockopt(6, SOL_IP, IP_RECVERR, [1], 4) = 0 [pid 1855] connect(6, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("8.8.8.8")}, 16) = -1 ENETUNREACH (Network is unreachable) [pid 1855] close(6) = 0 [pid 1855] write(4, "\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 24) = 24 [pid 1855] shutdown(4, SHUT_RDWR) = 0 [pid 1855] close(4) = 0 [pid 1855] sched_yield() = 0 [pid 1855] sched_yield() = 0 [pid 1855] sched_yield() = 0 [pid 1855] sched_yield() = 0 [pid 1855] futex(0x5607b61dd5d8, FUTEX_WAIT_BITSET_PRIVATE, 4294967295, NULL, FUTEX_BITSET_MATCH_ANY <unfinished ...> [pid 1860] <... accept4 resumed>{sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC) = 5 [pid 1860] futex(0x5607b61dcee8, FUTEX_WAKE_PRIVATE, 1) = 1 [pid 1860] accept4(3, <unfinished ...> [pid 1852] <... 
futex resumed>) = 0 [pid 1852] recvfrom(5, "\2\0\0\0\16\0\0\0\25\0\0\0003.nixos.pool.ntp.org"..., 4096, 0, NULL, NULL) = 33 [pid 1852] newfstatat(AT_FDCWD, "/etc/nsswitch.conf", {st_mode=S_IFREG|0444, st_size=213, ...}, 0) = 0 [pid 1852] newfstatat(AT_FDCWD, "/etc/resolv.conf", {st_mode=S_IFREG|0644, st_size=59, ...}, 0) = 0 [pid 1852] rt_sigprocmask(SIG_BLOCK, [HUP USR1 USR2 PIPE ALRM CHLD TSTP URG VTALRM PROF WINCH IO], [], 8) = 0 [pid 1852] openat(AT_FDCWD, "/run/systemd/machines/3.nixos.pool.ntp.org", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) [pid 1852] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 [pid 1852] openat(AT_FDCWD, "/etc/hosts", O_RDONLY|O_CLOEXEC) = 6 [pid 1852] newfstatat(6, "", {st_mode=S_IFREG|0444, st_size=1798, ...}, AT_EMPTY_PATH) = 0 [pid 1852] lseek(6, 0, SEEK_SET) = 0 [pid 1852] read(6, "127.0.0.1 localhost\n\n127.0.0.2 o"..., 4096) = 1798 [pid 1852] read(6, "", 4096) = 0 [pid 1852] close(6) = 0 [pid 1852] rt_sigprocmask(SIG_BLOCK, [HUP USR1 USR2 PIPE ALRM CHLD TSTP URG VTALRM PROF WINCH IO], [], 8) = 0 [pid 1852] uname({sysname="Linux", nodename="owl2", ...}) = 0 [pid 1852] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 [pid 1852] socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, IPPROTO_IP) = 6 [pid 1852] setsockopt(6, SOL_IP, IP_RECVERR, [1], 4) = 0 [pid 1852] connect(6, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("8.8.8.8")}, 16) = -1 ENETUNREACH (Network is unreachable) ``` This will be addressed by !14 , but we can consider switching to PTP (see #14) if that lowers the CPU noise. Same for the timesync daemon. Now, the kworker rcu_gp and events, I'm not sure what they are doing.
rarias commented 2023-06-30 16:27:51 +02:00 (Migrated from pm.bsc.es)

As discussed with Aleix, the "rcu_gp" in kworker/0:0-rcu_gp seems to stand for "RCU grace period". This may be related to the stall detector.

As discussed with Aleix, the "rcu_gp" in kworker/0:0-rcu_gp seems to stand for "RCU grace period". This may be related to the [stall detector](https://docs.kernel.org/RCU/stallwarn.html).
Sign in to join this conversation.
No Milestone
No project
No Assignees
1 Participant
Notifications
Due Date
The due date is invalid or out of range. Please use the format 'yyyy-mm-dd'.

No due date set.

Dependencies

No dependencies set.

Reference: rarias/jungle#24
No description provided.