Skip to content

Commit ebfb257

Browse files
authored
fix: instruction telemetry requires many file descriptors on large systems (#673)
Signed-off-by: Harper, Jason M <jason.m.harper@intel.com>
1 parent e1f4db3 commit ebfb257

1 file changed

Lines changed: 34 additions & 0 deletions

File tree

internal/script/scripts.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,8 @@ turbostat -i $interval $count
14921492
duration={{.Duration}}
14931493
pid={{.InstrMixPID}}
14941494
1495+
min_fd_limit=4096
1496+
14951497
cleanup_done=0
14961498
14971499
finalize() {
@@ -1514,6 +1516,38 @@ finalize() {
15141516
}
15151517
trap finalize INT TERM EXIT
15161518
1519+
# processwatch keeps perf_event file descriptors open for every CPU it
# monitors. On large systems the default soft open-file limit may not cover
# them, and libbpf then fails with "Too many open files" while attaching
# insn_collect. Verify (and if possible raise) the soft limit before going on.

# Online CPU count: try getconf, then nproc, and fall back to 1.
cpu_count=$(getconf _NPROCESSORS_ONLN 2>/dev/null || nproc 2>/dev/null || echo 1)

# Required limit is 2 * CPUs + 128, but never less than min_fd_limit.
required_fd_limit=$((cpu_count * 2 + 128))
[ "$required_fd_limit" -lt "$min_fd_limit" ] && required_fd_limit=$min_fd_limit

# An "unlimited" soft limit always suffices; record it as the required value
# so the numeric comparisons below stay valid.
current_fd_limit=$(ulimit -Sn)
case "$current_fd_limit" in
  unlimited) current_fd_limit=$required_fd_limit ;;
esac

if [ "$current_fd_limit" -lt "$required_fd_limit" ]; then
  # Try to raise the soft limit only when the hard limit leaves room for it.
  # A failed attempt is tolerated here: the limit is re-read and re-checked.
  hard_fd_limit=$(ulimit -Hn)
  if [ "$hard_fd_limit" = "unlimited" ] || [ "$hard_fd_limit" -ge "$required_fd_limit" ]; then
    ulimit -Sn "$required_fd_limit" 2>/dev/null || true
  fi

  # Re-read the effective soft limit, normalising "unlimited" as before.
  current_fd_limit=$(ulimit -Sn)
  case "$current_fd_limit" in
    unlimited) current_fd_limit=$required_fd_limit ;;
  esac

  # Still too low: report on stderr and abort.
  if [ "$current_fd_limit" -lt "$required_fd_limit" ]; then
    {
      echo "instruction telemetry requires at least $required_fd_limit open files for $cpu_count CPUs, but the current soft limit is $current_fd_limit"
      echo "increase the open file limit (ulimit -n) or reduce the number of monitored CPUs before retrying instruction telemetry"
    } 1>&2
    exit 1
  fi
fi
1550+
15171551
if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
15181552
count=$((duration / interval))
15191553
arg_count="-n $count"

0 commit comments

Comments
 (0)