Skip to content

Commit bb97072

Browse files
branch-4.1: [fix](Reliability) Fix Doris query service failing after the file handles on the BE node are used up #62393 (#62540)
Cherry-picked from #62393 Co-authored-by: Guangming Lu <71873108+LuGuangming@users.noreply.github.com>
1 parent a19ec32 commit bb97072

4 files changed

Lines changed: 16 additions & 0 deletions

File tree

be/src/common/config.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1739,6 +1739,7 @@ DEFINE_Validator(concurrency_stats_dump_interval_ms,
17391739
DEFINE_mBool(cloud_mow_sync_rowsets_when_load_txn_begin, "true");
17401740

17411741
DEFINE_mBool(enable_cloud_make_rs_visible_on_be, "false");
1742+
DEFINE_mInt32(file_handles_deplenish_frequency_times, "3");
17421743

17431744
// clang-format off
17441745
#ifdef BE_TEST

be/src/common/config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1787,6 +1787,7 @@ DECLARE_mInt32(concurrency_stats_dump_interval_ms);
17871787
DECLARE_mBool(cloud_mow_sync_rowsets_when_load_txn_begin);
17881788

17891789
DECLARE_mBool(enable_cloud_make_rs_visible_on_be);
1790+
DECLARE_mInt32(file_handles_deplenish_frequency_times);
17901791

17911792
#ifdef BE_TEST
17921793
// test s3

be/src/common/metrics/system_metrics.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <utility>
2727

2828
#include "common/cast_set.h"
29+
#include "common/config.h"
2930
#include "runtime/memory/jemalloc_control.h"
3031
#include "util/cgroup_util.h"
3132
#include "util/perf_counters.h"
@@ -432,6 +433,18 @@ void SystemMetrics::_update_cpu_metrics() {
432433
char buf[64];
433434
LOG(WARNING) << "open /proc/stat failed, errno=" << errno
434435
<< ", message=" << strerror_r(errno, buf, 64);
436+
if (errno == 24) {
437+
_file_handle_deplenish_counter++;
438+
} else {
439+
_file_handle_deplenish_counter = 0;
440+
}
441+
// Exit once the consecutive-failure count reaches config::file_handles_deplenish_frequency_times
442+
if (_file_handle_deplenish_counter >= config::file_handles_deplenish_frequency_times) {
443+
LOG(FATAL) << "The system file handles are insufficient, causing service exceptions"
444+
<< ", BE will exit. please check the configs 'soft nofile'"
445+
<< " and 'hard nofile' of /etc/security/limits.conf ";
446+
exit(-1);
447+
}
435448
return;
436449
}
437450

be/src/common/metrics/system_metrics.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ class SystemMetrics {
121121
IntGauge* max_disk_io_util_percent = nullptr;
122122
IntGauge* max_network_send_bytes_rate = nullptr;
123123
IntGauge* max_network_receive_bytes_rate = nullptr;
124+
int _file_handle_deplenish_counter = 0;
124125
};
125126

126127
} // namespace doris

0 commit comments

Comments
 (0)