From b8bcabba154bec2e9f33f830cc588b4e57841699 Mon Sep 17 00:00:00 2001 From: Yves Mettier Date: Mon, 26 Oct 2015 16:59:45 +0000 Subject: [PATCH] write_tsdb : Add a random TTL before querying the DNS again --- src/collectd.conf.pod | 15 +++++++- src/write_tsdb.c | 83 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 4d053527..fce59032 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -8144,6 +8144,7 @@ Synopsis: DNS_Cache_TTL 60 + DNS_Random_Cache_TTL 60 Host "tsd-1.my.domain" Port "4242" @@ -8160,10 +8161,22 @@ Global directives are: =item B<DNS_Cache_TTL> I<ttl> +=item B<DNS_Random_Cache_TTL> I<ttl> + When Collectd connects to a TSDB node, it will request the DNS. This can become a problem is the TSDN node is unavailable or badly configured because Collected will request DNS in order to reconnect for every metric, which can flood your DNS. -So you can cache the last value for C<DNS_Cache_TTL> seconds (default: 60s). +So you can cache the last value for C<DNS_Cache_TTL> seconds (default: 600s, i.e. 10 min). + +You can also define a random TTL. This prevents all your Collectd servers from +querying the DNS at the same time when the connection fails. The default value is +15 * the write_tsdb interval (or the global interval if the write_tsdb interval is +not defined). + +Note: if the DNS resolution has already succeeded and the socket closes, +the plugin will try to reconnect as soon as possible with the cached information. 
+DNS is queried only when the socket is closed for a long time (DNS_Cache_TTL + +DNS_Random_Cache_TTL) =back diff --git a/src/write_tsdb.c b/src/write_tsdb.c index 615e8b75..af8276bd 100644 --- a/src/write_tsdb.c +++ b/src/write_tsdb.c @@ -67,6 +67,20 @@ #define WT_SEND_BUF_SIZE 1428 #endif +/* Default configuration */ + +/* WRITE_TSDB_DEFAULT_DNS_TTL is the time we keep the dns cached info + * (seconds) + */ +#define WRITE_TSDB_DEFAULT_DNS_TTL 600 + +/* WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL helps define the max random + * time added to the time we keep the dns cached info : + * min = 0 + * max = WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL * plugin_get_interval() + */ +#define WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL 15 + /* * Private variables */ @@ -88,9 +102,15 @@ struct wt_callback { cdtime_t send_buf_init_time; pthread_mutex_t send_lock; + + _Bool connect_failed_log_enabled; + int connect_dns_failed_attempts_remaining; + cdtime_t next_random_ttl; }; -static cdtime_t dnsttl = TIME_T_TO_CDTIME_T_STATIC(60); +static cdtime_t dnsttl = TIME_T_TO_CDTIME_T_STATIC(WRITE_TSDB_DEFAULT_DNS_TTL); +static double dnsrandomttl = .0; +static _Bool use_dnsrandomttl = 0; /* * Functions @@ -148,6 +168,15 @@ static int wt_flush_nolock(cdtime_t timeout, struct wt_callback *cb) { return status; } +static cdtime_t new_random_ttl(void) { + time_t ttl = 0; /* whole seconds in [0, dnsrandomttl); stays 0 when disabled */ + if (use_dnsrandomttl) { + ttl = (time_t)(dnsrandomttl * ((double)random()) / + 2147483648.0); /* random() yields 0..2^31-1, not 0..RAND_MAX */ + } + return TIME_T_TO_CDTIME_T(ttl); +} + static int wt_callback_init(struct wt_callback *cb) { int status; cdtime_t now; @@ -159,10 +188,25 @@ static int wt_callback_init(struct wt_callback *cb) { return 0; now = cdtime(); - if ((cb->sock_info_last_update + dnsttl) < now) { - if (cb->sock_info) { - freeaddrinfo(cb->sock_info); - cb->sock_info = NULL; + if (cb->sock_info) { + /* When we are here, we still have the IP in cache. + * If we have remaining attempts without calling the DNS, we update the + * last_update date so we keep the info until next time. 
+ * If there are no more attempts, we need to flush the cache. + */ + + if ((cb->sock_info_last_update + dnsttl + cb->next_random_ttl) < now) { + cb->next_random_ttl = new_random_ttl(); + if (cb->connect_dns_failed_attempts_remaining > 0) { + /* Warning: this runs with the send_lock mutex held, + * which is why no other mutex is taken here. + * */ + cb->sock_info_last_update = now; + cb->connect_dns_failed_attempts_remaining--; + } else { + freeaddrinfo(cb->sock_info); + cb->sock_info = NULL; + } } } @@ -173,13 +217,14 @@ static int wt_callback_init(struct wt_callback *cb) { .ai_socktype = SOCK_STREAM, }; - if ((cb->sock_info_last_update + dnsttl) >= now) { + if ((cb->sock_info_last_update + dnsttl + cb->next_random_ttl) >= now) { DEBUG("write_tsdb plugin: too many getaddrinfo (%s, %s) failures", node, service); return (-1); } cb->sock_info_last_update = now; + cb->next_random_ttl = new_random_ttl(); /* fresh jitter for the wait that starts now */ status = getaddrinfo(node, service, &ai_hints, &(cb->sock_info)); if (status != 0) { if (cb->sock_info) { @@ -223,6 +268,12 @@ static int wt_callback_init(struct wt_callback *cb) { return -1; } + if (0 == cb->connect_failed_log_enabled) { + WARNING("write_tsdb plugin: Connecting to %s:%s succeeded.", node, service); + cb->connect_failed_log_enabled = 1; + } + cb->connect_dns_failed_attempts_remaining = 1; /* one more TTL period on the cached address before it is dropped */ + wt_reset_buffer(cb); return 0; @@ -550,6 +601,8 @@ static int wt_config_tsd(oconfig_item_t *ci) { return -1; } cb->sock_fd = -1; + cb->connect_failed_log_enabled = 1; + cb->next_random_ttl = new_random_ttl(); pthread_mutex_init(&cb->send_lock, NULL); @@ -588,6 +641,8 @@ static int wt_config_tsd(oconfig_item_t *ci) { } static int wt_config(oconfig_item_t *ci) { + _Bool config_random_ttl = 0; + for (int i = 0; i < ci->children_num; i++) { oconfig_item_t *child = ci->children + i; @@ -597,6 +652,16 @@ static int wt_config(oconfig_item_t *ci) { int ttl; cf_util_get_int(child, &ttl); dnsttl = TIME_T_TO_CDTIME_T(ttl); + } else if (strcasecmp("DNS_Random_Cache_TTL", 
child->key) == 0) { + int ttl = 0; + if (cf_util_get_int(child, &ttl) != 0 || ttl < 0) { + ERROR("write_tsdb plugin: Invalid DNS_Random_Cache_TTL value."); + continue; /* leave the random TTL setting untouched */ + } + /* 0 disables the random part; a positive value is its upper bound (s) */ + config_random_ttl = 1; + use_dnsrandomttl = (ttl > 0); + dnsrandomttl = (double)ttl; } else { ERROR("write_tsdb plugin: Invalid configuration " "option: %s.", @@ -604,6 +669,12 @@ static int wt_config(oconfig_item_t *ci) { } } + if (!config_random_ttl) { /* no valid DNS_Random_Cache_TTL given: use the built-in default */ + use_dnsrandomttl = 1; + dnsrandomttl = CDTIME_T_TO_DOUBLE(WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL * + plugin_get_interval()); + } + return 0; } -- 2.30.2