summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: f14a69e)
raw | patch | inline | side by side (parent: f14a69e)
author | Yves Mettier <ymettier@free.fr> | |
Mon, 26 Oct 2015 16:59:45 +0000 (16:59 +0000) | ||
committer | Florian Forster <octo@collectd.org> | |
Tue, 29 Nov 2016 07:53:21 +0000 (08:53 +0100) |
src/collectd.conf.pod | patch | blob | history | |
src/write_tsdb.c | patch | blob | history |
diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod
index 4d0535276546d8778329ff7d1c31ddef9d1a50e7..fce59032db5780165f89f11fac76df6f0d25d7e3 100644 (file)
--- a/src/collectd.conf.pod
+++ b/src/collectd.conf.pod
<Plugin write_tsdb>
DNS_Cache_TTL 60
+ DNS_Random_Cache_TTL 60
<Node "example">
Host "tsd-1.my.domain"
Port "4242"
=item B<DNS_Cache_TTL> I<ttl>
+=item B<DNS_Random_Cache_TTL> I<ttl>
+
When Collectd connects to a TSDB node, it will request the DNS. This can become
a problem if the TSDB node is unavailable or badly configured, because Collectd
will request DNS in order to reconnect for every metric, which can flood your DNS.
-So you can cache the last value for C<ttl> seconds (default: 60s).
+So you can cache the last value for C<ttl> seconds (default: 600s, i.e. 10 min).
+
+You can also define a random ttl. This prevents all your Collectd servers from
+requesting the DNS at the same time when the connection fails. The default value
+is 15 times the write_tsdb interval (or the global interval if the write_tsdb
+interval is not defined).
+
+Note: if the DNS resolution has already been successful and the socket then
+closes, the plugin will try to reconnect as soon as possible with the cached
+information. DNS is queried only when the socket has been closed for a long
+time (DNS_Cache_TTL + DNS_Random_Cache_TTL).
=back
diff --git a/src/write_tsdb.c b/src/write_tsdb.c
index 615e8b75d1f79cec4ee58dcb83e62f782d6028ff..af8276bd950519867750b4e779b1e205f1ff662e 100644 (file)
--- a/src/write_tsdb.c
+++ b/src/write_tsdb.c
#define WT_SEND_BUF_SIZE 1428
#endif
+/* Default configuration */
+
+/* WRITE_TSDB_DEFAULT_DNS_TTL is the time we keep the dns cached info
+ * (seconds)
+ */
+#define WRITE_TSDB_DEFAULT_DNS_TTL 600
+
+/* WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL helps define the max random
+ * time we keep the dns cached info :
+ * min = 0
+ * max = WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL * get_plugin_interval()
+ */
+#define WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL 15
+
/*
* Private variables
*/
cdtime_t send_buf_init_time;
pthread_mutex_t send_lock;
+
+ _Bool connect_failed_log_enabled;
+ int connect_dns_failed_attempts_remaining;
+ cdtime_t next_random_ttl;
};
-static cdtime_t dnsttl = TIME_T_TO_CDTIME_T_STATIC(60);
+static cdtime_t dnsttl = TIME_T_TO_CDTIME_T_STATIC(WRITE_TSDB_DEFAULT_DNS_TTL);
+static double dnsrandomttl = .0;
+static _Bool use_dnsrandomttl = 0;
/*
* Functions
return status;
}
+static cdtime_t new_random_ttl() {
+ time_t ttl = 0;
+ if (use_dnsrandomttl) {
+ ttl = (time_t)(dnsrandomttl * ((double)random()) /
+ (((double)RAND_MAX) + 1.0));
+ }
+ return TIME_T_TO_CDTIME_T(ttl);
+}
+
static int wt_callback_init(struct wt_callback *cb) {
int status;
cdtime_t now;
return 0;
now = cdtime();
- if ((cb->sock_info_last_update + dnsttl) < now) {
- if (cb->sock_info) {
- freeaddrinfo(cb->sock_info);
- cb->sock_info = NULL;
+ if (cb->sock_info) {
+ /* When we are here, we still have the IP in cache.
+ * If we have remaining attempts without calling the DNS, we update the
+ * last_update date so we keep the info until next time.
+ * If there is no more attempts, we need to flush the cache.
+ */
+
+ if ((cb->sock_info_last_update + dnsttl + cb->next_random_ttl) < now) {
+ cb->next_random_ttl = new_random_ttl();
+ if (cb->connect_dns_failed_attempts_remaining > 0) {
+ /* Warning : this is run under send_lock mutex.
+ * This is why we do not use another mutex here.
+ * */
+ cb->sock_info_last_update = now;
+ cb->connect_dns_failed_attempts_remaining--;
+ } else {
+ freeaddrinfo(cb->sock_info);
+ cb->sock_info = NULL;
+ }
}
}
.ai_socktype = SOCK_STREAM,
};
- if ((cb->sock_info_last_update + dnsttl) >= now) {
+ if ((cb->sock_info_last_update + dnsttl + cb->next_random_ttl) >= now) {
DEBUG("write_tsdb plugin: too many getaddrinfo (%s, %s) failures", node,
service);
return (-1);
}
cb->sock_info_last_update = now;
+ cb->next_random_ttl = new_random_ttl();
status = getaddrinfo(node, service, &ai_hints, &(cb->sock_info));
if (status != 0) {
if (cb->sock_info) {
return -1;
}
+ if (0 == cb->connect_failed_log_enabled) {
+ WARNING("write_tsdb plugin: Connecting to %s:%s succeeded.", node, service);
+ cb->connect_failed_log_enabled = 1;
+ }
+ cb->connect_dns_failed_attempts_remaining = 1;
+
wt_reset_buffer(cb);
return 0;
return -1;
}
cb->sock_fd = -1;
+ cb->connect_failed_log_enabled = 1;
+ cb->next_random_ttl = new_random_ttl();
pthread_mutex_init(&cb->send_lock, NULL);
}
static int wt_config(oconfig_item_t *ci) {
+ _Bool config_random_ttl = 0;
+
for (int i = 0; i < ci->children_num; i++) {
oconfig_item_t *child = ci->children + i;
int ttl;
cf_util_get_int(child, &ttl);
dnsttl = TIME_T_TO_CDTIME_T(ttl);
+ } else if (strcasecmp("DNS_Random_Cache_TTL", child->key) == 0) {
+ int ttl;
+ cf_util_get_int(child, &ttl);
+ config_random_ttl = 1;
+ if (ttl) {
+ dnsrandomttl = (double)ttl;
+ use_dnsrandomttl = 1;
+ } else {
+ use_dnsrandomttl = 0;
+ }
} else {
ERROR("write_tsdb plugin: Invalid configuration "
"option: %s.",
}
}
+ if (!config_random_ttl) {
+ use_dnsrandomttl = 1;
+ dnsrandomttl = CDTIME_T_TO_DOUBLE(WRITE_TSDB_DEFAULT_DNS_RANDOM_TTL *
+ plugin_get_interval());
+ }
+
return 0;
}