Code

AMQP connection failure leads collectd to use 100% CPU
author Yoga Ramalingam <yramalingam1@bloomberg.net>
Fri, 31 Oct 2014 20:03:31 +0000 (16:03 -0400)
committer Marc Fournier <marc.fournier@camptocamp.com>
Wed, 11 Mar 2015 21:09:42 +0000 (22:09 +0100)
Summary:
Issue: https://ipm.bloomberg.com/jira/browse/SS-103
Solution: When the AMQP connection fails, the plugin retries the connection for every message. This fix makes it retry only after a delay. A new configuration option, "ConnectionRetryDelay", has been introduced with a default value of 60 seconds.

Test Plan:
1. Tested without the new configuration
2. Tested with the new configuration

In both cases, verified that the connection is retried only after the delay and that collectd no longer uses 100% CPU.

Reviewers: skhajamo

Reviewed By: skhajamo

CC: arcyd
Differential Revision: https://all.phab.dev.bloomberg.com/D149956

Conflicts:
src/amqp.c

src/amqp.c

index 1764129faf4a57fdc622062156d28ffb322a4bef..dd5a65267a1212622b2b66097a01e650c6fe8a99 100644 (file)
@@ -80,6 +80,9 @@ struct camqp_config_s
     char   *exchange;
     char   *routing_key;
 
+    /* Number of seconds to wait before connection is retried */
+    int     connection_retry_delay;
+
     /* publish only */
     uint8_t delivery_mode;
     _Bool   store_rates;
@@ -405,6 +408,8 @@ static int camqp_setup_queue (camqp_config_t *conf) /* {{{ */
 
 static int camqp_connect (camqp_config_t *conf) /* {{{ */
 {
+    static time_t lastConnectTime = 0;
+
     amqp_rpc_reply_t reply;
     int status;
 #ifdef HAVE_AMQP_TCP_SOCKET
@@ -416,6 +421,19 @@ static int camqp_connect (camqp_config_t *conf) /* {{{ */
     if (conf->connection != NULL)
         return (0);
 
+    time_t now = time(NULL);
+    if (now < (lastConnectTime + conf->connection_retry_delay))
+    {
+        DEBUG("amqp plugin: skipping connection retry, ConnectionRetryDelay: %d"
+                , conf->connection_retry_delay);
+        return(1);
+    }
+    else
+    {
+        DEBUG ("amqp plugin: retrying connection");
+        lastConnectTime = now;
+    }
+
     conf->connection = amqp_new_connection ();
     if (conf->connection == NULL)
     {
@@ -922,6 +940,8 @@ static int camqp_config_connection (oconfig_item_t *ci, /* {{{ */
     conf->password = NULL;
     conf->exchange = NULL;
     conf->routing_key = NULL;
+    conf->connection_retry_delay = 60;
+
     /* publish only */
     conf->delivery_mode = CAMQP_DM_VOLATILE;
     conf->store_rates = 0;
@@ -1017,6 +1037,8 @@ static int camqp_config_connection (oconfig_item_t *ci, /* {{{ */
             conf->escape_char = tmp_buff[0];
             sfree (tmp_buff);
         }
+        else if (strcasecmp ("ConnectionRetryDelay", child->key) == 0)
+            status = cf_util_get_int (child, &conf->connection_retry_delay);
         else
             WARNING ("amqp plugin: Ignoring unknown "
                     "configuration option \"%s\".", child->key);