From 3451d2486faf938b9e78dafcdf8c30ed375af19f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Aur=C3=A9lien=20Reynaud?= Date: Thu, 9 Sep 2010 22:43:16 +0200 Subject: [PATCH] lpar plugin update MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Hello, here is a new patch against ar/lpar with the following features: - get back to the original implementation with gauges only. A new type "vcpu" is created (it was "lpar_pcpu" in the original) - I tried to keep as much as possible of your changes, but some have been reverted (the init function is back) because it was simpler for me to port my previous code rather than adapt it to the current form. Feel free to change them again - the "consumed" metric might seem superfluous at first sight as it could be calculated in the frontend in the general case. But I thought it might come in handy when dealing with dedicated partitions, where donated and stolen values are not easy concepts. Not everyone wants to dig into the code and the APIs to find out what they mean and whether they should be added to or subtracted from other values... As a side note, one of the changes you introduced was better checking of the return status from perfstat_partition_total() using errno. This reminded me that under AIX errno is by default unsafe to use in a multithreaded environment (which collectd is). I posted a fix ("Fix errno thread-safety under AIX") on Sat, 19 Jun 2010, which, if I am not mistaken, has not been merged yet. 
Best regards, Aurélien Reynaud Signed-off-by: Florian Forster --- src/lpar.c | 298 ++++++++++++++++++++++++++++----------------------- src/types.db | 1 + 2 files changed, 166 insertions(+), 133 deletions(-) diff --git a/src/lpar.c b/src/lpar.c index be4738bb..4f7f444a 100644 --- a/src/lpar.c +++ b/src/lpar.c @@ -16,7 +16,7 @@ * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * Authors: - * Aurelien Reynaud + * Aurélien Reynaud **/ #include "collectd.h" @@ -27,6 +27,13 @@ #include #include +/* XINTFRAC was defined in libperfstat.h somewhere between AIX 5.3 and 6.1 */ +#ifndef XINTFRAC +# include +# define XINTFRAC ((double)(_system_configuration.Xint) / \ + (double)(_system_configuration.Xfrac)) +#endif + static const char *config_keys[] = { "CpuPoolStats", @@ -36,6 +43,45 @@ static int config_keys_num = STATIC_ARRAY_SIZE (config_keys); static _Bool pool_stats = 0; static _Bool report_by_serial = 0; +static _Bool donate_flag = 0; +static char serial[SYS_NMLN]; + +static u_longlong_t time_old; +static u_longlong_t user_old, + syst_old, + idle_old, + wait_old; +static u_longlong_t pool_busy_time_old, + pool_max_time_old; +static u_longlong_t idle_donated_old, + busy_donated_old, + busy_stolen_old, + idle_stolen_old; + + +static void save_last_values (perfstat_partition_total_t *lparstats) +{ + time_old = lparstats->timebase_last; + + user_old = lparstats->puser; + syst_old = lparstats->psys; + idle_old = lparstats->pidle; + wait_old = lparstats->pwait; + + if (donate_flag) + { + idle_donated_old = lparstats->idle_donated_purr; + busy_donated_old = lparstats->busy_donated_purr; + busy_stolen_old = lparstats->busy_stolen_purr; + idle_stolen_old = lparstats->idle_stolen_purr; + } + + if (pool_stats) + { + pool_busy_time_old = lparstats->pool_busy_time; + pool_max_time_old = lparstats->pool_max_time; + } +} /* void save_last_values */ static int lpar_config (const char *key, const char *value) { @@ -61,33 +107,54 @@ static int lpar_config (const char *key, const 
char *value) return (0); } /* int lpar_config */ -static void lpar_submit (const char *type_instance, counter_t value) +static int lpar_init (void) +{ + perfstat_partition_total_t lparstats; + int status; + + /* Retrieve the initial metrics. Returns the number of structures filled. */ + status = perfstat_partition_total (/* name = */ NULL, /* (must be NULL) */ + &lparstats, sizeof (perfstat_partition_total_t), + /* number = */ 1 /* (must be 1) */); + if (status != 1) + { + char errbuf[1024]; + ERROR ("lpar plugin: perfstat_partition_total failed: %s (%i)", + sstrerror (errno, errbuf, sizeof (errbuf)), + status); + return (-1); + } + + if (!lparstats.type.b.shared_enabled && lparstats.type.b.donate_enabled) + { + donate_flag = 1; + } + + if (pool_stats && !lparstats.type.b.pool_util_authority) + { + WARNING ("lpar plugin: This partition does not have pool authority. " + "Disabling CPU pool statistics collection."); + pool_stats = 0; + } + + /* Save the initial data */ + save_last_values (&lparstats); + + return (0); +} /* int lpar_init */ + +static void lpar_submit (const char *type_instance, double value) { value_t values[1]; value_list_t vl = VALUE_LIST_INIT; - /* Although it appears as a double, value is really a (scaled) counter, - expressed in CPU x seconds. At high collection rates (< 1 min), its - integer part is very small and the resulting graphs get blocky. We regain - some precision by applying a x100 factor before casting it to a counter, - turning the final value into CPU units instead of CPUs. */ - values[0].counter = value; + values[0].gauge = (gauge_t)value; vl.values = values; vl.values_len = 1; - - /* An LPAR has the same serial number as the physical system it is currently - running on. It is a convenient way of tracking LPARs as they are moved - from chassis to chassis through Live Partition Mobility (LPM). 
*/ if (report_by_serial) { - struct utsname name; - if (uname (&name) != 0) - { - ERROR ("lpar plugin: uname failed."); - return; - } - sstrncpy (vl.host, name.machine, sizeof (vl.host)); + sstrncpy (vl.host, serial, sizeof (vl.host)); sstrncpy (vl.plugin_instance, hostname_g, sizeof (vl.plugin)); } else @@ -95,116 +162,31 @@ static void lpar_submit (const char *type_instance, counter_t value) sstrncpy (vl.host, hostname_g, sizeof (vl.host)); } sstrncpy (vl.plugin, "lpar", sizeof (vl.plugin)); - sstrncpy (vl.type, "cpu", sizeof (vl.type)); + sstrncpy (vl.type, "vcpu", sizeof (vl.type)); sstrncpy (vl.type_instance, type_instance, sizeof (vl.type_instance)); plugin_dispatch_values (&vl); -} - -static int lpar_read_shared_partition (const perfstat_partition_total_t *data) -{ - static counter_t time_old; - static counter_t user_old; - static counter_t syst_old; - static counter_t wait_old; - static counter_t idle_old; - static counter_t unav_old; - - counter_t user = (counter_t) data->puser; - counter_t syst = (counter_t) data->psys; - counter_t wait = (counter_t) data->pwait; - counter_t idle = (counter_t) data->pidle; - counter_t unav = 0; - - /* - * On a shared partition, we're "entitled" to a certain amount of - * processing power, for example 250/100 of a physical CPU. Processing - * capacity not used by the partition may be assigned to a different - * partition by the hypervisor, so "idle" is hopefully a very small - * number. - * - * We calculate the amount of ticks assigned to a different partition - * from the number of ticks we're entitled to and the number of ticks - * we used up. - */ - if (time_old != 0) - { - counter_t time_diff; - counter_t entitled_ticks; - counter_t consumed_ticks; - counter_t user_diff; - counter_t syst_diff; - counter_t wait_diff; - counter_t idle_diff; - counter_t unav_diff; - - double entitled_pool_capacity; - - /* Number of ticks since we last run. 
*/ - time_diff = ((counter_t) data->timebase_last) - time_old; - - /* entitled_pool_capacity is in 1/100th of a CPU */ - entitled_pool_capacity = 0.01 * ((double) data->entitled_pool_capacity); - - /* The number of ticks this partition would have been entitled to. */ - entitled_ticks = (counter_t) ((entitled_pool_capacity * ((double) time_diff)) + .5); - - /* The number of ticks actually spent in the various states */ - user_diff = user - user_old; - syst_diff = syst - syst_old; - wait_diff = wait - wait_old; - idle_diff = idle - idle_old; - consumed_ticks = user_diff + syst_diff + wait_diff + idle_diff; - - /* "uncapped" partitions are allowed to consume more ticks than - * they are entitled to. */ - if (entitled_ticks >= consumed_ticks) - unav_diff = entitled_ticks - consumed_ticks; - else - unav_diff = 0; - unav = unav_old + unav_diff; - - lpar_submit ("user", user); - lpar_submit ("system", syst); - lpar_submit ("wait", wait); - lpar_submit ("idle", idle); - lpar_submit ("unavailable", unav); - } - - time_old = (counter_t) data->timebase_last; - user_old = user; - syst_old = syst; - wait_old = wait; - idle_old = idle; - unav_old = unav; - - return (0); -} /* int lpar_read_shared_partition */ - -static int lpar_read_dedicated_partition (const perfstat_partition_total_t *data) -{ - lpar_submit ("user", (counter_t) data->puser); - lpar_submit ("system", (counter_t) data->psys); - lpar_submit ("wait", (counter_t) data->pwait); - lpar_submit ("idle", (counter_t) data->pidle); - - if (data->type.b.donate_enabled) - { - /* FYI: PURR == Processor Utilization of Resources Register - * SPURR == Scaled PURR */ - lpar_submit ("idle_donated", (counter_t) data->idle_donated_purr); - lpar_submit ("busy_donated", (counter_t) data->busy_donated_purr); - lpar_submit ("idle_stolen", (counter_t) data->idle_stolen_purr); - lpar_submit ("busy_stolen", (counter_t) data->busy_stolen_purr); - } - - return (0); -} /* int lpar_read_dedicated_partition */ +} /* void lpar_submit */ static 
int lpar_read (void) { perfstat_partition_total_t lparstats; int status; + struct utsname name; + u_longlong_t ticks; + u_longlong_t user_ticks, syst_ticks, wait_ticks, idle_ticks; + u_longlong_t consumed_ticks; + double entitled_proc_capacity; + + /* An LPAR has the same serial number as the physical system it is currently + running on. It is a convenient way of tracking LPARs as they are moved + from chassis to chassis through Live Partition Mobility (LPM). */ + if (uname (&name) != 0) + { + ERROR ("lpar plugin: uname failed."); + return (-1); + } + sstrncpy (serial, name.machine, sizeof (serial)); /* Retrieve the current metrics. Returns the number of structures filled. */ status = perfstat_partition_total (/* name = */ NULL, /* (must be NULL) */ @@ -219,30 +201,79 @@ static int lpar_read (void) return (-1); } - if (lparstats.type.b.shared_enabled) - lpar_read_shared_partition (&lparstats); - else /* if (!shared_enabled) */ - lpar_read_dedicated_partition (&lparstats); + /* + * On a shared partition, we're "entitled" to a certain amount of + * processing power, for example 250/100 of a physical CPU. Processing + * capacity not used by the partition may be assigned to a different + * partition by the hypervisor, so "idle" is hopefully a very small + * number. + */ - if (pool_stats && !lparstats.type.b.pool_util_authority) + /* Number of ticks since we last run. */ + ticks = lparstats.timebase_last - time_old; + if (ticks == 0) { - WARNING ("lpar plugin: This partition does not have pool authority. " - "Disabling CPU pool statistics collection."); - pool_stats = 0; + /* The stats have not been updated. 
Return now to avoid dividing by zero */ + return (0); } + /* entitled_proc_capacity is in 1/100th of a CPU */ + entitled_proc_capacity = 0.01 * ((double) lparstats.entitled_proc_capacity); + lpar_submit ("entitled", entitled_proc_capacity); + + /* The number of ticks actually spent in the various states */ + user_ticks = lparstats.puser - user_old; + syst_ticks = lparstats.psys - syst_old; + wait_ticks = lparstats.pwait - wait_old; + idle_ticks = lparstats.pidle - idle_old; + consumed_ticks = user_ticks + syst_ticks + wait_ticks + idle_ticks; + + lpar_submit ("user", (double) user_ticks / (double) ticks); + lpar_submit ("sys", (double) syst_ticks / (double) ticks); + lpar_submit ("wait", (double) wait_ticks / (double) ticks); + lpar_submit ("idle", (double) idle_ticks / (double) ticks); + + if (donate_flag) + { + u_longlong_t idle_donated_ticks, busy_donated_ticks; + u_longlong_t idle_stolen_ticks, busy_stolen_ticks; + + idle_donated_ticks = lparstats.idle_donated_purr - idle_donated_old; + busy_donated_ticks = lparstats.busy_donated_purr - busy_donated_old; + idle_stolen_ticks = lparstats.idle_stolen_purr - idle_stolen_old; + busy_stolen_ticks = lparstats.busy_stolen_purr - busy_stolen_old; + + /* FYI: PURR == Processor Utilization of Resources Register + * SPURR == Scaled PURR */ + lpar_submit ("idle_donated", (double) idle_donated_ticks / (double) ticks); + lpar_submit ("busy_donated", (double) busy_donated_ticks / (double) ticks); + lpar_submit ("idle_stolen", (double) idle_stolen_ticks / (double) ticks); + lpar_submit ("busy_stolen", (double) busy_stolen_ticks / (double) ticks); + + /* Donated ticks will be accounted for as stolen ticks in other LPARs */ + consumed_ticks += idle_stolen_ticks + busy_stolen_ticks; + } + + lpar_submit ("consumed", (double) consumed_ticks / (double) ticks); + if (pool_stats) { char typinst[DATA_MAX_NAME_LEN]; + u_longlong_t pool_busy_ns, pool_max_ns; + + pool_busy_ns = lparstats.pool_busy_time - pool_busy_time_old; + pool_max_ns = 
lparstats.pool_max_time - pool_max_time_old; /* Pool stats are in CPU x ns */ - ssnprintf (typinst, sizeof(typinst), "pool-%X-busy", lparstats.pool_id); - lpar_submit (typinst, (double)lparstats.pool_busy_time / 1000000000.0); + ssnprintf (typinst, sizeof (typinst), "pool-%X-busy", lparstats.pool_id); + lpar_submit (typinst, (double) pool_busy_ns / XINTFRAC / (double) ticks); - ssnprintf (typinst, sizeof(typinst), "pool-%X-total", lparstats.pool_id); - lpar_submit (typinst, (double)lparstats.pool_max_time / 1000000000.0); + ssnprintf (typinst, sizeof (typinst), "pool-%X-total", lparstats.pool_id); + lpar_submit (typinst, (double) pool_max_ns / XINTFRAC / (double) ticks); } + save_last_values (&lparstats); + return (0); } /* int lpar_read */ @@ -250,6 +281,7 @@ void module_register (void) { plugin_register_config ("lpar", lpar_config, config_keys, config_keys_num); + plugin_register_init ("lpar", lpar_init); plugin_register_read ("lpar", lpar_read); } /* void module_register */ diff --git a/src/types.db b/src/types.db index 1b0020f6..b2c0b4fe 100644 --- a/src/types.db +++ b/src/types.db @@ -165,6 +165,7 @@ total_time_in_ms value:DERIVE:0:U total_values value:DERIVE:0:U uptime value:GAUGE:0:4294967295 users users:GAUGE:0:65535 +vcpu value:GAUGE:0:U virt_cpu_total ns:COUNTER:0:256000000000 virt_vcpu ns:COUNTER:0:1000000000 vmpage_action value:COUNTER:0:4294967295 -- 2.30.2