summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: f3feef5)
author | Aurélien Reynaud <collectd@wattapower.net> | |
Wed, 1 Sep 2010 20:28:38 +0000 (22:28 +0200) | ||
committer | Florian Forster <octo@leeloo.lan.home.verplant.org> | |
Fri, 3 Sep 2010 06:49:53 +0000 (08:49 +0200) |
Hello Florian,
here is a new version of my lpar plugin. I tried to address the
shortcomings of the previous attempt:
- Minimum and maximum proc capacity are gone, being static values
- The plugin now uses the cpu type for every value, so there is no need
anymore for the lpar_cpu type
- This also means there is no need anymore to compute rates in the
plugin, so the code is IMHO much more elegant
- There is a config option "ReportBySerial", as described in my previous
email
- We now use pool_busy_time directly instead of computing it from total
and idle
The patch is against the current 4.10 branch, rather than against
ar/lpar, because it is more of a complete rewrite than just fixes. I
could provide a patch against ar/lpar however if you prefer so.
Regards,
Aurélien Reynaud
Signed-off-by: Florian Forster <octo@leeloo.lan.home.verplant.org>
here is a new version of my lpar plugin. I tried to address the
shortcomings of the previous attempt:
- Minimum and maximum proc capacity are gone, being static values
- The plugin now uses the cpu type for every value, so there is no need
anymore for the lpar_cpu type
- This also means there is no need anymore to compute rates in the
plugin, so the code is IMHO much more elegant
- There is a config option "ReportBySerial", as described in my previous
email
- We now use pool_busy_time directly instead of computing it from total
and idle
The patch is against the current 4.10 branch, rather than against
ar/lpar, because it is more of a complete rewrite than just fixes. I
could provide a patch against ar/lpar however if you prefer so.
Regards,
Aurélien Reynaud
Signed-off-by: Florian Forster <octo@leeloo.lan.home.verplant.org>
src/lpar.c | patch | blob | history | |
src/types.db | patch | blob | history |
diff --git a/src/lpar.c b/src/lpar.c
index 7998d7963202a4dc7e802d72dbe9510e108a9765..2267e03cadfbc5f56f650a45b58920c8ae9d1081 100644 (file)
--- a/src/lpar.c
+++ b/src/lpar.c
#include "collectd.h"
#include "common.h"
#include "plugin.h"
-
#include <sys/protosw.h>
#include <libperfstat.h>
-#include <sys/systemcfg.h>
#include <sys/utsname.h>
#ifndef XINTFRAC
+# include <sys/systemcfg.h>
# define XINTFRAC ((double)(_system_configuration.Xint) / \
(double)(_system_configuration.Xfrac))
#endif
+#define HTIC2SEC(x) ((double)x * XINTFRAC / 1000000000.0)
+
+/* Max length of the type instance string */
+#define TYPE_INST_LEN (sizeof("pool--total") + 2*sizeof(int) + 1)
static const char *config_keys[] =
{
- "CpuPoolStats"
+ "CpuPoolStats",
+ "ReportBySerial"
};
static int config_keys_num = STATIC_ARRAY_SIZE (config_keys);
-static int pool_stats = 0;
+static int pool_stats = 0,
+ report_by_serial = 0;
-/* As an LPAR can be moved transparently across physical systems
- * through Live Partition Mobility (LPM), and the resources we are
- * monitoring are tied to the underlying hardware, we need to keep
- * track on which physical server we are currently on. This is done
- * through the plugin instance which holds the chassis' serial.
- */
static u_longlong_t last_time_base;
-static u_longlong_t last_pcpu_user,
- last_pcpu_sys,
- last_pcpu_idle,
- last_pcpu_wait;
-static u_longlong_t last_pool_idle_time = 0;
-static u_longlong_t last_idle_donated_purr = 0,
- last_busy_donated_purr = 0,
- last_busy_stolen_purr = 0,
- last_idle_stolen_purr = 0;
+static u_longlong_t ent_counter;
static int donate_flag = 0;
-/* Save the current values for the next iteration */
-static void save_last_values (perfstat_partition_total_t *lparstats)
-{
- last_time_base = lparstats->timebase_last;
-
- last_pcpu_user = lparstats->puser;
- last_pcpu_sys = lparstats->psys;
- last_pcpu_idle = lparstats->pidle;
- last_pcpu_wait = lparstats->pwait;
-
- if (donate_flag)
- {
- last_idle_donated_purr = lparstats->idle_donated_purr;
- last_busy_donated_purr = lparstats->busy_donated_purr;
- last_busy_stolen_purr = lparstats->busy_stolen_purr;
- last_idle_stolen_purr = lparstats->idle_stolen_purr;
- }
-
- last_pool_idle_time = lparstats->pool_idle_time;
-}
-
static int lpar_config (const char *key, const char *value)
{
if (strcasecmp ("CpuPoolStats", key) == 0)
{
if (IS_TRUE (value))
pool_stats = 1;
- else
- pool_stats = 0;
+ }
+ else if (strcasecmp ("ReportBySerial", key) == 0)
+ {
+ if (IS_TRUE (value))
+ report_by_serial = 1;
}
else
{
{
perfstat_partition_total_t lparstats;
- /* retrieve the initial metrics */
+ /* Retrieve the initial metrics */
if (!perfstat_partition_total (NULL, &lparstats,
sizeof (perfstat_partition_total_t), 1))
{
donate_flag = 1;
}
- /* save the initial data */
- save_last_values (&lparstats);
+ if (pool_stats && !lparstats.type.b.pool_util_authority)
+ {
+ WARNING ("lpar plugin: this system does not have pool authority. "
+ "Disabling CPU pool statistics collection.");
+ pool_stats = 0;
+ }
+
+ /* Initialize the fake counter for entitled capacity */
+ last_time_base = lparstats.timebase_last;
+ ent_counter = 0;
return (0);
} /* int lpar_init */
-static void lpar_submit (const char *plugin_inst, const char *type_instance, double value)
+static void lpar_submit (const char *type_instance, double value)
{
value_t values[1];
value_list_t vl = VALUE_LIST_INIT;
- values[0].gauge = (gauge_t)value;
+ /* Although it appears as a double, value is really a (scaled) counter,
+ expressed in CPU x seconds. At high collection rates (< 1 min), its
+ integer part is very small and the resulting graphs get blocky. We regain
+ some precision by applying a x100 factor before casting it to a counter,
+ turning the final value into CPU units instead of CPUs. */
+ values[0].counter = (counter_t)(value * 100.0 + 0.5);
vl.values = values;
vl.values_len = 1;
- sstrncpy (vl.host, hostname_g, sizeof (vl.host));
+
+ /* An LPAR has the same serial number as the physical system it is currently
+ running on. It is a convenient way of tracking LPARs as they are moved
+ from chassis to chassis through Live Partition Mobility (LPM). */
+ if (report_by_serial)
+ {
+ struct utsname name;
+ if (uname (&name) != 0)
+ {
+ ERROR ("lpar plugin: uname failed.");
+ return;
+ }
+ sstrncpy (vl.host, name.machine, sizeof (vl.host));
+ sstrncpy (vl.plugin_instance, hostname_g, sizeof (vl.plugin));
+ }
+ else
+ {
+ sstrncpy (vl.host, hostname_g, sizeof (vl.host));
+ }
sstrncpy (vl.plugin, "lpar", sizeof (vl.plugin));
- sstrncpy (vl.plugin_instance, plugin_inst, sizeof (vl.plugin));
- sstrncpy (vl.type, "lpar_pcpu", sizeof (vl.type));
+ sstrncpy (vl.type, "cpu", sizeof (vl.type));
sstrncpy (vl.type_instance, type_instance, sizeof (vl.type_instance));
plugin_dispatch_values (&vl);
}
-static int submit_counter (const char *plugin_instance, /* {{{ */
- const char *type, const char *type_instance, counter_t value)
-{
- value_t values[1];
- value_list_t vl = VALUE_LIST_INIT;
-
- values[0].counter = value;
-
- vl.values = values;
- vl.values_len = 1;
- sstrncpy (vl.host, hostname_g, sizeof (vl.host));
- sstrncpy (vl.plugin, "lpar", sizeof (vl.plugin));
- sstrncpy (vl.plugin_instance, plugin_inst, sizeof (vl.plugin));
- sstrncpy (vl.type, type, sizeof (vl.type));
- sstrncpy (vl.type_instance, type_instance, sizeof (vl.type_instance));
-
- return (plugin_dispatch_values (&vl));
-} /* }}} int submit_counter */
-
static int lpar_read (void)
{
u_longlong_t delta_time_base;
perfstat_partition_total_t lparstats;
- struct utsname name;
- char plugin_inst[DATA_MAX_NAME_LEN];
- _Bool have_donate = 0;
- /* retrieve the current physical server's id and build the plugin
- instance's name */
- if (uname (&name) != 0)
- {
- ERROR ("lpar plugin: uname failed.");
- return (-1);
- }
- sstrncpy (plugin_inst, name.machine, sizeof (plugin_inst));
-
- /* retrieve the current metrics */
- if (!perfstat_partition_total (/* name = */ NULL, /* "must be set to NULL" */
- &lparstats, sizeof (lparstats),
- /* desired_number = */ 1 /* "must be set to 1" */))
+ /* Retrieve the current metrics */
+ if (!perfstat_partition_total (NULL, &lparstats,
+ sizeof (perfstat_partition_total_t), 1))
{
ERROR ("lpar plugin: perfstat_partition_total failed.");
return (-1);
}
- if (!lparstats.type.b.shared_enabled
- && lparstats.type.b.donate_enabled)
- have_donate = 1;
-
delta_time_base = lparstats.timebase_last - last_time_base;
- if (delta_time_base == 0)
- {
- /* The system stats have not been updated since last time */
- return (0);
- }
-
- submit_counter (plugin_inst, "cpu", "user", (counter_t) lparstats.puser);
- submit_counter (plugin_inst, "cpu", "system", (counter_t) lparstats.psys);
- submit_counter (plugin_inst, "cpu", "idle", (counter_t) lparstats.pidle);
- submit_counter (plugin_inst, "cpu", "wait", (counter_t) lparstats.pwait);
+ last_time_base = lparstats.timebase_last;
+
+ lpar_submit ("user", HTIC2SEC(lparstats.puser));
+ lpar_submit ("sys", HTIC2SEC(lparstats.psys));
+ lpar_submit ("wait", HTIC2SEC(lparstats.pwait));
+ lpar_submit ("idle", HTIC2SEC(lparstats.pidle));
+ /* Entitled capacity is reported as an absolute value instead of a counter,
+ so we fake one. It's also in CPU units, hence the division by 100 before
+ submission. */
+ ent_counter += lparstats.entitled_proc_capacity * delta_time_base;
+ lpar_submit ("ent", HTIC2SEC(ent_counter) / 100.0);
- /* FIXME: Use an appropriate GAUGE type here. */
- lpar_submit (plugin_inst, "ent", (double)lparstats.entitled_proc_capacity / 100.0);
- lpar_submit (plugin_inst, "max", (double)lparstats.max_proc_capacity / 100.0);
- lpar_submit (plugin_inst, "min", (double)lparstats.min_proc_capacity / 100.0);
-
- if (have_donate)
+ if (donate_flag)
{
- dlt_idle_donated = lparstats.idle_donated_purr - last_idle_donated_purr;
- dlt_busy_donated = lparstats.busy_donated_purr - last_busy_donated_purr;
- dlt_idle_stolen = lparstats.idle_stolen_purr - last_idle_stolen_purr;
- dlt_busy_stolen = lparstats.busy_stolen_purr - last_busy_stolen_purr;
-
- submit_counter (plugin_inst, "cpu", "donated-idle", (counter_t) lparstats.idle_donated_purr);
- submit_counter (plugin_inst, "cpu", "donated-busy", (counter_t) lparstats.busy_donated_purr);
- submit_counter (plugin_inst, "cpu", "stolen-idle", (counter_t) lparstats.idle_stolen_purr);
- submit_counter (plugin_inst, "cpu", "stolen-busy", (counter_t) lparstats.busy_stolen_purr);
+ lpar_submit ("idle_donated", HTIC2SEC(lparstats.idle_donated_purr));
+ lpar_submit ("busy_donated", HTIC2SEC(lparstats.busy_donated_purr));
+ lpar_submit ("idle_stolen", HTIC2SEC(lparstats.idle_stolen_purr));
+ lpar_submit ("busy_stolen", HTIC2SEC(lparstats.busy_stolen_purr));
}
if (pool_stats)
{
- if (!lparstats.type.b.pool_util_authority)
- {
- WARNING ("lpar plugin: Pool utilization data is not available.");
- }
- else
- {
- u_longlong_t dlt_pit;
- double total, idle;
- char type[DATA_MAX_NAME_LEN];
+ char typinst[TYPE_INST_LEN];
- /* FIXME: The pool id should probably be used as plugin instance. */
- dlt_pit = lparstats.pool_idle_time - last_pool_idle_time;
- total = (double)lparstats.phys_cpus_pool;
- idle = (double)dlt_pit / XINTFRAC / (double)delta_time_base;
- ssnprintf (type, sizeof(type), "pool-%X-total", lparstats.pool_id);
- lpar_submit (plugin_inst, type, total);
- ssnprintf (type, sizeof(type), "pool-%X-used", lparstats.pool_id);
- lpar_submit (plugin_inst, type, total - idle);
- }
- }
+ /* Pool stats are in CPU x ns */
+ ssnprintf (typinst, sizeof(typinst), "pool-%X-busy", lparstats.pool_id);
+ lpar_submit (typinst, (double)lparstats.pool_busy_time / 1000000000.0);
- save_last_values (&lparstats);
+ ssnprintf (typinst, sizeof(typinst), "pool-%X-total", lparstats.pool_id);
+ lpar_submit (typinst, (double)lparstats.pool_max_time / 1000000000.0);
+ }
return (0);
} /* int lpar_read */
plugin_register_read ("lpar", lpar_read);
} /* void module_register */
-/* vim: set sw=8 sts=8 ts=8 noet : */
+/* vim: set sw=2 sts=2 ts=8 : */
diff --git a/src/types.db b/src/types.db
index 962109f48c39d0b3b59ddaddb6ca2a7c062911a7..1b0020f6e03981132e5623d66bba787fc2bfba8a 100644 (file)
--- a/src/types.db
+++ b/src/types.db
latency value:GAUGE:0:65535
links value:GAUGE:0:U
load shortterm:GAUGE:0:100, midterm:GAUGE:0:100, longterm:GAUGE:0:100
-lpar_pcpu value:GAUGE:0:U
memcached_command value:COUNTER:0:U
memcached_connections value:GAUGE:0:U
memcached_items value:GAUGE:0:U