Code

processes plugin: Add collection of IO-metrics.
author: Andrés J. Díaz <ajdiaz@connectical.com>
Wed, 21 Oct 2009 09:12:40 +0000 (11:12 +0200)
committer: Florian Forster <octo@noris.net>
Wed, 21 Oct 2009 09:15:01 +0000 (11:15 +0200)
I attach a patch for collectd-4.8.0 adding a new feature which reads IO
data for a process using /proc/<pid>/io (only works on Linux > 2.6.20).
It's very useful for monitoring disk throughput and for finding which
process is responsible for high disk IO. I have been using this patch for
days in some database environments with good results, but I think it is
not stable enough yet (more testing is required). The patch reads from
/proc/<pid>/io (where available) the data for syscr and syscw (read and
write operations) and rchar and wchar (read and write bytes).

src/collectd.conf.pod
src/processes.c
src/types.db

index 3ea3d4dd588ab47ed1cd63e759f76ffa19dc0390..6bae33936e3eac1b0cbe6980e019bf1e55db1d45 100644 (file)
@@ -3235,7 +3235,7 @@ C<I<prefix>/var/run/collectd-powerdns>.
 Select more detailed statistics of processes matching this name. The statistics
 collected for these selected processes are size of the resident segment size
 (RSS), user- and system-time used, number of processes and number of threads,
-and minor and major pagefaults.
+io data (where available) and minor and major pagefaults.
 
 =item B<ProcessMatch> I<name> I<regex>
 
index 5f67abaa12dd879072ccc4e794e575fc241146a9..0a8f8c4d294229b9da05a4ac1aa1fe2fc46339f8 100644 (file)
@@ -136,6 +136,12 @@ typedef struct procstat_entry_s
        unsigned long cpu_user_counter;
        unsigned long cpu_system_counter;
 
+       /* io data */
+       long io_rchar;
+       long io_wchar;
+       long io_syscr;
+       long io_syscw;
+
        struct procstat_entry_s *next;
 } procstat_entry_t;
 
@@ -159,6 +165,12 @@ typedef struct procstat
        unsigned long cpu_user_counter;
        unsigned long cpu_system_counter;
 
+       /* io data */
+       long io_rchar;
+       long io_wchar;
+       long io_syscr;
+       long io_syscw;
+
        struct procstat   *next;
        struct procstat_entry_s *instances;
 } procstat_t;
@@ -328,6 +340,10 @@ static void ps_list_add (const char *name, const char *cmdline, procstat_entry_t
                pse->vmem_size  = entry->vmem_size;
                pse->vmem_rss   = entry->vmem_rss;
                pse->stack_size = entry->stack_size;
+               pse->io_rchar   = entry->io_rchar;
+               pse->io_wchar   = entry->io_wchar;
+               pse->io_syscr   = entry->io_syscr;
+               pse->io_syscw   = entry->io_syscw;
 
                ps->num_proc   += pse->num_proc;
                ps->num_lwp    += pse->num_lwp;
@@ -335,6 +351,11 @@ static void ps_list_add (const char *name, const char *cmdline, procstat_entry_t
                ps->vmem_rss   += pse->vmem_rss;
                ps->stack_size += pse->stack_size;
 
+               ps->io_rchar   += ((pse->io_rchar == -1)?0:pse->io_rchar);
+               ps->io_wchar   += ((pse->io_wchar == -1)?0:pse->io_wchar);
+               ps->io_syscr   += ((pse->io_syscr == -1)?0:pse->io_syscr);
+               ps->io_syscw   += ((pse->io_syscw == -1)?0:pse->io_syscw);
+
                if ((entry->vmem_minflt_counter == 0)
                                && (entry->vmem_majflt_counter == 0))
                {
@@ -425,6 +446,10 @@ static void ps_list_reset (void)
                ps->vmem_size   = 0;
                ps->vmem_rss    = 0;
                ps->stack_size  = 0;
+               ps->io_rchar = -1;
+               ps->io_wchar = -1;
+               ps->io_syscr = -1;
+               ps->io_syscw = -1;
 
                pse_prev = NULL;
                pse = ps->instances;
@@ -607,12 +632,33 @@ static void ps_submit_proc_list (procstat_t *ps)
        vl.values_len = 2;
        plugin_dispatch_values (&vl);
 
+       if ( (ps->io_rchar != -1) && (ps->io_wchar != -1) )
+       {
+               sstrncpy (vl.type, "ps_diskbytes", sizeof (vl.type));
+               vl.values[0].counter = ps->io_rchar;
+               vl.values[1].counter = ps->io_wchar;
+               vl.values_len = 2;
+               plugin_dispatch_values (&vl);
+       }
+
+       if ( (ps->io_syscr != -1) && (ps->io_syscw != -1) )
+       {
+               sstrncpy (vl.type, "ps_diskops", sizeof (vl.type));
+               vl.values[0].counter = ps->io_syscr;
+               vl.values[1].counter = ps->io_syscw;
+               vl.values_len = 2;
+               plugin_dispatch_values (&vl);
+       }
+
        DEBUG ("name = %s; num_proc = %lu; num_lwp = %lu; vmem_rss = %lu; "
                        "vmem_minflt_counter = %lu; vmem_majflt_counter = %lu; "
-                       "cpu_user_counter = %lu; cpu_system_counter = %lu;",
+                       "cpu_user_counter = %lu; cpu_system_counter = %lu; "
+                       "io_rchar = %ld; io_wchar = %ld; "
+                       "io_syscr = %ld; io_syscw = %ld;",
                        ps->name, ps->num_proc, ps->num_lwp, ps->vmem_rss,
                        ps->vmem_minflt_counter, ps->vmem_majflt_counter,
-                       ps->cpu_user_counter, ps->cpu_system_counter);
+                       ps->cpu_user_counter, ps->cpu_system_counter,
+                       ps->io_rchar, ps->io_wchar, ps->io_syscr, ps->io_syscw);
 } /* void ps_submit_proc_list */
 
 /* ------- additional functions for KERNEL_LINUX/HAVE_THREAD_INFO ------- */
@@ -644,6 +690,52 @@ static int ps_read_tasks (int pid)
        return ((count >= 1) ? count : 1);
 } /* int *ps_read_tasks */
 
+/*
+ * Read the I/O counters of process `pid' from "/proc/<pid>/io" into `ps'.
+ *
+ * The lines starting with "rchar:", "wchar:", "syscr:" and "syscw:" are
+ * parsed into ps->io_rchar, ps->io_wchar, ps->io_syscr and ps->io_syscw.
+ * Any of these counters that is missing from the file or cannot be parsed
+ * is set to -1, the "not available" marker used by the callers.
+ *
+ * Returns `ps' if the file could be opened, NULL otherwise (including
+ * when `ps' itself is NULL).
+ */
+procstat_t *ps_read_io (int pid, procstat_t *ps)
+{
+       FILE *fh;
+       char buffer[1024];
+       char filename[64];
+
+       char *fields[8];
+       int numfields;
+
+       if (ps == NULL)
+               return (NULL);
+
+       /* Start from "not available" so that a counter which is absent or
+        * malformed never keeps a stale value from a previous use of `ps'. */
+       ps->io_rchar = -1;
+       ps->io_wchar = -1;
+       ps->io_syscr = -1;
+       ps->io_syscw = -1;
+
+       ssnprintf (filename, sizeof (filename), "/proc/%i/io", pid);
+       if ((fh = fopen (filename, "r")) == NULL)
+               return (NULL);
+
+       while (fgets (buffer, sizeof (buffer), fh) != NULL)
+       {
+               long *val = NULL;
+               char *endptr = NULL;
+               long tmp;
+
+               if (strncasecmp (buffer, "rchar:", 6) == 0)
+                       val = &(ps->io_rchar);
+               else if (strncasecmp (buffer, "wchar:", 6) == 0)
+                       val = &(ps->io_wchar);
+               else if (strncasecmp (buffer, "syscr:", 6) == 0)
+                       val = &(ps->io_syscr);
+               else if (strncasecmp (buffer, "syscw:", 6) == 0)
+                       val = &(ps->io_syscw);
+               else
+                       continue;
+
+               /* Presumably splits "key: value" on whitespace, leaving the
+                * number in fields[1] -- verify against strsplit(). */
+               numfields = strsplit (buffer, fields,
+                               sizeof (fields) / sizeof (fields[0]));
+
+               if (numfields < 2)
+                       continue;
+
+               /* strtol() instead of atol(): detects "no digits" and
+                * out-of-range input instead of invoking undefined behavior.
+                * A value that does not fit into a `long' is reported as
+                * unavailable (-1) rather than stored as a bogus number. */
+               errno = 0;
+               tmp = strtol (fields[1], &endptr, 10);
+               if ((endptr == fields[1]) || (errno != 0) || (tmp < 0))
+                       continue;
+
+               *val = tmp;
+       }
+
+       if (fclose (fh))
+       {
+               char errbuf[1024];
+               WARNING ("processes: fclose: %s",
+                               sstrerror (errno, errbuf, sizeof (errbuf)));
+       }
+
+       return (ps);
+} /* procstat_t *ps_read_io */
+
 int ps_read_process (int pid, procstat_t *ps, char *state)
 {
        char  filename[64];
@@ -746,6 +838,17 @@ int ps_read_process (int pid, procstat_t *ps, char *state)
        ps->vmem_rss = (unsigned long) vmem_rss;
        ps->stack_size = (unsigned long) stack_size;
 
+       if ( (ps_read_io (pid, ps)) == NULL)
+       {
+               /* no io data */
+               ps->io_rchar = -1;
+               ps->io_wchar = -1;
+               ps->io_syscr = -1;
+               ps->io_syscw = -1;
+
+               DEBUG("ps_read_process: not get io data for pid %i",pid);
+       }
+
        /* success */
        return (0);
 } /* int ps_read_process (...) */
@@ -1274,6 +1377,11 @@ static int ps_read (void)
                pse.cpu_system = 0;
                pse.cpu_system_counter = ps.cpu_system_counter;
 
+               pse.io_rchar = ps.io_rchar;
+               pse.io_wchar = ps.io_wchar;
+               pse.io_syscr = ps.io_syscr;
+               pse.io_syscw = ps.io_syscw;
+
                switch (state)
                {
                        case 'R': running++;  break;
index 0225e0f6b8e6586bb44db3fb8d83baefbeb9eab7..aed5a22e8f34aba20c4ff9e7a1531ad7682eecaa 100644 (file)
@@ -123,6 +123,8 @@ protocol_counter    value:COUNTER:0:U
 ps_count               processes:GAUGE:0:1000000, threads:GAUGE:0:1000000
 ps_cputime             user:COUNTER:0:16000000, syst:COUNTER:0:16000000
 ps_pagefaults          minflt:COUNTER:0:9223372036854775807, majflt:COUNTER:0:9223372036854775807
+ps_diskbytes   read:COUNTER:0:16000000, write:COUNTER:0:16000000
+ps_diskops             read:COUNTER:0:16000000, write:COUNTER:0:16000000
 ps_rss                 value:GAUGE:0:9223372036854775807
 ps_stacksize           value:GAUGE:0:9223372036854775807
 ps_state               value:GAUGE:0:65535