contrib/killspike/killspike.pl

   1 #! /usr/sepp/bin/perl -w
   2
   3 # $Id$
   4 # $Source$
   5
   6 # This script will read an XML file produced by
   7 #       rrdtool dump foo.rrd >in.xml
   8 # and look at the $maxspike highest samples per datasource. It then finds
   9 # the records with the most hits and ditches the data. The resulting file
  10 # can be read back into the RRD database with the command
  11 #       rrdtool restore out.xml foo.rrd
  12 #
  13 # The whole idea is to find and eradicate "spikes" caused by erroneous
  14 # readings affecting entire records.
  15 #
  16 # This tool is not for the faint of heart, will require tweaking per case
  17 # (even though that should just be picking values for cutoff and to a lesser
  18 # extent, maxspike). It will cause data loss, for obvious reasons.
  19 #
  20 # THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT IS USEFUL, AND COMES WITH
  21 # NO WARRANTY. USE AT YOUR OWN RISK!
  22 #
  23 #                       Bert Driehuis <driehuis@playbeing.org>
  24
  25 use strict;
  26
  27 my $maxspike = 25;      # How many top samples to consider per datasource
  28 my $cutoff = 20;        # How many records to ditch
  29 my $debug = 1;
  30 my $file = "in.xml";
  31 my $outfile = "out.xml";
  32
  33 my $nds = 0;
  34 my @dsl = ();
  35 my @dsi = ();
  36 my @topindx = ();
  37 my @botindx = ();
  38 my @dsname = ();
  39 my $i;
  40 my $j;
  41
  42 # Count the number of data sources
  43 open(IN, $file) || die;
  44 while(<IN>) {
  45         if (/<name>\s*(\w+)\s*/) {
  46                 $dsname[$nds] = $1;
  47                 $nds++;
  48         }
  49 }
  50 close IN;
  51
  52 print "Found $nds datasources\n" if $debug;
  53
  54 # Set up the list of lists for the datasource data
  55 for ($i = 0; $i < $nds; $i++) {
  56         my @dsdata = ();
  57         push @dsl, \@dsdata;
  58         my @dsindex = ();
  59         push @dsi, \@dsindex;
  60         my @top = ();
  61         push @topindx, \@top;
  62         my @bot = ();
  63         push @botindx, \@bot;
  64 }
  65
  66 # Slurp all datasource fields into the @dsl Lol
  67 my $recno = -1;
  68 open(IN, $file) || die;
  69 while(<IN>) {
  70         next if !/<row>/;
  71         $recno++;
  72         my @data = split(/ /);
  73         die "Malformed input" if $data[5] ne "<row><v>";
  74         die "Malformed record" if $data[5 + ($nds * 2)] ne "</v></row>\n";
  75         for ($i = 0; $i < $nds; $i++) {
  76                 my $sample = $data[($i * 2) + 6];
  77                 #print "$sample\n";
  78                 push @{$dsl[$i]}, $sample;
  79         }
  80 }
  81 close IN;
  82
  83 # Set up a LoL with indexes, and ditch the values that represent NaN's
  84 for ($i = 0; $i < $nds; $i++) {
  85         @{$dsi[$i]} = grep { ${$dsl[$i]}[$_] ne "NaN" } (0..$recno);
  86         print "$dsname[$i] has $#{$dsi[$i]} valid samples\n" if $debug;
  87 }
  88
  89 sub sortit {
  90         ${$dsl[$i]}[$a] <=> ${$dsl[$i]}[$b];
  91 }
  92 my %indexes;
  93 for ($i = 0; $i < $nds; $i++) {
  94         next if ($#{$dsi[$i]} < $maxspike);
  95         @{$dsi[$i]} = sort sortit @{$dsi[$i]};
  96         @{$botindx[$i]} = @{$dsi[$i]};
  97         @{$topindx[$i]} = splice(@{$botindx[$i]}, -$maxspike);
  98         print "$dsname[$i] top $maxspike: ".join(' ', @{$topindx[$i]})."\n";
  99         for($j = 0; $j < $maxspike; $j++) {
 100                 $indexes{${$topindx[$i]}[$j]} = 0 if
 101                                 !defined($indexes{${$topindx[$i]}[$j]});
 102                 $indexes{${$topindx[$i]}[$j]}++;
 103                 printf "%1.1e ", ${$dsl[$i]}[${$topindx[$i]}[$j]];
 104         }
 105         print "\n";
 106 }
 107
 108 # Report on the hit rate of the records to be dumped, and a few for
 109 # reference.
 110 $j = 0;
 111 my %ditch;
 112 foreach $i (sort {$indexes{$b} <=> $indexes{$a}} keys %indexes) {
 113         print "Record index $i: $indexes{$i} hits\n";
 114         $ditch{$i} = 1 if $j < $cutoff;
 115         print "----------\n" if $j + 1 == $cutoff;
 116         last if $j++ > $maxspike;
 117 }
 118
 119 # Okay, so we start ditching the records. You can always re-run the script
 120 # if the results don't suit you after adjusting $cutoff or $maxspike.
 121 $recno = -1;
 122 open(IN, $file) || die;
 123 open(OUT, ">$outfile") || die;
 124 while(<IN>) {
 125         print OUT if !/<row>/;
 126         next if !/<row>/;
 127         $recno++;
 128         print OUT if !defined($ditch{$recno});
 129         next if !defined($ditch{$recno});
 130         my @data = split(/ /);
 131         for ($i = 0; $i < $nds; $i++) {
 132                 $data[($i * 2) + 6] = "NaN";
 133         }
 134         print OUT join(' ', @data);
 135 }
 136 close IN;
 137 close OUT;