1 #! @PERL@ -w
3 # $Id$
4 # $Source$
6 # This script will read an XML file produced by
7 # rrdtool dump foo.rrd >in.xml
8 # and look at the $maxspike highest samples per datasource. It then finds
9 # the records with the most hits and ditches the data. The resulting file
10 # can be read back into the RRD database with the command
11 # rrdtool restore out.xml foo.rrd
12 #
13 # The whole idea is to find and eradicate "spikes" caused by erroneous
14 # readings affecting entire records.
15 #
16 # This tool is not for the faint of heart, will require tweaking per case
17 # (even though that should just be picking values for cutoff and to a lesser
18 # extent, maxspike). It will cause data loss, for obvious reasons.
19 #
20 # THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT IS USEFUL, AND COMES WITH
21 # NO WARRANTY. USE AT YOUR OWN RISK!
22 #
23 # Bert Driehuis <driehuis@playbeing.org>
25 use strict;
27 my $maxspike = 25; # How many top samples to consider per datasource
28 my $cutoff = 20; # How many records to ditch
29 my $debug = 1;
30 my $file = "in.xml";
31 my $outfile = "out.xml";
33 my $nds = 0;
34 my @dsl = ();
35 my @dsi = ();
36 my @topindx = ();
37 my @botindx = ();
38 my @dsname = ();
39 my $i;
40 my $j;
42 # Count the number of data sources
43 open(IN, $file) || die;
44 while(<IN>) {
45 if (/<name>\s*(\w+)\s*/) {
46 $dsname[$nds] = $1;
47 $nds++;
48 }
49 }
50 close IN;
52 print "Found $nds datasources\n" if $debug;
54 # Set up the list of lists for the datasource data
55 for ($i = 0; $i < $nds; $i++) {
56 my @dsdata = ();
57 push @dsl, \@dsdata;
58 my @dsindex = ();
59 push @dsi, \@dsindex;
60 my @top = ();
61 push @topindx, \@top;
62 my @bot = ();
63 push @botindx, \@bot;
64 }
66 # Slurp all datasource fields into the @dsl Lol
67 my $recno = -1;
68 open(IN, $file) || die;
69 while(<IN>) {
70 next if !/<row>/;
71 $recno++;
72 my @data = split(/ /);
73 die "Malformed input" if $data[5] ne "<row><v>";
74 die "Malformed record" if $data[5 + ($nds * 2)] ne "</v></row>\n";
75 for ($i = 0; $i < $nds; $i++) {
76 my $sample = $data[($i * 2) + 6];
77 #print "$sample\n";
78 push @{$dsl[$i]}, $sample;
79 }
80 }
81 close IN;
83 # Set up a LoL with indexes, and ditch the values that represent NaN's
84 for ($i = 0; $i < $nds; $i++) {
85 @{$dsi[$i]} = grep { ${$dsl[$i]}[$_] ne "NaN" } (0..$recno);
86 print "$dsname[$i] has $#{$dsi[$i]} valid samples\n" if $debug;
87 }
89 sub sortit {
90 ${$dsl[$i]}[$a] <=> ${$dsl[$i]}[$b];
91 }
92 my %indexes;
93 for ($i = 0; $i < $nds; $i++) {
94 next if ($#{$dsi[$i]} < $maxspike);
95 @{$dsi[$i]} = sort sortit @{$dsi[$i]};
96 @{$botindx[$i]} = @{$dsi[$i]};
97 @{$topindx[$i]} = splice(@{$botindx[$i]}, -$maxspike);
98 print "$dsname[$i] top $maxspike: ".join(' ', @{$topindx[$i]})."\n";
99 for($j = 0; $j < $maxspike; $j++) {
100 $indexes{${$topindx[$i]}[$j]} = 0 if
101 !defined($indexes{${$topindx[$i]}[$j]});
102 $indexes{${$topindx[$i]}[$j]}++;
103 printf "%1.1e ", ${$dsl[$i]}[${$topindx[$i]}[$j]];
104 }
105 print "\n";
106 }
108 # Report on the hit rate of the records to be dumped, and a few for
109 # reference.
110 $j = 0;
111 my %ditch;
112 foreach $i (sort {$indexes{$b} <=> $indexes{$a}} keys %indexes) {
113 print "Record index $i: $indexes{$i} hits\n";
114 $ditch{$i} = 1 if $j < $cutoff;
115 print "----------\n" if $j + 1 == $cutoff;
116 last if $j++ > $maxspike;
117 }
119 # Okay, so we start ditching the records. You can always re-run the script
120 # if the results don't suit you after adjusting $cutoff or $maxspike.
121 $recno = -1;
122 open(IN, $file) || die;
123 open(OUT, ">$outfile") || die;
124 while(<IN>) {
125 print OUT if !/<row>/;
126 next if !/<row>/;
127 $recno++;
128 print OUT if !defined($ditch{$recno});
129 next if !defined($ditch{$recno});
130 my @data = split(/ /);
131 for ($i = 0; $i < $nds; $i++) {
132 $data[($i * 2) + 6] = "NaN";
133 }
134 print OUT join(' ', @data);
135 }
136 close IN;
137 close OUT;