gsimm.c

   1 #include <string.h>
   2 #include "rabinpoly.h"
   3 #include "gsimm.h"
   4
   5 /* Has to be power of two. Since the Rabin hash only has 63
   6    usable bits, the number of hashes is limited to 32.
   7    Lower powers of two could be used for speeding up processing
   8    of very large files.  */
   9 #define NUM_HASHES_PER_CHAR 32
  10
  11 /* Size of cache used to eliminate duplicate substrings.
  12    Make small enough to comfortably fit in L1 cache.  */
  13 #define DUP_CACHE_SIZE 256
  14
  15 /* For the final counting, do not count each bit individually, but
  16    group them. Must be power of two, at most NUM_HASHES_PER_CHAR.
  17    However, larger sizes result in higher cache usage. Use 8 bits
  18    per group for efficient processing of large files on fast machines
  19    with decent caches, or 4 bits for faster processing of small files
  20    and for machines with small caches.  */
  21 #define GROUP_BITS 4
  22 #define GROUP_COUNTERS (1<<GROUP_BITS)
  23
  24 static void freq_to_md(u_char *md, int *freq)
  25 { int j, k;
  26
  27   for (j = 0; j < MD_LENGTH; j++)
  28   { u_char ch = 0;
  29
  30     for (k = 0; k < 8; k++) ch = 2*ch + (freq[8*j+k] > 0);
  31     md[j] = ch;
  32   }
  33   bzero (freq, sizeof(freq[0]) * MD_BITS);
  34 }
  35
  36 static int dist (u_char *l, u_char *r)
  37 { int j, k;
  38   int d = 0;
  39
  40   for (j = 0; j < MD_LENGTH; j++)
  41   { u_char ch = l[j] ^ r[j];
  42
  43     for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0);
  44   }
  45
  46   return d;
  47 }
  48
  49 double gb_simm_score(u_char *l, u_char *r)
  50 {
  51         int d = dist(l, r);
  52         double sim = (double) (d) / (MD_LENGTH * 4 - 1);
  53         if (1.0 < sim)
  54                 return 0;
  55         else
  56                 return 1.0 - sim;
  57 }
  58
  59 void gb_simm_process(u_char *data, unsigned len, u_char *md)
  60 { size_t j = 0;
  61   u_int32_t ofs;
  62   u_int32_t dup_cache[DUP_CACHE_SIZE];
  63   u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
  64   int freq[MD_BITS];
  65
  66   if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
  67           memset(md, 0, MD_LENGTH);
  68           return;
  69   }
  70
  71   bzero (freq, sizeof(freq[0]) * MD_BITS);
  72   bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
  73   bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
  74
  75   /* Ignore incomplete substrings */
  76   while (j < len && j < RABIN_WINDOW_SIZE) rabin_slide8 (data[j++]);
  77
  78   while (j < len)
  79   { u_int64_t hash;
  80     u_int32_t ofs, sum;
  81     u_char idx;
  82     int k;
  83
  84     hash = rabin_slide8 (data[j++]);
  85
  86     /* In order to update a much larger frequency table
  87        with only 32 bits of checksum, randomly select a
  88        part of the table to update. The selection should
  89        only depend on the content of the represented data,
  90        and be independent of the bits used for the update.
  91
  92        Instead of updating 32 individual counters, process
  93        the checksum in MD_BITS / GROUP_BITS groups of
  94        GROUP_BITS bits, and count the frequency of each bit pattern.
  95     */
  96
  97     idx = (hash >> 32);
  98     sum = (u_int32_t) hash;
  99     ofs = idx % (MD_BITS / NUM_HASHES_PER_CHAR) * NUM_HASHES_PER_CHAR;
 100     idx %= DUP_CACHE_SIZE;
 101     if (dup_cache[idx] != sum)
 102     { dup_cache[idx] = sum;
 103       for (k = 0; k < NUM_HASHES_PER_CHAR / GROUP_BITS; k++)
 104       { count[ofs * GROUP_COUNTERS / GROUP_BITS + (sum % GROUP_COUNTERS)]++;
 105         ofs += GROUP_BITS;
 106         sum >>= GROUP_BITS;
 107   } } }
 108
 109   /* Distribute the occurrences of each bit group over the frequency table. */
 110   for (ofs = 0; ofs < MD_BITS; ofs += GROUP_BITS)
 111   { int j;
 112     for (j = 0; j < GROUP_COUNTERS; j++)
 113     { int k;
 114       for (k = 0; k < GROUP_BITS; k++)
 115       { freq[ofs + k] += ((1<<k) & j)
 116           ? count[ofs * GROUP_COUNTERS / GROUP_BITS + j]
 117           : -count[ofs * GROUP_COUNTERS / GROUP_BITS + j];
 118   } } }
 119
 120   if (md)
 121   { rabin_reset();
 122     freq_to_md (md, freq);
 123 } }