summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: fd2bbdd)
author | Junio C Hamano <junkio@cox.net> | Mon, 17 Apr 2006 04:07:32 +0000 (21:07 -0700)
committer | Junio C Hamano <junkio@cox.net> | Mon, 17 Apr 2006 04:21:46 +0000 (21:21 -0700)
Define a function to compute similarity score 0.0<=score<=1.0
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Junio C Hamano <junkio@cox.net>
gsimm.c | patch | blob | history | |
gsimm.h | patch | blob | history | |
test-gsimm.c | patch | blob | history |
index 7024bf8f58e219a9b42a6b5e5217a843b6d75130..bd646eb3d245460cd126327b39ca464b041f7a41 100644 (file)
--- a/gsimm.c
+++ b/gsimm.c
+#include <string.h>
#include "rabinpoly.h"
#include "gsimm.h"
bzero (freq, sizeof(freq[0]) * MD_BITS);
}
+static int dist (u_char *l, u_char *r)
+{ int j, k;
+ int d = 0;
+
+ for (j = 0; j < MD_LENGTH; j++)
+ { u_char ch = l[j] ^ r[j];
+
+ for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0);
+ }
+
+ return d;
+}
+
+double gb_simm_score(u_char *l, u_char *r)
+{
+ int d = dist(l, r);
+ double sim = (double) (d) / (MD_LENGTH * 4 - 1);
+ if (1.0 < sim)
+ return 0;
+ else
+ return 1.0 - sim;
+}
+
void gb_simm_process(u_char *data, unsigned len, u_char *md)
{ size_t j = 0;
u_int32_t ofs;
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
int freq[MD_BITS];
+ if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
+ memset(md, 0, MD_LENGTH);
+ return;
+ }
+
bzero (freq, sizeof(freq[0]) * MD_BITS);
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
index 4b023b91a93f8e1c6bd7c232f89553a208cc4a68..17fab32d8707d00137054a9c2993443f7f65bf7b 100644 (file)
--- a/gsimm.h
+++ b/gsimm.h
In order to get at least an average of 12 samples
per bit in the final message digest, require at least 3 * MD_LENGTH
complete windows in the file. */
-#define MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
+#define GB_SIMM_MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
/* Limit matching algorithm to files less than 256 MB, so we can use
32 bit integers everywhere without fear of overflow. For larger
files we should add logic to mmap the file by piece and accumulate
the frequency counts. */
-#define MAX_FILE_SIZE (256*1024*1024 - 1)
+#define GB_SIMM_MAX_FILE_SIZE (256*1024*1024 - 1)
void gb_simm_process(u_char *data, unsigned len, u_char *md);
+double gb_simm_score(u_char *l, u_char *r);
#endif
diff --git a/test-gsimm.c b/test-gsimm.c
index bd28b7da28e4456ca7fea883144e2ebbfb7b6625..b1e7939b6573027f39c5a86295c03d863ca01676 100644 (file)
--- a/test-gsimm.c
+++ b/test-gsimm.c
exit (1);
}
-int dist (u_char *l, u_char *r)
-{ int j, k;
- int d = 0;
-
- for (j = 0; j < MD_LENGTH; j++)
- { u_char ch = l[j] ^ r[j];
-
- for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0);
- }
-
- return d;
-}
-
char *md_to_str(u_char *md)
{ int j;
exit (2);
}
- if (fs.st_size >= MIN_FILE_SIZE
- && fs.st_size <= MAX_FILE_SIZE)
+ if (fs.st_size >= GB_SIMM_MIN_FILE_SIZE
+ && fs.st_size <= GB_SIMM_MAX_FILE_SIZE)
{ fi->length = fs.st_size;
fi->name = name;
gb_simm_process (data, fs.st_size, fi->md);
if (flag_relative)
- { int d = dist (fi->md, relative_md);
- double sim = 1.0 - MIN (1.0, (double) (d) / (MD_LENGTH * 4 - 1));
- fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
- md_to_str (fi->md), (long long unsigned) 0,
- (unsigned) fs.st_size, name,
- d, 100.0 * sim);
- }
+ fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
+ md_to_str (fi->md), (long long unsigned) 0,
+ (unsigned) fs.st_size, name,
+ (unsigned) 0,
+ 100.0 * gb_simm_score(fi->md, relative_md));
else
{
fprintf (stdout, "%s %llu %u %s\n",