Code

Do linear-time/space rename logic for exact renames
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Oct 2007 18:23:26 +0000 (11:23 -0700)
committerJunio C Hamano <gitster@pobox.com>
Sat, 27 Oct 2007 06:18:06 +0000 (23:18 -0700)
This implements a smarter rename detector for exact renames, which
rather than doing a pairwise comparison (time O(m*n)) will just hash the
files into a hash-table (size O(n+m)), and only do pairwise comparisons
to renames that have the same hash (time O(n+m) except for unrealistic
hash collisions, which we just cull aggressively).

Admittedly the exact rename case is not nearly as interesting as the
generic case, but it's an important case none-the-less. A similar general
approach should work for the generic case too, but even then you do need
to handle the exact renames/copies separately (to avoid the inevitable
added cost factor that comes from the _size_ of the file), so this is
worth doing.

In the expectation that we will indeed do the same hashing trick for the
general rename case, this code uses a generic hash-table implementation
that can be used for other things too.  In fact, we might be able to
consolidate some of our existing hash tables with the new generic code
in hash.[ch].

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Makefile
diffcore-rename.c
hash.c [new file with mode: 0644]
hash.h [new file with mode: 0644]

index ba969e4397810504d431364369a706fb4c2df8f9..2e6fd8f21950c1acf50f8d50158c93b53ca70c23 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -290,7 +290,7 @@ LIB_H = \
        run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \
        tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \
        utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \
-       mailmap.h remote.h transport.h diffcore.h
+       mailmap.h remote.h transport.h diffcore.h hash.h
 
 DIFF_OBJS = \
        diff.o diff-lib.o diffcore-break.o diffcore-order.o \
@@ -300,7 +300,7 @@ DIFF_OBJS = \
 LIB_OBJS = \
        blob.o commit.o connect.o csum-file.o cache-tree.o base85.o \
        date.o diff-delta.o entry.o exec_cmd.o ident.o \
-       interpolate.o \
+       interpolate.o hash.o \
        lockfile.o \
        patch-ids.o \
        object.o pack-check.o pack-write.o patch-delta.o path.o pkt-line.o \
index edb2424d13ac602431362fd630175128456c58db..e7e370b2cc1e66f7724755c1b33f2a815b7223bf 100644 (file)
@@ -4,6 +4,7 @@
 #include "cache.h"
 #include "diff.h"
 #include "diffcore.h"
+#include "hash.h"
 
 /* Table of rename/copy destinations */
 
@@ -93,29 +94,6 @@ static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
        return &(rename_src[first]);
 }
 
-static int is_exact_match(struct diff_filespec *src,
-                         struct diff_filespec *dst,
-                         int contents_too)
-{
-       if (src->sha1_valid && dst->sha1_valid &&
-           !hashcmp(src->sha1, dst->sha1))
-               return 1;
-       if (!contents_too)
-               return 0;
-       if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
-               return 0;
-       if (src->size != dst->size)
-               return 0;
-       if (src->sha1_valid && dst->sha1_valid)
-           return !hashcmp(src->sha1, dst->sha1);
-       if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
-               return 0;
-       if (src->size == dst->size &&
-           !memcmp(src->data, dst->data, src->size))
-               return 1;
-       return 0;
-}
-
 static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
 {
        int src_len = strlen(src->path), dst_len = strlen(dst->path);
@@ -242,56 +220,163 @@ static int score_compare(const void *a_, const void *b_)
        return b->score - a->score;
 }
 
+/*
+ * One node of the singly linked list that hangs off a hash-table slot
+ * during exact-rename detection.
+ */
+struct file_similarity {
+       int src_dst, index; /* src_dst < 0: rename source, otherwise destination; index: position passed to record_rename_pair() */
+       struct diff_filespec *filespec; /* the file this node describes */
+       struct file_similarity *next; /* next node in the same list, or NULL */
+};
+
+/*
+ * Match every destination in "dst" against the sources in "src" and
+ * record a rename pair (with MAX_SCORE) for each destination that has
+ * an identical source.
+ *
+ * Both lists come from the same hash-table slot, so their entries only
+ * share the leading bytes of the SHA-1; the full SHA-1 is compared here.
+ * "dst" must be non-NULL (the loop is a do/while); "src" may be walked
+ * several times, once per destination.
+ *
+ * Returns the number of rename pairs recorded.
+ */
+static int find_identical_files(struct file_similarity *src,
+                               struct file_similarity *dst)
+{
+       int renames = 0;
+
+       /*
+        * Walk over all the destinations ...
+        */
+       do {
+               struct diff_filespec *one = dst->filespec;
+               struct file_similarity *p, *best;
+               int i = 100; /* cap on accepted-but-not-basename-matching candidates per destination */
+
+               /*
+                * .. to find the best source match
+                */
+               best = NULL;
+               for (p = src; p; p = p->next) {
+                       struct diff_filespec *two = p->filespec;
+
+                       /* False hash collision? */
+                       if (hashcmp(one->sha1, two->sha1))
+                               continue;
+                       /* Non-regular files? If so, the modes must match! */
+                       if (!S_ISREG(one->mode) || !S_ISREG(two->mode)) {
+                               if (one->mode != two->mode)
+                                       continue;
+                       }
+                       /* Remember the latest acceptable source; a basename match wins outright */
+                       best = p;
+                       if (basename_same(one, two))
+                               break;
+
+                       /* Too many identical alternatives? Pick one */
+                       if (!--i)
+                               break;
+               }
+               if (best) {
+                       record_rename_pair(dst->index, best->index, MAX_SCORE);
+                       renames++;
+               }
+       } while ((dst = dst->next) != NULL);
+       return renames;
+}
+
+/*
+ * Release every node of a file_similarity list.
+ *
+ * Note: the later (inexact) rename passes rely on this phase having
+ * populated the filespec of every entry that was not paired up as an
+ * exact rename, so each filespec is populated here right before its
+ * node is freed.
+ */
+static void free_similarity_list(struct file_similarity *p)
+{
+       struct file_similarity *following;
+
+       for (; p; p = following) {
+               following = p->next;
+
+               /* Special case required by the rest of the rename logic */
+               diff_populate_filespec(p->filespec, 0);
+               free(p);
+       }
+}
+
+/*
+ * Per-slot callback handed to for_each_hash().
+ *
+ * "ptr" is the head of a non-empty list of file_similarity nodes that
+ * were inserted under the same hash value.  The list is split into
+ * sources (src_dst < 0) and destinations, the two halves are matched
+ * against each other, and finally every node is freed.
+ *
+ * Returns the number of exact renames found in this slot.
+ */
+static int find_same_files(void *ptr)
+{
+       int ret;
+       struct file_similarity *p = ptr;
+       struct file_similarity *src = NULL, *dst = NULL;
+
+       /* Split the hash list up into sources and destinations */
+       do {
+               struct file_similarity *entry = p;
+               p = p->next;
+               /* Nodes are pushed onto the front, so each sub-list ends up in reverse order */
+               if (entry->src_dst < 0) {
+                       entry->next = src;
+                       src = entry;
+               } else {
+                       entry->next = dst;
+                       dst = entry;
+               }
+       } while (p);
+
+       /*
+        * If we have both sources *and* destinations, see if
+        * we can match them up
+        */
+       ret = (src && dst) ? find_identical_files(src, dst) : 0;
+
+       /* Free the hashes and return the number of renames found */
+       free_similarity_list(src);
+       free_similarity_list(dst);
+       return ret;
+}
+
+/*
+ * Derive the hash-table key for a filespec: the first sizeof(unsigned
+ * int) bytes of its SHA-1.
+ *
+ * When the filespec has no valid SHA-1 yet, its contents are loaded and
+ * hashed as a "blob" into filespec->sha1 first.  If loading fails the
+ * key 0 is returned and filespec->sha1 is left as it was.
+ */
+static unsigned int hash_filespec(struct diff_filespec *filespec)
+{
+       unsigned int key;
+       int must_compute = !filespec->sha1_valid;
+
+       if (must_compute && diff_populate_filespec(filespec, 0))
+               return 0;
+       if (must_compute)
+               hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
+
+       memcpy(&key, filespec->sha1, sizeof(key));
+       return key;
+}
+
+/*
+ * Allocate a file_similarity node for "filespec" and add it to "table"
+ * under the filespec's hash.
+ *
+ * src_dst and index are stored verbatim in the node.  If the slot for
+ * that hash is already occupied, the new node is pushed onto the front
+ * of the list found there.
+ */
+static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec)
+{
+       struct file_similarity *node = xmalloc(sizeof(*node));
+       void **slot;
+
+       node->filespec = filespec;
+       node->index = index;
+       node->src_dst = src_dst;
+       node->next = NULL;
+
+       slot = insert_hash(hash_filespec(filespec), node, table);
+
+       /* NULL means the node went into a fresh slot; nothing to chain */
+       if (!slot)
+               return;
+
+       node->next = *slot;
+       *slot = node;
+}
+
 /*
  * Find exact renames first.
  *
  * The first round matches up the up-to-date entries,
  * and then during the second round we try to match
  * cache-dirty entries as well.
- *
- * Note: the rest of the rename logic depends on this
- * phase also populating all the filespecs for any
- * entry that isn't matched up with an exact rename,
- * see "is_exact_match()".
  */
 static int find_exact_renames(void)
 {
-       int rename_count = 0;
-       int contents_too;
-
-       for (contents_too = 0; contents_too < 2; contents_too++) {
-               int i;
-
-               for (i = 0; i < rename_dst_nr; i++) {
-                       struct diff_filespec *two = rename_dst[i].two;
-                       int j;
-
-                       if (rename_dst[i].pair)
-                               continue; /* dealt with an earlier round */
-                       for (j = 0; j < rename_src_nr; j++) {
-                               int k;
-                               struct diff_filespec *one = rename_src[j].one;
-                               if (!is_exact_match(one, two, contents_too))
-                                       continue;
+       int i;
+       struct hash_table file_table;
 
-                               /* see if there is a basename match, too */
-                               for (k = j; k < rename_src_nr; k++) {
-                                       one = rename_src[k].one;
-                                       if (basename_same(one, two) &&
-                                               is_exact_match(one, two,
-                                                       contents_too)) {
-                                               j = k;
-                                               break;
-                                       }
-                               }
-
-                               record_rename_pair(i, j, (int)MAX_SCORE);
-                               rename_count++;
-                               break; /* we are done with this entry */
-                       }
-               }
-       }
-       return rename_count;
+       init_hash(&file_table);
+       for (i = 0; i < rename_src_nr; i++)
+               insert_file_table(&file_table, -1, i, rename_src[i].one);
+
+       for (i = 0; i < rename_dst_nr; i++)
+               insert_file_table(&file_table, 1, i, rename_dst[i].two);
+
+       /* Find the renames */
+       i = for_each_hash(&file_table, find_same_files);
+
+       /* .. and free the hash data structure */
+       free_hash(&file_table);
+
+       return i;
 }
 
 void diffcore_rename(struct diff_options *options)
diff --git a/hash.c b/hash.c
new file mode 100644 (file)
index 0000000..7b492d4
--- /dev/null
+++ b/hash.c
@@ -0,0 +1,110 @@
+/*
+ * Some generic hashing helpers.
+ */
+#include "cache.h"
+#include "hash.h"
+
+/*
+ * Find the slot for "hash" using linear probing with wrap-around.
+ *
+ * Returns either the slot that already holds this hash value or the
+ * first empty slot met while probing; the caller tells the two apart
+ * by checking the returned entry's ptr.  The table must have a
+ * non-zero size.
+ */
+static struct hash_table_entry *lookup_hash_entry(unsigned int hash, struct hash_table *table)
+{
+       unsigned int capacity = table->size;
+       unsigned int pos = hash % capacity;
+       struct hash_table_entry *slots = table->array;
+
+       while (slots[pos].ptr && slots[pos].hash != hash) {
+               pos++;
+               if (pos >= capacity)
+                       pos = 0;
+       }
+       return slots + pos;
+}
+
+
+/*
+ * Store "ptr" under "hash" unless that hash is already present.
+ *
+ * Returns NULL when the pointer was placed into an empty slot (and the
+ * element count was bumped).  When the hash already had an entry, the
+ * table is left untouched and the address of the existing pointer is
+ * returned, so the caller can chain entries or otherwise resolve the
+ * duplicate.
+ */
+static void **insert_hash_entry(unsigned int hash, void *ptr, struct hash_table *table)
+{
+       struct hash_table_entry *slot = lookup_hash_entry(hash, table);
+
+       if (slot->ptr)
+               return &slot->ptr;
+
+       slot->hash = hash;
+       slot->ptr = ptr;
+       table->nr++;
+       return NULL;
+}
+
+/*
+ * Replace the slot array with a larger, zeroed one (size chosen by
+ * alloc_nr()) and re-insert every occupied slot of the old array,
+ * which is then freed.
+ */
+static void grow_hash_table(struct hash_table *table)
+{
+       struct hash_table_entry *prev = table->array;
+       unsigned int prev_size = table->size;
+       unsigned int idx;
+
+       table->size = alloc_nr(prev_size);
+       table->array = xcalloc(sizeof(struct hash_table_entry), table->size);
+       table->nr = 0; /* recounted by insert_hash_entry() below */
+
+       for (idx = 0; idx < prev_size; idx++) {
+               if (prev[idx].ptr)
+                       insert_hash_entry(prev[idx].hash, prev[idx].ptr, table);
+       }
+       free(prev);
+}
+
+/*
+ * Look up the pointer stored under "hash".
+ *
+ * Returns the pointer held in the slot for this hash value, or NULL
+ * when the table has no storage yet or the probe ends on an empty
+ * slot (an empty slot's ptr is NULL).  The stored pointer itself is
+ * returned, not the address of the slot, so a plain NULL check is
+ * enough to detect a miss.
+ */
+void *lookup_hash(unsigned int hash, struct hash_table *table)
+{
+       if (!table->array)
+               return NULL;
+       return lookup_hash_entry(hash, table)->ptr;
+}
+
+/*
+ * Public insertion entry point: grow the table first if it is at least
+ * half full (this also covers the initial, empty table), then insert.
+ *
+ * Returns NULL if "ptr" was stored in a fresh slot, or the address of
+ * the pointer already stored under "hash" (in which case nothing was
+ * inserted).
+ */
+void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table)
+{
+       if (table->nr >= table->size/2)
+               grow_hash_table(table);
+       return insert_hash_entry(hash, ptr, table);
+}
+
+/*
+ * Call "fn" on the pointer stored in every occupied slot, in slot
+ * order.
+ *
+ * A negative return from "fn" stops the walk and is returned as is;
+ * otherwise the sum of all return values is returned.
+ */
+int for_each_hash(struct hash_table *table, int (*fn)(void *))
+{
+       struct hash_table_entry *slots = table->array;
+       unsigned int nslots = table->size;
+       unsigned int pos;
+       int total = 0;
+
+       for (pos = 0; pos < nslots; pos++) {
+               void *payload = slots[pos].ptr;
+               int rc;
+
+               if (!payload)
+                       continue;
+               rc = fn(payload);
+               if (rc < 0)
+                       return rc;
+               total += rc;
+       }
+       return total;
+}
+
+/*
+ * Release the slot array and return the table to its empty state.
+ * The pointers stored in the table are not freed.
+ */
+void free_hash(struct hash_table *table)
+{
+       free(table->array);
+       table->nr = 0;
+       table->size = 0;
+       table->array = NULL;
+}
diff --git a/hash.h b/hash.h
new file mode 100644 (file)
index 0000000..a8b0fbb
--- /dev/null
+++ b/hash.h
@@ -0,0 +1,43 @@
+#ifndef HASH_H
+#define HASH_H
+
+/*
+ * These are some simple generic hash table helper functions.
+ * Not necessarily suitable for all users, but good for things
+ * where you want to just keep track of a list of things, and
+ * have a good hash to use on them.
+ *
+ * It keeps the hash table at roughly 50-75% free, so the memory
+ * cost of the hash table itself is roughly
+ *
+ *     3 * 2*sizeof(void *) * nr_of_objects
+ *
+ * bytes.
+ *
+ * FIXME: on 64-bit architectures, we waste memory. It would be
+ * good to have just 32-bit pointers, requiring a special allocator
+ * for hashed entries or something.
+ *
+ * A slot is considered empty when its ptr is NULL, so NULL cannot be
+ * stored as a value.
+ */
+struct hash_table_entry {
+       unsigned int hash; /* hash value supplied by the caller on insert */
+       void *ptr; /* caller's payload; NULL means the slot is unused */
+};
+
+struct hash_table {
+       unsigned int size, nr; /* size: number of slots in array; nr: slots in use */
+       struct hash_table_entry *array; /* slot storage, NULL until the first insert */
+};
+
+/* Look up the entry stored under "hash"; returns NULL when the table has no storage allocated. */
+extern void *lookup_hash(unsigned int hash, struct hash_table *table);
+/*
+ * Insert "ptr" under "hash", growing the table when needed.  Returns NULL
+ * if it was stored in a fresh slot; otherwise nothing is stored and the
+ * address of the pointer already present is returned.
+ */
+extern void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table);
+/*
+ * Call "fn" on every stored pointer.  Returns the sum of the return
+ * values, or the first negative one, which also stops the walk.
+ */
+extern int for_each_hash(struct hash_table *table, int (*fn)(void *));
+/* Free the slot array and reset the table to empty; stored pointers are not freed. */
+extern void free_hash(struct hash_table *table);
+
+/* Put a hash table into the empty state: no slot array, zero size and count. */
+static inline void init_hash(struct hash_table *table)
+{
+       table->array = NULL;
+       table->nr = 0;
+       table->size = 0;
+}
+
+#endif