Create pathname-based hash-table lookup into index

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 23 Jan 2008 02:41:14 +0000 (18:41 -0800)

committer Junio C Hamano <gitster@pobox.com>

Wed, 23 Jan 2008 05:46:30 +0000 (21:46 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 23 Jan 2008 02:41:14 +0000 (18:41 -0800)
committer Junio C Hamano <gitster@pobox.com>
Wed, 23 Jan 2008 05:46:30 +0000 (21:46 -0800)
diff --git a/cache.h b/cache.h

index 3a47cdc9d26886db66379c0000bbe80565e2f742..409738ca6b4b6dcefb60937c665bf0f1456a0f5a 100644 (file)
--- a/cache.h
+++ b/cache.h
@@ -3,6 +3,7 @@
  
  #include "git-compat-util.h"
  #include "strbuf.h"
+#include "hash.h"
  
  #include SHA1_HEADER
  #include <zlib.h>
@@ -109,6 +110,7 @@ struct ondisk_cache_entry {
  };
  
  struct cache_entry {
+       struct cache_entry *next;
         unsigned int ce_ctime;
         unsigned int ce_mtime;
         unsigned int ce_dev;
@@ -131,6 +133,7 @@ struct cache_entry {
  #define CE_UPDATE    (0x10000)
  #define CE_REMOVE    (0x20000)
  #define CE_UPTODATE  (0x40000)
+#define CE_UNHASHED  (0x80000)
  
  static inline unsigned create_ce_flags(size_t len, unsigned stage)
  {
@@ -188,6 +191,7 @@ struct index_state {
         struct cache_tree *cache_tree;
         time_t timestamp;
         void *alloc;
+       struct hash_table name_hash;
  };
  
  extern struct index_state the_index;
@@ -211,6 +215,7 @@ extern struct index_state the_index;
  #define refresh_cache(flags) refresh_index(&the_index, (flags), NULL, NULL)
  #define ce_match_stat(ce, st, options) ie_match_stat(&the_index, (ce), (st), (options))
  #define ce_modified(ce, st, options) ie_modified(&the_index, (ce), (st), (options))
+#define cache_name_exists(name, namelen) index_name_exists(&the_index, (name), (namelen))
  #endif
  
  enum object_type {
@@ -297,6 +302,7 @@ extern int read_index_from(struct index_state *, const char *path);
  extern int write_index(struct index_state *, int newfd);
  extern int discard_index(struct index_state *);
  extern int verify_path(const char *path);
+extern int index_name_exists(struct index_state *istate, const char *name, int namelen);
  extern int index_name_pos(struct index_state *, const char *name, int namelen);
  #define ADD_CACHE_OK_TO_ADD 1          /* Ok to add */
  #define ADD_CACHE_OK_TO_REPLACE 2      /* Ok to replace file/directory */
diff --git a/dir.c b/dir.c

index 1b9cc7a8a81002fa2971d71c41a256444e6112fe..6543105b9622212430a9e5ed131a81074e019d9a 100644 (file)
--- a/dir.c
+++ b/dir.c
@@ -346,7 +346,7 @@ static struct dir_entry *dir_entry_new(const char *pathname, int len)
  
  struct dir_entry *dir_add_name(struct dir_struct *dir, const char *pathname, int len)
  {
-       if (cache_name_pos(pathname, len) >= 0)
+       if (cache_name_exists(pathname, len))
                 return NULL;
  
         ALLOC_GROW(dir->entries, dir->nr+1, dir->alloc);
diff --git a/read-cache.c b/read-cache.c

index 07abd5d7ebfc10998a3c6140a7a325d752d70b49..9477c0b398125c3759ae3692de80e75ae62e11e0 100644 (file)
--- a/read-cache.c
+++ b/read-cache.c
@@ -23,6 +23,70 @@
  
  struct index_state the_index;
  
+/*
+ * Hash the first namelen bytes of name: start from the seed 0x123 and,
+ * for each byte (taken as unsigned char), multiply the running value by
+ * 101 and add the byte.
+ *
+ * The length is tested before any byte is read, so a zero (or negative)
+ * namelen reads nothing and returns the seed instead of running off the
+ * end of the buffer.  Results for namelen > 0 are unaffected.
+ */
+static unsigned int hash_name(const char *name, int namelen)
+{
+       unsigned int hash = 0x123;
+
+       for (; namelen > 0; namelen--) {
+               unsigned char c = *name++;
+               hash = hash*101 + c;
+       }
+       return hash;
+}
+
+/*
+ * Store ce in slot nr of istate->cache and enter it into
+ * istate->name_hash under the hash of its name.
+ *
+ * When insert_hash() returns a non-NULL slot pointer, ce is pushed onto
+ * the front of the chain found in that slot, with ce->next pointing at
+ * the previous head.
+ *
+ * NOTE(review): presumably a NULL return means ce itself became the
+ * bucket's only entry -- confirm against hash.h.  ce->next is cleared
+ * before the insert so that, in that case, index_name_exists() does not
+ * walk a stale or uninitialized link.
+ */
+static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce)
+{
+       void **pos;
+       unsigned int hash = hash_name(ce->name, ce_namelen(ce));
+
+       istate->cache[nr] = ce;
+       ce->next = NULL;
+       pos = insert_hash(hash, ce, &istate->name_hash);
+       if (pos) {
+               ce->next = *pos;
+               *pos = ce;
+       }
+}
+
+/*
+ * Lookups skip entries flagged CE_UNHASHED, so "removing" an entry from
+ * the name hash only sets that flag; the entry itself stays linked.
+ *
+ * Truly unlinking it would take a walk of the bucket's list (the easy
+ * part) and, whenever that leaves the bucket empty (the common case),
+ * rehashing other buckets as well.  Flagging the entry is much better.
+ */
+static void remove_hash_entry(struct index_state *istate, struct cache_entry *ce)
+{
+       ce->ce_flags = ce->ce_flags | CE_UNHASHED;
+}
+
+/*
+ * Put ce into slot nr of the index in place of the entry already there.
+ *
+ * If ce is a different object from the current occupant, the occupant is
+ * marked as removed from the name hash and ce is stored and hashed in
+ * its place.  The index is flagged as changed in either case.
+ */
+static void replace_index_entry(struct index_state *istate, int nr, struct cache_entry *ce)
+{
+       struct cache_entry *prev = istate->cache[nr];
+
+       istate->cache_changed = 1;
+       if (prev == ce)
+               return;
+       remove_hash_entry(istate, prev);
+       set_index_entry(istate, nr, ce);
+}
+
+/*
+ * Report whether the index holds a live entry matching name.
+ *
+ * Walks the chain that lookup_hash() returns for the name's hash and
+ * ignores entries flagged CE_UNHASHED.  Returns 1 as soon as
+ * cache_name_compare() returns 0 for an entry, and 0 if the chain ends
+ * without such an entry.
+ */
+int index_name_exists(struct index_state *istate, const char *name, int namelen)
+{
+       struct cache_entry *ent;
+
+       ent = lookup_hash(hash_name(name, namelen), &istate->name_hash);
+       for (; ent; ent = ent->next) {
+               if (ent->ce_flags & CE_UNHASHED)
+                       continue;
+               if (cache_name_compare(name, namelen, ent->name, ent->ce_flags) == 0)
+                       return 1;
+       }
+       return 0;
+}
+
  /*
   * This only updates the "non-critical" parts of the directory
   * cache, ie the parts that aren't tracked by GIT, and only used
@@ -327,6 +391,9 @@ int index_name_pos(struct index_state *istate, const char *name, int namelen)
  /* Remove entry, return true if there are more entries to go.. */
  int remove_index_entry_at(struct index_state *istate, int pos)
  {
+       struct cache_entry *ce = istate->cache[pos];
+
+       remove_hash_entry(istate, ce);
         istate->cache_changed = 1;
         istate->cache_nr--;
         if (pos >= istate->cache_nr)
@@ -702,8 +769,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
  
         /* existing match? Just replace it. */
         if (pos >= 0) {
-               istate->cache_changed = 1;
-               istate->cache[pos] = ce;
+               replace_index_entry(istate, pos, ce);
                 return 0;
         }
         pos = -pos-1;
@@ -763,7 +829,7 @@ int add_index_entry(struct index_state *istate, struct cache_entry *ce, int opti
                 memmove(istate->cache + pos + 1,
                         istate->cache + pos,
                         (istate->cache_nr - pos - 1) * sizeof(ce));
-       istate->cache[pos] = ce;
+       set_index_entry(istate, pos, ce);
         istate->cache_changed = 1;
         return 0;
  }
@@ -892,11 +958,8 @@ int refresh_index(struct index_state *istate, unsigned int flags, const char **p
                         has_errors = 1;
                         continue;
                 }
-               istate->cache_changed = 1;
-               /* You can NOT just free istate->cache[i] here, since it
-                * might not be necessarily malloc()ed but can also come
-                * from mmap(). */
-               istate->cache[i] = new;
+
+               replace_index_entry(istate, i, new);
         }
         return has_errors;
  }
@@ -971,6 +1034,20 @@ static void convert_from_disk(struct ondisk_cache_entry *ondisk, struct cache_en
         memcpy(ce->name, ondisk->name, len + 1);
  }
  
+/*
+ * Estimate the memory needed for the in-core entries of an index whose
+ * on-disk form is ondisk_size bytes and holds `entries` entries.
+ *
+ * Each entry is budgeted the size difference between struct cache_entry
+ * and struct ondisk_cache_entry, plus sizeof(void *) to allow for
+ * alignment differences ("alignof" would be the right figure, but that
+ * is a gcc'ism).
+ */
+static inline size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
+{
+       long growth = sizeof(struct cache_entry) - sizeof(struct ondisk_cache_entry);
+
+       growth += sizeof(void *);
+       return ondisk_size + entries*growth;
+}
+
  /* remember to discard_cache() before reading a different cache! */
  int read_index_from(struct index_state *istate, const char *path)
  {
@@ -1021,7 +1098,7 @@ int read_index_from(struct index_state *istate, const char *path)
          * has room for a few  more flags, we can allocate using the same
          * index size
          */
-       istate->alloc = xmalloc(mmap_size);
+       istate->alloc = xmalloc(estimate_cache_size(mmap_size, istate->cache_nr));
  
         src_offset = sizeof(*hdr);
         dst_offset = 0;
@@ -1032,7 +1109,7 @@ int read_index_from(struct index_state *istate, const char *path)
                 disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
                 ce = (struct cache_entry *)((char *)istate->alloc + dst_offset);
                 convert_from_disk(disk_ce, ce);
-               istate->cache[i] = ce;
+               set_index_entry(istate, i, ce);
  
                 src_offset += ondisk_ce_size(ce);
                 dst_offset += ce_size(ce);
@@ -1070,6 +1147,7 @@ int discard_index(struct index_state *istate)
         istate->cache_nr = 0;
         istate->cache_changed = 0;
         istate->timestamp = 0;
+       free_hash(&istate->name_hash);
         cache_tree_free(&(istate->cache_tree));
         free(istate->alloc);
         istate->alloc = NULL;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 23 Jan 2008 02:41:14 +0000 (18:41 -0800)
committer	Junio C Hamano <gitster@pobox.com>
	Wed, 23 Jan 2008 05:46:30 +0000 (21:46 -0800)
cache.h		patch \| blob \| history
dir.c		patch \| blob \| history
read-cache.c		patch \| blob \| history