summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 87cefaa)
raw | patch | inline | side by side (parent: 87cefaa)
author | Linus Torvalds <torvalds@osdl.org> | |
Mon, 5 Jun 2006 19:03:31 +0000 (12:03 -0700) | ||
committer | Junio C Hamano <junkio@cox.net> | |
Tue, 6 Jun 2006 00:23:31 +0000 (17:23 -0700) |
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in a simplification of the sorting number computation
for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in a simplification of the sorting number computation
for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
pack-objects.c | patch | blob | history |
diff --git a/pack-objects.c b/pack-objects.c
index 3590cd5e5983cde07016f7a1f20ce146b6a73703..179560f2bd67e08d25bef0199c41ee0ee70793f9 100644 (file)
--- a/pack-objects.c
+++ b/pack-objects.c
}
}
-struct name_path {
- struct name_path *up;
- const char *elem;
- int len;
-};
-
-#define DIRBITS 12
-
-static unsigned name_hash(struct name_path *path, const char *name)
+static unsigned name_hash(const char *name)
{
- struct name_path *p = path;
- const char *n = name + strlen(name);
- unsigned hash = 0, name_hash = 0, name_done = 0;
-
- if (n != name && n[-1] == '\n')
- n--;
- while (name <= --n) {
- unsigned char c = *n;
- if (c == '/' && !name_done) {
- name_hash = hash;
- name_done = 1;
- hash = 0;
- }
- hash = hash * 11 + c;
- }
- if (!name_done) {
- name_hash = hash;
- hash = 0;
- }
- for (p = path; p; p = p->up) {
- hash = hash * 11 + '/';
- n = p->elem + p->len;
- while (p->elem <= --n) {
- unsigned char c = *n;
- hash = hash * 11 + c;
- }
- }
+ unsigned char c;
+ unsigned hash = 0;
+
/*
- * Make sure "Makefile" and "t/Makefile" are hashed separately
- * but close enough.
+ * This effectively just creates a sortable number from the
+ * last sixteen non-whitespace characters. Last characters
+ * count "most", so things that end in ".c" sort together.
*/
- hash = (name_hash<<DIRBITS) | (hash & ((1U<<DIRBITS )-1));
+ while ((c = *name++) != 0) {
+ if (isspace(c))
+ continue;
+ hash = (hash >> 2) + (c << 24);
+ }
return hash;
}
}
static void add_pbase_object(struct tree_desc *tree,
- struct name_path *up,
const char *name,
- int cmplen)
+ int cmplen,
+ const char *fullname)
{
struct name_entry entry;
sha1_object_info(entry.sha1, type, &size))
continue;
if (name[cmplen] != '/') {
- unsigned hash = name_hash(up, name);
+ unsigned hash = name_hash(fullname);
add_object_entry(entry.sha1, hash, 1);
return;
}
if (!strcmp(type, tree_type)) {
struct tree_desc sub;
- struct name_path me;
struct pbase_tree_cache *tree;
const char *down = name+cmplen+1;
int downlen = name_cmp_len(down);
sub.buf = tree->tree_data;
sub.size = tree->tree_size;
- me.up = up;
- me.elem = entry.path;
- me.len = entry.pathlen;
- add_pbase_object(&sub, &me, down, downlen);
+ add_pbase_object(&sub, down, downlen, fullname);
pbase_tree_put(tree);
}
}
for (it = pbase_tree; it; it = it->next) {
if (cmplen == 0) {
- hash = name_hash(NULL, "");
+ hash = name_hash("");
add_object_entry(it->pcache.sha1, hash, 1);
}
else {
struct tree_desc tree;
tree.buf = it->pcache.tree_data;
tree.size = it->pcache.tree_size;
- add_pbase_object(&tree, NULL, name, cmplen);
+ add_pbase_object(&tree, name, cmplen, name);
}
}
}
}
if (get_sha1_hex(line, sha1))
die("expected sha1, got garbage:\n %s", line);
- hash = name_hash(NULL, line+41);
+ hash = name_hash(line+41);
add_preferred_base_object(line+41, hash);
add_object_entry(sha1, hash, 0);
}