summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: ee5743c)
raw | patch | inline | side by side (parent: ee5743c)
author | Nicolas Pitre <nico@cam.org> | |
Mon, 9 Apr 2007 05:06:33 +0000 (01:06 -0400) | ||
committer | Junio C Hamano <junkio@cox.net> | |
Tue, 10 Apr 2007 19:48:14 +0000 (12:48 -0700) |
Pack index version 2 goes as follows:
- 8 bytes of header with signature and version.
- 256 entries of 4-byte first-level fan-out table.
- Table of sorted 20-byte SHA1 records for each object in pack.
- Table of 4-byte CRC32 entries for raw pack object data.
- Table of 4-byte offset entries for objects in the pack if offset is
representable with 31 bits or less, otherwise it is an index in the next
table with top bit set.
- Table of 8-byte offset entries indexed from previous table for offsets
which are 32 bits or more (optional).
- 20-byte SHA1 checksum of sorted object names.
- 20-byte SHA1 checksum of the above.
The object SHA1 table is all contiguous so future pack format that would
contain this table directly won't require big changes to the code. It is
also tighter for slightly better cache locality when looking up entries.
Support for large packs exceeding 31 bits in size won't impose an index
size bloat for packs within that range that don't need a 64-bit offset.
And because newer objects which are likely to be the most frequently used
are located at the beginning of the pack, they won't pay the 64-bit offset
lookup at run time either even if the pack is large.
Right now an index version 2 is created only when the biggest offset in a
pack reaches 31 bits. It might be a good idea to always use index version
2 eventually to benefit from the CRC32 it contains when reusing pack data
while repacking.
[jc: with the "oops" fix to keep track of the last offset correctly]
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
- 8 bytes of header with signature and version.
- 256 entries of 4-byte first-level fan-out table.
- Table of sorted 20-byte SHA1 records for each object in pack.
- Table of 4-byte CRC32 entries for raw pack object data.
- Table of 4-byte offset entries for objects in the pack if offset is
representable with 31 bits or less, otherwise it is an index in the next
table with top bit set.
- Table of 8-byte offset entries indexed from previous table for offsets
which are 32 bits or more (optional).
- 20-byte SHA1 checksum of sorted object names.
- 20-byte SHA1 checksum of the above.
The object SHA1 table is all contiguous so future pack format that would
contain this table directly won't require big changes to the code. It is
also tighter for slightly better cache locality when looking up entries.
Support for large packs exceeding 31 bits in size won't impose an index
size bloat for packs within that range that don't need a 64-bit offset.
And because newer objects which are likely to be the most frequently used
are located at the beginning of the pack, they won't pay the 64-bit offset
lookup at run time either even if the pack is large.
Right now an index version 2 is created only when the biggest offset in a
pack reaches 31 bits. It might be a good idea to always use index version
2 eventually to benefit from the CRC32 it contains when reusing pack data
while repacking.
[jc: with the "oops" fix to keep track of the last offset correctly]
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
builtin-pack-objects.c | patch | blob | history |
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 03e36f01830bc246eaccac85a36a0368d15c9995..8cf2871751e7d2d04e653224699aecb71c8bd79b 100644 (file)
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
int i;
const char *index = p->index_data;
- index += 4 * 256;
rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
- for (i = 0; i < num_ent; i++) {
- uint32_t hl = *((uint32_t *)(index + 24 * i));
- rix->revindex[i].offset = ntohl(hl);
- rix->revindex[i].nr = i;
+ index += 4 * 256;
+
+ if (p->index_version > 1) {
+ const uint32_t *off_32 =
+ (uint32_t *)(index + 8 + p->num_objects * (20 + 4));
+ const uint32_t *off_64 = off_32 + p->num_objects;
+ for (i = 0; i < num_ent; i++) {
+ uint32_t off = ntohl(*off_32++);
+ if (!(off & 0x80000000)) {
+ rix->revindex[i].offset = off;
+ } else {
+ rix->revindex[i].offset =
+ ((uint64_t)ntohl(*off_64++)) << 32;
+ rix->revindex[i].offset |=
+ ntohl(*off_64++);
+ }
+ rix->revindex[i].nr = i;
+ }
+ } else {
+ for (i = 0; i < num_ent; i++) {
+ uint32_t hl = *((uint32_t *)(index + 24 * i));
+ rix->revindex[i].offset = ntohl(hl);
+ rix->revindex[i].nr = i;
+ }
}
+
/* This knows the pack format -- the 20-byte trailer
* follows immediately after the last object data.
*/
return offset + size;
}
-static void write_pack_file(void)
+static off_t write_pack_file(void)
{
uint32_t i;
struct sha1file *f;
- off_t offset;
+ off_t offset, last_obj_offset = 0;
struct pack_header hdr;
unsigned last_percent = 999;
int do_progress = progress;
if (!nr_result)
goto done;
for (i = 0; i < nr_objects; i++) {
+ last_obj_offset = offset;
offset = write_one(f, objects + i, offset);
if (do_progress) {
unsigned percent = written * 100 / nr_result;
if (written != nr_result)
die("wrote %u objects while expecting %u", written, nr_result);
sha1close(f, pack_file_sha1, 1);
+
+ return last_obj_offset;
}
-static void write_index_file(void)
+static void write_index_file(off_t last_obj_offset)
{
uint32_t i;
struct sha1file *f = sha1create("%s-%s.%s", base_name,
struct object_entry **list = sorted_by_sha;
struct object_entry **last = list + nr_result;
uint32_t array[256];
+ uint32_t index_version;
+
+ /* if last object's offset is >= 2^31 we should use index V2 */
+ index_version = (last_obj_offset >> 31) ? 2 : 1;
+
+ /* index versions 2 and above need a header */
+ if (index_version >= 2) {
+ struct pack_idx_header hdr;
+ hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
+ hdr.idx_version = htonl(index_version);
+ sha1write(f, &hdr, sizeof(hdr));
+ }
/*
* Write the first-level table (the list is sorted,
list = sorted_by_sha;
for (i = 0; i < nr_result; i++) {
struct object_entry *entry = *list++;
- uint32_t offset = htonl(entry->offset);
- sha1write(f, &offset, 4);
+ if (index_version < 2) {
+ uint32_t offset = htonl(entry->offset);
+ sha1write(f, &offset, 4);
+ }
sha1write(f, entry->sha1, 20);
}
+
+ if (index_version >= 2) {
+ unsigned int nr_large_offset = 0;
+
+ /* write the crc32 table */
+ list = sorted_by_sha;
+ for (i = 0; i < nr_objects; i++) {
+ struct object_entry *entry = *list++;
+ uint32_t crc32_val = htonl(entry->crc32);
+ sha1write(f, &crc32_val, 4);
+ }
+
+ /* write the 32-bit offset table */
+ list = sorted_by_sha;
+ for (i = 0; i < nr_objects; i++) {
+ struct object_entry *entry = *list++;
+ uint32_t offset = (entry->offset <= 0x7fffffff) ?
+ entry->offset : (0x80000000 | nr_large_offset++);
+ offset = htonl(offset);
+ sha1write(f, &offset, 4);
+ }
+
+ /* write the large offset table */
+ list = sorted_by_sha;
+ while (nr_large_offset) {
+ struct object_entry *entry = *list++;
+ uint64_t offset = entry->offset;
+ if (offset > 0x7fffffff) {
+ uint32_t split[2];
+ split[0] = htonl(offset >> 32);
+ split[1] = htonl(offset & 0xffffffff);
+ sha1write(f, split, 8);
+ nr_large_offset--;
+ }
+ }
+ }
+
sha1write(f, pack_file_sha1, 20);
sha1close(f, NULL, 1);
}
if (reuse_cached_pack(object_list_sha1))
;
else {
+ off_t last_obj_offset;
if (nr_result)
prepare_pack(window, depth);
if (progress == 1 && pack_to_stdout) {
signal(SIGALRM, SIG_IGN );
progress_update = 0;
}
- write_pack_file();
+ last_obj_offset = write_pack_file();
if (!pack_to_stdout) {
- write_index_file();
+ write_index_file(last_obj_offset);
puts(sha1_to_hex(object_list_sha1));
}
}