Code

diffcore-break: micro-optimize by avoiding delta between identical files.
[git.git] / index-pack.c
1 #include "cache.h"
2 #include "delta.h"
3 #include "pack.h"
4 #include "csum-file.h"
6 static const char index_pack_usage[] =
7 "git-index-pack [-o index-file] pack-file";
9 struct object_entry
10 {
11         unsigned long offset;
12         enum object_type type;
13         enum object_type real_type;
14         unsigned char sha1[20];
15 };
17 struct delta_entry
18 {
19         struct object_entry *obj;
20         unsigned char base_sha1[20];
21 };
23 static const char *pack_name;
24 static unsigned char *pack_base;
25 static unsigned long pack_size;
26 static struct object_entry *objects;
27 static struct delta_entry *deltas;
28 static int nr_objects;
29 static int nr_deltas;
31 static void open_pack_file(void)
32 {
33         int fd;
34         struct stat st;
36         fd = open(pack_name, O_RDONLY);
37         if (fd < 0)
38                 die("cannot open packfile '%s': %s", pack_name,
39                     strerror(errno));
40         if (fstat(fd, &st)) {
41                 int err = errno;
42                 close(fd);
43                 die("cannot fstat packfile '%s': %s", pack_name,
44                     strerror(err));
45         }
46         pack_size = st.st_size;
47         pack_base = mmap(NULL, pack_size, PROT_READ, MAP_PRIVATE, fd, 0);
48         if (pack_base == MAP_FAILED) {
49                 int err = errno;
50                 close(fd);
51                 die("cannot mmap packfile '%s': %s", pack_name,
52                     strerror(err));
53         }
54         close(fd);
55 }
57 static void parse_pack_header(void)
58 {
59         const struct pack_header *hdr;
60         unsigned char sha1[20];
61         SHA_CTX ctx;
63         /* Ensure there are enough bytes for the header and final SHA1 */
64         if (pack_size < sizeof(struct pack_header) + 20)
65                 die("packfile '%s' is too small", pack_name);
67         /* Header consistency check */
68         hdr = (void *)pack_base;
69         if (hdr->hdr_signature != htonl(PACK_SIGNATURE))
70                 die("packfile '%s' signature mismatch", pack_name);
71         if (!pack_version_ok(hdr->hdr_version))
72                 die("packfile '%s' version %d unsupported",
73                     pack_name, ntohl(hdr->hdr_version));
75         nr_objects = ntohl(hdr->hdr_entries);
77         /* Check packfile integrity */
78         SHA1_Init(&ctx);
79         SHA1_Update(&ctx, pack_base, pack_size - 20);
80         SHA1_Final(sha1, &ctx);
81         if (memcmp(sha1, pack_base + pack_size - 20, 20))
82                 die("packfile '%s' SHA1 mismatch", pack_name);
83 }
85 static void bad_object(unsigned long offset, const char *format,
86                        ...) NORETURN __attribute__((format (printf, 2, 3)));
88 static void bad_object(unsigned long offset, const char *format, ...)
89 {
90         va_list params;
91         char buf[1024];
93         va_start(params, format);
94         vsnprintf(buf, sizeof(buf), format, params);
95         va_end(params);
96         die("packfile '%s': bad object at offset %lu: %s",
97             pack_name, offset, buf);
98 }
100 static void *unpack_entry_data(unsigned long offset,
101                                unsigned long *current_pos, unsigned long size)
103         unsigned long pack_limit = pack_size - 20;
104         unsigned long pos = *current_pos;
105         z_stream stream;
106         void *buf = xmalloc(size);
108         memset(&stream, 0, sizeof(stream));
109         stream.next_out = buf;
110         stream.avail_out = size;
111         stream.next_in = pack_base + pos;
112         stream.avail_in = pack_limit - pos;
113         inflateInit(&stream);
115         for (;;) {
116                 int ret = inflate(&stream, 0);
117                 if (ret == Z_STREAM_END)
118                         break;
119                 if (ret != Z_OK)
120                         bad_object(offset, "inflate returned %d", ret);
121         }
122         inflateEnd(&stream);
123         if (stream.total_out != size)
124                 bad_object(offset, "size mismatch (expected %lu, got %lu)",
125                            size, stream.total_out);
126         *current_pos = pack_limit - stream.avail_in;
127         return buf;
130 static void *unpack_raw_entry(unsigned long offset,
131                               enum object_type *obj_type,
132                               unsigned long *obj_size,
133                               unsigned char *delta_base,
134                               unsigned long *next_obj_offset)
136         unsigned long pack_limit = pack_size - 20;
137         unsigned long pos = offset;
138         unsigned char c;
139         unsigned long size;
140         unsigned shift;
141         enum object_type type;
142         void *data;
144         c = pack_base[pos++];
145         type = (c >> 4) & 7;
146         size = (c & 15);
147         shift = 4;
148         while (c & 0x80) {
149                 if (pos >= pack_limit)
150                         bad_object(offset, "object extends past end of pack");
151                 c = pack_base[pos++];
152                 size += (c & 0x7fUL) << shift;
153                 shift += 7;
154         }
156         switch (type) {
157         case OBJ_DELTA:
158                 if (pos + 20 >= pack_limit)
159                         bad_object(offset, "object extends past end of pack");
160                 memcpy(delta_base, pack_base + pos, 20);
161                 pos += 20;
162                 /* fallthru */
163         case OBJ_COMMIT:
164         case OBJ_TREE:
165         case OBJ_BLOB:
166         case OBJ_TAG:
167                 data = unpack_entry_data(offset, &pos, size);
168                 break;
169         default:
170                 bad_object(offset, "bad object type %d", type);
171         }
173         *obj_type = type;
174         *obj_size = size;
175         *next_obj_offset = pos;
176         return data;
179 static int find_delta(const unsigned char *base_sha1)
181         int first = 0, last = nr_deltas;
183         while (first < last) {
184                 int next = (first + last) / 2;
185                 struct delta_entry *delta = &deltas[next];
186                 int cmp;
188                 cmp = memcmp(base_sha1, delta->base_sha1, 20);
189                 if (!cmp)
190                         return next;
191                 if (cmp < 0) {
192                         last = next;
193                         continue;
194                 }
195                 first = next+1;
196         }
197         return -first-1;
200 static int find_deltas_based_on_sha1(const unsigned char *base_sha1,
201                                      int *first_index, int *last_index)
203         int first = find_delta(base_sha1);
204         int last = first;
205         int end = nr_deltas - 1;
207         if (first < 0)
208                 return -1;
209         while (first > 0 && !memcmp(deltas[first-1].base_sha1, base_sha1, 20))
210                 --first;
211         while (last < end && !memcmp(deltas[last+1].base_sha1, base_sha1, 20))
212                 ++last;
213         *first_index = first;
214         *last_index = last;
215         return 0;
218 static void sha1_object(const void *data, unsigned long size,
219                         enum object_type type, unsigned char *sha1)
221         SHA_CTX ctx;
222         char header[50];
223         int header_size;
224         const char *type_str;
226         switch (type) {
227         case OBJ_COMMIT: type_str = "commit"; break;
228         case OBJ_TREE:   type_str = "tree"; break;
229         case OBJ_BLOB:   type_str = "blob"; break;
230         case OBJ_TAG:    type_str = "tag"; break;
231         default:
232                 die("bad type %d", type);
233         }
235         header_size = sprintf(header, "%s %lu", type_str, size) + 1;
237         SHA1_Init(&ctx);
238         SHA1_Update(&ctx, header, header_size);
239         SHA1_Update(&ctx, data, size);
240         SHA1_Final(sha1, &ctx);
243 static void resolve_delta(struct delta_entry *delta, void *base_data,
244                           unsigned long base_size, enum object_type type)
246         struct object_entry *obj = delta->obj;
247         void *delta_data;
248         unsigned long delta_size;
249         void *result;
250         unsigned long result_size;
251         enum object_type delta_type;
252         unsigned char base_sha1[20];
253         unsigned long next_obj_offset;
254         int j, first, last;
256         obj->real_type = type;
257         delta_data = unpack_raw_entry(obj->offset, &delta_type,
258                                       &delta_size, base_sha1,
259                                       &next_obj_offset);
260         result = patch_delta(base_data, base_size, delta_data, delta_size,
261                              &result_size);
262         free(delta_data);
263         if (!result)
264                 bad_object(obj->offset, "failed to apply delta");
265         sha1_object(result, result_size, type, obj->sha1);
266         if (!find_deltas_based_on_sha1(obj->sha1, &first, &last)) {
267                 for (j = first; j <= last; j++)
268                         resolve_delta(&deltas[j], result, result_size, type);
269         }
270         free(result);
273 static int compare_delta_entry(const void *a, const void *b)
275         const struct delta_entry *delta_a = a;
276         const struct delta_entry *delta_b = b;
277         return memcmp(delta_a->base_sha1, delta_b->base_sha1, 20);
280 static void parse_pack_objects(void)
282         int i;
283         unsigned long offset = sizeof(struct pack_header);
284         unsigned char base_sha1[20];
285         void *data;
286         unsigned long data_size;
288         /*
289          * First pass:
290          * - find locations of all objects;
291          * - calculate SHA1 of all non-delta objects;
292          * - remember base SHA1 for all deltas.
293          */
294         for (i = 0; i < nr_objects; i++) {
295                 struct object_entry *obj = &objects[i];
296                 obj->offset = offset;
297                 data = unpack_raw_entry(offset, &obj->type, &data_size,
298                                         base_sha1, &offset);
299                 obj->real_type = obj->type;
300                 if (obj->type == OBJ_DELTA) {
301                         struct delta_entry *delta = &deltas[nr_deltas++];
302                         delta->obj = obj;
303                         memcpy(delta->base_sha1, base_sha1, 20);
304                 } else
305                         sha1_object(data, data_size, obj->type, obj->sha1);
306                 free(data);
307         }
308         if (offset != pack_size - 20)
309                 die("packfile '%s' has junk at the end", pack_name);
311         /* Sort deltas by base SHA1 for fast searching */
312         qsort(deltas, nr_deltas, sizeof(struct delta_entry),
313               compare_delta_entry);
315         /*
316          * Second pass:
317          * - for all non-delta objects, look if it is used as a base for
318          *   deltas;
319          * - if used as a base, uncompress the object and apply all deltas,
320          *   recursively checking if the resulting object is used as a base
321          *   for some more deltas.
322          */
323         for (i = 0; i < nr_objects; i++) {
324                 struct object_entry *obj = &objects[i];
325                 int j, first, last;
327                 if (obj->type == OBJ_DELTA)
328                         continue;
329                 if (find_deltas_based_on_sha1(obj->sha1, &first, &last))
330                         continue;
331                 data = unpack_raw_entry(obj->offset, &obj->type, &data_size,
332                                         base_sha1, &offset);
333                 for (j = first; j <= last; j++)
334                         resolve_delta(&deltas[j], data, data_size, obj->type);
335                 free(data);
336         }
338         /* Check for unresolved deltas */
339         for (i = 0; i < nr_deltas; i++) {
340                 if (deltas[i].obj->real_type == OBJ_DELTA)
341                         die("packfile '%s' has unresolved deltas",  pack_name);
342         }
345 static int sha1_compare(const void *_a, const void *_b)
347         struct object_entry *a = *(struct object_entry **)_a;
348         struct object_entry *b = *(struct object_entry **)_b;
349         return memcmp(a->sha1, b->sha1, 20);
352 static void write_index_file(const char *index_name, unsigned char *sha1)
354         struct sha1file *f;
355         struct object_entry **sorted_by_sha, **list, **last;
356         unsigned int array[256];
357         int i;
358         SHA_CTX ctx;
360         if (nr_objects) {
361                 sorted_by_sha =
362                         xcalloc(nr_objects, sizeof(struct object_entry *));
363                 list = sorted_by_sha;
364                 last = sorted_by_sha + nr_objects;
365                 for (i = 0; i < nr_objects; ++i)
366                         sorted_by_sha[i] = &objects[i];
367                 qsort(sorted_by_sha, nr_objects, sizeof(sorted_by_sha[0]),
368                       sha1_compare);
370         }
371         else
372                 sorted_by_sha = list = last = NULL;
374         unlink(index_name);
375         f = sha1create("%s", index_name);
377         /*
378          * Write the first-level table (the list is sorted,
379          * but we use a 256-entry lookup to be able to avoid
380          * having to do eight extra binary search iterations).
381          */
382         for (i = 0; i < 256; i++) {
383                 struct object_entry **next = list;
384                 while (next < last) {
385                         struct object_entry *obj = *next;
386                         if (obj->sha1[0] != i)
387                                 break;
388                         next++;
389                 }
390                 array[i] = htonl(next - sorted_by_sha);
391                 list = next;
392         }
393         sha1write(f, array, 256 * sizeof(int));
395         /* recompute the SHA1 hash of sorted object names.
396          * currently pack-objects does not do this, but that
397          * can be fixed.
398          */
399         SHA1_Init(&ctx);
400         /*
401          * Write the actual SHA1 entries..
402          */
403         list = sorted_by_sha;
404         for (i = 0; i < nr_objects; i++) {
405                 struct object_entry *obj = *list++;
406                 unsigned int offset = htonl(obj->offset);
407                 sha1write(f, &offset, 4);
408                 sha1write(f, obj->sha1, 20);
409                 SHA1_Update(&ctx, obj->sha1, 20);
410         }
411         sha1write(f, pack_base + pack_size - 20, 20);
412         sha1close(f, NULL, 1);
413         free(sorted_by_sha);
414         SHA1_Final(sha1, &ctx);
417 int main(int argc, char **argv)
419         int i;
420         char *index_name = NULL;
421         char *index_name_buf = NULL;
422         unsigned char sha1[20];
424         for (i = 1; i < argc; i++) {
425                 const char *arg = argv[i];
427                 if (*arg == '-') {
428                         if (!strcmp(arg, "-o")) {
429                                 if (index_name || (i+1) >= argc)
430                                         usage(index_pack_usage);
431                                 index_name = argv[++i];
432                         } else
433                                 usage(index_pack_usage);
434                         continue;
435                 }
437                 if (pack_name)
438                         usage(index_pack_usage);
439                 pack_name = arg;
440         }
442         if (!pack_name)
443                 usage(index_pack_usage);
444         if (!index_name) {
445                 int len = strlen(pack_name);
446                 if (len < 5 || strcmp(pack_name + len - 5, ".pack"))
447                         die("packfile name '%s' does not end with '.pack'",
448                             pack_name);
449                 index_name_buf = xmalloc(len);
450                 memcpy(index_name_buf, pack_name, len - 5);
451                 strcpy(index_name_buf + len - 5, ".idx");
452                 index_name = index_name_buf;
453         }
455         open_pack_file();
456         parse_pack_header();
457         objects = xcalloc(nr_objects, sizeof(struct object_entry));
458         deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
459         parse_pack_objects();
460         free(deltas);
461         write_index_file(index_name, sha1);
462         free(objects);
463         free(index_name_buf);
465         printf("%s\n", sha1_to_hex(sha1));
467         return 0;