Code

7f8a60de18584fa18dba71a454a6f2300674f47f
[git.git] / diff-delta.c
1 /*
2  * diff-delta.c: generate a delta between two buffers
3  *
4  * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5  * http://www.xmailserver.org/xdiff-lib.html
6  *
7  * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8  *
9  * This code is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  */
14 #include "git-compat-util.h"
15 #include "delta.h"
17 /* maximum hash entry list for the same hash bucket */
18 #define HASH_LIMIT 64
20 #define RABIN_SHIFT 23
21 #define RABIN_WINDOW 16
23 static const unsigned int T[256] = {
24         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67 };
69 static const unsigned int U[256] = {
70         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113 };
115 struct index_entry {
116         const unsigned char *ptr;
117         unsigned int val;
118 };
120 struct unpacked_index_entry {
121         struct index_entry entry;
122         struct unpacked_index_entry *next;
123 };
125 struct delta_index {
126         unsigned long memsize;
127         const void *src_buf;
128         unsigned long src_size;
129         unsigned int hash_mask;
130         struct index_entry *hash[FLEX_ARRAY];
131 };
133 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
135         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
136         const unsigned char *data, *buffer = buf;
137         struct delta_index *index;
138         struct unpacked_index_entry *entry, **hash;
139         struct index_entry *packed_entry, **packed_hash;
140         void *mem;
141         unsigned long memsize;
143         if (!buf || !bufsize)
144                 return NULL;
146         /* Determine index hash size.  Note that indexing skips the
147            first byte to allow for optimizing the Rabin's polynomial
148            initialization in create_delta(). */
149         entries = (bufsize - 1)  / RABIN_WINDOW;
150         hsize = entries / 4;
151         for (i = 4; (1u << i) < hsize && i < 31; i++);
152         hsize = 1 << i;
153         hmask = hsize - 1;
155         /* allocate lookup index */
156         memsize = sizeof(*hash) * hsize +
157                   sizeof(*entry) * entries;
158         mem = malloc(memsize);
159         if (!mem)
160                 return NULL;
161         hash = mem;
162         mem = hash + hsize;
163         entry = mem;
165         memset(hash, 0, hsize * sizeof(*hash));
167         /* allocate an array to count hash entries */
168         hash_count = calloc(hsize, sizeof(*hash_count));
169         if (!hash_count) {
170                 free(hash);
171                 return NULL;
172         }
174         /* then populate the index */
175         prev_val = ~0;
176         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
177              data >= buffer;
178              data -= RABIN_WINDOW) {
179                 unsigned int val = 0;
180                 for (i = 1; i <= RABIN_WINDOW; i++)
181                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
182                 if (val == prev_val) {
183                         /* keep the lowest of consecutive identical blocks */
184                         entry[-1].entry.ptr = data + RABIN_WINDOW;
185                         --entries;
186                 } else {
187                         prev_val = val;
188                         i = val & hmask;
189                         entry->entry.ptr = data + RABIN_WINDOW;
190                         entry->entry.val = val;
191                         entry->next = hash[i];
192                         hash[i] = entry++;
193                         hash_count[i]++;
194                 }
195         }
197         /*
198          * Determine a limit on the number of entries in the same hash
199          * bucket.  This guards us against pathological data sets causing
200          * really bad hash distribution with most entries in the same hash
201          * bucket that would bring us to O(m*n) computing costs (m and n
202          * corresponding to reference and target buffer sizes).
203          *
204          * Make sure none of the hash buckets has more entries than
205          * we're willing to test.  Otherwise we cull the entry list
206          * uniformly to still preserve a good repartition across
207          * the reference buffer.
208          */
209         for (i = 0; i < hsize; i++) {
210                 if (hash_count[i] < HASH_LIMIT)
211                         continue;
212                 entry = hash[i];
213                 do {
214                         struct unpacked_index_entry *keep = entry;
215                         int skip = hash_count[i] / HASH_LIMIT;
216                         do {
217                                 --entries;
218                                 entry = entry->next;
219                         } while(--skip && entry);
220                         ++entries;
221                         keep->next = entry;
222                 } while(entry);
223         }
224         free(hash_count);
226         /* Now create the packed index in array form rather than
227          * linked lists */
229         memsize = sizeof(*index)
230                 + sizeof(*packed_hash) * (hsize+1)
231                 + sizeof(*packed_entry) * entries;
233         mem = malloc(memsize);
235         if (!mem) {
236                 free(hash);
237                 return NULL;
238         }
240         index = mem;
241         index->memsize = memsize;
242         index->src_buf = buf;
243         index->src_size = bufsize;
244         index->hash_mask = hmask;
246         mem = index + 1;
247         packed_hash = mem;
248         mem = packed_hash + (hsize+1);
249         packed_entry = mem;
251         /* Coalesce all entries belonging to one linked list into
252          * consecutive array entries */
254         for (i = 0; i < hsize; i++) {
255                 packed_hash[i] = packed_entry;
256                 for (entry = hash[i]; entry; entry = entry->next)
257                         *packed_entry++ = entry->entry;
258         }
260         /* Sentinel value to indicate the length of the last hash
261          * bucket */
263         packed_hash[hsize] = packed_entry;
264         assert(packed_entry - (struct index_entry *)mem == entries);
265         free(hash);
267         return index;
270 void free_delta_index(struct delta_index *index)
272         free(index);
275 unsigned long sizeof_delta_index(struct delta_index *index)
277         if (index)
278                 return index->memsize;
279         else
280                 return 0;
283 /*
284  * The maximum size for any opcode sequence, including the initial header
285  * plus Rabin window plus biggest copy.
286  */
287 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
289 void *
290 create_delta(const struct delta_index *index,
291              const void *trg_buf, unsigned long trg_size,
292              unsigned long *delta_size, unsigned long max_size)
294         unsigned int i, outpos, outsize, moff, msize, val;
295         int inscnt;
296         const unsigned char *ref_data, *ref_top, *data, *top;
297         unsigned char *out;
299         if (!trg_buf || !trg_size)
300                 return NULL;
302         outpos = 0;
303         outsize = 8192;
304         if (max_size && outsize >= max_size)
305                 outsize = max_size + MAX_OP_SIZE + 1;
306         out = malloc(outsize);
307         if (!out)
308                 return NULL;
310         /* store reference buffer size */
311         i = index->src_size;
312         while (i >= 0x80) {
313                 out[outpos++] = i | 0x80;
314                 i >>= 7;
315         }
316         out[outpos++] = i;
318         /* store target buffer size */
319         i = trg_size;
320         while (i >= 0x80) {
321                 out[outpos++] = i | 0x80;
322                 i >>= 7;
323         }
324         out[outpos++] = i;
326         ref_data = index->src_buf;
327         ref_top = ref_data + index->src_size;
328         data = trg_buf;
329         top = (const unsigned char *) trg_buf + trg_size;
331         outpos++;
332         val = 0;
333         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
334                 out[outpos++] = *data;
335                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
336         }
337         inscnt = i;
339         moff = 0;
340         msize = 0;
341         while (data < top) {
342                 if (msize < 4096) {
343                         struct index_entry *entry;
344                         val ^= U[data[-RABIN_WINDOW]];
345                         val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
346                         i = val & index->hash_mask;
347                         for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
348                                 const unsigned char *ref = entry->ptr;
349                                 const unsigned char *src = data;
350                                 unsigned int ref_size = ref_top - ref;
351                                 if (entry->val != val)
352                                         continue;
353                                 if (ref_size > top - src)
354                                         ref_size = top - src;
355                                 if (ref_size <= msize)
356                                         break;
357                                 while (ref_size-- && *src++ == *ref)
358                                         ref++;
359                                 if (msize < ref - entry->ptr) {
360                                         /* this is our best match so far */
361                                         msize = ref - entry->ptr;
362                                         moff = entry->ptr - ref_data;
363                                         if (msize >= 4096) /* good enough */
364                                                 break;
365                                 }
366                         }
367                 }
369                 if (msize < 4) {
370                         if (!inscnt)
371                                 outpos++;
372                         out[outpos++] = *data++;
373                         inscnt++;
374                         if (inscnt == 0x7f) {
375                                 out[outpos - inscnt - 1] = inscnt;
376                                 inscnt = 0;
377                         }
378                         msize = 0;
379                 } else {
380                         unsigned int left;
381                         unsigned char *op;
383                         if (inscnt) {
384                                 while (moff && ref_data[moff-1] == data[-1]) {
385                                         /* we can match one byte back */
386                                         msize++;
387                                         moff--;
388                                         data--;
389                                         outpos--;
390                                         if (--inscnt)
391                                                 continue;
392                                         outpos--;  /* remove count slot */
393                                         inscnt--;  /* make it -1 */
394                                         break;
395                                 }
396                                 out[outpos - inscnt - 1] = inscnt;
397                                 inscnt = 0;
398                         }
400                         /* A copy op is currently limited to 64KB (pack v2) */
401                         left = (msize < 0x10000) ? 0 : (msize - 0x10000);
402                         msize -= left;
404                         op = out + outpos++;
405                         i = 0x80;
407                         if (moff & 0x000000ff)
408                                 out[outpos++] = moff >> 0,  i |= 0x01;
409                         if (moff & 0x0000ff00)
410                                 out[outpos++] = moff >> 8,  i |= 0x02;
411                         if (moff & 0x00ff0000)
412                                 out[outpos++] = moff >> 16, i |= 0x04;
413                         if (moff & 0xff000000)
414                                 out[outpos++] = moff >> 24, i |= 0x08;
416                         if (msize & 0x00ff)
417                                 out[outpos++] = msize >> 0, i |= 0x10;
418                         if (msize & 0xff00)
419                                 out[outpos++] = msize >> 8, i |= 0x20;
421                         *op = i;
423                         data += msize;
424                         moff += msize;
425                         msize = left;
427                         if (msize < 4096) {
428                                 int j;
429                                 val = 0;
430                                 for (j = -RABIN_WINDOW; j < 0; j++)
431                                         val = ((val << 8) | data[j])
432                                               ^ T[val >> RABIN_SHIFT];
433                         }
434                 }
436                 if (outpos >= outsize - MAX_OP_SIZE) {
437                         void *tmp = out;
438                         outsize = outsize * 3 / 2;
439                         if (max_size && outsize >= max_size)
440                                 outsize = max_size + MAX_OP_SIZE + 1;
441                         if (max_size && outpos > max_size)
442                                 break;
443                         out = realloc(out, outsize);
444                         if (!out) {
445                                 free(tmp);
446                                 return NULL;
447                         }
448                 }
449         }
451         if (inscnt)
452                 out[outpos - inscnt - 1] = inscnt;
454         if (max_size && outpos > max_size) {
455                 free(out);
456                 return NULL;
457         }
459         *delta_size = outpos;
460         return out;