Code

Merge branch 'maint'
[git.git] / diff-delta.c
1 /*
2  * diff-delta.c: generate a delta between two buffers
3  *
4  * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5  * http://www.xmailserver.org/xdiff-lib.html
6  *
7  * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8  *
9  * This code is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  */
14 #include "git-compat-util.h"
15 #include "delta.h"
17 /* maximum hash entry list for the same hash bucket */
18 #define HASH_LIMIT 64
20 #define RABIN_SHIFT 23
21 #define RABIN_WINDOW 16
23 static const unsigned int T[256] = {
24         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67 };
69 static const unsigned int U[256] = {
70         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113 };
115 struct index_entry {
116         const unsigned char *ptr;
117         unsigned int val;
118         struct index_entry *next;
119 };
121 struct delta_index {
122         const void *src_buf;
123         unsigned long src_size;
124         unsigned int hash_mask;
125         struct index_entry *hash[FLEX_ARRAY];
126 };
128 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
130         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
131         const unsigned char *data, *buffer = buf;
132         struct delta_index *index;
133         struct index_entry *entry, **hash;
134         void *mem;
135         unsigned long memsize;
137         if (!buf || !bufsize)
138                 return NULL;
140         /* Determine index hash size.  Note that indexing skips the
141            first byte to allow for optimizing the Rabin's polynomial
142            initialization in create_delta(). */
143         entries = (bufsize - 1)  / RABIN_WINDOW;
144         hsize = entries / 4;
145         for (i = 4; (1u << i) < hsize && i < 31; i++);
146         hsize = 1 << i;
147         hmask = hsize - 1;
149         /* allocate lookup index */
150         memsize = sizeof(*index) +
151                   sizeof(*hash) * hsize +
152                   sizeof(*entry) * entries;
153         mem = malloc(memsize);
154         if (!mem)
155                 return NULL;
156         index = mem;
157         mem = index + 1;
158         hash = mem;
159         mem = hash + hsize;
160         entry = mem;
162         index->src_buf = buf;
163         index->src_size = bufsize;
164         index->hash_mask = hmask;
165         memset(hash, 0, hsize * sizeof(*hash));
167         /* allocate an array to count hash entries */
168         hash_count = calloc(hsize, sizeof(*hash_count));
169         if (!hash_count) {
170                 free(index);
171                 return NULL;
172         }
174         /* then populate the index */
175         prev_val = ~0;
176         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
177              data >= buffer;
178              data -= RABIN_WINDOW) {
179                 unsigned int val = 0;
180                 for (i = 1; i <= RABIN_WINDOW; i++)
181                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
182                 if (val == prev_val) {
183                         /* keep the lowest of consecutive identical blocks */
184                         entry[-1].ptr = data + RABIN_WINDOW;
185                 } else {
186                         prev_val = val;
187                         i = val & hmask;
188                         entry->ptr = data + RABIN_WINDOW;
189                         entry->val = val;
190                         entry->next = hash[i];
191                         hash[i] = entry++;
192                         hash_count[i]++;
193                 }
194         }
196         /*
197          * Determine a limit on the number of entries in the same hash
198          * bucket.  This guards us against pathological data sets causing
199          * really bad hash distribution with most entries in the same hash
200          * bucket that would bring us to O(m*n) computing costs (m and n
201          * corresponding to reference and target buffer sizes).
202          *
203          * Make sure none of the hash buckets has more entries than
204          * we're willing to test.  Otherwise we cull the entry list
205          * uniformly to still preserve a good repartition across
206          * the reference buffer.
207          */
208         for (i = 0; i < hsize; i++) {
209                 if (hash_count[i] < HASH_LIMIT)
210                         continue;
211                 entry = hash[i];
212                 do {
213                         struct index_entry *keep = entry;
214                         int skip = hash_count[i] / HASH_LIMIT / 2;
215                         do {
216                                 entry = entry->next;
217                         } while(--skip && entry);
218                         keep->next = entry;
219                 } while(entry);
220         }
221         free(hash_count);
223         return index;
226 void free_delta_index(struct delta_index *index)
228         free(index);
231 /*
232  * The maximum size for any opcode sequence, including the initial header
233  * plus Rabin window plus biggest copy.
234  */
235 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
237 void *
238 create_delta(const struct delta_index *index,
239              const void *trg_buf, unsigned long trg_size,
240              unsigned long *delta_size, unsigned long max_size)
242         unsigned int i, outpos, outsize, moff, msize, val;
243         int inscnt;
244         const unsigned char *ref_data, *ref_top, *data, *top;
245         unsigned char *out;
247         if (!trg_buf || !trg_size)
248                 return NULL;
250         outpos = 0;
251         outsize = 8192;
252         if (max_size && outsize >= max_size)
253                 outsize = max_size + MAX_OP_SIZE + 1;
254         out = malloc(outsize);
255         if (!out)
256                 return NULL;
258         /* store reference buffer size */
259         i = index->src_size;
260         while (i >= 0x80) {
261                 out[outpos++] = i | 0x80;
262                 i >>= 7;
263         }
264         out[outpos++] = i;
266         /* store target buffer size */
267         i = trg_size;
268         while (i >= 0x80) {
269                 out[outpos++] = i | 0x80;
270                 i >>= 7;
271         }
272         out[outpos++] = i;
274         ref_data = index->src_buf;
275         ref_top = ref_data + index->src_size;
276         data = trg_buf;
277         top = (const unsigned char *) trg_buf + trg_size;
279         outpos++;
280         val = 0;
281         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
282                 out[outpos++] = *data;
283                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
284         }
285         inscnt = i;
287         moff = 0;
288         msize = 0;
289         while (data < top) {
290                 if (msize < 4096) {
291                         struct index_entry *entry;
292                         val ^= U[data[-RABIN_WINDOW]];
293                         val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
294                         i = val & index->hash_mask;
295                         for (entry = index->hash[i]; entry; entry = entry->next) {
296                                 const unsigned char *ref = entry->ptr;
297                                 const unsigned char *src = data;
298                                 unsigned int ref_size = ref_top - ref;
299                                 if (entry->val != val)
300                                         continue;
301                                 if (ref_size > top - src)
302                                         ref_size = top - src;
303                                 if (ref_size <= msize)
304                                         break;
305                                 while (ref_size-- && *src++ == *ref)
306                                         ref++;
307                                 if (msize < ref - entry->ptr) {
308                                         /* this is our best match so far */
309                                         msize = ref - entry->ptr;
310                                         moff = entry->ptr - ref_data;
311                                         if (msize >= 4096) /* good enough */
312                                                 break;
313                                 }
314                         }
315                 }
317                 if (msize < 4) {
318                         if (!inscnt)
319                                 outpos++;
320                         out[outpos++] = *data++;
321                         inscnt++;
322                         if (inscnt == 0x7f) {
323                                 out[outpos - inscnt - 1] = inscnt;
324                                 inscnt = 0;
325                         }
326                         msize = 0;
327                 } else {
328                         unsigned int left;
329                         unsigned char *op;
331                         if (inscnt) {
332                                 while (moff && ref_data[moff-1] == data[-1]) {
333                                         /* we can match one byte back */
334                                         msize++;
335                                         moff--;
336                                         data--;
337                                         outpos--;
338                                         if (--inscnt)
339                                                 continue;
340                                         outpos--;  /* remove count slot */
341                                         inscnt--;  /* make it -1 */
342                                         break;
343                                 }
344                                 out[outpos - inscnt - 1] = inscnt;
345                                 inscnt = 0;
346                         }
348                         /* A copy op is currently limited to 64KB (pack v2) */
349                         left = (msize < 0x10000) ? 0 : (msize - 0x10000);
350                         msize -= left;
352                         op = out + outpos++;
353                         i = 0x80;
355                         if (moff & 0x000000ff)
356                                 out[outpos++] = moff >> 0,  i |= 0x01;
357                         if (moff & 0x0000ff00)
358                                 out[outpos++] = moff >> 8,  i |= 0x02;
359                         if (moff & 0x00ff0000)
360                                 out[outpos++] = moff >> 16, i |= 0x04;
361                         if (moff & 0xff000000)
362                                 out[outpos++] = moff >> 24, i |= 0x08;
364                         if (msize & 0x00ff)
365                                 out[outpos++] = msize >> 0, i |= 0x10;
366                         if (msize & 0xff00)
367                                 out[outpos++] = msize >> 8, i |= 0x20;
369                         *op = i;
371                         data += msize;
372                         moff += msize;
373                         msize = left;
375                         if (msize < 4096) {
376                                 int j;
377                                 val = 0;
378                                 for (j = -RABIN_WINDOW; j < 0; j++)
379                                         val = ((val << 8) | data[j])
380                                               ^ T[val >> RABIN_SHIFT];
381                         }
382                 }
384                 if (outpos >= outsize - MAX_OP_SIZE) {
385                         void *tmp = out;
386                         outsize = outsize * 3 / 2;
387                         if (max_size && outsize >= max_size)
388                                 outsize = max_size + MAX_OP_SIZE + 1;
389                         if (max_size && outpos > max_size)
390                                 break;
391                         out = realloc(out, outsize);
392                         if (!out) {
393                                 free(tmp);
394                                 return NULL;
395                         }
396                 }
397         }
399         if (inscnt)
400                 out[outpos - inscnt - 1] = inscnt;
402         if (max_size && outpos > max_size) {
403                 free(out);
404                 return NULL;
405         }
407         *delta_size = outpos;
408         return out;