Code

3a737da68c8cadb900ffe5b44973d9193fbca83f
[git.git] / diff-delta.c
1 /*
2  * diff-delta.c: generate a delta between two buffers
3  *
4  *  Many parts of this file have been lifted from LibXDiff version 0.10.
5  *  http://www.xmailserver.org/xdiff-lib.html
6  *
7  *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8  *  Copyright (C) 2003  Davide Libenzi
9  *
10  *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
11  *
12  *  This file is free software; you can redistribute it and/or
13  *  modify it under the terms of the GNU Lesser General Public
14  *  License as published by the Free Software Foundation; either
15  *  version 2.1 of the License, or (at your option) any later version.
16  *
17  *  Use of this within git automatically means that the LGPL
18  *  licensing gets turned into GPLv2 within this project.
19  */
21 #include "git-compat-util.h"
22 #include "delta.h"
24 /* maximum hash entry list for the same hash bucket */
25 #define HASH_LIMIT 64
27 #define RABIN_SHIFT 23
28 #define RABIN_WINDOW 16
30 static const unsigned int T[256] = {
31         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
32         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
33         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
34         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
35         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
36         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
37         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
38         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
39         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
40         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
41         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
42         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
43         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
44         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
45         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
46         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
47         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
48         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
49         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
50         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
51         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
52         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
53         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
54         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
55         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
56         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
57         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
58         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
59         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
60         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
61         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
62         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
63         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
64         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
65         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
66         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
67         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
68         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
69         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
70         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
71         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
72         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
73         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
74 };
76 static const unsigned int U[256] = {
77         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
78         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
79         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
80         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
81         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
82         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
83         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
84         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
85         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
86         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
87         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
88         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
89         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
90         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
91         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
92         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
93         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
94         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
95         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
96         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
97         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
98         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
99         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
100         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
101         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
102         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
103         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
104         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
105         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
106         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
107         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
108         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
109         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
110         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
111         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
112         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
113         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
114         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
115         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
116         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
117         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
118         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
119         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
120 };
122 struct index_entry {
123         const unsigned char *ptr;
124         unsigned int val;
125         struct index_entry *next;
126 };
128 struct delta_index {
129         const void *src_buf;
130         unsigned long src_size;
131         unsigned int hash_mask;
132         struct index_entry *hash[FLEX_ARRAY];
133 };
135 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
137         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
138         const unsigned char *data, *buffer = buf;
139         struct delta_index *index;
140         struct index_entry *entry, **hash;
141         void *mem;
142         unsigned long memsize;
144         if (!buf || !bufsize)
145                 return NULL;
147         /* Determine index hash size.  Note that indexing skips the
148            first byte to allow for optimizing the Rabin's polynomial
149            initialization in create_delta(). */
150         entries = (bufsize - 1)  / RABIN_WINDOW;
151         hsize = entries / 4;
152         for (i = 4; (1u << i) < hsize && i < 31; i++);
153         hsize = 1 << i;
154         hmask = hsize - 1;
156         /* allocate lookup index */
157         memsize = sizeof(*index) +
158                   sizeof(*hash) * hsize +
159                   sizeof(*entry) * entries;
160         mem = malloc(memsize);
161         if (!mem)
162                 return NULL;
163         index = mem;
164         mem = index + 1;
165         hash = mem;
166         mem = hash + hsize;
167         entry = mem;
169         index->src_buf = buf;
170         index->src_size = bufsize;
171         index->hash_mask = hmask;
172         memset(hash, 0, hsize * sizeof(*hash));
174         /* allocate an array to count hash entries */
175         hash_count = calloc(hsize, sizeof(*hash_count));
176         if (!hash_count) {
177                 free(index);
178                 return NULL;
179         }
181         /* then populate the index */
182         prev_val = ~0;
183         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
184              data >= buffer;
185              data -= RABIN_WINDOW) {
186                 unsigned int val = 0;
187                 for (i = 1; i <= RABIN_WINDOW; i++)
188                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
189                 if (val == prev_val) {
190                         /* keep the lowest of consecutive identical blocks */
191                         entry[-1].ptr = data + RABIN_WINDOW;
192                 } else {
193                         prev_val = val;
194                         i = val & hmask;
195                         entry->ptr = data + RABIN_WINDOW;
196                         entry->val = val;
197                         entry->next = hash[i];
198                         hash[i] = entry++;
199                         hash_count[i]++;
200                 }
201         }
203         /*
204          * Determine a limit on the number of entries in the same hash
205          * bucket.  This guards us against pathological data sets causing
206          * really bad hash distribution with most entries in the same hash
207          * bucket that would bring us to O(m*n) computing costs (m and n
208          * corresponding to reference and target buffer sizes).
209          *
210          * Make sure none of the hash buckets has more entries than
211          * we're willing to test.  Otherwise we cull the entry list
212          * uniformly to still preserve a good repartition across
213          * the reference buffer.
214          */
215         for (i = 0; i < hsize; i++) {
216                 if (hash_count[i] < HASH_LIMIT)
217                         continue;
218                 entry = hash[i];
219                 do {
220                         struct index_entry *keep = entry;
221                         int skip = hash_count[i] / HASH_LIMIT / 2;
222                         do {
223                                 entry = entry->next;
224                         } while(--skip && entry);
225                         keep->next = entry;
226                 } while(entry);
227         }
228         free(hash_count);
230         return index;
233 void free_delta_index(struct delta_index *index)
235         free(index);
238 /*
239  * The maximum size for any opcode sequence, including the initial header
240  * plus Rabin window plus biggest copy.
241  */
242 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
244 void *
245 create_delta(const struct delta_index *index,
246              const void *trg_buf, unsigned long trg_size,
247              unsigned long *delta_size, unsigned long max_size)
249         unsigned int i, outpos, outsize, moff, msize, val;
250         int inscnt;
251         const unsigned char *ref_data, *ref_top, *data, *top;
252         unsigned char *out;
254         if (!trg_buf || !trg_size)
255                 return NULL;
257         outpos = 0;
258         outsize = 8192;
259         if (max_size && outsize >= max_size)
260                 outsize = max_size + MAX_OP_SIZE + 1;
261         out = malloc(outsize);
262         if (!out)
263                 return NULL;
265         /* store reference buffer size */
266         i = index->src_size;
267         while (i >= 0x80) {
268                 out[outpos++] = i | 0x80;
269                 i >>= 7;
270         }
271         out[outpos++] = i;
273         /* store target buffer size */
274         i = trg_size;
275         while (i >= 0x80) {
276                 out[outpos++] = i | 0x80;
277                 i >>= 7;
278         }
279         out[outpos++] = i;
281         ref_data = index->src_buf;
282         ref_top = ref_data + index->src_size;
283         data = trg_buf;
284         top = (const unsigned char *) trg_buf + trg_size;
286         outpos++;
287         val = 0;
288         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
289                 out[outpos++] = *data;
290                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
291         }
292         inscnt = i;
294         moff = 0;
295         msize = 0;
296         while (data < top) {
297                 if (msize < 4096) {
298                         struct index_entry *entry;
299                         val ^= U[data[-RABIN_WINDOW]];
300                         val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
301                         i = val & index->hash_mask;
302                         for (entry = index->hash[i]; entry; entry = entry->next) {
303                                 const unsigned char *ref = entry->ptr;
304                                 const unsigned char *src = data;
305                                 unsigned int ref_size = ref_top - ref;
306                                 if (entry->val != val)
307                                         continue;
308                                 if (ref_size > top - src)
309                                         ref_size = top - src;
310                                 if (ref_size <= msize)
311                                         break;
312                                 while (ref_size-- && *src++ == *ref)
313                                         ref++;
314                                 if (msize < ref - entry->ptr) {
315                                         /* this is our best match so far */
316                                         msize = ref - entry->ptr;
317                                         moff = entry->ptr - ref_data;
318                                         if (msize >= 4096) /* good enough */
319                                                 break;
320                                 }
321                         }
322                 }
324                 if (msize < 4) {
325                         if (!inscnt)
326                                 outpos++;
327                         out[outpos++] = *data++;
328                         inscnt++;
329                         if (inscnt == 0x7f) {
330                                 out[outpos - inscnt - 1] = inscnt;
331                                 inscnt = 0;
332                         }
333                         msize = 0;
334                 } else {
335                         unsigned int left;
336                         unsigned char *op;
338                         if (inscnt) {
339                                 while (moff && ref_data[moff-1] == data[-1]) {
340                                         /* we can match one byte back */
341                                         msize++;
342                                         moff--;
343                                         data--;
344                                         outpos--;
345                                         if (--inscnt)
346                                                 continue;
347                                         outpos--;  /* remove count slot */
348                                         inscnt--;  /* make it -1 */
349                                         break;
350                                 }
351                                 out[outpos - inscnt - 1] = inscnt;
352                                 inscnt = 0;
353                         }
355                         /* A copy op is currently limited to 64KB (pack v2) */
356                         left = (msize < 0x10000) ? 0 : (msize - 0x10000);
357                         msize -= left;
359                         op = out + outpos++;
360                         i = 0x80;
362                         if (moff & 0x000000ff)
363                                 out[outpos++] = moff >> 0,  i |= 0x01;
364                         if (moff & 0x0000ff00)
365                                 out[outpos++] = moff >> 8,  i |= 0x02;
366                         if (moff & 0x00ff0000)
367                                 out[outpos++] = moff >> 16, i |= 0x04;
368                         if (moff & 0xff000000)
369                                 out[outpos++] = moff >> 24, i |= 0x08;
371                         if (msize & 0x00ff)
372                                 out[outpos++] = msize >> 0, i |= 0x10;
373                         if (msize & 0xff00)
374                                 out[outpos++] = msize >> 8, i |= 0x20;
376                         *op = i;
378                         data += msize;
379                         moff += msize;
380                         msize = left;
382                         if (msize < 4096) {
383                                 int j;
384                                 val = 0;
385                                 for (j = -RABIN_WINDOW; j < 0; j++)
386                                         val = ((val << 8) | data[j])
387                                               ^ T[val >> RABIN_SHIFT];
388                         }
389                 }
391                 if (outpos >= outsize - MAX_OP_SIZE) {
392                         void *tmp = out;
393                         outsize = outsize * 3 / 2;
394                         if (max_size && outsize >= max_size)
395                                 outsize = max_size + MAX_OP_SIZE + 1;
396                         if (max_size && outpos > max_size)
397                                 break;
398                         out = xrealloc(out, outsize);
399                         if (!out) {
400                                 free(tmp);
401                                 return NULL;
402                         }
403                 }
404         }
406         if (inscnt)
407                 out[outpos - inscnt - 1] = inscnt;
409         if (max_size && outpos > max_size) {
410                 free(out);
411                 return NULL;
412         }
414         *delta_size = outpos;
415         return out;