summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 6145ee8)
raw | patch | inline | side by side (parent: 6145ee8)
author | Junio C Hamano <junkio@cox.net> | |
Fri, 27 May 2005 22:56:38 +0000 (15:56 -0700) | ||
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | |
Sun, 29 May 2005 18:17:44 +0000 (11:17 -0700) |
This attempts to optimize "diff-tree -[CM] --stdin", which
compares successible tree pairs. This optimization does not
make much sense for other commands in the diff-* brothers.
When reading from --stdin and using rename/copy detection, the
patch makes diff-tree to read the current index file first.
This is done to reuse the optimization used by diff-cache in the
non-cached case. Similarity estimator can avoid expanding a
blob if the index says what is in the work tree has an exact
copy of that blob already expanded.
Another optimization the patch makes is to check only file sizes
first to terminate similarity estimation early. In order for
this to work, it needs a way to tell the size of the blob
without expanding it. Since an obvious way of doing it, which
is to keep all the blobs previously used in the memory, is too
costly, it does so by keeping the filesize for each object it
has already seen in memory.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
compares successible tree pairs. This optimization does not
make much sense for other commands in the diff-* brothers.
When reading from --stdin and using rename/copy detection, the
patch makes diff-tree to read the current index file first.
This is done to reuse the optimization used by diff-cache in the
non-cached case. Similarity estimator can avoid expanding a
blob if the index says what is in the work tree has an exact
copy of that blob already expanded.
Another optimization the patch makes is to check only file sizes
first to terminate similarity estimation early. In order for
this to work, it needs a way to tell the size of the blob
without expanding it. Since an obvious way of doing it, which
is to keep all the blobs previously used in the memory, is too
costly, it does so by keeping the filesize for each object it
has already seen in memory.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff-tree.c | patch | blob | history | |
diff.c | patch | blob | history | |
diff.h | patch | blob | history | |
diffcore-pickaxe.c | patch | blob | history | |
diffcore-rename.c | patch | blob | history | |
diffcore.h | patch | blob | history |
diff --git a/diff-tree.c b/diff-tree.c
index c66c78708e81617c3bce19750e07bff9d692e5c9..8bdb1dba59f9b36a9660273eb17577c1fc76cdbe 100644 (file)
--- a/diff-tree.c
+++ b/diff-tree.c
if (!read_stdin)
return 0;
+ if (detect_rename)
+ diff_setup_opt |= (DIFF_SETUP_USE_SIZE_CACHE |
+ DIFF_SETUP_USE_CACHE);
while (fgets(line, sizeof(line), stdin))
diff_tree_stdin(line);
index ebec71aeee3d318b81c67d4a00687cd363145cf7..357c4efce1c76a42a56421ce90f35bc1555349dd 100644 (file)
--- a/diff.c
+++ b/diff.c
static unsigned char null_sha1[20] = { 0, };
static int reverse_diff;
+static int use_size_cache;
static const char *external_diff(void)
{
return 1;
}
+static struct sha1_size_cache {
+ unsigned char sha1[20];
+ unsigned long size;
+} **sha1_size_cache;
+static int sha1_size_cache_nr, sha1_size_cache_alloc;
+
+static struct sha1_size_cache *locate_size_cache(unsigned char *sha1,
+ unsigned long size)
+{
+ int first, last;
+ struct sha1_size_cache *e;
+
+ first = 0;
+ last = sha1_size_cache_nr;
+ while (last > first) {
+ int next = (last + first) >> 1;
+ e = sha1_size_cache[next];
+ int cmp = memcmp(e->sha1, sha1, 20);
+ if (!cmp)
+ return e;
+ if (cmp < 0) {
+ last = next;
+ continue;
+ }
+ first = next+1;
+ }
+ /* not found */
+ if (size == UINT_MAX)
+ return NULL;
+ /* insert to make it at "first" */
+ if (sha1_size_cache_alloc <= sha1_size_cache_nr) {
+ sha1_size_cache_alloc = alloc_nr(sha1_size_cache_alloc);
+ sha1_size_cache = xrealloc(sha1_size_cache,
+ sha1_size_cache_alloc *
+ sizeof(*sha1_size_cache));
+ }
+ sha1_size_cache_nr++;
+ if (first < sha1_size_cache_nr)
+ memmove(sha1_size_cache + first + 1, sha1_size_cache + first,
+ (sha1_size_cache_nr - first - 1) *
+ sizeof(*sha1_size_cache));
+ e = xmalloc(sizeof(struct sha1_size_cache));
+ sha1_size_cache[first] = e;
+ memcpy(e->sha1, sha1, 20);
+ e->size = size;
+ return e;
+}
+
/*
* While doing rename detection and pickaxe operation, we may need to
* grab the data for the blob (or file) for our own in-core comparison.
* diff_filespec has data and size fields for this purpose.
*/
-int diff_populate_filespec(struct diff_filespec *s)
+int diff_populate_filespec(struct diff_filespec *s, int size_only)
{
int err = 0;
if (!DIFF_FILE_VALID(s))
if (S_ISDIR(s->mode))
return -1;
+ if (!use_size_cache)
+ size_only = 0;
+
if (s->data)
return err;
if (!s->sha1_valid ||
s->size = st.st_size;
if (!s->size)
goto empty;
+ if (size_only)
+ return 0;
if (S_ISLNK(st.st_mode)) {
int ret;
s->data = xmalloc(s->size);
close(fd);
}
else {
+ /* We cannot do size only for SHA1 blobs */
char type[20];
+ struct sha1_size_cache *e;
+
+ if (size_only) {
+ e = locate_size_cache(s->sha1, UINT_MAX);
+ if (e) {
+ s->size = e->size;
+ return 0;
+ }
+ }
s->data = read_sha1_file(s->sha1, type, &s->size);
s->should_free = 1;
+ if (s->data && size_only)
+ locate_size_cache(s->sha1, s->size);
}
return 0;
}
return;
}
else {
- if (diff_populate_filespec(one))
+ if (diff_populate_filespec(one, 0))
die("cannot read data blob for %s", one->path);
prep_temp_blob(temp, one->data, one->size,
one->sha1, one->mode);
{
if (flags & DIFF_SETUP_REVERSE)
reverse_diff = 1;
+ if (flags & DIFF_SETUP_USE_CACHE) {
+ if (!active_cache)
+ /* read-cache does not die even when it fails
+ * so it is safe for us to do this here. Also
+ * it does not smudge active_cache or active_nr
+ * when it fails, so we do not have to worry about
+ * cleaning it up oufselves either.
+ */
+ read_cache();
+ }
+ if (flags & DIFF_SETUP_USE_SIZE_CACHE)
+ use_size_cache = 1;
+
}
struct diff_queue_struct diff_queued_diff;
index 40a6757dc874ebd1b81b0bf22b561f38d71ad70e..a07ee9f36751eac537d789d36fd4e3ce5826f757 100644 (file)
--- a/diff.h
+++ b/diff.h
extern int diff_scoreopt_parse(const char *opt);
#define DIFF_SETUP_REVERSE 1
+#define DIFF_SETUP_USE_CACHE 2
+#define DIFF_SETUP_USE_SIZE_CACHE 4
extern void diff_setup(int flags);
#define DIFF_DETECT_RENAME 1
diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index 9cf3a5083852bff2e2bf09335e792b40af8e3d73..ef9c5c1d217ff2efc1ba9e43052601e2fc18ac22 100644 (file)
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
{
unsigned long offset, sz;
const char *data;
- if (diff_populate_filespec(one))
+ if (diff_populate_filespec(one, 0))
return 0;
sz = one->size;
data = one->data;
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 6389dedbf9c45a7c9fd976b749175557c401cbcd..035d4ebb851e499537e8e8832bbf0bbbaab11b04 100644 (file)
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
if (src->sha1_valid && dst->sha1_valid &&
!memcmp(src->sha1, dst->sha1, 20))
return 1;
- if (diff_populate_filespec(src) || diff_populate_filespec(dst))
- /* this is an error but will be caught downstream */
+ if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
+ return 0;
+ if (src->size != dst->size)
+ return 0;
+ if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
return 0;
if (src->size == dst->size &&
!memcmp(src->data, dst->data, src->size))
* dst, and then some edit has been applied to dst.
*
* Compare them and return how similar they are, representing
- * the score as an integer between 0 and 10000, except
- * where they match exactly it is considered better than anything
- * else.
+ * the score as an integer between 0 and MAX_SCORE.
+ *
+ * When there is an exact match, it is considered a better
+ * match than anything else; the destination does not even
+ * call into this function in that case.
*/
void *delta;
unsigned long delta_size, base_size;
/* We would not consider edits that change the file size so
* drastically. delta_size must be smaller than
* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
+ *
* Note that base_size == 0 case is handled here already
* and the final score computation below would not have a
* divide-by-zero issue.
if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
return 0;
+ if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
+ return 0; /* error but caught downstream */
+
delta = diff_delta(src->data, src->size,
dst->data, dst->size,
&delta_size);
diff --git a/diffcore.h b/diffcore.h
index 462014b652ea6f7c5a2b09a7b37dbf563a174c3f..60ee7756e3c10d98be3eb4edcffb945cb4519911 100644 (file)
--- a/diffcore.h
+++ b/diffcore.h
extern void fill_filespec(struct diff_filespec *, const unsigned char *,
unsigned short);
-extern int diff_populate_filespec(struct diff_filespec *);
+extern int diff_populate_filespec(struct diff_filespec *, int);
extern void diff_free_filespec_data(struct diff_filespec *);
struct diff_filepair {