From c86e8568d87aec483379f2cef0ab81580abd1af5 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 23 Feb 2006 02:58:37 -0800 Subject: [PATCH 1/3] count-delta: fix counting of copied source. The previous one wrongly coalesced a span with the next one even though the span being added does not reach it. Signed-off-by: Junio C Hamano --- count-delta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/count-delta.c b/count-delta.c index 4e4d2f4fcc..3ee3a0ccf1 100644 --- a/count-delta.c +++ b/count-delta.c @@ -26,7 +26,7 @@ static void touch_range(struct span **span, again: if (ofs < e->end) { while (e->end < end) { - if (e->next) { + if (e->next && e->next->ofs <= end) { e->end = e->next->ofs; e = e->next; } From 581845f0b8ed97cb718fffe2bc9613b6186d84ee Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 23 Feb 2006 17:02:56 -0800 Subject: [PATCH 2/3] Tweak break/merge score to adjust to the new delta generation code. This lowers the default merge threshold score to 75% from earlier 80%. The break threshold stays the same at 50% for now, but we might want to revisit it (and the rename detection limit as well). * break score: this much edit (both insertion of new material and deletion of old material) needs to be there in the file before we consider this _might_ be a rewrite and break the filepair. * merge score: after a filepair is broken by the above criteria and goes through rename detection, if their pieces did not match with other files as rename/copy, we merge them back into one as if nothing happened. If the filepair had at least this much deletion of old material, however, we say this is completely rewritten with dissimilarity index X% when we do so. The updated delta code by Nico is so good that what we earlier thought to be complete rewrite now reuses a lot more from the source material (reducing the counted "delete"), so this adjustment is needed to keep the perceived behaviour similar to what we had earlier. 
Signed-off-by: Junio C Hamano --- diffcore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffcore.h b/diffcore.h index 12cd816591..91d6c631e6 100644 --- a/diffcore.h +++ b/diffcore.h @@ -18,7 +18,7 @@ #define MAX_SCORE 60000.0 #define DEFAULT_RENAME_SCORE 30000 /* rename/copy similarity minimum (50%) */ #define DEFAULT_BREAK_SCORE 30000 /* minimum for break to happen (50%)*/ -#define DEFAULT_MERGE_SCORE 48000 /* maximum for break-merge to happen (80%)*/ +#define DEFAULT_MERGE_SCORE 45000 /* maximum for break-merge to happen (75%)*/ #define MINIMUM_BREAK_SIZE 400 /* do not break a file smaller than this */ From eae3fe5e509f3d3890bc99015cb02f9b67aa501c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 24 Feb 2006 00:54:59 -0800 Subject: [PATCH 3/3] Revert "diff-delta: produce optimal pack data" This reverts 6b7d25d97bdb8a26719f90d17ff5c9720be68762 commit. It turns out that the new algorithm has a really bad corner case, that literally spends minutes for inputs that take less than a quarter of a second to delta with the old algorithm. The resulting delta is 50% smaller which is admirable, but the performance degradation is simply unacceptable for unconditional use. Some example cases are these blobs in Linux 2.6 repository: 4917ec509720a42846d513addc11cbd25e0e3c4f 9af06ba723df75fed49f7ccae5b6c9c34bc5115f dfc9cd58dc065d17030d875d3fea6e7862ede143 Signed-off-by: Junio C Hamano --- diff-delta.c | 77 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/diff-delta.c b/diff-delta.c index 27f83a0858..2ed5984b1c 100644 --- a/diff-delta.c +++ b/diff-delta.c @@ -20,11 +20,21 @@ #include #include +#include #include "delta.h" +/* block size: min = 16, max = 64k, power of 2 */ +#define BLK_SIZE 16 + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +#define GR_PRIME 0x9e370001 +#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift)) + struct index { const unsigned char *ptr; + unsigned int val; struct index *next; }; @@ -32,21 +42,21 @@ static struct index ** delta_index(const unsigned char *buf, unsigned long bufsize, unsigned int *hash_shift) { - unsigned long hsize; - unsigned int hshift, i; + unsigned int hsize, hshift, entries, blksize, i; const unsigned char *data; struct index *entry, **hash; void *mem; /* determine index hash size */ - hsize = bufsize / 4; - for (i = 8; (1 << i) < hsize && i < 16; i++); + entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE; + hsize = entries / 4; + for (i = 4; (1 << i) < hsize && i < 16; i++); hsize = 1 << i; - hshift = i - 8; + hshift = 32 - i; *hash_shift = hshift; /* allocate lookup index */ - mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry)); + mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry)); if (!mem) return NULL; hash = mem; @@ -54,12 +64,17 @@ static struct index ** delta_index(const unsigned char *buf, memset(hash, 0, hsize * sizeof(*hash)); /* then populate it */ - data = buf + bufsize - 2; - while (data > buf) { - entry->ptr = --data; - i = data[0] ^ data[1] ^ (data[2] << hshift); + data = buf + entries * BLK_SIZE - BLK_SIZE; + blksize = bufsize - (data - buf); + while (data >= buf) { + unsigned int val = adler32(0, data, blksize); + i = HASH(val, hshift); + entry->ptr = data; + entry->val = val; entry->next = hash[i]; hash[i] = entry++; + blksize = BLK_SIZE; + data -= BLK_SIZE; } return hash; @@ -126,27 +141,29 @@ void *diff_delta(void *from_buf, unsigned long from_size, while (data < top) { unsigned int moff = 0, msize = 0; - if (data + 2 < top) { - i = data[0] ^ data[1] ^ (data[2] << hash_shift); - for (entry = hash[i]; entry; entry = entry->next) { - const unsigned char *ref = entry->ptr; - const unsigned char *src = data; - unsigned int ref_size = ref_top - ref; - if (ref_size > top - src) - ref_size = top - 
src; - if (ref_size > 0x10000) - ref_size = 0x10000; - if (ref_size <= msize) + unsigned int blksize = MIN(top - data, BLK_SIZE); + unsigned int val = adler32(0, data, blksize); + i = HASH(val, hash_shift); + for (entry = hash[i]; entry; entry = entry->next) { + const unsigned char *ref = entry->ptr; + const unsigned char *src = data; + unsigned int ref_size = ref_top - ref; + if (entry->val != val) + continue; + if (ref_size > top - src) + ref_size = top - src; + while (ref_size && *src++ == *ref) { + ref++; + ref_size--; + } + ref_size = ref - entry->ptr; + if (ref_size > msize) { + /* this is our best match so far */ + moff = entry->ptr - ref_data; + msize = ref_size; + if (msize >= 0x10000) { + msize = 0x10000; break; - while (ref_size && *src++ == *ref) { - ref++; - ref_size--; - } - ref_size = ref - entry->ptr; - if (msize < ref - entry->ptr) { - /* this is our best match so far */ - msize = ref - entry->ptr; - moff = entry->ptr - ref_data; } } }