From d8c3d03a0b7f10977dd508a5a965a417b7f1b065 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 28 Jun 2007 22:54:37 -0700 Subject: [PATCH 1/4] diffcore_count_changes: pass diffcore_filespec We may want to use richer information on the data we are dealing with in this function, so instead of passing a buffer address and length, just pass the diffcore_filespec structure. Existing callers always call this function with parameters taken from a filespec anyway, so there is no functionality changes. Signed-off-by: Junio C Hamano --- diffcore-break.c | 3 +-- diffcore-delta.c | 8 ++++---- diffcore-rename.c | 3 +-- diffcore.h | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/diffcore-break.c b/diffcore-break.c index 9c19b8cab7..ae8a7d03e2 100644 --- a/diffcore-break.c +++ b/diffcore-break.c @@ -66,8 +66,7 @@ static int should_break(struct diff_filespec *src, if (base_size < MINIMUM_BREAK_SIZE) return 0; /* we do not break too small filepair */ - if (diffcore_count_changes(src->data, src->size, - dst->data, dst->size, + if (diffcore_count_changes(src, dst, NULL, NULL, 0, &src_copied, &literal_added)) diff --git a/diffcore-delta.c b/diffcore-delta.c index 7338a40c59..0e1fae79de 100644 --- a/diffcore-delta.c +++ b/diffcore-delta.c @@ -156,8 +156,8 @@ static struct spanhash_top *hash_chars(unsigned char *buf, unsigned int sz) return hash; } -int diffcore_count_changes(void *src, unsigned long src_size, - void *dst, unsigned long dst_size, +int diffcore_count_changes(struct diff_filespec *src, + struct diff_filespec *dst, void **src_count_p, void **dst_count_p, unsigned long delta_limit, @@ -172,14 +172,14 @@ int diffcore_count_changes(void *src, unsigned long src_size, if (src_count_p) src_count = *src_count_p; if (!src_count) { - src_count = hash_chars(src, src_size); + src_count = hash_chars(src->data, src->size); if (src_count_p) *src_count_p = src_count; } if (dst_count_p) dst_count = *dst_count_p; if (!dst_count) { - dst_count = hash_chars(dst, dst_size); + dst_count = hash_chars(dst->data, dst->size); if (dst_count_p) *dst_count_p = dst_count; } diff --git a/diffcore-rename.c b/diffcore-rename.c index 79c984c9cf..cb227366b8 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -189,8 +189,7 @@ static int estimate_similarity(struct diff_filespec *src, delta_limit = (unsigned long) (base_size * (MAX_SCORE-minimum_score) / MAX_SCORE); - if (diffcore_count_changes(src->data, src->size, - dst->data, dst->size, + if (diffcore_count_changes(src, dst, &src->cnt_data, &dst->cnt_data, delta_limit, &src_copied, &literal_added)) diff --git a/diffcore.h b/diffcore.h index 7b9294eab2..990dec50f1 100644 --- a/diffcore.h +++ b/diffcore.h @@ -103,8 +103,8 @@ void diff_debug_queue(const char *, struct diff_queue_struct *); #define diff_debug_queue(a,b) do {} while(0) #endif -extern int diffcore_count_changes(void *src, unsigned long src_size, - void *dst, unsigned long dst_size, +extern int diffcore_count_changes(struct diff_filespec *src, + struct diff_filespec *dst, void **src_count_p, void **dst_count_p, unsigned long delta_limit, From 706098af6b53403b5ea3db9216fb99afbe06f52d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 28 Jun 2007 22:56:07 -0700 Subject: [PATCH 2/4] diffcore_filespec: add is_binary diffcore-break and diffcore-rename would want to behave slightly differently depending on the binary-ness of the data, so add one bit to the filespec, as the structure is now passed down to diffcore_count_changes() function. Signed-off-by: Junio C Hamano --- diff.c | 16 ++++++++++++++++ diffcore.h | 1 + 2 files changed, 17 insertions(+) diff --git a/diff.c b/diff.c index 9938969fa5..74c1198e67 100644 --- a/diff.c +++ b/diff.c @@ -3005,6 +3005,22 @@ void diffcore_std(struct diff_options *options) { if (options->quiet) return; + + /* + * break/rename count similarity differently depending on + * the binary-ness. + */ + if ((options->break_opt != -1) || (options->detect_rename)) { + struct diff_queue_struct *q = &diff_queued_diff; + int i; + + for (i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + p->one->is_binary = file_is_binary(p->one); + p->two->is_binary = file_is_binary(p->two); + } + } + if (options->break_opt != -1) diffcore_break(options->break_opt); if (options->detect_rename) diff --git a/diffcore.h b/diffcore.h index 990dec50f1..0c8abb5b94 100644 --- a/diffcore.h +++ b/diffcore.h @@ -37,6 +37,7 @@ struct diff_filespec { #define DIFF_FILE_VALID(spec) (((spec)->mode) != 0) unsigned should_free : 1; /* data should be free()'ed */ unsigned should_munmap : 1; /* data should be munmap()'ed */ + unsigned is_binary : 1; /* data should be considered "binary" */ }; extern struct diff_filespec *alloc_filespec(const char *); From af3abef94af9c821a0c192c693c3e5342ab8729f Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 28 Jun 2007 23:11:40 -0700 Subject: [PATCH 3/4] diffcore-delta.c: update the comment on the algorithm. The comment at the top of the file described an old algorithm that was neutral to text/binary differences (it hashed sliding window of N-byte sequences and counted overlaps), but long time ago we switched to a new heuristics that are more suitable for line oriented (read: text) files that are much faster. Signed-off-by: Junio C Hamano --- diffcore-delta.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/diffcore-delta.c b/diffcore-delta.c index 0e1fae79de..7116df0b83 100644 --- a/diffcore-delta.c +++ b/diffcore-delta.c @@ -5,23 +5,20 @@ /* * Idea here is very simple. * - * We have total of (sz-N+1) N-byte overlapping sequences in buf whose - * size is sz. If the same N-byte sequence appears in both source and - * destination, we say the byte that starts that sequence is shared - * between them (i.e. copied from source to destination). + * Almost all data we are interested in are text, but sometimes we have + * to deal with binary data. So we cut them into chunks delimited by + * LF byte, or 64-byte sequence, whichever comes first, and hash them. * - * For each possible N-byte sequence, if the source buffer has more - * instances of it than the destination buffer, that means the - * difference are the number of bytes not copied from source to - * destination. If the counts are the same, everything was copied - * from source to destination. If the destination has more, - * everything was copied, and destination added more. + * For those chunks, if the source buffer has more instances of it + * than the destination buffer, that means the difference are the + * number of bytes not copied from source to destination. If the + * counts are the same, everything was copied from source to + * destination. If the destination has more, everything was copied, + * and destination added more. * * We are doing an approximation so we do not really have to waste * memory by actually storing the sequence. We just hash them into * somewhere around 2^16 hashbuckets and count the occurrences. - * - * The length of the sequence is arbitrarily set to 8 for now. */ /* Wild guess at the initial hash size */ From b9905fed7a028cc9749cf8ad479cbb07940c8638 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 28 Jun 2007 23:14:13 -0700 Subject: [PATCH 4/4] diffcore-delta.c: Ignore CR in CRLF for text files This ignores CR byte in CRLF sequence in text file when computing similarity of two blobs. Usually this should not matter as nobody sane would be checking in a file with CRLF line endings to the repository (they would use autocrlf so that the repository copy would have LF line endings). Signed-off-by: Junio C Hamano --- diffcore-delta.c | 14 +++++++++++--- t/t0022-crlf-rename.sh | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) create mode 100755 t/t0022-crlf-rename.sh diff --git a/diffcore-delta.c b/diffcore-delta.c index 7116df0b83..a038b166c5 100644 --- a/diffcore-delta.c +++ b/diffcore-delta.c @@ -122,11 +122,14 @@ static struct spanhash_top *add_spanhash(struct spanhash_top *top, } } -static struct spanhash_top *hash_chars(unsigned char *buf, unsigned int sz) +static struct spanhash_top *hash_chars(struct diff_filespec *one) { int i, n; unsigned int accum1, accum2, hashval; struct spanhash_top *hash; + unsigned char *buf = one->data; + unsigned int sz = one->size; + int is_text = !one->is_binary; i = INITIAL_HASH_SIZE; hash = xmalloc(sizeof(*hash) + sizeof(struct spanhash) * (1<> 25); accum2 = (accum2 << 7) ^ (old_1 >> 25); accum1 += c; @@ -169,14 +177,14 @@ int diffcore_count_changes(struct diff_filespec *src, if (src_count_p) src_count = *src_count_p; if (!src_count) { - src_count = hash_chars(src->data, src->size); + src_count = hash_chars(src); if (src_count_p) *src_count_p = src_count; } if (dst_count_p) dst_count = *dst_count_p; if (!dst_count) { - dst_count = hash_chars(dst->data, dst->size); + dst_count = hash_chars(dst); if (dst_count_p) *dst_count_p = dst_count; } diff --git a/t/t0022-crlf-rename.sh b/t/t0022-crlf-rename.sh new file mode 100755 index 0000000000..430a1d1d38 --- /dev/null +++ b/t/t0022-crlf-rename.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +test_description='ignore CR in CRLF sequence while computing similiarity' + +. ./test-lib.sh + +test_expect_success setup ' + + cat ../t0022-crlf-rename.sh >sample && + git add sample && + + test_tick && + git commit -m Initial && + + sed -e "s/\$/ /" ../t0022-crlf-rename.sh >elpmas && + git add elpmas && + rm -f sample && + + test_tick && + git commit -a -m Second + +' + +test_expect_success 'diff -M' ' + + git diff-tree -M -r --name-status HEAD^ HEAD | + sed -e "s/R[0-9]*/RNUM/" >actual && + echo "RNUM sample elpmas" >expect && + diff -u expect actual + +' + +test_done