From 1df4320fa25d3784b035936b35725460d46f1ca0 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 20 Jan 2014 20:20:38 +0400 Subject: [PATCH 1/6] diffcore-order: export generic ordering interface diffcore_order() interface only accepts a queue of `struct diff_filepair`. In the next patches, we'll want to order `struct combine_diff_path` by path, so let's first rework diffcore-order to also provide generic low-level interface for ordering arbitrary objects, provided they have path accessors. The new interface is: - `struct obj_order` for describing objects to ordering routine, and - order_objects() for actually doing the ordering work. Signed-off-by: Kirill Smelkov Signed-off-by: Junio C Hamano --- diffcore-order.c | 51 ++++++++++++++++++++++++++++++------------------ diffcore.h | 14 +++++++++++++ 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/diffcore-order.c b/diffcore-order.c index fe7f1f4647..1bfcc39f90 100644 --- a/diffcore-order.c +++ b/diffcore-order.c @@ -57,12 +57,6 @@ static void prepare_order(const char *orderfile) } } -struct pair_order { - struct diff_filepair *pair; - int orig_order; - int order; -}; - static int match_order(const char *path) { int i; @@ -84,35 +78,54 @@ static int match_order(const char *path) return order_cnt; } -static int compare_pair_order(const void *a_, const void *b_) +static int compare_objs_order(const void *a_, const void *b_) { - struct pair_order const *a, *b; - a = (struct pair_order const *)a_; - b = (struct pair_order const *)b_; + struct obj_order const *a, *b; + a = (struct obj_order const *)a_; + b = (struct obj_order const *)b_; if (a->order != b->order) return a->order - b->order; return a->orig_order - b->orig_order; } +void order_objects(const char *orderfile, obj_path_fn_t obj_path, + struct obj_order *objs, int nr) +{ + int i; + + if (!nr) + return; + + prepare_order(orderfile); + for (i = 0; i < nr; i++) { + objs[i].orig_order = i; + objs[i].order = match_order(obj_path(objs[i].obj)); + } + qsort(objs, nr, sizeof(*objs), compare_objs_order); +} + +static const char *pair_pathtwo(void *obj) +{ + struct diff_filepair *pair = (struct diff_filepair *)obj; + + return pair->two->path; +} + void diffcore_order(const char *orderfile) { struct diff_queue_struct *q = &diff_queued_diff; - struct pair_order *o; + struct obj_order *o; int i; if (!q->nr) return; o = xmalloc(sizeof(*o) * q->nr); - prepare_order(orderfile); - for (i = 0; i < q->nr; i++) { - o[i].pair = q->queue[i]; - o[i].orig_order = i; - o[i].order = match_order(o[i].pair->two->path); - } - qsort(o, q->nr, sizeof(*o), compare_pair_order); for (i = 0; i < q->nr; i++) - q->queue[i] = o[i].pair; + o[i].obj = q->queue[i]; + order_objects(orderfile, pair_pathtwo, o, q->nr); + for (i = 0; i < q->nr; i++) + q->queue[i] = o[i].obj; free(o); return; } diff --git a/diffcore.h b/diffcore.h index 79de8cf28d..cbe9e62b49 100644 --- a/diffcore.h +++ b/diffcore.h @@ -109,6 +109,20 @@ extern void diffcore_merge_broken(void); extern void diffcore_pickaxe(struct diff_options *); extern void diffcore_order(const char *orderfile); +/* low-level interface to diffcore_order */ +struct obj_order { + void *obj; /* setup by caller */ + + /* setup/used by order_objects() */ + int orig_order; + int order; +}; + +typedef const char *(*obj_path_fn_t)(void *obj); + +void order_objects(const char *orderfile, obj_path_fn_t obj_path, + struct obj_order *objs, int nr); + #define DIFF_DEBUG 0 #if DIFF_DEBUG void diff_debug_filespec(struct diff_filespec *, int, const char *); From 91921ceff60ccf7a9e80749eb28fabfb8399d371 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 20 Jan 2014 20:20:39 +0400 Subject: [PATCH 2/6] diff test: add tests for combine-diff with orderfile In the next patch combine-diff will have special code-path for taking orderfile into account. Prepare for making changes by introducing coverage tests for that case. Signed-off-by: Kirill Smelkov Signed-off-by: Junio C Hamano --- t/t4056-diff-order.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/t/t4056-diff-order.sh b/t/t4056-diff-order.sh index 9e2b29ede5..c0460bb0e5 100755 --- a/t/t4056-diff-order.sh +++ b/t/t4056-diff-order.sh @@ -97,4 +97,25 @@ do ' done +test_expect_success 'setup for testing combine-diff order' ' + git checkout -b tmp HEAD~ && + create_files 3 && + git checkout master && + git merge --no-commit -s ours tmp && + create_files 5 +' + +test_expect_success "combine-diff: no order (=tree object order)" ' + git diff --name-only HEAD HEAD^ HEAD^2 >actual && + test_cmp expect_none actual +' + +for i in 1 2 +do + test_expect_success "combine-diff: orderfile using option ($i)" ' + git diff -Oorder_file_$i --name-only HEAD HEAD^ HEAD^2 >actual && + test_cmp expect_$i actual + ' +done + test_done From 8518ff8fabc43aa96f1d8c8cd3de7f399d51d11e Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 20 Jan 2014 20:20:40 +0400 Subject: [PATCH 3/6] combine-diff: optimize combine_diff_path sets intersection When generating combined diff, for each commit, we intersect diff paths from diff(parent_0,commit) to diff(parent_i,commit) comparing all paths pairs, i.e. doing it the quadratic way. That is correct, but could be optimized. Paths come from trees in sorted (= tree) order, and so does diff_tree() emits resulting paths in that order too. Now if we look at diffcore transformations, all of them, except diffcore_order, preserve resulting path ordering: - skip_stat_unmatch, grep, pickaxe, filter -- just skip elements -> order stays preserved - break -- just breaks diff for a path, adding path dup after the path -> order stays preserved - detect rename/copy -- resulting paths are emitted sorted (verified empirically) So only diffcore_order changes diff paths ordering. But diffcore_order meaning affects only presentation - i.e. only how to show the diff, so we could do all the internal computations without paths reordering, and order only resultant paths set. This is faster, since, if we know two paths sets are all ordered, their intersection could be done in linear time. This patch does just that. Timings for `git log --raw --no-abbrev --no-renames` without `-c` ("git log") and with `-c` ("git log -c") before and after the patch are as follows: linux.git v3.10..v3.11 log log -c before 1.9s 20.4s after 1.9s 16.6s navy.git (private repo) log log -c before 0.83s 15.6s after 0.83s 2.1s P.S. I think linux.git case is sped up not so much as the second one, since in navy.git, there are more exotic (subtree, etc) merges. P.P.S. My tracing showed that the rest of the time (16.6s vs 1.9s) is usually spent in computing huge diffs from commit to second parent. Will try to deal with it, if I'll have time. P.P.P.S. For combine_diff_path, ->len is not needed anymore - will remove it in the next noisy cleanup path, to maintain good signal/noise ratio here. Signed-off-by: Kirill Smelkov Signed-off-by: Junio C Hamano --- combine-diff.c | 96 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 22 deletions(-) diff --git a/combine-diff.c b/combine-diff.c index 3b92c44880..d7692d7073 100644 --- a/combine-diff.c +++ b/combine-diff.c @@ -15,8 +15,8 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr, int n, int num_parent) { struct diff_queue_struct *q = &diff_queued_diff; - struct combine_diff_path *p; - int i; + struct combine_diff_path *p, *pprev, *ptmp; + int i, cmp; if (!n) { struct combine_diff_path *list = NULL, **tail = &list; @@ -47,28 +47,43 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr, return list; } - for (p = curr; p; p = p->next) { - int found = 0; - if (!p->len) - continue; - for (i = 0; i < q->nr; i++) { - const char *path; - int len; + /* + * NOTE paths are coming sorted here (= in tree order) + */ - if (diff_unmodified_pair(q->queue[i])) - continue; - path = q->queue[i]->two->path; - len = strlen(path); - if (len == p->len && !memcmp(path, p->path, len)) { - found = 1; - hashcpy(p->parent[n].sha1, q->queue[i]->one->sha1); - p->parent[n].mode = q->queue[i]->one->mode; - p->parent[n].status = q->queue[i]->status; - break; - } + pprev = NULL; + p = curr; + i = 0; + + while (1) { + if (!p) + break; + + cmp = (i >= q->nr) ? -1 + : strcmp(p->path, q->queue[i]->two->path); + if (cmp < 0) { + if (pprev) + pprev->next = p->next; + ptmp = p; + p = p->next; + free(ptmp); + if (curr == ptmp) + curr = p; + continue; } - if (!found) - p->len = 0; + + if (cmp > 0) { + i++; + continue; + } + + hashcpy(p->parent[n].sha1, q->queue[i]->one->sha1); + p->parent[n].mode = q->queue[i]->one->mode; + p->parent[n].status = q->queue[i]->status; + + pprev = p; + p = p->next; + i++; } return curr; } @@ -1295,6 +1310,13 @@ static void handle_combined_callback(struct diff_options *opt, free(q.queue); } +static const char *path_path(void *obj) +{ + struct combine_diff_path *path = (struct combine_diff_path *)obj; + + return path->path; +} + void diff_tree_combined(const unsigned char *sha1, const struct sha1_array *parents, int dense, @@ -1310,6 +1332,8 @@ void diff_tree_combined(const unsigned char *sha1, diffopts.output_format = DIFF_FORMAT_NO_OUTPUT; DIFF_OPT_SET(&diffopts, RECURSIVE); DIFF_OPT_CLR(&diffopts, ALLOW_EXTERNAL); + /* tell diff_tree to emit paths in sorted (=tree) order */ + diffopts.orderfile = NULL; show_log_first = !!rev->loginfo && !rev->no_commit_id; needsep = 0; @@ -1335,6 +1359,13 @@ void diff_tree_combined(const unsigned char *sha1, printf("%s%c", diff_line_prefix(opt), opt->line_termination); } + + /* if showing diff, show it in requested order */ + if (diffopts.output_format != DIFF_FORMAT_NO_OUTPUT && + opt->orderfile) { + diffcore_order(opt->orderfile); + } + diff_flush(&diffopts); } @@ -1343,6 +1374,27 @@ void diff_tree_combined(const unsigned char *sha1, if (p->len) num_paths++; } + + /* order paths according to diffcore_order */ + if (opt->orderfile && num_paths) { + struct obj_order *o; + + o = xmalloc(sizeof(*o) * num_paths); + for (i = 0, p = paths; p; p = p->next, i++) + o[i].obj = p; + order_objects(opt->orderfile, path_path, o, num_paths); + for (i = 0; i < num_paths - 1; i++) { + p = o[i].obj; + p->next = o[i+1].obj; + } + + p = o[num_paths-1].obj; + p->next = NULL; + paths = o[0].obj; + free(o); + } + + if (num_paths) { if (opt->output_format & (DIFF_FORMAT_RAW | DIFF_FORMAT_NAME | From af82c7880f1a3df1655092da11c80603260384a0 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 20 Jan 2014 20:20:41 +0400 Subject: [PATCH 4/6] combine-diff: combine_diff_path.len is not needed anymore The field was used in order to speed-up name comparison and also to mark removed paths by setting it to 0. Because the updated code does significantly less strcmp and also just removes paths from the list and free right after we know a path will not be needed, it is not needed anymore. Signed-off-by: Kirill Smelkov Signed-off-by: Junio C Hamano --- combine-diff.c | 30 +++++++++--------------------- diff-lib.c | 2 -- diff.h | 1 - 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/combine-diff.c b/combine-diff.c index d7692d7073..2d79312a0b 100644 --- a/combine-diff.c +++ b/combine-diff.c @@ -31,7 +31,6 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr, p->path = (char *) &(p->parent[num_parent]); memcpy(p->path, path, len); p->path[len] = 0; - p->len = len; p->next = NULL; memset(p->parent, 0, sizeof(p->parent[0]) * num_parent); @@ -1234,8 +1233,6 @@ void show_combined_diff(struct combine_diff_path *p, { struct diff_options *opt = &rev->diffopt; - if (!p->len) - return; if (opt->output_format & (DIFF_FORMAT_RAW | DIFF_FORMAT_NAME | DIFF_FORMAT_NAME_STATUS)) @@ -1299,11 +1296,8 @@ static void handle_combined_callback(struct diff_options *opt, q.queue = xcalloc(num_paths, sizeof(struct diff_filepair *)); q.alloc = num_paths; q.nr = num_paths; - for (i = 0, p = paths; p; p = p->next) { - if (!p->len) - continue; + for (i = 0, p = paths; p; p = p->next) q.queue[i++] = combined_pair(p, num_parent); - } opt->format_callback(&q, opt, opt->format_callback_data); for (i = 0; i < num_paths; i++) free_combined_pair(q.queue[i]); @@ -1369,11 +1363,9 @@ void diff_tree_combined(const unsigned char *sha1, diff_flush(&diffopts); } - /* find out surviving paths */ - for (num_paths = 0, p = paths; p; p = p->next) { - if (p->len) - num_paths++; - } + /* find out number of surviving paths */ + for (num_paths = 0, p = paths; p; p = p->next) + num_paths++; /* order paths according to diffcore_order */ if (opt->orderfile && num_paths) { @@ -1399,10 +1391,8 @@ void diff_tree_combined(const unsigned char *sha1, if (opt->output_format & (DIFF_FORMAT_RAW | DIFF_FORMAT_NAME | DIFF_FORMAT_NAME_STATUS)) { - for (p = paths; p; p = p->next) { - if (p->len) - show_raw_diff(p, num_parent, rev); - } + for (p = paths; p; p = p->next) + show_raw_diff(p, num_parent, rev); needsep = 1; } else if (opt->output_format & @@ -1415,11 +1405,9 @@ void diff_tree_combined(const unsigned char *sha1, if (needsep) printf("%s%c", diff_line_prefix(opt), opt->line_termination); - for (p = paths; p; p = p->next) { - if (p->len) - show_patch_diff(p, num_parent, dense, - 0, rev); - } + for (p = paths; p; p = p->next) + show_patch_diff(p, num_parent, dense, + 0, rev); } } diff --git a/diff-lib.c b/diff-lib.c index 346cac651d..bf624e7716 100644 --- a/diff-lib.c +++ b/diff-lib.c @@ -124,7 +124,6 @@ int run_diff_files(struct rev_info *revs, unsigned int option) dpath->path = (char *) &(dpath->parent[5]); dpath->next = NULL; - dpath->len = path_len; memcpy(dpath->path, ce->name, path_len); dpath->path[path_len] = '\0'; hashclr(dpath->sha1); @@ -326,7 +325,6 @@ static int show_modified(struct rev_info *revs, p = xmalloc(combine_diff_path_size(2, pathlen)); p->path = (char *) &p->parent[2]; p->next = NULL; - p->len = pathlen; memcpy(p->path, new->name, pathlen); p->path[pathlen] = 0; p->mode = mode; diff --git a/diff.h b/diff.h index ce123fa06f..e79f3b3ff0 100644 --- a/diff.h +++ b/diff.h @@ -198,7 +198,6 @@ extern int diff_root_tree_sha1(const unsigned char *new, const char *base, struct combine_diff_path { struct combine_diff_path *next; - int len; char *path; unsigned int mode; unsigned char sha1[20]; From 7b1004b0ba6637e8c299ee8f927de5426139495c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 28 Jan 2014 13:55:59 -0800 Subject: [PATCH 5/6] combine-diff: simplify intersect_paths() further Linus once said: I actually wish more people understood the really core low-level kind of coding. Not big, complex stuff like the lockless name lookup, but simply good use of pointers-to-pointers etc. For example, I've seen too many people who delete a singly-linked list entry by keeping track of the "prev" entry, and then to delete the entry, doing something like if (prev) prev->next = entry->next; else list_head = entry->next; and whenever I see code like that, I just go "This person doesn't understand pointers". And it's sadly quite common. People who understand pointers just use a "pointer to the entry pointer", and initialize that with the address of the list_head. And then as they traverse the list, they can remove the entry without using any conditionals, by just doing a "*pp = entry->next". Applying that simplification lets us lose 7 lines from this function even while adding 2 lines of comment. I was tempted to squash this into the original commit, but because the benchmarking described in the commit log is without this simplification, I decided to keep it a separate follow-up patch. Signed-off-by: Junio C Hamano --- combine-diff.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/combine-diff.c b/combine-diff.c index 2d79312a0b..24ca7e2334 100644 --- a/combine-diff.c +++ b/combine-diff.c @@ -15,11 +15,10 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr, int n, int num_parent) { struct diff_queue_struct *q = &diff_queued_diff; - struct combine_diff_path *p, *pprev, *ptmp; + struct combine_diff_path *p, **tail = &curr; int i, cmp; if (!n) { - struct combine_diff_path *list = NULL, **tail = &list; for (i = 0; i < q->nr; i++) { int len; const char *path; @@ -43,35 +42,27 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr, *tail = p; tail = &p->next; } - return list; + return curr; } /* - * NOTE paths are coming sorted here (= in tree order) + * paths in curr (linked list) and q->queue[] (array) are + * both sorted in the tree order. */ - - pprev = NULL; - p = curr; i = 0; + while ((p = *tail) != NULL) { + cmp = ((i >= q->nr) + ? -1 : strcmp(p->path, q->queue[i]->two->path)); - while (1) { - if (!p) - break; - - cmp = (i >= q->nr) ? -1 - : strcmp(p->path, q->queue[i]->two->path); if (cmp < 0) { - if (pprev) - pprev->next = p->next; - ptmp = p; - p = p->next; - free(ptmp); - if (curr == ptmp) - curr = p; + /* p->path not in q->queue[]; drop it */ + *tail = p->next; + free(p); continue; } if (cmp > 0) { + /* q->queue[i] not in p->path; skip it */ i++; continue; } @@ -80,8 +71,7 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr, p->parent[n].mode = q->queue[i]->one->mode; p->parent[n].status = q->queue[i]->status; - pprev = p; - p = p->next; + tail = &p->next; i++; } return curr; From fce135c4ffc87f85e1c3b5c57a6d9e1abdbd074d Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 3 Feb 2014 13:08:49 +0400 Subject: [PATCH 6/6] tests: add checking that combine-diff emits only correct paths where "correct paths" stands for paths that are different to all parents. Up until now, we were testing combined diff only on one file, or on several files which were all different (t4038-diff-combined.sh). As recent thinko in "simplify intersect_paths() further" showed, and also, since we are going to rework code for finding paths different to all parents, lets write at least basic tests. Signed-off-by: Kirill Smelkov Signed-off-by: Junio C Hamano --- t/t4057-diff-combined-paths.sh | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100755 t/t4057-diff-combined-paths.sh diff --git a/t/t4057-diff-combined-paths.sh b/t/t4057-diff-combined-paths.sh new file mode 100755 index 0000000000..097e63215e --- /dev/null +++ b/t/t4057-diff-combined-paths.sh @@ -0,0 +1,106 @@ +#!/bin/sh + +test_description='combined diff show only paths that are different to all parents' + +. ./test-lib.sh + +# verify that diffc.expect matches output of +# `git diff -c --name-only HEAD HEAD^ HEAD^2` +diffc_verify () { + git diff -c --name-only HEAD HEAD^ HEAD^2 >diffc.actual && + test_cmp diffc.expect diffc.actual +} + +test_expect_success 'trivial merge - combine-diff empty' ' + for i in $(test_seq 1 9) + do + echo $i >$i.txt && + git add $i.txt + done && + git commit -m "init" && + git checkout -b side && + for i in $(test_seq 2 9) + do + echo $i/2 >>$i.txt + done && + git commit -a -m "side 2-9" && + git checkout master && + echo 1/2 >1.txt && + git commit -a -m "master 1" && + git merge side && + >diffc.expect && + diffc_verify +' + + +test_expect_success 'only one trully conflicting path' ' + git checkout side && + for i in $(test_seq 2 9) + do + echo $i/3 >>$i.txt + done && + echo "4side" >>4.txt && + git commit -a -m "side 2-9 +4" && + git checkout master && + for i in $(test_seq 1 9) + do + echo $i/3 >>$i.txt + done && + echo "4master" >>4.txt && + git commit -a -m "master 1-9 +4" && + test_must_fail git merge side && + cat <<-\EOF >4.txt && + 4 + 4/2 + 4/3 + 4master + 4side + EOF + git add 4.txt && + git commit -m "merge side (2)" && + echo 4.txt >diffc.expect && + diffc_verify +' + +test_expect_success 'merge introduces new file' ' + git checkout side && + for i in $(test_seq 5 9) + do + echo $i/4 >>$i.txt + done && + git commit -a -m "side 5-9" && + git checkout master && + for i in $(test_seq 1 3) + do + echo $i/4 >>$i.txt + done && + git commit -a -m "master 1-3 +4hello" && + git merge side && + echo "Hello World" >4hello.txt && + git add 4hello.txt && + git commit --amend && + echo 4hello.txt >diffc.expect && + diffc_verify +' + +test_expect_success 'merge removed a file' ' + git checkout side && + for i in $(test_seq 5 9) + do + echo $i/5 >>$i.txt + done && + git commit -a -m "side 5-9" && + git checkout master && + for i in $(test_seq 1 3) + do + echo $i/4 >>$i.txt + done && + git commit -a -m "master 1-3" && + git merge side && + git rm 4.txt && + git commit --amend && + echo 4.txt >diffc.expect && + diffc_verify +' + +test_done