From 16445242edd7e90dc564b657043d2a5efca68cb0 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 2 Jul 2013 02:16:23 -0400 Subject: [PATCH 1/3] fetch-pack: avoid quadratic list insertion in mark_complete We insert the commit pointed to by each ref one-by-one into the "complete" commit_list using insert_by_date. Because each insertion is O(n), we end up with O(n^2) behavior. This typically doesn't matter, because the number of refs is reasonably small. And even if there are a lot of refs, they often point to a smaller set of objects (in which case the optimization in commit ea5f220 keeps our "n" small). However, in pathological repositories (hundreds of thousands of refs, each pointing to a unique commit), this quadratic behavior can make a difference. Since we do not care about the list order until we have finished building it, we can simply keep it unsorted during the insertion phase, then sort it afterwards. On a repository like the one described above, this dropped the time to do a no-op fetch from 2.0s to 1.7s. On normal repositories, it probably does not matter at all, but it does not hurt to protect ourselves from pathological cases. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- fetch-pack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fetch-pack.c b/fetch-pack.c index abe5ffbba5..4df8abd7c1 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -505,7 +505,7 @@ static int mark_complete(const char *refname, const unsigned char *sha1, int fla struct commit *commit = (struct commit *)o; if (!(commit->object.flags & COMPLETE)) { commit->object.flags |= COMPLETE; - commit_list_insert_by_date(commit, &complete); + commit_list_insert(commit, &complete); } } return 0; @@ -622,6 +622,7 @@ static int everything_local(struct fetch_pack_args *args, if (!args->depth) { for_each_ref(mark_complete, NULL); for_each_alternate_ref(mark_alternate_complete, NULL); + commit_list_sort_by_date(&complete); if (cutoff) mark_recent_complete_commits(args, cutoff); } From 727377ff65f8b38990d4aa13fc6979d9a8cd6756 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 2 Jul 2013 02:21:48 -0400 Subject: [PATCH 2/3] commit.c: make compare_commits_by_commit_date global This helper function was introduced as a prio_queue comparator to help topological sorting. However, other users of prio_queue who want to replace commit_list_insert_by_date will want to use it, too. So let's make it public. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 2 +- commit.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/commit.c b/commit.c index 521e49c309..ebc0eeab8f 100644 --- a/commit.c +++ b/commit.c @@ -581,7 +581,7 @@ static int compare_commits_by_author_date(const void *a_, const void *b_, return 0; } -static int compare_commits_by_commit_date(const void *a_, const void *b_, void *unused) +int compare_commits_by_commit_date(const void *a_, const void *b_, void *unused) { const struct commit *a = a_, *b = b_; /* newer commits with larger date first */ diff --git a/commit.h b/commit.h index 4d452dc96d..18a523495e 100644 --- a/commit.h +++ b/commit.h @@ -254,4 +254,6 @@ extern void print_commit_list(struct commit_list *list, */ extern void check_commit_signature(const struct commit* commit, struct signature_check *sigc); +int compare_commits_by_commit_date(const void *a_, const void *b_, void *unused); + #endif /* COMMIT_H */ From 099327b55275ddad678c27c7501e1babed078aef Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 2 Jul 2013 02:24:21 -0400 Subject: [PATCH 3/3] fetch-pack: avoid quadratic behavior in rev_list_push When we call find_common to start finding common ancestors with the remote side of a fetch, the first thing we do is insert the tip of each ref into our rev_list linked list. We keep the list sorted the whole time with commit_list_insert_by_date, which means our insertion ends up doing O(n^2) timestamp comparisons. We could teach rev_list_push to use an unsorted list, and then sort it once after we have added each ref. However, in get_rev, we process the list by popping commits off the front and adding parents back in timestamp-sorted order. So that procedure would still operate on the large list. Instead, we can replace the linked list with a heap-based priority queue, which can do O(log n) insertion, making the whole insertion procedure O(n log n). As a result of switching to the prio_queue struct, we fix two minor bugs: 1. When we "pop" a commit in get_rev, and when we clear the rev_list in find_common, we do not take care to free the "struct commit_list", and just leak its memory. With the prio_queue implementation, the memory management is handled for us. 2. In get_rev, we look at the head commit of the list, possibly push its parents onto the list, and then "pop" the front of the list off, assuming it is the same element that we just peeked at. This is typically going to be the case, but would not be in the face of clock skew: the parents are inserted by date, and could potentially be inserted at the head of the list if they have a timestamp newer than their descendent. In this case, we would accidentally pop the parent, and never process it at all. The new implementation pulls the commit off of the queue as we examine it, and so does not suffer from this problem. With this patch, a fetch of a single commit into a repository with 50,000 refs went from: real 0m7.984s user 0m7.852s sys 0m0.120s to: real 0m2.017s user 0m1.884s sys 0m0.124s Before this patch, a larger case with 370K refs still had not completed after tens of minutes; with this patch, it completes in about 12 seconds. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- fetch-pack.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fetch-pack.c b/fetch-pack.c index 4df8abd7c1..6684348c0e 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -11,6 +11,7 @@ #include "run-command.h" #include "transport.h" #include "version.h" +#include "prio-queue.h" static int transfer_unpack_limit = -1; static int fetch_unpack_limit = -1; @@ -37,7 +38,7 @@ static int marked; */ #define MAX_IN_VAIN 256 -static struct commit_list *rev_list; +static struct prio_queue rev_list = { compare_commits_by_commit_date }; static int non_common_revs, multi_ack, use_sideband, allow_tip_sha1_in_want; static void rev_list_push(struct commit *commit, int mark) @@ -49,7 +50,7 @@ static void rev_list_push(struct commit *commit, int mark) if (parse_commit(commit)) return; - commit_list_insert_by_date(commit, &rev_list); + prio_queue_put(&rev_list, commit); if (!(commit->object.flags & COMMON)) non_common_revs++; @@ -122,10 +123,10 @@ static const unsigned char *get_rev(void) unsigned int mark; struct commit_list *parents; - if (rev_list == NULL || non_common_revs == 0) + if (rev_list.nr == 0 || non_common_revs == 0) return NULL; - commit = rev_list->item; + commit = prio_queue_get(&rev_list); if (!commit->object.parsed) parse_commit(commit); parents = commit->parents; @@ -152,8 +153,6 @@ static const unsigned char *get_rev(void) mark_common(parents->item, 1, 0); parents = parents->next; } - - rev_list = rev_list->next; } return commit->object.sha1; @@ -442,7 +441,7 @@ static int find_common(struct fetch_pack_args *args, in_vain = 0; got_continue = 1; if (ack == ACK_ready) { - rev_list = NULL; + clear_prio_queue(&rev_list); got_ready = 1; } break;