From dfa33a298de2ab724d4812633bb009a90d1df790 Mon Sep 17 00:00:00 2001 From: Josh Steadmon Date: Fri, 19 Apr 2019 14:00:13 -0700 Subject: [PATCH 1/2] clone: do faster object check for partial clones For partial clones, doing a full connectivity check is wasteful; we skip promisor objects (which, for a partial clone, is all known objects), and enumerating them all to exclude them from the connectivity check can take a significant amount of time on large repos. At most, we want to make sure that we get the objects referred to by any wanted refs. For partial clones, just check that these objects were transferred. Signed-off-by: Josh Steadmon Signed-off-by: Junio C Hamano --- builtin/clone.c | 6 ++++-- connected.c | 17 +++++++++++++++++ connected.h | 8 ++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/builtin/clone.c b/builtin/clone.c index 50bde99618..fdbbd8942a 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -657,7 +657,8 @@ static void update_remote_refs(const struct ref *refs, const char *branch_top, const char *msg, struct transport *transport, - int check_connectivity) + int check_connectivity, + int check_refs_only) { const struct ref *rm = mapped_refs; @@ -666,6 +667,7 @@ static void update_remote_refs(const struct ref *refs, opt.transport = transport; opt.progress = transport->progress; + opt.check_refs_only = !!check_refs_only; if (check_connected(iterate_ref_map, &rm, &opt)) die(_("remote did not send all necessary objects")); @@ -1224,7 +1226,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix) update_remote_refs(refs, mapped_refs, remote_head_points_at, branch_top.buf, reflog_msg.buf, transport, - !is_local); + !is_local, filter_options.choice); update_head(our_head_points_at, remote_head, reflog_msg.buf); diff --git a/connected.c b/connected.c index 1bba888eff..1ab481fed6 100644 --- a/connected.c +++ b/connected.c @@ -1,4 +1,5 @@ #include "cache.h" +#include "object-store.h" #include "run-command.h" #include "sigchain.h" #include "connected.h" @@ -49,6 +50,22 @@ int check_connected(oid_iterate_fn fn, void *cb_data, strbuf_release(&idx_file); } + if (opt->check_refs_only) { + /* + * For partial clones, we don't want to have to do a regular + * connectivity check because we have to enumerate and exclude + * all promisor objects (slow), and then the connectivity check + * itself becomes a no-op because in a partial clone every + * object is a promisor object. Instead, just make sure we + * received the objects pointed to by each wanted ref. + */ + do { + if (!repo_has_object_file(the_repository, &oid)) + return 1; + } while (!fn(cb_data, &oid)); + return 0; + } + if (opt->shallow_file) { argv_array_push(&rev_list.args, "--shallow-file"); argv_array_push(&rev_list.args, opt->shallow_file); diff --git a/connected.h b/connected.h index 8d5a6b3ad6..ce2e7d8f2e 100644 --- a/connected.h +++ b/connected.h @@ -46,6 +46,14 @@ struct check_connected_options { * during a fetch. */ unsigned is_deepening_fetch : 1; + + /* + * If non-zero, only check the top-level objects referenced by the + * wanted refs (passed in as cb_data). This is useful for partial + * clones, where enumerating and excluding all promisor objects is very + * slow and the commit-walk itself becomes a no-op. + */ + unsigned check_refs_only : 1; }; #define CHECK_CONNECTED_INIT { 0 } From 1bb10d4f7c074ac13a61d6d20be89c2bff9488a4 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 2 May 2019 17:45:09 -0400 Subject: [PATCH 2/2] t/perf: add perf script for partial clones We don't cover the partial clone feature at all in t/perf. Let's at least run a few basic tests so that we'll notice any regressions. We'll do a no-blob clone, and split it into two parts: the actual object transfer, and the subsequent checkout (which will of course require another transfer to get the blobs). That will help us more clearly assess the performance of each. There are obviously a lot more possibilities besides just a no-blob partial clone, but this should serve as a canary that alerts us to any generic slow-downs (and we can add more tests later for cases that aren't exercised here). There are a few non-ideal things here that make this not an entirely accurate test, but are probably OK for our purposes: 1. We have to do some extra prep/cleanup work inside the timing tests, since they impact the on-disk state and the perf harness may run each one multiple times. In practice this is probably OK, since these bits should be much less expensive than the operations we are measuring. 2. The clone time is likely to be dominated by the server's object enumeration. In the real world, a repo large enough to drive people to partial clones is likely to have reachability bitmaps enabled. And in the opposite direction, our object transfer is happening at the speed of a local pipe, whereas in the real world it would bottle-neck on the network. So any percentage speedups should be taken with a grain of salt. But hopefully any regressions will produce enough of an effect to be noticeable. This script also demonstrates the recent improvement from dfa33a298d (clone: do faster object check for partial clones, 2019-04-19): Test dfa33a298d^ dfa33a298d ------------------------------------------------------------------------- 5600.2: clone without blobs 18.41(22.72+1.09) 6.83(11.65+0.50) -62.9% 5600.3: checkout of result 1.82(3.24+0.26) 1.84(3.24+0.26) +1.1% Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- t/perf/p5600-partial-clone.sh | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100755 t/perf/p5600-partial-clone.sh diff --git a/t/perf/p5600-partial-clone.sh b/t/perf/p5600-partial-clone.sh new file mode 100755 index 0000000000..3e04bd2ae1 --- /dev/null +++ b/t/perf/p5600-partial-clone.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +test_description='performance of partial clones' +. ./perf-lib.sh + +test_perf_default_repo + +test_expect_success 'enable server-side config' ' + git config uploadpack.allowFilter true && + git config uploadpack.allowAnySHA1InWant true +' + +test_perf 'clone without blobs' ' + rm -rf bare.git && + git clone --no-local --bare --filter=blob:none . bare.git +' + +test_perf 'checkout of result' ' + rm -rf worktree && + mkdir -p worktree/.git && + tar -C bare.git -cf - . | tar -C worktree/.git -xf - && + git -C worktree config core.bare false && + git -C worktree checkout -f +' + +test_done