bcec6780b2
When serving a push, git-receive-pack(1) needs to verify that the packfile sent by the client contains all objects that are required by the updated references. This connectivity check works by marking all preexisting references as uninteresting and using the new reference tips as the starting point for a graph walk. Marking all preexisting references as uninteresting can be a problem when it comes to performance. Git forges tend to do internal bookkeeping to keep alive sets of objects for internal use or make them easy to find via certain references. These references are typically hidden away from the user so that they are neither advertised nor writeable. At GitLab, we have one particular repository that contains a total of 7 million references, of which 6.8 million are indeed internal references. With the current connectivity check we are forced to load all these references in order to mark them as uninteresting, and this alone takes around 15 seconds to compute. We can optimize this by only taking into account the set of visible refs when marking objects as uninteresting. This means that we may now walk more objects until we hit any object that is marked as uninteresting. But it is rather unlikely that clients send objects that make large parts of objects reachable that have previously only ever been hidden, whereas the common case is to push incremental changes that build on top of the visible object graph. This provides a huge boost to performance in the mentioned repository, where the vast majority of its refs are hidden.
Pushing a new commit into this repo with `transfer.hideRefs` set up to hide 6.8 million of 7 million refs as it is configured in Gitaly leads to a 4.5-fold speedup: Benchmark 1: main Time (mean ± σ): 30.977 s ± 0.157 s [User: 30.226 s, System: 1.083 s] Range (min … max): 30.796 s … 31.071 s 3 runs Benchmark 2: pks-connectivity-check-hide-refs Time (mean ± σ): 6.799 s ± 0.063 s [User: 6.803 s, System: 0.354 s] Range (min … max): 6.729 s … 6.850 s 3 runs Summary 'pks-connectivity-check-hide-refs' ran 4.56 ± 0.05 times faster than 'main' As we mostly go through the same codepaths even in the case where there are no hidden refs at all compared to the code before, there is no change in performance when no refs are hidden: Benchmark 1: main Time (mean ± σ): 48.188 s ± 0.432 s [User: 49.326 s, System: 5.009 s] Range (min … max): 47.706 s … 48.539 s 3 runs Benchmark 2: pks-connectivity-check-hide-refs Time (mean ± σ): 48.027 s ± 0.500 s [User: 48.934 s, System: 5.025 s] Range (min … max): 47.504 s … 48.500 s 3 runs Summary 'pks-connectivity-check-hide-refs' ran 1.00 ± 0.01 times faster than 'main' Signed-off-by: Patrick Steinhardt <ps@pks.im> Signed-off-by: Taylor Blau <me@ttaylorr.com>
159 lines
4.5 KiB
C
159 lines
4.5 KiB
C
#include "cache.h"
|
|
#include "object-store.h"
|
|
#include "run-command.h"
|
|
#include "sigchain.h"
|
|
#include "connected.h"
|
|
#include "transport.h"
|
|
#include "packfile.h"
|
|
#include "promisor-remote.h"
|
|
|
|
/*
|
|
* If we feed all the commits we want to verify to this command
|
|
*
|
|
* $ git rev-list --objects --stdin --not --all
|
|
*
|
|
* and if it does not error out, that means everything reachable from
|
|
* these commits locally exists and is connected to our existing refs.
|
|
* Note that this does _not_ validate the individual objects.
|
|
*
|
|
* Returns 0 if everything is connected, non-zero otherwise.
|
|
*/
|
|
int check_connected(oid_iterate_fn fn, void *cb_data,
		    struct check_connected_options *opt)
{
	/*
	 * Verify that every object ID produced by fn(cb_data) is connected,
	 * either by finding it in a promisor pack or by feeding it to a
	 * "git rev-list --objects --stdin" child process.
	 *
	 * fn is called repeatedly until it returns NULL. opt may be NULL, in
	 * which case CHECK_CONNECTED_INIT defaults are used.
	 *
	 * Returns 0 when everything checked out, non-zero otherwise. A
	 * non-zero opt->err_fd is released on every path: it is either
	 * closed here or handed over to the rev-list child.
	 */
	struct child_process rev_list = CHILD_PROCESS_INIT;
	FILE *rev_list_in;
	struct check_connected_options defaults = CHECK_CONNECTED_INIT;
	const struct object_id *oid;
	int err = 0;
	struct packed_git *new_pack = NULL;
	struct transport *transport;
	size_t base_len;

	if (!opt)
		opt = &defaults;
	transport = opt->transport;

	oid = fn(cb_data);
	if (!oid) {
		/* Nothing to verify; still release the caller's error fd. */
		if (opt->err_fd)
			close(opt->err_fd);
		return err;
	}

	/*
	 * When the transport reports a single self-contained-and-connected
	 * pack, derive its ".idx" path from the ".keep" lockfile name and
	 * load it, so that tips found in that pack can be skipped below.
	 */
	if (transport && transport->smart_options &&
	    transport->smart_options->self_contained_and_connected &&
	    transport->pack_lockfiles.nr == 1 &&
	    strip_suffix(transport->pack_lockfiles.items[0].string,
			 ".keep", &base_len)) {
		struct strbuf idx_file = STRBUF_INIT;
		strbuf_add(&idx_file, transport->pack_lockfiles.items[0].string,
			   base_len);
		strbuf_addstr(&idx_file, ".idx");
		new_pack = add_packed_git(idx_file.buf, idx_file.len, 1);
		strbuf_release(&idx_file);
	}

	if (has_promisor_remote()) {
		/*
		 * For partial clones, we don't want to have to do a regular
		 * connectivity check because we have to enumerate and exclude
		 * all promisor objects (slow), and then the connectivity check
		 * itself becomes a no-op because in a partial clone every
		 * object is a promisor object. Instead, just make sure we
		 * received, in a promisor packfile, the objects pointed to by
		 * each wanted ref.
		 *
		 * Before checking for promisor packs, be sure we have the
		 * latest pack-files loaded into memory.
		 */
		reprepare_packed_git(the_repository);
		do {
			struct packed_git *p;

			for (p = get_all_packs(the_repository); p; p = p->next) {
				if (!p->pack_promisor)
					continue;
				if (find_pack_entry_one(oid->hash, p))
					goto promisor_pack_found;
			}
			/*
			 * Fallback to rev-list with oid and the rest of the
			 * object IDs provided by fn.
			 */
			goto no_promisor_pack_found;
promisor_pack_found:
			;
		} while ((oid = fn(cb_data)) != NULL);

		/*
		 * Every tip was found in a promisor pack, so rev-list never
		 * runs and nobody else will take over the error descriptor:
		 * close it here, just like the empty-input path above does.
		 */
		if (opt->err_fd)
			close(opt->err_fd);
		goto cleanup;
	}

no_promisor_pack_found:
	/* "--shallow-file" is pushed before the "rev-list" subcommand name. */
	if (opt->shallow_file) {
		strvec_push(&rev_list.args, "--shallow-file");
		strvec_push(&rev_list.args, opt->shallow_file);
	}
	strvec_push(&rev_list.args, "rev-list");
	strvec_push(&rev_list.args, "--objects");
	strvec_push(&rev_list.args, "--stdin");
	if (has_promisor_remote())
		strvec_push(&rev_list.args, "--exclude-promisor-objects");
	if (!opt->is_deepening_fetch) {
		/*
		 * Exclude what is already reachable from existing refs,
		 * optionally ignoring refs hidden for the given section.
		 */
		strvec_push(&rev_list.args, "--not");
		if (opt->exclude_hidden_refs_section)
			strvec_pushf(&rev_list.args, "--exclude-hidden=%s",
				     opt->exclude_hidden_refs_section);
		strvec_push(&rev_list.args, "--all");
	}
	strvec_push(&rev_list.args, "--quiet");
	strvec_push(&rev_list.args, "--alternate-refs");
	if (opt->progress)
		strvec_pushf(&rev_list.args, "--progress=%s",
			     _("Checking connectivity"));

	rev_list.git_cmd = 1;
	if (opt->env)
		strvec_pushv(&rev_list.env, opt->env);
	rev_list.in = -1;
	rev_list.no_stdout = 1;
	/*
	 * NOTE(review): from here on the error descriptor is owned by the
	 * child_process; this relies on start_command() closing cmd->err in
	 * the parent (also on failure) -- confirm against run-command.h.
	 */
	if (opt->err_fd)
		rev_list.err = opt->err_fd;
	else
		rev_list.no_stderr = opt->quiet;

	if (start_command(&rev_list)) {
		err = error(_("Could not run 'git rev-list'"));
		goto cleanup;
	}

	/* Keep a dying rev-list from killing us with SIGPIPE while writing. */
	sigchain_push(SIGPIPE, SIG_IGN);

	rev_list_in = xfdopen(rev_list.in, "w");

	do {
		/*
		 * If index-pack already checked that:
		 * - there are no dangling pointers in the new pack
		 * - the pack is self contained
		 * then a tip that lives in the new pack is known to be
		 * good, and we do not send it to rev-list for verification.
		 */
		if (new_pack && find_pack_entry_one(oid->hash, new_pack))
			continue;

		if (fprintf(rev_list_in, "%s\n", oid_to_hex(oid)) < 0)
			break;
	} while ((oid = fn(cb_data)) != NULL);

	if (ferror(rev_list_in) || fflush(rev_list_in)) {
		/* EPIPE and EINVAL still fail the check but are not reported. */
		if (errno != EPIPE && errno != EINVAL)
			error_errno(_("failed write to rev-list"));
		err = -1;
	}

	if (fclose(rev_list_in))
		err = error_errno(_("failed to close rev-list's stdin"));

	sigchain_pop(SIGPIPE);
	err = finish_command(&rev_list) || err;

cleanup:
	/*
	 * new_pack was only ever referenced through this local pointer, so
	 * release it (and whatever was opened for lookups) before returning.
	 */
	if (new_pack) {
		close_pack(new_pack);
		free(new_pack);
	}
	return err;
}
|