git-commit-vandalism/list-objects.c
Jeff King aa9ad6fee5 bitmaps: don't recurse into trees already in the bitmap
If an object is already mentioned in a reachability bitmap we are
building, then by definition so are all of the objects it can reach. We
have an optimization to stop traversing commits when we see they are
already in the bitmap, but we don't do the same for trees.

It's generally unavoidable to recurse into trees for commits not yet
covered by bitmaps (since most commits generally do have unique
top-level trees). But they usually have subtrees that are shared with
other commits (i.e., all of the subtrees the commit _didn't_ touch). And
some of those commits (and their trees) may be covered by the bitmap.

Usually this isn't _too_ big a deal, because we'll visit those subtrees
only once in total for the whole walk. But if you have a large number of
unbitmapped commits, and if your tree is big, then you may end up
opening a lot of sub-trees for no good reason.

We can use the same optimization we do for commits here: when we are
about to open a tree, see if it's in the bitmap (either the one we are
building, or the "seen" bitmap which covers the UNINTERESTING side of
the bitmap when doing a set-difference).

This works especially well because we'll visit all commits before
hitting any trees. So even in a history like:

  A -- B

if "A" has a bitmap on disk but "B" doesn't, we'll already have OR-ed in
the results from A before looking at B's tree (so we really will only
look at trees touched by B).

For most repositories, the timings produced by p5310 are unspectacular.
Here's linux.git:

  Test                         HEAD^             HEAD
  --------------------------------------------------------------------
  5310.4: simulated clone      6.00(5.90+0.10)   5.98(5.90+0.08) -0.3%
  5310.5: simulated fetch      2.98(5.45+0.18)   2.85(5.31+0.18) -4.4%
  5310.7: rev-list (commits)   0.32(0.29+0.03)   0.33(0.30+0.03) +3.1%
  5310.8: rev-list (objects)   1.48(1.44+0.03)   1.49(1.44+0.05) +0.7%

Any improvement there is within the noise (the +3.1% on test 7 has to be
noise, since we are not recursing into trees, and thus the new code
isn't even run). The results for git.git are likewise uninteresting.

But here are numbers from some other real-world repositories (that are
not public). This one's tree is comparable in size to linux.git, but has
~16k refs (and so less complete bitmap coverage):

  Test                         HEAD^               HEAD
  -------------------------------------------------------------------------
  5310.4: simulated clone      38.34(39.86+0.74)   33.95(35.53+0.76) -11.5%
  5310.5: simulated fetch      2.29(6.31+0.35)     2.20(5.97+0.41) -3.9%
  5310.7: rev-list (commits)   0.99(0.86+0.13)     0.96(0.85+0.11) -3.0%
  5310.8: rev-list (objects)   11.32(11.04+0.27)   6.59(6.37+0.21) -41.8%

And here's another with a very large tree (~340k entries), and a fairly
large number of refs (~10k):

  Test                         HEAD^               HEAD
  -------------------------------------------------------------------------
  5310.3: simulated clone      53.83(54.71+1.54)   39.77(40.76+1.50) -26.1%
  5310.4: simulated fetch      19.91(20.11+0.56)   19.79(19.98+0.67) -0.6%
  5310.6: rev-list (commits)   0.54(0.44+0.11)     0.51(0.43+0.07) -5.6%
  5310.7: rev-list (objects)   24.32(23.59+0.73)   9.85(9.49+0.36) -59.5%

This patch provides substantial improvements in these larger cases, and
have any drawbacks for smaller ones (the cost of the bitmap check is
quite small compared to an actual tree traversal).

Note that we have to add a version of revision.c's include_check
callback which handles non-commits. We could possibly consolidate this
into a single callback for all objects types, as there's only one user
of the feature which would need converted (pack-bitmap.c:should_include).
That would in theory let us avoid duplicating any logic. But when I
tried it, the code ended up much worse to read, with lots of repeated
"if it's a commit do this, otherwise do that". Having two separate
callbacks splits that naturally, and matches the existing split of
show_commit/show_object callbacks.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-15 11:13:11 +09:00

451 lines
12 KiB
C

#include "cache.h"
#include "tag.h"
#include "commit.h"
#include "tree.h"
#include "blob.h"
#include "diff.h"
#include "tree-walk.h"
#include "revision.h"
#include "list-objects.h"
#include "list-objects-filter.h"
#include "list-objects-filter-options.h"
#include "packfile.h"
#include "object-store.h"
#include "trace.h"
struct traversal_context {
struct rev_info *revs;
show_object_fn show_object;
show_commit_fn show_commit;
void *show_data;
struct filter *filter;
};
static void process_blob(struct traversal_context *ctx,
struct blob *blob,
struct strbuf *path,
const char *name)
{
struct object *obj = &blob->object;
size_t pathlen;
enum list_objects_filter_result r;
if (!ctx->revs->blob_objects)
return;
if (!obj)
die("bad blob object");
if (obj->flags & (UNINTERESTING | SEEN))
return;
/*
* Pre-filter known-missing objects when explicitly requested.
* Otherwise, a missing object error message may be reported
* later (depending on other filtering criteria).
*
* Note that this "--exclude-promisor-objects" pre-filtering
* may cause the actual filter to report an incomplete list
* of missing objects.
*/
if (ctx->revs->exclude_promisor_objects &&
!has_object_file(&obj->oid) &&
is_promisor_object(&obj->oid))
return;
pathlen = path->len;
strbuf_addstr(path, name);
r = list_objects_filter__filter_object(ctx->revs->repo,
LOFS_BLOB, obj,
path->buf, &path->buf[pathlen],
ctx->filter);
if (r & LOFR_MARK_SEEN)
obj->flags |= SEEN;
if (r & LOFR_DO_SHOW)
ctx->show_object(obj, path->buf, ctx->show_data);
strbuf_setlen(path, pathlen);
}
/*
* Processing a gitlink entry currently does nothing, since
* we do not recurse into the subproject.
*
* We *could* eventually add a flag that actually does that,
* which would involve:
* - is the subproject actually checked out?
* - if so, see if the subproject has already been added
* to the alternates list, and add it if not.
* - process the commit (or tag) the gitlink points to
* recursively.
*
* However, it's unclear whether there is really ever any
* reason to see superprojects and subprojects as such a
* "unified" object pool (potentially resulting in a totally
* humongous pack - avoiding which was the whole point of
* having gitlinks in the first place!).
*
* So for now, there is just a note that we *could* follow
* the link, and how to do it. Whether it necessarily makes
* any sense what-so-ever to ever do that is another issue.
*/
static void process_gitlink(struct traversal_context *ctx,
const unsigned char *sha1,
struct strbuf *path,
const char *name)
{
/* Nothing to do */
}
static void process_tree(struct traversal_context *ctx,
struct tree *tree,
struct strbuf *base,
const char *name);
static void process_tree_contents(struct traversal_context *ctx,
struct tree *tree,
struct strbuf *base)
{
struct tree_desc desc;
struct name_entry entry;
enum interesting match = ctx->revs->diffopt.pathspec.nr == 0 ?
all_entries_interesting : entry_not_interesting;
init_tree_desc(&desc, tree->buffer, tree->size);
while (tree_entry(&desc, &entry)) {
if (match != all_entries_interesting) {
match = tree_entry_interesting(ctx->revs->repo->index,
&entry, base, 0,
&ctx->revs->diffopt.pathspec);
if (match == all_entries_not_interesting)
break;
if (match == entry_not_interesting)
continue;
}
if (S_ISDIR(entry.mode)) {
struct tree *t = lookup_tree(ctx->revs->repo, &entry.oid);
if (!t) {
die(_("entry '%s' in tree %s has tree mode, "
"but is not a tree"),
entry.path, oid_to_hex(&tree->object.oid));
}
t->object.flags |= NOT_USER_GIVEN;
process_tree(ctx, t, base, entry.path);
}
else if (S_ISGITLINK(entry.mode))
process_gitlink(ctx, entry.oid.hash,
base, entry.path);
else {
struct blob *b = lookup_blob(ctx->revs->repo, &entry.oid);
if (!b) {
die(_("entry '%s' in tree %s has blob mode, "
"but is not a blob"),
entry.path, oid_to_hex(&tree->object.oid));
}
b->object.flags |= NOT_USER_GIVEN;
process_blob(ctx, b, base, entry.path);
}
}
}
static void process_tree(struct traversal_context *ctx,
struct tree *tree,
struct strbuf *base,
const char *name)
{
struct object *obj = &tree->object;
struct rev_info *revs = ctx->revs;
int baselen = base->len;
enum list_objects_filter_result r;
int failed_parse;
if (!revs->tree_objects)
return;
if (!obj)
die("bad tree object");
if (obj->flags & (UNINTERESTING | SEEN))
return;
if (revs->include_check_obj &&
!revs->include_check_obj(&tree->object, revs->include_check_data))
return;
failed_parse = parse_tree_gently(tree, 1);
if (failed_parse) {
if (revs->ignore_missing_links)
return;
/*
* Pre-filter known-missing tree objects when explicitly
* requested. This may cause the actual filter to report
* an incomplete list of missing objects.
*/
if (revs->exclude_promisor_objects &&
is_promisor_object(&obj->oid))
return;
if (!revs->do_not_die_on_missing_tree)
die("bad tree object %s", oid_to_hex(&obj->oid));
}
strbuf_addstr(base, name);
r = list_objects_filter__filter_object(ctx->revs->repo,
LOFS_BEGIN_TREE, obj,
base->buf, &base->buf[baselen],
ctx->filter);
if (r & LOFR_MARK_SEEN)
obj->flags |= SEEN;
if (r & LOFR_DO_SHOW)
ctx->show_object(obj, base->buf, ctx->show_data);
if (base->len)
strbuf_addch(base, '/');
if (r & LOFR_SKIP_TREE)
trace_printf("Skipping contents of tree %s...\n", base->buf);
else if (!failed_parse)
process_tree_contents(ctx, tree, base);
r = list_objects_filter__filter_object(ctx->revs->repo,
LOFS_END_TREE, obj,
base->buf, &base->buf[baselen],
ctx->filter);
if (r & LOFR_MARK_SEEN)
obj->flags |= SEEN;
if (r & LOFR_DO_SHOW)
ctx->show_object(obj, base->buf, ctx->show_data);
strbuf_setlen(base, baselen);
free_tree_buffer(tree);
}
static void process_tag(struct traversal_context *ctx,
struct tag *tag,
const char *name)
{
enum list_objects_filter_result r;
r = list_objects_filter__filter_object(ctx->revs->repo, LOFS_TAG,
&tag->object, NULL, NULL,
ctx->filter);
if (r & LOFR_MARK_SEEN)
tag->object.flags |= SEEN;
if (r & LOFR_DO_SHOW)
ctx->show_object(&tag->object, name, ctx->show_data);
}
static void mark_edge_parents_uninteresting(struct commit *commit,
struct rev_info *revs,
show_edge_fn show_edge)
{
struct commit_list *parents;
for (parents = commit->parents; parents; parents = parents->next) {
struct commit *parent = parents->item;
if (!(parent->object.flags & UNINTERESTING))
continue;
mark_tree_uninteresting(revs->repo, get_commit_tree(parent));
if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
parent->object.flags |= SHOWN;
show_edge(parent);
}
}
}
static void add_edge_parents(struct commit *commit,
struct rev_info *revs,
show_edge_fn show_edge,
struct oidset *set)
{
struct commit_list *parents;
for (parents = commit->parents; parents; parents = parents->next) {
struct commit *parent = parents->item;
struct tree *tree = get_commit_tree(parent);
if (!tree)
continue;
oidset_insert(set, &tree->object.oid);
if (!(parent->object.flags & UNINTERESTING))
continue;
tree->object.flags |= UNINTERESTING;
if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
parent->object.flags |= SHOWN;
show_edge(parent);
}
}
}
void mark_edges_uninteresting(struct rev_info *revs,
show_edge_fn show_edge,
int sparse)
{
struct commit_list *list;
int i;
if (sparse) {
struct oidset set;
oidset_init(&set, 16);
for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
struct tree *tree = get_commit_tree(commit);
if (commit->object.flags & UNINTERESTING)
tree->object.flags |= UNINTERESTING;
oidset_insert(&set, &tree->object.oid);
add_edge_parents(commit, revs, show_edge, &set);
}
mark_trees_uninteresting_sparse(revs->repo, &set);
oidset_clear(&set);
} else {
for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
if (commit->object.flags & UNINTERESTING) {
mark_tree_uninteresting(revs->repo,
get_commit_tree(commit));
if (revs->edge_hint_aggressive && !(commit->object.flags & SHOWN)) {
commit->object.flags |= SHOWN;
show_edge(commit);
}
continue;
}
mark_edge_parents_uninteresting(commit, revs, show_edge);
}
}
if (revs->edge_hint_aggressive) {
for (i = 0; i < revs->cmdline.nr; i++) {
struct object *obj = revs->cmdline.rev[i].item;
struct commit *commit = (struct commit *)obj;
if (obj->type != OBJ_COMMIT || !(obj->flags & UNINTERESTING))
continue;
mark_tree_uninteresting(revs->repo,
get_commit_tree(commit));
if (!(obj->flags & SHOWN)) {
obj->flags |= SHOWN;
show_edge(commit);
}
}
}
}
static void add_pending_tree(struct rev_info *revs, struct tree *tree)
{
add_pending_object(revs, &tree->object, "");
}
static void traverse_trees_and_blobs(struct traversal_context *ctx,
struct strbuf *base)
{
int i;
assert(base->len == 0);
for (i = 0; i < ctx->revs->pending.nr; i++) {
struct object_array_entry *pending = ctx->revs->pending.objects + i;
struct object *obj = pending->item;
const char *name = pending->name;
const char *path = pending->path;
if (obj->flags & (UNINTERESTING | SEEN))
continue;
if (obj->type == OBJ_TAG) {
process_tag(ctx, (struct tag *)obj, name);
continue;
}
if (!path)
path = "";
if (obj->type == OBJ_TREE) {
process_tree(ctx, (struct tree *)obj, base, path);
continue;
}
if (obj->type == OBJ_BLOB) {
process_blob(ctx, (struct blob *)obj, base, path);
continue;
}
die("unknown pending object %s (%s)",
oid_to_hex(&obj->oid), name);
}
object_array_clear(&ctx->revs->pending);
}
static void do_traverse(struct traversal_context *ctx)
{
struct commit *commit;
struct strbuf csp; /* callee's scratch pad */
strbuf_init(&csp, PATH_MAX);
while ((commit = get_revision(ctx->revs)) != NULL) {
enum list_objects_filter_result r;
r = list_objects_filter__filter_object(ctx->revs->repo,
LOFS_COMMIT, &commit->object,
NULL, NULL, ctx->filter);
/*
* an uninteresting boundary commit may not have its tree
* parsed yet, but we are not going to show them anyway
*/
if (!ctx->revs->tree_objects)
; /* do not bother loading tree */
else if (get_commit_tree(commit)) {
struct tree *tree = get_commit_tree(commit);
tree->object.flags |= NOT_USER_GIVEN;
add_pending_tree(ctx->revs, tree);
} else if (commit->object.parsed) {
die(_("unable to load root tree for commit %s"),
oid_to_hex(&commit->object.oid));
}
if (r & LOFR_MARK_SEEN)
commit->object.flags |= SEEN;
if (r & LOFR_DO_SHOW)
ctx->show_commit(commit, ctx->show_data);
if (ctx->revs->tree_blobs_in_commit_order)
/*
* NEEDSWORK: Adding the tree and then flushing it here
* needs a reallocation for each commit. Can we pass the
* tree directory without allocation churn?
*/
traverse_trees_and_blobs(ctx, &csp);
}
traverse_trees_and_blobs(ctx, &csp);
strbuf_release(&csp);
}
void traverse_commit_list(struct rev_info *revs,
show_commit_fn show_commit,
show_object_fn show_object,
void *show_data)
{
struct traversal_context ctx;
ctx.revs = revs;
ctx.show_commit = show_commit;
ctx.show_object = show_object;
ctx.show_data = show_data;
ctx.filter = NULL;
do_traverse(&ctx);
}
void traverse_commit_list_filtered(
struct list_objects_filter_options *filter_options,
struct rev_info *revs,
show_commit_fn show_commit,
show_object_fn show_object,
void *show_data,
struct oidset *omitted)
{
struct traversal_context ctx;
ctx.revs = revs;
ctx.show_object = show_object;
ctx.show_commit = show_commit;
ctx.show_data = show_data;
ctx.filter = list_objects_filter__init(omitted, filter_options);
do_traverse(&ctx);
list_objects_filter__free(ctx.filter);
}