2005-04-24 04:04:40 +02:00
|
|
|
#include "cache.h"
|
2017-06-14 20:07:36 +02:00
|
|
|
#include "config.h"
|
2005-04-24 04:04:40 +02:00
|
|
|
#include "commit.h"
|
2006-04-11 03:14:54 +02:00
|
|
|
#include "diff.h"
|
2006-02-26 01:19:46 +01:00
|
|
|
#include "revision.h"
|
2006-09-05 06:50:12 +02:00
|
|
|
#include "list-objects.h"
|
2017-11-21 21:58:51 +01:00
|
|
|
#include "list-objects-filter.h"
|
|
|
|
#include "list-objects-filter-options.h"
|
2018-10-05 23:31:23 +02:00
|
|
|
#include "object.h"
|
2018-05-16 01:42:15 +02:00
|
|
|
#include "object-store.h"
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only thing required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
#include "pack.h"
|
|
|
|
#include "pack-bitmap.h"
|
2006-05-18 23:19:20 +02:00
|
|
|
#include "builtin.h"
|
2007-10-22 07:47:56 +02:00
|
|
|
#include "log-tree.h"
|
2008-05-04 12:36:54 +02:00
|
|
|
#include "graph.h"
|
2009-03-26 05:55:24 +01:00
|
|
|
#include "bisect.h"
|
2016-07-20 15:28:09 +02:00
|
|
|
#include "progress.h"
|
2017-07-07 11:08:30 +02:00
|
|
|
#include "reflog-walk.h"
|
2017-11-21 21:58:51 +01:00
|
|
|
#include "oidset.h"
|
2017-12-08 16:27:15 +01:00
|
|
|
#include "packfile.h"
|
2005-05-31 03:46:32 +02:00
|
|
|
|
2005-05-26 03:29:09 +02:00
|
|
|
/*
 * Usage text for "git rev-list": a synopsis line followed by the
 * supported options, grouped under one heading per category.
 */
static const char rev_list_usage[] =
	/* synopsis */
	"git rev-list [<options>] <commit>... [--] [<path>...]\n"
	"\n"
	/* options that restrict which commits are listed */
	" limiting output:\n"
	" --max-count=<n>\n"
	" --max-age=<epoch>\n"
	" --min-age=<epoch>\n"
	" --sparse\n"
	" --no-merges\n"
	" --min-parents=<n>\n"
	" --no-min-parents\n"
	" --max-parents=<n>\n"
	" --no-max-parents\n"
	" --remove-empty\n"
	" --all\n"
	" --branches\n"
	" --tags\n"
	" --remotes\n"
	" --stdin\n"
	" --exclude-hidden=[receive|uploadpack]\n"
	" --quiet\n"
	/* options that change the order of the listing */
	" ordering output:\n"
	" --topo-order\n"
	" --date-order\n"
	" --reverse\n"
	/* options that change what is printed per entry */
	" formatting output:\n"
	" --parents\n"
	" --children\n"
	" --objects | --objects-edge\n"
	" --disk-usage[=human]\n"
	" --unpacked\n"
	" --header | --pretty\n"
	" --[no-]object-names\n"
	" --abbrev=<n> | --no-abbrev\n"
	" --abbrev-commit\n"
	" --left-right\n"
	" --count\n"
	/* bisection helpers; the last entry has no trailing newline */
	" special purpose:\n"
	" --bisect\n"
	" --bisect-vars\n"
	" --bisect-all";
|
2005-05-26 03:29:09 +02:00
|
|
|
|
2016-07-20 15:28:09 +02:00
|
|
|
/*
 * Progress meter state. The counter is pre-incremented and handed to
 * display_progress() once for every commit and object that is shown.
 */
static struct progress *progress;
static unsigned int progress_counter;
|
|
|
|
|
2017-11-21 21:58:51 +01:00
|
|
|
static struct oidset omitted_objects;
|
|
|
|
static int arg_print_omitted; /* print objects omitted by filter */
|
|
|
|
|
|
|
|
static struct oidset missing_objects;
|
|
|
|
/* What to do when an object that should be listed is not available. */
enum missing_action {
	/* fail if any missing objects are encountered */
	MA_ERROR = 0,
	/* silently allow ALL missing objects */
	MA_ALLOW_ANY,
	/* print ALL missing objects in special section */
	MA_PRINT,
	/* silently allow all missing PROMISOR objects */
	MA_ALLOW_PROMISOR,
};

/* Selected policy; zero-initialised, i.e. MA_ERROR unless changed. */
static enum missing_action arg_missing_action;
|
|
|
|
|
2019-06-19 22:56:56 +02:00
|
|
|
/*
 * On by default. When cleared, display only the oid of each object
 * encountered (cf. "--[no-]object-names" in the usage text).
 */
static int arg_show_object_names = 1;
|
|
|
|
|
2017-11-21 21:58:51 +01:00
|
|
|
/*
 * 16 * 1024 = 16384.
 * NOTE(review): presumably the initial size used when setting up the
 * oidsets in this file; the use site is outside this view -- confirm.
 */
#define DEFAULT_OIDSET_SIZE (16*1024)
|
|
|
|
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
/*
 * --disk-usage state: when show_disk_usage is set, the on-disk size of
 * each commit that is shown is added to total_disk_usage.
 */
static int show_disk_usage;
static off_t total_disk_usage;
/*
 * NOTE(review): presumably set by "--disk-usage=human" (see the usage
 * text); the code that reads it is outside this view -- confirm.
 */
static int human_readable;
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
|
|
|
|
static off_t get_object_disk_usage(struct object *obj)
|
|
|
|
{
|
|
|
|
off_t size;
|
|
|
|
struct object_info oi = OBJECT_INFO_INIT;
|
|
|
|
oi.disk_sizep = &size;
|
|
|
|
if (oid_object_info_extended(the_repository, &obj->oid, &oi, 0) < 0)
|
|
|
|
die(_("unable to get disk usage of %s"), oid_to_hex(&obj->oid));
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
2019-05-09 23:32:03 +02:00
|
|
|
static void finish_commit(struct commit *commit);
|
2009-04-06 21:28:36 +02:00
|
|
|
static void show_commit(struct commit *commit, void *data)
|
2005-06-02 18:19:53 +02:00
|
|
|
{
|
2009-04-06 22:28:00 +02:00
|
|
|
struct rev_list_info *info = data;
|
|
|
|
struct rev_info *revs = info->revs;
|
2009-04-06 21:28:36 +02:00
|
|
|
|
2016-07-20 15:28:09 +02:00
|
|
|
display_progress(progress, ++progress_counter);
|
|
|
|
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
if (show_disk_usage)
|
|
|
|
total_disk_usage += get_object_disk_usage(&commit->object);
|
|
|
|
|
2012-02-28 15:00:00 +01:00
|
|
|
if (info->flags & REV_LIST_QUIET) {
|
2019-05-09 23:32:03 +02:00
|
|
|
finish_commit(commit);
|
2012-02-28 15:00:00 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-04-06 21:28:36 +02:00
|
|
|
graph_show_commit(revs->graph);
|
2008-05-04 12:36:54 +02:00
|
|
|
|
2010-06-10 13:47:23 +02:00
|
|
|
if (revs->count) {
|
2011-04-26 10:24:29 +02:00
|
|
|
if (commit->object.flags & PATCHSAME)
|
|
|
|
revs->count_same++;
|
|
|
|
else if (commit->object.flags & SYMMETRIC_LEFT)
|
2010-06-10 13:47:23 +02:00
|
|
|
revs->count_left++;
|
|
|
|
else
|
|
|
|
revs->count_right++;
|
2019-05-09 23:32:03 +02:00
|
|
|
finish_commit(commit);
|
2010-06-10 13:47:23 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-04-06 22:28:00 +02:00
|
|
|
if (info->show_timestamp)
|
2017-04-21 12:45:48 +02:00
|
|
|
printf("%"PRItime" ", commit->date);
|
2009-04-06 22:28:00 +02:00
|
|
|
if (info->header_prefix)
|
|
|
|
fputs(info->header_prefix, stdout);
|
2008-05-25 09:07:21 +02:00
|
|
|
|
rev-list: add option for --pretty=format without header
In general, we encourage users to use plumbing commands, like git
rev-list, over porcelain commands, like git log, when scripting.
However, git rev-list has one glaring problem that prevents it from
being used in certain cases: when --pretty is used with a custom format,
it always prints out a line containing "commit" and the object ID. This
makes it unsuitable for many scripting needs, and forces users to use
git log instead.
While we can't change this behavior for backwards compatibility, we can
add an option to suppress this behavior, so let's do so, and call it
"--no-commit-header". Additionally, add the corresponding positive
option to switch it back on.
Note that this option doesn't affect the built-in formats, only custom
formats. This is exactly the same behavior as users already have from
git log and is what most users will be used to.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-07-11 23:55:10 +02:00
|
|
|
if (revs->include_header) {
|
|
|
|
if (!revs->graph)
|
|
|
|
fputs(get_revision_mark(revs, commit), stdout);
|
|
|
|
if (revs->abbrev_commit && revs->abbrev)
|
|
|
|
fputs(find_unique_abbrev(&commit->object.oid, revs->abbrev),
|
|
|
|
stdout);
|
|
|
|
else
|
|
|
|
fputs(oid_to_hex(&commit->object.oid), stdout);
|
|
|
|
}
|
2009-04-06 21:28:36 +02:00
|
|
|
if (revs->print_parents) {
|
2005-06-02 18:19:53 +02:00
|
|
|
struct commit_list *parents = commit->parents;
|
|
|
|
while (parents) {
|
2015-11-10 03:22:28 +01:00
|
|
|
printf(" %s", oid_to_hex(&parents->item->object.oid));
|
2005-06-02 18:19:53 +02:00
|
|
|
parents = parents->next;
|
|
|
|
}
|
|
|
|
}
|
2009-04-06 21:28:36 +02:00
|
|
|
if (revs->children.name) {
|
2008-04-04 08:01:47 +02:00
|
|
|
struct commit_list *children;
|
|
|
|
|
2009-04-06 21:28:36 +02:00
|
|
|
children = lookup_decoration(&revs->children, &commit->object);
|
2008-04-04 08:01:47 +02:00
|
|
|
while (children) {
|
2015-11-10 03:22:28 +01:00
|
|
|
printf(" %s", oid_to_hex(&children->item->object.oid));
|
2008-04-04 08:01:47 +02:00
|
|
|
children = children->next;
|
|
|
|
}
|
|
|
|
}
|
2009-04-06 21:28:36 +02:00
|
|
|
show_decorations(revs, commit);
|
|
|
|
if (revs->commit_format == CMIT_FMT_ONELINE)
|
2005-08-09 07:15:40 +02:00
|
|
|
putchar(' ');
|
rev-list: add option for --pretty=format without header
In general, we encourage users to use plumbing commands, like git
rev-list, over porcelain commands, like git log, when scripting.
However, git rev-list has one glaring problem that prevents it from
being used in certain cases: when --pretty is used with a custom format,
it always prints out a line containing "commit" and the object ID. This
makes it unsuitable for many scripting needs, and forces users to use
git log instead.
While we can't change this behavior for backwards compatibility, we can
add an option to suppress this behavior, so let's do so, and call it
"--no-commit-header". Additionally, add the corresponding positive
option to switch it back on.
Note that this option doesn't affect the built-in formats, only custom
formats. This is exactly the same behavior as users already have from
git log and is what most users will be used to.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-07-11 23:55:10 +02:00
|
|
|
else if (revs->include_header)
|
2005-08-09 07:15:40 +02:00
|
|
|
putchar('\n');
|
|
|
|
|
commit: drop uses of get_cached_commit_buffer()
The "--show-all" revision option shows UNINTERESTING
commits. Some of these commits may be unparsed when we try
to show them (since we may or may not need to walk their
parents to fulfill the request).
Commit 3131b71301 (Add "--show-all" revision walker flag for
debugging, 2008-02-09) resolved this by just skipping
pretty-printing for commits without their object contents
cached, saying:
Because we now end up listing commits we may not even have been parsed
at all "show_log" and "show_commit" need to protect against commits
that don't have a commit buffer entry.
That was the easy fix to avoid the pretty-printer segfaulting,
but:
1. It doesn't work for all formats. E.g., --oneline
prints the oid for each such commit but not a trailing
newline, leading to jumbled output.
2. It only affects some commits, depending on whether we
happened to parse them or not (so if they were at the
tip of an UNINTERESTING starting point, or if we
happened to traverse over them, you'd see more data).
3. It unncessarily ties the decision to show the verbose
header to whether the commit buffer was cached. That
makes it harder to change the logic around caching
(e.g., if we could traverse without actually loading
the full commit objects).
These days it's safe to feed such a commit to the
pretty-print code. Since be5c9fb904 (logmsg_reencode: lazily
load missing commit buffers, 2013-01-26), we'll load it on
demand in such a case. So let's just always show the verbose
headers.
This does change the behavior of plumbing, but:
a. The --show-all option was explicitly introduced as a
debugging aid, and was never documented (and has rarely
even been mentioned on the list by git devs).
b. Avoiding the commits was already not deterministic due
to (2) above. So the caller might have seen full
headers for these commits anyway, and would need to be
prepared for it.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-02-22 00:13:38 +01:00
|
|
|
if (revs->verbose_header) {
|
2008-10-09 21:12:12 +02:00
|
|
|
struct strbuf buf = STRBUF_INIT;
|
2009-10-19 17:48:08 +02:00
|
|
|
struct pretty_print_context ctx = {0};
|
|
|
|
ctx.abbrev = revs->abbrev;
|
|
|
|
ctx.date_mode = revs->date_mode;
|
log: respect date_mode_explicit with --format:%gd
When we show a reflog selector (e.g., via "git log -g"), we
perform some DWIM magic: while we normally show the entry's
index (e.g., HEAD@{1}), if the user has given us a date
with "--date", then we show a date-based select (e.g.,
HEAD@{yesterday}).
However, we don't want to trigger this magic if the
alternate date format we got was from the "log.date"
configuration; that is not sufficiently strong context for
us to invoke this particular magic. To fix this, commit
f4ea32f (improve reflog date/number heuristic, 2009-09-24)
introduced a "date_mode_explicit" flag in rev_info. This
flag is set only when we see a "--date" option on the
command line, and we a vanilla date to the reflog code if
the date was not explicit.
Later, commit 8f8f547 (Introduce new pretty formats %g[sdD]
for reflog information, 2009-10-19) added another way to
show selectors, and it did not respect the date_mode_explicit
flag from f4ea32f.
This patch propagates the date_mode_explicit flag to the
pretty-print code, which can then use it to pass the
appropriate date field to the reflog code. This brings the
behavior of "%gd" in line with the other formats, and means
that its output is independent of any user configuration.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-05-04 07:25:18 +02:00
|
|
|
ctx.date_mode_explicit = revs->date_mode_explicit;
|
2011-05-27 00:27:49 +02:00
|
|
|
ctx.fmt = revs->commit_format;
|
2013-06-26 12:19:50 +02:00
|
|
|
ctx.output_encoding = get_log_output_encoding();
|
2017-07-13 17:07:30 +02:00
|
|
|
ctx.color = revs->diffopt.use_color;
|
2011-05-27 00:27:49 +02:00
|
|
|
pretty_print_commit(&ctx, commit, &buf);
|
2016-09-01 01:27:20 +02:00
|
|
|
if (buf.len) {
|
|
|
|
if (revs->commit_format != CMIT_FMT_ONELINE)
|
|
|
|
graph_show_oneline(revs->graph);
|
2008-05-04 12:36:54 +02:00
|
|
|
|
2016-09-01 01:27:20 +02:00
|
|
|
graph_show_commit_msg(revs->graph, stdout, &buf);
|
2008-05-04 12:36:54 +02:00
|
|
|
|
2016-09-01 01:27:20 +02:00
|
|
|
/*
|
|
|
|
* Add a newline after the commit message.
|
|
|
|
*
|
|
|
|
* Usually, this newline produces a blank
|
|
|
|
* padding line between entries, in which case
|
|
|
|
* we need to add graph padding on this line.
|
|
|
|
*
|
|
|
|
* However, the commit message may not end in a
|
|
|
|
* newline. In this case the newline simply
|
|
|
|
* ends the last line of the commit message,
|
|
|
|
* and we don't need any graph output. (This
|
|
|
|
* always happens with CMIT_FMT_ONELINE, and it
|
|
|
|
* happens with CMIT_FMT_USERFORMAT when the
|
|
|
|
* format doesn't explicitly end in a newline.)
|
|
|
|
*/
|
|
|
|
if (buf.len && buf.buf[buf.len - 1] == '\n')
|
|
|
|
graph_show_padding(revs->graph);
|
2016-10-20 22:41:00 +02:00
|
|
|
putchar(info->hdr_termination);
|
2008-05-04 12:36:54 +02:00
|
|
|
} else {
|
2016-09-01 01:27:20 +02:00
|
|
|
/*
|
|
|
|
* If the message buffer is empty, just show
|
|
|
|
* the rest of the graph output for this
|
|
|
|
* commit.
|
|
|
|
*/
|
|
|
|
if (graph_show_remainder(revs->graph))
|
|
|
|
putchar('\n');
|
|
|
|
if (revs->commit_format == CMIT_FMT_ONELINE)
|
|
|
|
putchar('\n');
|
2008-05-04 12:36:54 +02:00
|
|
|
}
|
2007-09-10 12:35:06 +02:00
|
|
|
strbuf_release(&buf);
|
2008-05-04 12:36:54 +02:00
|
|
|
} else {
|
2009-04-06 21:28:36 +02:00
|
|
|
if (graph_show_remainder(revs->graph))
|
2008-05-04 12:36:54 +02:00
|
|
|
putchar('\n');
|
2005-07-05 01:36:48 +02:00
|
|
|
}
|
2007-06-29 19:40:46 +02:00
|
|
|
maybe_flush_or_die(stdout, "stdout");
|
2019-05-09 23:32:03 +02:00
|
|
|
finish_commit(commit);
|
2007-11-11 08:29:41 +01:00
|
|
|
}
|
|
|
|
|
2019-05-09 23:32:03 +02:00
|
|
|
static void finish_commit(struct commit *commit)
|
2007-11-11 08:29:41 +01:00
|
|
|
{
|
2022-04-13 22:01:34 +02:00
|
|
|
free_commit_list(commit->parents);
|
|
|
|
commit->parents = NULL;
|
2018-12-15 01:09:40 +01:00
|
|
|
free_commit_buffer(the_repository->parsed_objects,
|
|
|
|
commit);
|
2005-06-06 17:39:40 +02:00
|
|
|
}
|
|
|
|
|
2017-11-21 21:58:51 +01:00
|
|
|
static inline void finish_object__ma(struct object *obj)
|
|
|
|
{
|
2017-12-08 16:27:15 +01:00
|
|
|
/*
|
|
|
|
* Whether or not we try to dynamically fetch missing objects
|
|
|
|
* from the server, we currently DO NOT have the object. We
|
|
|
|
* can either print, allow (ignore), or conditionally allow
|
|
|
|
* (ignore) them.
|
|
|
|
*/
|
2017-11-21 21:58:51 +01:00
|
|
|
switch (arg_missing_action) {
|
|
|
|
case MA_ERROR:
|
2018-10-05 23:31:23 +02:00
|
|
|
die("missing %s object '%s'",
|
|
|
|
type_name(obj->type), oid_to_hex(&obj->oid));
|
2017-11-21 21:58:51 +01:00
|
|
|
return;
|
|
|
|
|
|
|
|
case MA_ALLOW_ANY:
|
|
|
|
return;
|
|
|
|
|
|
|
|
case MA_PRINT:
|
|
|
|
oidset_insert(&missing_objects, &obj->oid);
|
|
|
|
return;
|
|
|
|
|
2017-12-08 16:27:15 +01:00
|
|
|
case MA_ALLOW_PROMISOR:
|
|
|
|
if (is_promisor_object(&obj->oid))
|
|
|
|
return;
|
2018-10-05 23:31:23 +02:00
|
|
|
die("unexpected missing %s object '%s'",
|
|
|
|
type_name(obj->type), oid_to_hex(&obj->oid));
|
2017-12-08 16:27:15 +01:00
|
|
|
return;
|
|
|
|
|
2017-11-21 21:58:51 +01:00
|
|
|
default:
|
|
|
|
BUG("unhandled missing_action");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-08 16:27:15 +01:00
|
|
|
static int finish_object(struct object *obj, const char *name, void *cb_data)
|
2007-11-11 08:29:41 +01:00
|
|
|
{
|
2012-02-28 15:00:00 +01:00
|
|
|
struct rev_list_info *info = cb_data;
|
rev-list: allow cached objects in existence check
This fixes a regression in 7c0fe330d5 (rev-list: handle missing tree
objects properly, 2018-10-05) where rev-list will now complain about the
empty tree when it doesn't physically exist on disk.
Before that commit, we relied on the traversal code in list-objects.c to
walk through the trees. Since it uses parse_tree(), we'd do a normal
object lookup that includes looking in the set of "cached" objects
(which is where our magic internal empty-tree kicks in).
After that commit, we instead tell list-objects.c not to die on any
missing trees, and we check them ourselves using has_object_file(). But
that function uses OBJECT_INFO_SKIP_CACHED, which means we won't use our
internal empty tree.
This normally wouldn't come up. For most operations, Git will try to
write out the empty tree object as it would any other object. And
pack-objects in a push or fetch will send the empty tree (even if it's
virtual on the sending side). However, there are cases where this can
matter. One I found in the wild:
1. The root tree of a commit became empty by deleting all files,
without using an index. In this case it was done using libgit2's
tree builder API, but as the included test shows, it can easily be
done with regular git using hash-object.
The resulting repo works OK, as we'd avoid walking over our own
reachable commits for a connectivity check.
2. Cloning with --reference pointing to the repository from (1) can
trigger the problem, because we tell the other side we already have
that commit (and hence the empty tree), but then walk over it
during the connectivity check (where we complain about it missing).
Arguably the workflow in step (1) should be more careful about writing
the empty tree object if we're referencing it. But this workflow did
work prior to 7c0fe330d5, so let's restore it.
This patch makes the minimal fix, which is to swap out a direct call to
oid_object_info_extended(), minus the SKIP_CACHED flag, instead of
calling has_object_file(). This is all that has_object_file() is doing
under the hood. And there's little danger of unrelated fallout from
other unexpected "cached" objects, since there's only one call site that
ends such a cached object, and it's in git-blame.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-03-04 18:40:54 +01:00
|
|
|
if (oid_object_info_extended(the_repository, &obj->oid, NULL, 0) < 0) {
|
2017-11-21 21:58:51 +01:00
|
|
|
finish_object__ma(obj);
|
2017-12-08 16:27:15 +01:00
|
|
|
return 1;
|
|
|
|
}
|
2012-02-28 15:00:00 +01:00
|
|
|
if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
|
2018-06-29 03:21:51 +02:00
|
|
|
parse_object(the_repository, &obj->oid);
|
2017-12-08 16:27:15 +01:00
|
|
|
return 0;
|
2007-11-11 08:29:41 +01:00
|
|
|
}
|
|
|
|
|
list-objects: pass full pathname to callbacks
When we find a blob at "a/b/c", we currently pass this to
our show_object_fn callbacks as two components: "a/b/" and
"c". Callbacks which want the full value then call
path_name(), which concatenates the two. But this is an
inefficient interface; the path is a strbuf, and we could
simply append "c" to it temporarily, then roll back the
length, without creating a new copy.
So we could improve this by teaching the callsites of
path_name() this trick (and there are only 3). But we can
also notice that no callback actually cares about the
broken-down representation, and simply pass each callback
the full path "a/b/c" as a string. The callback code becomes
even simpler, then, as we do not have to worry about freeing
an allocated buffer, nor rolling back our modification to
the strbuf.
This is theoretically less efficient, as some callbacks
would not bother to format the final path component. But in
practice this is not measurable. Since we use the same
strbuf over and over, our work to grow it is amortized, and
we really only pay to memcpy a few bytes.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-11 23:28:36 +01:00
|
|
|
static void show_object(struct object *obj, const char *name, void *cb_data)
|
2005-06-25 07:56:58 +02:00
|
|
|
{
|
2012-02-13 21:17:11 +01:00
|
|
|
struct rev_list_info *info = cb_data;
|
2020-02-14 19:22:20 +01:00
|
|
|
struct rev_info *revs = info->revs;
|
|
|
|
|
2017-12-08 16:27:15 +01:00
|
|
|
if (finish_object(obj, name, cb_data))
|
|
|
|
return;
|
2016-07-20 15:28:09 +02:00
|
|
|
display_progress(progress, ++progress_counter);
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
if (show_disk_usage)
|
|
|
|
total_disk_usage += get_object_disk_usage(obj);
|
2012-02-28 15:00:00 +01:00
|
|
|
if (info->flags & REV_LIST_QUIET)
|
|
|
|
return;
|
2020-02-14 19:22:20 +01:00
|
|
|
|
|
|
|
if (revs->count) {
|
2020-02-18 22:21:46 +01:00
|
|
|
/*
|
|
|
|
* The object count is always accumulated in the .count_right
|
|
|
|
* field for traversal that is not a left-right traversal,
|
|
|
|
* and cmd_rev_list() made sure that a .count request that
|
|
|
|
* wants to count non-commit objects, which is handled by
|
|
|
|
* the show_object() callback, does not ask for .left_right.
|
|
|
|
*/
|
2020-02-14 19:22:20 +01:00
|
|
|
revs->count_right++;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-06-19 22:56:56 +02:00
|
|
|
if (arg_show_object_names)
|
|
|
|
show_object_with_name(stdout, obj, name);
|
|
|
|
else
|
|
|
|
printf("%s\n", oid_to_hex(&obj->oid));
|
2005-06-25 07:56:58 +02:00
|
|
|
}
|
|
|
|
|
2006-09-06 10:42:23 +02:00
|
|
|
static void show_edge(struct commit *commit)
|
|
|
|
{
|
2015-11-10 03:22:28 +01:00
|
|
|
printf("-%s\n", oid_to_hex(&commit->object.oid));
|
2006-09-06 10:42:23 +02:00
|
|
|
}
|
|
|
|
|
2009-04-21 07:54:10 +02:00
|
|
|
/*
 * Emit a string-valued variable assignment on stdout as var='val'.
 *
 * The value is wrapped in single quotes as-is; no escaping is performed
 * on it here.
 */
static void print_var_str(const char *var, const char *val)
{
	fprintf(stdout, "%s='%s'\n", var, val);
}
|
|
|
|
|
2009-04-21 07:54:10 +02:00
|
|
|
/*
 * Emit an integer-valued variable assignment on stdout as var=val.
 */
static void print_var_int(const char *var, int val)
{
	fprintf(stdout, "%s=%d\n", var, val);
}
|
|
|
|
|
2010-01-12 07:21:18 +01:00
|
|
|
static int show_bisect_vars(struct rev_list_info *info, int reaches, int all)
|
2009-03-26 05:55:30 +01:00
|
|
|
{
|
2012-02-28 15:00:00 +01:00
|
|
|
int cnt, flags = info->flags;
|
2017-03-26 18:01:24 +02:00
|
|
|
char hex[GIT_MAX_HEXSZ + 1] = "";
|
2009-03-26 05:55:49 +01:00
|
|
|
struct commit_list *tried;
|
2009-04-06 22:28:00 +02:00
|
|
|
struct rev_info *revs = info->revs;
|
2009-03-26 05:55:30 +01:00
|
|
|
|
2012-02-28 14:59:59 +01:00
|
|
|
if (!revs->commits)
|
2009-03-26 05:55:30 +01:00
|
|
|
return 1;
|
|
|
|
|
2009-06-06 06:41:33 +02:00
|
|
|
revs->commits = filter_skipped(revs->commits, &tried,
|
|
|
|
flags & BISECT_SHOW_ALL,
|
|
|
|
NULL, NULL);
|
2009-03-26 05:55:49 +01:00
|
|
|
|
2009-03-26 05:55:30 +01:00
|
|
|
/*
|
2009-03-26 05:55:41 +01:00
|
|
|
* revs->commits can reach "reaches" commits among
|
2009-03-26 05:55:30 +01:00
|
|
|
* "all" commits. If it is good, then there are
|
|
|
|
* (all-reaches) commits left to be bisected.
|
|
|
|
* On the other hand, if it is bad, then the set
|
|
|
|
* to bisect is "reaches".
|
|
|
|
* A bisect set of size N has (N-1) commits further
|
|
|
|
* to test, as we already know one bad one.
|
|
|
|
*/
|
|
|
|
cnt = all - reaches;
|
|
|
|
if (cnt < reaches)
|
|
|
|
cnt = reaches;
|
2009-03-26 05:55:35 +01:00
|
|
|
|
2009-03-26 05:55:49 +01:00
|
|
|
if (revs->commits)
|
2017-01-28 23:03:03 +01:00
|
|
|
oid_to_hex_r(hex, &revs->commits->item->object.oid);
|
2009-03-26 05:55:30 +01:00
|
|
|
|
2009-03-29 11:55:43 +02:00
|
|
|
if (flags & BISECT_SHOW_ALL) {
|
2009-04-06 22:28:00 +02:00
|
|
|
traverse_commit_list(revs, show_commit, show_object, info);
|
2009-03-26 05:55:30 +01:00
|
|
|
printf("------\n");
|
|
|
|
}
|
|
|
|
|
2009-04-21 07:54:10 +02:00
|
|
|
print_var_str("bisect_rev", hex);
|
|
|
|
print_var_int("bisect_nr", cnt - 1);
|
|
|
|
print_var_int("bisect_good", all - reaches - 1);
|
|
|
|
print_var_int("bisect_bad", reaches - 1);
|
|
|
|
print_var_int("bisect_all", all);
|
|
|
|
print_var_int("bisect_steps", estimate_bisect_steps(all));
|
2009-03-26 05:55:30 +01:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only thing required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
static int show_object_fast(
|
2017-10-16 00:07:00 +02:00
|
|
|
const struct object_id *oid,
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only thing required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
enum object_type type,
|
|
|
|
int exclude,
|
|
|
|
uint32_t name_hash,
|
|
|
|
struct packed_git *found_pack,
|
|
|
|
off_t found_offset)
|
|
|
|
{
|
2017-10-16 00:07:00 +02:00
|
|
|
fprintf(stdout, "%s\n", oid_to_hex(oid));
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only thing required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2022-08-11 06:47:54 +02:00
|
|
|
static void print_disk_usage(off_t size)
|
|
|
|
{
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
if (human_readable)
|
|
|
|
strbuf_humanise_bytes(&sb, size);
|
|
|
|
else
|
|
|
|
strbuf_addf(&sb, "%"PRIuMAX, (uintmax_t)size);
|
|
|
|
puts(sb.buf);
|
|
|
|
strbuf_release(&sb);
|
|
|
|
}
|
|
|
|
|
2017-11-21 21:58:51 +01:00
|
|
|
static inline int parse_missing_action_value(const char *value)
|
|
|
|
{
|
|
|
|
if (!strcmp(value, "error")) {
|
|
|
|
arg_missing_action = MA_ERROR;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(value, "allow-any")) {
|
|
|
|
arg_missing_action = MA_ALLOW_ANY;
|
2017-12-08 16:27:15 +01:00
|
|
|
fetch_if_missing = 0;
|
2017-11-21 21:58:51 +01:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(value, "print")) {
|
|
|
|
arg_missing_action = MA_PRINT;
|
2017-12-08 16:27:15 +01:00
|
|
|
fetch_if_missing = 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(value, "allow-promisor")) {
|
|
|
|
arg_missing_action = MA_ALLOW_PROMISOR;
|
|
|
|
fetch_if_missing = 0;
|
2017-11-21 21:58:51 +01:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-02-14 19:22:32 +01:00
|
|
|
static int try_bitmap_count(struct rev_info *revs,
|
2021-04-19 13:47:06 +02:00
|
|
|
int filter_provided_objects)
|
2020-02-14 19:22:18 +01:00
|
|
|
{
|
2020-02-14 19:22:22 +01:00
|
|
|
uint32_t commit_count = 0,
|
|
|
|
tag_count = 0,
|
|
|
|
tree_count = 0,
|
|
|
|
blob_count = 0;
|
2020-02-14 19:22:18 +01:00
|
|
|
int max_count;
|
|
|
|
struct bitmap_index *bitmap_git;
|
|
|
|
|
|
|
|
/* This function only handles counting, not general traversal. */
|
|
|
|
if (!revs->count)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A bitmap result can't know left/right, etc, because we don't
|
|
|
|
* actually traverse.
|
|
|
|
*/
|
|
|
|
if (revs->left_right || revs->cherry_mark)
|
|
|
|
return -1;
|
|
|
|
|
2020-02-14 19:22:22 +01:00
|
|
|
/*
|
|
|
|
* If we're counting reachable objects, we can't handle a max count of
|
|
|
|
* commits to traverse, since we don't know which objects go with which
|
|
|
|
* commit.
|
|
|
|
*/
|
|
|
|
if (revs->max_count >= 0 &&
|
|
|
|
(revs->tag_objects || revs->tree_objects || revs->blob_objects))
|
|
|
|
return -1;
|
|
|
|
|
2020-02-14 19:22:18 +01:00
|
|
|
/*
|
|
|
|
* This must be saved before doing any walking, since the revision
|
|
|
|
* machinery will count it down to zero while traversing.
|
|
|
|
*/
|
|
|
|
max_count = revs->max_count;
|
|
|
|
|
2022-03-09 17:01:35 +01:00
|
|
|
bitmap_git = prepare_bitmap_walk(revs, filter_provided_objects);
|
2020-02-14 19:22:18 +01:00
|
|
|
if (!bitmap_git)
|
|
|
|
return -1;
|
|
|
|
|
2020-02-14 19:22:22 +01:00
|
|
|
count_bitmap_commit_list(bitmap_git, &commit_count,
|
|
|
|
revs->tree_objects ? &tree_count : NULL,
|
|
|
|
revs->blob_objects ? &blob_count : NULL,
|
|
|
|
revs->tag_objects ? &tag_count : NULL);
|
2020-02-14 19:22:18 +01:00
|
|
|
if (max_count >= 0 && max_count < commit_count)
|
|
|
|
commit_count = max_count;
|
|
|
|
|
2020-02-14 19:22:22 +01:00
|
|
|
printf("%d\n", commit_count + tree_count + blob_count + tag_count);
|
2020-02-14 19:22:18 +01:00
|
|
|
free_bitmap_index(bitmap_git);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-02-14 19:22:32 +01:00
|
|
|
static int try_bitmap_traversal(struct rev_info *revs,
|
2021-04-19 13:47:06 +02:00
|
|
|
int filter_provided_objects)
|
2020-02-14 19:22:18 +01:00
|
|
|
{
|
|
|
|
struct bitmap_index *bitmap_git;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can't use a bitmap result with a traversal limit, since the set
|
|
|
|
* of commits we'd get would be essentially random.
|
|
|
|
*/
|
|
|
|
if (revs->max_count >= 0)
|
|
|
|
return -1;
|
|
|
|
|
2022-03-09 17:01:35 +01:00
|
|
|
bitmap_git = prepare_bitmap_walk(revs, filter_provided_objects);
|
2020-02-14 19:22:18 +01:00
|
|
|
if (!bitmap_git)
|
|
|
|
return -1;
|
|
|
|
|
rev-list: allow commit-only bitmap traversals
Ever since we added reachability bitmap support, we've been able to use
it with rev-list to get the full list of objects, like:
git rev-list --objects --use-bitmap-index --all
But you can't do so without --objects, since we weren't ready to just
show the commits. However, the internals of the bitmap code are mostly
ready for this: they avoid opening up trees when walking to fill in the
bitmaps. We just need to actually pass in the rev_info to
traverse_bitmap_commit_list() so it knows which types to bother
triggering our callback for.
For completeness, the perf test now covers both the existing --objects
case, as well as the new commits-only behavior (the objects one got way
faster when we introduced bitmaps, but obviously isn't improved now).
Here are numbers for linux.git:
Test HEAD^ HEAD
------------------------------------------------------------------------
5310.7: rev-list (commits) 8.29(8.10+0.19) 1.76(1.72+0.04) -78.8%
5310.8: rev-list (objects) 8.06(7.94+0.12) 8.14(7.94+0.13) +1.0%
That run was cheating a little, as I didn't have any commit-graph in the
repository, and we'd built it by default these days when running git-gc.
Here are numbers with a commit-graph:
Test HEAD^ HEAD
------------------------------------------------------------------------
5310.7: rev-list (commits) 0.70(0.58+0.12) 0.51(0.46+0.04) -27.1%
5310.8: rev-list (objects) 6.20(6.09+0.10) 6.27(6.16+0.11) +1.1%
Still an improvement, but a lot less impressive.
We could have the perf script remove any commit-graph to show the
out-sized effect, but it probably makes sense to leave it in what would
be a more typical setup.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-14 19:22:27 +01:00
|
|
|
traverse_bitmap_commit_list(bitmap_git, revs, &show_object_fast);
|
2020-02-14 19:22:18 +01:00
|
|
|
free_bitmap_index(bitmap_git);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
static int try_bitmap_disk_usage(struct rev_info *revs,
|
2021-04-19 13:47:06 +02:00
|
|
|
int filter_provided_objects)
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
{
|
|
|
|
struct bitmap_index *bitmap_git;
|
2022-08-11 06:47:54 +02:00
|
|
|
off_t size_from_bitmap;
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
|
|
|
|
if (!show_disk_usage)
|
|
|
|
return -1;
|
|
|
|
|
2022-03-09 17:01:35 +01:00
|
|
|
bitmap_git = prepare_bitmap_walk(revs, filter_provided_objects);
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
if (!bitmap_git)
|
|
|
|
return -1;
|
|
|
|
|
2022-08-11 06:47:54 +02:00
|
|
|
size_from_bitmap = get_disk_usage_from_bitmap(bitmap_git, revs);
|
|
|
|
print_disk_usage(size_from_bitmap);
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-07-29 07:44:25 +02:00
|
|
|
int cmd_rev_list(int argc, const char **argv, const char *prefix)
|
2005-04-24 04:04:40 +02:00
|
|
|
{
|
2009-04-06 21:28:36 +02:00
|
|
|
struct rev_info revs;
|
2009-04-06 22:28:00 +02:00
|
|
|
struct rev_list_info info;
|
2018-12-03 23:10:19 +01:00
|
|
|
struct setup_revision_opt s_r_opt = {
|
|
|
|
.allow_exclude_promisor_objects = 1,
|
|
|
|
};
|
2006-02-27 17:54:36 +01:00
|
|
|
int i;
|
2009-03-26 05:55:17 +01:00
|
|
|
int bisect_list = 0;
|
2007-03-22 06:15:54 +01:00
|
|
|
int bisect_show_vars = 0;
|
2007-10-22 07:47:56 +02:00
|
|
|
int bisect_find_all = 0;
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only things required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
int use_bitmap_index = 0;
|
2021-04-19 13:47:06 +02:00
|
|
|
int filter_provided_objects = 0;
|
2016-07-20 15:28:09 +02:00
|
|
|
const char *show_progress = NULL;
|
2022-04-13 22:01:40 +02:00
|
|
|
int ret = 0;
|
2005-04-24 04:04:40 +02:00
|
|
|
|
2017-06-01 06:38:16 +02:00
|
|
|
if (argc == 2 && !strcmp(argv[1], "-h"))
|
|
|
|
usage(rev_list_usage);
|
|
|
|
|
2008-05-14 19:46:53 +02:00
|
|
|
git_config(git_default_config, NULL);
|
2018-09-21 17:57:38 +02:00
|
|
|
repo_init_revisions(the_repository, &revs, prefix);
|
2010-03-22 14:36:30 +01:00
|
|
|
revs.abbrev = DEFAULT_ABBREV;
|
2006-04-16 08:48:27 +02:00
|
|
|
revs.commit_format = CMIT_FMT_UNSPECIFIED;
|
rev-list: add option for --pretty=format without header
In general, we encourage users to use plumbing commands, like git
rev-list, over porcelain commands, like git log, when scripting.
However, git rev-list has one glaring problem that prevents it from
being used in certain cases: when --pretty is used with a custom format,
it always prints out a line containing "commit" and the object ID. This
makes it unsuitable for many scripting needs, and forces users to use
git log instead.
While we can't change this behavior for backwards compatibility, we can
add an option to suppress this behavior, so let's do so, and call it
"--no-commit-header". Additionally, add the corresponding positive
option to switch it back on.
Note that this option doesn't affect the built-in formats, only custom
formats. This is exactly the same behavior as users already have from
git log and is what most users will be used to.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-07-11 23:55:10 +02:00
|
|
|
revs.include_header = 1;
|
2017-12-08 16:27:15 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Scan the argument list before invoking setup_revisions(), so that we
|
|
|
|
* know if fetch_if_missing needs to be set to 0.
|
|
|
|
*
|
|
|
|
* "--exclude-promisor-objects" acts as a pre-filter on missing objects
|
|
|
|
* by not crossing the boundary from realized objects to promisor
|
|
|
|
* objects.
|
|
|
|
*
|
|
|
|
* Let "--missing" conditionally set fetch_if_missing.
|
|
|
|
*/
|
|
|
|
for (i = 1; i < argc; i++) {
|
|
|
|
const char *arg = argv[i];
|
|
|
|
if (!strcmp(arg, "--exclude-promisor-objects")) {
|
|
|
|
fetch_if_missing = 0;
|
|
|
|
revs.exclude_promisor_objects = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i = 1; i < argc; i++) {
|
|
|
|
const char *arg = argv[i];
|
|
|
|
if (skip_prefix(arg, "--missing=", &arg)) {
|
|
|
|
if (revs.exclude_promisor_objects)
|
2022-01-05 21:02:16 +01:00
|
|
|
die(_("options '%s' and '%s' cannot be used together"), "--exclude-promisor-objects", "--missing");
|
2017-12-08 16:27:15 +01:00
|
|
|
if (parse_missing_action_value(arg))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
rev-list: let traversal die when --missing is not in use
Commit 7c0fe330d5 (rev-list: handle missing tree objects properly,
2018-10-05) taught the traversal machinery used by git-rev-list to
ignore missing trees, so that rev-list could handle them itself.
However, it does so only by checking via oid_object_info_extended() that
the object exists at all. This can miss several classes of errors that
were previously detected by rev-list:
- type mismatches (e.g., we expected a tree but got a blob)
- failure to read the object data (e.g., due to bitrot on disk)
This is especially important because we use "rev-list --objects" as our
connectivity check to admit new objects to the repository, and it will
now miss these cases (though the bitrot one is less important here,
because we'd typically have just hashed and stored the object).
There are a few options to fix this:
1. we could check these properties in rev-list when we do the existence
check. This is probably too expensive in practice (perhaps even for
a type check, but definitely for checking the whole content again,
which implies loading each object into memory twice).
2. teach the traversal machinery to differentiate between a missing
object, and one that could not be loaded as expected. This probably
wouldn't be too hard to detect type mismatches, but detecting bitrot
versus a truly missing object would require deep changes to the
object-loading code.
3. have the traversal machinery communicate the failure to the caller,
so that it can decide how to proceed without re-evaluating the object
itself.
Of those, I think (3) is probably the best path forward. However, this
patch does none of them. In the name of expediently fixing the
regression to a normal "rev-list --objects" that we use for connectivity
checks, this simply restores the pre-7c0fe330d5 behavior of having the
traversal die as soon as it fails to load a tree (when --missing is set
to MA_ERROR, which is the default).
Note that we can't get rid of the object-existence check in
finish_object(), because this also handles blobs (which are not
otherwise checked at all by the traversal code).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-10 04:13:23 +02:00
|
|
|
if (arg_missing_action)
|
|
|
|
revs.do_not_die_on_missing_tree = 1;
|
|
|
|
|
2018-12-03 23:10:19 +01:00
|
|
|
argc = setup_revisions(argc, argv, &revs, &s_r_opt);
|
2006-02-26 01:19:46 +01:00
|
|
|
|
2009-04-06 22:28:00 +02:00
|
|
|
memset(&info, 0, sizeof(info));
|
|
|
|
info.revs = &revs;
|
2009-10-27 19:28:07 +01:00
|
|
|
if (revs.bisect)
|
|
|
|
bisect_list = 1;
|
2009-04-06 22:28:00 +02:00
|
|
|
|
2017-10-31 19:19:11 +01:00
|
|
|
if (revs.diffopt.flags.quick)
|
2012-02-28 15:00:00 +01:00
|
|
|
info.flags |= REV_LIST_QUIET;
|
2005-05-06 10:00:11 +02:00
|
|
|
for (i = 1 ; i < argc; i++) {
|
2005-10-21 06:25:09 +02:00
|
|
|
const char *arg = argv[i];
|
2005-05-06 10:00:11 +02:00
|
|
|
|
2005-05-26 03:29:09 +02:00
|
|
|
if (!strcmp(arg, "--header")) {
|
2006-04-16 08:48:27 +02:00
|
|
|
revs.verbose_header = 1;
|
2005-06-01 17:42:22 +02:00
|
|
|
continue;
|
|
|
|
}
|
2006-03-22 09:22:00 +01:00
|
|
|
if (!strcmp(arg, "--timestamp")) {
|
2009-04-06 22:28:00 +02:00
|
|
|
info.show_timestamp = 1;
|
2006-03-22 09:22:00 +01:00
|
|
|
continue;
|
|
|
|
}
|
2005-06-18 07:54:50 +02:00
|
|
|
if (!strcmp(arg, "--bisect")) {
|
|
|
|
bisect_list = 1;
|
|
|
|
continue;
|
|
|
|
}
|
2007-10-22 07:47:56 +02:00
|
|
|
if (!strcmp(arg, "--bisect-all")) {
|
|
|
|
bisect_list = 1;
|
|
|
|
bisect_find_all = 1;
|
2012-02-28 15:00:00 +01:00
|
|
|
info.flags |= BISECT_SHOW_ALL;
|
2009-02-08 15:54:47 +01:00
|
|
|
revs.show_decorations = 1;
|
2007-10-22 07:47:56 +02:00
|
|
|
continue;
|
|
|
|
}
|
2007-03-22 06:15:54 +01:00
|
|
|
if (!strcmp(arg, "--bisect-vars")) {
|
|
|
|
bisect_list = 1;
|
|
|
|
bisect_show_vars = 1;
|
|
|
|
continue;
|
|
|
|
}
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only things required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
if (!strcmp(arg, "--use-bitmap-index")) {
|
|
|
|
use_bitmap_index = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!strcmp(arg, "--test-bitmap")) {
|
|
|
|
test_bitmap_walk(&revs);
|
2022-04-13 22:01:40 +02:00
|
|
|
goto cleanup;
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only things required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
}
|
2016-07-20 15:28:09 +02:00
|
|
|
if (skip_prefix(arg, "--progress=", &arg)) {
|
|
|
|
show_progress = arg;
|
|
|
|
continue;
|
|
|
|
}
|
2021-04-19 13:47:06 +02:00
|
|
|
if (!strcmp(arg, "--filter-provided-objects")) {
|
|
|
|
filter_provided_objects = 1;
|
|
|
|
continue;
|
|
|
|
}
|
2017-11-21 21:58:51 +01:00
|
|
|
if (!strcmp(arg, "--filter-print-omitted")) {
|
|
|
|
arg_print_omitted = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2017-12-08 16:27:15 +01:00
|
|
|
if (!strcmp(arg, "--exclude-promisor-objects"))
|
|
|
|
continue; /* already handled above */
|
|
|
|
if (skip_prefix(arg, "--missing=", &arg))
|
|
|
|
continue; /* already handled above */
|
2017-11-21 21:58:51 +01:00
|
|
|
|
2019-06-19 22:56:56 +02:00
|
|
|
if (!strcmp(arg, ("--no-object-names"))) {
|
|
|
|
arg_show_object_names = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(arg, ("--object-names"))) {
|
|
|
|
arg_show_object_names = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
rev-list: add option for --pretty=format without header
In general, we encourage users to use plumbing commands, like git
rev-list, over porcelain commands, like git log, when scripting.
However, git rev-list has one glaring problem that prevents it from
being used in certain cases: when --pretty is used with a custom format,
it always prints out a line containing "commit" and the object ID. This
makes it unsuitable for many scripting needs, and forces users to use
git log instead.
While we can't change this behavior for backwards compatibility, we can
add an option to suppress this behavior, so let's do so, and call it
"--no-commit-header". Additionally, add the corresponding positive
option to switch it back on.
Note that this option doesn't affect the built-in formats, only custom
formats. This is exactly the same behavior as users already have from
git log and is what most users will be used to.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-07-11 23:55:10 +02:00
|
|
|
if (!strcmp(arg, ("--commit-header"))) {
|
|
|
|
revs.include_header = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(arg, ("--no-commit-header"))) {
|
|
|
|
revs.include_header = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2022-08-11 06:47:54 +02:00
|
|
|
if (skip_prefix(arg, "--disk-usage", &arg)) {
|
|
|
|
if (*arg == '=') {
|
|
|
|
if (!strcmp(++arg, "human")) {
|
|
|
|
human_readable = 1;
|
|
|
|
} else
|
|
|
|
die(_("invalid value for '%s': '%s', the only allowed format is '%s'"),
|
|
|
|
"--disk-usage=<format>", arg, "human");
|
|
|
|
} else if (*arg) {
|
|
|
|
/*
|
|
|
|
* Arguably should goto a label to continue chain of ifs?
|
|
|
|
* Doesn't matter unless we try to add --disk-usage-foo
|
|
|
|
* afterwards.
|
|
|
|
*/
|
|
|
|
usage(rev_list_usage);
|
|
|
|
}
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
show_disk_usage = 1;
|
|
|
|
info.flags |= REV_LIST_QUIET;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2006-02-26 01:19:46 +01:00
|
|
|
usage(rev_list_usage);
|
2005-05-26 03:29:09 +02:00
|
|
|
|
2005-05-06 10:00:11 +02:00
|
|
|
}
|
rev-list: add option for --pretty=format without header
In general, we encourage users to use plumbing commands, like git
rev-list, over porcelain commands, like git log, when scripting.
However, git rev-list has one glaring problem that prevents it from
being used in certain cases: when --pretty is used with a custom format,
it always prints out a line containing "commit" and the object ID. This
makes it unsuitable for many scripting needs, and forces users to use
git log instead.
While we can't change this behavior for backwards compatibility, we can
add an option to suppress this behavior, so let's do so, and call it
"--no-commit-header". Additionally, add the corresponding positive
option to switch it back on.
Note that this option doesn't affect the built-in formats, only custom
formats. This is exactly the same behavior as users already have from
git log and is what most users will be used to.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-07-11 23:55:10 +02:00
|
|
|
if (revs.commit_format != CMIT_FMT_USERFORMAT)
|
|
|
|
revs.include_header = 1;
|
2006-04-16 08:48:27 +02:00
|
|
|
if (revs.commit_format != CMIT_FMT_UNSPECIFIED) {
|
|
|
|
/* The command line has a --pretty */
|
2009-04-06 22:28:00 +02:00
|
|
|
info.hdr_termination = '\n';
|
rev-list: add option for --pretty=format without header
In general, we encourage users to use plumbing commands, like git
rev-list, over porcelain commands, like git log, when scripting.
However, git rev-list has one glaring problem that prevents it from
being used in certain cases: when --pretty is used with a custom format,
it always prints out a line containing "commit" and the object ID. This
makes it unsuitable for many scripting needs, and forces users to use
git log instead.
While we can't change this behavior for backwards compatibility, we can
add an option to suppress this behavior, so let's do so, and call it
"--no-commit-header". Additionally, add the corresponding positive
option to switch it back on.
Note that this option doesn't affect the built-in formats, only custom
formats. This is exactly the same behavior as users already have from
git log and is what most users will be used to.
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-07-11 23:55:10 +02:00
|
|
|
if (revs.commit_format == CMIT_FMT_ONELINE || !revs.include_header)
|
2009-04-06 22:28:00 +02:00
|
|
|
info.header_prefix = "";
|
2006-04-16 08:48:27 +02:00
|
|
|
else
|
2009-04-06 22:28:00 +02:00
|
|
|
info.header_prefix = "commit ";
|
2006-04-16 08:48:27 +02:00
|
|
|
}
|
2006-04-17 21:42:36 +02:00
|
|
|
else if (revs.verbose_header)
|
|
|
|
/* Only --header was specified */
|
|
|
|
revs.commit_format = CMIT_FMT_RAW;
|
2005-05-06 10:00:11 +02:00
|
|
|
|
2017-07-07 11:08:30 +02:00
|
|
|
if ((!revs.commits && reflog_walk_empty(revs.reflog_info) &&
|
2013-10-16 19:26:39 +02:00
|
|
|
(!(revs.tag_objects || revs.tree_objects || revs.blob_objects) &&
|
2017-08-03 00:26:06 +02:00
|
|
|
!revs.pending.nr) &&
|
rev-list: make empty --stdin not an error
When we originally did the series that contains 7ba826290a
(revision: add rev_input_given flag, 2017-08-02) the intent
was that "git rev-list --stdin </dev/null" would similarly
become a successful noop. However, an attempt at the time to
do that did not work[1]. The problem is that rev_input_given
serves two roles:
- it tells rev-list.c that it should not error out
- it tells revision.c that it should not have the "default"
ref kick (e.g., "HEAD" in "git log")
We want to trigger the former, but not the latter. This is
technically possible with a single flag, if we set the flag
only after revision.c's revs->def check. But this introduces
a rather subtle ordering dependency.
Instead, let's keep two flags: one to denote when we got
actual input (which triggers both roles) and one for when we
read stdin (which triggers only the first).
This does mean a caller interested in the first role has to
check both flags, but there's only one such caller. And any
future callers might want to make the distinction anyway
(e.g., if they care less about erroring out, and more about
whether revision.c soaked up our stdin).
In fact, we already keep such a flag internally in
revision.c for this purpose, so this is really just exposing
that to the caller (and the old function-local flag can go
away in favor of our new one).
[1] https://public-inbox.org/git/20170802223416.gwiezhbuxbdmbjzx@sigill.intra.peff.net/
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-08-22 23:37:23 +02:00
|
|
|
!revs.rev_input_given && !revs.read_from_stdin) ||
|
2006-04-15 07:43:34 +02:00
|
|
|
revs.diff)
|
2005-10-26 00:24:55 +02:00
|
|
|
usage(rev_list_usage);
|
|
|
|
|
2015-08-23 19:56:40 +02:00
|
|
|
if (revs.show_notes)
|
|
|
|
die(_("rev-list does not support display of notes"));
|
|
|
|
|
2020-02-14 19:22:20 +01:00
|
|
|
if (revs.count &&
|
|
|
|
(revs.tag_objects || revs.tree_objects || revs.blob_objects) &&
|
|
|
|
(revs.left_right || revs.cherry_mark))
|
2022-01-05 21:02:24 +01:00
|
|
|
die(_("marked counting and '%s' cannot be used together"), "--objects");
|
2020-02-14 19:22:20 +01:00
|
|
|
|
"log --author=me --grep=it" should find intersection, not union
Historically, grep filters in the "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always wants to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the cases the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
save_commit_buffer = (revs.verbose_header ||
|
|
|
|
revs.grep_filter.pattern_list ||
|
|
|
|
revs.grep_filter.header_list);
|
rev-list --bisect: limit list before bisecting.
I noticed bisect does not work well without both good and bad.
Running this script in git.git repository would give you quite
different results:
#!/bin/sh
initial=e83c5163316f89bfbde7d9ab23ca2e25604af290
mid0=`git rev-list --bisect ^$initial --all`
git rev-list $mid0 | wc -l
git rev-list ^$mid0 --all | wc -l
mid1=`git rev-list --bisect --all`
git rev-list $mid1 | wc -l
git rev-list ^$mid1 --all | wc -l
The $initial commit is the very first commit you made. The
first midpoint bisects things evenly as designed, but the latter
does not.
The reason I got interested in this was because I was wondering
if something like the following would help people converting a
huge repository from foreign SCM, or preparing a repository to
be fetched over plain dumb HTTP only:
#!/bin/sh
N=4
P=.git/objects/pack
bottom=
while test 0 \< $N
do
N=$((N-1))
if test -z "$bottom"
then
newbottom=`git rev-list --bisect --all`
else
newbottom=`git rev-list --bisect ^$bottom --all`
fi
if test -z "$bottom"
then
rev_list="$newbottom"
elif test 0 = $N
then
rev_list="^$bottom --all"
else
rev_list="^$bottom $newbottom"
fi
p=$(git rev-list --unpacked --objects $rev_list |
git pack-objects $P/pack)
git show-index <$P/pack-$p.idx | wc -l
bottom=$newbottom
done
The idea is to pack older half of the history to one pack, then
older half of the remaining history to another, to continue a
few times, using finer granularity as we get closer to the tip.
This may not matter, since for a truly huge history, running
bisect a number of times could be quite time consuming, and we
might be better off running "git rev-list --all" once into a
temporary file, and manually picking cut-off points from the
resulting list of commits. After all, we are talking about
"approximately half" for such a usage, and older history does
not matter much.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-04-15 00:57:32 +02:00
|
|
|
if (bisect_list)
|
|
|
|
revs.limited = 1;
|
2006-03-29 03:28:04 +02:00
|
|
|
|
2016-07-20 15:28:09 +02:00
|
|
|
if (show_progress)
|
progress: simplify "delayed" progress API
We used to expose the full power of the delayed progress API to the
callers, so that they could specify not just the message to show and the
expected total amount of work that is used to compute the percentage
of work performed so far, but also the percent-threshold parameter P and the
delay-seconds parameter N. The progress meter starts to show at N
seconds into the operation only if we have not yet completed P per-cent
of the total work.
Most callers used either (0%, 2s) or (50%, 1s) as (P, N), but there
are oddballs that chose more random-looking values like 95%.
For a smoother workload, (50%, 1s) would allow us to start showing
the progress meter earlier than (0%, 2s), while keeping the chance
of not showing progress meter for long running operation the same as
the latter. For a task that would take 2s or more to complete, it
is likely that less than half of it would complete within the first
second, if the workload is smooth. But for a spiky workload whose
earlier part is easier, such a setting is likely to fail to show the
progress meter entirely and (0%, 2s) is more appropriate.
But that is merely a theory. Realistically, it is of dubious value
to ask each codepath to carefully consider smoothness of their
workload and specify their own setting by passing two extra
parameters. Let's simplify the API by dropping both parameters and
have everybody use (0%, 2s).
Oh, by the way, the percent-threshold parameter and the structure
member were consistently misspelled, which also is now fixed ;-)
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-08-19 19:39:41 +02:00
|
|
|
progress = start_delayed_progress(show_progress, 0);
|
2016-07-20 15:28:09 +02:00
|
|
|
|
2020-02-14 19:22:16 +01:00
|
|
|
if (use_bitmap_index) {
|
2022-03-09 17:01:33 +01:00
|
|
|
if (!try_bitmap_count(&revs, filter_provided_objects))
|
2022-04-13 22:01:40 +02:00
|
|
|
goto cleanup;
|
2022-03-09 17:01:33 +01:00
|
|
|
if (!try_bitmap_disk_usage(&revs, filter_provided_objects))
|
2022-04-13 22:01:40 +02:00
|
|
|
goto cleanup;
|
2022-03-09 17:01:33 +01:00
|
|
|
if (!try_bitmap_traversal(&revs, filter_provided_objects))
|
2022-04-13 22:01:40 +02:00
|
|
|
goto cleanup;
|
rev-list: add bitmap mode to speed up object lists
The bitmap reachability index used to speed up the counting objects
phase during `pack-objects` can also be used to optimize a normal
rev-list if the only thing required are the SHA1s of the objects during
the list (i.e., not the path names at which trees and blobs were found).
Calling `git rev-list --objects --use-bitmap-index [committish]` will
perform an object iteration based on a bitmap result instead of actually
walking the object graph.
These are some example timings for `torvalds/linux` (warm cache,
best-of-five):
$ time git rev-list --objects master > /dev/null
real 0m34.191s
user 0m33.904s
sys 0m0.268s
$ time git rev-list --objects --use-bitmap-index master > /dev/null
real 0m1.041s
user 0m0.976s
sys 0m0.064s
Likewise, using `git rev-list --count --use-bitmap-index` will speed up
the counting operation by building the resulting bitmap and performing a
fast popcount (number of bits set on the bitmap) on the result.
Here are some sample timings of different ways to count commits in
`torvalds/linux`:
$ time git rev-list master | wc -l
399882
real 0m6.524s
user 0m6.060s
sys 0m3.284s
$ time git rev-list --count master
399882
real 0m4.318s
user 0m4.236s
sys 0m0.076s
$ time git rev-list --use-bitmap-index --count master
399882
real 0m0.217s
user 0m0.176s
sys 0m0.040s
This also respects negative refs, so you can use it to count
a slice of history:
$ time git rev-list --count v3.0..master
144843
real 0m1.971s
user 0m1.932s
sys 0m0.036s
$ time git rev-list --use-bitmap-index --count v3.0..master
real 0m0.280s
user 0m0.220s
sys 0m0.056s
Though note that the closer the endpoints, the less it helps. In the
traversal case, we have fewer commits to cross, so we take less time.
But the bitmap time is dominated by generating the pack revindex, which
is constant with respect to the refs given.
Note that you cannot yet get a fast --left-right count of a symmetric
difference (e.g., "--count --left-right master...topic"). The slow part
of that walk actually happens during the merge-base determination when
we parse "master...topic". Even though a count does not actually need to
know the real merge base (it only needs to take the symmetric difference
of the bitmaps), the revision code would require some refactoring to
handle this case.
Additionally, a `--test-bitmap` flag has been added that will perform
the same rev-list manually (i.e. using a normal revwalk) and using
bitmaps, and verify that the results are the same. This can be used to
exercise the bitmap code, and also to verify that the contents of the
.bitmap file are sane.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-12-21 15:00:12 +01:00
|
|
|
}
|
|
|
|
|
2008-02-18 08:31:56 +01:00
|
|
|
if (prepare_revision_walk(&revs))
|
|
|
|
die("revision walk setup failed");
|
2006-02-28 20:24:00 +01:00
|
|
|
if (revs.tree_objects)
|
2019-01-16 19:25:58 +01:00
|
|
|
mark_edges_uninteresting(&revs, show_edge, 0);
|
2006-02-28 20:24:00 +01:00
|
|
|
|
2007-03-22 06:15:54 +01:00
|
|
|
if (bisect_list) {
|
-Wuninitialized: remove some 'init-self' workarounds
The 'self-initialised' variables construct (ie <type> var = var;) has
been used to silence gcc '-W[maybe-]uninitialized' warnings. This has,
unfortunately, caused MSVC to issue 'uninitialized variable' warnings.
Also, using clang static analysis causes complaints about an 'Assigned
value is garbage or undefined'.
There are six such constructs in the current codebase. Only one of the
six causes gcc to issue a '-Wmaybe-uninitialized' warning (which will
be addressed elsewhere). The remaining five 'init-self' gcc workarounds
are noted below, along with the commit which introduced them:
1. builtin/rev-list.c: 'reaches' and 'all', see commit 457f08a030
("git-rev-list: add --bisect-vars option.", 2007-03-21).
2. merge-recursive.c:2064 'mrtree', see commit f120ae2a8e ("merge-
recursive.c: mrtree in merge() is not used before set", 2007-10-29).
3. fast-import.c:3023 'oe', see commit 85c62395b1 ("fast-import: let
importers retrieve blobs", 2010-11-28).
4. fast-import.c:3006 'oe', see commit 28c7b1f7b7 ("fast-import: add a
get-mark command", 2015-07-01).
Remove the 'self-initialised' variable constructs noted above.
Signed-off-by: Ramsay Jones <ramsay@ramsayjones.plus.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-03-19 18:54:35 +01:00
|
|
|
int reaches, all;
|
2020-08-07 23:58:38 +02:00
|
|
|
unsigned bisect_flags = 0;
|
2007-03-22 06:15:54 +01:00
|
|
|
|
2020-08-07 23:58:38 +02:00
|
|
|
if (bisect_find_all)
|
|
|
|
bisect_flags |= FIND_BISECTION_ALL;
|
|
|
|
|
|
|
|
if (revs.first_parent_only)
|
|
|
|
bisect_flags |= FIND_BISECTION_FIRST_PARENT_ONLY;
|
|
|
|
|
|
|
|
find_bisection(&revs.commits, &reaches, &all, bisect_flags);
|
2009-03-26 05:55:49 +01:00
|
|
|
|
2022-04-13 22:01:40 +02:00
|
|
|
if (bisect_show_vars) {
|
|
|
|
ret = show_bisect_vars(&info, reaches, all);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2007-03-22 06:15:54 +01:00
|
|
|
}
|
2005-10-26 00:24:55 +02:00
|
|
|
|
2021-04-19 13:47:06 +02:00
|
|
|
if (filter_provided_objects) {
|
|
|
|
struct commit_list *c;
|
|
|
|
for (i = 0; i < revs.pending.nr; i++) {
|
|
|
|
struct object_array_entry *pending = revs.pending.objects + i;
|
|
|
|
pending->item->flags |= NOT_USER_GIVEN;
|
|
|
|
}
|
|
|
|
for (c = revs.commits; c; c = c->next)
|
|
|
|
c->item->object.flags |= NOT_USER_GIVEN;
|
|
|
|
}
|
|
|
|
|
2017-11-21 21:58:51 +01:00
|
|
|
if (arg_print_omitted)
|
|
|
|
oidset_init(&omitted_objects, DEFAULT_OIDSET_SIZE);
|
|
|
|
if (arg_missing_action == MA_PRINT)
|
|
|
|
oidset_init(&missing_objects, DEFAULT_OIDSET_SIZE);
|
|
|
|
|
|
|
|
traverse_commit_list_filtered(
|
2022-03-09 17:01:36 +01:00
|
|
|
&revs, show_commit, show_object, &info,
|
2017-11-21 21:58:51 +01:00
|
|
|
(arg_print_omitted ? &omitted_objects : NULL));
|
|
|
|
|
|
|
|
if (arg_print_omitted) {
|
|
|
|
struct oidset_iter iter;
|
|
|
|
struct object_id *oid;
|
|
|
|
oidset_iter_init(&omitted_objects, &iter);
|
|
|
|
while ((oid = oidset_iter_next(&iter)))
|
|
|
|
printf("~%s\n", oid_to_hex(oid));
|
|
|
|
oidset_clear(&omitted_objects);
|
|
|
|
}
|
|
|
|
if (arg_missing_action == MA_PRINT) {
|
|
|
|
struct oidset_iter iter;
|
|
|
|
struct object_id *oid;
|
|
|
|
oidset_iter_init(&missing_objects, &iter);
|
|
|
|
while ((oid = oidset_iter_next(&iter)))
|
|
|
|
printf("?%s\n", oid_to_hex(oid));
|
|
|
|
oidset_clear(&missing_objects);
|
|
|
|
}
|
2005-05-31 03:46:32 +02:00
|
|
|
|
2016-07-20 15:28:09 +02:00
|
|
|
stop_progress(&progress);
|
|
|
|
|
2010-06-10 13:47:23 +02:00
|
|
|
if (revs.count) {
|
2011-04-26 10:24:29 +02:00
|
|
|
if (revs.left_right && revs.cherry_mark)
|
|
|
|
printf("%d\t%d\t%d\n", revs.count_left, revs.count_right, revs.count_same);
|
|
|
|
else if (revs.left_right)
|
2010-06-10 13:47:23 +02:00
|
|
|
printf("%d\t%d\n", revs.count_left, revs.count_right);
|
2011-04-26 10:24:29 +02:00
|
|
|
else if (revs.cherry_mark)
|
|
|
|
printf("%d\t%d\n", revs.count_left + revs.count_right, revs.count_same);
|
2010-06-10 13:47:23 +02:00
|
|
|
else
|
|
|
|
printf("%d\n", revs.count_left + revs.count_right);
|
|
|
|
}
|
|
|
|
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
if (show_disk_usage)
|
2022-08-11 06:47:54 +02:00
|
|
|
print_disk_usage(total_disk_usage);
|
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-09 11:53:50 +01:00
|
|
|
|
2022-04-13 22:01:40 +02:00
|
|
|
cleanup:
|
|
|
|
release_revisions(&revs);
|
|
|
|
return ret;
|
2005-04-24 04:04:40 +02:00
|
|
|
}
|