maintenance: add loose-objects task

One goal of background maintenance jobs is to allow a user to
disable auto-gc (gc.auto=0) but keep their repository in a clean
state. Without any cleanup, loose objects will clutter the object
database and slow operations. In addition, the loose objects will
take up extra space because they are not stored with deltas against
similar objects.

Create a 'loose-objects' task for the 'git maintenance run' command.
This helps clean up loose objects without disrupting concurrent Git
commands using the following sequence of events:

1. Run 'git prune-packed' to delete any loose objects that exist
   in a pack-file. Concurrent commands will prefer the packed
   version of the object to the loose version. (Of course, there
   are exceptions for commands that specifically care about the
   location of an object. These are rare for a user to run on
   purpose, and we hope a user that has selected background
   maintenance will not be trying to do foreground maintenance.)

2. Run 'git pack-objects' on a batch of loose objects. These
   objects are grouped by scanning the loose object directories in
   lexicographic order until listing all loose objects -or-
   reaching 50,000 objects. This is more than enough if the loose
   objects are created only by a user doing normal development.
   We noticed users with _millions_ of loose objects because VFS
   for Git downloads blobs on-demand when a file read operation
   requires populating a virtual file.

This step is based on a similar step in Scalar [1] and VFS for Git.
[1] https://github.com/microsoft/scalar/blob/master/Scalar.Common/Maintenance/LooseObjectsStep.cs

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Derrick Stolee 2020-09-25 12:33:32 +00:00 committed by Junio C Hamano
parent 28cb5e66dd
commit 252cfb7cb8
3 changed files with 151 additions and 0 deletions

View File

@ -70,6 +70,21 @@ gc::
be disruptive in some situations, as it deletes stale data. See be disruptive in some situations, as it deletes stale data. See
linkgit:git-gc[1] for more details on garbage collection in Git. linkgit:git-gc[1] for more details on garbage collection in Git.
loose-objects::
The `loose-objects` job cleans up loose objects and places them into
pack-files. In order to prevent race conditions with concurrent Git
commands, it follows a two-step process. First, it deletes any loose
objects that already exist in a pack-file; concurrent Git processes
will examine the pack-file for the object data instead of the loose
object. Second, it creates a new pack-file (starting with "loose-")
containing a batch of loose objects. The batch size is limited to 50
thousand objects to prevent the job from taking too long on a
repository with many loose objects. The `gc` task writes unreachable
objects as loose objects to be cleaned up by a later step only if
they are not re-added to a pack-file; for this reason it is not
advisable to enable both the `loose-objects` and `gc` tasks at the
same time.
OPTIONS OPTIONS
------- -------
--auto:: --auto::

View File

@ -880,6 +880,98 @@ static int maintenance_task_gc(struct maintenance_run_opts *opts)
return run_command(&child); return run_command(&child);
} }
static int prune_packed(struct maintenance_run_opts *opts)
{
struct child_process child = CHILD_PROCESS_INIT;
child.git_cmd = 1;
strvec_push(&child.args, "prune-packed");
if (opts->quiet)
strvec_push(&child.args, "--quiet");
return !!run_command(&child);
}
struct write_loose_object_data {
FILE *in;
int count;
int batch_size;
};
static int bail_on_loose(const struct object_id *oid,
const char *path,
void *data)
{
return 1;
}
static int write_loose_object_to_stdin(const struct object_id *oid,
const char *path,
void *data)
{
struct write_loose_object_data *d = (struct write_loose_object_data *)data;
fprintf(d->in, "%s\n", oid_to_hex(oid));
return ++(d->count) > d->batch_size;
}
static int pack_loose(struct maintenance_run_opts *opts)
{
struct repository *r = the_repository;
int result = 0;
struct write_loose_object_data data;
struct child_process pack_proc = CHILD_PROCESS_INIT;
/*
* Do not start pack-objects process
* if there are no loose objects.
*/
if (!for_each_loose_file_in_objdir(r->objects->odb->path,
bail_on_loose,
NULL, NULL, NULL))
return 0;
pack_proc.git_cmd = 1;
strvec_push(&pack_proc.args, "pack-objects");
if (opts->quiet)
strvec_push(&pack_proc.args, "--quiet");
strvec_pushf(&pack_proc.args, "%s/pack/loose", r->objects->odb->path);
pack_proc.in = -1;
if (start_command(&pack_proc)) {
error(_("failed to start 'git pack-objects' process"));
return 1;
}
data.in = xfdopen(pack_proc.in, "w");
data.count = 0;
data.batch_size = 50000;
for_each_loose_file_in_objdir(r->objects->odb->path,
write_loose_object_to_stdin,
NULL,
NULL,
&data);
fclose(data.in);
if (finish_command(&pack_proc)) {
error(_("failed to finish 'git pack-objects' process"));
result = 1;
}
return result;
}
static int maintenance_task_loose_objects(struct maintenance_run_opts *opts)
{
return prune_packed(opts) || pack_loose(opts);
}
typedef int maintenance_task_fn(struct maintenance_run_opts *opts); typedef int maintenance_task_fn(struct maintenance_run_opts *opts);
/* /*
@ -901,6 +993,7 @@ struct maintenance_task {
enum maintenance_task_label { enum maintenance_task_label {
TASK_PREFETCH, TASK_PREFETCH,
TASK_LOOSE_OBJECTS,
TASK_GC, TASK_GC,
TASK_COMMIT_GRAPH, TASK_COMMIT_GRAPH,
@ -913,6 +1006,10 @@ static struct maintenance_task tasks[] = {
"prefetch", "prefetch",
maintenance_task_prefetch, maintenance_task_prefetch,
}, },
[TASK_LOOSE_OBJECTS] = {
"loose-objects",
maintenance_task_loose_objects,
},
[TASK_GC] = { [TASK_GC] = {
"gc", "gc",
maintenance_task_gc, maintenance_task_gc,

View File

@ -88,4 +88,43 @@ test_expect_success 'prefetch multiple remotes' '
test_cmp_rev refs/remotes/remote2/two refs/prefetch/remote2/two test_cmp_rev refs/remotes/remote2/two refs/prefetch/remote2/two
' '
test_expect_success 'loose-objects task' '
# Repack everything so we know the state of the object dir
git repack -adk &&
# Hack to stop maintenance from running during "git commit"
echo in use >.git/objects/maintenance.lock &&
# Assuming that "git commit" creates at least one loose object
test_commit create-loose-object &&
rm .git/objects/maintenance.lock &&
ls .git/objects >obj-dir-before &&
test_file_not_empty obj-dir-before &&
ls .git/objects/pack/*.pack >packs-before &&
test_line_count = 1 packs-before &&
# The first run creates a pack-file
# but does not delete loose objects.
git maintenance run --task=loose-objects &&
ls .git/objects >obj-dir-between &&
test_cmp obj-dir-before obj-dir-between &&
ls .git/objects/pack/*.pack >packs-between &&
test_line_count = 2 packs-between &&
ls .git/objects/pack/loose-*.pack >loose-packs &&
test_line_count = 1 loose-packs &&
# The second run deletes loose objects
# but does not create a pack-file.
git maintenance run --task=loose-objects &&
ls .git/objects >obj-dir-after &&
cat >expect <<-\EOF &&
info
pack
EOF
test_cmp expect obj-dir-after &&
ls .git/objects/pack/*.pack >packs-after &&
test_cmp packs-between packs-after
'
test_done test_done