2013-09-15 17:33:20 +02:00
|
|
|
#include "builtin.h"
|
|
|
|
#include "cache.h"
|
2017-06-14 20:07:36 +02:00
|
|
|
#include "config.h"
|
2013-09-15 17:33:20 +02:00
|
|
|
#include "dir.h"
|
|
|
|
#include "parse-options.h"
|
|
|
|
#include "run-command.h"
|
|
|
|
#include "sigchain.h"
|
|
|
|
#include "strbuf.h"
|
|
|
|
#include "string-list.h"
|
2020-07-28 22:23:39 +02:00
|
|
|
#include "strvec.h"
|
2018-07-12 21:39:40 +02:00
|
|
|
#include "midx.h"
|
2018-08-09 00:34:06 +02:00
|
|
|
#include "packfile.h"
|
2020-03-24 02:07:52 +01:00
|
|
|
#include "prune-packed.h"
|
2018-08-20 20:33:55 +02:00
|
|
|
#include "object-store.h"
|
2019-06-25 15:40:31 +02:00
|
|
|
#include "promisor-remote.h"
|
2020-04-30 21:48:50 +02:00
|
|
|
#include "shallow.h"
|
2021-01-12 09:21:59 +01:00
|
|
|
#include "pack.h"
|
2013-09-15 17:33:20 +02:00
|
|
|
|
|
|
|
static int delta_base_offset = 1;
|
repack: add `repack.packKeptObjects` config var
The git-repack command always passes `--honor-pack-keep`
to pack-objects. This has traditionally been a good thing,
as we do not want to duplicate those objects in a new pack,
and we are not going to delete the old pack.
However, when bitmaps are in use, it is important for a full
repack to include all reachable objects, even if they may be
duplicated in a .keep pack. Otherwise, we cannot generate
the bitmaps, as the on-disk format requires the set of
objects in the pack to be fully closed.
Even if the repository does not generally have .keep files,
a simultaneous push could cause a race condition in which a
.keep file exists at the moment of a repack. The repack may
try to include those objects in one of two situations:
1. The pushed .keep pack contains objects that were
already in the repository (e.g., blobs due to a revert of
an old commit).
2. Receive-pack updates the refs, making the objects
reachable, but before it removes the .keep file, the
repack runs.
In either case, we may prefer to duplicate some objects in
the new, full pack, and let the next repack (after the .keep
file is cleaned up) take care of removing them.
This patch introduces both a command-line and config option
to disable the `--honor-pack-keep` option. By default, it
is triggered when pack.writeBitmaps (or `--write-bitmap-index`
is turned on), but specifying it explicitly can override the
behavior (e.g., in cases where you prefer .keep files to
bitmaps, but only when they are present).
Note that this option just disables the pack-objects
behavior. We still leave packs with a .keep in place, as we
do not necessarily know that we have duplicated all of their
objects.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-03 21:04:20 +01:00
|
|
|
static int pack_kept_objects = -1;
|
2019-03-14 10:12:54 +01:00
|
|
|
static int write_bitmaps = -1;
|
2018-08-16 08:13:10 +02:00
|
|
|
static int use_delta_islands;
|
2013-09-15 17:33:20 +02:00
|
|
|
static char *packdir, *packtmp;
|
|
|
|
|
|
|
|
static const char *const git_repack_usage[] = {
|
2015-01-13 08:44:47 +01:00
|
|
|
N_("git repack [<options>]"),
|
2013-09-15 17:33:20 +02:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-12-28 23:45:42 +01:00
|
|
|
static const char incremental_bitmap_conflict_error[] = N_(
|
|
|
|
"Incremental repacks are incompatible with bitmap indexes. Use\n"
|
|
|
|
"--no-write-bitmap-index or disable the pack.writebitmaps configuration."
|
|
|
|
);
|
|
|
|
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
static int repack_config(const char *var, const char *value, void *cb)
|
|
|
|
{
|
|
|
|
if (!strcmp(var, "repack.usedeltabaseoffset")) {
|
|
|
|
delta_base_offset = git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
repack: add `repack.packKeptObjects` config var
The git-repack command always passes `--honor-pack-keep`
to pack-objects. This has traditionally been a good thing,
as we do not want to duplicate those objects in a new pack,
and we are not going to delete the old pack.
However, when bitmaps are in use, it is important for a full
repack to include all reachable objects, even if they may be
duplicated in a .keep pack. Otherwise, we cannot generate
the bitmaps, as the on-disk format requires the set of
objects in the pack to be fully closed.
Even if the repository does not generally have .keep files,
a simultaneous push could cause a race condition in which a
.keep file exists at the moment of a repack. The repack may
try to include those objects in one of two situations:
1. The pushed .keep pack contains objects that were
already in the repository (e.g., blobs due to a revert of
an old commit).
2. Receive-pack updates the refs, making the objects
reachable, but before it removes the .keep file, the
repack runs.
In either case, we may prefer to duplicate some objects in
the new, full pack, and let the next repack (after the .keep
file is cleaned up) take care of removing them.
This patch introduces both a command-line and config option
to disable the `--honor-pack-keep` option. By default, it
is triggered when pack.writeBitmaps (or `--write-bitmap-index`
is turned on), but specifying it explicitly can override the
behavior (e.g., in cases where you prefer .keep files to
bitmaps, but only when they are present).
Note that this option just disables the pack-objects
behavior. We still leave packs with a .keep in place, as we
do not necessarily know that we have duplicated all of their
objects.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-03 21:04:20 +01:00
|
|
|
if (!strcmp(var, "repack.packkeptobjects")) {
|
|
|
|
pack_kept_objects = git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
2014-06-10 22:20:30 +02:00
|
|
|
if (!strcmp(var, "repack.writebitmaps") ||
|
|
|
|
!strcmp(var, "pack.writebitmaps")) {
|
2014-06-10 22:10:07 +02:00
|
|
|
write_bitmaps = git_config_bool(var, value);
|
2014-06-10 22:09:23 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2018-08-16 08:13:10 +02:00
|
|
|
if (!strcmp(var, "repack.usedeltaislands")) {
|
|
|
|
use_delta_islands = git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
2013-09-15 17:33:20 +02:00
|
|
|
return git_default_config(var, value, cb);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove temporary $GIT_OBJECT_DIRECTORY/pack/.tmp-$$-pack-* files.
|
|
|
|
*/
|
|
|
|
static void remove_temporary_files(void)
|
|
|
|
{
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
|
|
|
size_t dirlen, prefixlen;
|
|
|
|
DIR *dir;
|
|
|
|
struct dirent *e;
|
|
|
|
|
|
|
|
dir = opendir(packdir);
|
|
|
|
if (!dir)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Point at the slash at the end of ".../objects/pack/" */
|
|
|
|
dirlen = strlen(packdir) + 1;
|
|
|
|
strbuf_addstr(&buf, packtmp);
|
|
|
|
/* Hold the length of ".tmp-%d-pack-" */
|
|
|
|
prefixlen = buf.len - dirlen;
|
|
|
|
|
|
|
|
while ((e = readdir(dir))) {
|
|
|
|
if (strncmp(e->d_name, buf.buf + dirlen, prefixlen))
|
|
|
|
continue;
|
|
|
|
strbuf_setlen(&buf, dirlen);
|
|
|
|
strbuf_addstr(&buf, e->d_name);
|
|
|
|
unlink(buf.buf);
|
|
|
|
}
|
|
|
|
closedir(dir);
|
|
|
|
strbuf_release(&buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void remove_pack_on_signal(int signo)
|
|
|
|
{
|
|
|
|
remove_temporary_files();
|
|
|
|
sigchain_pop(signo);
|
|
|
|
raise(signo);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Adds all packs hex strings to the fname list, which do not
|
2018-08-09 00:34:06 +02:00
|
|
|
* have a corresponding .keep file. These packs are not to
|
2017-12-08 16:27:16 +01:00
|
|
|
* be kept if we are going to pack everything into one file.
|
2013-09-15 17:33:20 +02:00
|
|
|
*/
|
2018-04-15 17:36:13 +02:00
|
|
|
static void get_non_kept_pack_filenames(struct string_list *fname_list,
|
|
|
|
const struct string_list *extra_keep)
|
2013-09-15 17:33:20 +02:00
|
|
|
{
|
|
|
|
DIR *dir;
|
|
|
|
struct dirent *e;
|
|
|
|
char *fname;
|
|
|
|
|
|
|
|
if (!(dir = opendir(packdir)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
while ((e = readdir(dir)) != NULL) {
|
2014-06-30 18:58:51 +02:00
|
|
|
size_t len;
|
2018-04-15 17:36:13 +02:00
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < extra_keep->nr; i++)
|
|
|
|
if (!fspathcmp(e->d_name, extra_keep->items[i].string))
|
|
|
|
break;
|
|
|
|
if (extra_keep->nr > 0 && i < extra_keep->nr)
|
|
|
|
continue;
|
|
|
|
|
2014-06-30 18:58:51 +02:00
|
|
|
if (!strip_suffix(e->d_name, ".pack", &len))
|
2013-09-15 17:33:20 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
fname = xmemdupz(e->d_name, len);
|
|
|
|
|
2018-08-09 00:34:06 +02:00
|
|
|
if (!file_exists(mkpath("%s/%s.keep", packdir, fname)))
|
2013-09-15 17:33:20 +02:00
|
|
|
string_list_append_nodup(fname_list, fname);
|
|
|
|
else
|
|
|
|
free(fname);
|
|
|
|
}
|
|
|
|
closedir(dir);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void remove_redundant_pack(const char *dir_name, const char *base_name)
|
|
|
|
{
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
midx: traverse the local MIDX first
When a repository has an alternate object directory configured, callers
can traverse through each alternate's MIDX by walking the '->next'
pointer.
But, when 'prepare_multi_pack_index_one()' loads multiple MIDXs, it
places the new ones at the front of this pointer chain, not at the end.
This can be confusing for callers such as 'git repack -ad', causing test
failures like in t7700.6 with 'GIT_TEST_MULTI_PACK_INDEX=1'.
The occurs when dropping a pack known to the local MIDX with alternates
configured that have their own MIDX. Since the alternate's MIDX is
returned via 'get_multi_pack_index()', 'midx_contains_pack()' returns
true (which is correct, since it traverses through the '->next' pointer
to find the MIDX in the chain that does contain the requested object).
But, we call 'clear_midx_file()' on 'the_repository', which drops the
MIDX at the path of the first MIDX in the chain, which (in the case of
t7700.6 is the one in the alternate).
This patch addresses that by:
- placing the local MIDX first in the chain when calling
'prepare_multi_pack_index_one()', and
- introducing a new 'get_local_multi_pack_index()', which explicitly
returns the repository-local MIDX, if any.
Don't impose an additional order on the MIDX's '->next' pointer beyond
that the first item in the chain must be local if one exists so that we
avoid a quadratic insertion.
Likewise, use 'get_local_multi_pack_index()' in
'remove_redundant_pack()' to fix the formerly broken t7700.6 when run
with 'GIT_TEST_MULTI_PACK_INDEX=1'.
Finally, note that the MIDX ordering invariant is only preserved by the
insertion order in 'prepare_packed_git()', which traverses through the
ODB's '->next' pointer, meaning we visit the local object store first.
This fragility makes this an undesirable long-term solution if more
callers are added, but it is acceptable for now since this is the only
caller.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-28 22:22:13 +02:00
|
|
|
struct multi_pack_index *m = get_local_multi_pack_index(the_repository);
|
2020-08-25 18:04:36 +02:00
|
|
|
strbuf_addf(&buf, "%s.pack", base_name);
|
|
|
|
if (m && midx_contains_pack(m, buf.buf))
|
|
|
|
clear_midx_file(the_repository);
|
|
|
|
strbuf_insertf(&buf, 0, "%s/", dir_name);
|
2019-06-11 01:35:22 +02:00
|
|
|
unlink_pack_path(buf.buf, 1);
|
2013-09-15 17:33:20 +02:00
|
|
|
strbuf_release(&buf);
|
|
|
|
}
|
|
|
|
|
2018-08-09 00:34:05 +02:00
|
|
|
struct pack_objects_args {
|
|
|
|
const char *window;
|
|
|
|
const char *window_memory;
|
|
|
|
const char *depth;
|
|
|
|
const char *threads;
|
|
|
|
const char *max_pack_size;
|
|
|
|
int no_reuse_delta;
|
|
|
|
int no_reuse_object;
|
|
|
|
int quiet;
|
|
|
|
int local;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void prepare_pack_objects(struct child_process *cmd,
|
|
|
|
const struct pack_objects_args *args)
|
|
|
|
{
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd->args, "pack-objects");
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->window)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--window=%s", args->window);
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->window_memory)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--window-memory=%s", args->window_memory);
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->depth)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--depth=%s", args->depth);
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->threads)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--threads=%s", args->threads);
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->max_pack_size)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--max-pack-size=%s", args->max_pack_size);
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->no_reuse_delta)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--no-reuse-delta");
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->no_reuse_object)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd->args, "--no-reuse-object");
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->local)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd->args, "--local");
|
2018-08-09 00:34:05 +02:00
|
|
|
if (args->quiet)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd->args, "--quiet");
|
2018-08-09 00:34:05 +02:00
|
|
|
if (delta_base_offset)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd->args, "--delta-base-offset");
|
|
|
|
strvec_push(&cmd->args, packtmp);
|
2018-08-09 00:34:05 +02:00
|
|
|
cmd->git_cmd = 1;
|
|
|
|
cmd->out = -1;
|
|
|
|
}
|
|
|
|
|
2018-08-09 00:34:06 +02:00
|
|
|
/*
|
|
|
|
* Write oid to the given struct child_process's stdin, starting it first if
|
|
|
|
* necessary.
|
|
|
|
*/
|
|
|
|
static int write_oid(const struct object_id *oid, struct packed_git *pack,
|
|
|
|
uint32_t pos, void *data)
|
|
|
|
{
|
|
|
|
struct child_process *cmd = data;
|
|
|
|
|
|
|
|
if (cmd->in == -1) {
|
|
|
|
if (start_command(cmd))
|
2018-11-10 06:16:10 +01:00
|
|
|
die(_("could not start pack-objects to repack promisor objects"));
|
2018-08-09 00:34:06 +02:00
|
|
|
}
|
|
|
|
|
2019-08-18 22:04:18 +02:00
|
|
|
xwrite(cmd->in, oid_to_hex(oid), the_hash_algo->hexsz);
|
2018-08-09 00:34:06 +02:00
|
|
|
xwrite(cmd->in, "\n", 1);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-11-16 19:41:12 +01:00
|
|
|
static struct {
|
|
|
|
const char *name;
|
|
|
|
unsigned optional:1;
|
|
|
|
} exts[] = {
|
|
|
|
{".pack"},
|
|
|
|
{".idx"},
|
packfile: prepare for the existence of '*.rev' files
Specify the format of the on-disk reverse index 'pack-*.rev' file, as
well as prepare the code for the existence of such files.
The reverse index maps from pack relative positions (i.e., an index into
the array of object which is sorted by their offsets within the
packfile) to their position within the 'pack-*.idx' file. Today, this is
done by building up a list of (off_t, uint32_t) tuples for each object
(the off_t corresponding to that object's offset, and the uint32_t
corresponding to its position in the index). To convert between pack and
index position quickly, this array of tuples is radix sorted based on
its offset.
This has two major drawbacks:
First, the in-memory cost scales linearly with the number of objects in
a pack. Each 'struct revindex_entry' is sizeof(off_t) +
sizeof(uint32_t) + padding bytes for a total of 16.
To observe this, force Git to load the reverse index by, for e.g.,
running 'git cat-file --batch-check="%(objectsize:disk)"'. When asking
for a single object in a fresh clone of the kernel, Git needs to
allocate 120+ MB of memory in order to hold the reverse index in memory.
Second, the cost to sort also scales with the size of the pack.
Luckily, this is a linear function since 'load_pack_revindex()' uses a
radix sort, but this cost still must be paid once per pack per process.
As an example, it takes ~60x longer to print the _size_ of an object as
it does to print that entire object's _contents_:
Benchmark #1: git.compile cat-file --batch <obj
Time (mean ± σ): 3.4 ms ± 0.1 ms [User: 3.3 ms, System: 2.1 ms]
Range (min … max): 3.2 ms … 3.7 ms 726 runs
Benchmark #2: git.compile cat-file --batch-check="%(objectsize:disk)" <obj
Time (mean ± σ): 210.3 ms ± 8.9 ms [User: 188.2 ms, System: 23.2 ms]
Range (min … max): 193.7 ms … 224.4 ms 13 runs
Instead, avoid computing and sorting the revindex once per process by
writing it to a file when the pack itself is generated.
The format is relatively straightforward. It contains an array of
uint32_t's, the length of which is equal to the number of objects in the
pack. The ith entry in this table contains the index position of the
ith object in the pack, where "ith object in the pack" is determined by
pack offset.
One thing that the on-disk format does _not_ contain is the full (up to)
eight-byte offset corresponding to each object. This is something that
the in-memory revindex contains (it stores an off_t in 'struct
revindex_entry' along with the same uint32_t that the on-disk format
has). Omit it in the on-disk format, since knowing the index position
for some object is sufficient to get a constant-time lookup in the
pack-*.idx file to ask for an object's offset within the pack.
This trades off between the on-disk size of the 'pack-*.rev' file for
runtime to chase down the offset for some object. Even though the lookup
is constant time, the constant is heavier, since it can potentially
involve two pointer walks in v2 indexes (one to access the 4-byte offset
table, and potentially a second to access the double wide offset table).
Consider trying to map an object's pack offset to a relative position
within that pack. In a cold-cache scenario, more page faults occur while
switching between binary searching through the reverse index and
searching through the *.idx file for an object's offset. Sure enough,
with a cold cache (writing '3' into '/proc/sys/vm/drop_caches' after
'sync'ing), printing out the entire object's contents is still
marginally faster than printing its size:
Benchmark #1: git.compile cat-file --batch-check="%(objectsize:disk)" <obj >/dev/null
Time (mean ± σ): 22.6 ms ± 0.5 ms [User: 2.4 ms, System: 7.9 ms]
Range (min … max): 21.4 ms … 23.5 ms 41 runs
Benchmark #2: git.compile cat-file --batch <obj >/dev/null
Time (mean ± σ): 17.2 ms ± 0.7 ms [User: 2.8 ms, System: 5.5 ms]
Range (min … max): 15.6 ms … 18.2 ms 45 runs
(Numbers taken in the kernel after cheating and using the next patch to
generate a reverse index). There are a couple of approaches to improve
cold cache performance not pursued here:
- We could include the object offsets in the reverse index format.
Predictably, this does result in fewer page faults, but it triples
the size of the file, while simultaneously duplicating a ton of data
already available in the .idx file. (This was the original way I
implemented the format, and it did show
`--batch-check='%(objectsize:disk)'` winning out against `--batch`.)
On the other hand, this increase in size also results in a large
block-cache footprint, which could potentially hurt other workloads.
- We could store the mapping from pack to index position in more
cache-friendly way, like constructing a binary search tree from the
table and writing the values in breadth-first order. This would
result in much better locality, but the price you pay is trading
O(1) lookup in 'pack_pos_to_index()' for an O(log n) one (since you
can no longer directly index the table).
So, neither of these approaches are taken here. (Thankfully, the format
is versioned, so we are free to pursue these in the future.) But, cold
cache performance likely isn't interesting outside of one-off cases like
asking for the size of an object directly. In real-world usage, Git is
often performing many operations in the revindex (i.e., asking about
many objects rather than a single one).
The trade-off is worth it, since we will avoid the vast majority of the
cost of generating the revindex that the extra pointer chase will look
like noise in the following patch's benchmarks.
This patch describes the format and prepares callers (like in
pack-revindex.c) to be able to read *.rev files once they exist. An
implementation of the writer will appear in the next patch, and callers
will gradually begin to start using the writer in the patches that
follow after that.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-26 00:37:14 +01:00
|
|
|
{".rev", 1},
|
2020-11-16 19:41:12 +01:00
|
|
|
{".bitmap", 1},
|
|
|
|
{".promisor", 1},
|
|
|
|
};
|
|
|
|
|
2020-11-16 19:41:17 +01:00
|
|
|
static unsigned populate_pack_exts(char *name)
|
|
|
|
{
|
|
|
|
struct stat statbuf;
|
|
|
|
struct strbuf path = STRBUF_INIT;
|
|
|
|
unsigned ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(exts); i++) {
|
|
|
|
strbuf_reset(&path);
|
|
|
|
strbuf_addf(&path, "%s-%s%s", packtmp, name, exts[i].name);
|
|
|
|
|
|
|
|
if (stat(path.buf, &statbuf))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ret |= (1 << i);
|
|
|
|
}
|
|
|
|
|
|
|
|
strbuf_release(&path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-08-09 00:34:06 +02:00
|
|
|
static void repack_promisor_objects(const struct pack_objects_args *args,
|
|
|
|
struct string_list *names)
|
|
|
|
{
|
|
|
|
struct child_process cmd = CHILD_PROCESS_INIT;
|
|
|
|
FILE *out;
|
|
|
|
struct strbuf line = STRBUF_INIT;
|
|
|
|
|
|
|
|
prepare_pack_objects(&cmd, args);
|
|
|
|
cmd.in = -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NEEDSWORK: Giving pack-objects only the OIDs without any ordering
|
|
|
|
* hints may result in suboptimal deltas in the resulting pack. See if
|
|
|
|
* the OIDs can be sent with fake paths such that pack-objects can use a
|
|
|
|
* {type -> existing pack order} ordering when computing deltas instead
|
|
|
|
* of a {type -> size} ordering, which may produce better deltas.
|
|
|
|
*/
|
|
|
|
for_each_packed_object(write_oid, &cmd,
|
|
|
|
FOR_EACH_OBJECT_PROMISOR_ONLY);
|
|
|
|
|
|
|
|
if (cmd.in == -1)
|
|
|
|
/* No packed objects; cmd was never started */
|
|
|
|
return;
|
|
|
|
|
|
|
|
close(cmd.in);
|
|
|
|
|
|
|
|
out = xfdopen(cmd.out, "r");
|
|
|
|
while (strbuf_getline_lf(&line, out) != EOF) {
|
2020-11-16 19:41:17 +01:00
|
|
|
struct string_list_item *item;
|
2018-08-09 00:34:06 +02:00
|
|
|
char *promisor_name;
|
2021-01-12 09:21:59 +01:00
|
|
|
|
2018-10-15 02:01:50 +02:00
|
|
|
if (line.len != the_hash_algo->hexsz)
|
2019-01-04 22:33:31 +01:00
|
|
|
die(_("repack: Expecting full hex object ID lines only from pack-objects."));
|
2020-11-16 19:41:17 +01:00
|
|
|
item = string_list_append(names, line.buf);
|
2018-08-09 00:34:06 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* pack-objects creates the .pack and .idx files, but not the
|
|
|
|
* .promisor file. Create the .promisor file, which is empty.
|
2019-10-15 02:12:31 +02:00
|
|
|
*
|
|
|
|
* NEEDSWORK: fetch-pack sometimes generates non-empty
|
|
|
|
* .promisor files containing the ref names and associated
|
|
|
|
* hashes at the point of generation of the corresponding
|
|
|
|
* packfile, but this would not preserve their contents. Maybe
|
|
|
|
* concatenate the contents of all .promisor files instead of
|
|
|
|
* just creating a new empty file.
|
2018-08-09 00:34:06 +02:00
|
|
|
*/
|
|
|
|
promisor_name = mkpathdup("%s-%s.promisor", packtmp,
|
|
|
|
line.buf);
|
2021-01-12 09:21:59 +01:00
|
|
|
write_promisor_file(promisor_name, NULL, 0);
|
2020-11-16 19:41:17 +01:00
|
|
|
|
|
|
|
item->util = (void *)(uintptr_t)populate_pack_exts(item->string);
|
|
|
|
|
2018-08-09 00:34:06 +02:00
|
|
|
free(promisor_name);
|
|
|
|
}
|
|
|
|
fclose(out);
|
|
|
|
if (finish_command(&cmd))
|
2018-11-10 06:16:10 +01:00
|
|
|
die(_("could not finish pack-objects to repack promisor objects"));
|
2018-08-09 00:34:06 +02:00
|
|
|
}
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
#define ALL_INTO_ONE 1
|
|
|
|
#define LOOSEN_UNREACHABLE 2
|
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
struct pack_geometry {
|
|
|
|
struct packed_git **pack;
|
|
|
|
uint32_t pack_nr, pack_alloc;
|
|
|
|
uint32_t split;
|
|
|
|
};
|
|
|
|
|
|
|
|
static uint32_t geometry_pack_weight(struct packed_git *p)
|
|
|
|
{
|
|
|
|
if (open_pack_index(p))
|
|
|
|
die(_("cannot open index for %s"), p->pack_name);
|
|
|
|
return p->num_objects;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int geometry_cmp(const void *va, const void *vb)
|
|
|
|
{
|
|
|
|
uint32_t aw = geometry_pack_weight(*(struct packed_git **)va),
|
|
|
|
bw = geometry_pack_weight(*(struct packed_git **)vb);
|
|
|
|
|
|
|
|
if (aw < bw)
|
|
|
|
return -1;
|
|
|
|
if (aw > bw)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void init_pack_geometry(struct pack_geometry **geometry_p)
|
|
|
|
{
|
|
|
|
struct packed_git *p;
|
|
|
|
struct pack_geometry *geometry;
|
|
|
|
|
|
|
|
*geometry_p = xcalloc(1, sizeof(struct pack_geometry));
|
|
|
|
geometry = *geometry_p;
|
|
|
|
|
|
|
|
for (p = get_all_packs(the_repository); p; p = p->next) {
|
|
|
|
if (!pack_kept_objects && p->pack_keep)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ALLOC_GROW(geometry->pack,
|
|
|
|
geometry->pack_nr + 1,
|
|
|
|
geometry->pack_alloc);
|
|
|
|
|
|
|
|
geometry->pack[geometry->pack_nr] = p;
|
|
|
|
geometry->pack_nr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
QSORT(geometry->pack, geometry->pack_nr, geometry_cmp);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void split_pack_geometry(struct pack_geometry *geometry, int factor)
|
|
|
|
{
|
|
|
|
uint32_t i;
|
|
|
|
uint32_t split;
|
|
|
|
off_t total_size = 0;
|
|
|
|
|
2021-03-05 16:21:37 +01:00
|
|
|
if (!geometry->pack_nr) {
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
geometry->split = geometry->pack_nr;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First, count the number of packs (in descending order of size) which
|
|
|
|
* already form a geometric progression.
|
|
|
|
*/
|
|
|
|
for (i = geometry->pack_nr - 1; i > 0; i--) {
|
|
|
|
struct packed_git *ours = geometry->pack[i];
|
|
|
|
struct packed_git *prev = geometry->pack[i - 1];
|
2021-03-05 16:21:56 +01:00
|
|
|
|
|
|
|
if (unsigned_mult_overflows(factor, geometry_pack_weight(prev)))
|
|
|
|
die(_("pack %s too large to consider in geometric "
|
|
|
|
"progression"),
|
|
|
|
prev->pack_name);
|
|
|
|
|
2021-03-05 16:21:50 +01:00
|
|
|
if (geometry_pack_weight(ours) < factor * geometry_pack_weight(prev))
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2021-03-05 16:21:50 +01:00
|
|
|
split = i;
|
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
if (split) {
|
|
|
|
/*
|
|
|
|
* Move the split one to the right, since the top element in the
|
|
|
|
* last-compared pair can't be in the progression. Only do this
|
|
|
|
* when we split in the middle of the array (otherwise if we got
|
|
|
|
* to the end, then the split is in the right place).
|
|
|
|
*/
|
|
|
|
split++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Then, anything to the left of 'split' must be in a new pack. But,
|
|
|
|
* creating that new pack may cause packs in the heavy half to no longer
|
|
|
|
* form a geometric progression.
|
|
|
|
*
|
|
|
|
* Compute an expected size of the new pack, and then determine how many
|
|
|
|
* packs in the heavy half need to be joined into it (if any) to restore
|
|
|
|
* the geometric progression.
|
|
|
|
*/
|
2021-03-05 16:21:56 +01:00
|
|
|
for (i = 0; i < split; i++) {
|
|
|
|
struct packed_git *p = geometry->pack[i];
|
|
|
|
|
|
|
|
if (unsigned_add_overflows(total_size, geometry_pack_weight(p)))
|
|
|
|
die(_("pack %s too large to roll up"), p->pack_name);
|
|
|
|
total_size += geometry_pack_weight(p);
|
|
|
|
}
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
for (i = split; i < geometry->pack_nr; i++) {
|
|
|
|
struct packed_git *ours = geometry->pack[i];
|
2021-03-05 16:21:56 +01:00
|
|
|
|
|
|
|
if (unsigned_mult_overflows(factor, total_size))
|
|
|
|
die(_("pack %s too large to roll up"), ours->pack_name);
|
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
if (geometry_pack_weight(ours) < factor * total_size) {
|
2021-03-05 16:21:56 +01:00
|
|
|
if (unsigned_add_overflows(total_size,
|
|
|
|
geometry_pack_weight(ours)))
|
|
|
|
die(_("pack %s too large to roll up"),
|
|
|
|
ours->pack_name);
|
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
split++;
|
|
|
|
total_size += geometry_pack_weight(ours);
|
|
|
|
} else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
geometry->split = split;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void clear_pack_geometry(struct pack_geometry *geometry)
|
|
|
|
{
|
|
|
|
if (!geometry)
|
|
|
|
return;
|
|
|
|
|
|
|
|
free(geometry->pack);
|
|
|
|
geometry->pack_nr = 0;
|
|
|
|
geometry->pack_alloc = 0;
|
|
|
|
geometry->split = 0;
|
|
|
|
}
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
int cmd_repack(int argc, const char **argv, const char *prefix)
|
|
|
|
{
|
2014-08-19 21:09:35 +02:00
|
|
|
struct child_process cmd = CHILD_PROCESS_INIT;
|
2013-09-15 17:33:20 +02:00
|
|
|
struct string_list_item *item;
|
|
|
|
struct string_list names = STRING_LIST_INIT_DUP;
|
|
|
|
struct string_list rollback = STRING_LIST_INIT_NODUP;
|
|
|
|
struct string_list existing_packs = STRING_LIST_INIT_DUP;
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
struct pack_geometry *geometry = NULL;
|
2013-09-15 17:33:20 +02:00
|
|
|
struct strbuf line = STRBUF_INIT;
|
builtin/repack.c: don't move existing packs out of the way
When 'git repack' creates a pack with the same name as any existing
pack, it moves the existing one to 'old-pack-xxx.{pack,idx,...}' and
then renames the new one into place.
Eventually, it would be nice to have 'git repack' allow for writing a
multi-pack index at the critical time (after the new packs have been
written / moved into place, but before the old ones have been deleted).
Guessing that this option might be called '--write-midx', this makes the
following situation (where repacks are issued back-to-back without any
new objects) impossible:
$ git repack -adb
$ git repack -adb --write-midx
In the second repack, the existing packs are overwritten verbatim with
the same rename-to-old sequence. At that point, the current MIDX is
invalidated, since it refers to now-missing packs. So that code wants to
be run after the MIDX is re-written. But (prior to this patch) the new
MIDX can't be written until the new packs are moved into place. So, we
have a circular dependency.
This is all hypothetical, since no code currently exists to write a MIDX
safely during a 'git repack' (the 'GIT_TEST_MULTI_PACK_INDEX' does so
unsafely). Putting hypothetical aside, though: why do we need to rename
existing packs to be prefixed with 'old-' anyway?
This behavior dates all the way back to 2ad47d6 (git-repack: Be
careful when updating the same pack as an existing one., 2006-06-25).
2ad47d6 is mainly concerned about a case where a newly written pack
would have a different structure than its index. This used to be
possible when the pack name was a hash of the set of objects. Under this
naming scheme, two packs that store the same set of objects could differ
in delta selection, object positioning, or both. If this happened, then
any such packs would be unreadable in the instant between copying the
new pack and new index (i.e., either the index or pack will be stale
depending on the order that they were copied).
But since 1190a1a (pack-objects: name pack files after trailer hash,
2013-12-05), this is no longer possible, since pack files are named not
after their logical contents (i.e., the set of objects), but by the
actual checksum of their contents. So, this old- behavior can safely go,
which allows us to avoid our circular dependency above.
In addition to avoiding the circular dependency, this patch also makes
'git repack' a lot simpler, since we don't have to deal with failures
encountered when renaming existing packs to be prefixed with 'old-'.
This patch is mostly limited to removing code paths that deal with the
'old' prefixing, with the exception of files that include the pack's
name in their own filename, like .idx, .bitmap, and related files. The
exception is that we want to continue to trust what pack-objects wrote.
That is, it is not the case that we pretend as if pack-objects didn't
write files identical to ones that already exist, but rather that we
respect what pack-objects wrote as the source of truth. That cuts two
ways:
- If pack-objects produced an identical pack to one that already
exists with a bitmap, but did not produce a bitmap, we remove the
bitmap that already exists. (This behavior is codified in t7700.14).
- If pack-objects produced an identical pack to one that already
exists, we trust the just-written version of the coresponding .idx,
.promisor, and other files over the ones that already exist. This
ensures that we use the most up-to-date versions of this files,
which is safe even in the face of format changes in, say, the .idx
file (which would not be reflected in the .idx file's name).
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-11-17 21:15:16 +01:00
|
|
|
int i, ext, ret;
|
2013-09-15 17:33:20 +02:00
|
|
|
FILE *out;
|
|
|
|
|
|
|
|
/* variables to be filled by option parsing */
|
|
|
|
int pack_everything = 0;
|
|
|
|
int delete_redundant = 0;
|
2014-01-23 02:28:30 +01:00
|
|
|
const char *unpack_unreachable = NULL;
|
repack: add --keep-unreachable option
The usual way to do a full repack (and what is done by
git-gc) is to run "repack -Ad --unpack-unreachable=<when>",
which will loosen any unreachable objects newer than
"<when>", and drop any older ones.
This is a safer alternative to "repack -ad", because
"<when>" becomes a grace period during which we will not
drop any new objects that are about to be referenced.
However, it isn't perfectly safe. It's always possible that
a process is about to reference an old object. Even if that
process were to take care to update the timestamp on the
object, there is no atomicity with a simultaneously running
"repack" process.
So while unlikely, there is a small race wherein we may drop
an object that is in the process of being referenced. If you
do automated repacking on a large number of active
repositories, you may hit it eventually, and the result is a
corrupted repository.
It would be nice to fix that race in the long run, but it's
complicated. In the meantime, there is a much simpler
strategy for automated repository maintenance: do not drop
objects at all. We already have a "--keep-unreachable"
option in pack-objects; we just need to plumb it through
from git-repack.
Note that this _isn't_ plumbed through from git-gc, so at
this point it's strictly a tool for people doing their own
advanced repository maintenance strategy.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-06-13 06:36:28 +02:00
|
|
|
int keep_unreachable = 0;
|
2018-04-15 17:36:13 +02:00
|
|
|
struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
|
2013-09-15 17:33:20 +02:00
|
|
|
int no_update_server_info = 0;
|
2018-08-09 00:34:05 +02:00
|
|
|
struct pack_objects_args po_args = {NULL};
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
int geometric_factor = 0;
|
2013-09-15 17:33:20 +02:00
|
|
|
|
|
|
|
struct option builtin_repack_options[] = {
|
|
|
|
OPT_BIT('a', NULL, &pack_everything,
|
|
|
|
N_("pack everything in a single pack"), ALL_INTO_ONE),
|
|
|
|
OPT_BIT('A', NULL, &pack_everything,
|
|
|
|
N_("same as -a, and turn unreachable objects loose"),
|
|
|
|
LOOSEN_UNREACHABLE | ALL_INTO_ONE),
|
|
|
|
OPT_BOOL('d', NULL, &delete_redundant,
|
|
|
|
N_("remove redundant packs, and run git-prune-packed")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_BOOL('f', NULL, &po_args.no_reuse_delta,
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("pass --no-reuse-delta to git-pack-objects")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("pass --no-reuse-object to git-pack-objects")),
|
|
|
|
OPT_BOOL('n', NULL, &no_update_server_info,
|
|
|
|
N_("do not run git-update-server-info")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT__QUIET(&po_args.quiet, N_("be quiet")),
|
|
|
|
OPT_BOOL('l', "local", &po_args.local,
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("pass --local to git-pack-objects")),
|
2014-06-10 22:10:07 +02:00
|
|
|
OPT_BOOL('b', "write-bitmap-index", &write_bitmaps,
|
2013-12-21 15:00:31 +01:00
|
|
|
N_("write bitmap index")),
|
2018-08-16 08:13:10 +02:00
|
|
|
OPT_BOOL('i', "delta-islands", &use_delta_islands,
|
|
|
|
N_("pass --delta-islands to git-pack-objects")),
|
2013-09-15 17:33:20 +02:00
|
|
|
OPT_STRING(0, "unpack-unreachable", &unpack_unreachable, N_("approxidate"),
|
|
|
|
N_("with -A, do not loosen objects older than this")),
|
repack: add --keep-unreachable option
The usual way to do a full repack (and what is done by
git-gc) is to run "repack -Ad --unpack-unreachable=<when>",
which will loosen any unreachable objects newer than
"<when>", and drop any older ones.
This is a safer alternative to "repack -ad", because
"<when>" becomes a grace period during which we will not
drop any new objects that are about to be referenced.
However, it isn't perfectly safe. It's always possible that
a process is about to reference an old object. Even if that
process were to take care to update the timestamp on the
object, there is no atomicity with a simultaneously running
"repack" process.
So while unlikely, there is a small race wherein we may drop
an object that is in the process of being referenced. If you
do automated repacking on a large number of active
repositories, you may hit it eventually, and the result is a
corrupted repository.
It would be nice to fix that race in the long run, but it's
complicated. In the meantime, there is a much simpler
strategy for automated repository maintenance: do not drop
objects at all. We already have a "--keep-unreachable"
option in pack-objects; we just need to plumb it through
from git-repack.
Note that this _isn't_ plumbed through from git-gc, so at
this point it's strictly a tool for people doing their own
advanced repository maintenance strategy.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-06-13 06:36:28 +02:00
|
|
|
OPT_BOOL('k', "keep-unreachable", &keep_unreachable,
|
|
|
|
N_("with -a, repack unreachable objects")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_STRING(0, "window", &po_args.window, N_("n"),
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("size of the window used for delta compression")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_STRING(0, "window-memory", &po_args.window_memory, N_("bytes"),
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("same as the above, but limit memory size instead of entries count")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_STRING(0, "depth", &po_args.depth, N_("n"),
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("limits the maximum delta depth")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_STRING(0, "threads", &po_args.threads, N_("n"),
|
2017-04-27 01:09:25 +02:00
|
|
|
N_("limits the maximum number of threads")),
|
2018-08-09 00:34:05 +02:00
|
|
|
OPT_STRING(0, "max-pack-size", &po_args.max_pack_size, N_("bytes"),
|
2013-09-15 17:33:20 +02:00
|
|
|
N_("maximum size of each packfile")),
|
repack: add `repack.packKeptObjects` config var
The git-repack command always passes `--honor-pack-keep`
to pack-objects. This has traditionally been a good thing,
as we do not want to duplicate those objects in a new pack,
and we are not going to delete the old pack.
However, when bitmaps are in use, it is important for a full
repack to include all reachable objects, even if they may be
duplicated in a .keep pack. Otherwise, we cannot generate
the bitmaps, as the on-disk format requires the set of
objects in the pack to be fully closed.
Even if the repository does not generally have .keep files,
a simultaneous push could cause a race condition in which a
.keep file exists at the moment of a repack. The repack may
try to include those objects in one of two situations:
1. The pushed .keep pack contains objects that were
already in the repository (e.g., blobs due to a revert of
an old commit).
2. Receive-pack updates the refs, making the objects
reachable, but before it removes the .keep file, the
repack runs.
In either case, we may prefer to duplicate some objects in
the new, full pack, and let the next repack (after the .keep
file is cleaned up) take care of removing them.
This patch introduces both a command-line and config option
to disable the `--honor-pack-keep` option. By default, it
is triggered when pack.writeBitmaps (or `--write-bitmap-index`
is turned on), but specifying it explicitly can override the
behavior (e.g., in cases where you prefer .keep files to
bitmaps, but only when they are present).
Note that this option just disables the pack-objects
behavior. We still leave packs with a .keep in place, as we
do not necessarily know that we have duplicated all of their
objects.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-03 21:04:20 +01:00
|
|
|
OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects,
|
|
|
|
N_("repack objects in packs marked with .keep")),
|
2018-04-15 17:36:13 +02:00
|
|
|
OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
|
|
|
|
N_("do not repack this pack")),
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
OPT_INTEGER('g', "geometric", &geometric_factor,
|
|
|
|
N_("find a geometric progression with factor <N>")),
|
2013-09-15 17:33:20 +02:00
|
|
|
OPT_END()
|
|
|
|
};
|
|
|
|
|
|
|
|
git_config(repack_config, NULL);
|
|
|
|
|
|
|
|
argc = parse_options(argc, argv, prefix, builtin_repack_options,
|
|
|
|
git_repack_usage, 0);
|
|
|
|
|
2015-06-23 12:54:11 +02:00
|
|
|
if (delete_redundant && repository_format_precious_objects)
|
|
|
|
die(_("cannot delete packs in a precious-objects repo"));
|
|
|
|
|
repack: add --keep-unreachable option
The usual way to do a full repack (and what is done by
git-gc) is to run "repack -Ad --unpack-unreachable=<when>",
which will loosen any unreachable objects newer than
"<when>", and drop any older ones.
This is a safer alternative to "repack -ad", because
"<when>" becomes a grace period during which we will not
drop any new objects that are about to be referenced.
However, it isn't perfectly safe. It's always possible that
a process is about to reference an old object. Even if that
process were to take care to update the timestamp on the
object, there is no atomicity with a simultaneously running
"repack" process.
So while unlikely, there is a small race wherein we may drop
an object that is in the process of being referenced. If you
do automated repacking on a large number of active
repositories, you may hit it eventually, and the result is a
corrupted repository.
It would be nice to fix that race in the long run, but it's
complicated. In the meantime, there is a much simpler
strategy for automated repository maintenance: do not drop
objects at all. We already have a "--keep-unreachable"
option in pack-objects; we just need to plumb it through
from git-repack.
Note that this _isn't_ plumbed through from git-gc, so at
this point it's strictly a tool for people doing their own
advanced repository maintenance strategy.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-06-13 06:36:28 +02:00
|
|
|
if (keep_unreachable &&
|
|
|
|
(unpack_unreachable || (pack_everything & LOOSEN_UNREACHABLE)))
|
|
|
|
die(_("--keep-unreachable and -A are incompatible"));
|
|
|
|
|
2019-06-29 21:13:59 +02:00
|
|
|
if (write_bitmaps < 0) {
|
2019-07-31 07:39:27 +02:00
|
|
|
if (!(pack_everything & ALL_INTO_ONE) ||
|
2019-07-31 07:40:56 +02:00
|
|
|
!is_bare_repository())
|
2019-07-31 07:39:27 +02:00
|
|
|
write_bitmaps = 0;
|
2019-06-29 21:13:59 +02:00
|
|
|
}
|
repack: add `repack.packKeptObjects` config var
The git-repack command always passes `--honor-pack-keep`
to pack-objects. This has traditionally been a good thing,
as we do not want to duplicate those objects in a new pack,
and we are not going to delete the old pack.
However, when bitmaps are in use, it is important for a full
repack to include all reachable objects, even if they may be
duplicated in a .keep pack. Otherwise, we cannot generate
the bitmaps, as the on-disk format requires the set of
objects in the pack to be fully closed.
Even if the repository does not generally have .keep files,
a simultaneous push could cause a race condition in which a
.keep file exists at the moment of a repack. The repack may
try to include those objects in one of two situations:
1. The pushed .keep pack contains objects that were
already in the repository (e.g., blobs due to a revert of
an old commit).
2. Receive-pack updates the refs, making the objects
reachable, but before it removes the .keep file, the
repack runs.
In either case, we may prefer to duplicate some objects in
the new, full pack, and let the next repack (after the .keep
file is cleaned up) take care of removing them.
This patch introduces both a command-line and config option
to disable the `--honor-pack-keep` option. By default, it
is triggered when pack.writeBitmaps (or `--write-bitmap-index`
is turned on), but specifying it explicitly can override the
behavior (e.g., in cases where you prefer .keep files to
bitmaps, but only when they are present).
Note that this option just disables the pack-objects
behavior. We still leave packs with a .keep in place, as we
do not necessarily know that we have duplicated all of their
objects.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-03 21:04:20 +01:00
|
|
|
if (pack_kept_objects < 0)
|
2019-07-31 07:40:56 +02:00
|
|
|
pack_kept_objects = write_bitmaps > 0;
|
repack: add `repack.packKeptObjects` config var
The git-repack command always passes `--honor-pack-keep`
to pack-objects. This has traditionally been a good thing,
as we do not want to duplicate those objects in a new pack,
and we are not going to delete the old pack.
However, when bitmaps are in use, it is important for a full
repack to include all reachable objects, even if they may be
duplicated in a .keep pack. Otherwise, we cannot generate
the bitmaps, as the on-disk format requires the set of
objects in the pack to be fully closed.
Even if the repository does not generally have .keep files,
a simultaneous push could cause a race condition in which a
.keep file exists at the moment of a repack. The repack may
try to include those objects in one of two situations:
1. The pushed .keep pack contains objects that were
already in the repository (e.g., blobs due to a revert of
an old commit).
2. Receive-pack updates the refs, making the objects
reachable, but before it removes the .keep file, the
repack runs.
In either case, we may prefer to duplicate some objects in
the new, full pack, and let the next repack (after the .keep
file is cleaned up) take care of removing them.
This patch introduces both a command-line and config option
to disable the `--honor-pack-keep` option. By default, it
is triggered when pack.writeBitmaps (or `--write-bitmap-index`
is turned on), but specifying it explicitly can override the
behavior (e.g., in cases where you prefer .keep files to
bitmaps, but only when they are present).
Note that this option just disables the pack-objects
behavior. We still leave packs with a .keep in place, as we
do not necessarily know that we have duplicated all of their
objects.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-03 21:04:20 +01:00
|
|
|
|
2016-12-28 23:45:42 +01:00
|
|
|
if (write_bitmaps && !(pack_everything & ALL_INTO_ONE))
|
|
|
|
die(_(incremental_bitmap_conflict_error));
|
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
if (geometric_factor) {
|
|
|
|
if (pack_everything)
|
|
|
|
die(_("--geometric is incompatible with -A, -a"));
|
|
|
|
init_pack_geometry(&geometry);
|
|
|
|
split_pack_geometry(geometry, geometric_factor);
|
|
|
|
}
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
packdir = mkpathdup("%s/pack", get_object_directory());
|
|
|
|
packtmp = mkpathdup("%s/.tmp-%d-pack", packdir, (int)getpid());
|
|
|
|
|
|
|
|
sigchain_push_common(remove_pack_on_signal);
|
|
|
|
|
2018-08-09 00:34:05 +02:00
|
|
|
prepare_pack_objects(&cmd, &po_args);
|
|
|
|
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--keep-true-parents");
|
repack: add `repack.packKeptObjects` config var
The git-repack command always passes `--honor-pack-keep`
to pack-objects. This has traditionally been a good thing,
as we do not want to duplicate those objects in a new pack,
and we are not going to delete the old pack.
However, when bitmaps are in use, it is important for a full
repack to include all reachable objects, even if they may be
duplicated in a .keep pack. Otherwise, we cannot generate
the bitmaps, as the on-disk format requires the set of
objects in the pack to be fully closed.
Even if the repository does not generally have .keep files,
a simultaneous push could cause a race condition in which a
.keep file exists at the moment of a repack. The repack may
try to include those objects in one of two situations:
1. The pushed .keep pack contains objects that were
already in the repository (e.g., blobs due to a revert of
an old commit).
2. Receive-pack updates the refs, making the objects
reachable, but before it removes the .keep file, the
repack runs.
In either case, we may prefer to duplicate some objects in
the new, full pack, and let the next repack (after the .keep
file is cleaned up) take care of removing them.
This patch introduces both a command-line and config option
to disable the `--honor-pack-keep` option. By default, it
is triggered when pack.writeBitmaps (or `--write-bitmap-index`
is turned on), but specifying it explicitly can override the
behavior (e.g., in cases where you prefer .keep files to
bitmaps, but only when they are present).
Note that this option just disables the pack-objects
behavior. We still leave packs with a .keep in place, as we
do not necessarily know that we have duplicated all of their
objects.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-03 21:04:20 +01:00
|
|
|
if (!pack_kept_objects)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--honor-pack-keep");
|
2018-04-15 17:36:13 +02:00
|
|
|
for (i = 0; i < keep_pack_list.nr; i++)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd.args, "--keep-pack=%s",
|
strvec: fix indentation in renamed calls
Code which split an argv_array call across multiple lines, like:
argv_array_pushl(&args, "one argument",
"another argument", "and more",
NULL);
was recently mechanically renamed to use strvec, which results in
mis-matched indentation like:
strvec_pushl(&args, "one argument",
"another argument", "and more",
NULL);
Let's fix these up to align the arguments with the opening paren. I did
this manually by sifting through the results of:
git jump grep 'strvec_.*,$'
and liberally applying my editor's auto-format. Most of the changes are
of the form shown above, though I also normalized a few that had
originally used a single-tab indentation (rather than our usual style of
aligning with the open paren). I also rewrapped a couple of obvious
cases (e.g., where previously too-long lines became short enough to fit
on one), but I wasn't aggressive about it. In cases broken to three or
more lines, the grouping of arguments is sometimes meaningful, and it
wasn't worth my time or reviewer time to ponder each case individually.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-07-28 22:26:31 +02:00
|
|
|
keep_pack_list.items[i].string);
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--non-empty");
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
if (!geometry) {
|
|
|
|
/*
|
2021-03-05 16:22:02 +01:00
|
|
|
* We need to grab all reachable objects, including those that
|
|
|
|
* are reachable from reflogs and the index.
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
*
|
2021-03-05 16:22:02 +01:00
|
|
|
* When repacking into a geometric progression of packs,
|
|
|
|
* however, we ask 'git pack-objects --stdin-packs', and it is
|
|
|
|
* not about packing objects based on reachability but about
|
|
|
|
* repacking all the objects in specified packs and loose ones
|
|
|
|
* (indeed, --stdin-packs is incompatible with these options).
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
*/
|
|
|
|
strvec_push(&cmd.args, "--all");
|
|
|
|
strvec_push(&cmd.args, "--reflog");
|
|
|
|
strvec_push(&cmd.args, "--indexed-objects");
|
|
|
|
}
|
2019-06-25 15:40:31 +02:00
|
|
|
if (has_promisor_remote())
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--exclude-promisor-objects");
|
2019-07-31 07:39:27 +02:00
|
|
|
if (write_bitmaps > 0)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--write-bitmap-index");
|
2019-07-31 07:39:27 +02:00
|
|
|
else if (write_bitmaps < 0)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--write-bitmap-index-quiet");
|
2018-08-16 08:13:10 +02:00
|
|
|
if (use_delta_islands)
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--delta-islands");
|
2013-09-15 17:33:20 +02:00
|
|
|
|
|
|
|
if (pack_everything & ALL_INTO_ONE) {
|
2018-04-15 17:36:13 +02:00
|
|
|
get_non_kept_pack_filenames(&existing_packs, &keep_pack_list);
|
2013-09-15 17:33:20 +02:00
|
|
|
|
2018-08-09 00:34:06 +02:00
|
|
|
repack_promisor_objects(&po_args, &names);
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
if (existing_packs.nr && delete_redundant) {
|
2015-03-20 19:43:13 +01:00
|
|
|
if (unpack_unreachable) {
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_pushf(&cmd.args,
|
strvec: fix indentation in renamed calls
Code which split an argv_array call across multiple lines, like:
argv_array_pushl(&args, "one argument",
"another argument", "and more",
NULL);
was recently mechanically renamed to use strvec, which results in
mis-matched indentation like:
strvec_pushl(&args, "one argument",
"another argument", "and more",
NULL);
Let's fix these up to align the arguments with the opening paren. I did
this manually by sifting through the results of:
git jump grep 'strvec_.*,$'
and liberally applying my editor's auto-format. Most of the changes are
of the form shown above, though I also normalized a few that had
originally used a single-tab indentation (rather than our usual style of
aligning with the open paren). I also rewrapped a couple of obvious
cases (e.g., where previously too-long lines became short enough to fit
on one), but I wasn't aggressive about it. In cases broken to three or
more lines, the grouping of arguments is sometimes meaningful, and it
wasn't worth my time or reviewer time to ponder each case individually.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-07-28 22:26:31 +02:00
|
|
|
"--unpack-unreachable=%s",
|
|
|
|
unpack_unreachable);
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.env_array, "GIT_REF_PARANOIA=1");
|
2015-03-20 19:43:13 +01:00
|
|
|
} else if (pack_everything & LOOSEN_UNREACHABLE) {
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args,
|
strvec: fix indentation in renamed calls
Code which split an argv_array call across multiple lines, like:
argv_array_pushl(&args, "one argument",
"another argument", "and more",
NULL);
was recently mechanically renamed to use strvec, which results in
mis-matched indentation like:
strvec_pushl(&args, "one argument",
"another argument", "and more",
NULL);
Let's fix these up to align the arguments with the opening paren. I did
this manually by sifting through the results of:
git jump grep 'strvec_.*,$'
and liberally applying my editor's auto-format. Most of the changes are
of the form shown above, though I also normalized a few that had
originally used a single-tab indentation (rather than our usual style of
aligning with the open paren). I also rewrapped a couple of obvious
cases (e.g., where previously too-long lines became short enough to fit
on one), but I wasn't aggressive about it. In cases broken to three or
more lines, the grouping of arguments is sometimes meaningful, and it
wasn't worth my time or reviewer time to ponder each case individually.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-07-28 22:26:31 +02:00
|
|
|
"--unpack-unreachable");
|
repack: add --keep-unreachable option
The usual way to do a full repack (and what is done by
git-gc) is to run "repack -Ad --unpack-unreachable=<when>",
which will loosen any unreachable objects newer than
"<when>", and drop any older ones.
This is a safer alternative to "repack -ad", because
"<when>" becomes a grace period during which we will not
drop any new objects that are about to be referenced.
However, it isn't perfectly safe. It's always possible that
a process is about to reference an old object. Even if that
process were to take care to update the timestamp on the
object, there is no atomicity with a simultaneously running
"repack" process.
So while unlikely, there is a small race wherein we may drop
an object that is in the process of being referenced. If you
do automated repacking on a large number of active
repositories, you may hit it eventually, and the result is a
corrupted repository.
It would be nice to fix that race in the long run, but it's
complicated. In the meantime, there is a much simpler
strategy for automated repository maintenance: do not drop
objects at all. We already have a "--keep-unreachable"
option in pack-objects; we just need to plumb it through
from git-repack.
Note that this _isn't_ plumbed through from git-gc, so at
this point it's strictly a tool for people doing their own
advanced repository maintenance strategy.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-06-13 06:36:28 +02:00
|
|
|
} else if (keep_unreachable) {
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--keep-unreachable");
|
|
|
|
strvec_push(&cmd.args, "--pack-loose-unreachable");
|
2015-03-20 19:43:13 +01:00
|
|
|
} else {
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.env_array, "GIT_REF_PARANOIA=1");
|
2015-03-20 19:43:13 +01:00
|
|
|
}
|
2013-09-15 17:33:20 +02:00
|
|
|
}
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
} else if (geometry) {
|
|
|
|
strvec_push(&cmd.args, "--stdin-packs");
|
|
|
|
strvec_push(&cmd.args, "--unpacked");
|
2013-09-15 17:33:20 +02:00
|
|
|
} else {
|
2020-07-28 22:24:27 +02:00
|
|
|
strvec_push(&cmd.args, "--unpacked");
|
|
|
|
strvec_push(&cmd.args, "--incremental");
|
2013-09-15 17:33:20 +02:00
|
|
|
}
|
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
if (geometry)
|
|
|
|
cmd.in = -1;
|
|
|
|
else
|
|
|
|
cmd.no_stdin = 1;
|
2013-09-15 17:33:20 +02:00
|
|
|
|
|
|
|
ret = start_command(&cmd);
|
|
|
|
if (ret)
|
2013-09-15 17:33:21 +02:00
|
|
|
return ret;
|
2013-09-15 17:33:20 +02:00
|
|
|
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
if (geometry) {
|
|
|
|
FILE *in = xfdopen(cmd.in, "w");
|
|
|
|
/*
|
|
|
|
* The resulting pack should contain all objects in packs that
|
|
|
|
* are going to be rolled up, but exclude objects in packs which
|
|
|
|
* are being left alone.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < geometry->split; i++)
|
|
|
|
fprintf(in, "%s\n", pack_basename(geometry->pack[i]));
|
|
|
|
for (i = geometry->split; i < geometry->pack_nr; i++)
|
|
|
|
fprintf(in, "^%s\n", pack_basename(geometry->pack[i]));
|
|
|
|
fclose(in);
|
|
|
|
}
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
out = xfdopen(cmd.out, "r");
|
2016-01-14 00:31:17 +01:00
|
|
|
while (strbuf_getline_lf(&line, out) != EOF) {
|
2018-10-15 02:01:50 +02:00
|
|
|
if (line.len != the_hash_algo->hexsz)
|
2019-01-04 22:33:31 +01:00
|
|
|
die(_("repack: Expecting full hex object ID lines only from pack-objects."));
|
2013-09-15 17:33:20 +02:00
|
|
|
string_list_append(&names, line.buf);
|
|
|
|
}
|
|
|
|
fclose(out);
|
|
|
|
ret = finish_command(&cmd);
|
|
|
|
if (ret)
|
2013-09-15 17:33:21 +02:00
|
|
|
return ret;
|
2013-09-15 17:33:20 +02:00
|
|
|
|
2018-08-09 00:34:05 +02:00
|
|
|
if (!names.nr && !po_args.quiet)
|
2018-11-10 06:16:10 +01:00
|
|
|
printf_ln(_("Nothing new to pack."));
|
2013-09-15 17:33:20 +02:00
|
|
|
|
2020-11-16 19:41:17 +01:00
|
|
|
for_each_string_list_item(item, &names) {
|
|
|
|
item->util = (void *)(uintptr_t)populate_pack_exts(item->string);
|
|
|
|
}
|
|
|
|
|
2019-05-17 20:41:49 +02:00
|
|
|
close_object_store(the_repository->objects);
|
2018-12-15 23:04:01 +01:00
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
/*
|
|
|
|
* Ok we have prepared all new packfiles.
|
|
|
|
*/
|
|
|
|
for_each_string_list_item(item, &names) {
|
2013-12-21 15:00:19 +01:00
|
|
|
for (ext = 0; ext < ARRAY_SIZE(exts); ext++) {
|
2015-08-10 11:35:38 +02:00
|
|
|
char *fname, *fname_old;
|
2018-07-12 21:39:40 +02:00
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
fname = mkpathdup("%s/pack-%s%s",
|
2013-12-21 15:00:23 +01:00
|
|
|
packdir, item->string, exts[ext].name);
|
2013-09-15 17:33:20 +02:00
|
|
|
fname_old = mkpathdup("%s-%s%s",
|
2013-12-21 15:00:23 +01:00
|
|
|
packtmp, item->string, exts[ext].name);
|
builtin/repack.c: don't move existing packs out of the way
When 'git repack' creates a pack with the same name as any existing
pack, it moves the existing one to 'old-pack-xxx.{pack,idx,...}' and
then renames the new one into place.
Eventually, it would be nice to have 'git repack' allow for writing a
multi-pack index at the critical time (after the new packs have been
written / moved into place, but before the old ones have been deleted).
Guessing that this option might be called '--write-midx', this makes the
following situation (where repacks are issued back-to-back without any
new objects) impossible:
$ git repack -adb
$ git repack -adb --write-midx
In the second repack, the existing packs are overwritten verbatim with
the same rename-to-old sequence. At that point, the current MIDX is
invalidated, since it refers to now-missing packs. So that code wants to
be run after the MIDX is re-written. But (prior to this patch) the new
MIDX can't be written until the new packs are moved into place. So, we
have a circular dependency.
This is all hypothetical, since no code currently exists to write a MIDX
safely during a 'git repack' (the 'GIT_TEST_MULTI_PACK_INDEX' does so
unsafely). Putting hypothetical aside, though: why do we need to rename
existing packs to be prefixed with 'old-' anyway?
This behavior dates all the way back to 2ad47d6 (git-repack: Be
careful when updating the same pack as an existing one., 2006-06-25).
2ad47d6 is mainly concerned about a case where a newly written pack
would have a different structure than its index. This used to be
possible when the pack name was a hash of the set of objects. Under this
naming scheme, two packs that store the same set of objects could differ
in delta selection, object positioning, or both. If this happened, then
any such packs would be unreadable in the instant between copying the
new pack and new index (i.e., either the index or pack will be stale
depending on the order that they were copied).
But since 1190a1a (pack-objects: name pack files after trailer hash,
2013-12-05), this is no longer possible, since pack files are named not
after their logical contents (i.e., the set of objects), but by the
actual checksum of their contents. So, this old- behavior can safely go,
which allows us to avoid our circular dependency above.
In addition to avoiding the circular dependency, this patch also makes
'git repack' a lot simpler, since we don't have to deal with failures
encountered when renaming existing packs to be prefixed with 'old-'.
This patch is mostly limited to removing code paths that deal with the
'old' prefixing, with the exception of files that include the pack's
name in their own filename, like .idx, .bitmap, and related files. The
exception is that we want to continue to trust what pack-objects wrote.
That is, it is not the case that we pretend as if pack-objects didn't
write files identical to ones that already exist, but rather that we
respect what pack-objects wrote as the source of truth. That cuts two
ways:
- If pack-objects produced an identical pack to one that already
exists with a bitmap, but did not produce a bitmap, we remove the
bitmap that already exists. (This behavior is codified in t7700.14).
- If pack-objects produced an identical pack to one that already
exists, we trust the just-written version of the coresponding .idx,
.promisor, and other files over the ones that already exist. This
ensures that we use the most up-to-date versions of this files,
which is safe even in the face of format changes in, say, the .idx
file (which would not be reflected in the .idx file's name).
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-11-17 21:15:16 +01:00
|
|
|
|
|
|
|
if (((uintptr_t)item->util) & (1 << ext)) {
|
|
|
|
struct stat statbuffer;
|
|
|
|
if (!stat(fname_old, &statbuffer)) {
|
|
|
|
statbuffer.st_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
|
|
|
|
chmod(fname_old, statbuffer.st_mode);
|
|
|
|
}
|
|
|
|
|
2013-12-21 15:00:27 +01:00
|
|
|
if (rename(fname_old, fname))
|
|
|
|
die_errno(_("renaming '%s' failed"), fname_old);
|
builtin/repack.c: don't move existing packs out of the way
When 'git repack' creates a pack with the same name as any existing
pack, it moves the existing one to 'old-pack-xxx.{pack,idx,...}' and
then renames the new one into place.
Eventually, it would be nice to have 'git repack' allow for writing a
multi-pack index at the critical time (after the new packs have been
written / moved into place, but before the old ones have been deleted).
Guessing that this option might be called '--write-midx', this makes the
following situation (where repacks are issued back-to-back without any
new objects) impossible:
$ git repack -adb
$ git repack -adb --write-midx
In the second repack, the existing packs are overwritten verbatim with
the same rename-to-old sequence. At that point, the current MIDX is
invalidated, since it refers to now-missing packs. So that code wants to
be run after the MIDX is re-written. But (prior to this patch) the new
MIDX can't be written until the new packs are moved into place. So, we
have a circular dependency.
This is all hypothetical, since no code currently exists to write a MIDX
safely during a 'git repack' (the 'GIT_TEST_MULTI_PACK_INDEX' does so
unsafely). Putting hypothetical aside, though: why do we need to rename
existing packs to be prefixed with 'old-' anyway?
This behavior dates all the way back to 2ad47d6 (git-repack: Be
careful when updating the same pack as an existing one., 2006-06-25).
2ad47d6 is mainly concerned about a case where a newly written pack
would have a different structure than its index. This used to be
possible when the pack name was a hash of the set of objects. Under this
naming scheme, two packs that store the same set of objects could differ
in delta selection, object positioning, or both. If this happened, then
any such packs would be unreadable in the instant between copying the
new pack and new index (i.e., either the index or pack will be stale
depending on the order that they were copied).
But since 1190a1a (pack-objects: name pack files after trailer hash,
2013-12-05), this is no longer possible, since pack files are named not
after their logical contents (i.e., the set of objects), but by the
actual checksum of their contents. So, this old- behavior can safely go,
which allows us to avoid our circular dependency above.
In addition to avoiding the circular dependency, this patch also makes
'git repack' a lot simpler, since we don't have to deal with failures
encountered when renaming existing packs to be prefixed with 'old-'.
This patch is mostly limited to removing code paths that deal with the
'old' prefixing, with the exception of files that include the pack's
name in their own filename, like .idx, .bitmap, and related files. The
exception is that we want to continue to trust what pack-objects wrote.
That is, it is not the case that we pretend as if pack-objects didn't
write files identical to ones that already exist, but rather that we
respect what pack-objects wrote as the source of truth. That cuts two
ways:
- If pack-objects produced an identical pack to one that already
exists with a bitmap, but did not produce a bitmap, we remove the
bitmap that already exists. (This behavior is codified in t7700.14).
- If pack-objects produced an identical pack to one that already
exists, we trust the just-written version of the coresponding .idx,
.promisor, and other files over the ones that already exist. This
ensures that we use the most up-to-date versions of this files,
which is safe even in the face of format changes in, say, the .idx
file (which would not be reflected in the .idx file's name).
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-11-17 21:15:16 +01:00
|
|
|
} else if (!exts[ext].optional)
|
|
|
|
die(_("missing required file: %s"), fname_old);
|
|
|
|
else if (unlink(fname) < 0 && errno != ENOENT)
|
|
|
|
die_errno(_("could not unlink: %s"), fname);
|
2013-09-15 17:33:20 +02:00
|
|
|
|
2015-08-10 11:35:38 +02:00
|
|
|
free(fname);
|
builtin/repack.c: don't move existing packs out of the way
When 'git repack' creates a pack with the same name as any existing
pack, it moves the existing one to 'old-pack-xxx.{pack,idx,...}' and
then renames the new one into place.
Eventually, it would be nice to have 'git repack' allow for writing a
multi-pack index at the critical time (after the new packs have been
written / moved into place, but before the old ones have been deleted).
Guessing that this option might be called '--write-midx', this makes the
following situation (where repacks are issued back-to-back without any
new objects) impossible:
$ git repack -adb
$ git repack -adb --write-midx
In the second repack, the existing packs are overwritten verbatim with
the same rename-to-old sequence. At that point, the current MIDX is
invalidated, since it refers to now-missing packs. So that code wants to
be run after the MIDX is re-written. But (prior to this patch) the new
MIDX can't be written until the new packs are moved into place. So, we
have a circular dependency.
This is all hypothetical, since no code currently exists to write a MIDX
safely during a 'git repack' (the 'GIT_TEST_MULTI_PACK_INDEX' does so
unsafely). Putting hypothetical aside, though: why do we need to rename
existing packs to be prefixed with 'old-' anyway?
This behavior dates all the way back to 2ad47d6 (git-repack: Be
careful when updating the same pack as an existing one., 2006-06-25).
2ad47d6 is mainly concerned about a case where a newly written pack
would have a different structure than its index. This used to be
possible when the pack name was a hash of the set of objects. Under this
naming scheme, two packs that store the same set of objects could differ
in delta selection, object positioning, or both. If this happened, then
any such packs would be unreadable in the instant between copying the
new pack and new index (i.e., either the index or pack will be stale
depending on the order that they were copied).
But since 1190a1a (pack-objects: name pack files after trailer hash,
2013-12-05), this is no longer possible, since pack files are named not
after their logical contents (i.e., the set of objects), but by the
actual checksum of their contents. So, this old- behavior can safely go,
which allows us to avoid our circular dependency above.
In addition to avoiding the circular dependency, this patch also makes
'git repack' a lot simpler, since we don't have to deal with failures
encountered when renaming existing packs to be prefixed with 'old-'.
This patch is mostly limited to removing code paths that deal with the
'old' prefixing, with the exception of files that include the pack's
name in their own filename, like .idx, .bitmap, and related files. The
exception is that we want to continue to trust what pack-objects wrote.
That is, it is not the case that we pretend as if pack-objects didn't
write files identical to ones that already exist, but rather that we
respect what pack-objects wrote as the source of truth. That cuts two
ways:
- If pack-objects produced an identical pack to one that already
exists with a bitmap, but did not produce a bitmap, we remove the
bitmap that already exists. (This behavior is codified in t7700.14).
- If pack-objects produced an identical pack to one that already
exists, we trust the just-written version of the coresponding .idx,
.promisor, and other files over the ones that already exist. This
ensures that we use the most up-to-date versions of this files,
which is safe even in the face of format changes in, say, the .idx
file (which would not be reflected in the .idx file's name).
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-11-17 21:15:16 +01:00
|
|
|
free(fname_old);
|
2013-09-15 17:33:20 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
/* End of pack replacement. */
|
|
|
|
|
2018-08-09 00:34:06 +02:00
|
|
|
reprepare_packed_git(the_repository);
|
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
if (delete_redundant) {
|
2018-10-15 02:01:50 +02:00
|
|
|
const int hexsz = the_hash_algo->hexsz;
|
2014-09-13 09:28:01 +02:00
|
|
|
int opts = 0;
|
2014-11-25 09:02:35 +01:00
|
|
|
string_list_sort(&names);
|
2013-09-15 17:33:20 +02:00
|
|
|
for_each_string_list_item(item, &existing_packs) {
|
|
|
|
char *sha1;
|
|
|
|
size_t len = strlen(item->string);
|
2018-10-15 02:01:50 +02:00
|
|
|
if (len < hexsz)
|
2013-09-15 17:33:20 +02:00
|
|
|
continue;
|
2018-10-15 02:01:50 +02:00
|
|
|
sha1 = item->string + len - hexsz;
|
2013-09-15 17:33:20 +02:00
|
|
|
if (!string_list_has_string(&names, sha1))
|
|
|
|
remove_redundant_pack(packdir, item->string);
|
|
|
|
}
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
|
|
|
|
if (geometry) {
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
|
|
|
|
|
|
|
uint32_t i;
|
|
|
|
for (i = 0; i < geometry->split; i++) {
|
|
|
|
struct packed_git *p = geometry->pack[i];
|
|
|
|
if (string_list_has_string(&names,
|
|
|
|
hash_to_hex(p->hash)))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
strbuf_reset(&buf);
|
|
|
|
strbuf_addstr(&buf, pack_basename(p));
|
|
|
|
strbuf_strip_suffix(&buf, ".pack");
|
|
|
|
|
|
|
|
remove_redundant_pack(packdir, buf.buf);
|
|
|
|
}
|
|
|
|
strbuf_release(&buf);
|
|
|
|
}
|
2018-08-09 00:34:05 +02:00
|
|
|
if (!po_args.quiet && isatty(2))
|
2014-09-13 09:28:01 +02:00
|
|
|
opts |= PRUNE_PACKED_VERBOSE;
|
|
|
|
prune_packed_objects(opts);
|
repack -ad: prune the list of shallow commits
`git repack` can drop unreachable commits without further warning,
making the corresponding entries in `.git/shallow` invalid, which causes
serious problems when deepening the branches.
One scenario where unreachable commits are dropped by `git repack` is
when a `git fetch --prune` (or even a `git fetch` when a ref was
force-pushed in the meantime) can make a commit unreachable that was
reachable before.
Therefore it is not safe to assume that a `git repack -adlf` will keep
unreachable commits alone (under the assumption that they had not been
packed in the first place, which is an assumption at least some of Git's
code seems to make).
This is particularly important to keep in mind when looking at the
`.git/shallow` file: if any commits listed in that file become
unreachable, it is not a problem, but if they go missing, it *is* a
problem. One symptom of this problem is that a deepening fetch may now
fail with
fatal: error in object: unshallow <commit-hash>
To avoid this problem, let's prune the shallow list in `git repack` when
the `-d` option is passed, unless `-A` is passed, too (which would force
the now-unreachable objects to be turned into loose objects instead of
being deleted). Additionally, we also need to take `--keep-reachable`
and `--unpack-unreachable=<date>` into account.
Note: an alternative solution discussed during the review of this patch
was to teach `git fetch` to simply ignore entries in .git/shallow if the
corresponding commits do not exist locally. A quick test, however,
revealed that the .git/shallow file is written during a shallow *clone*,
in which case the commits do not exist, either, but the "shallow" line
*does* need to be sent. Therefore, this approach would be a lot more
finicky than the approach presented by the this patch.
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-10-24 17:56:13 +02:00
|
|
|
|
|
|
|
if (!keep_unreachable &&
|
|
|
|
(!(pack_everything & LOOSEN_UNREACHABLE) ||
|
|
|
|
unpack_unreachable) &&
|
|
|
|
is_repository_shallow(the_repository))
|
|
|
|
prune_shallow(PRUNE_QUICK);
|
2013-09-15 17:33:20 +02:00
|
|
|
}
|
|
|
|
|
2014-09-13 09:28:01 +02:00
|
|
|
if (!no_update_server_info)
|
|
|
|
update_server_info(0);
|
2013-09-15 17:33:20 +02:00
|
|
|
remove_temporary_files();
|
2018-10-12 19:34:20 +02:00
|
|
|
|
|
|
|
if (git_env_bool(GIT_TEST_MULTI_PACK_INDEX, 0))
|
2021-03-30 17:04:11 +02:00
|
|
|
write_midx_file(get_object_directory(), NULL, 0);
|
2018-10-12 19:34:20 +02:00
|
|
|
|
2013-09-15 17:33:20 +02:00
|
|
|
string_list_clear(&names, 0);
|
|
|
|
string_list_clear(&rollback, 0);
|
|
|
|
string_list_clear(&existing_packs, 0);
|
builtin/repack.c: add '--geometric' option
Often it is useful to both:
- have relatively few packfiles in a repository, and
- avoid having so few packfiles in a repository that we repack its
entire contents regularly
This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).
Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:
objects(Pi) > r*objects(P(i-1))
for all i in [1, n], where the packs are sorted by
objects(P1) <= objects(P2) <= ... <= objects(Pn).
Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:
1. We assume that there is a cutoff of packs _before starting the
repack_ where everything to the right of that cut-off already forms
a geometric progression (or no cutoff exists and everything must be
repacked).
2. We assume that everything smaller than the cutoff count must be
repacked. This forms our base assumption, but it can also cause
even the "heavy" packs to get repacked, for e.g., if we have 6
packs containing the following number of objects:
1, 1, 1, 2, 4, 32
then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
rolling up the first two packs into a pack with 2 objects. That
breaks our progression and leaves us:
2, 1, 2, 4, 32
^
(where the '^' indicates the position of our split). To restore a
progression, we move the split forward (towards larger packs)
joining each pack into our new pack until a geometric progression
is restored. Here, that looks like:
2, 1, 2, 4, 32 ~> 3, 2, 4, 32 ~> 5, 4, 32 ~> ... ~> 9, 32
^ ^ ^ ^
This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:27 +01:00
|
|
|
clear_pack_geometry(geometry);
|
2013-09-15 17:33:20 +02:00
|
|
|
strbuf_release(&line);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|