Merge branch 'jk/pack-objects-optim-mru'

"git pack-objects" in a repository with many packfiles used to spend
a lot of time looking for/at objects in them; the accesses to the
packfiles are now optimized by checking the most-recently-used
packfile first.

* jk/pack-objects-optim-mru:
  pack-objects: use mru list when iterating over packs
  pack-objects: break delta cycles before delta-search phase
  sha1_file: make packed_object_info public
  provide an initializer for "struct object_info"

commit e6e24c94df
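The optimization itself is small: pack-objects keeps the packfiles on a move-to-front list, and whenever want_object_in_pack() finds an object in some pack it promotes that pack to the head, so the next lookup tries it first. Below is a minimal, self-contained sketch of that idea; the names mru, mru_entry and mru_mark loosely mirror the API the series uses, but this is an illustration, not the actual git code.

/*
 * Illustration only: a tiny intrusive move-to-front ("MRU") list in the
 * spirit of the mru API this series introduces. Not the actual git code.
 */
#include <stdio.h>
#include <stdlib.h>

struct mru_entry {
	void *item;
	struct mru_entry *prev, *next;
};

struct mru {
	struct mru_entry *head, *tail;
};

static void mru_append(struct mru *mru, void *item)
{
	struct mru_entry *e = malloc(sizeof(*e));

	e->item = item;
	e->prev = mru->tail;
	e->next = NULL;
	if (mru->tail)
		mru->tail->next = e;
	else
		mru->head = e;
	mru->tail = e;
}

/* Move a just-used entry to the front so the next search tries it first. */
static void mru_mark(struct mru *mru, struct mru_entry *e)
{
	if (e == mru->head)
		return;
	/* unlink from the current position... */
	e->prev->next = e->next;
	if (e->next)
		e->next->prev = e->prev;
	else
		mru->tail = e->prev;
	/* ...and relink at the head */
	e->prev = NULL;
	e->next = mru->head;
	mru->head->prev = e;
	mru->head = e;
}

int main(void)
{
	struct mru packs = { NULL, NULL };
	const char *names[] = { "pack-a", "pack-b", "pack-c" };
	struct mru_entry *e;
	int i;

	for (i = 0; i < 3; i++)
		mru_append(&packs, (void *)names[i]);

	/* Pretend a lookup just hit pack-c... */
	for (e = packs.head; e; e = e->next)
		if (e->item == (void *)names[2])
			mru_mark(&packs, e);

	/* ...so the next scan starts with pack-c, then pack-a, pack-b. */
	for (e = packs.head; e; e = e->next)
		printf("%s\n", (const char *)e->item);
	return 0;
}

With many packs and some locality in the object stream, most lookups then hit the first entry or two instead of walking the entire pack list, which is where the time went before.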
@@ -53,7 +53,7 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name,
 	char *buf;
 	unsigned long size;
 	struct object_context obj_context;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 	struct strbuf sb = STRBUF_INIT;
 	unsigned flags = LOOKUP_REPLACE_OBJECT;
 	const char *path = force_path;
@@ -449,8 +449,7 @@ static int batch_objects(struct batch_options *opt)
 	data.split_on_whitespace = 1;
 
 	if (opt->all_objects) {
-		struct object_info empty;
-		memset(&empty, 0, sizeof(empty));
+		struct object_info empty = OBJECT_INFO_INIT;
 		if (!memcmp(&data.info, &empty, sizeof(empty)))
 			data.skip_object_info = 1;
 	}
@@ -23,6 +23,7 @@
 #include "reachable.h"
 #include "sha1-array.h"
 #include "argv-array.h"
+#include "mru.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -994,7 +995,7 @@ static int want_object_in_pack(const unsigned char *sha1,
 			       struct packed_git **found_pack,
 			       off_t *found_offset)
 {
-	struct packed_git *p;
+	struct mru_entry *entry;
 	int want;
 
 	if (!exclude && local && has_loose_object_nonlocal(sha1))
@@ -1011,7 +1012,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 			return want;
 	}
 
-	for (p = packed_git; p; p = p->next) {
+	for (entry = packed_git_mru->head; entry; entry = entry->next) {
+		struct packed_git *p = entry->item;
 		off_t offset;
 
 		if (p == *found_pack)
@@ -1027,6 +1029,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 				*found_pack = p;
 			}
 			want = want_found_object(exclude, p);
+			if (!exclude && want > 0)
+				mru_mark(packed_git_mru, entry);
 			if (want != -1)
 				return want;
 		}
@@ -1527,6 +1531,83 @@ static int pack_offset_sort(const void *_a, const void *_b)
 			(a->in_pack_offset > b->in_pack_offset);
 }
 
+/*
+ * Drop an on-disk delta we were planning to reuse. Naively, this would
+ * just involve blanking out the "delta" field, but we have to deal
+ * with some extra book-keeping:
+ *
+ *   1. Removing ourselves from the delta_sibling linked list.
+ *
+ *   2. Updating our size/type to the non-delta representation. These were
+ *      either not recorded initially (size) or overwritten with the delta type
+ *      (type) when check_object() decided to reuse the delta.
+ */
+static void drop_reused_delta(struct object_entry *entry)
+{
+	struct object_entry **p = &entry->delta->delta_child;
+	struct object_info oi = OBJECT_INFO_INIT;
+
+	while (*p) {
+		if (*p == entry)
+			*p = (*p)->delta_sibling;
+		else
+			p = &(*p)->delta_sibling;
+	}
+	entry->delta = NULL;
+
+	oi.sizep = &entry->size;
+	oi.typep = &entry->type;
+	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+		/*
+		 * We failed to get the info from this pack for some reason;
+		 * fall back to sha1_object_info, which may find another copy.
+		 * And if that fails, the error will be recorded in entry->type
+		 * and dealt with in prepare_pack().
+		 */
+		entry->type = sha1_object_info(entry->idx.sha1, &entry->size);
+	}
+}
+
+/*
+ * Follow the chain of deltas from this entry onward, throwing away any links
+ * that cause us to hit a cycle (as determined by the DFS state flags in
+ * the entries).
+ */
+static void break_delta_chains(struct object_entry *entry)
+{
+	/* If it's not a delta, it can't be part of a cycle. */
+	if (!entry->delta) {
+		entry->dfs_state = DFS_DONE;
+		return;
+	}
+
+	switch (entry->dfs_state) {
+	case DFS_NONE:
+		/*
+		 * This is the first time we've seen the object. We mark it as
+		 * part of the active potential cycle and recurse.
+		 */
+		entry->dfs_state = DFS_ACTIVE;
+		break_delta_chains(entry->delta);
+		entry->dfs_state = DFS_DONE;
+		break;
+
+	case DFS_DONE:
+		/* object already examined, and not part of a cycle */
+		break;
+
+	case DFS_ACTIVE:
+		/*
+		 * We found a cycle that needs broken. It would be correct to
+		 * break any link in the chain, but it's convenient to
+		 * break this one.
+		 */
+		drop_reused_delta(entry);
+		entry->dfs_state = DFS_DONE;
+		break;
+	}
+}
+
 static void get_object_details(void)
 {
 	uint32_t i;
@@ -1544,6 +1625,13 @@ static void get_object_details(void)
 			entry->no_try_delta = 1;
 	}
 
+	/*
+	 * This must happen in a second pass, since we rely on the delta
+	 * information for the whole list being completed.
+	 */
+	for (i = 0; i < to_pack.nr_objects; i++)
+		break_delta_chains(&to_pack.objects[i]);
+
 	free(sorted_by_offset);
 }
 
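The cycle breaking added above is a conventional three-state depth-first walk: DFS_NONE marks an entry not yet visited, DFS_ACTIVE marks one on the delta chain currently being followed, and DFS_DONE marks one known to be cycle-free. Reaching a DFS_ACTIVE entry means the chain of delta pointers has looped back on itself, and that link is dropped. The following standalone sketch replays the same state machine on a toy structure; the node type and helper names are invented for illustration and are not part of the patch.

/*
 * Illustration only: the three-state DFS used by break_delta_chains(),
 * applied to a toy "node" whose "base" pointer plays the role of the
 * delta base. The names node, base and break_cycles are invented.
 */
#include <stdio.h>

enum dfs_state { DFS_NONE = 0, DFS_ACTIVE, DFS_DONE };

struct node {
	const char *name;
	struct node *base;		/* like object_entry::delta */
	enum dfs_state dfs_state;
};

static void break_cycles(struct node *n)
{
	if (!n->base) {			/* not a delta: cannot be in a cycle */
		n->dfs_state = DFS_DONE;
		return;
	}

	switch (n->dfs_state) {
	case DFS_NONE:
		n->dfs_state = DFS_ACTIVE;	/* on the chain we are following */
		break_cycles(n->base);
		n->dfs_state = DFS_DONE;
		break;
	case DFS_DONE:
		break;				/* already known to be cycle-free */
	case DFS_ACTIVE:
		/* the chain looped back to us: drop this one link */
		printf("dropping delta of %s\n", n->name);
		n->base = NULL;
		n->dfs_state = DFS_DONE;
		break;
	}
}

int main(void)
{
	/* A is a delta against B and B against A: an inter-pack style cycle. */
	struct node a = { "A", NULL, DFS_NONE };
	struct node b = { "B", &a, DFS_NONE };
	a.base = &b;

	break_cycles(&a);
	break_cycles(&b);

	printf("A base: %s, B base: %s\n",
	       a.base ? a.base->name : "(none)",
	       b.base ? b.base->name : "(none)");
	return 0;
}

Exactly one of the two links is dropped (A's in this run), after which both chains terminate; in pack-objects the dropped entry then simply goes through the normal delta-search phase like any other object.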
cache.h: 8 changed lines
@@ -1602,7 +1602,15 @@ struct object_info {
 		} packed;
 	} u;
 };
 
+/*
+ * Initializer for a "struct object_info" that wants no items. You may
+ * also memset() the memory to all-zeroes.
+ */
+#define OBJECT_INFO_INIT {NULL}
+
 extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags);
+extern int packed_object_info(struct packed_git *pack, off_t offset, struct object_info *);
+
 /* Dumb servers support */
 extern int update_server_info(int);
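For callers, the new initializer replaces the memset()-to-zero pattern: declare the struct with OBJECT_INFO_INIT, point only the fields you care about at local variables, and hand it to a query function. A hypothetical caller inside the git tree might look like the fragment below; struct object_info, OBJECT_INFO_INIT, LOOKUP_REPLACE_OBJECT and sha1_object_info_extended() are the real names from the header above, while print_size_and_type() itself is made up for illustration.

#include "cache.h"

/* Hypothetical helper, for illustration only; not part of this commit. */
static int print_size_and_type(const unsigned char *sha1)
{
	enum object_type type;
	unsigned long size;
	struct object_info oi = OBJECT_INFO_INIT;	/* same effect as zeroing oi */

	oi.typep = &type;	/* request only the items we need */
	oi.sizep = &size;
	if (sha1_object_info_extended(sha1, &oi, LOOKUP_REPLACE_OBJECT) < 0)
		return -1;
	printf("type=%d, size=%lu\n", (int)type, size);
	return 0;
}

Because the struct's first member is a pointer, the {NULL} initializer zero-initializes every remaining member as well, which is why it is interchangeable with memset() for this purpose.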
@@ -27,6 +27,15 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+
+	/*
+	 * State flags for depth-first search used for analyzing delta cycles.
+	 */
+	enum {
+		DFS_NONE = 0,
+		DFS_ACTIVE,
+		DFS_DONE
+	} dfs_state;
 };
 
 struct packing_data {
sha1_file.c: 10 changed lines
@@ -1826,11 +1826,9 @@ static int parse_sha1_header_extended(const char *hdr, struct object_info *oi,
 
 int parse_sha1_header(const char *hdr, unsigned long *sizep)
 {
-	struct object_info oi;
+	struct object_info oi = OBJECT_INFO_INIT;
 
 	oi.sizep = sizep;
-	oi.typename = NULL;
-	oi.typep = NULL;
 	return parse_sha1_header_extended(hdr, &oi, LOOKUP_REPLACE_OBJECT);
 }
 
@@ -2068,8 +2066,8 @@ unwind:
 	goto out;
 }
 
-static int packed_object_info(struct packed_git *p, off_t obj_offset,
+int packed_object_info(struct packed_git *p, off_t obj_offset,
 		       struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
 	unsigned long size;
@@ -2840,7 +2838,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
 int sha1_object_info(const unsigned char *sha1, unsigned long *sizep)
 {
 	enum object_type type;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 
 	oi.typep = &type;
 	oi.sizep = sizep;
@@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1,
 				 struct stream_filter *filter)
 {
 	struct git_istream *st;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 	const unsigned char *real = lookup_replace_object(sha1);
 	enum input_source src = istream_source(real, type, &oi);
 
t/t5314-pack-cycle-detection.sh: 113 lines (new executable file)
@@ -0,0 +1,113 @@
+#!/bin/sh
+
+test_description='test handling of inter-pack delta cycles during repack
+
+The goal here is to create a situation where we have two blobs, A and B, with A
+as a delta against B in one pack, and vice versa in the other. Then if we can
+persuade a full repack to find A from one pack and B from the other, that will
+give us a cycle when we attempt to reuse those deltas.
+
+The trick is in the "persuade" step, as it depends on the internals of how
+pack-objects picks which pack to reuse the deltas from. But we can assume
+that it does so in one of two general strategies:
+
+ 1. Using a static ordering of packs. In this case, no inter-pack cycles can
+    happen. Any objects with a delta relationship must be present in the same
+    pack (i.e., no "--thin" packs on disk), so we will find all related objects
+    from that pack. So assuming there are no cycles within a single pack (and
+    we avoid generating them via pack-objects or importing them via
+    index-pack), then our result will have no cycles.
+
+    So this case should pass the tests no matter how we arrange things.
+
+ 2. Picking the next pack to examine based on locality (i.e., where we found
+    something else recently).
+
+    In this case, we want to make sure that we find the delta versions of A and
+    B and not their base versions. We can do this by putting two blobs in each
+    pack. The first is a "dummy" blob that can only be found in the pack in
+    question. And then the second is the actual delta we want to find.
+
+    The two blobs must be present in the same tree, not present in other trees,
+    and the dummy pathname must sort before the delta path.
+
+The setup below focuses on case 2. We have two commits HEAD and HEAD^, each
+of which has two files: "dummy" and "file". Then we can make two packs which
+contain:
+
+  [pack one]
+  HEAD:dummy
+  HEAD:file  (as delta against HEAD^:file)
+  HEAD^:file (as base)
+
+  [pack two]
+  HEAD^:dummy
+  HEAD^:file (as delta against HEAD:file)
+  HEAD:file  (as base)
+
+Then no matter which order we start looking at the packs in, we know that we
+will always find a delta for "file", because its lookup will always come
+immediately after the lookup for "dummy".
+'
+. ./test-lib.sh
+
+
+
+# Create a pack containing the tree $1 and blob $1:file, with
+# the latter stored as a delta against $2:file.
+#
+# We convince pack-objects to make the delta in the direction of our choosing
+# by marking $2 as a preferred-base edge. That results in $1:file as a thin
+# delta, and index-pack completes it by adding $2:file as a base.
+#
+# Note that the two variants of "file" must be similar enough to convince git
+# to create the delta.
+make_pack () {
+	{
+		printf '%s\n' "-$(git rev-parse $2)"
+		printf '%s dummy\n' "$(git rev-parse $1:dummy)"
+		printf '%s file\n' "$(git rev-parse $1:file)"
+	} |
+	git pack-objects --stdout |
+	git index-pack --stdin --fix-thin
+}
+
+test_expect_success 'setup' '
+	test-genrandom base 4096 >base &&
+	for i in one two
+	do
+		# we want shared content here to encourage deltas...
+		cp base file &&
+		echo $i >>file &&
+
+		# ...whereas dummy should be short, because we do not want
+		# deltas that would create duplicates when we --fix-thin
+		echo $i >dummy &&
+
+		git add file dummy &&
+		test_tick &&
+		git commit -m $i ||
+		return 1
+	done &&
+
+	make_pack HEAD^ HEAD &&
+	make_pack HEAD HEAD^
+'
+
+test_expect_success 'repack' '
+	# We first want to check that we do not have any internal errors,
+	# and also that we do not hit the last-ditch cycle-breaking code
+	# in write_object(), which will issue a warning to stderr.
+	>expect &&
+	git repack -ad 2>stderr &&
+	test_cmp expect stderr &&
+
+	# And then double-check that the resulting pack is usable (i.e.,
+	# we did not fail to notice any cycles). We know we are accessing
+	# the objects via the new pack here, because "repack -d" will have
+	# removed the others.
+	git cat-file blob HEAD:file >/dev/null &&
+	git cat-file blob HEAD^:file >/dev/null
+'
+
+test_done