2018-03-23 18:20:55 +01:00
|
|
|
#ifndef OBJECT_STORE_H
|
|
|
|
#define OBJECT_STORE_H
|
|
|
|
|
2018-08-15 19:54:05 +02:00
|
|
|
#include "cache.h"
|
2018-04-12 02:21:05 +02:00
|
|
|
#include "oidmap.h"
|
2018-07-12 00:42:38 +02:00
|
|
|
#include "list.h"
|
2020-03-30 16:03:46 +02:00
|
|
|
#include "oid-array.h"
|
2018-07-12 00:42:38 +02:00
|
|
|
#include "strbuf.h"
|
object-store: allow threaded access to object reading
Allow object reading to be performed by multiple threads protecting it
with an internal lock, the obj_read_mutex. The lock usage can be toggled
with enable_obj_read_lock() and disable_obj_read_lock(). Currently, the
functions which can be safely called in parallel are:
read_object_file_extended(), repo_read_object_file(),
read_object_file(), read_object_with_reference(), read_object(),
oid_object_info() and oid_object_info_extended(). It's also possible
to use obj_read_lock() and obj_read_unlock() to protect other sections
that cannot execute in parallel with object reading.
Probably there are many spots in the functions listed above that could
be executed unlocked (and thus, in parallel). But, for now, we are most
interested in allowing parallel access to zlib inflation. This is one of
the sections where object reading spends most of the time in (e.g. up to
one-third of git-grep's execution time in the chromium repo corresponds
to inflation) and it's already thread-safe. So, to take advantage of
that, the obj_read_mutex is released when calling git_inflate() and
re-acquired right after, for every calling spot in
oid_object_info_extended()'s call chain. We may refine this lock to also
exploit other possible parallel spots in the future, but for now,
threaded zlib inflation should already give great speedups for threaded
object reading callers.
Note that add_delta_base_cache() was also modified to skip adding
already present entries to the cache. This wasn't possible before, but
it would be now, with the parallel inflation. Take for example the
following situation, where two threads - A and B - are executing the
code at unpack_entry():
1. Thread A is performing the decompression of a base O (which is not
yet in the cache) at PHASE II. Thread B is simultaneously trying to
unpack O, but just starting at PHASE I.
2. Since O is not yet in the cache, B will go to PHASE II to also
perform the decompression.
3. When they finish decompressing, one of them will get the object
reading mutex and go to PHASE III while the other waits for the
mutex. Let’s say A got the mutex first.
4. Thread A will add O to the cache, go through the rest of PHASE III
and return.
5. Thread B gets the mutex, also adds O to the cache (if the check wasn't
there) and returns.
Finally, it is also important to highlight that the object reading lock
can only ensure thread-safety in the mentioned functions thanks to two
complementary mechanisms: the use of 'struct raw_object_store's
replace_mutex, which guards sections in the object reading machinery
that would otherwise be thread-unsafe; and the 'struct pack_window's
inuse_cnt, which protects window reading operations (such as the one
performed during the inflation of a packed object), allowing them to
execute without the acquisition of the obj_read_mutex.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:53 +01:00
|
|
|
#include "thread-utils.h"
|
2021-07-08 01:10:15 +02:00
|
|
|
#include "khash.h"
|
|
|
|
#include "dir.h"
|
2021-07-08 01:10:19 +02:00
|
|
|
#include "oidtree.h"
|
2021-09-11 22:43:26 +02:00
|
|
|
#include "oidset.h"
|
2018-04-12 02:21:05 +02:00
|
|
|
|
2018-11-12 15:48:47 +01:00
|
|
|
struct object_directory {
|
|
|
|
struct object_directory *next;
|
2018-03-23 18:20:56 +01:00
|
|
|
|
|
|
|
/*
|
2018-11-12 15:50:56 +01:00
|
|
|
* Used to store the results of readdir(3) calls when we are OK
|
|
|
|
* sacrificing accuracy due to races for speed. That includes
|
sha1-file: use loose object cache for quick existence check
In cases where we expect to ask has_sha1_file() about a lot of objects
that we are not likely to have (e.g., during fetch negotiation), we
already use OBJECT_INFO_QUICK to sacrifice accuracy (due to racing with
a simultaneous write or repack) for speed (we avoid re-scanning the pack
directory).
However, even checking for loose objects can be expensive, as we will
stat() each one. On many systems this cost isn't too noticeable, but
stat() can be particularly slow on some operating systems, or due to
network filesystems.
Since the QUICK flag already tells us that we're OK with a slightly
stale answer, we can use that as a cue to look in our in-memory cache of
each object directory. That basically trades an in-memory binary search
for a stat() call.
Note that it is possible for this to actually be _slower_. We'll do a
full readdir() to fill the cache, so if you have a very large number of
loose objects and a very small number of lookups, that readdir() may end
up more expensive.
This shouldn't be a big deal in practice. If you have a large number of
reachable loose objects, you'll already run into performance problems
(which you should remedy by repacking). You may have unreachable objects
which wouldn't otherwise impact performance. Usually these would go away
with the prune step of "git gc", but they may be held for up to 2 weeks
in the default configuration.
So it comes down to how many such objects you might reasonably expect to
have, how much slower is readdir() on N entries versus M stat() calls
(and here we really care about the syscall backing readdir(), like
getdents() on Linux, but I'll just call this readdir() below).
If N is much smaller than M (a typical packed repo), we know this is a
big win (few readdirs() followed by many uses of the resulting cache).
When N and M are similar in size, it's also a win. We care about the
latency of making a syscall, and readdir() should be giving us many
values in a single call. How many?
On Linux, running "strace -e getdents ls" shows a 32k buffer getting 512
entries per call (which is 64 bytes per entry; the name itself is 38
bytes, plus there are some other fields). So we can imagine that this is
always a win as long as the number of loose objects in the repository is
a factor of 500 less than the number of lookups you make. It's hard to
auto-tune this because we don't generally know up front how many lookups
we're going to do. But it's unlikely for this to perform significantly
worse.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:54:42 +01:00
|
|
|
* object existence with OBJECT_INFO_QUICK, as well as
|
2018-11-12 15:50:56 +01:00
|
|
|
* our search for unique abbreviated hashes. Don't use it for tasks
|
|
|
|
* requiring greater accuracy!
|
|
|
|
*
|
|
|
|
* Be sure to call odb_loose_cache() before using.
|
2018-03-23 18:20:56 +01:00
|
|
|
*/
|
2021-07-08 01:10:17 +02:00
|
|
|
uint32_t loose_objects_subdir_seen[8]; /* 256 bits */
|
2021-07-08 01:10:19 +02:00
|
|
|
struct oidtree *loose_objects_cache;
|
2018-03-23 18:20:56 +01:00
|
|
|
|
2021-12-06 23:05:05 +01:00
|
|
|
/*
|
|
|
|
* This is a temporary object store created by the tmp_objdir
|
|
|
|
* facility. Disable ref updates since the objects in the store
|
|
|
|
* might be discarded on rollback.
|
|
|
|
*/
|
|
|
|
int disable_ref_updates;
|
|
|
|
|
2021-12-06 23:05:04 +01:00
|
|
|
/*
|
|
|
|
* This object store is ephemeral, so there is no need to fsync.
|
|
|
|
*/
|
|
|
|
int will_destroy;
|
|
|
|
|
2018-03-23 18:21:08 +01:00
|
|
|
/*
|
|
|
|
* Path to the alternative object store. If this is a relative path,
|
|
|
|
* it is relative to the current working directory.
|
|
|
|
*/
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
char *path;
|
2018-03-23 18:20:57 +01:00
|
|
|
};
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
|
2021-07-08 01:10:15 +02:00
|
|
|
/*
 * Instantiate the khash table type "odb_path_map" (kh_odb_path_map_t),
 * keyed by an object directory path string and storing a
 * struct object_directory pointer per key. Keys are hashed and compared
 * with fspathhash() and fspatheq().
 * NOTE(review): the literal 1 is presumably khash's "is a map" flag
 * (key -> value table rather than a set) -- confirm against khash.h.
 */
KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
|
2021-08-09 03:38:32 +02:00
|
|
|
struct object_directory *, 1, fspathhash, fspatheq)
|
2021-07-08 01:10:15 +02:00
|
|
|
|
2018-03-23 18:21:09 +01:00
|
|
|
void prepare_alt_odb(struct repository *r);
|
2018-03-23 18:20:56 +01:00
|
|
|
char *compute_alternate_path(const char *path, struct strbuf *err);
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instances of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-01 22:34:01 +02:00
|
|
|
struct object_directory *find_odb(struct repository *r, const char *obj_dir);
|
2018-11-12 15:48:47 +01:00
|
|
|
typedef int alt_odb_fn(struct object_directory *, void *);
|
2018-03-23 18:20:56 +01:00
|
|
|
int foreach_alt_odb(alt_odb_fn, void*);
|
2019-07-01 15:17:40 +02:00
|
|
|
typedef void alternate_ref_fn(const struct object_id *oid, void *);
|
|
|
|
void for_each_alternate_ref(alternate_ref_fn, void *);
|
2018-03-23 18:20:56 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the directory to the on-disk alternates file; the new entry will also
|
|
|
|
* take effect in the current process.
|
|
|
|
*/
|
|
|
|
void add_to_alternates_file(const char *dir);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the directory to the in-memory list of alternates (along with any
|
|
|
|
* recursive alternates it points to), but do not modify the on-disk alternates
|
|
|
|
* file.
|
|
|
|
*/
|
|
|
|
void add_to_alternates_memory(const char *dir);
|
|
|
|
|
2021-12-06 23:05:04 +01:00
|
|
|
/*
|
|
|
|
* Replace the current writable object directory with the specified temporary
|
|
|
|
* object directory; returns the former primary object directory.
|
|
|
|
*/
|
|
|
|
struct object_directory *set_temporary_primary_odb(const char *dir, int will_destroy);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Restore a previous ODB replaced by set_temporary_primary_odb.
|
|
|
|
*/
|
|
|
|
void restore_primary_odb(struct object_directory *restore_odb, const char *old_path);
|
|
|
|
|
2019-01-06 17:45:30 +01:00
|
|
|
/*
|
|
|
|
* Populate and return the loose object cache (an oidtree) corresponding to the
|
|
|
|
* given object ID.
|
|
|
|
*/
|
2021-07-08 01:10:19 +02:00
|
|
|
struct oidtree *odb_loose_cache(struct object_directory *odb,
|
2019-01-06 17:45:30 +01:00
|
|
|
const struct object_id *oid);
|
|
|
|
|
2019-01-06 17:45:39 +01:00
|
|
|
/* Empty the loose object cache for the specified object directory. */
|
|
|
|
void odb_clear_loose_cache(struct object_directory *odb);
|
|
|
|
|
2021-12-06 23:05:04 +01:00
|
|
|
/* Clear and free the specified object directory */
|
|
|
|
void free_object_directory(struct object_directory *odb);
|
|
|
|
|
2018-03-23 18:20:59 +01:00
|
|
|
struct packed_git {
|
2019-11-27 23:24:53 +01:00
|
|
|
struct hashmap_entry packmap_ent;
|
2018-03-23 18:20:59 +01:00
|
|
|
struct packed_git *next;
|
|
|
|
struct list_head mru;
|
|
|
|
struct pack_window *windows;
|
|
|
|
off_t pack_size;
|
|
|
|
const void *index_data;
|
|
|
|
size_t index_size;
|
|
|
|
uint32_t num_objects;
|
2020-05-25 21:59:10 +02:00
|
|
|
uint32_t crc_offset;
|
2021-09-11 22:43:26 +02:00
|
|
|
struct oidset bad_objects;
|
2018-03-23 18:20:59 +01:00
|
|
|
int index_version;
|
|
|
|
time_t mtime;
|
|
|
|
int pack_fd;
|
2018-04-14 17:35:05 +02:00
|
|
|
int index; /* for builtin/pack-objects.c */
|
2018-03-23 18:20:59 +01:00
|
|
|
unsigned pack_local:1,
|
|
|
|
pack_keep:1,
|
2018-04-15 17:36:13 +02:00
|
|
|
pack_keep_in_core:1,
|
2018-03-23 18:20:59 +01:00
|
|
|
freshened:1,
|
|
|
|
do_not_close:1,
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-29 18:18:56 +02:00
|
|
|
pack_promisor:1,
|
|
|
|
multi_pack_index:1;
|
2019-02-19 01:05:03 +01:00
|
|
|
unsigned char hash[GIT_MAX_RAWSZ];
|
2018-03-23 18:20:59 +01:00
|
|
|
struct revindex_entry *revindex;
|
packfile: prepare for the existence of '*.rev' files
Specify the format of the on-disk reverse index 'pack-*.rev' file, as
well as prepare the code for the existence of such files.
The reverse index maps from pack relative positions (i.e., an index into
the array of objects, which is sorted by their offsets within the
packfile) to their position within the 'pack-*.idx' file. Today, this is
done by building up a list of (off_t, uint32_t) tuples for each object
(the off_t corresponding to that object's offset, and the uint32_t
corresponding to its position in the index). To convert between pack and
index position quickly, this array of tuples is radix sorted based on
its offset.
This has two major drawbacks:
First, the in-memory cost scales linearly with the number of objects in
a pack. Each 'struct revindex_entry' is sizeof(off_t) +
sizeof(uint32_t) + padding bytes for a total of 16.
To observe this, force Git to load the reverse index by, for e.g.,
running 'git cat-file --batch-check="%(objectsize:disk)"'. When asking
for a single object in a fresh clone of the kernel, Git needs to
allocate 120+ MB of memory in order to hold the reverse index in memory.
Second, the cost to sort also scales with the size of the pack.
Luckily, this is a linear function since 'load_pack_revindex()' uses a
radix sort, but this cost still must be paid once per pack per process.
As an example, it takes ~60x longer to print the _size_ of an object as
it does to print that entire object's _contents_:
Benchmark #1: git.compile cat-file --batch <obj
Time (mean ± σ): 3.4 ms ± 0.1 ms [User: 3.3 ms, System: 2.1 ms]
Range (min … max): 3.2 ms … 3.7 ms 726 runs
Benchmark #2: git.compile cat-file --batch-check="%(objectsize:disk)" <obj
Time (mean ± σ): 210.3 ms ± 8.9 ms [User: 188.2 ms, System: 23.2 ms]
Range (min … max): 193.7 ms … 224.4 ms 13 runs
Instead, avoid computing and sorting the revindex once per process by
writing it to a file when the pack itself is generated.
The format is relatively straightforward. It contains an array of
uint32_t's, the length of which is equal to the number of objects in the
pack. The ith entry in this table contains the index position of the
ith object in the pack, where "ith object in the pack" is determined by
pack offset.
One thing that the on-disk format does _not_ contain is the full (up to)
eight-byte offset corresponding to each object. This is something that
the in-memory revindex contains (it stores an off_t in 'struct
revindex_entry' along with the same uint32_t that the on-disk format
has). Omit it in the on-disk format, since knowing the index position
for some object is sufficient to get a constant-time lookup in the
pack-*.idx file to ask for an object's offset within the pack.
This trades off between the on-disk size of the 'pack-*.rev' file for
runtime to chase down the offset for some object. Even though the lookup
is constant time, the constant is heavier, since it can potentially
involve two pointer walks in v2 indexes (one to access the 4-byte offset
table, and potentially a second to access the double wide offset table).
Consider trying to map an object's pack offset to a relative position
within that pack. In a cold-cache scenario, more page faults occur while
switching between binary searching through the reverse index and
searching through the *.idx file for an object's offset. Sure enough,
with a cold cache (writing '3' into '/proc/sys/vm/drop_caches' after
'sync'ing), printing out the entire object's contents is still
marginally faster than printing its size:
Benchmark #1: git.compile cat-file --batch-check="%(objectsize:disk)" <obj >/dev/null
Time (mean ± σ): 22.6 ms ± 0.5 ms [User: 2.4 ms, System: 7.9 ms]
Range (min … max): 21.4 ms … 23.5 ms 41 runs
Benchmark #2: git.compile cat-file --batch <obj >/dev/null
Time (mean ± σ): 17.2 ms ± 0.7 ms [User: 2.8 ms, System: 5.5 ms]
Range (min … max): 15.6 ms … 18.2 ms 45 runs
(Numbers taken in the kernel after cheating and using the next patch to
generate a reverse index). There are a couple of approaches to improve
cold cache performance not pursued here:
- We could include the object offsets in the reverse index format.
Predictably, this does result in fewer page faults, but it triples
the size of the file, while simultaneously duplicating a ton of data
already available in the .idx file. (This was the original way I
implemented the format, and it did show
`--batch-check='%(objectsize:disk)'` winning out against `--batch`.)
On the other hand, this increase in size also results in a large
block-cache footprint, which could potentially hurt other workloads.
- We could store the mapping from pack to index position in a more
cache-friendly way, like constructing a binary search tree from the
table and writing the values in breadth-first order. This would
result in much better locality, but the price you pay is trading
O(1) lookup in 'pack_pos_to_index()' for an O(log n) one (since you
can no longer directly index the table).
So, neither of these approaches are taken here. (Thankfully, the format
is versioned, so we are free to pursue these in the future.) But, cold
cache performance likely isn't interesting outside of one-off cases like
asking for the size of an object directly. In real-world usage, Git is
often performing many operations in the revindex (i.e., asking about
many objects rather than a single one).
The trade-off is worth it, since we will avoid the vast majority of the
cost of generating the revindex that the extra pointer chase will look
like noise in the following patch's benchmarks.
This patch describes the format and prepares callers (like in
pack-revindex.c) to be able to read *.rev files once they exist. An
implementation of the writer will appear in the next patch, and callers
will gradually begin to start using the writer in the patches that
follow after that.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-26 00:37:14 +01:00
|
|
|
const uint32_t *revindex_data;
|
|
|
|
const uint32_t *revindex_map;
|
|
|
|
size_t revindex_size;
|
2018-03-23 18:20:59 +01:00
|
|
|
/* something like ".git/objects/pack/xxxxx.pack" */
|
|
|
|
char pack_name[FLEX_ARRAY]; /* more */
|
|
|
|
};
|
|
|
|
|
2018-07-12 21:39:23 +02:00
|
|
|
struct multi_pack_index;
|
|
|
|
|
2019-11-27 23:24:53 +01:00
|
|
|
static inline int pack_map_entry_cmp(const void *unused_cmp_data,
|
|
|
|
const struct hashmap_entry *entry,
|
|
|
|
const struct hashmap_entry *entry2,
|
|
|
|
const void *keydata)
|
|
|
|
{
|
|
|
|
const char *key = keydata;
|
|
|
|
const struct packed_git *pg1, *pg2;
|
|
|
|
|
|
|
|
pg1 = container_of(entry, const struct packed_git, packmap_ent);
|
|
|
|
pg2 = container_of(entry2, const struct packed_git, packmap_ent);
|
|
|
|
|
|
|
|
return strcmp(pg1->pack_name, key ? key : pg2->pack_name);
|
|
|
|
}
|
|
|
|
|
2018-03-23 18:20:55 +01:00
|
|
|
struct raw_object_store {
|
|
|
|
/*
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
* Set of all object directories; the main directory is first (and
|
|
|
|
* cannot be NULL after initialization). Subsequent directories are
|
|
|
|
* alternates.
|
2018-03-23 18:20:55 +01:00
|
|
|
*/
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
struct object_directory *odb;
|
|
|
|
struct object_directory **odb_tail;
|
2021-07-08 01:10:15 +02:00
|
|
|
kh_odb_path_map_t *odb_by_path;
|
|
|
|
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
int loaded_alternates;
|
2018-03-23 18:20:55 +01:00
|
|
|
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
/*
|
|
|
|
* A list of alternate object directories loaded from the environment;
|
|
|
|
* this should not generally need to be accessed directly, but will
|
|
|
|
* populate the "odb" list when prepare_alt_odb() is run.
|
|
|
|
*/
|
2018-03-23 18:20:55 +01:00
|
|
|
char *alternate_db;
|
2018-03-23 18:20:57 +01:00
|
|
|
|
2018-04-12 02:21:05 +02:00
|
|
|
/*
|
|
|
|
* Objects that should be substituted by other objects
|
|
|
|
* (see git-replace(1)).
|
|
|
|
*/
|
2018-04-12 02:21:07 +02:00
|
|
|
struct oidmap *replace_map;
|
replace-object: make replace operations thread-safe
replace-object functions are very close to being thread-safe: the only
current racy section is the lazy initialization at
prepare_replace_object(). The following patches will protect some object
reading operations to be called threaded, but before that, replace
functions must be protected. To do so, add a mutex to struct
raw_object_store and acquire it before lazy initializing the
replace_map. This won't cause any noticeable performance drop as the
mutex will no longer be used after the replace_map is initialized.
Later, when the replace functions are called in parallel, thread
debuggers might point our use of the added replace_map_initialized flag
as a data race. However, as this boolean variable is initialized as
false and it's only updated once, there's no real harm. It's perfectly
fine if the value is updated right after a thread read it in
replace-map.h:lookup_replace_object() (there'll only be a performance
penalty for the affected threads at that moment). We could cease the
debugger warning protecting the variable reading at the said function.
However, this would negatively affect performance for all threads
calling it, at any time, so it's not really worthy since the warning
doesn't represent a real problem. Instead, to make sure we don't get
false positives (at ThreadSanitizer, at least) an entry for the
respective function is added to .tsan-suppressions.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:52 +01:00
|
|
|
unsigned replace_map_initialized : 1;
|
|
|
|
pthread_mutex_t replace_mutex; /* protect object replace functions */
|
2018-04-12 02:21:05 +02:00
|
|
|
|
2018-07-12 00:42:41 +02:00
|
|
|
struct commit_graph *commit_graph;
|
|
|
|
unsigned commit_graph_attempted : 1; /* if loading has been attempted */
|
|
|
|
|
2018-07-12 21:39:33 +02:00
|
|
|
/*
|
|
|
|
* private data
|
|
|
|
*
|
|
|
|
* should only be accessed directly by packfile.c and midx.c
|
|
|
|
*/
|
|
|
|
struct multi_pack_index *multi_pack_index;
|
|
|
|
|
2018-03-23 18:20:59 +01:00
|
|
|
/*
|
|
|
|
* private data
|
|
|
|
*
|
|
|
|
* should only be accessed directly by packfile.c
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct packed_git *packed_git;
|
|
|
|
/* A most-recently-used ordered version of the packed_git list. */
|
|
|
|
struct list_head packed_git_mru;
|
2018-03-23 18:21:01 +01:00
|
|
|
|
packfile: add kept-pack cache for find_kept_pack_entry()
In a recent patch we added a function 'find_kept_pack_entry()' to look
for an object only among kept packs.
While this function avoids doing any lookup work in non-kept packs, it
is still linear in the number of packs, since we have to traverse the
linked list of packs once per object. Let's cache a reduced version of
that list to save us time.
Note that this cache will last the lifetime of the program. We could
invalidate it on reprepare_packed_git(), but there's not much point in
being rigorous here:
- we might already fail to notice new .keep packs showing up after the
program starts. We only reprepare_packed_git() when we fail to find
an object. But adding a new pack won't cause that to happen.
Somebody repacking could add a new pack and delete an old one, but
most of the time we'd have a descriptor or mmap open to the old
pack anyway, so we might not even notice.
- in pack-objects we already cache the .keep state at startup, since
56dfeb6263 (pack-objects: compute local/ignore_pack_keep early,
2016-07-29). So this is just extending that concept further.
- we don't have to worry about any packed_git being removed; we always
keep the old structs around, even after reprepare_packed_git()
We do defensively invalidate the cache in case the set of kept packs
being asked for changes (e.g., only in-core kept packs were cached, but
suddenly the caller also wants on-disk kept packs, too). In theory we
could build all three caches and switch between them, but it's not
necessary, since this patch (and series) never changes the set of kept
packs that it wants to inspect from the cache.
So that "optimization" is more about being defensive in the face of
future changes than it is about asking for multiple kinds of kept packs
in this patch.
Here are p5303 results (as always, measured against the kernel):
Test HEAD^ HEAD
-----------------------------------------------------------------------------------------------
5303.5: repack (1) 57.34(54.66+10.88) 56.98(54.36+10.98) -0.6%
5303.6: repack with kept (1) 57.38(54.83+10.49) 57.17(54.97+10.26) -0.4%
5303.11: repack (50) 71.70(88.99+4.74) 71.62(88.48+5.08) -0.1%
5303.12: repack with kept (50) 72.58(89.61+4.78) 71.56(88.80+4.59) -1.4%
5303.17: repack (1000) 217.19(491.72+14.25) 217.31(490.82+14.53) +0.1%
5303.18: repack with kept (1000) 246.12(520.07+14.93) 217.08(490.37+15.10) -11.8%
and the --stdin-packs case, which scales a little bit better (although
not by that much even at 1,000 packs):
5303.7: repack with --stdin-packs (1) 0.00(0.00+0.00) 0.00(0.00+0.00) =
5303.13: repack with --stdin-packs (50) 3.43(11.75+0.24) 3.43(11.69+0.30) +0.0%
5303.19: repack with --stdin-packs (1000) 130.50(307.15+7.66) 125.13(301.36+8.04) -4.1%
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-02-23 03:25:23 +01:00
|
|
|
struct {
|
|
|
|
struct packed_git **packs;
|
|
|
|
unsigned flags;
|
|
|
|
} kept_pack_cache;
|
|
|
|
|
2019-11-27 23:24:53 +01:00
|
|
|
/*
|
|
|
|
* A map of packfiles to packed_git structs for tracking which
|
|
|
|
* packs have been loaded already.
|
|
|
|
*/
|
|
|
|
struct hashmap pack_map;
|
|
|
|
|
2018-03-23 18:21:02 +01:00
|
|
|
/*
|
|
|
|
* A fast, rough count of the number of objects in the repository.
|
|
|
|
* These two fields are not meant for direct access. Use
|
|
|
|
* approximate_object_count() instead.
|
|
|
|
*/
|
|
|
|
unsigned long approximate_object_count;
|
|
|
|
unsigned approximate_object_count_valid : 1;
|
|
|
|
|
2018-03-23 18:21:01 +01:00
|
|
|
/*
|
|
|
|
* Whether packed_git has already been populated with this repository's
|
|
|
|
* packs.
|
|
|
|
*/
|
|
|
|
unsigned packed_git_initialized : 1;
|
2018-03-23 18:20:55 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
struct raw_object_store *raw_object_store_new(void);
|
|
|
|
void raw_object_store_clear(struct raw_object_store *o);
|
|
|
|
|
2018-03-23 18:21:10 +01:00
|
|
|
/*
|
|
|
|
* Put in `buf` the name of the file in the local object database that
|
sha1-file: modernize loose object file functions
The loose object access code in sha1-file.c is some of the oldest in
Git, and could use some modernizing. It mostly uses "unsigned char *"
for object ids, which these days should be "struct object_id".
It also uses the term "sha1_file" in many functions, which is confusing.
The term "loose_objects" is much better. It clearly distinguishes
them from packed objects (which didn't even exist back when the name
"sha1_file" came into being). And it also distinguishes it from the
checksummed-file concept in csum-file.c (which until recently was
actually called "struct sha1file"!).
This patch converts the functions {open,close,map,stat}_sha1_file() into
open_loose_object(), etc, and switches their sha1 arguments for
object_id structs. Similarly, path functions like fill_sha1_path()
become fill_loose_path() and use object_ids.
The function sha1_loose_object_info() already says "loose", so we can
just drop the "sha1" (and teach it to use object_id).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-07 09:35:42 +01:00
|
|
|
* would be used to store a loose object with the specified oid.
|
2018-03-23 18:21:10 +01:00
|
|
|
*/
|
sha1-file: modernize loose object file functions
The loose object access code in sha1-file.c is some of the oldest in
Git, and could use some modernizing. It mostly uses "unsigned char *"
for object ids, which these days should be "struct object_id".
It also uses the term "sha1_file" in many functions, which is confusing.
The term "loose_objects" is much better. It clearly distinguishes
them from packed objects (which didn't even exist back when the name
"sha1_file" came into being). And it also distinguishes it from the
checksummed-file concept in csum-file.c (which until recently was
actually called "struct sha1file"!).
This patch converts the functions {open,close,map,stat}_sha1_file() into
open_loose_object(), etc, and switches their sha1 arguments for
object_id structs. Similarly, path functions like fill_sha1_path()
become fill_loose_path() and use object_ids.
The function sha1_loose_object_info() already says "loose", so we can
just drop the "sha1" (and teach it to use object_id).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-07 09:35:42 +01:00
|
|
|
const char *loose_object_path(struct repository *r, struct strbuf *buf,
|
|
|
|
const struct object_id *oid);
|
2018-03-23 18:21:10 +01:00
|
|
|
|
sha1-file: modernize loose object file functions
The loose object access code in sha1-file.c is some of the oldest in
Git, and could use some modernizing. It mostly uses "unsigned char *"
for object ids, which these days should be "struct object_id".
It also uses the term "sha1_file" in many functions, which is confusing.
The term "loose_objects" is much better. It clearly distinguishes
them from packed objects (which didn't even exist back when the name
"sha1_file" came into being). And it also distinguishes it from the
checksummed-file concept in csum-file.c (which until recently was
actually called "struct sha1file"!).
This patch converts the functions {open,close,map,stat}_sha1_file() into
open_loose_object(), etc, and switches their sha1 arguments for
object_id structs. Similarly, path functions like fill_sha1_path()
become fill_loose_path() and use object_ids.
The function sha1_loose_object_info() already says "loose", so we can
just drop the "sha1" (and teach it to use object_id).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-07 09:35:42 +01:00
|
|
|
void *map_loose_object(struct repository *r, const struct object_id *oid,
|
|
|
|
unsigned long *size);
|
2018-03-23 18:21:14 +01:00
|
|
|
|
2019-04-29 10:28:14 +02:00
|
|
|
void *read_object_file_extended(struct repository *r,
|
2019-04-29 10:28:23 +02:00
|
|
|
const struct object_id *oid,
|
|
|
|
enum object_type *type,
|
|
|
|
unsigned long *size, int lookup_replace);
|
2018-11-14 01:12:47 +01:00
|
|
|
/*
 * Convenience wrapper around read_object_file_extended() that always
 * passes 1 for its `lookup_replace` argument. `r`, `oid`, `type` and
 * `size` are forwarded unchanged, and the callee's return value is
 * returned as-is.
 */
static inline void *repo_read_object_file(struct repository *r,
					  const struct object_id *oid,
					  enum object_type *type,
					  unsigned long *size)
{
	return read_object_file_extended(r, oid, type, size, 1);
}
|
2018-11-14 01:12:47 +01:00
|
|
|
#ifndef NO_THE_REPOSITORY_COMPATIBILITY_MACROS
|
|
|
|
#define read_object_file(oid, type, size) repo_read_object_file(the_repository, oid, type, size)
|
|
|
|
#endif
|
2018-05-16 01:42:15 +02:00
|
|
|
|
|
|
|
/* Read and unpack an object file into memory, write memory to an object file */
|
|
|
|
int oid_object_info(struct repository *r, const struct object_id *, unsigned long *);
|
|
|
|
|
2022-02-05 00:48:24 +01:00
|
|
|
void hash_object_file(const struct git_hash_algo *algo, const void *buf,
|
2022-02-05 00:48:32 +01:00
|
|
|
unsigned long len, enum object_type type,
|
2022-02-05 00:48:24 +01:00
|
|
|
struct object_id *oid);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
2021-10-12 16:30:49 +02:00
|
|
|
int write_object_file_flags(const void *buf, unsigned long len,
|
2022-02-05 00:48:26 +01:00
|
|
|
enum object_type type, struct object_id *oid,
|
2021-10-12 16:30:49 +02:00
|
|
|
unsigned flags);
|
|
|
|
static inline int write_object_file(const void *buf, unsigned long len,
|
2022-02-05 00:48:26 +01:00
|
|
|
enum object_type type, struct object_id *oid)
|
2021-10-12 16:30:49 +02:00
|
|
|
{
|
|
|
|
return write_object_file_flags(buf, len, type, oid, 0);
|
|
|
|
}
|
2018-05-16 01:42:15 +02:00
|
|
|
|
2022-02-05 00:48:31 +01:00
|
|
|
int write_object_file_literally(const void *buf, unsigned long len,
|
|
|
|
const char *type, struct object_id *oid,
|
|
|
|
unsigned flags);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
2020-01-04 01:13:31 +01:00
|
|
|
/*
|
|
|
|
* Add an object file to the in-memory object store, without writing it
|
|
|
|
* to disk.
|
|
|
|
*
|
|
|
|
* Callers are responsible for calling write_object_file to record the
|
|
|
|
* object in persistent storage before writing any other new objects
|
|
|
|
* that reference it.
|
|
|
|
*/
|
2019-04-29 10:28:14 +02:00
|
|
|
int pretend_object_file(void *, unsigned long, enum object_type,
|
2019-04-29 10:28:23 +02:00
|
|
|
struct object_id *oid);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
2019-04-29 10:28:14 +02:00
|
|
|
int force_object_loose(const struct object_id *oid, time_t mtime);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Open the loose object at path, check its hash, and return the contents;
|
2021-10-01 11:16:52 +02:00
|
|
|
* use the "oi" argument to assert things about the object, or e.g. populate its
|
2018-05-16 01:42:15 +02:00
|
|
|
* type, and size. If the object is a blob, then "contents" may return NULL,
|
|
|
|
* to allow streaming of large blobs.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, negative on error (details may be written to stderr).
|
|
|
|
*/
|
|
|
|
int read_loose_object(const char *path,
|
|
|
|
const struct object_id *expected_oid,
|
fsck: report invalid object type-path combinations
Improve the error that's emitted in cases where we find a loose object
we parse, but which isn't at the location we expect it to be.
Before this change we'd prefix the error with a not-a-OID derived from
the path at which the object was found, due to an emergent behavior in
how we'd end up with an "OID" in these codepaths.
Now we'll instead say what object we hashed, and what path it was
found at. Before this patch series e.g.:
$ git hash-object --stdin -w -t blob </dev/null
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
$ mv objects/e6/ objects/e7
Would emit ("[...]" used to abbreviate the OIDs):
git fsck
error: hash mismatch for ./objects/e7/9d[...] (expected e79d[...])
error: e79d[...]: object corrupt or missing: ./objects/e7/9d[...]
Now we'll instead emit:
error: e69d[...]: hash-path mismatch, found at: ./objects/e7/9d[...]
Furthermore, we'll do the right thing when the object type and its
location are bad. I.e. this case:
$ git hash-object --stdin -w -t garbage --literally </dev/null
8315a83d2acc4c174aed59430f9a9c4ed926440f
$ mv objects/83 objects/84
As noted in an earlier commits we'd simply die early in those cases,
until preceding commits fixed the hard die on invalid object type:
$ git fsck
fatal: invalid object type
Now we'll instead emit sensible error messages:
$ git fsck
error: 8315[...]: hash-path mismatch, found at: ./objects/84/15[...]
error: 8315[...]: object is of unknown type 'garbage': ./objects/84/15[...]
In both fsck.c and object-file.c we're using null_oid as a sentinel
value for checking whether we got far enough to be certain that the
issue was indeed this OID mismatch.
We need to add the "object corrupt or missing" special-case to deal
with cases where read_loose_object() will return an error before
completing check_object_signature(), e.g. if we have an error in
unpack_loose_rest() because we find garbage after the valid gzip
content:
$ git hash-object --stdin -w -t blob </dev/null
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
$ chmod 755 objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391
$ echo garbage >>objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391
$ git fsck
error: garbage at end of loose object 'e69d[...]'
error: unable to unpack contents of ./objects/e6/9d[...]
error: e69d[...]: object corrupt or missing: ./objects/e6/9d[...]
There is currently some weird messaging in the edge case when the two
are combined, i.e. because we're not explicitly passing along an error
state about this specific scenario from check_stream_oid() via
read_loose_object() we'll end up printing the null OID if an object is
of an unknown type *and* it can't be unpacked by zlib, e.g.:
$ git hash-object --stdin -w -t garbage --literally </dev/null
8315a83d2acc4c174aed59430f9a9c4ed926440f
$ chmod 755 objects/83/15a83d2acc4c174aed59430f9a9c4ed926440f
$ echo garbage >>objects/83/15a83d2acc4c174aed59430f9a9c4ed926440f
$ /usr/bin/git fsck
fatal: invalid object type
$ ~/g/git/git fsck
error: garbage at end of loose object '8315a83d2acc4c174aed59430f9a9c4ed926440f'
error: unable to unpack contents of ./objects/83/15a83d2acc4c174aed59430f9a9c4ed926440f
error: 8315a83d2acc4c174aed59430f9a9c4ed926440f: object corrupt or missing: ./objects/83/15a83d2acc4c174aed59430f9a9c4ed926440f
error: 0000000000000000000000000000000000000000: object is of unknown type 'garbage': ./objects/83/15a83d2acc4c174aed59430f9a9c4ed926440f
[...]
I think it's OK to leave that for future improvements, which would
involve enum-ifying more error state as we've done with "enum
unpack_loose_header_result" in preceding commits. In these
increasingly more obscure cases the worst that can happen is that
we'll get slightly nonsensical or inapplicable error messages.
There's other such potential edge cases, all of which might produce
some confusing messaging, but still be handled correctly as far as
passing along errors goes. E.g. if check_object_signature() returns
and oideq(real_oid, null_oid()) is true, which could happen if it
returns -1 due to the read_istream() call having failed.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-01 11:16:53 +02:00
|
|
|
struct object_id *real_oid,
|
2021-10-01 11:16:52 +02:00
|
|
|
void **contents,
|
|
|
|
struct object_info *oi);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
2020-08-06 01:06:49 +02:00
|
|
|
/* Retry packed storage after checking packed and loose storage */
|
|
|
|
#define HAS_OBJECT_RECHECK_PACKED 1
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns 1 if the object exists. This function will not lazily fetch objects
|
|
|
|
* in a partial clone.
|
|
|
|
*/
|
|
|
|
int has_object(struct repository *r, const struct object_id *oid,
|
|
|
|
unsigned flags);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These macros and functions are deprecated. If checking existence for an
|
|
|
|
* object that is likely to be missing and/or whose absence is relatively
|
|
|
|
* inconsequential (or is consequential but the caller is prepared to handle
|
|
|
|
* it), use has_object(), which has better defaults (no lazy fetch in a partial
|
|
|
|
* clone and no rechecking of packed storage). In the unlikely event that a
|
|
|
|
* caller needs to assert existence of an object that it fully expects to
|
|
|
|
* exist, and wants to trigger a lazy fetch in a partial clone, use
|
|
|
|
* oid_object_info_extended() with a NULL struct object_info.
|
|
|
|
*
|
|
|
|
* These functions can be removed once all callers have migrated to
|
|
|
|
* has_object() and/or oid_object_info_extended().
|
|
|
|
*/
|
2018-11-14 01:12:48 +01:00
|
|
|
#ifndef NO_THE_REPOSITORY_COMPATIBILITY_MACROS
|
|
|
|
#define has_sha1_file_with_flags(sha1, flags) repo_has_sha1_file_with_flags(the_repository, sha1, flags)
|
|
|
|
#define has_sha1_file(sha1) repo_has_sha1_file(the_repository, sha1)
|
|
|
|
#endif
|
|
|
|
int repo_has_object_file(struct repository *r, const struct object_id *oid);
|
|
|
|
int repo_has_object_file_with_flags(struct repository *r,
|
|
|
|
const struct object_id *oid, int flags);
|
|
|
|
#ifndef NO_THE_REPOSITORY_COMPATIBILITY_MACROS
|
|
|
|
#define has_object_file(oid) repo_has_object_file(the_repository, oid)
|
|
|
|
#define has_object_file_with_flags(oid, flags) repo_has_object_file_with_flags(the_repository, oid, flags)
|
|
|
|
#endif
|
2018-05-16 01:42:15 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Return true iff an alternate object database has a loose object
|
|
|
|
* with the specified name. This function does not respect replace
|
|
|
|
* references.
|
|
|
|
*/
|
2019-04-29 10:28:14 +02:00
|
|
|
int has_loose_object_nonlocal(const struct object_id *);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
2022-02-05 00:48:25 +01:00
|
|
|
/**
|
|
|
|
* format_object_header() is a thin wrapper around xsnprintf() that
|
|
|
|
* writes the initial "<type> <obj-len>" part of the loose object
|
|
|
|
* header. It returns the size that snprintf() returns + 1.
|
|
|
|
*/
|
|
|
|
int format_object_header(char *str, size_t size, enum object_type type,
|
|
|
|
size_t objsize);
|
|
|
|
|
2019-04-29 10:28:14 +02:00
|
|
|
void assert_oid_type(const struct object_id *oid, enum object_type expect);
|
2018-05-16 01:42:15 +02:00
|
|
|
|
object-store: allow threaded access to object reading
Allow object reading to be performed by multiple threads protecting it
with an internal lock, the obj_read_mutex. The lock usage can be toggled
with enable_obj_read_lock() and disable_obj_read_lock(). Currently, the
functions which can be safely called in parallel are:
read_object_file_extended(), repo_read_object_file(),
read_object_file(), read_object_with_reference(), read_object(),
oid_object_info() and oid_object_info_extended(). It's also possible
to use obj_read_lock() and obj_read_unlock() to protect other sections
that cannot execute in parallel with object reading.
Probably there are many spots in the functions listed above that could
be executed unlocked (and thus, in parallel). But, for now, we are most
interested in allowing parallel access to zlib inflation. This is one of
the sections where object reading spends most of the time in (e.g. up to
one-third of git-grep's execution time in the chromium repo corresponds
to inflation) and it's already thread-safe. So, to take advantage of
that, the obj_read_mutex is released when calling git_inflate() and
re-acquired right after, for every calling spot in
oid_object_info_extended()'s call chain. We may refine this lock to also
exploit other possible parallel spots in the future, but for now,
threaded zlib inflation should already give great speedups for threaded
object reading callers.
Note that add_delta_base_cache() was also modified to skip adding
already present entries to the cache. This wasn't possible before, but
it would be now, with the parallel inflation. Take for example the
following situation, where two threads - A and B - are executing the
code at unpack_entry():
1. Thread A is performing the decompression of a base O (which is not
yet in the cache) at PHASE II. Thread B is simultaneously trying to
unpack O, but just starting at PHASE I.
2. Since O is not yet in the cache, B will go to PHASE II to also
perform the decompression.
3. When they finish decompressing, one of them will get the object
reading mutex and go to PHASE III while the other waits for the
mutex. Let’s say A got the mutex first.
4. Thread A will add O to the cache, go throughout the rest of PHASE III
and return.
5. Thread B gets the mutex, also add O to the cache (if the check wasn't
there) and returns.
Finally, it is also important to highlight that the object reading lock
can only ensure thread-safety in the mentioned functions thanks to two
complementary mechanisms: the use of 'struct raw_object_store's
replace_mutex, which guards sections in the object reading machinery
that would otherwise be thread-unsafe; and the 'struct pack_window's
inuse_cnt, which protects window reading operations (such as the one
performed during the inflation of a packed object), allowing them to
execute without the acquisition of the obj_read_mutex.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:53 +01:00
|
|
|
/*
|
|
|
|
* Enabling the object read lock allows multiple threads to safely call the
|
|
|
|
* following functions in parallel: repo_read_object_file(), read_object_file(),
|
|
|
|
* read_object_file_extended(), read_object_with_reference(), read_object(),
|
|
|
|
* oid_object_info() and oid_object_info_extended().
|
|
|
|
*
|
|
|
|
* obj_read_lock() and obj_read_unlock() may also be used to protect other
|
|
|
|
* sections which cannot execute in parallel with object reading. Since the used
|
|
|
|
* lock is a recursive mutex, these sections can even contain calls to object
|
|
|
|
* reading functions. However, beware that in these cases zlib inflation won't
|
|
|
|
* be performed in parallel, losing performance.
|
|
|
|
*
|
|
|
|
* TODO: oid_object_info_extended()'s call stack has a recursive behavior. If
|
|
|
|
* any of its callees end up calling it, this recursive call won't benefit from
|
|
|
|
* parallel inflation.
|
|
|
|
*/
|
|
|
|
void enable_obj_read_lock(void);
|
|
|
|
void disable_obj_read_lock(void);
|
|
|
|
|
|
|
|
extern int obj_read_use_lock;
|
|
|
|
extern pthread_mutex_t obj_read_mutex;
|
|
|
|
|
|
|
|
static inline void obj_read_lock(void)
|
|
|
|
{
|
|
|
|
if(obj_read_use_lock)
|
|
|
|
pthread_mutex_lock(&obj_read_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void obj_read_unlock(void)
|
|
|
|
{
|
|
|
|
if(obj_read_use_lock)
|
|
|
|
pthread_mutex_unlock(&obj_read_mutex);
|
|
|
|
}
|
|
|
|
|
2018-05-16 01:42:15 +02:00
|
|
|
/*
 * Request/response record passed to oid_object_info_extended().
 *
 * An all-zero struct (see OBJECT_INFO_INIT) asks for no items; set a
 * "Request" pointer to have the corresponding item reported.
 */
struct object_info {
	/* Request */
	enum object_type *typep;
	unsigned long *sizep;
	off_t *disk_sizep;
	struct object_id *delta_base_oid;
	struct strbuf *type_name;
	void **contentp;

	/* Response */
	/* NOTE(review): presumably records where the object was found -- confirm. */
	enum {
		OI_CACHED,
		OI_LOOSE,
		OI_PACKED,
		OI_DBCACHED
	} whence;
	union {
		/*
		 * struct {
		 *	... Nothing to expose in this case
		 * } cached;
		 * struct {
		 *	... Nothing to expose in this case
		 * } loose;
		 */
		/* NOTE(review): presumably only meaningful for OI_PACKED -- confirm. */
		struct {
			struct packed_git *pack;
			off_t offset;
			unsigned int is_delta;
		} packed;
	} u;
};
|
|
|
|
|
|
|
|
/*
 * Initializer for a "struct object_info" that wants no items. You may
 * also memset() the memory to all-zeroes.
 */
#define OBJECT_INFO_INIT { 0 }
|
2018-05-16 01:42:15 +02:00
|
|
|
|
|
|
|
/*
 * OBJECT_INFO_* bit flags (presumably OR-ed together into the "flags"
 * argument of oid_object_info_extended() -- confirm at the call sites).
 * Bit value 4 is not assigned to any flag defined here.
 */

/* Invoke lookup_replace_object() on the given hash */
#define OBJECT_INFO_LOOKUP_REPLACE 1
/* Allow reading from a loose object file of unknown/bogus type */
#define OBJECT_INFO_ALLOW_UNKNOWN_TYPE 2
/* Do not retry packed storage after checking packed and loose storage */
#define OBJECT_INFO_QUICK 8
/* Do not check loose objects */
#define OBJECT_INFO_IGNORE_LOOSE 16
/*
 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
 * nonzero).
 */
#define OBJECT_INFO_SKIP_FETCH_OBJECT 32
/*
 * This is meant for bulk prefetching of missing blobs in a partial
 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
 */
#define OBJECT_INFO_FOR_PREFETCH (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK)
|
2018-05-16 01:42:15 +02:00
|
|
|
|
|
|
|
/*
 * Query information about the object "oid" in repository "r" through the
 * request/response record "oi". "flags" is presumably a combination of the
 * OBJECT_INFO_* bits defined in this header -- confirm at the definition.
 * The meaning of the return value is not visible from this declaration;
 * see the definition.
 */
int oid_object_info_extended(struct repository *r,
			     const struct object_id *oid,
			     struct object_info *oi, unsigned flags);
|
|
|
|
|
2018-08-14 20:21:18 +02:00
|
|
|
/*
|
|
|
|
* Iterate over the files in the loose-object parts of the object
|
|
|
|
* directory "path", triggering the following callbacks:
|
|
|
|
*
|
|
|
|
* - loose_object is called for each loose object we find.
|
|
|
|
*
|
|
|
|
* - loose_cruft is called for any files that do not appear to be
|
|
|
|
* loose objects. Note that we only look in the loose object
|
|
|
|
* directories "objects/[0-9a-f]{2}/", so we will not report
|
|
|
|
* "objects/foobar" as cruft.
|
|
|
|
*
|
|
|
|
* - loose_subdir is called for each top-level hashed subdirectory
|
|
|
|
* of the object directory (e.g., "$OBJDIR/f0"). It is called
|
|
|
|
* after the objects in the directory are processed.
|
|
|
|
*
|
|
|
|
* Any callback that is NULL will be ignored. Callbacks returning non-zero
|
|
|
|
* will end the iteration.
|
|
|
|
*
|
|
|
|
* In the "buf" variant, "path" is a strbuf which will also be used as a
|
|
|
|
* scratch buffer, but restored to its original contents before
|
|
|
|
* the function returns.
|
|
|
|
*/
|
|
|
|
/*
 * Callback types for the loose-object directory iterators. A callback that
 * returns non-zero ends the iteration.
 */

/* Called for each loose object found. */
typedef int each_loose_object_fn(const struct object_id *oid,
				 const char *path,
				 void *data);
/* Called for any file that does not appear to be a loose object. */
typedef int each_loose_cruft_fn(const char *basename,
				const char *path,
				void *data);
/*
 * Called for each top-level hashed subdirectory of the object directory,
 * after the objects in that directory have been processed.
 */
typedef int each_loose_subdir_fn(unsigned int nr,
				 const char *path,
				 void *data);
|
|
|
|
int for_each_file_in_obj_subdir(unsigned int subdir_nr,
|
|
|
|
struct strbuf *path,
|
|
|
|
each_loose_object_fn obj_cb,
|
|
|
|
each_loose_cruft_fn cruft_cb,
|
|
|
|
each_loose_subdir_fn subdir_cb,
|
|
|
|
void *data);
|
|
|
|
int for_each_loose_file_in_objdir(const char *path,
|
|
|
|
each_loose_object_fn obj_cb,
|
|
|
|
each_loose_cruft_fn cruft_cb,
|
|
|
|
each_loose_subdir_fn subdir_cb,
|
|
|
|
void *data);
|
|
|
|
int for_each_loose_file_in_objdir_buf(struct strbuf *path,
|
|
|
|
each_loose_object_fn obj_cb,
|
|
|
|
each_loose_cruft_fn cruft_cb,
|
|
|
|
each_loose_subdir_fn subdir_cb,
|
|
|
|
void *data);
|
|
|
|
|
|
|
|
/* Flags for for_each_*_object() below. Each flag is a distinct bit. */
enum for_each_object_flags {
	/* Iterate only over local objects, not alternates. */
	FOR_EACH_OBJECT_LOCAL_ONLY = (1<<0),

	/* Only iterate over packs obtained from the promisor remote. */
	FOR_EACH_OBJECT_PROMISOR_ONLY = (1<<1),

	/*
	 * Visit objects within a pack in packfile order rather than .idx order
	 */
	FOR_EACH_OBJECT_PACK_ORDER = (1<<2),

	/* Only iterate over packs that are not marked as kept in-core. */
	FOR_EACH_OBJECT_SKIP_IN_CORE_KEPT_PACKS = (1<<3),

	/* Only iterate over packs that do not have .keep files. */
	FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4),
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Iterate over all accessible loose objects without respect to
|
|
|
|
* reachability. By default, this includes both local and alternate objects.
|
|
|
|
* The order in which objects are visited is unspecified.
|
|
|
|
*
|
|
|
|
* Any flags specific to packs are ignored.
|
|
|
|
*/
|
|
|
|
int for_each_loose_object(each_loose_object_fn, void *,
|
|
|
|
enum for_each_object_flags flags);
|
|
|
|
|
|
|
|
/*
 * Iterate over all accessible packed objects without respect to reachability.
 * By default, this includes both local and alternate packs.
 *
 * Note that some objects may appear twice if they are found in multiple packs.
 * Each pack is visited in an unspecified order. By default, objects within a
 * pack are visited in pack-idx order (i.e., sorted by oid).
 */
typedef int each_packed_object_fn(const struct object_id *oid,
				  struct packed_git *pack,
				  uint32_t pos,
				  void *data);
int for_each_object_in_pack(struct packed_git *p,
			    each_packed_object_fn cb, void *data,
			    enum for_each_object_flags flags);
int for_each_packed_object(each_packed_object_fn cb, void *data,
			   enum for_each_object_flags flags);
|
|
|
|
|
2018-03-23 18:20:55 +01:00
|
|
|
#endif /* OBJECT_STORE_H */
|