2005-04-08 00:16:10 +02:00
|
|
|
/*
|
|
|
|
* GIT - The information manager from hell
|
|
|
|
*
|
|
|
|
* Copyright (C) Linus Torvalds, 2005
|
|
|
|
*/
|
2005-04-08 00:13:13 +02:00
|
|
|
#include "cache.h"
|
2017-06-14 20:07:36 +02:00
|
|
|
#include "config.h"
|
2018-07-01 03:24:55 +02:00
|
|
|
#include "diff.h"
|
|
|
|
#include "diffcore.h"
|
2015-08-10 11:47:45 +02:00
|
|
|
#include "tempfile.h"
|
2014-10-01 12:28:42 +02:00
|
|
|
#include "lockfile.h"
|
2006-04-25 06:18:58 +02:00
|
|
|
#include "cache-tree.h"
|
2007-04-10 06:20:29 +02:00
|
|
|
#include "refs.h"
|
2007-08-11 23:59:01 +02:00
|
|
|
#include "dir.h"
|
2018-05-16 01:42:15 +02:00
|
|
|
#include "object-store.h"
|
2008-07-21 10:24:17 +02:00
|
|
|
#include "tree.h"
|
|
|
|
#include "commit.h"
|
2008-08-21 10:44:53 +02:00
|
|
|
#include "blob.h"
|
2009-12-25 09:30:51 +01:00
|
|
|
#include "resolve-undo.h"
|
2019-02-15 18:59:21 +01:00
|
|
|
#include "run-command.h"
|
2012-04-04 00:53:15 +02:00
|
|
|
#include "strbuf.h"
|
|
|
|
#include "varint.h"
|
2014-06-13 14:19:36 +02:00
|
|
|
#include "split-index.h"
|
2014-12-16 00:15:20 +01:00
|
|
|
#include "utf8.h"
|
2017-09-22 18:35:40 +02:00
|
|
|
#include "fsmonitor.h"
|
2018-10-10 17:59:36 +02:00
|
|
|
#include "thread-utils.h"
|
2018-09-15 19:56:04 +02:00
|
|
|
#include "progress.h"
|
2006-04-25 06:18:58 +02:00
|
|
|
|
2012-07-11 11:22:37 +02:00
|
|
|
/* Mask for the name length in ce_flags in the on-disk index */
|
|
|
|
|
|
|
|
#define CE_NAMEMASK (0x0fff)
|
|
|
|
|
2006-04-25 06:18:58 +02:00
|
|
|
/* Index extensions.
|
|
|
|
*
|
|
|
|
* The first letter should be 'A'..'Z' for extensions that are not
|
|
|
|
* necessary for a correct operation (i.e. optimization data).
|
|
|
|
* When new extensions are added that _need_ to be understood in
|
|
|
|
* order to correctly interpret the index file, pick a character that
|
|
|
|
* is outside the range, to cause the reader to abort.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Pack the four bytes of extension signature `s` into one 32-bit value,
 * first byte most significant (so "TREE" yields 0x54524545).
 *
 * Each byte is converted to unsigned char and widened to unsigned int
 * before shifting, so a byte >= 0x80 neither sign-extends into the
 * higher bytes nor shifts into the sign bit of an int.  `s` is
 * parenthesized so pointer expressions such as `buf + off` expand
 * correctly.  Note that `s` is evaluated four times.
 */
#define CACHE_EXT(s) ( ((unsigned int)(unsigned char)(s)[0]<<24)| \
		       ((unsigned int)(unsigned char)(s)[1]<<16)| \
		       ((unsigned int)(unsigned char)(s)[2]<<8)| \
		       ((unsigned int)(unsigned char)(s)[3]) )
|
|
|
|
#define CACHE_EXT_TREE 0x54524545 /* "TREE" */
|
2010-02-02 16:33:28 +01:00
|
|
|
#define CACHE_EXT_RESOLVE_UNDO 0x52455543 /* "REUC" */
|
2014-06-13 14:19:36 +02:00
|
|
|
#define CACHE_EXT_LINK 0x6c696e6b /* "link" */
|
2015-03-08 11:12:33 +01:00
|
|
|
#define CACHE_EXT_UNTRACKED 0x554E5452 /* "UNTR" */
|
2017-09-22 18:35:40 +02:00
|
|
|
#define CACHE_EXT_FSMONITOR 0x46534D4E /* "FSMN" */
|
2018-10-10 17:59:34 +02:00
|
|
|
#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945 /* "EOIE" */
|
2018-10-10 17:59:37 +02:00
|
|
|
#define CACHE_EXT_INDEXENTRYOFFSETTABLE 0x49454F54 /* "IEOT" */
|
2014-06-13 14:19:36 +02:00
|
|
|
|
|
|
|
/* changes that can be kept in $GIT_DIR/index (basically all extensions) */
|
2014-06-13 14:19:37 +02:00
|
|
|
#define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
|
2014-06-13 14:19:44 +02:00
|
|
|
CE_ENTRY_ADDED | CE_ENTRY_REMOVED | CE_ENTRY_CHANGED | \
|
2017-09-22 18:35:40 +02:00
|
|
|
SPLIT_INDEX_ORDERED | UNTRACKED_CHANGED | FSMONITOR_CHANGED)
|
2005-04-08 00:13:13 +02:00
|
|
|
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated by malloc() calls. This can be mitigated by allocating a
large block of memory and managing it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is an estimate of the pathname length in the index. We use
|
|
|
|
* this for V4 index files to guess the un-deltafied size of the index
|
|
|
|
* in memory because of pathname deltafication. This is not required
|
|
|
|
* for V2/V3 index formats because their pathnames are not compressed.
|
|
|
|
* If the initial amount of memory set aside is not sufficient, the
|
|
|
|
* mem pool will allocate extra memory.
|
|
|
|
*/
|
|
|
|
#define CACHE_ENTRY_PATH_LENGTH 80
|
|
|
|
|
|
|
|
static inline struct cache_entry *mem_pool__ce_alloc(struct mem_pool *mem_pool, size_t len)
|
|
|
|
{
|
|
|
|
struct cache_entry *ce;
|
|
|
|
ce = mem_pool_alloc(mem_pool, cache_entry_size(len));
|
|
|
|
ce->mem_pool_allocated = 1;
|
|
|
|
return ce;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct cache_entry *mem_pool__ce_calloc(struct mem_pool *mem_pool, size_t len)
|
|
|
|
{
|
|
|
|
struct cache_entry * ce;
|
|
|
|
ce = mem_pool_calloc(mem_pool, 1, cache_entry_size(len));
|
|
|
|
ce->mem_pool_allocated = 1;
|
|
|
|
return ce;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct mem_pool *find_mem_pool(struct index_state *istate)
|
|
|
|
{
|
|
|
|
struct mem_pool **pool_ptr;
|
|
|
|
|
|
|
|
if (istate->split_index && istate->split_index->base)
|
|
|
|
pool_ptr = &istate->split_index->base->ce_mem_pool;
|
|
|
|
else
|
|
|
|
pool_ptr = &istate->ce_mem_pool;
|
|
|
|
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
if (!*pool_ptr) {
|
|
|
|
*pool_ptr = xmalloc(sizeof(**pool_ptr));
|
|
|
|
mem_pool_init(*pool_ptr, 0);
|
|
|
|
}
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this stae.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
|
|
|
|
return *pool_ptr;
|
|
|
|
}
|
|
|
|
|
2014-06-13 14:19:24 +02:00
|
|
|
static const char *alternate_index_output;
|
2006-07-26 06:32:18 +02:00
|
|
|
|
2008-01-23 08:01:13 +01:00
|
|
|
static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce)
|
|
|
|
{
|
|
|
|
istate->cache[nr] = ce;
|
2008-03-21 21:16:24 +01:00
|
|
|
add_name_hash(istate, ce);
|
2008-01-23 08:01:13 +01:00
|
|
|
}
|
|
|
|
|
Create pathname-based hash-table lookup into index
This creates a hash index of every single file added to the index.
Right now that hash index isn't actually used for much: I implemented a
"cache_name_exists()" function that uses it to efficiently look up a
filename in the index without having to do the O(logn) binary search,
but quite frankly, that's not why this patch is interesting.
No, the whole and only reason to create the hash of the filenames in the
index is that by modifying the hash function, you can fairly easily do
things like making it always hash equivalent names into the same bucket.
That, in turn, means that suddenly questions like "does this name exist
in the index under an _equivalent_ name?" becomes much much cheaper.
Guiding principles behind this patch:
- it shouldn't be too costly. In fact, my primary goal here was to
actually speed up "git commit" with a fully populated kernel tree, by
being faster at checking whether a file already existed in the index. I
did succeed, but only barely:
Best before:
[torvalds@woody linux]$ time git commit > /dev/null
real 0m0.255s
user 0m0.168s
sys 0m0.088s
Best after:
[torvalds@woody linux]$ time ~/git/git commit > /dev/null
real 0m0.233s
user 0m0.144s
sys 0m0.088s
so some things are actually faster (~8%).
Caveat: that's really the best case. Other things are invariably going
to be slightly slower, since we populate that index cache, and quite
frankly, few things really use it to look things up.
That said, the cost is really quite small. The worst case is probably
doing a "git ls-files", which will do very little except populate the
index, and never actually looks anything up in it, just lists it.
Before:
[torvalds@woody linux]$ time git ls-files > /dev/null
real 0m0.016s
user 0m0.016s
sys 0m0.000s
After:
[torvalds@woody linux]$ time ~/git/git ls-files > /dev/null
real 0m0.021s
user 0m0.012s
sys 0m0.008s
and while the thing has really gotten relatively much slower, we're
still talking about something almost unmeasurable (eg 5ms). And that
really should be pretty much the worst case.
So we lose 5ms on one "benchmark", but win 22ms on another. Pick your
poison - this patch has the advantage that it will _likely_ speed up
the cases that are complex and expensive more than it slows down the
cases that are already so fast that nobody cares. But if you look at
relative speedups/slowdowns, it doesn't look so good.
- It should be simple and clean
The code may be a bit subtle (the reasons I do hash removal the way I
do etc), but it re-uses the existing hash.c files, so it really is
fairly small and straightforward apart from a few odd details.
Now, this patch on its own doesn't really do much, but I think it's worth
looking at, if only because if done correctly, the name hashing really can
make an improvement to the whole issue of "do we have a filename that
looks like this in the index already". And at least it gets real testing
by being used even by default (ie there is a real use-case for it even
without any insane filesystems).
NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just
not ashamed of it enough to really care. I took all the numbers out of my
nether regions - I'm sure it's good enough that it works in practice, but
the whole point was that you can make a really much fancier hash that
hashes characters not directly, but by their upper-case value or something
like that, and thus you get a case-insensitive hash, while still keeping
the name and the index itself totally case sensitive.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-23 03:41:14 +01:00
|
|
|
static void replace_index_entry(struct index_state *istate, int nr, struct cache_entry *ce)
|
|
|
|
{
|
|
|
|
struct cache_entry *old = istate->cache[nr];
|
|
|
|
|
2014-06-13 14:19:39 +02:00
|
|
|
replace_index_entry_in_base(istate, old, ce);
|
2013-02-28 00:57:48 +01:00
|
|
|
remove_name_hash(istate, old);
|
2018-07-02 21:49:31 +02:00
|
|
|
discard_cache_entry(old);
|
2018-03-15 16:25:20 +01:00
|
|
|
ce->ce_flags &= ~CE_HASHED;
|
Fix name re-hashing semantics
We handled the case of removing and re-inserting cache entries badly,
which is something that merging commonly needs to do (removing the
different stages, and then re-inserting one of them as the merged
state).
We even had a rather ugly special case for this failure case, where
replace_index_entry() basically turned itself into a no-op if the new
and the old entries were the same, exactly because the hash routines
didn't handle it on their own.
So what this patch does is to not just have the UNHASHED bit, but a
HASHED bit too, and when you insert an entry into the name hash, that
involves:
- clear the UNHASHED bit, because now it's valid again for lookup
(which is really all that UNHASHED meant)
- if we're being lazy, we're done here (but we still want to clear the
UNHASHED bit regardless of lazy mode, since we can become unlazy
later, and so we need the UNHASHED bit to always be set correctly,
even if we never actually insert the entry into the hash list)
- if it was already hashed, we just leave it on the list
- otherwise mark it HASHED and insert it into the list
this all means that unhashing and rehashing a name all just works
automatically. Obviously, you cannot change the name of an entry (that
would be a serious bug), but nothing can validly do that anyway (you'd
have to allocate a new struct cache_entry anyway since the name length
could change), so that's not a new limitation.
The code actually gets simpler in many ways, although the lazy hashing
does mean that there are a few odd cases (ie something can be marked
unhashed even though it was never on the hash in the first place, and
isn't actually marked hashed!).
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-02-23 05:37:40 +01:00
|
|
|
set_index_entry(istate, nr, ce);
|
2014-06-13 14:19:39 +02:00
|
|
|
ce->ce_flags |= CE_UPDATE_IN_BASE;
|
2017-09-22 18:35:40 +02:00
|
|
|
mark_fsmonitor_invalid(istate, ce);
|
2014-06-13 14:19:27 +02:00
|
|
|
istate->cache_changed |= CE_ENTRY_CHANGED;
|
Create pathname-based hash-table lookup into index
This creates a hash index of every single file added to the index.
Right now that hash index isn't actually used for much: I implemented a
"cache_name_exists()" function that uses it to efficiently look up a
filename in the index without having to do the O(logn) binary search,
but quite frankly, that's not why this patch is interesting.
No, the whole and only reason to create the hash of the filenames in the
index is that by modifying the hash function, you can fairly easily do
things like making it always hash equivalent names into the same bucket.
That, in turn, means that suddenly questions like "does this name exist
in the index under an _equivalent_ name?" becomes much much cheaper.
Guiding principles behind this patch:
- it shouldn't be too costly. In fact, my primary goal here was to
actually speed up "git commit" with a fully populated kernel tree, by
being faster at checking whether a file already existed in the index. I
did succeed, but only barely:
Best before:
[torvalds@woody linux]$ time git commit > /dev/null
real 0m0.255s
user 0m0.168s
sys 0m0.088s
Best after:
[torvalds@woody linux]$ time ~/git/git commit > /dev/null
real 0m0.233s
user 0m0.144s
sys 0m0.088s
so some things are actually faster (~8%).
Caveat: that's really the best case. Other things are invariably going
to be slightly slower, since we populate that index cache, and quite
frankly, few things really use it to look things up.
That said, the cost is really quite small. The worst case is probably
doing a "git ls-files", which will do very little except puopulate the
index, and never actually looks anything up in it, just lists it.
Before:
[torvalds@woody linux]$ time git ls-files > /dev/null
real 0m0.016s
user 0m0.016s
sys 0m0.000s
After:
[torvalds@woody linux]$ time ~/git/git ls-files > /dev/null
real 0m0.021s
user 0m0.012s
sys 0m0.008s
and while the thing has really gotten relatively much slower, we're
still talking about something almost unmeasurable (eg 5ms). And that
really should be pretty much the worst case.
So we lose 5ms on one "benchmark", but win 22ms on another. Pick your
poison - this patch has the advantage that it will _likely_ speed up
the cases that are complex and expensive more than it slows down the
cases that are already so fast that nobody cares. But if you look at
relative speedups/slowdowns, it doesn't look so good.
- It should be simple and clean
The code may be a bit subtle (the reasons I do hash removal the way I
do etc), but it re-uses the existing hash.c files, so it really is
fairly small and straightforward apart from a few odd details.
Now, this patch on its own doesn't really do much, but I think it's worth
looking at, if only because if done correctly, the name hashing really can
make an improvement to the whole issue of "do we have a filename that
looks like this in the index already". And at least it gets real testing
by being used even by default (ie there is a real use-case for it even
without any insane filesystems).
NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just
not ashamed of it enough to really care. I took all the numbers out of my
nether regions - I'm sure it's good enough that it works in practice, but
the whole point was that you can make a really much fancier hash that
hashes characters not directly, but by their upper-case value or something
like that, and thus you get a case-insensitive hash, while still keeping
the name and the index itself totally case sensitive.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-23 03:41:14 +01:00
|
|
|
}
|
|
|
|
|
2008-07-21 02:25:56 +02:00
|
|
|
void rename_index_entry_at(struct index_state *istate, int nr, const char *new_name)
|
|
|
|
{
|
2018-02-14 19:59:45 +01:00
|
|
|
struct cache_entry *old_entry = istate->cache[nr], *new_entry;
|
2008-07-21 02:25:56 +02:00
|
|
|
int namelen = strlen(new_name);
|
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
new_entry = make_empty_cache_entry(istate, namelen);
|
2018-02-14 19:59:45 +01:00
|
|
|
copy_cache_entry(new_entry, old_entry);
|
|
|
|
new_entry->ce_flags &= ~CE_HASHED;
|
|
|
|
new_entry->ce_namelen = namelen;
|
|
|
|
new_entry->index = 0;
|
|
|
|
memcpy(new_entry->name, new_name, namelen + 1);
|
2008-07-21 02:25:56 +02:00
|
|
|
|
2018-02-14 19:59:45 +01:00
|
|
|
cache_tree_invalidate_path(istate, old_entry->name);
|
|
|
|
untracked_cache_remove_from_index(istate, old_entry->name);
|
2008-07-21 02:25:56 +02:00
|
|
|
remove_index_entry_at(istate, nr);
|
2018-02-14 19:59:45 +01:00
|
|
|
add_index_entry(istate, new_entry, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
|
2008-07-21 02:25:56 +02:00
|
|
|
}
|
|
|
|
|
2013-06-20 10:37:50 +02:00
|
|
|
void fill_stat_data(struct stat_data *sd, struct stat *st)
|
|
|
|
{
|
|
|
|
sd->sd_ctime.sec = (unsigned int)st->st_ctime;
|
|
|
|
sd->sd_mtime.sec = (unsigned int)st->st_mtime;
|
|
|
|
sd->sd_ctime.nsec = ST_CTIME_NSEC(*st);
|
|
|
|
sd->sd_mtime.nsec = ST_MTIME_NSEC(*st);
|
|
|
|
sd->sd_dev = st->st_dev;
|
|
|
|
sd->sd_ino = st->st_ino;
|
|
|
|
sd->sd_uid = st->st_uid;
|
|
|
|
sd->sd_gid = st->st_gid;
|
|
|
|
sd->sd_size = st->st_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
int match_stat_data(const struct stat_data *sd, struct stat *st)
|
|
|
|
{
|
|
|
|
int changed = 0;
|
|
|
|
|
|
|
|
if (sd->sd_mtime.sec != (unsigned int)st->st_mtime)
|
|
|
|
changed |= MTIME_CHANGED;
|
|
|
|
if (trust_ctime && check_stat &&
|
|
|
|
sd->sd_ctime.sec != (unsigned int)st->st_ctime)
|
|
|
|
changed |= CTIME_CHANGED;
|
|
|
|
|
|
|
|
#ifdef USE_NSEC
|
|
|
|
if (check_stat && sd->sd_mtime.nsec != ST_MTIME_NSEC(*st))
|
|
|
|
changed |= MTIME_CHANGED;
|
|
|
|
if (trust_ctime && check_stat &&
|
|
|
|
sd->sd_ctime.nsec != ST_CTIME_NSEC(*st))
|
|
|
|
changed |= CTIME_CHANGED;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (check_stat) {
|
|
|
|
if (sd->sd_uid != (unsigned int) st->st_uid ||
|
|
|
|
sd->sd_gid != (unsigned int) st->st_gid)
|
|
|
|
changed |= OWNER_CHANGED;
|
|
|
|
if (sd->sd_ino != (unsigned int) st->st_ino)
|
|
|
|
changed |= INODE_CHANGED;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef USE_STDEV
|
|
|
|
/*
|
|
|
|
* st_dev breaks on network filesystems where different
|
|
|
|
* clients will have different views of what "device"
|
|
|
|
* the filesystem is on
|
|
|
|
*/
|
|
|
|
if (check_stat && sd->sd_dev != (unsigned int) st->st_dev)
|
|
|
|
changed |= INODE_CHANGED;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (sd->sd_size != (unsigned int) st->st_size)
|
|
|
|
changed |= DATA_CHANGED;
|
|
|
|
|
|
|
|
return changed;
|
|
|
|
}
|
|
|
|
|
2005-05-15 23:23:12 +02:00
|
|
|
/*
|
|
|
|
* This only updates the "non-critical" parts of the directory
|
|
|
|
* cache, ie the parts that aren't tracked by GIT, and only used
|
|
|
|
* to validate the cache.
|
|
|
|
*/
|
2019-05-24 14:23:47 +02:00
|
|
|
void fill_stat_cache_info(struct index_state *istate, struct cache_entry *ce, struct stat *st)
|
2005-05-15 23:23:12 +02:00
|
|
|
{
|
2013-06-20 10:37:50 +02:00
|
|
|
fill_stat_data(&ce->ce_stat_data, st);
|
2006-02-09 06:15:24 +01:00
|
|
|
|
|
|
|
if (assume_unchanged)
|
2008-01-15 01:03:17 +01:00
|
|
|
ce->ce_flags |= CE_VALID;
|
2008-01-19 08:45:24 +01:00
|
|
|
|
2017-09-22 18:35:40 +02:00
|
|
|
if (S_ISREG(st->st_mode)) {
|
2008-01-19 08:45:24 +01:00
|
|
|
ce_mark_uptodate(ce);
|
mark_fsmonitor_valid(): mark the index as changed if needed
Without this bug fix, t7519's four "status doesn't detect unreported
modifications" test cases would fail occasionally (and, oddly enough,
*a lot* more frequently on Windows).
The reason is that these test cases intentionally use the side effect of
`git status` to re-write the index if any updates were detected: they
first clean the worktree, run `git status` to update the index as well
as show the output to the casual reader, then make the worktree dirty
again and expect no changes to reported if running with a mocked
fsmonitor hook.
The problem with this strategy was that the index was written during
said `git status` on the clean worktree for the *wrong* reason: not
because the index was marked as changed (it wasn't), but because the
recorded mtimes were racy with the index' own mtime.
As the mtime granularity on Windows is 100 nanoseconds (see e.g.
https://docs.microsoft.com/en-us/windows/desktop/SysInfo/file-times),
the mtimes of the files are often enough *not* racy with the index', so
that that `git status` call currently does not always update the index
(including the fsmonitor extension), causing the test case to fail.
The obvious fix: if we change *any* index entry's `CE_FSMONITOR_VALID`
flag, we should also mark the index as changed. That will cause the
index to be written upon `git status`, *including* an updated fsmonitor
extension.
Side note: Even though the reader might think that the t7519 issue
should be *much* more prevalent on Linux, given that the ext4 filesystem
(that seems to be used by every Linux distribution) stores mtimes in
nanosecond precision. However, ext4 uses `current_kernel_time()` (see
https://unix.stackexchange.com/questions/11599#comment762968_11599; it
is *amazingly* hard to find any proper source of information about such
ext4 questions) whose accuracy seems to depend on many factors but is
safely worse than the 100-nanosecond granularity of NTFS (again, it is
*horribly* hard to find anything remotely authoritative about this
question). So it seems that the racy index condition that hid the bug
fixed by this patch simply is a lot more likely on Linux than on
Windows. But not impossible ;-)
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-05-24 14:23:48 +02:00
|
|
|
mark_fsmonitor_valid(istate, ce);
|
2017-09-22 18:35:40 +02:00
|
|
|
}
|
2005-05-15 23:23:12 +02:00
|
|
|
}
|
|
|
|
|
2018-09-21 17:57:31 +02:00
|
|
|
static int ce_compare_data(struct index_state *istate,
|
|
|
|
const struct cache_entry *ce,
|
|
|
|
struct stat *st)
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwich the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
{
|
|
|
|
int match = -1;
|
2016-10-28 15:23:07 +02:00
|
|
|
int fd = git_open_cloexec(ce->name, O_RDONLY);
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwich the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
|
|
|
|
if (fd >= 0) {
|
2017-08-20 22:09:27 +02:00
|
|
|
struct object_id oid;
|
2018-09-21 17:57:31 +02:00
|
|
|
if (!index_fd(istate, &oid, fd, st, OBJ_BLOB, ce->name, 0))
|
2018-08-28 23:22:59 +02:00
|
|
|
match = !oideq(&oid, &ce->oid);
|
2006-07-31 18:55:15 +02:00
|
|
|
/* index_fd() closed the file descriptor already */
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwich the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
}
|
|
|
|
return match;
|
|
|
|
}
|
|
|
|
|
2013-06-02 17:46:52 +02:00
|
|
|
static int ce_compare_link(const struct cache_entry *ce, size_t expected_size)
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
{
|
|
|
|
int match = -1;
|
|
|
|
void *buffer;
|
|
|
|
unsigned long size;
|
2007-02-26 20:55:59 +01:00
|
|
|
enum object_type type;
|
2008-12-17 18:47:27 +01:00
|
|
|
struct strbuf sb = STRBUF_INIT;
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
|
2008-12-17 18:47:27 +01:00
|
|
|
if (strbuf_readlink(&sb, ce->name, expected_size))
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
return -1;
|
2008-12-17 18:47:27 +01:00
|
|
|
|
sha1_file: convert read_sha1_file to struct object_id
Convert read_sha1_file to take a pointer to struct object_id and rename
it read_object_file. Do the same for read_sha1_file_extended.
Convert one use in grep.c to use the new function without any other code
change, since the pointer being passed is a void pointer that is already
initialized with a pointer to struct object_id. Update the declaration
and definitions of the modified functions, and apply the following
semantic patch to convert the remaining callers:
@@
expression E1, E2, E3;
@@
- read_sha1_file(E1.hash, E2, E3)
+ read_object_file(&E1, E2, E3)
@@
expression E1, E2, E3;
@@
- read_sha1_file(E1->hash, E2, E3)
+ read_object_file(E1, E2, E3)
@@
expression E1, E2, E3, E4;
@@
- read_sha1_file_extended(E1.hash, E2, E3, E4)
+ read_object_file_extended(&E1, E2, E3, E4)
@@
expression E1, E2, E3, E4;
@@
- read_sha1_file_extended(E1->hash, E2, E3, E4)
+ read_object_file_extended(E1, E2, E3, E4)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-03-12 03:27:53 +01:00
|
|
|
buffer = read_object_file(&ce->oid, &type, &size);
|
2008-12-17 18:47:27 +01:00
|
|
|
if (buffer) {
|
|
|
|
if (size == sb.len)
|
|
|
|
match = memcmp(buffer, sb.buf, size);
|
|
|
|
free(buffer);
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
}
|
2008-12-17 18:47:27 +01:00
|
|
|
strbuf_release(&sb);
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
return match;
|
|
|
|
}
|
|
|
|
|
2013-06-02 17:46:52 +02:00
|
|
|
static int ce_compare_gitlink(const struct cache_entry *ce)
|
2007-04-10 06:20:29 +02:00
|
|
|
{
|
2017-10-16 00:07:06 +02:00
|
|
|
struct object_id oid;
|
2007-04-10 06:20:29 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't actually require that the .git directory
|
2007-05-21 22:08:28 +02:00
|
|
|
* under GITLINK directory be a valid git directory. It
|
2007-04-10 06:20:29 +02:00
|
|
|
* might even be missing (in case nobody populated that
|
|
|
|
* sub-project).
|
|
|
|
*
|
|
|
|
* If so, we consider it always to match.
|
|
|
|
*/
|
refs: convert resolve_gitlink_ref to struct object_id
Convert the declaration and definition of resolve_gitlink_ref to use
struct object_id and apply the following semantic patch:
@@
expression E1, E2, E3;
@@
- resolve_gitlink_ref(E1, E2, E3.hash)
+ resolve_gitlink_ref(E1, E2, &E3)
@@
expression E1, E2, E3;
@@
- resolve_gitlink_ref(E1, E2, E3->hash)
+ resolve_gitlink_ref(E1, E2, E3)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-10-16 00:07:07 +02:00
|
|
|
if (resolve_gitlink_ref(ce->name, "HEAD", &oid) < 0)
|
2007-04-10 06:20:29 +02:00
|
|
|
return 0;
|
2018-08-28 23:22:59 +02:00
|
|
|
return !oideq(&oid, &ce->oid);
|
2007-04-10 06:20:29 +02:00
|
|
|
}
|
|
|
|
|
2018-09-21 17:57:31 +02:00
|
|
|
static int ce_modified_check_fs(struct index_state *istate,
|
|
|
|
const struct cache_entry *ce,
|
|
|
|
struct stat *st)
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
{
|
|
|
|
switch (st->st_mode & S_IFMT) {
|
|
|
|
case S_IFREG:
|
2018-09-21 17:57:31 +02:00
|
|
|
if (ce_compare_data(istate, ce, st))
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
return DATA_CHANGED;
|
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
2007-03-07 02:44:37 +01:00
|
|
|
if (ce_compare_link(ce, xsize_t(st->st_size)))
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
return DATA_CHANGED;
|
|
|
|
break;
|
2007-04-13 18:24:13 +02:00
|
|
|
case S_IFDIR:
|
2008-01-15 01:03:17 +01:00
|
|
|
if (S_ISGITLINK(ce->ce_mode))
|
2008-07-29 10:13:44 +02:00
|
|
|
return ce_compare_gitlink(ce) ? DATA_CHANGED : 0;
|
consistently use "fallthrough" comments in switches
Gcc 7 adds -Wimplicit-fallthrough, which can warn when a
switch case falls through to the next case. The general idea
is that the compiler can't tell if this was intentional or
not, so you should annotate any intentional fall-throughs as
such, leaving it to complain about any unannotated ones.
There's a GNU __attribute__ which can be used for
annotation, but of course we'd have to #ifdef it away on
non-gcc compilers. Gcc will also recognize
specially-formatted comments, which matches our current
practice. Let's extend that practice to all of the
unannotated sites (which I did look over and verify that
they were behaving as intended).
Ideally in each case we'd actually give some reasons in the
comment about why we're falling through, or what we're
falling through to. And gcc does support that with
-Wimplicit-fallthrough=2, which relaxes the comment pattern
matching to anything that contains "fallthrough" (or a
variety of spelling variants). However, this isn't the
default for -Wimplicit-fallthrough, nor for -Wextra. In the
name of simplicity, it's probably better for us to support
the default level, which requires "fallthrough" to be the
only thing in the comment (modulo some window dressing like
"else" and some punctuation; see the gcc manual for the
complete set of patterns).
This patch suppresses all warnings due to
-Wimplicit-fallthrough. We might eventually want to add that
to the DEVELOPER Makefile knob, but we should probably wait
until gcc 7 is more widely adopted (since earlier versions
will complain about the unknown warning type).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-21 08:25:41 +02:00
|
|
|
/* else fallthrough */
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
default:
|
|
|
|
return TYPE_CHANGED;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-06-02 17:46:52 +02:00
|
|
|
static int ce_match_stat_basic(const struct cache_entry *ce, struct stat *st)
|
2005-04-09 18:48:20 +02:00
|
|
|
{
|
|
|
|
unsigned int changed = 0;
|
|
|
|
|
2008-01-15 01:03:17 +01:00
|
|
|
if (ce->ce_flags & CE_REMOVE)
|
|
|
|
return MODE_CHANGED | DATA_CHANGED | TYPE_CHANGED;
|
|
|
|
|
|
|
|
switch (ce->ce_mode & S_IFMT) {
|
2005-05-05 14:38:25 +02:00
|
|
|
case S_IFREG:
|
|
|
|
changed |= !S_ISREG(st->st_mode) ? TYPE_CHANGED : 0;
|
2005-10-12 03:45:33 +02:00
|
|
|
/* We consider only the owner x bit to be relevant for
|
|
|
|
* "mode changes"
|
|
|
|
*/
|
|
|
|
if (trust_executable_bit &&
|
2008-01-15 01:03:17 +01:00
|
|
|
(0100 & (ce->ce_mode ^ st->st_mode)))
|
2005-05-06 15:45:01 +02:00
|
|
|
changed |= MODE_CHANGED;
|
2005-05-05 14:38:25 +02:00
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
2007-03-02 22:11:30 +01:00
|
|
|
if (!S_ISLNK(st->st_mode) &&
|
|
|
|
(has_symlinks || !S_ISREG(st->st_mode)))
|
|
|
|
changed |= TYPE_CHANGED;
|
2005-05-05 14:38:25 +02:00
|
|
|
break;
|
2007-05-21 22:08:28 +02:00
|
|
|
case S_IFGITLINK:
|
2008-07-29 10:13:44 +02:00
|
|
|
/* We ignore most of the st_xxx fields for gitlinks */
|
2007-04-10 06:20:29 +02:00
|
|
|
if (!S_ISDIR(st->st_mode))
|
|
|
|
changed |= TYPE_CHANGED;
|
|
|
|
else if (ce_compare_gitlink(ce))
|
|
|
|
changed |= DATA_CHANGED;
|
2007-04-13 18:24:13 +02:00
|
|
|
return changed;
|
2005-05-05 14:38:25 +02:00
|
|
|
default:
|
2018-11-10 06:16:04 +01:00
|
|
|
BUG("unsupported ce_mode: %o", ce->ce_mode);
|
2005-05-05 14:38:25 +02:00
|
|
|
}
|
2005-05-23 00:08:15 +02:00
|
|
|
|
2013-06-20 10:37:50 +02:00
|
|
|
changed |= match_stat_data(&ce->ce_stat_data, st);
|
2005-09-20 00:11:15 +02:00
|
|
|
|
2008-06-10 19:44:43 +02:00
|
|
|
/* Racily smudged entry? */
|
2013-06-20 10:37:50 +02:00
|
|
|
if (!ce->ce_stat_data.sd_size) {
|
2016-09-05 22:07:52 +02:00
|
|
|
if (!is_empty_blob_sha1(ce->oid.hash))
|
2008-06-10 19:44:43 +02:00
|
|
|
changed |= DATA_CHANGED;
|
|
|
|
}
|
|
|
|
|
2005-12-20 21:12:18 +01:00
|
|
|
return changed;
|
|
|
|
}
|
|
|
|
|
2015-03-08 11:12:36 +01:00
|
|
|
static int is_racy_stat(const struct index_state *istate,
|
|
|
|
const struct stat_data *sd)
|
2008-01-21 09:44:50 +01:00
|
|
|
{
|
2015-03-08 11:12:36 +01:00
|
|
|
return (istate->timestamp.sec &&
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanonsecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result sould be a litle less used CPU time, less
pagefaults and a litle faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks gets faster, this patch would be more and more helpfull as times
go by. For a fast Solid State Disk, this patch should be helpfull.
Note that, when file operations starts to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
#ifdef USE_NSEC
|
|
|
|
/* nanosecond timestamped files can also be racy! */
|
2015-03-08 11:12:36 +01:00
|
|
|
(istate->timestamp.sec < sd->sd_mtime.sec ||
|
|
|
|
(istate->timestamp.sec == sd->sd_mtime.sec &&
|
|
|
|
istate->timestamp.nsec <= sd->sd_mtime.nsec))
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanonsecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result sould be a litle less used CPU time, less
pagefaults and a litle faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks gets faster, this patch would be more and more helpfull as times
go by. For a fast Solid State Disk, this patch should be helpfull.
Note that, when file operations starts to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
#else
|
2015-03-08 11:12:36 +01:00
|
|
|
istate->timestamp.sec <= sd->sd_mtime.sec
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanonsecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result sould be a litle less used CPU time, less
pagefaults and a litle faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks gets faster, this patch would be more and more helpfull as times
go by. For a fast Solid State Disk, this patch should be helpfull.
Note that, when file operations starts to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
#endif
|
2015-03-08 11:12:36 +01:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
split-index: smudge and add racily clean cache entries to split index
Ever since the split index feature was introduced [1], refreshing a
split index is prone to a variant of the classic racy git problem.
Consider the following sequence of commands updating the split index
when the shared index contains a racily clean cache entry, i.e. an
entry whose cached stat data matches with the corresponding file in
the worktree and the cached mtime matches that of the index:
echo "cached content" >file
git update-index --split-index --add file
echo "dirty worktree" >file # size stays the same!
# ... wait ...
git update-index --add other-file
Normally, when a non-split index is updated, then do_write_index()
(the function responsible for writing all kinds of indexes, "regular",
split, and shared) recognizes racily clean cache entries, and writes
them with smudged stat data, i.e. with file size set to 0. When
subsequent git commands read the index, they will notice that the
smudged stat data doesn't match with the file in the worktree, and
then go on to check the file's content and notice its dirtiness.
In the above example, however, in the second 'git update-index'
prepare_to_write_split_index() decides which cache entries stored only
in the shared index should be replaced in the new split index. Alas,
this function never looks out for racily clean cache entries, and
since the file's stat data in the worktree hasn't changed since the
shared index was written, it won't be replaced in the new split index.
Consequently, do_write_index() doesn't even get this racily clean
cache entry, and can't smudge its stat data. Subsequent git commands
will then see that the index has more recent mtime than the file and
that the (not smudged) cached stat data still matches with the file in
the worktree, and, ultimately, will erroneously consider the file
clean.
Modify prepare_to_write_split_index() to recognize racily clean cache
entries, and mark them to be added to the split index. Note that
there are two places where it should check raciness: first those cache
entries that are only stored in the shared index, and then those that
have been copied by unpack_trees() from the shared index while it
constructed a new index. This way do_write_index() will get these
racily clean cache entries as well, and will then write them with
smudged stat data to the new split index.
This change makes all tests in 't1701-racy-split-index.sh' pass, so
flip the two 'test_expect_failure' tests to success. Also add the '#'
(as in nr. of trial) to those tests' description that were omitted
when the tests expected failure.
Note that after this change if the index is split when it contains a
racily clean cache entry, then a smudged cache entry will be written
both to the new shared and to the new split indexes. This doesn't
affect regular git commands: as far as they are concerned this is just
an entry in the split index replacing an outdated entry in the shared
index. It did affect a few tests in 't1700-split-index.sh', though,
because they actually check which entries are stored in the split
index; a previous patch in this series has already made the necessary
adjustments in 't1700'. And racily clean cache entries and index
splitting are rare enough to not worry about the resulting duplicated
smudged cache entries, and the additional complexity required to
prevent them is not worth it.
Several tests failed occasionally when the test suite was run with
'GIT_TEST_SPLIT_INDEX=yes'. Here are those that I managed to trace
back to this racy split index problem, starting with those failing
more frequently, with a link to a failing Travis CI build job for
each. The highlighted line [2] shows when the racy file was written,
which is not always in the failing test but in a preceding setup
test.
t3903-stash.sh:
https://travis-ci.org/git/git/jobs/385542084#L5858
t4024-diff-optimize-common.sh:
https://travis-ci.org/git/git/jobs/386531969#L3174
t4015-diff-whitespace.sh:
https://travis-ci.org/git/git/jobs/360797600#L8215
t2200-add-update.sh:
https://travis-ci.org/git/git/jobs/382543426#L3051
t0090-cache-tree.sh:
https://travis-ci.org/git/git/jobs/416583010#L3679
There might be others, e.g. perhaps 't1000-read-tree-m-3way.sh' and
others using 'lib-read-tree-m-3way.sh', but I couldn't confirm yet.
[1] In the branch leading to the merge commit v2.1.0-rc0~45 (Merge
branch 'nd/split-index', 2014-07-16).
[2] Note that those highlighted lines are in the 'after failure' fold,
and your browser might unhelpfully fold it up before you could
take a good look.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-10-11 11:43:09 +02:00
|
|
|
int is_racy_timestamp(const struct index_state *istate,
|
2015-03-08 11:12:36 +01:00
|
|
|
const struct cache_entry *ce)
|
|
|
|
{
|
|
|
|
return (!S_ISGITLINK(ce->ce_mode) &&
|
|
|
|
is_racy_stat(istate, &ce->ce_stat_data));
|
2008-01-21 09:44:50 +01:00
|
|
|
}
|
|
|
|
|
2015-03-08 11:12:37 +01:00
|
|
|
int match_stat_data_racy(const struct index_state *istate,
|
|
|
|
const struct stat_data *sd, struct stat *st)
|
|
|
|
{
|
|
|
|
if (is_racy_stat(istate, sd))
|
|
|
|
return MTIME_CHANGED;
|
|
|
|
return match_stat_data(sd, st);
|
2008-01-21 09:44:50 +01:00
|
|
|
}
|
|
|
|
|
2017-09-22 18:35:40 +02:00
|
|
|
int ie_match_stat(struct index_state *istate,
|
2013-06-02 17:46:52 +02:00
|
|
|
const struct cache_entry *ce, struct stat *st,
|
2007-11-10 09:15:03 +01:00
|
|
|
unsigned int options)
|
2005-12-20 21:12:18 +01:00
|
|
|
{
|
2006-02-09 06:15:24 +01:00
|
|
|
unsigned int changed;
|
2007-11-10 09:15:03 +01:00
|
|
|
int ignore_valid = options & CE_MATCH_IGNORE_VALID;
|
2009-12-14 12:43:58 +01:00
|
|
|
int ignore_skip_worktree = options & CE_MATCH_IGNORE_SKIP_WORKTREE;
|
2007-11-10 09:15:03 +01:00
|
|
|
int assume_racy_is_modified = options & CE_MATCH_RACY_IS_DIRTY;
|
2017-09-22 18:35:40 +02:00
|
|
|
int ignore_fsmonitor = options & CE_MATCH_IGNORE_FSMONITOR;
|
2006-02-09 06:15:24 +01:00
|
|
|
|
2017-09-22 18:35:40 +02:00
|
|
|
if (!ignore_fsmonitor)
|
|
|
|
refresh_fsmonitor(istate);
|
2006-02-09 06:15:24 +01:00
|
|
|
/*
|
|
|
|
* If it's marked as always valid in the index, it's
|
|
|
|
* valid whatever the checked-out copy says.
|
2009-12-14 12:43:58 +01:00
|
|
|
*
|
|
|
|
* skip-worktree has the same effect with higher precedence
|
2006-02-09 06:15:24 +01:00
|
|
|
*/
|
2009-12-14 12:43:58 +01:00
|
|
|
if (!ignore_skip_worktree && ce_skip_worktree(ce))
|
|
|
|
return 0;
|
2008-01-15 01:03:17 +01:00
|
|
|
if (!ignore_valid && (ce->ce_flags & CE_VALID))
|
2006-02-09 06:15:24 +01:00
|
|
|
return 0;
|
2017-09-22 18:35:40 +02:00
|
|
|
if (!ignore_fsmonitor && (ce->ce_flags & CE_FSMONITOR_VALID))
|
|
|
|
return 0;
|
2006-02-09 06:15:24 +01:00
|
|
|
|
2008-11-29 04:56:34 +01:00
|
|
|
/*
|
|
|
|
* Intent-to-add entries have not been added, so the index entry
|
|
|
|
* by definition never matches what is in the work tree until it
|
|
|
|
* actually gets added.
|
|
|
|
*/
|
2015-08-22 03:08:05 +02:00
|
|
|
if (ce_intent_to_add(ce))
|
2008-11-29 04:56:34 +01:00
|
|
|
return DATA_CHANGED | TYPE_CHANGED | MODE_CHANGED;
|
|
|
|
|
2006-02-09 06:15:24 +01:00
|
|
|
changed = ce_match_stat_basic(ce, st);
|
2005-12-20 21:12:18 +01:00
|
|
|
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
/*
|
|
|
|
* Within 1 second of this sequence:
|
|
|
|
* echo xyzzy >file && git-update-index --add file
|
|
|
|
* running this command:
|
|
|
|
* echo frotz >file
|
|
|
|
* would give a falsely clean cache entry. The mtime and
|
|
|
|
* length match the cache, and other stat fields do not change.
|
|
|
|
*
|
|
|
|
* We could detect this at update-index time (the cache entry
|
|
|
|
* being registered/updated records the same time as "now")
|
|
|
|
* and delay the return from git-update-index, but that would
|
|
|
|
* effectively mean we can make at most one commit per second,
|
|
|
|
* which is not acceptable. Instead, we check cache entries
|
|
|
|
* whose mtime are the same as the index file timestamp more
|
2006-02-09 06:15:24 +01:00
|
|
|
* carefully than others.
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
*/
|
2008-01-21 09:44:50 +01:00
|
|
|
if (!changed && is_racy_timestamp(istate, ce)) {
|
2006-08-16 06:38:07 +02:00
|
|
|
if (assume_racy_is_modified)
|
|
|
|
changed |= DATA_CHANGED;
|
|
|
|
else
|
2018-09-21 17:57:31 +02:00
|
|
|
changed |= ce_modified_check_fs(istate, ce, st);
|
2006-08-16 06:38:07 +02:00
|
|
|
}
|
2005-09-20 00:11:15 +02:00
|
|
|
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
return changed;
|
2005-09-20 00:11:15 +02:00
|
|
|
}
|
|
|
|
|
2017-09-22 18:35:40 +02:00
|
|
|
int ie_modified(struct index_state *istate,
|
2013-06-02 17:46:52 +02:00
|
|
|
const struct cache_entry *ce,
|
|
|
|
struct stat *st, unsigned int options)
|
2005-09-20 00:11:15 +02:00
|
|
|
{
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
int changed, changed_fs;
|
2007-11-10 09:15:03 +01:00
|
|
|
|
|
|
|
changed = ie_match_stat(istate, ce, st, options);
|
2005-09-20 00:11:15 +02:00
|
|
|
if (!changed)
|
|
|
|
return 0;
|
|
|
|
/*
|
|
|
|
* If the mode or type has changed, there's no point in trying
|
|
|
|
* to refresh the entry - it's not going to match
|
|
|
|
*/
|
|
|
|
if (changed & (MODE_CHANGED | TYPE_CHANGED))
|
|
|
|
return changed;
|
|
|
|
|
2008-07-29 10:13:44 +02:00
|
|
|
/*
|
|
|
|
* Immediately after read-tree or update-index --cacheinfo,
|
|
|
|
* the length field is zero, as we have never even read the
|
|
|
|
* lstat(2) information once, and we cannot trust DATA_CHANGED
|
|
|
|
* returned by ie_match_stat() which in turn was returned by
|
|
|
|
* ce_match_stat_basic() to signal that the filesize of the
|
|
|
|
* blob changed. We have to actually go to the filesystem to
|
|
|
|
* see if the contents match, and if so, should answer "unchanged".
|
|
|
|
*
|
|
|
|
* The logic does not apply to gitlinks, as ce_match_stat_basic()
|
|
|
|
* already has checked the actual HEAD from the filesystem in the
|
|
|
|
* subproject. If ie_match_stat() already said it is different,
|
|
|
|
* then we know it is.
|
2005-09-20 00:11:15 +02:00
|
|
|
*/
|
2008-07-29 10:13:44 +02:00
|
|
|
if ((changed & DATA_CHANGED) &&
|
2013-06-20 10:37:50 +02:00
|
|
|
(S_ISGITLINK(ce->ce_mode) || ce->ce_stat_data.sd_size != 0))
|
2005-09-20 00:11:15 +02:00
|
|
|
return changed;
|
|
|
|
|
2018-09-21 17:57:31 +02:00
|
|
|
changed_fs = ce_modified_check_fs(istate, ce, st);
|
Racy GIT
This fixes the longstanding "Racy GIT" problem, which was pretty
much there from the beginning of time, but was first
demonstrated by Pasky in this message on October 24, 2005:
http://marc.theaimsgroup.com/?l=git&m=113014629716878
If you run the following sequence of commands:
echo frotz >infocom
git update-index --add infocom
echo xyzzy >infocom
so that the second update to file "infocom" does not change
st_mtime, what is recorded as the stat information for the cache
entry "infocom" exactly matches what is on the filesystem
(owner, group, inum, mtime, ctime, mode, length). After this
sequence, we incorrectly think "infocom" file still has string
"frotz" in it, and get really confused. E.g. git-diff-files
would say there is no change, git-update-index --refresh would
not even look at the filesystem to correct the situation.
Some ways of working around this issue were already suggested by
Linus in the same thread on the same day, including waiting
until the next second before returning from update-index if a
cache entry written out has the current timestamp, but that
means we can make at most one commit per second, and given that
the e-mail patch workflow used by Linus needs to process at
least 5 commits per second, it is not an acceptable solution.
Linus notes that git-apply is primarily used to update the index
while processing e-mailed patches, which is true, and
git-apply's up-to-date check is fooled by the same problem but
luckily in the other direction, so it is not really a big issue,
but still it is disturbing.
The function ce_match_stat() is called to bypass the comparison
against filesystem data when the stat data recorded in the cache
entry matches what stat() returns from the filesystem. This
patch tackles the problem by changing it to actually go to the
filesystem data for cache entries that have the same mtime as
the index file itself. This works as long as the index file and
working tree files are on the filesystems that share the same
monotonic clock. Files on network mounted filesystems sometimes
get skewed timestamps compared to "date" output, but as long as
working tree files' timestamps are skewed the same way as the
index file's, this approach still works. The only problematic
files are the ones that have the same timestamp as the index
file's, because two file updates that sandwitch the index file
update must happen within the same second to trigger the
problem.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2005-12-20 09:02:15 +01:00
|
|
|
if (changed_fs)
|
|
|
|
return changed | changed_fs;
|
2005-09-20 00:11:15 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-05-20 18:09:18 +02:00
|
|
|
/*
 * Compare two names the way entries inside a tree object are ordered.
 *
 * The first min(len1, len2) bytes are compared with memcmp(); if they
 * differ, that result is returned as-is.  Otherwise the byte following
 * the common prefix in each name decides the order.  When that byte is
 * NUL and the corresponding mode satisfies S_ISDIR(), it is treated as
 * '/' instead, so a directory "proj" sorts after a file "proj.c"
 * (because '.' < '/').  Only S_ISDIR modes receive this treatment.
 *
 * Both names are read at index min(len1, len2), so each buffer must be
 * readable at that offset (e.g. be NUL-terminated).
 *
 * Returns the non-zero memcmp() result when the prefixes differ,
 * otherwise -1, 0 or 1.
 */
int base_name_compare(const char *name1, int len1, int mode1,
		      const char *name2, int len2, int mode2)
{
	unsigned char c1, c2;
	int len = len1 < len2 ? len1 : len2;
	int cmp;

	cmp = memcmp(name1, name2, len);
	if (cmp)
		return cmp;

	/* The common prefix matched; look at the byte just past it. */
	c1 = name1[len];
	c2 = name2[len];

	/* A directory whose name ends here sorts as if it ended in '/'. */
	if (!c1 && S_ISDIR(mode1))
		c1 = '/';
	if (!c2 && S_ISDIR(mode2))
		c2 = '/';

	return (c1 < c2) ? -1 : (c1 > c2) ? 1 : 0;
}
|
|
|
|
|
2008-03-06 03:25:10 +01:00
|
|
|
/*
 * df_name_compare() is identical to base_name_compare(), except it
 * compares conflicting directory/file entries as equal. Note that
 * while a directory name compares as equal to a regular file, they
 * then individually compare _differently_ to a filename that has
 * a dot after the basename (because '\0' < '.' < '/').
 *
 * This is used by routines that want to traverse the git namespace
 * but then handle conflicting entries together when possible.
 *
 * Both names are read at index min(len1, len2) when the lengths
 * differ, so each buffer must be readable at that offset (e.g. be
 * NUL-terminated).
 *
 * Returns the non-zero memcmp() result when the common prefixes
 * differ; 0 when the names are considered equal; otherwise the
 * difference between the deciding bytes (negative or positive).
 */
int df_name_compare(const char *name1, int len1, int mode1,
		    const char *name2, int len2, int mode2)
{
	int len = len1 < len2 ? len1 : len2, cmp;
	unsigned char c1, c2;

	cmp = memcmp(name1, name2, len);
	if (cmp)
		return cmp;
	/* Directories and files compare equal (same length, same name) */
	if (len1 == len2)
		return 0;

	/* A directory whose name ends at the prefix sorts as if it ended in '/'. */
	c1 = name1[len];
	if (!c1 && S_ISDIR(mode1))
		c1 = '/';
	c2 = name2[len];
	if (!c2 && S_ISDIR(mode2))
		c2 = '/';

	/* "name/..." on one side and a non-directory "name" on the other: equal. */
	if (c1 == '/' && !c2)
		return 0;
	if (c2 == '/' && !c1)
		return 0;
	return c1 - c2;
}
|
|
|
|
|
2014-06-20 04:06:44 +02:00
|
|
|
/*
 * Compare two counted byte strings.
 *
 * The first min(len1, len2) bytes are compared with memcmp(); if they
 * differ, that result is returned as-is.  When one name is a prefix of
 * the other, the shorter one sorts first.
 *
 * Returns the non-zero memcmp() result when the common prefixes
 * differ, otherwise -1, 0 or 1 according to the lengths.
 */
int name_compare(const char *name1, size_t len1, const char *name2, size_t len2)
{
	size_t min_len = (len1 < len2) ? len1 : len2;
	int cmp = memcmp(name1, name2, min_len);
	if (cmp)
		return cmp;
	if (len1 < len2)
		return -1;
	if (len1 > len2)
		return 1;
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Order two (name, stage) pairs: first by name via name_compare(),
 * then, for equal names, by ascending stage number.
 *
 * Returns the non-zero name_compare() result when the names differ,
 * otherwise -1, 0 or 1 according to the stages.
 */
int cache_name_stage_compare(const char *name1, int len1, int stage1, const char *name2, int len2, int stage2)
{
	int cmp;

	cmp = name_compare(name1, len1, name2, len2);
	if (cmp)
		return cmp;

	/* Same name: the lower stage sorts first. */
	if (stage1 < stage2)
		return -1;
	if (stage1 > stage2)
		return 1;
	return 0;
}
|
|
|
|
|
2012-09-16 07:44:31 +02:00
|
|
|
static int index_name_stage_pos(const struct index_state *istate, const char *name, int namelen, int stage)
|
2005-04-09 18:26:55 +02:00
|
|
|
{
|
|
|
|
int first, last;
|
|
|
|
|
|
|
|
first = 0;
|
2007-04-02 08:26:07 +02:00
|
|
|
last = istate->cache_nr;
|
2005-04-09 18:26:55 +02:00
|
|
|
while (last > first) {
|
2019-06-13 19:51:56 +02:00
|
|
|
int next = first + ((last - first) >> 1);
|
2007-04-02 08:26:07 +02:00
|
|
|
struct cache_entry *ce = istate->cache[next];
|
2012-07-11 11:22:37 +02:00
|
|
|
int cmp = cache_name_stage_compare(name, namelen, stage, ce->name, ce_namelen(ce), ce_stage(ce));
|
2005-04-09 18:26:55 +02:00
|
|
|
if (!cmp)
|
2005-04-11 07:06:50 +02:00
|
|
|
return next;
|
2005-04-09 18:26:55 +02:00
|
|
|
if (cmp < 0) {
|
|
|
|
last = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
first = next+1;
|
|
|
|
}
|
2005-04-11 07:06:50 +02:00
|
|
|
return -first-1;
|
2005-04-09 18:26:55 +02:00
|
|
|
}
|
|
|
|
|
2012-07-11 11:22:37 +02:00
|
|
|
/*
 * Look up a name at stage 0 in the index.
 *
 * Thin wrapper around index_name_stage_pos() with stage fixed to 0;
 * the return value is whatever that function returns (a non-negative
 * index on a match, or -pos-1 when there is none).
 */
int index_name_pos(const struct index_state *istate, const char *name, int namelen)
{
	return index_name_stage_pos(istate, name, namelen, 0);
}
|
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
int remove_index_entry_at(struct index_state *istate, int pos)
|
2005-04-16 21:05:45 +02:00
|
|
|
{
|
Create pathname-based hash-table lookup into index
This creates a hash index of every single file added to the index.
Right now that hash index isn't actually used for much: I implemented a
"cache_name_exists()" function that uses it to efficiently look up a
filename in the index without having to do the O(logn) binary search,
but quite frankly, that's not why this patch is interesting.
No, the whole and only reason to create the hash of the filenames in the
index is that by modifying the hash function, you can fairly easily do
things like making it always hash equivalent names into the same bucket.
That, in turn, means that suddenly questions like "does this name exist
in the index under an _equivalent_ name?" becomes much much cheaper.
Guiding principles behind this patch:
- it shouldn't be too costly. In fact, my primary goal here was to
actually speed up "git commit" with a fully populated kernel tree, by
being faster at checking whether a file already existed in the index. I
did succeed, but only barely:
Best before:
[torvalds@woody linux]$ time git commit > /dev/null
real 0m0.255s
user 0m0.168s
sys 0m0.088s
Best after:
[torvalds@woody linux]$ time ~/git/git commit > /dev/null
real 0m0.233s
user 0m0.144s
sys 0m0.088s
so some things are actually faster (~8%).
Caveat: that's really the best case. Other things are invariably going
to be slightly slower, since we populate that index cache, and quite
frankly, few things really use it to look things up.
That said, the cost is really quite small. The worst case is probably
doing a "git ls-files", which will do very little except puopulate the
index, and never actually looks anything up in it, just lists it.
Before:
[torvalds@woody linux]$ time git ls-files > /dev/null
real 0m0.016s
user 0m0.016s
sys 0m0.000s
After:
[torvalds@woody linux]$ time ~/git/git ls-files > /dev/null
real 0m0.021s
user 0m0.012s
sys 0m0.008s
and while the thing has really gotten relatively much slower, we're
still talking about something almost unmeasurable (eg 5ms). And that
really should be pretty much the worst case.
So we lose 5ms on one "benchmark", but win 22ms on another. Pick your
poison - this patch has the advantage that it will _likely_ speed up
the cases that are complex and expensive more than it slows down the
cases that are already so fast that nobody cares. But if you look at
relative speedups/slowdowns, it doesn't look so good.
- It should be simple and clean
The code may be a bit subtle (the reasons I do hash removal the way I
do etc), but it re-uses the existing hash.c files, so it really is
fairly small and straightforward apart from a few odd details.
Now, this patch on its own doesn't really do much, but I think it's worth
looking at, if only because if done correctly, the name hashing really can
make an improvement to the whole issue of "do we have a filename that
looks like this in the index already". And at least it gets real testing
by being used even by default (ie there is a real use-case for it even
without any insane filesystems).
NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just
not ashamed of it enough to really care. I took all the numbers out of my
nether regions - I'm sure it's good enough that it works in practice, but
the whole point was that you can make a really much fancier hash that
hashes characters not directly, but by their upper-case value or something
like that, and thus you get a case-insensitive hash, while still keeping
the name and the index itself totally case sensitive.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-23 03:41:14 +01:00
|
|
|
struct cache_entry *ce = istate->cache[pos];
|
|
|
|
|
2009-12-25 09:30:51 +01:00
|
|
|
record_resolve_undo(istate, ce);
|
2013-02-28 00:57:48 +01:00
|
|
|
remove_name_hash(istate, ce);
|
2014-06-13 14:19:38 +02:00
|
|
|
save_or_free_index_entry(istate, ce);
|
2014-06-13 14:19:27 +02:00
|
|
|
istate->cache_changed |= CE_ENTRY_REMOVED;
|
2007-04-02 08:26:07 +02:00
|
|
|
istate->cache_nr--;
|
|
|
|
if (pos >= istate->cache_nr)
|
2005-04-16 21:05:45 +02:00
|
|
|
return 0;
|
2017-07-15 22:00:45 +02:00
|
|
|
MOVE_ARRAY(istate->cache + pos, istate->cache + pos + 1,
|
|
|
|
istate->cache_nr - pos);
|
2005-04-16 21:05:45 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
check_updates(): effective removal of cache entries marked CE_REMOVE
Below is oprofile output from GIT command 'git checkout -q my-v2.6.25'
(move from tag v2.6.27 to tag v2.6.25 of the Linux kernel):
CPU: Core 2, speed 1999.95 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit
mask of 0x00 (Unhalted core cycles) count 20000
Counted INST_RETIRED_ANY_P events (number of instructions retired) with a
unit mask of 0x00 (No unit mask) count 20000
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
409247 100.000 342878 100.000 git
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
260476 63.6476 257843 75.1996 libz.so.1.2.3
100876 24.6492 64378 18.7758 kernel-2.6.28.4_2.vmlinux
30850 7.5382 7874 2.2964 libc-2.9.so
14775 3.6103 8390 2.4469 git
2020 0.4936 4325 1.2614 libcrypto.so.0.9.8
191 0.0467 32 0.0093 libpthread-2.9.so
58 0.0142 36 0.0105 ld-2.9.so
1 2.4e-04 0 0 libldap-2.3.so.0.2.31
Detail list of the top 20 function entries (libz counted in one blob):
CPU_CLK_UNHALTED INST_RETIRED_ANY_P
samples % samples % image name symbol name
260476 63.6862 257843 75.2725 libz.so.1.2.3 /lib/libz.so.1.2.3
16587 4.0555 3636 1.0615 libc-2.9.so memcpy
7710 1.8851 277 0.0809 libc-2.9.so memmove
3679 0.8995 1108 0.3235 kernel-2.6.28.4_2.vmlinux d_validate
3546 0.8670 2607 0.7611 kernel-2.6.28.4_2.vmlinux __getblk
3174 0.7760 1813 0.5293 libc-2.9.so _int_malloc
2396 0.5858 3681 1.0746 kernel-2.6.28.4_2.vmlinux copy_to_user
2270 0.5550 2528 0.7380 kernel-2.6.28.4_2.vmlinux __link_path_walk
2205 0.5391 1797 0.5246 kernel-2.6.28.4_2.vmlinux ext4_mark_iloc_dirty
2103 0.5142 1203 0.3512 kernel-2.6.28.4_2.vmlinux find_first_zero_bit
2077 0.5078 997 0.2911 kernel-2.6.28.4_2.vmlinux do_get_write_access
2070 0.5061 514 0.1501 git cache_name_compare
2043 0.4995 1501 0.4382 kernel-2.6.28.4_2.vmlinux rcu_irq_exit
2022 0.4944 1732 0.5056 kernel-2.6.28.4_2.vmlinux __ext4_get_inode_loc
2020 0.4939 4325 1.2626 libcrypto.so.0.9.8 /usr/lib/libcrypto.so.0.9.8
1965 0.4804 1384 0.4040 git patch_delta
1708 0.4176 984 0.2873 kernel-2.6.28.4_2.vmlinux rcu_sched_grace_period
1682 0.4112 727 0.2122 kernel-2.6.28.4_2.vmlinux sysfs_slab_alias
1659 0.4056 290 0.0847 git find_pack_entry_one
1480 0.3619 1307 0.3816 kernel-2.6.28.4_2.vmlinux ext4_writepage_trans_blocks
Notice the memmove line, where the CPU did 7710 / 277 = 27.8 cycles
per instruction, and compared to the total cycles spent inside the
source code of GIT for this command, all the memmove() calls
translates to (7710 * 100) / 14775 = 52.2% of this.
Retesting with a GIT program compiled for gcov usage, I found out that
the memmove() calls came from remove_index_entry_at() in read-cache.c,
where we have:
memmove(istate->cache + pos,
istate->cache + pos + 1,
(istate->cache_nr - pos) * sizeof(struct cache_entry *));
remove_index_entry_at() is called 4902 times from check_updates() in
unpack-trees.c, and each time called we move each cache_entry pointers
(from the removed one) one step to the left.
Since we have 28828 entries in the cache this time, and if we on
average move half of them each time, we in total move approximately
4902 * 0.5 * 28828 * 4 = 282 629 712 bytes, or twice this amount if
each pointer is 8 bytes (64 bit).
OK, it seems that the function check_updates() is called 28 times, so
the estimated guess above had been more correct if check_updates() had
been called only once, but the point is: we get lots of bytes moved.
To fix this, and use an O(N) algorithm instead, where N is the number
of cache_entries, we delete/remove all entries in one loop through all
entries.
From a retest, the new remove_marked_cache_entries() from the patch
below, ended up with the following output line from oprofile:
46 0.0105 15 0.0041 git remove_marked_cache_entries
If we can trust the numbers from oprofile in this case, we saved
approximately ((7710 - 46) * 20000) / (2 * 1000 * 1000 * 1000) = 0.077
seconds CPU time with this fix for this particular test. And notice
that now the CPU did only 46 / 15 = 3.1 cycles/instruction.
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-18 23:18:03 +01:00
|
|
|
/*
|
2013-07-29 10:18:21 +02:00
|
|
|
* Remove all cache entries marked for removal, that is where
|
check_updates(): effective removal of cache entries marked CE_REMOVE
Below is oprofile output from GIT command 'git chekcout -q my-v2.6.25'
(move from tag v2.6.27 to tag v2.6.25 of the Linux kernel):
CPU: Core 2, speed 1999.95 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit
mask of 0x00 (Unhalted core cycles) count 20000
Counted INST_RETIRED_ANY_P events (number of instructions retired) with a
unit mask of 0x00 (No unit mask) count 20000
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
409247 100.000 342878 100.000 git
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
260476 63.6476 257843 75.1996 libz.so.1.2.3
100876 24.6492 64378 18.7758 kernel-2.6.28.4_2.vmlinux
30850 7.5382 7874 2.2964 libc-2.9.so
14775 3.6103 8390 2.4469 git
2020 0.4936 4325 1.2614 libcrypto.so.0.9.8
191 0.0467 32 0.0093 libpthread-2.9.so
58 0.0142 36 0.0105 ld-2.9.so
1 2.4e-04 0 0 libldap-2.3.so.0.2.31
Detail list of the top 20 function entries (libz counted in one blob):
CPU_CLK_UNHALTED INST_RETIRED_ANY_P
samples % samples % image name symbol name
260476 63.6862 257843 75.2725 libz.so.1.2.3 /lib/libz.so.1.2.3
16587 4.0555 3636 1.0615 libc-2.9.so memcpy
7710 1.8851 277 0.0809 libc-2.9.so memmove
3679 0.8995 1108 0.3235 kernel-2.6.28.4_2.vmlinux d_validate
3546 0.8670 2607 0.7611 kernel-2.6.28.4_2.vmlinux __getblk
3174 0.7760 1813 0.5293 libc-2.9.so _int_malloc
2396 0.5858 3681 1.0746 kernel-2.6.28.4_2.vmlinux copy_to_user
2270 0.5550 2528 0.7380 kernel-2.6.28.4_2.vmlinux __link_path_walk
2205 0.5391 1797 0.5246 kernel-2.6.28.4_2.vmlinux ext4_mark_iloc_dirty
2103 0.5142 1203 0.3512 kernel-2.6.28.4_2.vmlinux find_first_zero_bit
2077 0.5078 997 0.2911 kernel-2.6.28.4_2.vmlinux do_get_write_access
2070 0.5061 514 0.1501 git cache_name_compare
2043 0.4995 1501 0.4382 kernel-2.6.28.4_2.vmlinux rcu_irq_exit
2022 0.4944 1732 0.5056 kernel-2.6.28.4_2.vmlinux __ext4_get_inode_loc
2020 0.4939 4325 1.2626 libcrypto.so.0.9.8 /usr/lib/libcrypto.so.0.9.8
1965 0.4804 1384 0.4040 git patch_delta
1708 0.4176 984 0.2873 kernel-2.6.28.4_2.vmlinux rcu_sched_grace_period
1682 0.4112 727 0.2122 kernel-2.6.28.4_2.vmlinux sysfs_slab_alias
1659 0.4056 290 0.0847 git find_pack_entry_one
1480 0.3619 1307 0.3816 kernel-2.6.28.4_2.vmlinux ext4_writepage_trans_blocks
Notice the memmove line, where the CPU did 7710 / 277 = 27.8 cycles
per instruction, and compared to the total cycles spent inside the
source code of GIT for this command, all the memmove() calls
translates to (7710 * 100) / 14775 = 52.2% of this.
Retesting with a GIT program compiled for gcov usage, I found out that
the memmove() calls came from remove_index_entry_at() in read-cache.c,
where we have:
memmove(istate->cache + pos,
istate->cache + pos + 1,
(istate->cache_nr - pos) * sizeof(struct cache_entry *));
remove_index_entry_at() is called 4902 times from check_updates() in
unpack-trees.c, and each time called we move each cache_entry pointers
(from the removed one) one step to the left.
Since we have 28828 entries in the cache this time, and if we on
average move half of them each time, we in total move approximately
4902 * 0.5 * 28828 * 4 = 282 629 712 bytes, or twice this amount if
each pointer is 8 bytes (64 bit).
OK, is seems that the function check_updates() is called 28 times, so
the estimated guess above had been more correct if check_updates() had
been called only once, but the point is: we get lots of bytes moved.
To fix this, and use an O(N) algorithm instead, where N is the number
of cache_entries, we delete/remove all entries in one loop through all
entries.
From a retest, the new remove_marked_cache_entries() from the patch
below, ended up with the following output line from oprofile:
46 0.0105 15 0.0041 git remove_marked_cache_entries
If we can trust the numbers from oprofile in this case, we saved
approximately ((7710 - 46) * 20000) / (2 * 1000 * 1000 * 1000) = 0.077
seconds CPU time with this fix for this particular test. And notice
that now the CPU did only 46 / 15 = 3.1 cycles/instruction.
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-18 23:18:03 +01:00
|
|
|
* CE_REMOVE is set in ce_flags. This is much more effective than
|
|
|
|
* calling remove_index_entry_at() for each entry to be removed.
|
|
|
|
*/
|
2018-12-20 14:48:16 +01:00
|
|
|
void remove_marked_cache_entries(struct index_state *istate, int invalidate)
|
check_updates(): effective removal of cache entries marked CE_REMOVE
Below is oprofile output from GIT command 'git chekcout -q my-v2.6.25'
(move from tag v2.6.27 to tag v2.6.25 of the Linux kernel):
CPU: Core 2, speed 1999.95 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit
mask of 0x00 (Unhalted core cycles) count 20000
Counted INST_RETIRED_ANY_P events (number of instructions retired) with a
unit mask of 0x00 (No unit mask) count 20000
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
409247 100.000 342878 100.000 git
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
260476 63.6476 257843 75.1996 libz.so.1.2.3
100876 24.6492 64378 18.7758 kernel-2.6.28.4_2.vmlinux
30850 7.5382 7874 2.2964 libc-2.9.so
14775 3.6103 8390 2.4469 git
2020 0.4936 4325 1.2614 libcrypto.so.0.9.8
191 0.0467 32 0.0093 libpthread-2.9.so
58 0.0142 36 0.0105 ld-2.9.so
1 2.4e-04 0 0 libldap-2.3.so.0.2.31
Detail list of the top 20 function entries (libz counted in one blob):
CPU_CLK_UNHALTED INST_RETIRED_ANY_P
samples % samples % image name symbol name
260476 63.6862 257843 75.2725 libz.so.1.2.3 /lib/libz.so.1.2.3
16587 4.0555 3636 1.0615 libc-2.9.so memcpy
7710 1.8851 277 0.0809 libc-2.9.so memmove
3679 0.8995 1108 0.3235 kernel-2.6.28.4_2.vmlinux d_validate
3546 0.8670 2607 0.7611 kernel-2.6.28.4_2.vmlinux __getblk
3174 0.7760 1813 0.5293 libc-2.9.so _int_malloc
2396 0.5858 3681 1.0746 kernel-2.6.28.4_2.vmlinux copy_to_user
2270 0.5550 2528 0.7380 kernel-2.6.28.4_2.vmlinux __link_path_walk
2205 0.5391 1797 0.5246 kernel-2.6.28.4_2.vmlinux ext4_mark_iloc_dirty
2103 0.5142 1203 0.3512 kernel-2.6.28.4_2.vmlinux find_first_zero_bit
2077 0.5078 997 0.2911 kernel-2.6.28.4_2.vmlinux do_get_write_access
2070 0.5061 514 0.1501 git cache_name_compare
2043 0.4995 1501 0.4382 kernel-2.6.28.4_2.vmlinux rcu_irq_exit
2022 0.4944 1732 0.5056 kernel-2.6.28.4_2.vmlinux __ext4_get_inode_loc
2020 0.4939 4325 1.2626 libcrypto.so.0.9.8 /usr/lib/libcrypto.so.0.9.8
1965 0.4804 1384 0.4040 git patch_delta
1708 0.4176 984 0.2873 kernel-2.6.28.4_2.vmlinux rcu_sched_grace_period
1682 0.4112 727 0.2122 kernel-2.6.28.4_2.vmlinux sysfs_slab_alias
1659 0.4056 290 0.0847 git find_pack_entry_one
1480 0.3619 1307 0.3816 kernel-2.6.28.4_2.vmlinux ext4_writepage_trans_blocks
Notice the memmove line, where the CPU did 7710 / 277 = 27.8 cycles
per instruction, and compared to the total cycles spent inside the
source code of GIT for this command, all the memmove() calls
translates to (7710 * 100) / 14775 = 52.2% of this.
Retesting with a GIT program compiled for gcov usage, I found out that
the memmove() calls came from remove_index_entry_at() in read-cache.c,
where we have:
memmove(istate->cache + pos,
istate->cache + pos + 1,
(istate->cache_nr - pos) * sizeof(struct cache_entry *));
remove_index_entry_at() is called 4902 times from check_updates() in
unpack-trees.c, and each time called we move each cache_entry pointers
(from the removed one) one step to the left.
Since we have 28828 entries in the cache this time, and if we on
average move half of them each time, we in total move approximately
4902 * 0.5 * 28828 * 4 = 282 629 712 bytes, or twice this amount if
each pointer is 8 bytes (64 bit).
OK, is seems that the function check_updates() is called 28 times, so
the estimated guess above had been more correct if check_updates() had
been called only once, but the point is: we get lots of bytes moved.
To fix this, and use an O(N) algorithm instead, where N is the number
of cache_entries, we delete/remove all entries in one loop through all
entries.
From a retest, the new remove_marked_cache_entries() from the patch
below, ended up with the following output line from oprofile:
46 0.0105 15 0.0041 git remove_marked_cache_entries
If we can trust the numbers from oprofile in this case, we saved
approximately ((7710 - 46) * 20000) / (2 * 1000 * 1000 * 1000) = 0.077
seconds CPU time with this fix for this particular test. And notice
that now the CPU did only 46 / 15 = 3.1 cycles/instruction.
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-18 23:18:03 +01:00
|
|
|
{
|
|
|
|
struct cache_entry **ce_array = istate->cache;
|
|
|
|
unsigned int i, j;
|
|
|
|
|
|
|
|
for (i = j = 0; i < istate->cache_nr; i++) {
|
2013-11-14 20:24:37 +01:00
|
|
|
if (ce_array[i]->ce_flags & CE_REMOVE) {
|
2018-12-20 14:48:16 +01:00
|
|
|
if (invalidate) {
|
|
|
|
cache_tree_invalidate_path(istate,
|
|
|
|
ce_array[i]->name);
|
|
|
|
untracked_cache_remove_from_index(istate,
|
|
|
|
ce_array[i]->name);
|
|
|
|
}
|
2013-02-28 00:57:48 +01:00
|
|
|
remove_name_hash(istate, ce_array[i]);
|
2014-06-13 14:19:38 +02:00
|
|
|
save_or_free_index_entry(istate, ce_array[i]);
|
2013-11-14 20:24:37 +01:00
|
|
|
}
|
check_updates(): effective removal of cache entries marked CE_REMOVE
Below is oprofile output from GIT command 'git chekcout -q my-v2.6.25'
(move from tag v2.6.27 to tag v2.6.25 of the Linux kernel):
CPU: Core 2, speed 1999.95 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit
mask of 0x00 (Unhalted core cycles) count 20000
Counted INST_RETIRED_ANY_P events (number of instructions retired) with a
unit mask of 0x00 (No unit mask) count 20000
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
409247 100.000 342878 100.000 git
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
260476 63.6476 257843 75.1996 libz.so.1.2.3
100876 24.6492 64378 18.7758 kernel-2.6.28.4_2.vmlinux
30850 7.5382 7874 2.2964 libc-2.9.so
14775 3.6103 8390 2.4469 git
2020 0.4936 4325 1.2614 libcrypto.so.0.9.8
191 0.0467 32 0.0093 libpthread-2.9.so
58 0.0142 36 0.0105 ld-2.9.so
1 2.4e-04 0 0 libldap-2.3.so.0.2.31
Detail list of the top 20 function entries (libz counted in one blob):
CPU_CLK_UNHALTED INST_RETIRED_ANY_P
samples % samples % image name symbol name
260476 63.6862 257843 75.2725 libz.so.1.2.3 /lib/libz.so.1.2.3
16587 4.0555 3636 1.0615 libc-2.9.so memcpy
7710 1.8851 277 0.0809 libc-2.9.so memmove
3679 0.8995 1108 0.3235 kernel-2.6.28.4_2.vmlinux d_validate
3546 0.8670 2607 0.7611 kernel-2.6.28.4_2.vmlinux __getblk
3174 0.7760 1813 0.5293 libc-2.9.so _int_malloc
2396 0.5858 3681 1.0746 kernel-2.6.28.4_2.vmlinux copy_to_user
2270 0.5550 2528 0.7380 kernel-2.6.28.4_2.vmlinux __link_path_walk
2205 0.5391 1797 0.5246 kernel-2.6.28.4_2.vmlinux ext4_mark_iloc_dirty
2103 0.5142 1203 0.3512 kernel-2.6.28.4_2.vmlinux find_first_zero_bit
2077 0.5078 997 0.2911 kernel-2.6.28.4_2.vmlinux do_get_write_access
2070 0.5061 514 0.1501 git cache_name_compare
2043 0.4995 1501 0.4382 kernel-2.6.28.4_2.vmlinux rcu_irq_exit
2022 0.4944 1732 0.5056 kernel-2.6.28.4_2.vmlinux __ext4_get_inode_loc
2020 0.4939 4325 1.2626 libcrypto.so.0.9.8 /usr/lib/libcrypto.so.0.9.8
1965 0.4804 1384 0.4040 git patch_delta
1708 0.4176 984 0.2873 kernel-2.6.28.4_2.vmlinux rcu_sched_grace_period
1682 0.4112 727 0.2122 kernel-2.6.28.4_2.vmlinux sysfs_slab_alias
1659 0.4056 290 0.0847 git find_pack_entry_one
1480 0.3619 1307 0.3816 kernel-2.6.28.4_2.vmlinux ext4_writepage_trans_blocks
Notice the memmove line, where the CPU did 7710 / 277 = 27.8 cycles
per instruction, and compared to the total cycles spent inside the
source code of GIT for this command, all the memmove() calls
translates to (7710 * 100) / 14775 = 52.2% of this.
Retesting with a GIT program compiled for gcov usage, I found out that
the memmove() calls came from remove_index_entry_at() in read-cache.c,
where we have:
memmove(istate->cache + pos,
istate->cache + pos + 1,
(istate->cache_nr - pos) * sizeof(struct cache_entry *));
remove_index_entry_at() is called 4902 times from check_updates() in
unpack-trees.c, and each time called we move each cache_entry pointers
(from the removed one) one step to the left.
Since we have 28828 entries in the cache this time, and if we on
average move half of them each time, we in total move approximately
4902 * 0.5 * 28828 * 4 = 282 629 712 bytes, or twice this amount if
each pointer is 8 bytes (64 bit).
OK, is seems that the function check_updates() is called 28 times, so
the estimated guess above had been more correct if check_updates() had
been called only once, but the point is: we get lots of bytes moved.
To fix this, and use an O(N) algorithm instead, where N is the number
of cache_entries, we delete/remove all entries in one loop through all
entries.
From a retest, the new remove_marked_cache_entries() from the patch
below, ended up with the following output line from oprofile:
46 0.0105 15 0.0041 git remove_marked_cache_entries
If we can trust the numbers from oprofile in this case, we saved
approximately ((7710 - 46) * 20000) / (2 * 1000 * 1000 * 1000) = 0.077
seconds CPU time with this fix for this particular test. And notice
that now the CPU did only 46 / 15 = 3.1 cycles/instruction.
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-18 23:18:03 +01:00
|
|
|
else
|
|
|
|
ce_array[j++] = ce_array[i];
|
|
|
|
}
|
2014-06-13 14:19:26 +02:00
|
|
|
if (j == istate->cache_nr)
|
|
|
|
return;
|
2014-06-13 14:19:27 +02:00
|
|
|
istate->cache_changed |= CE_ENTRY_REMOVED;
|
check_updates(): effective removal of cache entries marked CE_REMOVE
Below is oprofile output from GIT command 'git chekcout -q my-v2.6.25'
(move from tag v2.6.27 to tag v2.6.25 of the Linux kernel):
CPU: Core 2, speed 1999.95 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit
mask of 0x00 (Unhalted core cycles) count 20000
Counted INST_RETIRED_ANY_P events (number of instructions retired) with a
unit mask of 0x00 (No unit mask) count 20000
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
409247 100.000 342878 100.000 git
CPU_CLK_UNHALT...|INST_RETIRED:2...|
samples| %| samples| %|
------------------------------------
260476 63.6476 257843 75.1996 libz.so.1.2.3
100876 24.6492 64378 18.7758 kernel-2.6.28.4_2.vmlinux
30850 7.5382 7874 2.2964 libc-2.9.so
14775 3.6103 8390 2.4469 git
2020 0.4936 4325 1.2614 libcrypto.so.0.9.8
191 0.0467 32 0.0093 libpthread-2.9.so
58 0.0142 36 0.0105 ld-2.9.so
1 2.4e-04 0 0 libldap-2.3.so.0.2.31
Detail list of the top 20 function entries (libz counted in one blob):
CPU_CLK_UNHALTED INST_RETIRED_ANY_P
samples % samples % image name symbol name
260476 63.6862 257843 75.2725 libz.so.1.2.3 /lib/libz.so.1.2.3
16587 4.0555 3636 1.0615 libc-2.9.so memcpy
7710 1.8851 277 0.0809 libc-2.9.so memmove
3679 0.8995 1108 0.3235 kernel-2.6.28.4_2.vmlinux d_validate
3546 0.8670 2607 0.7611 kernel-2.6.28.4_2.vmlinux __getblk
3174 0.7760 1813 0.5293 libc-2.9.so _int_malloc
2396 0.5858 3681 1.0746 kernel-2.6.28.4_2.vmlinux copy_to_user
2270 0.5550 2528 0.7380 kernel-2.6.28.4_2.vmlinux __link_path_walk
2205 0.5391 1797 0.5246 kernel-2.6.28.4_2.vmlinux ext4_mark_iloc_dirty
2103 0.5142 1203 0.3512 kernel-2.6.28.4_2.vmlinux find_first_zero_bit
2077 0.5078 997 0.2911 kernel-2.6.28.4_2.vmlinux do_get_write_access
2070 0.5061 514 0.1501 git cache_name_compare
2043 0.4995 1501 0.4382 kernel-2.6.28.4_2.vmlinux rcu_irq_exit
2022 0.4944 1732 0.5056 kernel-2.6.28.4_2.vmlinux __ext4_get_inode_loc
2020 0.4939 4325 1.2626 libcrypto.so.0.9.8 /usr/lib/libcrypto.so.0.9.8
1965 0.4804 1384 0.4040 git patch_delta
1708 0.4176 984 0.2873 kernel-2.6.28.4_2.vmlinux rcu_sched_grace_period
1682 0.4112 727 0.2122 kernel-2.6.28.4_2.vmlinux sysfs_slab_alias
1659 0.4056 290 0.0847 git find_pack_entry_one
1480 0.3619 1307 0.3816 kernel-2.6.28.4_2.vmlinux ext4_writepage_trans_blocks
Notice the memmove line, where the CPU did 7710 / 277 = 27.8 cycles
per instruction, and compared to the total cycles spent inside the
source code of GIT for this command, all the memmove() calls
translates to (7710 * 100) / 14775 = 52.2% of this.
Retesting with a GIT program compiled for gcov usage, I found out that
the memmove() calls came from remove_index_entry_at() in read-cache.c,
where we have:
memmove(istate->cache + pos,
istate->cache + pos + 1,
(istate->cache_nr - pos) * sizeof(struct cache_entry *));
remove_index_entry_at() is called 4902 times from check_updates() in
unpack-trees.c, and each time called we move each cache_entry pointers
(from the removed one) one step to the left.
Since we have 28828 entries in the cache this time, and if we on
average move half of them each time, we in total move approximately
4902 * 0.5 * 28828 * 4 = 282 629 712 bytes, or twice this amount if
each pointer is 8 bytes (64 bit).
OK, is seems that the function check_updates() is called 28 times, so
the estimated guess above had been more correct if check_updates() had
been called only once, but the point is: we get lots of bytes moved.
To fix this, and use an O(N) algorithm instead, where N is the number
of cache_entries, we delete/remove all entries in one loop through all
entries.
From a retest, the new remove_marked_cache_entries() from the patch
below, ended up with the following output line from oprofile:
46 0.0105 15 0.0041 git remove_marked_cache_entries
If we can trust the numbers from oprofile in this case, we saved
approximately ((7710 - 46) * 20000) / (2 * 1000 * 1000 * 1000) = 0.077
seconds CPU time with this fix for this particular test. And notice
that now the CPU did only 46 / 15 = 3.1 cycles/instruction.
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-18 23:18:03 +01:00
|
|
|
istate->cache_nr = j;
|
|
|
|
}
|
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
int remove_file_from_index(struct index_state *istate, const char *path)
|
2005-04-09 21:09:27 +02:00
|
|
|
{
|
2007-04-02 08:26:07 +02:00
|
|
|
int pos = index_name_pos(istate, path, strlen(path));
|
2005-04-17 18:53:35 +02:00
|
|
|
if (pos < 0)
|
|
|
|
pos = -pos-1;
|
2014-06-13 14:19:31 +02:00
|
|
|
cache_tree_invalidate_path(istate, path);
|
2015-03-08 11:12:35 +01:00
|
|
|
untracked_cache_remove_from_index(istate, path);
|
2007-04-02 08:26:07 +02:00
|
|
|
while (pos < istate->cache_nr && !strcmp(istate->cache[pos]->name, path))
|
|
|
|
remove_index_entry_at(istate, pos);
|
2005-04-09 21:09:27 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-06-29 19:32:46 +02:00
|
|
|
static int compare_name(struct cache_entry *ce, const char *path, int namelen)
|
|
|
|
{
|
|
|
|
return namelen != ce_namelen(ce) || memcmp(path, ce->name, namelen);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int index_name_pos_also_unmerged(struct index_state *istate,
|
|
|
|
const char *path, int namelen)
|
|
|
|
{
|
|
|
|
int pos = index_name_pos(istate, path, namelen);
|
|
|
|
struct cache_entry *ce;
|
|
|
|
|
|
|
|
if (pos >= 0)
|
|
|
|
return pos;
|
|
|
|
|
|
|
|
/* maybe unmerged? */
|
|
|
|
pos = -1 - pos;
|
|
|
|
if (pos >= istate->cache_nr ||
|
|
|
|
compare_name((ce = istate->cache[pos]), path, namelen))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/* order of preference: stage 2, 1, 3 */
|
|
|
|
if (ce_stage(ce) == 1 && pos + 1 < istate->cache_nr &&
|
|
|
|
ce_stage((ce = istate->cache[pos + 1])) == 2 &&
|
|
|
|
!compare_name(ce, path, namelen))
|
|
|
|
pos++;
|
|
|
|
return pos;
|
|
|
|
}
|
|
|
|
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
static int different_name(struct cache_entry *ce, struct cache_entry *alias)
|
|
|
|
{
|
|
|
|
int len = ce_namelen(ce);
|
|
|
|
return ce_namelen(alias) != len || memcmp(ce->name, alias->name, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we add a filename that aliases in the cache, we will use the
|
|
|
|
* name that we already have - but we don't want to update the same
|
|
|
|
* alias twice, because that implies that there were actually two
|
|
|
|
* different files with aliasing names!
|
|
|
|
*
|
|
|
|
* So we use the CE_ADDED flag to verify that the alias was an old
|
|
|
|
* one before we accept it as
|
|
|
|
*/
|
2014-06-13 14:19:38 +02:00
|
|
|
static struct cache_entry *create_alias_ce(struct index_state *istate,
|
|
|
|
struct cache_entry *ce,
|
|
|
|
struct cache_entry *alias)
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
{
|
|
|
|
int len;
|
2018-02-14 19:59:45 +01:00
|
|
|
struct cache_entry *new_entry;
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
|
|
|
|
if (alias->ce_flags & CE_ADDED)
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("will not add file alias '%s' ('%s' already exists in index)"),
|
|
|
|
ce->name, alias->name);
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
|
|
|
|
/* Ok, create the new entry using the name of the existing alias */
|
|
|
|
len = ce_namelen(alias);
|
2018-07-02 21:49:31 +02:00
|
|
|
new_entry = make_empty_cache_entry(istate, len);
|
2018-02-14 19:59:45 +01:00
|
|
|
memcpy(new_entry->name, alias->name, len);
|
|
|
|
copy_cache_entry(new_entry, ce);
|
2014-06-13 14:19:38 +02:00
|
|
|
save_or_free_index_entry(istate, ce);
|
2018-02-14 19:59:45 +01:00
|
|
|
return new_entry;
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
}
|
|
|
|
|
2014-02-04 03:20:09 +01:00
|
|
|
void set_object_name_for_intent_to_add_entry(struct cache_entry *ce)
|
2008-08-21 10:44:53 +02:00
|
|
|
{
|
2018-01-28 01:13:19 +01:00
|
|
|
struct object_id oid;
|
|
|
|
if (write_object_file("", 0, blob_type, &oid))
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("cannot create an empty blob in the object database"));
|
2018-01-28 01:13:19 +01:00
|
|
|
oidcpy(&ce->oid, &oid);
|
2008-08-21 10:44:53 +02:00
|
|
|
}
|
|
|
|
|
2016-09-14 23:07:47 +02:00
|
|
|
int add_to_index(struct index_state *istate, const char *path, struct stat *st, int flags)
|
2006-07-26 03:52:35 +02:00
|
|
|
{
|
2018-07-02 21:49:31 +02:00
|
|
|
int namelen, was_same;
|
2008-05-09 18:11:43 +02:00
|
|
|
mode_t st_mode = st->st_mode;
|
2017-11-16 17:38:28 +01:00
|
|
|
struct cache_entry *ce, *alias = NULL;
|
2009-12-14 12:43:58 +01:00
|
|
|
unsigned ce_option = CE_MATCH_IGNORE_VALID|CE_MATCH_IGNORE_SKIP_WORKTREE|CE_MATCH_RACY_IS_DIRTY;
|
2008-05-21 21:04:34 +02:00
|
|
|
int verbose = flags & (ADD_CACHE_VERBOSE | ADD_CACHE_PRETEND);
|
|
|
|
int pretend = flags & ADD_CACHE_PRETEND;
|
2008-08-21 10:44:53 +02:00
|
|
|
int intent_only = flags & ADD_CACHE_INTENT;
|
|
|
|
int add_option = (ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE|
|
|
|
|
(intent_only ? ADD_CACHE_NEW_ONLY : 0));
|
2019-01-17 17:27:11 +01:00
|
|
|
int hash_flags = HASH_WRITE_OBJECT;
|
2019-04-10 01:07:37 +02:00
|
|
|
struct object_id oid;
|
2017-11-16 17:38:28 +01:00
|
|
|
|
2019-01-17 17:27:11 +01:00
|
|
|
if (flags & ADD_CACHE_RENORMALIZE)
|
|
|
|
hash_flags |= HASH_RENORMALIZE;
|
2006-07-26 03:52:35 +02:00
|
|
|
|
2008-05-09 18:11:43 +02:00
|
|
|
if (!S_ISREG(st_mode) && !S_ISLNK(st_mode) && !S_ISDIR(st_mode))
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("%s: can only add regular files, symbolic links or git-directories"), path);
|
2006-07-26 03:52:35 +02:00
|
|
|
|
|
|
|
namelen = strlen(path);
|
2008-05-09 18:11:43 +02:00
|
|
|
if (S_ISDIR(st_mode)) {
|
2019-04-10 01:07:37 +02:00
|
|
|
if (resolve_gitlink_ref(path, "HEAD", &oid) < 0)
|
|
|
|
return error(_("'%s' does not have a commit checked out"), path);
|
2007-04-11 23:49:44 +02:00
|
|
|
while (namelen && path[namelen-1] == '/')
|
|
|
|
namelen--;
|
|
|
|
}
|
2018-07-02 21:49:31 +02:00
|
|
|
ce = make_empty_cache_entry(istate, namelen);
|
2006-07-26 03:52:35 +02:00
|
|
|
memcpy(ce->name, path, namelen);
|
2012-07-11 11:22:37 +02:00
|
|
|
ce->ce_namelen = namelen;
|
2008-08-21 10:44:53 +02:00
|
|
|
if (!intent_only)
|
2019-05-24 14:23:47 +02:00
|
|
|
fill_stat_cache_info(istate, ce, st);
|
2008-11-29 04:55:25 +01:00
|
|
|
else
|
|
|
|
ce->ce_flags |= CE_INTENT_TO_ADD;
|
2006-07-26 03:52:35 +02:00
|
|
|
|
2016-09-14 23:07:47 +02:00
|
|
|
|
|
|
|
if (trust_executable_bit && has_symlinks) {
|
2008-05-09 18:11:43 +02:00
|
|
|
ce->ce_mode = create_ce_mode(st_mode);
|
2016-09-14 23:07:47 +02:00
|
|
|
} else {
|
2007-03-02 22:11:30 +01:00
|
|
|
/* If there is an existing entry, pick the mode bits and type
|
|
|
|
* from it, otherwise assume unexecutable regular file.
|
2006-07-26 03:52:35 +02:00
|
|
|
*/
|
2007-02-17 07:43:48 +01:00
|
|
|
struct cache_entry *ent;
|
2007-06-29 19:32:46 +02:00
|
|
|
int pos = index_name_pos_also_unmerged(istate, path, namelen);
|
2007-02-17 07:43:48 +01:00
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
ent = (0 <= pos) ? istate->cache[pos] : NULL;
|
2008-05-09 18:11:43 +02:00
|
|
|
ce->ce_mode = ce_mode_from_stat(ent, st_mode);
|
2006-07-26 03:52:35 +02:00
|
|
|
}
|
|
|
|
|
Support case folding for git add when core.ignorecase=true
When MyDir/ABC/filea.txt is added to Git, the disk directory MyDir/ABC/
is renamed to mydir/aBc/, and then mydir/aBc/fileb.txt is added, the
index will contain MyDir/ABC/filea.txt and mydir/aBc/fileb.txt. Although
the earlier portions of this patch series account for those differences
in case, this patch makes the pathing consistent by folding the case of
newly added files against the first file added with that path.
In read-cache.c's add_to_index(), the index_name_exists() support used
for git status's case insensitive directory lookups is used to find the
proper directory case according to what the user already checked in.
That is, MyDir/ABC/'s case is used to alter the stored path for
fileb.txt to MyDir/ABC/fileb.txt (instead of mydir/aBc/fileb.txt).
This is especially important when cloning a repository to a case
sensitive file system. MyDir/ABC/ and mydir/aBc/ exist in the same
directory on a Windows machine, but on Linux, the files exist in two
separate directories. The update to add_to_index(), in effect, treats a
Windows file system as case sensitive by making path case consistent.
Signed-off-by: Joshua Jensen <jjensen@workspacewhiz.com>
Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-10-03 11:56:45 +02:00
|
|
|
/* When core.ignorecase=true, determine if a directory of the same name but differing
|
|
|
|
* case already exists within the Git repository. If it does, ensure the directory
|
|
|
|
* case of the file being added to the repository matches (is folded into) the existing
|
|
|
|
* entry's directory case.
|
|
|
|
*/
|
|
|
|
if (ignore_case) {
|
2015-10-21 19:54:11 +02:00
|
|
|
adjust_dirname_case(istate, ce->name);
|
Support case folding for git add when core.ignorecase=true
When MyDir/ABC/filea.txt is added to Git, the disk directory MyDir/ABC/
is renamed to mydir/aBc/, and then mydir/aBc/fileb.txt is added, the
index will contain MyDir/ABC/filea.txt and mydir/aBc/fileb.txt. Although
the earlier portions of this patch series account for those differences
in case, this patch makes the pathing consistent by folding the case of
newly added files against the first file added with that path.
In read-cache.c's add_to_index(), the index_name_exists() support used
for git status's case insensitive directory lookups is used to find the
proper directory case according to what the user already checked in.
That is, MyDir/ABC/'s case is used to alter the stored path for
fileb.txt to MyDir/ABC/fileb.txt (instead of mydir/aBc/fileb.txt).
This is especially important when cloning a repository to a case
sensitive file system. MyDir/ABC/ and mydir/aBc/ exist in the same
directory on a Windows machine, but on Linux, the files exist in two
separate directories. The update to add_to_index(), in effect, treats a
Windows file system as case sensitive by making path case consistent.
Signed-off-by: Joshua Jensen <jjensen@workspacewhiz.com>
Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-10-03 11:56:45 +02:00
|
|
|
}
|
2019-02-07 03:00:22 +01:00
|
|
|
if (!(flags & ADD_CACHE_RENORMALIZE)) {
|
2017-11-16 17:38:28 +01:00
|
|
|
alias = index_file_exists(istate, ce->name,
|
|
|
|
ce_namelen(ce), ignore_case);
|
|
|
|
if (alias &&
|
|
|
|
!ce_stage(alias) &&
|
|
|
|
!ie_match_stat(istate, alias, st, ce_option)) {
|
|
|
|
/* Nothing changed, really */
|
|
|
|
if (!S_ISGITLINK(alias->ce_mode))
|
|
|
|
ce_mark_uptodate(alias);
|
|
|
|
alias->ce_flags |= CE_ADDED;
|
Support case folding for git add when core.ignorecase=true
When MyDir/ABC/filea.txt is added to Git, the disk directory MyDir/ABC/
is renamed to mydir/aBc/, and then mydir/aBc/fileb.txt is added, the
index will contain MyDir/ABC/filea.txt and mydir/aBc/fileb.txt. Although
the earlier portions of this patch series account for those differences
in case, this patch makes the pathing consistent by folding the case of
newly added files against the first file added with that path.
In read-cache.c's add_to_index(), the index_name_exists() support used
for git status's case insensitive directory lookups is used to find the
proper directory case according to what the user already checked in.
That is, MyDir/ABC/'s case is used to alter the stored path for
fileb.txt to MyDir/ABC/fileb.txt (instead of mydir/aBc/fileb.txt).
This is especially important when cloning a repository to a case
sensitive file system. MyDir/ABC/ and mydir/aBc/ exist in the same
directory on a Windows machine, but on Linux, the files exist in two
separate directories. The update to add_to_index(), in effect, treats a
Windows file system as case sensitive by making path case consistent.
Signed-off-by: Joshua Jensen <jjensen@workspacewhiz.com>
Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-10-03 11:56:45 +02:00
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
discard_cache_entry(ce);
|
2017-11-16 17:38:28 +01:00
|
|
|
return 0;
|
|
|
|
}
|
2007-07-31 02:12:58 +02:00
|
|
|
}
|
2008-08-21 10:44:53 +02:00
|
|
|
if (!intent_only) {
|
2019-02-05 23:26:13 +01:00
|
|
|
if (index_path(istate, &ce->oid, path, st, hash_flags)) {
|
2018-07-02 21:49:31 +02:00
|
|
|
discard_cache_entry(ce);
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("unable to index file '%s'"), path);
|
2015-03-21 01:28:00 +01:00
|
|
|
}
|
2008-08-21 10:44:53 +02:00
|
|
|
} else
|
2014-02-04 03:20:09 +01:00
|
|
|
set_object_name_for_intent_to_add_entry(ce);
|
2008-08-21 10:44:53 +02:00
|
|
|
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
if (ignore_case && alias && different_name(ce, alias))
|
2014-06-13 14:19:38 +02:00
|
|
|
ce = create_alias_ce(istate, ce, alias);
|
Make git-add behave more sensibly in a case-insensitive environment
This expands on the previous patch, and allows "git add" to sanely handle
a filename that has changed case, keeping the case in the index constant,
and avoiding aliases.
In particular, if you have an index entry called "File", but the
checked-out tree is case-corrupted and has an entry called "file"
instead, doing a
git add .
(or naming "file" explicitly) will automatically notice that we have an
alias, and will replace the name "file" with the existing index
capitalization (ie "File").
However, if we actually have *both* a file called "File" and one called
"file", and they don't have the same lstat() information (ie we're on a
case-sensitive filesystem but have the "core.ignorecase" flag set), we
will error out if we try to add them both.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-03-22 22:22:44 +01:00
|
|
|
ce->ce_flags |= CE_ADDED;
|
2008-05-21 21:04:34 +02:00
|
|
|
|
2008-07-17 03:48:58 +02:00
|
|
|
/* It was suspected to be racily clean, but it turns out to be Ok */
|
2008-05-21 21:04:34 +02:00
|
|
|
was_same = (alias &&
|
|
|
|
!ce_stage(alias) &&
|
convert "oidcmp() == 0" to oideq()
Using the more restrictive oideq() should, in the long run,
give the compiler more opportunities to optimize these
callsites. For now, this conversion should be a complete
noop with respect to the generated code.
The result is also perhaps a little more readable, as it
avoids the "zero is equal" idiom. Since it's so prevalent in
C, I think seasoned programmers tend not to even notice it
anymore, but it can sometimes make for awkward double
negations (e.g., we can drop a few !!oidcmp() instances
here).
This patch was generated almost entirely by the included
coccinelle patch. This mechanical conversion should be
completely safe, because we check explicitly for cases where
oidcmp() is compared to 0, which is what oideq() is doing
under the hood. Note that we don't have to catch "!oidcmp()"
separately; coccinelle's standard isomorphisms make sure the
two are treated equivalently.
I say "almost" because I did hand-edit the coccinelle output
to fix up a few style violations (it mostly keeps the
original formatting, but sometimes unwraps long lines).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-08-28 23:22:40 +02:00
|
|
|
oideq(&alias->oid, &ce->oid) &&
|
2008-05-21 21:04:34 +02:00
|
|
|
ce->ce_mode == alias->ce_mode);
|
|
|
|
|
|
|
|
if (pretend)
|
2018-07-02 21:49:31 +02:00
|
|
|
discard_cache_entry(ce);
|
2015-03-23 18:58:00 +01:00
|
|
|
else if (add_index_entry(istate, ce, add_option)) {
|
2018-07-02 21:49:31 +02:00
|
|
|
discard_cache_entry(ce);
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("unable to add '%s' to index"), path);
|
2015-03-23 18:58:00 +01:00
|
|
|
}
|
2008-05-21 21:04:34 +02:00
|
|
|
if (verbose && !was_same)
|
2006-07-26 03:52:35 +02:00
|
|
|
printf("add '%s'\n", path);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-09-14 23:07:47 +02:00
|
|
|
/*
 * lstat() `path` and pass the result to add_to_index().  Dies (with the
 * errno text) when the path cannot be stat'ed; otherwise returns
 * whatever add_to_index() returns.
 */
int add_file_to_index(struct index_state *istate, const char *path, int flags)
{
	struct stat info;

	if (lstat(path, &info) != 0)
		die_errno(_("unable to stat '%s'"), path);

	return add_to_index(istate, path, &info, flags);
}
|
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
/*
 * Allocate a cache entry with room for a name of `len` bytes from the
 * memory pool that find_mem_pool() selects for `istate`.
 * NOTE(review): presumably zero-filled, judging by the "calloc" in
 * mem_pool__ce_calloc() -- confirm against its definition.
 */
struct cache_entry *make_empty_cache_entry(struct index_state *istate, size_t len)
{
	struct mem_pool *pool = find_mem_pool(istate);

	return mem_pool__ce_calloc(pool, len);
}
|
|
|
|
|
|
|
|
/*
 * Allocate a cache entry with room for a name of `len` bytes using
 * xcalloc() directly, i.e. without going through an index's memory
 * pool.
 */
struct cache_entry *make_empty_transient_cache_entry(size_t len)
{
	struct cache_entry *ce = xcalloc(1, cache_entry_size(len));

	return ce;
}
|
|
|
|
|
|
|
|
struct cache_entry *make_cache_entry(struct index_state *istate,
|
|
|
|
unsigned int mode,
|
2018-07-02 21:49:30 +02:00
|
|
|
const struct object_id *oid,
|
|
|
|
const char *path,
|
|
|
|
int stage,
|
|
|
|
unsigned int refresh_options)
|
2007-09-11 05:17:28 +02:00
|
|
|
{
|
2015-02-17 19:06:14 +01:00
|
|
|
struct cache_entry *ce, *ret;
|
2018-07-02 21:49:31 +02:00
|
|
|
int len;
|
2007-09-11 05:17:28 +02:00
|
|
|
|
verify_path: disallow symlinks in .gitmodules
There are a few reasons it's not a good idea to make
.gitmodules a symlink, including:
1. It won't be portable to systems without symlinks.
2. It may behave inconsistently, since Git may look at
this file in the index or a tree without bothering to
resolve any symbolic links. We don't do this _yet_, but
the config infrastructure is there and it's planned for
the future.
With some clever code, we could make (2) work. And some
people may not care about (1) if they only work on one
platform. But there are a few security reasons to simply
disallow it:
a. A symlinked .gitmodules file may circumvent any fsck
checks of the content.
b. Git may read and write from the on-disk file without
sanity checking the symlink target. So for example, if
you link ".gitmodules" to "../oops" and run "git
submodule add", we'll write to the file "oops" outside
the repository.
Again, both of those are problems that _could_ be solved
with sufficient code, but given the complications in (1) and
(2), we're better off just outlawing it explicitly.
Note the slightly tricky call to verify_path() in
update-index's update_one(). There we may not have a mode if
we're not updating from the filesystem (e.g., we might just
be removing the file). Passing "0" as the mode there works
fine; since it's not a symlink, we'll just skip the extra
checks.
Signed-off-by: Jeff King <peff@peff.net>
2018-05-05 02:03:35 +02:00
|
|
|
if (!verify_path(path, mode)) {
|
2018-11-10 06:16:05 +01:00
|
|
|
error(_("invalid path '%s'"), path);
|
2007-09-11 05:17:28 +02:00
|
|
|
return NULL;
|
2008-10-11 18:39:37 +02:00
|
|
|
}
|
2007-09-11 05:17:28 +02:00
|
|
|
|
|
|
|
len = strlen(path);
|
2018-07-02 21:49:31 +02:00
|
|
|
ce = make_empty_cache_entry(istate, len);
|
2007-09-11 05:17:28 +02:00
|
|
|
|
2018-07-02 21:49:30 +02:00
|
|
|
oidcpy(&ce->oid, oid);
|
2007-09-11 05:17:28 +02:00
|
|
|
memcpy(ce->name, path, len);
|
2012-07-11 11:22:37 +02:00
|
|
|
ce->ce_flags = create_ce_flags(stage);
|
|
|
|
ce->ce_namelen = len;
|
2007-09-11 05:17:28 +02:00
|
|
|
ce->ce_mode = create_ce_mode(mode);
|
|
|
|
|
2018-09-21 17:57:25 +02:00
|
|
|
ret = refresh_cache_entry(istate, ce, refresh_options);
|
2015-03-23 18:57:11 +01:00
|
|
|
if (ret != ce)
|
2018-07-02 21:49:31 +02:00
|
|
|
discard_cache_entry(ce);
|
2015-03-23 18:57:11 +01:00
|
|
|
return ret;
|
2007-09-11 05:17:28 +02:00
|
|
|
}
|
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
struct cache_entry *make_transient_cache_entry(unsigned int mode, const struct object_id *oid,
|
|
|
|
const char *path, int stage)
|
|
|
|
{
|
|
|
|
struct cache_entry *ce;
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (!verify_path(path, mode)) {
|
2018-11-10 06:16:05 +01:00
|
|
|
error(_("invalid path '%s'"), path);
|
2018-07-02 21:49:31 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
len = strlen(path);
|
|
|
|
ce = make_empty_transient_cache_entry(len);
|
|
|
|
|
|
|
|
oidcpy(&ce->oid, oid);
|
|
|
|
memcpy(ce->name, path, len);
|
|
|
|
ce->ce_flags = create_ce_flags(stage);
|
|
|
|
ce->ce_namelen = len;
|
|
|
|
ce->ce_mode = create_ce_mode(mode);
|
|
|
|
|
|
|
|
return ce;
|
|
|
|
}
|
|
|
|
|
2016-09-14 23:07:46 +02:00
|
|
|
/*
|
|
|
|
* Chmod an index entry with either +x or -x.
|
|
|
|
*
|
|
|
|
* Returns -1 if the chmod for the particular cache entry failed (if it's
|
|
|
|
* not a regular file), -2 if an invalid flip argument is passed in, 0
|
|
|
|
* otherwise.
|
|
|
|
*/
|
|
|
|
int chmod_index_entry(struct index_state *istate, struct cache_entry *ce,
|
|
|
|
char flip)
|
|
|
|
{
|
|
|
|
if (!S_ISREG(ce->ce_mode))
|
|
|
|
return -1;
|
|
|
|
switch (flip) {
|
|
|
|
case '+':
|
|
|
|
ce->ce_mode |= 0111;
|
|
|
|
break;
|
|
|
|
case '-':
|
|
|
|
ce->ce_mode &= ~0111;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -2;
|
|
|
|
}
|
|
|
|
cache_tree_invalidate_path(istate, ce->name);
|
|
|
|
ce->ce_flags |= CE_UPDATE_IN_BASE;
|
2017-09-22 18:35:40 +02:00
|
|
|
mark_fsmonitor_invalid(istate, ce);
|
2016-09-14 23:07:46 +02:00
|
|
|
istate->cache_changed |= CE_ENTRY_CHANGED;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Convert "struct cache_entry *" to "const ..." wherever possible
I attempted to make index_state->cache[] a "const struct cache_entry **"
to find out how existing entries in index are modified and where. The
question I have is what do we do if we really need to keep track of on-disk
changes in the index. The result is
- diff-lib.c: setting CE_UPTODATE
- name-hash.c: setting CE_HASHED
- preload-index.c, read-cache.c, unpack-trees.c and
builtin/update-index: obvious
- entry.c: write_entry() may refresh the checked out entry via
fill_stat_cache_info(). This causes "non-const struct cache_entry
*" in builtin/apply.c, builtin/checkout-index.c and
builtin/checkout.c
- builtin/ls-files.c: --with-tree changes stagemask and may set
CE_UPDATE
Of these, write_entry() and its call sites are probably most
interesting because it modifies on-disk info. But this is stat info
and can be retrieved via refresh, at least for porcelain
commands. Other just uses ce_flags for local purposes.
So, keeping track of "dirty" entries is just a matter of setting a
flag in index modification functions exposed by read-cache.c. Except
unpack-trees, the rest of the code base does not do anything funny
behind read-cache's back.
The actual patch is less valuable than the summary above. But if
anyone wants to re-identify the above sites, applying this patch, then
this:
diff --git a/cache.h b/cache.h
index 430d021..1692891 100644
--- a/cache.h
+++ b/cache.h
@@ -267,7 +267,7 @@ static inline unsigned int canon_mode(unsigned int mode)
#define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
struct index_state {
- struct cache_entry **cache;
+ const struct cache_entry **cache;
unsigned int version;
unsigned int cache_nr, cache_alloc, cache_changed;
struct string_list *resolve_undo;
will help quickly identify them without bogus warnings.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-07-09 17:29:00 +02:00
|
|
|
int ce_same_name(const struct cache_entry *a, const struct cache_entry *b)
|
2005-04-16 21:05:45 +02:00
|
|
|
{
|
|
|
|
int len = ce_namelen(a);
|
|
|
|
return ce_namelen(b) == len && !memcmp(a->name, b->name, len);
|
|
|
|
}
|
|
|
|
|
2006-05-18 21:07:31 +02:00
|
|
|
/*
|
|
|
|
* We fundamentally don't like some paths: we don't want
|
|
|
|
* dot or dot-dot anywhere, and for obvious reasons don't
|
|
|
|
* want to recurse into ".git" either.
|
|
|
|
*
|
|
|
|
* Also, we don't want double slashes or slashes at the
|
|
|
|
* end that can make pathnames ambiguous.
|
|
|
|
*/
|
verify_path: disallow symlinks in .gitmodules
There are a few reasons it's not a good idea to make
.gitmodules a symlink, including:
1. It won't be portable to systems without symlinks.
2. It may behave inconsistently, since Git may look at
this file in the index or a tree without bothering to
resolve any symbolic links. We don't do this _yet_, but
the config infrastructure is there and it's planned for
the future.
With some clever code, we could make (2) work. And some
people may not care about (1) if they only work on one
platform. But there are a few security reasons to simply
disallow it:
a. A symlinked .gitmodules file may circumvent any fsck
checks of the content.
b. Git may read and write from the on-disk file without
sanity checking the symlink target. So for example, if
you link ".gitmodules" to "../oops" and run "git
submodule add", we'll write to the file "oops" outside
the repository.
Again, both of those are problems that _could_ be solved
with sufficient code, but given the complications in (1) and
(2), we're better off just outlawing it explicitly.
Note the slightly tricky call to verify_path() in
update-index's update_one(). There we may not have a mode if
we're not updating from the filesystem (e.g., we might just
be removing the file). Passing "0" as the mode there works
fine; since it's not a symlink, we'll just skip the extra
checks.
Signed-off-by: Jeff King <peff@peff.net>
2018-05-05 02:03:35 +02:00
|
|
|
/*
 * Validate a path component that began with '.'.  "rest" points just
 * past that leading dot.  Returns 0 when the component must be refused
 * and 1 when it is acceptable.
 *
 * Refused components: ".", "..", ".git" (matched case-insensitively)
 * and, when "mode" describes a symlink, ".gitmodules" (also matched
 * case-insensitively).
 */
static int verify_dotfile(const char *rest, unsigned mode)
{
	/* "." is not allowed */
	if (*rest == '\0' || is_dir_sep(*rest))
		return 0;

	/* ".." is not allowed either */
	if (*rest == '.')
		return !(rest[1] == '\0' || is_dir_sep(rest[1]));

	/*
	 * Anything that does not spell "git" (in any letter case) right
	 * after the dot is fine.  Each character is only inspected once
	 * the previous one matched, so we never read past the NUL.
	 */
	if (*rest != 'g' && *rest != 'G')
		return 1;
	if (rest[1] != 'i' && rest[1] != 'I')
		return 1;
	if (rest[2] != 't' && rest[2] != 'T')
		return 1;

	/* ".git" followed by NUL or a directory separator is bad. */
	rest += 3;
	if (*rest == '\0' || is_dir_sep(*rest))
		return 0;

	/* Having seen ".git", a symlink must not be named ".gitmodules". */
	if (S_ISLNK(mode) &&
	    skip_iprefix(rest, "modules", &rest) &&
	    (*rest == '\0' || is_dir_sep(*rest)))
		return 0;

	return 1;
}
|
|
|
|
|
verify_path: disallow symlinks in .gitmodules
There are a few reasons it's not a good idea to make
.gitmodules a symlink, including:
1. It won't be portable to systems without symlinks.
2. It may behave inconsistently, since Git may look at
this file in the index or a tree without bothering to
resolve any symbolic links. We don't do this _yet_, but
the config infrastructure is there and it's planned for
the future.
With some clever code, we could make (2) work. And some
people may not care about (1) if they only work on one
platform. But there are a few security reasons to simply
disallow it:
a. A symlinked .gitmodules file may circumvent any fsck
checks of the content.
b. Git may read and write from the on-disk file without
sanity checking the symlink target. So for example, if
you link ".gitmodules" to "../oops" and run "git
submodule add", we'll write to the file "oops" outside
the repository.
Again, both of those are problems that _could_ be solved
with sufficient code, but given the complications in (1) and
(2), we're better off just outlawing it explicitly.
Note the slightly tricky call to verify_path() in
update-index's update_one(). There we may not have a mode if
we're not updating from the filesystem (e.g., we might just
be removing the file). Passing "0" as the mode there works
fine; since it's not a symlink, we'll just skip the extra
checks.
Signed-off-by: Jeff King <peff@peff.net>
2018-05-05 02:03:35 +02:00
|
|
|
/*
 * Walk "path" component by component and return 1 when it is
 * acceptable, 0 when it must be refused.  "mode" is only inspected with
 * S_ISLNK(), to additionally refuse symlinks named like ".gitmodules".
 *
 * A path is refused when it has a DOS drive prefix, fails
 * is_valid_path(), is empty, contains an empty component (leading,
 * doubled or trailing directory separator), or contains a component
 * rejected by verify_dotfile() or by the HFS/NTFS checks that are
 * enabled through protect_hfs / protect_ntfs.
 */
int verify_path(const char *path, unsigned mode)
{
	/* The character consumed most recently; 0 until the first read. */
	char c = 0;

	if (has_dos_drive_prefix(path))
		return 0;

	if (!is_valid_path(path))
		return 0;

	/*
	 * The start of the path is the start of a component, so enter the
	 * loop at the point that normally runs right after a separator.
	 */
	goto inside;
	for (;;) {
		/* Reached the terminating NUL without finding a problem. */
		if (!c)
			return 1;
		if (is_dir_sep(c)) {
inside:
			/* "path" points at the first character of a component. */
			if (protect_hfs) {

				if (is_hfs_dotgit(path))
					return 0;
				if (S_ISLNK(mode)) {
					if (is_hfs_dotgitmodules(path))
						return 0;
				}
			}
			if (protect_ntfs) {
#ifdef GIT_WINDOWS_NATIVE
				/*
				 * Here "c" is the separator that was just
				 * consumed (or 0 on the initial entry); a
				 * backslash used as separator is refused.
				 */
				if (c == '\\')
					return 0;
#endif
				if (is_ntfs_dotgit(path))
					return 0;
				if (S_ISLNK(mode)) {
					if (is_ntfs_dotgitmodules(path))
						return 0;
				}
			}

			/*
			 * Consume the first character of the component.  A
			 * leading '.' hands the remainder to verify_dotfile();
			 * a separator or NUL here means an empty component.
			 */
			c = *path++;
			if ((c == '.' && !verify_dotfile(path, mode)) ||
			    is_dir_sep(c) || c == '\0')
				return 0;
		} else if (c == '\\' && protect_ntfs) {
			/*
			 * A backslash that is not a directory separator on
			 * this platform: the backslash itself is allowed,
			 * but what follows it still goes through the NTFS
			 * ".git" / ".gitmodules" checks.
			 */
			if (is_ntfs_dotgit(path))
				return 0;
			if (S_ISLNK(mode)) {
				if (is_ntfs_dotgitmodules(path))
					return 0;
			}
		}

		c = *path++;
	}
}
|
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
/*
|
|
|
|
* Do we have another file that has the beginning components being a
|
|
|
|
* proper superset of the name we're trying to add?
|
2005-05-08 06:48:12 +02:00
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
static int has_file_name(struct index_state *istate,
|
|
|
|
const struct cache_entry *ce, int pos, int ok_to_replace)
|
2005-05-08 06:48:12 +02:00
|
|
|
{
|
2005-06-19 05:21:34 +02:00
|
|
|
int retval = 0;
|
|
|
|
int len = ce_namelen(ce);
|
2005-06-25 11:25:29 +02:00
|
|
|
int stage = ce_stage(ce);
|
2005-06-19 05:21:34 +02:00
|
|
|
const char *name = ce->name;
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
while (pos < istate->cache_nr) {
|
|
|
|
struct cache_entry *p = istate->cache[pos++];
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
if (len >= ce_namelen(p))
|
2005-05-08 06:48:12 +02:00
|
|
|
break;
|
2005-06-19 05:21:34 +02:00
|
|
|
if (memcmp(name, p->name, len))
|
|
|
|
break;
|
2005-06-25 11:25:29 +02:00
|
|
|
if (ce_stage(p) != stage)
|
|
|
|
continue;
|
2005-06-19 05:21:34 +02:00
|
|
|
if (p->name[len] != '/')
|
|
|
|
continue;
|
2008-01-15 01:03:17 +01:00
|
|
|
if (p->ce_flags & CE_REMOVE)
|
2007-03-30 10:55:37 +02:00
|
|
|
continue;
|
2005-06-19 05:21:34 +02:00
|
|
|
retval = -1;
|
|
|
|
if (!ok_to_replace)
|
|
|
|
break;
|
2007-04-02 08:26:07 +02:00
|
|
|
remove_index_entry_at(istate, --pos);
|
2005-05-08 06:48:12 +02:00
|
|
|
}
|
2005-06-19 05:21:34 +02:00
|
|
|
return retval;
|
|
|
|
}
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2017-04-14 21:12:28 +02:00
|
|
|
|
|
|
|
/*
 * strcmp() that can also report where the strings diverge.
 *
 * When "first_change" is non-NULL it receives the offset of the first
 * differing byte, or the common length when the strings are equal, and
 * the return value is the difference of the two bytes at that offset
 * compared as unsigned char.  When "first_change" is NULL this is a
 * plain strcmp().
 */
int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
{
	size_t off = 0;

	if (first_change == NULL)
		return strcmp(s1, s2);

	/* Advance over the common prefix, stopping at the terminator. */
	while (s1[off] != '\0' && s1[off] == s2[off])
		off++;

	*first_change = off;
	return (unsigned char)s1[off] - (unsigned char)s2[off];
}
|
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
/*
|
|
|
|
* Do we have another file with a pathname that is a proper
|
|
|
|
* subset of the name we're trying to add?
|
2017-04-19 19:06:17 +02:00
|
|
|
*
|
|
|
|
* That is, is there another file in the index with a path
|
|
|
|
* that matches a sub-directory in the given entry?
|
2005-06-19 05:21:34 +02:00
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
static int has_dir_name(struct index_state *istate,
|
|
|
|
const struct cache_entry *ce, int pos, int ok_to_replace)
|
2005-06-19 05:21:34 +02:00
|
|
|
{
|
|
|
|
int retval = 0;
|
2005-06-25 11:25:29 +02:00
|
|
|
int stage = ce_stage(ce);
|
2005-06-19 05:21:34 +02:00
|
|
|
const char *name = ce->name;
|
|
|
|
const char *slash = name + ce_namelen(ce);
|
2017-04-19 19:06:17 +02:00
|
|
|
size_t len_eq_last;
|
|
|
|
int cmp_last = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are frequently called during an iteration on a sorted
|
|
|
|
* list of pathnames and while building a new index. Therefore,
|
|
|
|
* there is a high probability that this entry will eventually
|
|
|
|
* be appended to the index, rather than inserted in the middle.
|
|
|
|
* If we can confirm that, we can avoid binary searches on the
|
|
|
|
* components of the pathname.
|
|
|
|
*
|
|
|
|
* Compare the entry's full path with the last path in the index.
|
|
|
|
*/
|
|
|
|
if (istate->cache_nr > 0) {
|
|
|
|
cmp_last = strcmp_offset(name,
|
|
|
|
istate->cache[istate->cache_nr - 1]->name,
|
|
|
|
&len_eq_last);
|
|
|
|
if (cmp_last > 0) {
|
|
|
|
if (len_eq_last == 0) {
|
|
|
|
/*
|
|
|
|
* The entry sorts AFTER the last one in the
|
|
|
|
* index and their paths have no common prefix,
|
|
|
|
* so there cannot be a F/D conflict.
|
|
|
|
*/
|
|
|
|
return retval;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* The entry sorts AFTER the last one in the
|
|
|
|
* index, but has a common prefix. Fall through
|
|
|
|
* to the loop below to disect the entry's path
|
|
|
|
* and see where the difference is.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
} else if (cmp_last == 0) {
|
|
|
|
/*
|
|
|
|
* The entry exactly matches the last one in the
|
|
|
|
* index, but because of multiple stage and CE_REMOVE
|
|
|
|
* items, we fall through and let the regular search
|
|
|
|
* code handle it.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
}
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
for (;;) {
|
2017-04-19 19:06:18 +02:00
|
|
|
size_t len;
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
for (;;) {
|
|
|
|
if (*--slash == '/')
|
|
|
|
break;
|
|
|
|
if (slash <= ce->name)
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
len = slash - name;
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2017-04-19 19:06:18 +02:00
|
|
|
if (cmp_last > 0) {
|
|
|
|
/*
|
|
|
|
* (len + 1) is a directory boundary (including
|
|
|
|
* the trailing slash). And since the loop is
|
|
|
|
* decrementing "slash", the first iteration is
|
|
|
|
* the longest directory prefix; subsequent
|
|
|
|
* iterations consider parent directories.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (len + 1 <= len_eq_last) {
|
|
|
|
/*
|
|
|
|
* The directory prefix (including the trailing
|
|
|
|
* slash) also appears as a prefix in the last
|
|
|
|
* entry, so the remainder cannot collide (because
|
|
|
|
* strcmp said the whole path was greater).
|
|
|
|
*
|
|
|
|
* EQ: last: xxx/A
|
|
|
|
* this: xxx/B
|
|
|
|
*
|
|
|
|
* LT: last: xxx/file_A
|
|
|
|
* this: xxx/file_B
|
|
|
|
*/
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len > len_eq_last) {
|
|
|
|
/*
|
|
|
|
* This part of the directory prefix (excluding
|
|
|
|
* the trailing slash) is longer than the known
|
|
|
|
* equal portions, so this sub-directory cannot
|
|
|
|
* collide with a file.
|
|
|
|
*
|
|
|
|
* GT: last: xxxA
|
|
|
|
* this: xxxB/file
|
|
|
|
*/
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is a possible collision. Fall through and
|
|
|
|
* let the regular search code handle it.
|
|
|
|
*
|
|
|
|
* last: xxx
|
|
|
|
* this: xxx/file
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
2012-07-11 11:22:37 +02:00
|
|
|
pos = index_name_stage_pos(istate, name, len, stage);
|
2005-06-19 05:21:34 +02:00
|
|
|
if (pos >= 0) {
|
2007-03-30 10:55:37 +02:00
|
|
|
/*
|
|
|
|
* Found one, but not so fast. This could
|
|
|
|
* be a marker that says "I was here, but
|
|
|
|
* I am being removed". Such an entry is
|
|
|
|
* not a part of the resulting tree, and
|
|
|
|
* it is Ok to have a directory at the same
|
|
|
|
* path.
|
|
|
|
*/
|
2008-01-23 06:24:21 +01:00
|
|
|
if (!(istate->cache[pos]->ce_flags & CE_REMOVE)) {
|
2007-03-30 10:55:37 +02:00
|
|
|
retval = -1;
|
|
|
|
if (!ok_to_replace)
|
|
|
|
break;
|
2007-04-02 08:26:07 +02:00
|
|
|
remove_index_entry_at(istate, pos);
|
2007-03-30 10:55:37 +02:00
|
|
|
continue;
|
|
|
|
}
|
2005-06-19 05:21:34 +02:00
|
|
|
}
|
2007-03-30 10:55:37 +02:00
|
|
|
else
|
|
|
|
pos = -pos-1;
|
2005-06-19 05:21:34 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Trivial optimization: if we find an entry that
|
|
|
|
* already matches the sub-directory, then we know
|
2005-06-25 11:25:29 +02:00
|
|
|
* we're ok, and we can exit.
|
2005-06-19 05:21:34 +02:00
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
while (pos < istate->cache_nr) {
|
|
|
|
struct cache_entry *p = istate->cache[pos];
|
2005-06-25 11:25:29 +02:00
|
|
|
if ((ce_namelen(p) <= len) ||
|
|
|
|
(p->name[len] != '/') ||
|
|
|
|
memcmp(p->name, name, len))
|
|
|
|
break; /* not our subdirectory */
|
2008-01-23 06:24:21 +01:00
|
|
|
if (ce_stage(p) == stage && !(p->ce_flags & CE_REMOVE))
|
|
|
|
/*
|
|
|
|
* p is at the same stage as our entry, and
|
2005-06-25 11:25:29 +02:00
|
|
|
* is a subdirectory of what we are looking
|
|
|
|
* at, so we cannot have conflicts at our
|
|
|
|
* level or anything shorter.
|
|
|
|
*/
|
|
|
|
return retval;
|
|
|
|
pos++;
|
2005-05-08 06:55:21 +02:00
|
|
|
}
|
2005-05-08 06:48:12 +02:00
|
|
|
}
|
2005-06-19 05:21:34 +02:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We may be in a situation where we already have path/file and path
|
|
|
|
* is being added, or we already have path and path/file is being
|
|
|
|
* added. Either one would result in a nonsense tree that has path
|
|
|
|
* twice when git-write-tree tries to write it out. Prevent it.
|
2007-06-07 09:04:01 +02:00
|
|
|
*
|
2005-06-19 05:21:34 +02:00
|
|
|
* If ok-to-replace is specified, we remove the conflicting entries
|
|
|
|
* from the cache so the caller should recompute the insert position.
|
|
|
|
* When this happens, we return non-zero.
|
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
static int check_file_directory_conflict(struct index_state *istate,
|
|
|
|
const struct cache_entry *ce,
|
|
|
|
int pos, int ok_to_replace)
|
2005-06-19 05:21:34 +02:00
|
|
|
{
|
2007-03-30 10:55:37 +02:00
|
|
|
int retval;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When ce is an "I am going away" entry, we allow it to be added
|
|
|
|
*/
|
2008-01-15 01:03:17 +01:00
|
|
|
if (ce->ce_flags & CE_REMOVE)
|
2007-03-30 10:55:37 +02:00
|
|
|
return 0;
|
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
/*
|
|
|
|
* We check if the path is a sub-path of a subsequent pathname
|
|
|
|
* first, since removing those will not change the position
|
2007-03-30 10:55:37 +02:00
|
|
|
* in the array.
|
2005-06-19 05:21:34 +02:00
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
retval = has_file_name(istate, ce, pos, ok_to_replace);
|
2007-03-30 10:55:37 +02:00
|
|
|
|
2005-06-19 05:21:34 +02:00
|
|
|
/*
|
|
|
|
* Then check if the path might have a clashing sub-directory
|
|
|
|
* before it.
|
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
return retval + has_dir_name(istate, ce, pos, ok_to_replace);
|
2005-05-08 06:48:12 +02:00
|
|
|
}
|
|
|
|
|
2007-08-09 22:42:50 +02:00
|
|
|
static int add_index_entry_with_check(struct index_state *istate, struct cache_entry *ce, int option)
|
2005-04-09 21:09:27 +02:00
|
|
|
{
|
|
|
|
int pos;
|
2005-05-08 06:55:21 +02:00
|
|
|
int ok_to_add = option & ADD_CACHE_OK_TO_ADD;
|
|
|
|
int ok_to_replace = option & ADD_CACHE_OK_TO_REPLACE;
|
2005-06-25 11:25:29 +02:00
|
|
|
int skip_df_check = option & ADD_CACHE_SKIP_DFCHECK;
|
2008-08-21 10:44:53 +02:00
|
|
|
int new_only = option & ADD_CACHE_NEW_ONLY;
|
2006-02-09 06:15:24 +01:00
|
|
|
|
2014-06-13 14:19:42 +02:00
|
|
|
if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
|
|
|
|
cache_tree_invalidate_path(istate, ce->name);
|
2017-04-19 19:06:16 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this entry's path sorts after the last entry in the index,
|
|
|
|
* we can avoid searching for it.
|
|
|
|
*/
|
|
|
|
if (istate->cache_nr > 0 &&
|
|
|
|
strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
|
msvc: avoid using minus operator on unsigned types
MSVC complains about this with `-Wall`, which can be taken as a sign
that this is indeed a real bug. The symptom is:
C4146: unary minus operator applied to unsigned type, result
still unsigned
Let's avoid this warning in the minimal way, e.g. writing `-1 -
<unsigned value>` instead of `-<unsigned value> - 1`.
Note that the change in the `estimate_cache_size()` function is
needed because MSVC considers the "return type" of the `sizeof()`
operator to be `size_t`, i.e. unsigned, and therefore it cannot be
negated using the unary minus operator.
Even worse, that arithmetic is doing extra work, in vain. We want to
calculate the entry extra cache size as the difference between the
size of the `cache_entry` structure minus the size of the
`ondisk_cache_entry` structure, padded to the appropriate alignment
boundary.
To that end, we start by assigning that difference to the `per_entry`
variable, and then abuse the `len` parameter of the
`align_padding_size()` macro to take the negative size of the ondisk
entry size. Essentially, we try to avoid passing the already calculated
difference to that macro by passing the operands of that difference
instead, when the macro expects operands of an addition:
#define align_padding_size(size, len) \
((size + (len) + 8) & ~7) - (size + len)
Currently, we pass A and -B to that macro instead of passing A - B and
0, where A - B is already stored in the `per_entry` variable, ready to
be used.
This is neither necessary, nor intuitive. Let's fix this, and have code
that is both easier to read and that also does not trigger MSVC's
warning.
While at it, we take care of reporting overflows (which are unlikely,
but hey, defensive programming is good!).
We _also_ take pains of casting the unsigned value to signed: otherwise,
the signed operand (i.e. the `-1`) would be cast to unsigned before
doing the arithmetic.
Helped-by: Denton Liu <liu.denton@gmail.com>
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-10-04 17:09:26 +02:00
|
|
|
pos = index_pos_to_insert_pos(istate->cache_nr);
|
2017-04-19 19:06:16 +02:00
|
|
|
else
|
|
|
|
pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
|
2005-04-09 21:09:27 +02:00
|
|
|
|
2005-10-12 03:45:33 +02:00
|
|
|
/* existing match? Just replace it. */
|
2005-04-11 07:06:50 +02:00
|
|
|
if (pos >= 0) {
|
2008-08-21 10:44:53 +02:00
|
|
|
if (!new_only)
|
|
|
|
replace_index_entry(istate, pos, ce);
|
2005-04-09 21:09:27 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-11 07:06:50 +02:00
|
|
|
pos = -pos-1;
|
2005-04-09 21:09:27 +02:00
|
|
|
|
2015-06-07 12:40:52 +02:00
|
|
|
if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
|
|
|
|
untracked_cache_add_to_index(istate, ce->name);
|
2015-03-08 11:12:35 +01:00
|
|
|
|
2005-04-16 21:05:45 +02:00
|
|
|
/*
|
|
|
|
* Inserting a merged entry ("stage 0") into the index
|
|
|
|
* will always replace all non-merged entries..
|
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
if (pos < istate->cache_nr && ce_stage(ce) == 0) {
|
|
|
|
while (ce_same_name(istate->cache[pos], ce)) {
|
2005-04-16 21:05:45 +02:00
|
|
|
ok_to_add = 1;
|
2007-04-02 08:26:07 +02:00
|
|
|
if (!remove_index_entry_at(istate, pos))
|
2005-04-16 21:05:45 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-10 20:32:54 +02:00
|
|
|
if (!ok_to_add)
|
|
|
|
return -1;
|
verify_path: disallow symlinks in .gitmodules
There are a few reasons it's not a good idea to make
.gitmodules a symlink, including:
1. It won't be portable to systems without symlinks.
2. It may behave inconsistently, since Git may look at
this file in the index or a tree without bothering to
resolve any symbolic links. We don't do this _yet_, but
the config infrastructure is there and it's planned for
the future.
With some clever code, we could make (2) work. And some
people may not care about (1) if they only work on one
platform. But there are a few security reasons to simply
disallow it:
a. A symlinked .gitmodules file may circumvent any fsck
checks of the content.
b. Git may read and write from the on-disk file without
sanity checking the symlink target. So for example, if
you link ".gitmodules" to "../oops" and run "git
submodule add", we'll write to the file "oops" outside
the repository.
Again, both of those are problems that _could_ be solved
with sufficient code, but given the complications in (1) and
(2), we're better off just outlawing it explicitly.
Note the slightly tricky call to verify_path() in
update-index's update_one(). There we may not have a mode if
we're not updating from the filesystem (e.g., we might just
be removing the file). Passing "0" as the mode there works
fine; since it's not a symlink, we'll just skip the extra
checks.
Signed-off-by: Jeff King <peff@peff.net>
2018-05-05 02:03:35 +02:00
|
|
|
if (!verify_path(ce->name, ce->ce_mode))
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("invalid path '%s'"), ce->name);
|
2005-04-10 20:32:54 +02:00
|
|
|
|
2005-10-12 03:45:33 +02:00
|
|
|
if (!skip_df_check &&
|
2007-04-02 08:26:07 +02:00
|
|
|
check_file_directory_conflict(istate, ce, pos, ok_to_replace)) {
|
2005-05-08 06:55:21 +02:00
|
|
|
if (!ok_to_replace)
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("'%s' appears as both a file and as a directory"),
|
2007-04-02 08:26:07 +02:00
|
|
|
ce->name);
|
2012-07-11 11:22:37 +02:00
|
|
|
pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
|
2005-05-08 06:55:21 +02:00
|
|
|
pos = -pos-1;
|
|
|
|
}
|
2007-08-09 22:42:50 +02:00
|
|
|
return pos + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int add_index_entry(struct index_state *istate, struct cache_entry *ce, int option)
|
|
|
|
{
|
|
|
|
int pos;
|
|
|
|
|
|
|
|
if (option & ADD_CACHE_JUST_APPEND)
|
|
|
|
pos = istate->cache_nr;
|
|
|
|
else {
|
|
|
|
int ret;
|
|
|
|
ret = add_index_entry_with_check(istate, ce, option);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
pos = ret - 1;
|
|
|
|
}
|
2005-05-08 06:48:12 +02:00
|
|
|
|
2005-04-09 21:09:27 +02:00
|
|
|
/* Make sure the array is big enough .. */
|
2014-03-03 23:32:01 +01:00
|
|
|
ALLOC_GROW(istate->cache, istate->cache_nr + 1, istate->cache_alloc);
|
2005-04-09 21:09:27 +02:00
|
|
|
|
|
|
|
/* Add it in.. */
|
2007-04-02 08:26:07 +02:00
|
|
|
istate->cache_nr++;
|
2007-08-09 22:42:50 +02:00
|
|
|
if (istate->cache_nr > pos + 1)
|
2018-01-22 18:50:09 +01:00
|
|
|
MOVE_ARRAY(istate->cache + pos + 1, istate->cache + pos,
|
|
|
|
istate->cache_nr - pos - 1);
|
Create pathname-based hash-table lookup into index
This creates a hash index of every single file added to the index.
Right now that hash index isn't actually used for much: I implemented a
"cache_name_exists()" function that uses it to efficiently look up a
filename in the index without having to do the O(logn) binary search,
but quite frankly, that's not why this patch is interesting.
No, the whole and only reason to create the hash of the filenames in the
index is that by modifying the hash function, you can fairly easily do
things like making it always hash equivalent names into the same bucket.
That, in turn, means that suddenly questions like "does this name exist
in the index under an _equivalent_ name?" becomes much much cheaper.
Guiding principles behind this patch:
- it shouldn't be too costly. In fact, my primary goal here was to
actually speed up "git commit" with a fully populated kernel tree, by
being faster at checking whether a file already existed in the index. I
did succeed, but only barely:
Best before:
[torvalds@woody linux]$ time git commit > /dev/null
real 0m0.255s
user 0m0.168s
sys 0m0.088s
Best after:
[torvalds@woody linux]$ time ~/git/git commit > /dev/null
real 0m0.233s
user 0m0.144s
sys 0m0.088s
so some things are actually faster (~8%).
Caveat: that's really the best case. Other things are invariably going
to be slightly slower, since we populate that index cache, and quite
frankly, few things really use it to look things up.
That said, the cost is really quite small. The worst case is probably
doing a "git ls-files", which will do very little except puopulate the
index, and never actually looks anything up in it, just lists it.
Before:
[torvalds@woody linux]$ time git ls-files > /dev/null
real 0m0.016s
user 0m0.016s
sys 0m0.000s
After:
[torvalds@woody linux]$ time ~/git/git ls-files > /dev/null
real 0m0.021s
user 0m0.012s
sys 0m0.008s
and while the thing has really gotten relatively much slower, we're
still talking about something almost unmeasurable (eg 5ms). And that
really should be pretty much the worst case.
So we lose 5ms on one "benchmark", but win 22ms on another. Pick your
poison - this patch has the advantage that it will _likely_ speed up
the cases that are complex and expensive more than it slows down the
cases that are already so fast that nobody cares. But if you look at
relative speedups/slowdowns, it doesn't look so good.
- It should be simple and clean
The code may be a bit subtle (the reasons I do hash removal the way I
do etc), but it re-uses the existing hash.c files, so it really is
fairly small and straightforward apart from a few odd details.
Now, this patch on its own doesn't really do much, but I think it's worth
looking at, if only because if done correctly, the name hashing really can
make an improvement to the whole issue of "do we have a filename that
looks like this in the index already". And at least it gets real testing
by being used even by default (ie there is a real use-case for it even
without any insane filesystems).
NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just
not ashamed of it enough to really care. I took all the numbers out of my
nether regions - I'm sure it's good enough that it works in practice, but
the whole point was that you can make a really much fancier hash that
hashes characters not directly, but by their upper-case value or something
like that, and thus you get a case-insensitive hash, while still keeping
the name and the index itself totally case sensitive.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-23 03:41:14 +01:00
|
|
|
set_index_entry(istate, pos, ce);
|
2014-06-13 14:19:27 +02:00
|
|
|
istate->cache_changed |= CE_ENTRY_ADDED;
|
2005-04-09 21:09:27 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-05-19 18:56:35 +02:00
|
|
|
/*
|
|
|
|
* "refresh" does not calculate a new sha1 file or bring the
|
|
|
|
* cache up-to-date for mode/content changes. But what it
|
|
|
|
* _does_ do is to "re-match" the stat information of a file
|
|
|
|
* with the cache, so that you can refresh the cache for a
|
|
|
|
* file that hasn't been changed but where the stat entry is
|
|
|
|
* out of date.
|
|
|
|
*
|
|
|
|
* For example, you'd want to do this after doing a "git-read-tree",
|
|
|
|
* to link up the stat cache details with the proper files.
|
|
|
|
*/
|
2007-04-02 08:26:07 +02:00
|
|
|
static struct cache_entry *refresh_cache_ent(struct index_state *istate,
|
2007-11-10 09:15:03 +01:00
|
|
|
struct cache_entry *ce,
|
2011-11-18 12:11:08 +01:00
|
|
|
unsigned int options, int *err,
|
|
|
|
int *changed_ret)
|
2006-05-19 18:56:35 +02:00
|
|
|
{
|
|
|
|
struct stat st;
|
|
|
|
struct cache_entry *updated;
|
2018-07-02 21:49:31 +02:00
|
|
|
int changed;
|
2014-01-27 15:45:08 +01:00
|
|
|
int refresh = options & CE_MATCH_REFRESH;
|
2007-11-10 09:15:03 +01:00
|
|
|
int ignore_valid = options & CE_MATCH_IGNORE_VALID;
|
2009-12-14 12:43:58 +01:00
|
|
|
int ignore_skip_worktree = options & CE_MATCH_IGNORE_SKIP_WORKTREE;
|
2014-01-27 15:45:07 +01:00
|
|
|
int ignore_missing = options & CE_MATCH_IGNORE_MISSING;
|
2017-09-22 18:35:40 +02:00
|
|
|
int ignore_fsmonitor = options & CE_MATCH_IGNORE_FSMONITOR;
|
2006-05-19 18:56:35 +02:00
|
|
|
|
2014-01-27 15:45:08 +01:00
|
|
|
if (!refresh || ce_uptodate(ce))
|
2008-01-19 08:45:24 +01:00
|
|
|
return ce;
|
|
|
|
|
2017-09-22 18:35:40 +02:00
|
|
|
if (!ignore_fsmonitor)
|
|
|
|
refresh_fsmonitor(istate);
|
2008-05-30 14:38:35 +02:00
|
|
|
/*
|
2009-12-14 12:43:58 +01:00
|
|
|
* CE_VALID or CE_SKIP_WORKTREE means the user promised us
|
|
|
|
* that the change to the work tree does not matter and told
|
|
|
|
* us not to worry.
|
2008-05-30 14:38:35 +02:00
|
|
|
*/
|
2009-12-14 12:43:58 +01:00
|
|
|
if (!ignore_skip_worktree && ce_skip_worktree(ce)) {
|
|
|
|
ce_mark_uptodate(ce);
|
|
|
|
return ce;
|
|
|
|
}
|
2008-05-30 14:38:35 +02:00
|
|
|
if (!ignore_valid && (ce->ce_flags & CE_VALID)) {
|
|
|
|
ce_mark_uptodate(ce);
|
|
|
|
return ce;
|
|
|
|
}
|
2017-09-22 18:35:40 +02:00
|
|
|
if (!ignore_fsmonitor && (ce->ce_flags & CE_FSMONITOR_VALID)) {
|
|
|
|
ce_mark_uptodate(ce);
|
|
|
|
return ce;
|
|
|
|
}
|
2008-05-30 14:38:35 +02:00
|
|
|
|
2014-08-09 19:43:29 +02:00
|
|
|
if (has_symlink_leading_path(ce->name, ce_namelen(ce))) {
|
|
|
|
if (ignore_missing)
|
|
|
|
return ce;
|
|
|
|
if (err)
|
|
|
|
*err = ENOENT;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2006-07-26 06:32:18 +02:00
|
|
|
if (lstat(ce->name, &st) < 0) {
|
2014-01-27 15:45:07 +01:00
|
|
|
if (ignore_missing && errno == ENOENT)
|
|
|
|
return ce;
|
2007-04-02 06:34:34 +02:00
|
|
|
if (err)
|
|
|
|
*err = errno;
|
2006-07-26 06:32:18 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
2006-05-19 18:56:35 +02:00
|
|
|
|
2007-11-10 09:15:03 +01:00
|
|
|
changed = ie_match_stat(istate, ce, &st, options);
|
2011-11-18 12:11:08 +01:00
|
|
|
if (changed_ret)
|
|
|
|
*changed_ret = changed;
|
2006-05-19 18:56:35 +02:00
|
|
|
if (!changed) {
|
2007-11-10 09:15:03 +01:00
|
|
|
/*
|
|
|
|
* The path is unchanged. If we were told to ignore
|
|
|
|
* valid bit, then we did the actual stat check and
|
|
|
|
* found that the entry is unmodified. If the entry
|
|
|
|
* is not marked VALID, this is the place to mark it
|
|
|
|
* valid again, under "assume unchanged" mode.
|
|
|
|
*/
|
|
|
|
if (ignore_valid && assume_unchanged &&
|
2008-01-15 01:03:17 +01:00
|
|
|
!(ce->ce_flags & CE_VALID))
|
2006-05-19 18:56:35 +02:00
|
|
|
; /* mark this one VALID again */
|
2008-01-19 08:45:24 +01:00
|
|
|
else {
|
|
|
|
/*
|
|
|
|
* We do not mark the index itself "modified"
|
|
|
|
* because CE_UPTODATE flag is in-core only;
|
|
|
|
* we are not going to write this change out.
|
|
|
|
*/
|
2017-09-22 18:35:40 +02:00
|
|
|
if (!S_ISGITLINK(ce->ce_mode)) {
|
2010-01-24 09:10:20 +01:00
|
|
|
ce_mark_uptodate(ce);
|
mark_fsmonitor_valid(): mark the index as changed if needed
Without this bug fix, t7519's four "status doesn't detect unreported
modifications" test cases would fail occasionally (and, oddly enough,
*a lot* more frequently on Windows).
The reason is that these test cases intentionally use the side effect of
`git status` to re-write the index if any updates were detected: they
first clean the worktree, run `git status` to update the index as well
as show the output to the casual reader, then make the worktree dirty
again and expect no changes to reported if running with a mocked
fsmonitor hook.
The problem with this strategy was that the index was written during
said `git status` on the clean worktree for the *wrong* reason: not
because the index was marked as changed (it wasn't), but because the
recorded mtimes were racy with the index' own mtime.
As the mtime granularity on Windows is 100 nanoseconds (see e.g.
https://docs.microsoft.com/en-us/windows/desktop/SysInfo/file-times),
the mtimes of the files are often enough *not* racy with the index', so
that that `git status` call currently does not always update the index
(including the fsmonitor extension), causing the test case to fail.
The obvious fix: if we change *any* index entry's `CE_FSMONITOR_VALID`
flag, we should also mark the index as changed. That will cause the
index to be written upon `git status`, *including* an updated fsmonitor
extension.
Side note: Even though the reader might think that the t7519 issue
should be *much* more prevalent on Linux, given that the ext4 filesystem
(that seems to be used by every Linux distribution) stores mtimes in
nanosecond precision. However, ext4 uses `current_kernel_time()` (see
https://unix.stackexchange.com/questions/11599#comment762968_11599; it
is *amazingly* hard to find any proper source of information about such
ext4 questions) whose accuracy seems to depend on many factors but is
safely worse than the 100-nanosecond granularity of NTFS (again, it is
*horribly* hard to find anything remotely authoritative about this
question). So it seems that the racy index condition that hid the bug
fixed by this patch simply is a lot more likely on Linux than on
Windows. But not impossible ;-)
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-05-24 14:23:48 +02:00
|
|
|
mark_fsmonitor_valid(istate, ce);
|
2017-09-22 18:35:40 +02:00
|
|
|
}
|
2006-07-26 06:32:18 +02:00
|
|
|
return ce;
|
2008-01-19 08:45:24 +01:00
|
|
|
}
|
2006-05-19 18:56:35 +02:00
|
|
|
}
|
|
|
|
|
2007-11-10 09:15:03 +01:00
|
|
|
if (ie_modified(istate, ce, &st, options)) {
|
2007-04-02 06:34:34 +02:00
|
|
|
if (err)
|
|
|
|
*err = EINVAL;
|
2006-07-26 06:32:18 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
2006-05-19 18:56:35 +02:00
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
updated = make_empty_cache_entry(istate, ce_namelen(ce));
|
2018-03-15 16:25:20 +01:00
|
|
|
copy_cache_entry(updated, ce);
|
|
|
|
memcpy(updated->name, ce->name, ce->ce_namelen + 1);
|
2019-05-24 14:23:47 +02:00
|
|
|
fill_stat_cache_info(istate, updated, &st);
|
2007-11-10 09:15:03 +01:00
|
|
|
/*
|
|
|
|
* If ignore_valid is not set, we should leave CE_VALID bit
|
|
|
|
* alone. Otherwise, paths marked with --no-assume-unchanged
|
|
|
|
* (i.e. things to be edited) will reacquire CE_VALID bit
|
|
|
|
* automatically, which is not really what we want.
|
2006-05-19 18:56:35 +02:00
|
|
|
*/
|
2007-11-10 09:15:03 +01:00
|
|
|
if (!ignore_valid && assume_unchanged &&
|
2008-01-15 01:03:17 +01:00
|
|
|
!(ce->ce_flags & CE_VALID))
|
|
|
|
updated->ce_flags &= ~CE_VALID;
|
2006-05-19 18:56:35 +02:00
|
|
|
|
2014-06-13 14:19:27 +02:00
|
|
|
/* istate->cache_changed is updated in the caller */
|
2006-05-19 18:56:35 +02:00
|
|
|
return updated;
|
|
|
|
}
|
|
|
|
|
2009-08-21 10:57:59 +02:00
|
|
|
/*
 * Print one status line for "name" using the printf-style "fmt",
 * which is expected to consume a single string argument.
 *
 * In porcelain mode, "header_msg" (if any) is printed on a line of
 * its own ahead of the first entry; *first is cleared once that has
 * happened so the header appears only once.
 */
static void show_file(const char * fmt, const char * name, int in_porcelain,
		      int * first, const char *header_msg)
{
	if (in_porcelain) {
		if (*first && header_msg) {
			printf("%s\n", header_msg);
			*first = 0;
		}
	}
	printf(fmt, name);
}
|
|
|
|
|
2019-09-11 20:20:25 +02:00
|
|
|
int repo_refresh_and_write_index(struct repository *repo,
|
|
|
|
unsigned int refresh_flags,
|
|
|
|
unsigned int write_flags,
|
|
|
|
int gentle,
|
|
|
|
const struct pathspec *pathspec,
|
|
|
|
char *seen, const char *header_msg)
|
|
|
|
{
|
|
|
|
struct lock_file lock_file = LOCK_INIT;
|
|
|
|
int fd, ret = 0;
|
|
|
|
|
|
|
|
fd = repo_hold_locked_index(repo, &lock_file, 0);
|
|
|
|
if (!gentle && fd < 0)
|
|
|
|
return -1;
|
|
|
|
if (refresh_index(repo->index, refresh_flags, pathspec, seen, header_msg))
|
|
|
|
ret = 1;
|
|
|
|
if (0 <= fd && write_locked_index(repo->index, &lock_file, COMMIT_LOCK | write_flags))
|
|
|
|
ret = -1;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-07-14 10:35:54 +02:00
|
|
|
int refresh_index(struct index_state *istate, unsigned int flags,
|
|
|
|
const struct pathspec *pathspec,
|
2011-02-22 23:43:23 +01:00
|
|
|
char *seen, const char *header_msg)
|
2006-05-19 18:56:35 +02:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int has_errors = 0;
|
|
|
|
int really = (flags & REFRESH_REALLY) != 0;
|
|
|
|
int allow_unmerged = (flags & REFRESH_UNMERGED) != 0;
|
|
|
|
int quiet = (flags & REFRESH_QUIET) != 0;
|
|
|
|
int not_new = (flags & REFRESH_IGNORE_MISSING) != 0;
|
2008-05-14 19:03:45 +02:00
|
|
|
int ignore_submodules = (flags & REFRESH_IGNORE_SUBMODULES) != 0;
|
2009-08-21 10:57:59 +02:00
|
|
|
int first = 1;
|
|
|
|
int in_porcelain = (flags & REFRESH_IN_PORCELAIN);
|
2014-01-27 15:45:08 +01:00
|
|
|
unsigned int options = (CE_MATCH_REFRESH |
|
|
|
|
(really ? CE_MATCH_IGNORE_VALID : 0) |
|
2014-01-27 15:45:07 +01:00
|
|
|
(not_new ? CE_MATCH_IGNORE_MISSING : 0));
|
2011-11-18 12:11:28 +01:00
|
|
|
const char *modified_fmt;
|
2011-11-18 12:13:08 +01:00
|
|
|
const char *deleted_fmt;
|
|
|
|
const char *typechange_fmt;
|
|
|
|
const char *added_fmt;
|
2011-11-18 12:11:28 +01:00
|
|
|
const char *unmerged_fmt;
|
2018-09-15 19:56:04 +02:00
|
|
|
struct progress *progress = NULL;
|
|
|
|
|
|
|
|
if (flags & REFRESH_PROGRESS && isatty(2))
|
|
|
|
progress = start_delayed_progress(_("Refresh index"),
|
|
|
|
istate->cache_nr);
|
2006-05-19 18:56:35 +02:00
|
|
|
|
2018-08-18 16:41:22 +02:00
|
|
|
trace_performance_enter();
|
2018-11-10 06:16:06 +01:00
|
|
|
modified_fmt = in_porcelain ? "M\t%s\n" : "%s: needs update\n";
|
|
|
|
deleted_fmt = in_porcelain ? "D\t%s\n" : "%s: needs update\n";
|
|
|
|
typechange_fmt = in_porcelain ? "T\t%s\n" : "%s: needs update\n";
|
|
|
|
added_fmt = in_porcelain ? "A\t%s\n" : "%s: needs update\n";
|
|
|
|
unmerged_fmt = in_porcelain ? "U\t%s\n" : "%s: needs merge\n";
|
2018-10-29 21:41:59 +01:00
|
|
|
/*
|
|
|
|
* Use the multi-threaded preload_index() to refresh most of the
|
|
|
|
* cache entries quickly then in the single threaded loop below,
|
|
|
|
* we only have to do the special cases that are left.
|
|
|
|
*/
|
|
|
|
preload_index(istate, pathspec, 0);
|
2007-04-02 08:26:07 +02:00
|
|
|
for (i = 0; i < istate->cache_nr; i++) {
|
2018-02-14 19:59:45 +01:00
|
|
|
struct cache_entry *ce, *new_entry;
|
2007-04-02 06:34:34 +02:00
|
|
|
int cache_errno = 0;
|
2011-11-18 12:13:08 +01:00
|
|
|
int changed = 0;
|
2012-02-17 19:11:05 +01:00
|
|
|
int filtered = 0;
|
2007-04-02 06:34:34 +02:00
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
ce = istate->cache[i];
|
2008-05-14 19:03:45 +02:00
|
|
|
if (ignore_submodules && S_ISGITLINK(ce->ce_mode))
|
|
|
|
continue;
|
|
|
|
|
2018-09-21 17:57:25 +02:00
|
|
|
if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
|
2012-02-17 19:11:05 +01:00
|
|
|
filtered = 1;
|
|
|
|
|
2006-05-19 18:56:35 +02:00
|
|
|
if (ce_stage(ce)) {
|
2007-04-02 08:26:07 +02:00
|
|
|
while ((i < istate->cache_nr) &&
|
|
|
|
! strcmp(istate->cache[i]->name, ce->name))
|
2006-05-19 18:56:35 +02:00
|
|
|
i++;
|
|
|
|
i--;
|
|
|
|
if (allow_unmerged)
|
|
|
|
continue;
|
2012-02-17 19:11:05 +01:00
|
|
|
if (!filtered)
|
|
|
|
show_file(unmerged_fmt, ce->name, in_porcelain,
|
|
|
|
&first, header_msg);
|
2006-05-19 18:56:35 +02:00
|
|
|
has_errors = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2012-02-17 19:11:05 +01:00
|
|
|
if (filtered)
|
2007-08-11 23:59:01 +02:00
|
|
|
continue;
|
|
|
|
|
2018-02-14 19:59:45 +01:00
|
|
|
new_entry = refresh_cache_ent(istate, ce, options, &cache_errno, &changed);
|
|
|
|
if (new_entry == ce)
|
2006-05-19 18:56:35 +02:00
|
|
|
continue;
|
2018-09-15 19:56:04 +02:00
|
|
|
if (progress)
|
|
|
|
display_progress(progress, i);
|
2018-02-14 19:59:45 +01:00
|
|
|
if (!new_entry) {
|
2011-11-18 12:13:08 +01:00
|
|
|
const char *fmt;
|
|
|
|
|
2006-07-26 06:32:18 +02:00
|
|
|
if (really && cache_errno == EINVAL) {
|
2006-05-19 18:56:35 +02:00
|
|
|
/* If we are doing --really-refresh that
|
|
|
|
* means the index is not valid anymore.
|
|
|
|
*/
|
2008-01-15 01:03:17 +01:00
|
|
|
ce->ce_flags &= ~CE_VALID;
|
2014-06-13 14:19:39 +02:00
|
|
|
ce->ce_flags |= CE_UPDATE_IN_BASE;
|
2017-09-22 18:35:40 +02:00
|
|
|
mark_fsmonitor_invalid(istate, ce);
|
2014-06-13 14:19:27 +02:00
|
|
|
istate->cache_changed |= CE_ENTRY_CHANGED;
|
2006-05-19 18:56:35 +02:00
|
|
|
}
|
|
|
|
if (quiet)
|
|
|
|
continue;
|
2011-11-18 12:13:08 +01:00
|
|
|
|
|
|
|
if (cache_errno == ENOENT)
|
|
|
|
fmt = deleted_fmt;
|
2015-08-22 03:08:05 +02:00
|
|
|
else if (ce_intent_to_add(ce))
|
2011-11-18 12:13:08 +01:00
|
|
|
fmt = added_fmt; /* must be before other checks */
|
|
|
|
else if (changed & TYPE_CHANGED)
|
|
|
|
fmt = typechange_fmt;
|
|
|
|
else
|
|
|
|
fmt = modified_fmt;
|
|
|
|
show_file(fmt,
|
|
|
|
ce->name, in_porcelain, &first, header_msg);
|
2006-05-19 18:56:35 +02:00
|
|
|
has_errors = 1;
|
|
|
|
continue;
|
|
|
|
}
|
Create pathname-based hash-table lookup into index
This creates a hash index of every single file added to the index.
Right now that hash index isn't actually used for much: I implemented a
"cache_name_exists()" function that uses it to efficiently look up a
filename in the index without having to do the O(logn) binary search,
but quite frankly, that's not why this patch is interesting.
No, the whole and only reason to create the hash of the filenames in the
index is that by modifying the hash function, you can fairly easily do
things like making it always hash equivalent names into the same bucket.
That, in turn, means that suddenly questions like "does this name exist
in the index under an _equivalent_ name?" becomes much much cheaper.
Guiding principles behind this patch:
- it shouldn't be too costly. In fact, my primary goal here was to
actually speed up "git commit" with a fully populated kernel tree, by
being faster at checking whether a file already existed in the index. I
did succeed, but only barely:
Best before:
[torvalds@woody linux]$ time git commit > /dev/null
real 0m0.255s
user 0m0.168s
sys 0m0.088s
Best after:
[torvalds@woody linux]$ time ~/git/git commit > /dev/null
real 0m0.233s
user 0m0.144s
sys 0m0.088s
so some things are actually faster (~8%).
Caveat: that's really the best case. Other things are invariably going
to be slightly slower, since we populate that index cache, and quite
frankly, few things really use it to look things up.
That said, the cost is really quite small. The worst case is probably
doing a "git ls-files", which will do very little except populate the
index, and never actually looks anything up in it, just lists it.
Before:
[torvalds@woody linux]$ time git ls-files > /dev/null
real 0m0.016s
user 0m0.016s
sys 0m0.000s
After:
[torvalds@woody linux]$ time ~/git/git ls-files > /dev/null
real 0m0.021s
user 0m0.012s
sys 0m0.008s
and while the thing has really gotten relatively much slower, we're
still talking about something almost unmeasurable (eg 5ms). And that
really should be pretty much the worst case.
So we lose 5ms on one "benchmark", but win 22ms on another. Pick your
poison - this patch has the advantage that it will _likely_ speed up
the cases that are complex and expensive more than it slows down the
cases that are already so fast that nobody cares. But if you look at
relative speedups/slowdowns, it doesn't look so good.
- It should be simple and clean
The code may be a bit subtle (the reasons I do hash removal the way I
do etc), but it re-uses the existing hash.c files, so it really is
fairly small and straightforward apart from a few odd details.
Now, this patch on its own doesn't really do much, but I think it's worth
looking at, if only because if done correctly, the name hashing really can
make an improvement to the whole issue of "do we have a filename that
looks like this in the index already". And at least it gets real testing
by being used even by default (ie there is a real use-case for it even
without any insane filesystems).
NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just
not ashamed of it enough to really care. I took all the numbers out of my
nether regions - I'm sure it's good enough that it works in practice, but
the whole point was that you can make a really much fancier hash that
hashes characters not directly, but by their upper-case value or something
like that, and thus you get a case-insensitive hash, while still keeping
the name and the index itself totally case sensitive.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-23 03:41:14 +01:00
|
|
|
|
2018-02-14 19:59:45 +01:00
|
|
|
replace_index_entry(istate, i, new_entry);
|
2006-05-19 18:56:35 +02:00
|
|
|
}
|
2018-09-15 19:56:04 +02:00
|
|
|
if (progress) {
|
|
|
|
display_progress(progress, istate->cache_nr);
|
|
|
|
stop_progress(&progress);
|
|
|
|
}
|
2018-08-18 16:41:22 +02:00
|
|
|
trace_performance_leave("refresh index");
|
2006-05-19 18:56:35 +02:00
|
|
|
return has_errors;
|
|
|
|
}
|
|
|
|
|
2018-07-02 21:49:29 +02:00
|
|
|
/*
 * Refresh a single cache entry by delegating to refresh_cache_ent().
 *
 * The two trailing output arguments of refresh_cache_ent() (which
 * refresh_index() uses to receive the error code and the "changed"
 * bits) are passed as NULL, so only the resulting entry pointer is
 * reported back to the caller.
 */
struct cache_entry *refresh_cache_entry(struct index_state *istate,
					struct cache_entry *ce,
					unsigned int options)
{
	struct cache_entry *refreshed;

	refreshed = refresh_cache_ent(istate, ce, options, NULL, NULL);
	return refreshed;
}
|
|
|
|
|
2012-04-04 00:53:09 +02:00
|
|
|
|
|
|
|
/*****************************************************************
|
|
|
|
* Index File I/O
|
|
|
|
*****************************************************************/
|
|
|
|
|
2012-04-04 18:12:43 +02:00
|
|
|
#define INDEX_FORMAT_DEFAULT 3
|
|
|
|
|
2019-08-13 20:37:43 +02:00
|
|
|
static unsigned int get_index_format_default(struct repository *r)
|
2014-02-23 21:49:57 +01:00
|
|
|
{
|
|
|
|
char *envversion = getenv("GIT_INDEX_VERSION");
|
2014-02-23 21:49:59 +01:00
|
|
|
char *endp;
|
|
|
|
unsigned int version = INDEX_FORMAT_DEFAULT;
|
|
|
|
|
2014-02-23 21:49:57 +01:00
|
|
|
if (!envversion) {
|
2019-08-13 20:37:43 +02:00
|
|
|
prepare_repo_settings(r);
|
|
|
|
|
|
|
|
if (r->settings.index_version >= 0)
|
|
|
|
version = r->settings.index_version;
|
2014-02-23 21:49:59 +01:00
|
|
|
if (version < INDEX_FORMAT_LB || INDEX_FORMAT_UB < version) {
|
|
|
|
warning(_("index.version set, but the value is invalid.\n"
|
2014-02-23 21:49:57 +01:00
|
|
|
"Using version %i"), INDEX_FORMAT_DEFAULT);
|
2014-02-23 21:49:59 +01:00
|
|
|
return INDEX_FORMAT_DEFAULT;
|
2014-02-23 21:49:57 +01:00
|
|
|
}
|
|
|
|
return version;
|
|
|
|
}
|
2014-02-23 21:49:59 +01:00
|
|
|
|
|
|
|
version = strtoul(envversion, &endp, 10);
|
|
|
|
if (*endp ||
|
|
|
|
version < INDEX_FORMAT_LB || INDEX_FORMAT_UB < version) {
|
|
|
|
warning(_("GIT_INDEX_VERSION set, but the value is invalid.\n"
|
|
|
|
"Using version %i"), INDEX_FORMAT_DEFAULT);
|
|
|
|
version = INDEX_FORMAT_DEFAULT;
|
|
|
|
}
|
|
|
|
return version;
|
2014-02-23 21:49:57 +01:00
|
|
|
}
|
|
|
|
|
2012-04-04 00:53:09 +02:00
|
|
|
/*
|
|
|
|
* dev/ino/uid/gid/size are also just tracked to the low 32 bits
|
|
|
|
* Again - this is just a (very strong in practice) heuristic that
|
|
|
|
* the inode hasn't changed.
|
|
|
|
*
|
|
|
|
* We save the fields in big-endian order to allow using the
|
|
|
|
* index file over NFS transparently.
|
|
|
|
*/
|
|
|
|
struct ondisk_cache_entry {
|
|
|
|
struct cache_time ctime;
|
|
|
|
struct cache_time mtime;
|
2013-08-18 21:41:51 +02:00
|
|
|
uint32_t dev;
|
|
|
|
uint32_t ino;
|
|
|
|
uint32_t mode;
|
|
|
|
uint32_t uid;
|
|
|
|
uint32_t gid;
|
|
|
|
uint32_t size;
|
2019-02-19 01:05:24 +01:00
|
|
|
/*
|
|
|
|
* unsigned char hash[hashsz];
|
|
|
|
* uint16_t flags;
|
|
|
|
* if (flags & CE_EXTENDED)
|
|
|
|
* uint16_t flags2;
|
|
|
|
*/
|
|
|
|
unsigned char data[GIT_MAX_RAWSZ + 2 * sizeof(uint16_t)];
|
|
|
|
char name[FLEX_ARRAY];
|
2012-04-04 00:53:09 +02:00
|
|
|
};
|
|
|
|
|
2012-04-04 00:53:15 +02:00
|
|
|
/*
 * Size helpers for on-disk index entries.  These are only used for v3
 * or lower.
 *
 * align_padding_size(size, len): number of padding bytes (1..8) needed
 *	after size + len bytes to reach the next multiple of 8.  The
 *	expansion and its arguments are fully parenthesized so the macro
 *	can be used safely inside larger expressions.
 * align_flex_name(STRUCT, len): offset of STRUCT's data member plus
 *	len, rounded up the same way.
 * ondisk_cache_entry_size(len): align_flex_name() for
 *	struct ondisk_cache_entry.
 * ondisk_data_size(flags, len): hash size plus one or two 16-bit flag
 *	words (two when CE_EXTENDED is set in flags) plus len.
 * ondisk_data_size_max(len): ondisk_data_size() assuming CE_EXTENDED.
 * ondisk_ce_size(ce): total on-disk size of cache entry ce.
 */
#define align_padding_size(size, len) \
	((((size) + (len) + 8) & ~7) - ((size) + (len)))
#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,data) + (len) + 8) & ~7)
#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
#define ondisk_data_size(flags, len) (the_hash_algo->rawsz + \
	 (((flags) & CE_EXTENDED) ? 2 : 1) * sizeof(uint16_t) + (len))
#define ondisk_data_size_max(len) (ondisk_data_size(CE_EXTENDED, len))
#define ondisk_ce_size(ce) (ondisk_cache_entry_size(ondisk_data_size((ce)->ce_flags, ce_namelen(ce))))
|
2012-04-04 00:53:09 +02:00
|
|
|
|
2017-04-14 22:32:21 +02:00
|
|
|
/*
 * Verification knobs that fsck may switch on; both are zero (off)
 * unless explicitly set.
 *
 * verify_index_checksum: when non-zero, verify_hdr() also checks the
 *	index file's trailing checksum.
 * verify_ce_order: when non-zero, check_ce_order() validates the
 *	ordering of the cache entries.
 */
int verify_index_checksum;
int verify_ce_order;
|
|
|
|
|
2018-10-10 17:59:33 +02:00
|
|
|
static int verify_hdr(const struct cache_header *hdr, unsigned long size)
|
2005-04-08 00:13:13 +02:00
|
|
|
{
|
2018-02-01 03:18:45 +01:00
|
|
|
git_hash_ctx c;
|
|
|
|
unsigned char hash[GIT_MAX_RAWSZ];
|
2012-04-04 00:53:12 +02:00
|
|
|
int hdr_version;
|
2005-04-08 00:13:13 +02:00
|
|
|
|
2005-04-15 19:44:27 +02:00
|
|
|
if (hdr->hdr_signature != htonl(CACHE_SIGNATURE))
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("bad signature 0x%08x"), hdr->hdr_signature);
|
2012-04-04 00:53:12 +02:00
|
|
|
hdr_version = ntohl(hdr->hdr_version);
|
2013-02-22 13:09:24 +01:00
|
|
|
if (hdr_version < INDEX_FORMAT_LB || INDEX_FORMAT_UB < hdr_version)
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("bad index version %d"), hdr_version);
|
2017-04-14 22:32:21 +02:00
|
|
|
|
|
|
|
if (!verify_index_checksum)
|
|
|
|
return 0;
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
the_hash_algo->init_fn(&c);
|
|
|
|
the_hash_algo->update_fn(&c, hdr, size - the_hash_algo->rawsz);
|
|
|
|
the_hash_algo->final_fn(hash, &c);
|
2018-08-28 23:22:52 +02:00
|
|
|
if (!hasheq(hash, (unsigned char *)hdr + size - the_hash_algo->rawsz))
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("bad index file sha1 signature"));
|
2005-04-08 00:13:13 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
static int read_index_extension(struct index_state *istate,
|
2018-10-10 17:59:33 +02:00
|
|
|
const char *ext, const char *data, unsigned long sz)
|
2006-04-25 06:18:58 +02:00
|
|
|
{
|
|
|
|
switch (CACHE_EXT(ext)) {
|
|
|
|
case CACHE_EXT_TREE:
|
2007-04-02 08:26:07 +02:00
|
|
|
istate->cache_tree = cache_tree_read(data, sz);
|
2006-04-25 06:18:58 +02:00
|
|
|
break;
|
2009-12-25 09:30:51 +01:00
|
|
|
case CACHE_EXT_RESOLVE_UNDO:
|
|
|
|
istate->resolve_undo = resolve_undo_read(data, sz);
|
|
|
|
break;
|
2014-06-13 14:19:36 +02:00
|
|
|
case CACHE_EXT_LINK:
|
|
|
|
if (read_link_extension(istate, data, sz))
|
|
|
|
return -1;
|
|
|
|
break;
|
2015-03-08 11:12:34 +01:00
|
|
|
case CACHE_EXT_UNTRACKED:
|
|
|
|
istate->untracked = read_untracked_extension(data, sz);
|
|
|
|
break;
|
2017-09-22 18:35:40 +02:00
|
|
|
case CACHE_EXT_FSMONITOR:
|
|
|
|
read_fsmonitor_extension(istate, data, sz);
|
|
|
|
break;
|
2018-10-10 17:59:34 +02:00
|
|
|
case CACHE_EXT_ENDOFINDEXENTRIES:
|
2018-10-10 17:59:37 +02:00
|
|
|
case CACHE_EXT_INDEXENTRYOFFSETTABLE:
|
2018-10-10 17:59:34 +02:00
|
|
|
/* already handled in do_read_index() */
|
|
|
|
break;
|
2006-04-25 06:18:58 +02:00
|
|
|
default:
|
|
|
|
if (*ext < 'A' || 'Z' < *ext)
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("index uses %.4s extension, which we do not understand"),
|
2006-04-25 06:18:58 +02:00
|
|
|
ext);
|
2018-11-10 06:16:05 +01:00
|
|
|
fprintf_ln(stderr, _("ignoring %.4s extension"), ext);
|
2006-04-25 06:18:58 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-10-10 17:59:38 +02:00
|
|
|
static struct cache_entry *create_from_disk(struct mem_pool *ce_mem_pool,
|
|
|
|
unsigned int version,
|
2018-07-02 21:49:31 +02:00
|
|
|
struct ondisk_cache_entry *ondisk,
|
2012-04-04 00:53:15 +02:00
|
|
|
unsigned long *ent_size,
|
2018-09-26 21:54:36 +02:00
|
|
|
const struct cache_entry *previous_ce)
|
2008-01-15 01:03:17 +01:00
|
|
|
{
|
2011-10-24 23:59:14 +02:00
|
|
|
struct cache_entry *ce;
|
2008-01-19 08:42:00 +01:00
|
|
|
size_t len;
|
2008-10-01 06:04:01 +02:00
|
|
|
const char *name;
|
2019-02-19 01:05:24 +01:00
|
|
|
const unsigned hashsz = the_hash_algo->rawsz;
|
|
|
|
const uint16_t *flagsp = (const uint16_t *)(ondisk->data + hashsz);
|
2011-10-24 23:59:14 +02:00
|
|
|
unsigned int flags;
|
2018-11-03 09:48:49 +01:00
|
|
|
size_t copy_len = 0;
|
2018-09-26 21:54:36 +02:00
|
|
|
/*
|
|
|
|
* Adjacent cache entries tend to share the leading paths, so it makes
|
|
|
|
* sense to only store the differences in later entries. In the v4
|
|
|
|
* on-disk format of the index, each on-disk cache entry stores the
|
|
|
|
* number of bytes to be stripped from the end of the previous name,
|
|
|
|
* and the bytes to append to the result, to come up with its name.
|
|
|
|
*/
|
2018-10-10 17:59:38 +02:00
|
|
|
int expand_name_field = version == 4;
|
2008-01-19 08:42:00 +01:00
|
|
|
|
2008-01-15 01:03:17 +01:00
|
|
|
/* On-disk flags are just 16 bits */
|
2019-02-19 01:05:24 +01:00
|
|
|
flags = get_be16(flagsp);
|
2011-10-24 23:59:14 +02:00
|
|
|
len = flags & CE_NAMEMASK;
|
2008-01-19 08:42:00 +01:00
|
|
|
|
2011-10-24 23:59:14 +02:00
|
|
|
if (flags & CE_EXTENDED) {
|
2008-10-01 06:04:01 +02:00
|
|
|
int extended_flags;
|
2019-02-19 01:05:24 +01:00
|
|
|
extended_flags = get_be16(flagsp + 1) << 16;
|
2008-10-01 06:04:01 +02:00
|
|
|
/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
|
|
|
|
if (extended_flags & ~CE_EXTENDED_FLAGS)
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("unknown index entry format 0x%08x"), extended_flags);
|
2011-10-24 23:59:14 +02:00
|
|
|
flags |= extended_flags;
|
2019-02-19 01:05:24 +01:00
|
|
|
name = (const char *)(flagsp + 2);
|
2008-10-01 06:04:01 +02:00
|
|
|
}
|
|
|
|
else
|
2019-02-19 01:05:24 +01:00
|
|
|
name = (const char *)(flagsp + 1);
|
2008-10-01 06:04:01 +02:00
|
|
|
|
2018-09-26 21:54:36 +02:00
|
|
|
if (expand_name_field) {
|
|
|
|
const unsigned char *cp = (const unsigned char *)name;
|
|
|
|
size_t strip_len, previous_len;
|
2012-04-04 00:53:15 +02:00
|
|
|
|
2019-11-05 18:07:23 +01:00
|
|
|
/* If we're at the beginning of a block, ignore the previous name */
|
2018-09-26 21:54:36 +02:00
|
|
|
strip_len = decode_varint(&cp);
|
2018-10-10 17:59:38 +02:00
|
|
|
if (previous_ce) {
|
|
|
|
previous_len = previous_ce->ce_namelen;
|
|
|
|
if (previous_len < strip_len)
|
2018-09-26 21:54:36 +02:00
|
|
|
die(_("malformed name field in the index, near path '%s'"),
|
2018-10-10 17:59:38 +02:00
|
|
|
previous_ce->name);
|
|
|
|
copy_len = previous_len - strip_len;
|
2018-09-26 21:54:36 +02:00
|
|
|
}
|
|
|
|
name = (const char *)cp;
|
|
|
|
}
|
2012-04-04 00:53:15 +02:00
|
|
|
|
2018-09-26 21:54:36 +02:00
|
|
|
if (len == CE_NAMEMASK) {
|
|
|
|
len = strlen(name);
|
|
|
|
if (expand_name_field)
|
|
|
|
len += copy_len;
|
|
|
|
}
|
2012-04-04 00:53:15 +02:00
|
|
|
|
2018-10-10 17:59:38 +02:00
|
|
|
ce = mem_pool__ce_alloc(ce_mem_pool, len);
|
2018-09-26 21:54:36 +02:00
|
|
|
|
|
|
|
ce->ce_stat_data.sd_ctime.sec = get_be32(&ondisk->ctime.sec);
|
|
|
|
ce->ce_stat_data.sd_mtime.sec = get_be32(&ondisk->mtime.sec);
|
|
|
|
ce->ce_stat_data.sd_ctime.nsec = get_be32(&ondisk->ctime.nsec);
|
|
|
|
ce->ce_stat_data.sd_mtime.nsec = get_be32(&ondisk->mtime.nsec);
|
|
|
|
ce->ce_stat_data.sd_dev = get_be32(&ondisk->dev);
|
|
|
|
ce->ce_stat_data.sd_ino = get_be32(&ondisk->ino);
|
|
|
|
ce->ce_mode = get_be32(&ondisk->mode);
|
|
|
|
ce->ce_stat_data.sd_uid = get_be32(&ondisk->uid);
|
|
|
|
ce->ce_stat_data.sd_gid = get_be32(&ondisk->gid);
|
|
|
|
ce->ce_stat_data.sd_size = get_be32(&ondisk->size);
|
|
|
|
ce->ce_flags = flags & ~CE_NAMEMASK;
|
|
|
|
ce->ce_namelen = len;
|
|
|
|
ce->index = 0;
|
2019-02-19 01:05:24 +01:00
|
|
|
hashcpy(ce->oid.hash, ondisk->data);
|
|
|
|
memcpy(ce->name, name, len);
|
|
|
|
ce->name[len] = '\0';
|
2012-04-04 00:53:15 +02:00
|
|
|
|
2018-09-26 21:54:36 +02:00
|
|
|
if (expand_name_field) {
|
|
|
|
if (copy_len)
|
|
|
|
memcpy(ce->name, previous_ce->name, copy_len);
|
|
|
|
memcpy(ce->name + copy_len, name, len + 1 - copy_len);
|
|
|
|
*ent_size = (name - ((char *)ondisk)) + len + 1 - copy_len;
|
|
|
|
} else {
|
|
|
|
memcpy(ce->name, name, len + 1);
|
|
|
|
*ent_size = ondisk_ce_size(ce);
|
2012-04-04 00:53:15 +02:00
|
|
|
}
|
2011-10-24 23:59:14 +02:00
|
|
|
return ce;
|
Create pathname-based hash-table lookup into index
This creates a hash index of every single file added to the index.
Right now that hash index isn't actually used for much: I implemented a
"cache_name_exists()" function that uses it to efficiently look up a
filename in the index without having to do the O(logn) binary search,
but quite frankly, that's not why this patch is interesting.
No, the whole and only reason to create the hash of the filenames in the
index is that by modifying the hash function, you can fairly easily do
things like making it always hash equivalent names into the same bucket.
That, in turn, means that suddenly questions like "does this name exist
in the index under an _equivalent_ name?" becomes much much cheaper.
Guiding principles behind this patch:
- it shouldn't be too costly. In fact, my primary goal here was to
actually speed up "git commit" with a fully populated kernel tree, by
being faster at checking whether a file already existed in the index. I
did succeed, but only barely:
Best before:
[torvalds@woody linux]$ time git commit > /dev/null
real 0m0.255s
user 0m0.168s
sys 0m0.088s
Best after:
[torvalds@woody linux]$ time ~/git/git commit > /dev/null
real 0m0.233s
user 0m0.144s
sys 0m0.088s
so some things are actually faster (~8%).
Caveat: that's really the best case. Other things are invariably going
to be slightly slower, since we populate that index cache, and quite
frankly, few things really use it to look things up.
That said, the cost is really quite small. The worst case is probably
doing a "git ls-files", which will do very little except puopulate the
index, and never actually looks anything up in it, just lists it.
Before:
[torvalds@woody linux]$ time git ls-files > /dev/null
real 0m0.016s
user 0m0.016s
sys 0m0.000s
After:
[torvalds@woody linux]$ time ~/git/git ls-files > /dev/null
real 0m0.021s
user 0m0.012s
sys 0m0.008s
and while the thing has really gotten relatively much slower, we're
still talking about something almost unmeasurable (eg 5ms). And that
really should be pretty much the worst case.
So we lose 5ms on one "benchmark", but win 22ms on another. Pick your
poison - this patch has the advantage that it will _likely_ speed up
the cases that are complex and expensive more than it slows down the
cases that are already so fast that nobody cares. But if you look at
relative speedups/slowdowns, it doesn't look so good.
- It should be simple and clean
The code may be a bit subtle (the reasons I do hash removal the way I
do etc), but it re-uses the existing hash.c files, so it really is
fairly small and straightforward apart from a few odd details.
Now, this patch on its own doesn't really do much, but I think it's worth
looking at, if only because if done correctly, the name hashing really can
make an improvement to the whole issue of "do we have a filename that
looks like this in the index already". And at least it gets real testing
by being used even by default (ie there is a real use-case for it even
without any insane filesystems).
NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just
not ashamed of it enough to really care. I took all the numbers out of my
nether regions - I'm sure it's good enough that it works in practice, but
the whole point was that you can make a really much fancier hash that
hashes characters not directly, but by their upper-case value or something
like that, and thus you get a case-insensitive hash, while still keeping
the name and the index itself totally case sensitive.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-23 03:41:14 +01:00
|
|
|
}
|
|
|
|
|
2015-03-20 22:43:14 +01:00
|
|
|
static void check_ce_order(struct index_state *istate)
|
2014-08-29 10:54:41 +02:00
|
|
|
{
|
2015-03-20 22:43:14 +01:00
|
|
|
unsigned int i;
|
|
|
|
|
2017-10-18 16:27:25 +02:00
|
|
|
if (!verify_ce_order)
|
|
|
|
return;
|
|
|
|
|
2015-03-20 22:43:14 +01:00
|
|
|
for (i = 1; i < istate->cache_nr; i++) {
|
|
|
|
struct cache_entry *ce = istate->cache[i - 1];
|
|
|
|
struct cache_entry *next_ce = istate->cache[i];
|
|
|
|
int name_compare = strcmp(ce->name, next_ce->name);
|
|
|
|
|
|
|
|
if (0 < name_compare)
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("unordered stage entries in index"));
|
2015-03-20 22:43:14 +01:00
|
|
|
if (!name_compare) {
|
|
|
|
if (!ce_stage(ce))
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("multiple stage entries for merged file '%s'"),
|
2015-03-20 22:43:14 +01:00
|
|
|
ce->name);
|
|
|
|
if (ce_stage(ce) > ce_stage(next_ce))
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("unordered stage entries for '%s'"),
|
2015-03-20 22:43:14 +01:00
|
|
|
ce->name);
|
|
|
|
}
|
2014-08-29 10:54:41 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy's fixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
static void tweak_untracked_cache(struct index_state *istate)
|
|
|
|
{
|
2019-08-13 20:37:46 +02:00
|
|
|
struct repository *r = the_repository;
|
|
|
|
|
|
|
|
prepare_repo_settings(r);
|
|
|
|
|
|
|
|
if (r->settings.core_untracked_cache == UNTRACKED_CACHE_REMOVE) {
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
remove_untracked_cache(istate);
|
2019-08-13 20:37:46 +02:00
|
|
|
return;
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
}
|
2019-08-13 20:37:46 +02:00
|
|
|
|
|
|
|
if (r->settings.core_untracked_cache == UNTRACKED_CACHE_WRITE)
|
|
|
|
add_untracked_cache(istate);
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
}
|
|
|
|
|
2017-02-27 19:00:02 +01:00
|
|
|
/*
 * Apply the split-index configuration to `istate`.
 *
 * git_config_get_split_index() yields 0 for "false" (the split index
 * is removed) and 1 for "true" (the split index is added).  -1 (unset)
 * and any other value leave the index untouched.
 */
static void tweak_split_index(struct index_state *istate)
{
	int setting = git_config_get_split_index();

	if (setting == 0)
		remove_split_index(istate);
	else if (setting == 1)
		add_split_index(istate);
	/* unset (-1) or unknown value: do nothing */
}
|
|
|
|
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy's fixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
/*
 * Going by its name, this is the hook run once an index has been read
 * into istate. It calls, in this order, check_ce_order(),
 * tweak_untracked_cache(), tweak_split_index() and tweak_fsmonitor(),
 * handing each of them the same istate. Nothing is returned.
 */
static void post_read_index_from(struct index_state *istate)
|
|
|
|
{
|
|
|
|
check_ce_order(istate);
|
|
|
|
tweak_untracked_cache(istate);
|
2017-02-27 19:00:02 +01:00
|
|
|
tweak_split_index(istate);
|
2017-09-22 18:35:40 +02:00
|
|
|
tweak_fsmonitor(istate);
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
}
|
|
|
|
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a significant portion of the time
is spent in malloc() calls. This can be mitigated by allocating a
large block of memory and managing it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
/*
 * Estimate how many bytes are needed to hold `entries` cache entries
 * when the on-disk size cannot be used as a guide: every entry is
 * budgeted sizeof(struct cache_entry) plus CACHE_ENTRY_PATH_LENGTH.
 * load_all_cache_entries() uses this to size the memory pool for
 * version 4 indexes.
 */
static size_t estimate_cache_size_from_compressed(unsigned int entries)
|
|
|
|
{
|
|
|
|
return entries * (sizeof(struct cache_entry) + CACHE_ENTRY_PATH_LENGTH);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Estimate the in-memory size of `entries` cache entries from the size
 * of their on-disk form.
 *
 * per_entry starts as the difference between sizeof(struct cache_entry)
 * and sizeof(struct ondisk_cache_entry) and is then grown by
 * align_padding_size(per_entry, 0). The result is ondisk_size plus
 * entries times that per-entry overhead.
 *
 * load_all_cache_entries() calls this for indexes whose version is
 * not 4, passing the size of the mapped index file as ondisk_size.
 */
static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
|
|
|
|
{
|
|
|
|
long per_entry = sizeof(struct cache_entry) - sizeof(struct ondisk_cache_entry);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Account for potential alignment differences.
|
|
|
|
*/
|
msvc: avoid using minus operator on unsigned types
MSVC complains about this with `-Wall`, which can be taken as a sign
that this is indeed a real bug. The symptom is:
C4146: unary minus operator applied to unsigned type, result
still unsigned
Let's avoid this warning in the minimal way, e.g. writing `-1 -
<unsigned value>` instead of `-<unsigned value> - 1`.
Note that the change in the `estimate_cache_size()` function is
needed because MSVC considers the "return type" of the `sizeof()`
operator to be `size_t`, i.e. unsigned, and therefore it cannot be
negated using the unary minus operator.
Even worse, that arithmetic is doing extra work, in vain. We want to
calculate the entry extra cache size as the difference between the
size of the `cache_entry` structure minus the size of the
`ondisk_cache_entry` structure, padded to the appropriate alignment
boundary.
To that end, we start by assigning that difference to the `per_entry`
variable, and then abuse the `len` parameter of the
`align_padding_size()` macro to take the negative size of the ondisk
entry size. Essentially, we try to avoid passing the already calculated
difference to that macro by passing the operands of that difference
instead, when the macro expects operands of an addition:
#define align_padding_size(size, len) \
((size + (len) + 8) & ~7) - (size + len)
Currently, we pass A and -B to that macro instead of passing A - B and
0, where A - B is already stored in the `per_entry` variable, ready to
be used.
This is neither necessary, nor intuitive. Let's fix this, and have code
that is both easier to read and that also does not trigger MSVC's
warning.
While at it, we take care of reporting overflows (which are unlikely,
but hey, defensive programming is good!).
We _also_ take pains of casting the unsigned value to signed: otherwise,
the signed operand (i.e. the `-1`) would be cast to unsigned before
doing the arithmetic.
Helped-by: Denton Liu <liu.denton@gmail.com>
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-10-04 17:09:26 +02:00
|
|
|
per_entry += align_padding_size(per_entry, 0);
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this stae.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
return ondisk_size + entries * per_entry;
|
|
|
|
}
|
|
|
|
|
2018-10-10 17:59:37 +02:00
|
|
|
/*
 * Describes one block of cache entries inside the index file.
 * load_cache_entries_thread() reads one such record per block and
 * passes its two fields on to load_cache_entry_block().
 */
struct index_entry_offset
|
|
|
|
{
|
|
|
|
/* starting byte offset into index file, count of index entries in this block */
|
|
|
|
int offset, nr;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * A table of index_entry_offset records, one per block of cache
 * entries. It is what read_ieot_extension() returns and what
 * write_ieot_extension() takes.
 */
struct index_entry_offset_table
|
|
|
|
{
|
|
|
|
int nr; /* number of blocks; used as the bound on entries[] by the threaded loader */
|
|
|
|
struct index_entry_offset entries[FLEX_ARRAY]; /* presumably a trailing flexible array; FLEX_ARRAY is defined elsewhere */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Forward declarations of the readers and writers for the IEOT and
 * EOIE index extensions. They are static, so their definitions live
 * elsewhere in this file. Both readers take the mapped index file and
 * its size; both writers append to a strbuf.
 */
static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset);
|
|
|
|
static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot);
|
|
|
|
|
2018-10-10 17:59:34 +02:00
|
|
|
static size_t read_eoie_extension(const char *mmap, size_t mmap_size);
|
|
|
|
static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset);
|
|
|
|
|
2018-10-10 17:59:36 +02:00
|
|
|
/*
 * Argument bundle for the load_index_extensions() function, which
 * receives a pointer to it through its void * parameter.
 */
struct load_index_extensions
|
|
|
|
{
|
|
|
|
pthread_t pthread; /* presumably the handle of the thread running the loader; not read by the loader itself */
|
|
|
|
struct index_state *istate; /* index handed to read_index_extension() */
|
|
|
|
const char *mmap; /* start of the mapped index file */
|
|
|
|
size_t mmap_size; /* size of that mapping in bytes */
|
|
|
|
unsigned long src_offset; /* byte offset in mmap where extension parsing starts */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Parse the extension sections of a mapped index file.
 *
 * _data points to a struct load_index_extensions. Starting at
 * p->src_offset, each section is read as an 8 byte header (4 byte name
 * followed by a 4 byte big-endian length) plus that many bytes of
 * payload, and is handed to read_index_extension(). The walk stops
 * once fewer than 8 bytes remain in front of the last
 * the_hash_algo->rawsz bytes of the mapping (presumably the trailing
 * checksum).
 *
 * If read_index_extension() returns a negative value the mapping is
 * released with munmap() and the process dies. Otherwise NULL is
 * returned. The void * in and out shape presumably lets this serve as
 * a thread start routine.
 *
 * NOTE(review): extsize comes straight from the file and is not
 * compared against the remaining mapped bytes in this function;
 * whether another layer guarantees that is not visible here.
 */
static void *load_index_extensions(void *_data)
|
|
|
|
{
|
|
|
|
struct load_index_extensions *p = _data;
|
|
|
|
unsigned long src_offset = p->src_offset;
|
|
|
|
|
|
|
|
while (src_offset <= p->mmap_size - the_hash_algo->rawsz - 8) {
|
|
|
|
/* After an array of active_nr index entries,
|
|
|
|
* there can be arbitrary number of extended
|
|
|
|
* sections, each of which is prefixed with
|
|
|
|
* extension name (4-byte) and section length
|
|
|
|
* in 4-byte network byte order.
|
|
|
|
*/
|
|
|
|
uint32_t extsize = get_be32(p->mmap + src_offset + 4);
|
|
|
|
if (read_index_extension(p->istate,
|
|
|
|
p->mmap + src_offset,
|
|
|
|
p->mmap + src_offset + 8,
|
|
|
|
extsize) < 0) {
|
|
|
|
munmap((void *)p->mmap, p->mmap_size);
|
|
|
|
die(_("index file corrupt"));
|
|
|
|
}
|
|
|
|
src_offset += 8;
|
|
|
|
src_offset += extsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-10-10 17:59:38 +02:00
|
|
|
/*
|
|
|
|
* A helper function that will load the specified range of cache entries
|
|
|
|
* from the memory mapped file and add them to the given index.
|
|
|
|
*
* istate: index that receives the entries through set_index_entry();
* its version field is passed to create_from_disk().
* ce_mem_pool: memory pool passed to create_from_disk().
* offset, nr: first index slot to fill and the number of entries to read;
* slots offset up to offset + nr - 1 are set.
* mmap, start_offset: base of the mapped index file and the byte offset
* of the first on-disk entry of the range.
* previous_ce: passed to create_from_disk() for the first entry; every
* later entry is given the entry created just before it.
*
* Returns the number of bytes consumed, that is the sum of the sizes
* reported by create_from_disk() for the entries read.
*/
|
|
|
|
static unsigned long load_cache_entry_block(struct index_state *istate,
|
|
|
|
struct mem_pool *ce_mem_pool, int offset, int nr, const char *mmap,
|
|
|
|
unsigned long start_offset, const struct cache_entry *previous_ce)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
unsigned long src_offset = start_offset;
|
|
|
|
|
|
|
|
for (i = offset; i < offset + nr; i++) {
|
|
|
|
struct ondisk_cache_entry *disk_ce;
|
|
|
|
struct cache_entry *ce;
|
|
|
|
unsigned long consumed;
|
|
|
|
|
|
|
|
disk_ce = (struct ondisk_cache_entry *)(mmap + src_offset);
|
|
|
|
ce = create_from_disk(ce_mem_pool, istate->version, disk_ce, &consumed, previous_ce);
|
|
|
|
set_index_entry(istate, i, ce);
|
|
|
|
|
|
|
|
src_offset += consumed;
|
|
|
|
previous_ce = ce;
|
|
|
|
}
|
|
|
|
return src_offset - start_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Load every cache entry of the index in a single pass.
 *
 * A new mem_pool is allocated with xmalloc() and stored in
 * istate->ce_mem_pool; whatever pointer was stored there before is
 * overwritten and not freed here. The pool is initialised with
 * estimate_cache_size_from_compressed() for version 4 indexes and with
 * estimate_cache_size(mmap_size, ...) for every other version.
 *
 * All istate->cache_nr entries are then read by
 * load_cache_entry_block(), starting at byte src_offset of mmap,
 * filling index slots from 0 and with no previous entry.
 *
 * Returns the value of load_cache_entry_block(), i.e. the number of
 * bytes of mmap taken up by the entries.
 */
static unsigned long load_all_cache_entries(struct index_state *istate,
|
|
|
|
const char *mmap, size_t mmap_size, unsigned long src_offset)
|
|
|
|
{
|
|
|
|
unsigned long consumed;
|
|
|
|
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
istate->ce_mem_pool = xmalloc(sizeof(*istate->ce_mem_pool));
|
2018-10-10 17:59:38 +02:00
|
|
|
if (istate->version == 4) {
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
mem_pool_init(istate->ce_mem_pool,
|
2018-10-10 17:59:38 +02:00
|
|
|
estimate_cache_size_from_compressed(istate->cache_nr));
|
|
|
|
} else {
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
mem_pool_init(istate->ce_mem_pool,
|
2018-10-10 17:59:38 +02:00
|
|
|
estimate_cache_size(mmap_size, istate->cache_nr));
|
|
|
|
}
|
|
|
|
|
|
|
|
consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
|
|
|
|
0, istate->cache_nr, mmap, src_offset, NULL);
|
|
|
|
return consumed;
|
|
|
|
}
|
|
|
|
|
2018-10-10 17:59:37 +02:00
|
|
|
/*
|
|
|
|
* Mostly randomly chosen maximum thread counts: we
|
|
|
|
* cap the parallelism to online_cpus() threads, and we want
|
|
|
|
* to have at least 10000 cache entries per thread for it to
|
|
|
|
* be worth starting a thread.
|
|
|
|
*
* THREAD_COST is that per-thread minimum, counted in cache entries.
*/
|
|
|
|
|
|
|
|
#define THREAD_COST (10000)
|
|
|
|
|
2018-10-10 17:59:38 +02:00
|
|
|
/*
 * Per-thread work description and result slot for
 * load_cache_entries_thread(), which receives a pointer to it through
 * its void * parameter.
 */
struct load_cache_entries_thread_data
|
|
|
|
{
|
|
|
|
pthread_t pthread; /* presumably the handle of the worker thread; not read by the worker itself */
|
|
|
|
struct index_state *istate; /* index that receives the loaded entries */
|
|
|
|
struct mem_pool *ce_mem_pool; /* pool passed to load_cache_entry_block() for this thread */
|
|
|
|
int offset; /* next index slot to fill; advanced by the worker after each block */
|
|
|
|
const char *mmap; /* start of the mapped index file */
|
|
|
|
struct index_entry_offset_table *ieot; /* block table the worker indexes with ieot_start and ieot_blocks */
|
|
|
|
int ieot_start; /* starting index into the ieot array */
|
|
|
|
int ieot_blocks; /* count of ieot entries to process */
|
|
|
|
unsigned long consumed; /* return # of bytes in index file processed */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A thread proc to run the load_cache_entries() computation
|
|
|
|
* across multiple background threads.
|
|
|
|
*
* _data points to a struct load_cache_entries_thread_data. For each of
* the p->ieot_blocks table records starting at p->ieot_start, the block
* is loaded with load_cache_entry_block(): its entries are read from the
* file offset recorded in the table and stored from index slot p->offset
* onwards, with no previous entry passed in. After each block
* p->offset is advanced by the number of entries in the block and the
* bytes read are added to p->consumed. Always returns NULL.
*/
|
|
|
|
static void *load_cache_entries_thread(void *_data)
|
|
|
|
{
|
|
|
|
struct load_cache_entries_thread_data *p = _data;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* iterate across all ieot blocks assigned to this thread */
|
|
|
|
for (i = p->ieot_start; i < p->ieot_start + p->ieot_blocks; i++) {
|
|
|
|
p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
|
|
|
|
p->offset, p->ieot->entries[i].nr, p->mmap, p->ieot->entries[i].offset, NULL);
|
|
|
|
p->offset += p->ieot->entries[i].nr;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long load_cache_entries_threaded(struct index_state *istate, const char *mmap, size_t mmap_size,
|
2019-05-09 23:29:44 +02:00
|
|
|
int nr_threads, struct index_entry_offset_table *ieot)
|
2018-10-10 17:59:38 +02:00
|
|
|
{
|
|
|
|
int i, offset, ieot_blocks, ieot_start, err;
|
|
|
|
struct load_cache_entries_thread_data *data;
|
|
|
|
unsigned long consumed = 0;
|
|
|
|
|
|
|
|
/* a little sanity checking */
|
|
|
|
if (istate->name_hash_initialized)
|
|
|
|
BUG("the name hash isn't thread safe");
|
|
|
|
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
istate->ce_mem_pool = xmalloc(sizeof(*istate->ce_mem_pool));
|
|
|
|
mem_pool_init(istate->ce_mem_pool, 0);
|
2018-10-10 17:59:38 +02:00
|
|
|
|
|
|
|
/* ensure we have no more threads than we have blocks to process */
|
|
|
|
if (nr_threads > ieot->nr)
|
|
|
|
nr_threads = ieot->nr;
|
|
|
|
data = xcalloc(nr_threads, sizeof(*data));
|
|
|
|
|
|
|
|
offset = ieot_start = 0;
|
|
|
|
ieot_blocks = DIV_ROUND_UP(ieot->nr, nr_threads);
|
|
|
|
for (i = 0; i < nr_threads; i++) {
|
|
|
|
struct load_cache_entries_thread_data *p = &data[i];
|
|
|
|
int nr, j;
|
|
|
|
|
|
|
|
if (ieot_start + ieot_blocks > ieot->nr)
|
|
|
|
ieot_blocks = ieot->nr - ieot_start;
|
|
|
|
|
|
|
|
p->istate = istate;
|
|
|
|
p->offset = offset;
|
|
|
|
p->mmap = mmap;
|
|
|
|
p->ieot = ieot;
|
|
|
|
p->ieot_start = ieot_start;
|
|
|
|
p->ieot_blocks = ieot_blocks;
|
|
|
|
|
|
|
|
/* create a mem_pool for each thread */
|
|
|
|
nr = 0;
|
|
|
|
for (j = p->ieot_start; j < p->ieot_start + p->ieot_blocks; j++)
|
|
|
|
nr += p->ieot->entries[j].nr;
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
istate->ce_mem_pool = xmalloc(sizeof(*istate->ce_mem_pool));
|
2018-10-10 17:59:38 +02:00
|
|
|
if (istate->version == 4) {
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
mem_pool_init(p->ce_mem_pool,
|
2018-10-10 17:59:38 +02:00
|
|
|
estimate_cache_size_from_compressed(nr));
|
|
|
|
} else {
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
mem_pool_init(p->ce_mem_pool,
|
2018-10-10 17:59:38 +02:00
|
|
|
estimate_cache_size(mmap_size, nr));
|
|
|
|
}
|
|
|
|
|
|
|
|
err = pthread_create(&p->pthread, NULL, load_cache_entries_thread, p);
|
|
|
|
if (err)
|
|
|
|
die(_("unable to create load_cache_entries thread: %s"), strerror(err));
|
|
|
|
|
|
|
|
/* increment by the number of cache entries in the ieot block being processed */
|
|
|
|
for (j = 0; j < ieot_blocks; j++)
|
|
|
|
offset += ieot->entries[ieot_start + j].nr;
|
|
|
|
ieot_start += ieot_blocks;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < nr_threads; i++) {
|
|
|
|
struct load_cache_entries_thread_data *p = &data[i];
|
|
|
|
|
|
|
|
err = pthread_join(p->pthread, NULL);
|
|
|
|
if (err)
|
|
|
|
die(_("unable to join load_cache_entries thread: %s"), strerror(err));
|
|
|
|
mem_pool_combine(istate->ce_mem_pool, p->ce_mem_pool);
|
|
|
|
consumed += p->consumed;
|
|
|
|
}
|
|
|
|
|
|
|
|
free(data);
|
|
|
|
|
|
|
|
return consumed;
|
|
|
|
}
|
|
|
|
|
2006-07-26 06:32:18 +02:00
|
|
|
/* remember to discard_cache() before reading a different cache! */
|
2014-06-13 14:19:51 +02:00
|
|
|
int do_read_index(struct index_state *istate, const char *path, int must_exist)
|
2005-04-08 00:13:13 +02:00
|
|
|
{
|
2018-10-10 17:59:38 +02:00
|
|
|
int fd;
|
2005-04-08 00:13:13 +02:00
|
|
|
struct stat st;
|
2011-10-24 23:59:14 +02:00
|
|
|
unsigned long src_offset;
|
2018-10-10 17:59:33 +02:00
|
|
|
const struct cache_header *hdr;
|
|
|
|
const char *mmap;
|
2008-01-15 01:03:17 +01:00
|
|
|
size_t mmap_size;
|
2018-10-10 17:59:36 +02:00
|
|
|
struct load_index_extensions p;
|
|
|
|
size_t extension_offset = 0;
|
2018-10-10 17:59:38 +02:00
|
|
|
int nr_threads, cpus;
|
|
|
|
struct index_entry_offset_table *ieot = NULL;
|
2005-04-08 00:13:13 +02:00
|
|
|
|
unpack_trees(): protect the handcrafted in-core index from read_cache()
unpack_trees() rebuilds the in-core index from scratch by allocating a new
structure and finishing it off by copying the built one to the final
index.
The resulting in-core index is Ok for most use, but read_cache() does not
recognize it as such. The function is meant to be no-op if you already
have loaded the index, until you call discard_cache().
This change the way read_cache() detects an already initialized in-core
index, by introducing an extra bit, and marks the handcrafted in-core
index as initialized, to avoid this problem.
A better fix in the longer term would be to change the read_cache() API so
that it will always discard and re-read from the on-disk index to avoid
confusion. But there are higher level API that have relied on the current
semantics, and they and their users all need to get converted, which is
outside the scope of 'maint' track.
An example of such a higher level API is write_cache_as_tree(), which is
used by git-write-tree as well as later Porcelains like git-merge, revert
and cherry-pick. In the longer term, we should remove read_cache() from
there and add one to cmd_write_tree(); other callers expect that the
in-core index they prepared is what gets written as a tree so no other
change is necessary for this particular codepath.
The original version of this patch marked the index by pointing an
otherwise wasted malloc'ed memory with o->result.alloc, but this version
uses Linus's idea to use a new "initialized" bit, which is conceptually
much cleaner.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-08-23 21:57:30 +02:00
|
|
|
if (istate->initialized)
|
2007-04-02 08:26:07 +02:00
|
|
|
return istate->cache_nr;
|
2005-10-01 22:24:27 +02:00
|
|
|
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanonsecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result sould be a litle less used CPU time, less
pagefaults and a litle faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks gets faster, this patch would be more and more helpfull as times
go by. For a fast Solid State Disk, this patch should be helpfull.
Note that, when file operations starts to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
istate->timestamp.sec = 0;
|
|
|
|
istate->timestamp.nsec = 0;
|
2006-07-26 06:32:18 +02:00
|
|
|
fd = open(path, O_RDONLY);
|
2005-10-01 22:24:27 +02:00
|
|
|
if (fd < 0) {
|
2014-06-13 14:19:36 +02:00
|
|
|
if (!must_exist && errno == ENOENT)
|
2005-10-01 22:24:27 +02:00
|
|
|
return 0;
|
2018-11-10 06:16:05 +01:00
|
|
|
die_errno(_("%s: index file open failed"), path);
|
2005-10-01 22:24:27 +02:00
|
|
|
}
|
2005-04-08 00:13:13 +02:00
|
|
|
|
2007-04-25 16:18:17 +02:00
|
|
|
if (fstat(fd, &st))
|
2018-11-10 06:16:05 +01:00
|
|
|
die_errno(_("%s: cannot stat the open index"), path);
|
2007-04-25 16:18:17 +02:00
|
|
|
|
2008-01-15 01:03:17 +01:00
|
|
|
mmap_size = xsize_t(st.st_size);
|
2018-02-01 03:18:45 +01:00
|
|
|
if (mmap_size < sizeof(struct cache_header) + the_hash_algo->rawsz)
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("%s: index file smaller than expected"), path);
|
2007-04-25 16:18:17 +02:00
|
|
|
|
2019-07-14 05:01:53 +02:00
|
|
|
mmap = xmmap_gently(NULL, mmap_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
2008-01-15 01:03:17 +01:00
|
|
|
if (mmap == MAP_FAILED)
|
2018-11-10 06:16:05 +01:00
|
|
|
die_errno(_("%s: unable to map index file"), path);
|
2012-08-06 13:27:09 +02:00
|
|
|
close(fd);
|
2005-04-08 00:13:13 +02:00
|
|
|
|
2018-10-10 17:59:33 +02:00
|
|
|
hdr = (const struct cache_header *)mmap;
|
2008-01-15 01:03:17 +01:00
|
|
|
if (verify_hdr(hdr, mmap_size) < 0)
|
2005-04-08 00:13:13 +02:00
|
|
|
goto unmap;
|
|
|
|
|
2018-05-02 02:25:44 +02:00
|
|
|
hashcpy(istate->oid.hash, (const unsigned char *)hdr + mmap_size - the_hash_algo->rawsz);
|
2012-04-04 18:12:43 +02:00
|
|
|
istate->version = ntohl(hdr->hdr_version);
|
2007-04-02 08:26:07 +02:00
|
|
|
istate->cache_nr = ntohl(hdr->hdr_entries);
|
|
|
|
istate->cache_alloc = alloc_nr(istate->cache_nr);
|
2013-05-30 15:56:19 +02:00
|
|
|
istate->cache = xcalloc(istate->cache_alloc, sizeof(*istate->cache));
|
unpack_trees(): protect the handcrafted in-core index from read_cache()
unpack_trees() rebuilds the in-core index from scratch by allocating a new
structure and finishing it off by copying the built one to the final
index.
The resulting in-core index is Ok for most use, but read_cache() does not
recognize it as such. The function is meant to be no-op if you already
have loaded the index, until you call discard_cache().
This change the way read_cache() detects an already initialized in-core
index, by introducing an extra bit, and marks the handcrafted in-core
index as initialized, to avoid this problem.
A better fix in the longer term would be to change the read_cache() API so
that it will always discard and re-read from the on-disk index to avoid
confusion. But there are higher level API that have relied on the current
semantics, and they and their users all need to get converted, which is
outside the scope of 'maint' track.
An example of such a higher level API is write_cache_as_tree(), which is
used by git-write-tree as well as later Porcelains like git-merge, revert
and cherry-pick. In the longer term, we should remove read_cache() from
there and add one to cmd_write_tree(); other callers expect that the
in-core index they prepared is what gets written as a tree so no other
change is necessary for this particular codepath.
The original version of this patch marked the index by pointing an
otherwise wasted malloc'ed memory with o->result.alloc, but this version
uses Linus's idea to use a new "initialized" bit, which is conceptually
much cleaner.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-08-23 21:57:30 +02:00
|
|
|
istate->initialized = 1;
|
2008-01-15 01:03:17 +01:00
|
|
|
|
2018-10-10 17:59:36 +02:00
|
|
|
p.istate = istate;
|
|
|
|
p.mmap = mmap;
|
|
|
|
p.mmap_size = mmap_size;
|
2012-04-04 00:53:15 +02:00
|
|
|
|
2008-01-15 01:03:17 +01:00
|
|
|
src_offset = sizeof(*hdr);
|
2007-04-02 08:26:07 +02:00
|
|
|
|
index: make index.threads=true enable ieot and eoie
If a user explicitly sets
[index]
threads = true
to read the index using multiple threads, ensure that index writes
include the offset table by default to make that possible. This
ensures that the user's intent of turning on threading is respected.
In other words, permit the following configurations:
- index.threads and index.recordOffsetTable unspecified: do not write
the offset table yet (to avoid alarming the user with "ignoring IEOT
extension" messages when an older version of Git accesses the
repository) but do make use of multiple threads to read the index if
the supporting offset table is present.
This can also be requested explicitly by setting index.threads=true,
0, or >1 and index.recordOffsetTable=false.
- index.threads=false or 1: do not write the offset table, and do not
make use of the offset table.
One can set index.recordOffsetTable=false as well, to be more
explicit.
- index.threads=true, 0, or >1 and index.recordOffsetTable unspecified:
write the offset table and make use of threads at read time.
This can also be requested by setting index.threads=true, 0, >1, or
unspecified and index.recordOffsetTable=true.
Fortunately the complication is temporary: once most Git installations
have upgraded to a version with support for the IEOT and EOIE
extensions, we can flip the defaults for index.recordEndOfIndexEntries
and index.recordOffsetTable to true and eliminate the settings.
Helped-by: Ben Peart <benpeart@microsoft.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-20 07:14:26 +01:00
|
|
|
if (git_config_get_index_threads(&nr_threads))
|
|
|
|
nr_threads = 1;
|
2008-01-15 01:03:17 +01:00
|
|
|
|
2018-10-10 17:59:38 +02:00
|
|
|
/* TODO: does creating more threads than cores help? */
|
|
|
|
if (!nr_threads) {
|
|
|
|
nr_threads = istate->cache_nr / THREAD_COST;
|
|
|
|
cpus = online_cpus();
|
|
|
|
if (nr_threads > cpus)
|
|
|
|
nr_threads = cpus;
|
2005-04-08 00:13:13 +02:00
|
|
|
}
|
2018-10-10 17:59:36 +02:00
|
|
|
|
2018-11-03 09:48:48 +01:00
|
|
|
if (!HAVE_THREADS)
|
|
|
|
nr_threads = 1;
|
|
|
|
|
2018-10-10 17:59:36 +02:00
|
|
|
if (nr_threads > 1) {
|
|
|
|
extension_offset = read_eoie_extension(mmap, mmap_size);
|
|
|
|
if (extension_offset) {
|
|
|
|
int err;
|
|
|
|
|
|
|
|
p.src_offset = extension_offset;
|
|
|
|
err = pthread_create(&p.pthread, NULL, load_index_extensions, &p);
|
|
|
|
if (err)
|
|
|
|
die(_("unable to create load_index_extensions thread: %s"), strerror(err));
|
|
|
|
|
|
|
|
nr_threads--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-10 17:59:38 +02:00
|
|
|
/*
|
|
|
|
* Locate and read the index entry offset table so that we can use it
|
|
|
|
* to multi-thread the reading of the cache entries.
|
|
|
|
*/
|
|
|
|
if (extension_offset && nr_threads > 1)
|
|
|
|
ieot = read_ieot_extension(mmap, mmap_size, extension_offset);
|
|
|
|
|
|
|
|
if (ieot) {
|
2019-05-09 23:29:44 +02:00
|
|
|
src_offset += load_cache_entries_threaded(istate, mmap, mmap_size, nr_threads, ieot);
|
2018-10-10 17:59:38 +02:00
|
|
|
free(ieot);
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this stae.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
} else {
|
2018-10-10 17:59:38 +02:00
|
|
|
src_offset += load_all_cache_entries(istate, mmap, mmap_size, src_offset);
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this stae.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
}
|
2012-04-04 00:53:15 +02:00
|
|
|
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanonsecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result sould be a litle less used CPU time, less
pagefaults and a litle faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks gets faster, this patch would be more and more helpfull as times
go by. For a fast Solid State Disk, this patch should be helpfull.
Note that, when file operations starts to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
istate->timestamp.sec = st.st_mtime;
|
2009-03-04 18:47:40 +01:00
|
|
|
istate->timestamp.nsec = ST_MTIME_NSEC(st);
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanonsecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result sould be a litle less used CPU time, less
pagefaults and a litle faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks gets faster, this patch would be more and more helpfull as times
go by. For a fast Solid State Disk, this patch should be helpfull.
Note that, when file operations starts to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
|
2018-10-10 17:59:36 +02:00
|
|
|
/* if we created a thread, join it otherwise load the extensions on the primary thread */
|
|
|
|
if (extension_offset) {
|
|
|
|
int ret = pthread_join(p.pthread, NULL);
|
|
|
|
if (ret)
|
|
|
|
die(_("unable to join load_index_extensions thread: %s"), strerror(ret));
|
2018-11-03 09:48:48 +01:00
|
|
|
} else {
|
2018-10-10 17:59:36 +02:00
|
|
|
p.src_offset = src_offset;
|
|
|
|
load_index_extensions(&p);
|
2006-04-25 06:18:58 +02:00
|
|
|
}
|
2018-10-10 17:59:33 +02:00
|
|
|
munmap((void *)mmap, mmap_size);
|
2019-02-22 23:25:07 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* TODO trace2: replace "the_repository" with the actual repo instance
|
|
|
|
* that is associated with the given "istate".
|
|
|
|
*/
|
|
|
|
trace2_data_intmax("index", the_repository, "read/version",
|
|
|
|
istate->version);
|
|
|
|
trace2_data_intmax("index", the_repository, "read/cache_nr",
|
|
|
|
istate->cache_nr);
|
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
return istate->cache_nr;
|
2005-04-08 00:13:13 +02:00
|
|
|
|
|
|
|
unmap:
|
2018-10-10 17:59:33 +02:00
|
|
|
munmap((void *)mmap, mmap_size);
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("index file corrupt"));
|
2005-04-08 00:13:13 +02:00
|
|
|
}
|
|
|
|
|
2017-02-27 19:00:12 +01:00
|
|
|
/*
|
|
|
|
* Signal that the shared index is used by updating its mtime.
|
|
|
|
*
|
|
|
|
* This way, shared index files can be removed if they have not been used
|
|
|
|
* for some time.
|
|
|
|
*/
|
read-cache: fix reading the shared index for other repos
read_index_from() takes a path argument for the location of the index
file. For reading the shared index in split index mode however it just
ignores that path argument, and reads it from the gitdir of the current
repository.
This works as long as an index in the_repository is read. Once that
changes, such as when we read the index of a submodule, or of a
different working tree than the current one, the gitdir of
the_repository will no longer contain the appropriate shared index,
and git will fail to read it.
For example t3007-ls-files-recurse-submodules.sh was broken with
GIT_TEST_SPLIT_INDEX set in 188dce131f ("ls-files: use repository
object", 2017-06-22), and t7814-grep-recurse-submodules.sh was also
broken in a similar manner, probably by introducing struct repository
there, although I didn't track down the exact commit for that.
be489d02d2 ("revision.c: --indexed-objects add objects from all
worktrees", 2017-08-23) breaks with split index mode in a similar
manner, not erroring out when it can't read the index, but instead
carrying on with pruning, without taking the index of the worktree into
account.
Fix this by passing an additional gitdir parameter to read_index_from,
to indicate where it should look for and read the shared index from.
read_cache_from() defaults to using the gitdir of the_repository. As it
is mostly a convenience macro, having to pass get_git_dir() for every
call seems overkill, and if necessary users can have more control by
using read_index_from().
Helped-by: Brandon Williams <bmwill@google.com>
Signed-off-by: Thomas Gummerer <t.gummerer@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-01-07 23:30:13 +01:00
|
|
|
static void freshen_shared_index(const char *shared_index, int warn)
{
	/*
	 * check_and_freshen_file() is always invoked; a zero result is
	 * treated as "could not freshen" and is reported only when the
	 * caller asked for a warning.
	 */
	if (check_and_freshen_file(shared_index, 1))
		return;

	if (warn)
		warning(_("could not freshen shared index '%s'"), shared_index);
}
|
|
|
|
|
read-cache: fix reading the shared index for other repos
read_index_from() takes a path argument for the location of the index
file. For reading the shared index in split index mode however it just
ignores that path argument, and reads it from the gitdir of the current
repository.
This works as long as an index in the_repository is read. Once that
changes, such as when we read the index of a submodule, or of a
different working tree than the current one, the gitdir of
the_repository will no longer contain the appropriate shared index,
and git will fail to read it.
For example t3007-ls-files-recurse-submodules.sh was broken with
GIT_TEST_SPLIT_INDEX set in 188dce131f ("ls-files: use repository
object", 2017-06-22), and t7814-grep-recurse-submodules.sh was also
broken in a similar manner, probably by introducing struct repository
there, although I didn't track down the exact commit for that.
be489d02d2 ("revision.c: --indexed-objects add objects from all
worktrees", 2017-08-23) breaks with split index mode in a similar
manner, not erroring out when it can't read the index, but instead
carrying on with pruning, without taking the index of the worktree into
account.
Fix this by passing an additional gitdir parameter to read_index_from,
to indicate where it should look for and read the shared index from.
read_cache_from() defaults to using the gitdir of the_repository. As it
is mostly a convenience macro, having to pass get_git_dir() for every
call seems overkill, and if necessary users can have more control by
using read_index_from().
Helped-by: Brandon Williams <bmwill@google.com>
Signed-off-by: Thomas Gummerer <t.gummerer@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-01-07 23:30:13 +01:00
|
|
|
int read_index_from(struct index_state *istate, const char *path,
|
|
|
|
const char *gitdir)
|
2014-06-13 14:19:36 +02:00
|
|
|
{
|
|
|
|
struct split_index *split_index;
|
|
|
|
int ret;
|
2018-05-02 02:25:43 +02:00
|
|
|
char *base_oid_hex;
|
read-cache: fix reading the shared index for other repos
read_index_from() takes a path argument for the location of the index
file. For reading the shared index in split index mode however it just
ignores that path argument, and reads it from the gitdir of the current
repository.
This works as long as an index in the_repository is read. Once that
changes, such as when we read the index of a submodule, or of a
different working tree than the current one, the gitdir of
the_repository will no longer contain the appropriate shared index,
and git will fail to read it.
For example t3007-ls-files-recurse-submodules.sh was broken with
GIT_TEST_SPLIT_INDEX set in 188dce131f ("ls-files: use repository
object", 2017-06-22), and t7814-grep-recurse-submodules.sh was also
broken in a similar manner, probably by introducing struct repository
there, although I didn't track down the exact commit for that.
be489d02d2 ("revision.c: --indexed-objects add objects from all
worktrees", 2017-08-23) breaks with split index mode in a similar
manner, not erroring out when it can't read the index, but instead
carrying on with pruning, without taking the index of the worktree into
account.
Fix this by passing an additional gitdir parameter to read_index_from,
to indicate where it should look for and read the shared index from.
read_cache_from() defaults to using the gitdir of the_repository. As it
is mostly a convenience macro, having to pass get_git_dir() for every
call seems overkill, and if necessary users can have more control by
using read_index_from().
Helped-by: Brandon Williams <bmwill@google.com>
Signed-off-by: Thomas Gummerer <t.gummerer@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-01-07 23:30:13 +01:00
|
|
|
char *base_path;
|
2014-06-13 14:19:36 +02:00
|
|
|
|
|
|
|
/* istate->initialized covers both .git/index and .git/sharedindex.xxx */
|
|
|
|
if (istate->initialized)
|
|
|
|
return istate->cache_nr;
|
|
|
|
|
2019-02-22 23:25:07 +01:00
|
|
|
/*
|
|
|
|
* TODO trace2: replace "the_repository" with the actual repo instance
|
|
|
|
* that is associated with the given "istate".
|
|
|
|
*/
|
|
|
|
trace2_region_enter_printf("index", "do_read_index", the_repository,
|
|
|
|
"%s", path);
|
2018-08-18 16:41:22 +02:00
|
|
|
trace_performance_enter();
|
2014-06-13 14:19:36 +02:00
|
|
|
ret = do_read_index(istate, path, 0);
|
2018-08-18 16:41:22 +02:00
|
|
|
trace_performance_leave("read cache %s", path);
|
2019-02-22 23:25:07 +01:00
|
|
|
trace2_region_leave_printf("index", "do_read_index", the_repository,
|
|
|
|
"%s", path);
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
|
2014-06-13 14:19:36 +02:00
|
|
|
split_index = istate->split_index;
|
2018-05-02 02:25:43 +02:00
|
|
|
if (!split_index || is_null_oid(&split_index->base_oid)) {
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
post_read_index_from(istate);
|
2014-06-13 14:19:36 +02:00
|
|
|
return ret;
|
2015-03-20 22:43:14 +01:00
|
|
|
}
|
2014-06-13 14:19:36 +02:00
|
|
|
|
2018-08-18 16:41:22 +02:00
|
|
|
trace_performance_enter();
|
2014-06-13 14:19:36 +02:00
|
|
|
if (split_index->base)
|
|
|
|
discard_index(split_index->base);
|
|
|
|
else
|
|
|
|
split_index->base = xcalloc(1, sizeof(*split_index->base));
|
2017-03-06 10:42:00 +01:00
|
|
|
|
2018-05-02 02:25:43 +02:00
|
|
|
base_oid_hex = oid_to_hex(&split_index->base_oid);
|
|
|
|
base_path = xstrfmt("%s/sharedindex.%s", gitdir, base_oid_hex);
|
2019-02-22 23:25:07 +01:00
|
|
|
trace2_region_enter_printf("index", "shared/do_read_index",
|
|
|
|
the_repository, "%s", base_path);
|
2017-03-06 10:42:00 +01:00
|
|
|
ret = do_read_index(split_index->base, base_path, 1);
|
2019-02-22 23:25:07 +01:00
|
|
|
trace2_region_leave_printf("index", "shared/do_read_index",
|
|
|
|
the_repository, "%s", base_path);
|
2018-08-28 23:22:48 +02:00
|
|
|
if (!oideq(&split_index->base_oid, &split_index->base->oid))
|
2018-11-10 06:16:05 +01:00
|
|
|
die(_("broken index, expect %s in %s, got %s"),
|
2018-05-02 02:25:43 +02:00
|
|
|
base_oid_hex, base_path,
|
2018-05-02 02:25:44 +02:00
|
|
|
oid_to_hex(&split_index->base->oid));
|
2017-03-06 10:42:00 +01:00
|
|
|
|
read-cache: fix reading the shared index for other repos
read_index_from() takes a path argument for the location of the index
file. For reading the shared index in split index mode however it just
ignores that path argument, and reads it from the gitdir of the current
repository.
This works as long as an index in the_repository is read. Once that
changes, such as when we read the index of a submodule, or of a
different working tree than the current one, the gitdir of
the_repository will no longer contain the appropriate shared index,
and git will fail to read it.
For example t3007-ls-files-recurse-submodules.sh was broken with
GIT_TEST_SPLIT_INDEX set in 188dce131f ("ls-files: use repository
object", 2017-06-22), and t7814-grep-recurse-submodules.sh was also
broken in a similar manner, probably by introducing struct repository
there, although I didn't track down the exact commit for that.
be489d02d2 ("revision.c: --indexed-objects add objects from all
worktrees", 2017-08-23) breaks with split index mode in a similar
manner, not erroring out when it can't read the index, but instead
carrying on with pruning, without taking the index of the worktree into
account.
Fix this by passing an additional gitdir parameter to read_index_from,
to indicate where it should look for and read the shared index from.
read_cache_from() defaults to using the gitdir of the_repository. As it
is mostly a convenience macro, having to pass get_git_dir() for every
call seems overkill, and if necessary users can have more control by
using read_index_from().
Helped-by: Brandon Williams <bmwill@google.com>
Signed-off-by: Thomas Gummerer <t.gummerer@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-01-07 23:30:13 +01:00
|
|
|
freshen_shared_index(base_path, 0);
|
2014-06-13 14:19:36 +02:00
|
|
|
merge_base_index(istate);
|
config: add core.untrackedCache
When we know that mtime on directory as given by the environment
is usable for the purpose of untracked cache, we may want the
untracked cache to be always used without any mtime test or
kernel name check being performed.
Also when we know that mtime is not usable for the purpose of
untracked cache, for example because the repo is shared over a
network file system, we may want the untracked-cache to be
automatically removed from the index.
Allow the user to express such preference by setting the
'core.untrackedCache' configuration variable, which can take
'keep', 'false', or 'true' and default to 'keep'.
When read_index_from() is called, it now adds or removes the
untracked cache in the index to respect the value of this
variable. So it does nothing if the value is `keep` or if the
variable is unset; it adds the untracked cache if the value is
`true`; and it removes the cache if the value is `false`.
`git update-index --[no-|force-]untracked-cache` still adds the
untracked cache to, or removes it, from the index, but this
shows a warning if it goes against the value of
core.untrackedCache, because the next time the index is read
the untracked cache will be added or removed if the
configuration is set to do so.
Also `--untracked-cache` used to check that the underlying
operating system and file system change `st_mtime` field of a
directory if files are added or deleted in that directory. But
because those tests take a long time, `--untracked-cache` no
longer performs them. Instead, there is now
`--test-untracked-cache` to perform the tests. This change
makes `--untracked-cache` the same as `--force-untracked-cache`.
This last change is backward incompatible and should be
mentioned in the release notes.
Helped-by: Duy Nguyen <pclouds@gmail.com>
Helped-by: Torsten Bögershausen <tboegi@web.de>
Helped-by: Stefan Beller <sbeller@google.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
read-cache: Duy'sfixup
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-01-27 07:58:05 +01:00
|
|
|
post_read_index_from(istate);
|
2018-08-18 16:41:22 +02:00
|
|
|
trace_performance_leave("read cache %s", base_path);
|
2018-10-20 09:33:34 +02:00
|
|
|
free(base_path);
|
2014-06-13 14:19:36 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
checkout: Fix "initial checkout" detection
Earlier commit 5521883 (checkout: do not lose staged removal, 2008-09-07)
tightened the rule to prevent switching branches from losing local
changes, so that staged removal of paths can be protected, while
attempting to keep a loophole to still allow a special case of switching
out of an un-checked-out state.
However, the loophole was made a bit too tight, and did not allow
switching from one branch (in an un-checked-out state) to check out
another branch.
The change to builtin-checkout.c in this commit loosens it to allow this,
by not insisting the original commit and the new commit to be the same.
It also introduces a new function, is_index_unborn (and an associated
macro, is_cache_unborn), to check if the repository is truly in an
un-checked-out state more reliably, by making sure that $GIT_INDEX_FILE
did not exist when populating the in-core index structure. A few places
the earlier commit 5521883 added the check for the initial checkout
condition are updated to use this function.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-11-12 20:52:35 +01:00
|
|
|
int is_index_unborn(struct index_state *istate)
|
|
|
|
{
|
2011-10-24 23:59:14 +02:00
|
|
|
return (!istate->cache_nr && !istate->timestamp.sec);
|
checkout: Fix "initial checkout" detection
Earlier commit 5521883 (checkout: do not lose staged removal, 2008-09-07)
tightened the rule to prevent switching branches from losing local
changes, so that staged removal of paths can be protected, while
attempting to keep a loophole to still allow a special case of switching
out of an un-checked-out state.
However, the loophole was made a bit too tight, and did not allow
switching from one branch (in an un-checked-out state) to check out
another branch.
The change to builtin-checkout.c in this commit loosens it to allow this,
by not insisting the original commit and the new commit to be the same.
It also introduces a new function, is_index_unborn (and an associated
macro, is_cache_unborn), to check if the repository is truly in an
un-checked-out state more reliably, by making sure that $GIT_INDEX_FILE
did not exist when populating the in-core index structure. A few places
the earlier commit 5521883 added the check for the initial checkout
condition are updated to use this function.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-11-12 20:52:35 +01:00
|
|
|
}
|
|
|
|
|
2007-04-02 08:26:07 +02:00
|
|
|
int discard_index(struct index_state *istate)
|
Status update on merge-recursive in C
This is just an update for people being interested. Alex and me were
busy with that project for a few days now. While it has progressed nicely,
there are quite a couple TODOs in merge-recursive.c, just search for "TODO".
For impatient people: yes, it passes all the tests, and yes, according
to the evil test Alex did, it is faster than the Python script.
But no, it is not yet finished. Biggest points are:
- there are still three external calls
- in the end, it should not be necessary to write the index more than once
(just before exiting)
- a lot of things can be refactored to make the code easier and shorter
BTW we cannot just plug in git-merge-tree yet, because git-merge-tree
does not handle renames at all.
This patch is meant for testing, and as such,
- it compiles the program to git-merge-recur
- it adjusts the scripts and tests to use git-merge-recur instead of
git-merge-recursive
- it provides "TEST", a script to execute the tests regarding -recursive
- it inlines the changes to read-cache.c (read_cache_from(), discard_cache()
and refresh_cache_entry())
Brought to you by Alex Riesen and Dscho
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-07-08 18:42:41 +02:00
|
|
|
{
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
/*
|
|
|
|
* Cache entries in istate->cache[] should have been allocated
|
|
|
|
* from the memory pool associated with this index, or from an
|
|
|
|
* associated split_index. There is no need to free individual
|
2018-07-02 21:49:39 +02:00
|
|
|
* cache entries. validate_cache_entries can detect when this
|
|
|
|
* assertion does not hold.
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
*/
|
2018-07-02 21:49:39 +02:00
|
|
|
validate_cache_entries(istate);
|
2011-10-24 23:59:14 +02:00
|
|
|
|
2009-12-25 09:30:51 +01:00
|
|
|
resolve_undo_clear_index(istate);
|
2007-04-02 08:26:07 +02:00
|
|
|
istate->cache_nr = 0;
|
|
|
|
istate->cache_changed = 0;
|
make USE_NSEC work as expected
Since the filesystem ext4 is now defined as stable in Linux v2.6.28,
and ext4 supports nanosecond resolution timestamps natively, it is
time to make USE_NSEC work as expected.
This will make racy git situations less likely to happen. For 'git
checkout' this means it will be less likely that we have to open, read
the contents of the file into RAM, and check if file is really
modified or not. The result should be a little less used CPU time, less
pagefaults and a little faster program, at least for 'git checkout'.
Since the number of possible racy git situations would increase when
disks get faster, this patch would be more and more helpful as time
goes by. For a fast Solid State Disk, this patch should be helpful.
Note that, when file operations start to take less than 1 nanosecond,
one would again start to get more racy git situations.
For more info on racy git, see Documentation/technical/racy-git.txt
For more info on ext4, see http://kernelnewbies.org/Ext4
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-19 21:08:29 +01:00
|
|
|
istate->timestamp.sec = 0;
|
|
|
|
istate->timestamp.nsec = 0;
|
2013-02-28 00:57:48 +01:00
|
|
|
free_name_hash(istate);
|
2007-04-02 08:26:07 +02:00
|
|
|
cache_tree_free(&(istate->cache_tree));
|
unpack_trees(): protect the handcrafted in-core index from read_cache()
unpack_trees() rebuilds the in-core index from scratch by allocating a new
structure and finishing it off by copying the built one to the final
index.
The resulting in-core index is Ok for most use, but read_cache() does not
recognize it as such. The function is meant to be no-op if you already
have loaded the index, until you call discard_cache().
This changes the way read_cache() detects an already initialized in-core
index, by introducing an extra bit, and marks the handcrafted in-core
index as initialized, to avoid this problem.
A better fix in the longer term would be to change the read_cache() API so
that it will always discard and re-read from the on-disk index to avoid
confusion. But there are higher level API that have relied on the current
semantics, and they and their users all need to get converted, which is
outside the scope of 'maint' track.
An example of such a higher level API is write_cache_as_tree(), which is
used by git-write-tree as well as later Porcelains like git-merge, revert
and cherry-pick. In the longer term, we should remove read_cache() from
there and add one to cmd_write_tree(); other callers expect that the
in-core index they prepared is what gets written as a tree so no other
change is necessary for this particular codepath.
The original version of this patch marked the index by pointing an
otherwise wasted malloc'ed memory with o->result.alloc, but this version
uses Linus's idea to use a new "initialized" bit, which is conceptually
much cleaner.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-08-23 21:57:30 +02:00
|
|
|
istate->initialized = 0;
|
2019-05-07 13:10:21 +02:00
|
|
|
istate->fsmonitor_has_run_once = 0;
|
2017-06-16 01:15:46 +02:00
|
|
|
FREE_AND_NULL(istate->cache);
|
2013-06-09 19:39:18 +02:00
|
|
|
istate->cache_alloc = 0;
|
2014-06-13 14:19:36 +02:00
|
|
|
discard_split_index(istate);
|
2015-03-08 11:12:34 +01:00
|
|
|
free_untracked_cache(istate->untracked);
|
|
|
|
istate->untracked = NULL;
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
|
|
|
|
if (istate->ce_mem_pool) {
|
2018-07-02 21:49:39 +02:00
|
|
|
mem_pool_discard(istate->ce_mem_pool, should_validate_cache_entries());
|
mem-pool: use more standard initialization and finalization
A typical memory type, such as strbuf, hashmap, or string_list can be
stored on the stack or embedded within another structure. mem_pool
cannot be, because of how mem_pool_init() and mem_pool_discard() are
written. mem_pool_init() does essentially the following (simplified
for purposes of explanation here):
void mem_pool_init(struct mem_pool **pool...)
{
*pool = xcalloc(1, sizeof(*pool));
It seems weird to require that mem_pools can only be accessed through a
pointer. It also seems slightly dangerous: unlike strbuf_release() or
strbuf_reset() or string_list_clear(), all of which put the data
structure into a state where it can be re-used after the call,
mem_pool_discard(pool) will leave pool pointing at free'd memory.
read-cache (and split-index) are the only current users of mem_pools,
and they haven't fallen into a use-after-free mistake here, but it seems
likely to be problematic for future users especially since several of
the current callers of mem_pool_init() will only call it when the
mem_pool* is not already allocated (i.e. is NULL).
This type of mechanism also prevents finding synchronization
points where one can free existing memory and then resume more
operations. It would be natural at such points to run something like
mem_pool_discard(pool...);
and, if necessary,
mem_pool_init(&pool...);
and then carry on continuing to use the pool. However, this fails badly
if several objects had a copy of the value of pool from before these
commands; in such a case, those objects won't get the updated value of
pool that mem_pool_init() overwrites pool with and they'll all instead
be reading and writing from free'd memory.
Modify mem_pool_init()/mem_pool_discard() to behave more like
strbuf_init()/strbuf_release()
or
string_list_init()/string_list_clear()
In particular: (1) make mem_pool_init() just take a mem_pool* and have
it only worry about allocating struct mp_blocks, not the struct mem_pool
itself, (2) make mem_pool_discard() free the memory that the pool was
responsible for, but leave it in a state where it can be used to
allocate more memory afterward (without the need to call mem_pool_init()
again).
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-15 19:37:56 +02:00
|
|
|
FREE_AND_NULL(istate->ce_mem_pool);
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
}
|
|
|
|
|
2008-01-15 01:03:17 +01:00
|
|
|
return 0;
|
Status update on merge-recursive in C
This is just an update for people who are interested. Alex and I have been
busy with that project for a few days now. While it has progressed nicely,
there are still quite a few TODOs in merge-recursive.c, just search for "TODO".
For impatient people: yes, it passes all the tests, and yes, according
to the evil test Alex did, it is faster than the Python script.
But no, it is not yet finished. Biggest points are:
- there are still three external calls
- in the end, it should not be necessary to write the index more than once
(just before exiting)
- a lot of things can be refactored to make the code easier and shorter
BTW we cannot just plug in git-merge-tree yet, because git-merge-tree
does not handle renames at all.
This patch is meant for testing, and as such,
- it compiles the program to git-merge-recur
- it adjusts the scripts and tests to use git-merge-recur instead of
git-merge-recursive
- it provides "TEST", a script to execute the tests regarding -recursive
- it inlines the changes to read-cache.c (read_cache_from(), discard_cache()
and refresh_cache_entry())
Brought to you by Alex Riesen and Dscho
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-07-08 18:42:41 +02:00
|
|
|
}
|
|
|
|
|
2018-07-02 21:49:39 +02:00
|
|
|
/*
|
|
|
|
* Validate the cache entries of this index.
|
|
|
|
* All cache entries associated with this index
|
|
|
|
* should have been allocated by the memory pool
|
|
|
|
* associated with this index, or by a referenced
|
|
|
|
* split index.
|
|
|
|
*/
|
|
|
|
void validate_cache_entries(const struct index_state *istate)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!should_validate_cache_entries() ||!istate || !istate->initialized)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (i = 0; i < istate->cache_nr; i++) {
|
|
|
|
if (!istate) {
|
2018-11-10 06:16:04 +01:00
|
|
|
BUG("cache entry is not allocated from expected memory pool");
|
2018-07-02 21:49:39 +02:00
|
|
|
} else if (!istate->ce_mem_pool ||
|
|
|
|
!mem_pool_contains(istate->ce_mem_pool, istate->cache[i])) {
|
|
|
|
if (!istate->split_index ||
|
|
|
|
!istate->split_index->base ||
|
|
|
|
!istate->split_index->base->ce_mem_pool ||
|
|
|
|
!mem_pool_contains(istate->split_index->base->ce_mem_pool, istate->cache[i])) {
|
2018-11-10 06:16:04 +01:00
|
|
|
BUG("cache entry is not allocated from expected memory pool");
|
2018-07-02 21:49:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (istate->split_index)
|
|
|
|
validate_cache_entries(istate->split_index->base);
|
|
|
|
}
|
|
|
|
|
2008-03-06 21:46:09 +01:00
|
|
|
int unmerged_index(const struct index_state *istate)
|
2008-02-07 17:40:13 +01:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < istate->cache_nr; i++) {
|
|
|
|
if (ce_stage(istate->cache[i]))
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-01-12 03:13:31 +01:00
|
|
|
int repo_index_has_changes(struct repository *repo,
|
|
|
|
struct tree *tree,
|
|
|
|
struct strbuf *sb)
|
2018-07-01 03:24:55 +02:00
|
|
|
{
|
2019-01-12 03:13:31 +01:00
|
|
|
struct index_state *istate = repo->index;
|
2018-07-01 03:25:00 +02:00
|
|
|
struct object_id cmp;
|
2018-07-01 03:24:55 +02:00
|
|
|
int i;
|
|
|
|
|
2018-07-01 03:25:00 +02:00
|
|
|
if (tree)
|
|
|
|
cmp = tree->object.oid;
|
|
|
|
if (tree || !get_oid_tree("HEAD", &cmp)) {
|
2018-07-01 03:24:55 +02:00
|
|
|
struct diff_options opt;
|
|
|
|
|
2019-01-12 03:13:31 +01:00
|
|
|
repo_diff_setup(repo, &opt);
|
2018-07-01 03:24:55 +02:00
|
|
|
opt.flags.exit_with_status = 1;
|
|
|
|
if (!sb)
|
|
|
|
opt.flags.quick = 1;
|
2018-07-01 03:25:00 +02:00
|
|
|
do_diff_cache(&cmp, &opt);
|
2018-07-01 03:24:55 +02:00
|
|
|
diffcore_std(&opt);
|
|
|
|
for (i = 0; sb && i < diff_queued_diff.nr; i++) {
|
|
|
|
if (i)
|
|
|
|
strbuf_addch(sb, ' ');
|
|
|
|
strbuf_addstr(sb, diff_queued_diff.queue[i]->two->path);
|
|
|
|
}
|
|
|
|
diff_flush(&opt);
|
|
|
|
return opt.flags.has_changes != 0;
|
|
|
|
} else {
|
2018-07-01 03:24:56 +02:00
|
|
|
for (i = 0; sb && i < istate->cache_nr; i++) {
|
2018-07-01 03:24:55 +02:00
|
|
|
if (i)
|
|
|
|
strbuf_addch(sb, ' ');
|
2018-07-01 03:24:56 +02:00
|
|
|
strbuf_addstr(sb, istate->cache[i]->name);
|
2018-07-01 03:24:55 +02:00
|
|
|
}
|
2018-07-01 03:24:56 +02:00
|
|
|
return !!istate->cache_nr;
|
2018-07-01 03:24:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-20 21:16:57 +02:00
|
|
|
#define WRITE_BUFFER_SIZE 8192
|
2005-05-18 14:14:09 +02:00
|
|
|
static unsigned char write_buffer[WRITE_BUFFER_SIZE];
|
2005-04-20 21:16:57 +02:00
|
|
|
static unsigned long write_buffer_len;
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
static int ce_write_flush(git_hash_ctx *context, int fd)
|
2006-08-08 23:47:32 +02:00
|
|
|
{
|
|
|
|
unsigned int buffered = write_buffer_len;
|
|
|
|
if (buffered) {
|
2018-02-01 03:18:45 +01:00
|
|
|
the_hash_algo->update_fn(context, write_buffer, buffered);
|
avoid "write_in_full(fd, buf, len) != len" pattern
The return value of write_in_full() is either "-1", or the
requested number of bytes[1]. If we make a partial write
before seeing an error, we still return -1, not a partial
value. This goes back to f6aa66cb95 (write_in_full: really
write in full or return error on disk full., 2007-01-11).
So checking anything except "was the return value negative"
is pointless. And there are a couple of reasons not to do
so:
1. It can do a funny signed/unsigned comparison. If your
"len" is unsigned (e.g., a size_t) then the compiler will
promote the "-1" to its unsigned variant.
This works out for "!= len" (unless you really were
trying to write the maximum size_t bytes), but is a
bug if you check "< len" (an example of which was fixed
recently in config.c).
We should avoid promoting the mental model that you
need to check the length at all, so that new sites are
not tempted to copy us.
2. Checking for a negative value is shorter to type,
especially when the length is an expression.
3. Linus says so. In d34cf19b89 (Clean up write_in_full()
users, 2007-01-11), right after the write_in_full()
semantics were changed, he wrote:
I really wish every "write_in_full()" user would just
check against "<0" now, but this fixes the nasty and
stupid ones.
Appeals to authority aside, this makes it clear that
writing it this way does not have an intentional
benefit. It's a historical curiosity that we never
bothered to clean up (and which was undoubtedly
cargo-culted into new sites).
So let's convert these obviously-correct cases (this
includes write_str_in_full(), which is just a wrapper for
write_in_full()).
[1] A careful reader may notice there is one way that
write_in_full() can return a different value. If we ask
write() to write N bytes and get a return value that is
_larger_ than N, we could return a larger total. But
besides the fact that this would imply a totally broken
version of write(), it would already invoke undefined
behavior. Our internal remaining counter is an unsigned
size_t, which means that subtracting too many bytes will
wrap it around to a very large number. So we'll instantly
begin reading off the end of the buffer, trying to write
gigabytes (or petabytes) of data.
Signed-off-by: Jeff King <peff@peff.net>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-13 19:16:03 +02:00
|
|
|
if (write_in_full(fd, write_buffer, buffered) < 0)
|
2006-08-08 23:47:32 +02:00
|
|
|
return -1;
|
|
|
|
write_buffer_len = 0;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
/*
 * Append "len" bytes from "data" to the static write_buffer.  Each time
 * the buffer reaches WRITE_BUFFER_SIZE it is handed to ce_write_flush().
 *
 * Returns 0 on success, -1 as soon as a flush fails.
 */
static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
{
	const char *src = data;

	while (len) {
		unsigned int used = write_buffer_len;
		unsigned int chunk = WRITE_BUFFER_SIZE - used;

		/* Copy no more than what is left of the input. */
		if (len < chunk)
			chunk = len;
		memcpy(write_buffer + used, src, chunk);
		write_buffer_len = used + chunk;

		if (write_buffer_len == WRITE_BUFFER_SIZE) {
			if (ce_write_flush(context, fd))
				return -1;
			write_buffer_len = 0;
		}

		src += chunk;
		len -= chunk;
	}
	return 0;
}
|
|
|
|
|
2018-10-10 17:59:34 +02:00
|
|
|
/*
 * Emit the 8-byte header of an index extension: the extension
 * signature "ext" followed by its payload size "sz", both converted
 * with htonl().  When "eoie_context" is non-NULL the same eight bytes
 * are also fed into that hash context.
 *
 * Returns 0 on success, -1 if either ce_write() call fails.
 */
static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
				  int fd, unsigned int ext, unsigned int sz)
{
	ext = htonl(ext);
	sz = htonl(sz);

	if (eoie_context) {
		the_hash_algo->update_fn(eoie_context, &ext, 4);
		the_hash_algo->update_fn(eoie_context, &sz, 4);
	}

	if (ce_write(context, fd, &ext, 4) < 0)
		return -1;
	if (ce_write(context, fd, &sz, 4) < 0)
		return -1;
	return 0;
}
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
static int ce_flush(git_hash_ctx *context, int fd, unsigned char *hash)
|
2005-04-20 21:16:57 +02:00
|
|
|
{
|
|
|
|
unsigned int left = write_buffer_len;
|
2005-04-20 21:36:41 +02:00
|
|
|
|
2005-04-20 21:16:57 +02:00
|
|
|
if (left) {
|
|
|
|
write_buffer_len = 0;
|
2018-02-01 03:18:45 +01:00
|
|
|
the_hash_algo->update_fn(context, write_buffer, left);
|
2005-04-20 21:16:57 +02:00
|
|
|
}
|
2005-04-20 21:36:41 +02:00
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
/* Flush first if not enough space for hash signature */
|
|
|
|
if (left + the_hash_algo->rawsz > WRITE_BUFFER_SIZE) {
|
avoid "write_in_full(fd, buf, len) != len" pattern
The return value of write_in_full() is either "-1", or the
requested number of bytes[1]. If we make a partial write
before seeing an error, we still return -1, not a partial
value. This goes back to f6aa66cb95 (write_in_full: really
write in full or return error on disk full., 2007-01-11).
So checking anything except "was the return value negative"
is pointless. And there are a couple of reasons not to do
so:
1. It can do a funny signed/unsigned comparison. If your
"len" is unsigned (e.g., a size_t) then the compiler will
promote the "-1" to its unsigned variant.
This works out for "!= len" (unless you really were
trying to write the maximum size_t bytes), but is a
bug if you check "< len" (an example of which was fixed
recently in config.c).
We should avoid promoting the mental model that you
need to check the length at all, so that new sites are
not tempted to copy us.
2. Checking for a negative value is shorter to type,
especially when the length is an expression.
3. Linus says so. In d34cf19b89 (Clean up write_in_full()
users, 2007-01-11), right after the write_in_full()
semantics were changed, he wrote:
I really wish every "write_in_full()" user would just
check against "<0" now, but this fixes the nasty and
stupid ones.
Appeals to authority aside, this makes it clear that
writing it this way does not have an intentional
benefit. It's a historical curiosity that we never
bothered to clean up (and which was undoubtedly
cargo-culted into new sites).
So let's convert these obviously-correct cases (this
includes write_str_in_full(), which is just a wrapper for
write_in_full()).
[1] A careful reader may notice there is one way that
write_in_full() can return a different value. If we ask
write() to write N bytes and get a return value that is
_larger_ than N, we could return a larger total. But
besides the fact that this would imply a totally broken
version of write(), it would already invoke undefined
behavior. Our internal remaining counter is an unsigned
size_t, which means that subtracting too many bytes will
wrap it around to a very large number. So we'll instantly
begin reading off the end of the buffer, trying to write
gigabytes (or petabytes) of data.
Signed-off-by: Jeff King <peff@peff.net>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-13 19:16:03 +02:00
|
|
|
if (write_in_full(fd, write_buffer, left) < 0)
|
2005-09-11 15:27:47 +02:00
|
|
|
return -1;
|
|
|
|
left = 0;
|
|
|
|
}
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
/* Append the hash signature at the end */
|
|
|
|
the_hash_algo->final_fn(write_buffer + left, context);
|
|
|
|
hashcpy(hash, write_buffer + left);
|
|
|
|
left += the_hash_algo->rawsz;
|
avoid "write_in_full(fd, buf, len) != len" pattern
The return value of write_in_full() is either "-1", or the
requested number of bytes[1]. If we make a partial write
before seeing an error, we still return -1, not a partial
value. This goes back to f6aa66cb95 (write_in_full: really
write in full or return error on disk full., 2007-01-11).
So checking anything except "was the return value negative"
is pointless. And there are a couple of reasons not to do
so:
1. It can do a funny signed/unsigned comparison. If your
"len" is unsigned (e.g., a size_t) then the compiler will
promote the "-1" to its unsigned variant.
This works out for "!= len" (unless you really were
trying to write the maximum size_t bytes), but is a
bug if you check "< len" (an example of which was fixed
recently in config.c).
We should avoid promoting the mental model that you
need to check the length at all, so that new sites are
not tempted to copy us.
2. Checking for a negative value is shorter to type,
especially when the length is an expression.
3. Linus says so. In d34cf19b89 (Clean up write_in_full()
users, 2007-01-11), right after the write_in_full()
semantics were changed, he wrote:
I really wish every "write_in_full()" user would just
check against "<0" now, but this fixes the nasty and
stupid ones.
Appeals to authority aside, this makes it clear that
writing it this way does not have an intentional
benefit. It's a historical curiosity that we never
bothered to clean up (and which was undoubtedly
cargo-culted into new sites).
So let's convert these obviously-correct cases (this
includes write_str_in_full(), which is just a wrapper for
write_in_full()).
[1] A careful reader may notice there is one way that
write_in_full() can return a different value. If we ask
write() to write N bytes and get a return value that is
_larger_ than N, we could return a larger total. But
besides the fact that this would imply a totally broken
version of write(), it would already invoke undefined
behavior. Our internal remaining counter is an unsigned
size_t, which means that subtracting too many bytes will
wrap it around to a very large number. So we'll instantly
begin reading off the end of the buffer, trying to write
gigabytes (or petabytes) of data.
Signed-off-by: Jeff King <peff@peff.net>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-13 19:16:03 +02:00
|
|
|
return (write_in_full(fd, write_buffer, left) < 0) ? -1 : 0;
|
2005-04-20 21:16:57 +02:00
|
|
|
}
|
|
|
|
|
2018-09-21 17:57:31 +02:00
|
|
|
static void ce_smudge_racily_clean_entry(struct index_state *istate,
|
|
|
|
struct cache_entry *ce)
|
2005-12-20 21:12:18 +01:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The only thing we care about in this function is to smudge the
|
|
|
|
* falsely clean entry due to touch-update-touch race, so we leave
|
|
|
|
* everything else as they are. We are called for entries whose
|
2013-06-20 10:37:50 +02:00
|
|
|
* ce_stat_data.sd_mtime match the index file mtime.
|
2008-07-29 10:13:44 +02:00
|
|
|
*
|
|
|
|
* Note that this actually does not do much for gitlinks, for
|
|
|
|
* which ce_match_stat_basic() always goes to the actual
|
|
|
|
* contents. The caller checks with is_racy_timestamp() which
|
|
|
|
* always says "no" for gitlinks, so we are not called for them ;-)
|
2005-12-20 21:12:18 +01:00
|
|
|
*/
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (lstat(ce->name, &st) < 0)
|
|
|
|
return;
|
|
|
|
if (ce_match_stat_basic(ce, &st))
|
|
|
|
return;
|
2018-09-21 17:57:31 +02:00
|
|
|
if (ce_modified_check_fs(istate, ce, &st)) {
|
2005-12-20 23:18:47 +01:00
|
|
|
/* This is "racily clean"; smudge it. Note that this
|
|
|
|
* is a tricky code. At first glance, it may appear
|
|
|
|
* that it can break with this sequence:
|
|
|
|
*
|
|
|
|
* $ echo xyzzy >frotz
|
|
|
|
* $ git-update-index --add frotz
|
|
|
|
* $ : >frotz
|
|
|
|
* $ sleep 3
|
|
|
|
* $ echo filfre >nitfol
|
|
|
|
* $ git-update-index --add nitfol
|
|
|
|
*
|
2006-08-05 13:16:02 +02:00
|
|
|
* but it does not. When the second update-index runs,
|
2005-12-20 23:18:47 +01:00
|
|
|
* it notices that the entry "frotz" has the same timestamp
|
|
|
|
* as index, and if we were to smudge it by resetting its
|
|
|
|
* size to zero here, then the object name recorded
|
|
|
|
* in index is the 6-byte file but the cached stat information
|
|
|
|
* becomes zero --- which would then match what we would
|
2007-06-07 09:04:01 +02:00
|
|
|
* obtain from the filesystem next time we stat("frotz").
|
2005-12-20 23:18:47 +01:00
|
|
|
*
|
|
|
|
* However, the second update-index, before calling
|
|
|
|
* this function, notices that the cached size is 6
|
|
|
|
* bytes and what is on the filesystem is an empty
|
|
|
|
* file, and never calls us, so the cached size information
|
|
|
|
* for "frotz" stays 6 which does not match the filesystem.
|
|
|
|
*/
|
2013-06-20 10:37:50 +02:00
|
|
|
ce->ce_stat_data.sd_size = 0;
|
2005-12-20 21:12:18 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-04-04 00:53:14 +02:00
|
|
|
/* Copy miscellaneous fields but not the name */
|
2017-08-21 23:24:32 +02:00
|
|
|
static void copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk,
|
2012-04-04 00:53:14 +02:00
|
|
|
struct cache_entry *ce)
|
2008-01-15 01:03:17 +01:00
|
|
|
{
|
2012-07-11 11:22:37 +02:00
|
|
|
short flags;
|
2019-02-19 01:05:24 +01:00
|
|
|
const unsigned hashsz = the_hash_algo->rawsz;
|
|
|
|
uint16_t *flagsp = (uint16_t *)(ondisk->data + hashsz);
|
2012-07-11 11:22:37 +02:00
|
|
|
|
2013-06-20 10:37:50 +02:00
|
|
|
ondisk->ctime.sec = htonl(ce->ce_stat_data.sd_ctime.sec);
|
|
|
|
ondisk->mtime.sec = htonl(ce->ce_stat_data.sd_mtime.sec);
|
|
|
|
ondisk->ctime.nsec = htonl(ce->ce_stat_data.sd_ctime.nsec);
|
|
|
|
ondisk->mtime.nsec = htonl(ce->ce_stat_data.sd_mtime.nsec);
|
|
|
|
ondisk->dev = htonl(ce->ce_stat_data.sd_dev);
|
|
|
|
ondisk->ino = htonl(ce->ce_stat_data.sd_ino);
|
2008-01-15 01:03:17 +01:00
|
|
|
ondisk->mode = htonl(ce->ce_mode);
|
2013-06-20 10:37:50 +02:00
|
|
|
ondisk->uid = htonl(ce->ce_stat_data.sd_uid);
|
|
|
|
ondisk->gid = htonl(ce->ce_stat_data.sd_gid);
|
|
|
|
ondisk->size = htonl(ce->ce_stat_data.sd_size);
|
2019-02-19 01:05:24 +01:00
|
|
|
hashcpy(ondisk->data, ce->oid.hash);
|
2012-07-11 11:22:37 +02:00
|
|
|
|
2014-06-13 14:19:25 +02:00
|
|
|
flags = ce->ce_flags & ~CE_NAMEMASK;
|
2012-07-11 11:22:37 +02:00
|
|
|
flags |= (ce_namelen(ce) >= CE_NAMEMASK ? CE_NAMEMASK : ce_namelen(ce));
|
2019-02-19 01:05:24 +01:00
|
|
|
flagsp[0] = htons(flags);
|
2008-10-01 06:04:01 +02:00
|
|
|
if (ce->ce_flags & CE_EXTENDED) {
|
2019-02-19 01:05:24 +01:00
|
|
|
flagsp[1] = htons((ce->ce_flags & CE_EXTENDED_FLAGS) >> 16);
|
2012-04-04 00:53:14 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
|
2017-08-21 23:24:32 +02:00
|
|
|
struct strbuf *previous_name, struct ondisk_cache_entry *ondisk)
|
2012-04-04 00:53:14 +02:00
|
|
|
{
|
2012-04-04 18:12:43 +02:00
|
|
|
int size;
|
2012-04-04 00:53:14 +02:00
|
|
|
int result;
|
read-cache: fix an -Wmaybe-uninitialized warning
The function ce_write_entry() uses a 'self-initialised' variable
construct, for the symbol 'saved_namelen', to suppress a gcc
'-Wmaybe-uninitialized' warning, given that the warning is a false
positive.
For the purposes of this discussion, the ce_write_entry() function has
three code blocks of interest, that look like so:
/* block #1 */
if (ce->ce_flags & CE_STRIP_NAME) {
saved_namelen = ce_namelen(ce);
ce->ce_namelen = 0;
}
/* block #2 */
/*
* several code blocks that contain, among others, calls
* to copy_cache_entry_to_ondisk(ondisk, ce);
*/
/* block #3 */
if (ce->ce_flags & CE_STRIP_NAME) {
ce->ce_namelen = saved_namelen;
ce->ce_flags &= ~CE_STRIP_NAME;
}
The warning implies that gcc thinks it is possible that the first
block is not entered, the calls to copy_cache_entry_to_ondisk()
could toggle the CE_STRIP_NAME flag on, thereby entering block #3
with saved_namelen unset. However, the copy_cache_entry_to_ondisk()
function does not write to ce->ce_flags (it only reads). gcc could
easily determine this, since that function is local to this file,
but it obviously doesn't.
In order to suppress this warning, we make it clear to the reader
(human and compiler), that block #3 will only be entered when the
first block has been entered, by introducing a new 'stripped_name'
boolean variable. We also take the opportunity to change the type
of 'saved_namelen' to 'unsigned int' to match ce->ce_namelen.
Signed-off-by: Ramsay Jones <ramsay@ramsayjones.plus.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-03-19 18:56:11 +01:00
|
|
|
unsigned int saved_namelen;
|
|
|
|
int stripped_name = 0;
|
2017-08-21 23:24:32 +02:00
|
|
|
static unsigned char padding[8] = { 0x00 };
|
2012-04-04 00:53:14 +02:00
|
|
|
|
2014-06-13 14:19:43 +02:00
|
|
|
if (ce->ce_flags & CE_STRIP_NAME) {
|
|
|
|
saved_namelen = ce_namelen(ce);
|
|
|
|
ce->ce_namelen = 0;
|
read-cache: fix an -Wmaybe-uninitialized warning
The function ce_write_entry() uses a 'self-initialised' variable
construct, for the symbol 'saved_namelen', to suppress a gcc
'-Wmaybe-uninitialized' warning, given that the warning is a false
positive.
For the purposes of this discussion, the ce_write_entry() function has
three code blocks of interest, that look like so:
/* block #1 */
if (ce->ce_flags & CE_STRIP_NAME) {
saved_namelen = ce_namelen(ce);
ce->ce_namelen = 0;
}
/* block #2 */
/*
* several code blocks that contain, among others, calls
* to copy_cache_entry_to_ondisk(ondisk, ce);
*/
/* block #3 */
if (ce->ce_flags & CE_STRIP_NAME) {
ce->ce_namelen = saved_namelen;
ce->ce_flags &= ~CE_STRIP_NAME;
}
The warning implies that gcc thinks it is possible that the first
block is not entered, the calls to copy_cache_entry_to_ondisk()
could toggle the CE_STRIP_NAME flag on, thereby entering block #3
with saved_namelen unset. However, the copy_cache_entry_to_ondisk()
function does not write to ce->ce_flags (it only reads). gcc could
easily determine this, since that function is local to this file,
but it obviously doesn't.
In order to suppress this warning, we make it clear to the reader
(human and compiler), that block #3 will only be entered when the
first block has been entered, by introducing a new 'stripped_name'
boolean variable. We also take the opportunity to change the type
of 'saved_namelen' to 'unsigned int' to match ce->ce_namelen.
Signed-off-by: Ramsay Jones <ramsay@ramsayjones.plus.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-03-19 18:56:11 +01:00
|
|
|
stripped_name = 1;
|
2014-06-13 14:19:43 +02:00
|
|
|
}
|
|
|
|
|
2019-02-19 01:05:24 +01:00
|
|
|
size = offsetof(struct ondisk_cache_entry,data) + ondisk_data_size(ce->ce_flags, 0);
|
2017-08-21 23:24:32 +02:00
|
|
|
|
2012-04-04 18:12:43 +02:00
|
|
|
if (!previous_name) {
|
2017-08-21 23:24:32 +02:00
|
|
|
int len = ce_namelen(ce);
|
|
|
|
copy_cache_entry_to_ondisk(ondisk, ce);
|
|
|
|
result = ce_write(c, fd, ondisk, size);
|
|
|
|
if (!result)
|
|
|
|
result = ce_write(c, fd, ce->name, len);
|
|
|
|
if (!result)
|
|
|
|
result = ce_write(c, fd, padding, align_padding_size(size, len));
|
2012-04-04 18:12:43 +02:00
|
|
|
} else {
|
|
|
|
int common, to_remove, prefix_size;
|
|
|
|
unsigned char to_remove_vi[16];
|
|
|
|
for (common = 0;
|
|
|
|
(ce->name[common] &&
|
|
|
|
common < previous_name->len &&
|
|
|
|
ce->name[common] == previous_name->buf[common]);
|
|
|
|
common++)
|
|
|
|
; /* still matching */
|
|
|
|
to_remove = previous_name->len - common;
|
|
|
|
prefix_size = encode_varint(to_remove, to_remove_vi);
|
|
|
|
|
2017-08-21 23:24:32 +02:00
|
|
|
copy_cache_entry_to_ondisk(ondisk, ce);
|
|
|
|
result = ce_write(c, fd, ondisk, size);
|
|
|
|
if (!result)
|
|
|
|
result = ce_write(c, fd, to_remove_vi, prefix_size);
|
|
|
|
if (!result)
|
2017-09-07 21:24:12 +02:00
|
|
|
result = ce_write(c, fd, ce->name + common, ce_namelen(ce) - common);
|
|
|
|
if (!result)
|
|
|
|
result = ce_write(c, fd, padding, 1);
|
2012-04-04 18:12:43 +02:00
|
|
|
|
|
|
|
strbuf_splice(previous_name, common, to_remove,
|
|
|
|
ce->name + common, ce_namelen(ce) - common);
|
2008-10-01 06:04:01 +02:00
|
|
|
}
|
read-cache: fix an -Wmaybe-uninitialized warning
The function ce_write_entry() uses a 'self-initialised' variable
construct, for the symbol 'saved_namelen', to suppress a gcc
'-Wmaybe-uninitialized' warning, given that the warning is a false
positive.
For the purposes of this discussion, the ce_write_entry() function has
three code blocks of interest, that look like so:
/* block #1 */
if (ce->ce_flags & CE_STRIP_NAME) {
saved_namelen = ce_namelen(ce);
ce->ce_namelen = 0;
}
/* block #2 */
/*
* several code blocks that contain, among others, calls
* to copy_cache_entry_to_ondisk(ondisk, ce);
*/
/* block #3 */
if (ce->ce_flags & CE_STRIP_NAME) {
ce->ce_namelen = saved_namelen;
ce->ce_flags &= ~CE_STRIP_NAME;
}
The warning implies that gcc thinks it is possible that the first
block is not entered, the calls to copy_cache_entry_to_ondisk()
could toggle the CE_STRIP_NAME flag on, thereby entering block #3
with saved_namelen unset. However, the copy_cache_entry_to_ondisk()
function does not write to ce->ce_flags (it only reads). gcc could
easily determine this, since that function is local to this file,
but it obviously doesn't.
In order to suppress this warning, we make it clear to the reader
(human and compiler), that block #3 will only be entered when the
first block has been entered, by introducing a new 'stripped_name'
boolean variable. We also take the opportunity to change the type
of 'saved_namelen' to 'unsigned int' to match ce->ce_namelen.
Signed-off-by: Ramsay Jones <ramsay@ramsayjones.plus.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-03-19 18:56:11 +01:00
|
|
|
if (stripped_name) {
|
2014-06-13 14:19:43 +02:00
|
|
|
ce->ce_namelen = saved_namelen;
|
|
|
|
ce->ce_flags &= ~CE_STRIP_NAME;
|
|
|
|
}
|
2008-01-15 01:03:17 +01:00
|
|
|
|
2010-08-10 05:28:07 +02:00
|
|
|
return result;
|
2008-01-15 01:03:17 +01:00
|
|
|
}
|
|
|
|
|
2014-04-10 20:31:21 +02:00
|
|
|
/*
|
|
|
|
* This function verifies if index_state has the correct sha1 of the
|
|
|
|
* index file. Don't die if we have any other failure, just return 0.
|
|
|
|
*/
|
|
|
|
static int verify_index_from(const struct index_state *istate, const char *path)
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
ssize_t n;
|
|
|
|
struct stat st;
|
2018-02-01 03:18:45 +01:00
|
|
|
unsigned char hash[GIT_MAX_RAWSZ];
|
2014-04-10 20:31:21 +02:00
|
|
|
|
|
|
|
if (!istate->initialized)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fd = open(path, O_RDONLY);
|
|
|
|
if (fd < 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (fstat(fd, &st))
|
|
|
|
goto out;
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
if (st.st_size < sizeof(struct cache_header) + the_hash_algo->rawsz)
|
2014-04-10 20:31:21 +02:00
|
|
|
goto out;
|
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
n = pread_in_full(fd, hash, the_hash_algo->rawsz, st.st_size - the_hash_algo->rawsz);
|
|
|
|
if (n != the_hash_algo->rawsz)
|
2014-04-10 20:31:21 +02:00
|
|
|
goto out;
|
|
|
|
|
2018-08-28 23:22:52 +02:00
|
|
|
if (!hasheq(istate->oid.hash, hash))
|
2014-04-10 20:31:21 +02:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
close(fd);
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
out:
|
|
|
|
close(fd);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-01-12 03:13:27 +01:00
|
|
|
static int repo_verify_index(struct repository *repo)
|
2014-04-10 20:31:21 +02:00
|
|
|
{
|
2019-01-12 03:13:27 +01:00
|
|
|
return verify_index_from(repo->index, repo->index_file);
|
2014-04-10 20:31:21 +02:00
|
|
|
}
|
|
|
|
|
2011-03-21 18:18:19 +01:00
|
|
|
static int has_racy_timestamp(struct index_state *istate)
|
|
|
|
{
|
|
|
|
int entries = istate->cache_nr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < entries; i++) {
|
|
|
|
struct cache_entry *ce = istate->cache[i];
|
|
|
|
if (is_racy_timestamp(istate, ce))
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-01-12 03:13:27 +01:00
|
|
|
void repo_update_index_if_able(struct repository *repo,
|
|
|
|
struct lock_file *lockfile)
|
2011-03-21 18:16:10 +01:00
|
|
|
{
|
2019-01-12 03:13:27 +01:00
|
|
|
if ((repo->index->cache_changed ||
|
|
|
|
has_racy_timestamp(repo->index)) &&
|
|
|
|
repo_verify_index(repo))
|
|
|
|
write_locked_index(repo->index, lockfile, COMMIT_LOCK);
|
2017-10-06 22:12:14 +02:00
|
|
|
else
|
|
|
|
rollback_lock_file(lockfile);
|
2011-03-21 18:16:10 +01:00
|
|
|
}
|
|
|
|
|
2018-11-20 07:11:47 +01:00
|
|
|
/*
 * Decide whether the end of index entries (EOIE) extension should be
 * written.
 *
 * An explicit index.recordEndOfIndexEntries setting always wins.
 * Without one, the extension used for threading is written as a
 * convenience when the user explicitly requested threaded index
 * reads, i.e. when git_config_get_index_threads() succeeds and the
 * resulting thread count is anything other than 1.
 */
static int record_eoie(void)
{
	int explicit_choice;
	int threads;

	if (git_config_get_bool("index.recordendofindexentries",
				&explicit_choice) == 0)
		return explicit_choice;

	/* no usable index.threads value: do not write the extension */
	if (git_config_get_index_threads(&threads))
		return 0;

	return threads != 1;
}
|
|
|
|
|
2018-11-20 07:12:22 +01:00
|
|
|
/*
 * Decide whether the index entry offset table (IEOT) extension should
 * be written.
 *
 * An explicit index.recordOffsetTable setting always wins. Without
 * one, the offset table used for threading is written as a convenience
 * when the user explicitly requested threaded index reads, i.e. when
 * git_config_get_index_threads() succeeds and the resulting thread
 * count is anything other than 1.
 */
static int record_ieot(void)
{
	int explicit_choice;
	int threads;

	if (git_config_get_bool("index.recordoffsettable",
				&explicit_choice) == 0)
		return explicit_choice;

	/* no usable index.threads value: do not write the offset table */
	if (git_config_get_index_threads(&threads))
		return 0;

	return threads != 1;
}
|
|
|
|
|
read-cache: drop explicit `CLOSE_LOCK`-flag
`write_locked_index()` takes two flags: `COMMIT_LOCK` and `CLOSE_LOCK`.
At most one is allowed. But it is also possible to use no flag, i.e.,
`0`. But when `write_locked_index()` calls `do_write_index()`, the
temporary file, a.k.a. the lockfile, will be closed. So passing `0` is
effectively the same as `CLOSE_LOCK`, which seems like a bug.
We might feel tempted to restructure the code in order to close the file
later, or conditionally. It also feels a bit unfortunate that we simply
"happen" to close the lock by way of an implementation detail of
lockfiles. But note that we need to close the temporary file before
`stat`-ing it, at least on Windows. See 9f41c7a6b (read-cache: close
index.lock in do_write_index, 2017-04-26).
Drop `CLOSE_LOCK` and make it explicit that `write_locked_index()`
always closes the lock. Whether it is also committed is governed by the
remaining flag, `COMMIT_LOCK`.
This means we neither have nor suggest that we have a mode to write the
index and leave the file open. Whatever extra contents we might
eventually want to write, we should probably write it from within
`write_locked_index()` itself anyway.
Signed-off-by: Martin Ågren <martin.agren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-10-06 22:12:12 +02:00
|
|
|
/*
|
|
|
|
* On success, `tempfile` is closed. If it is the temporary file
|
|
|
|
* of a `struct lock_file`, we will therefore effectively perform
|
|
|
|
* a `close_lock_file_gently()`. Since that is an implementation
|
|
|
|
* detail of lockfiles, callers of `do_write_index()` should not
|
|
|
|
* rely on it.
|
|
|
|
*/
|
2017-04-26 22:05:23 +02:00
|
|
|
static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
|
2014-06-13 14:19:44 +02:00
|
|
|
int strip_extensions)
|
2005-04-09 21:09:27 +02:00
|
|
|
{
|
2018-01-27 13:27:56 +01:00
|
|
|
uint64_t start = getnanotime();
|
2017-04-26 22:05:23 +02:00
|
|
|
int newfd = tempfile->fd;
|
2018-10-10 17:59:34 +02:00
|
|
|
git_hash_ctx c, eoie_c;
|
2005-04-09 21:09:27 +02:00
|
|
|
struct cache_header hdr;
|
2017-08-21 23:24:31 +02:00
|
|
|
int i, err = 0, removed, extended, hdr_version;
|
2007-04-02 08:26:07 +02:00
|
|
|
struct cache_entry **cache = istate->cache;
|
|
|
|
int entries = istate->cache_nr;
|
write_index(): update index_state->timestamp after flushing to disk
Since this timestamp is used to check for racy-clean files, it is
important to keep it uptodate.
For the 'git checkout' command without the '-q' option, this make a
huge difference. Before, each and every file which was updated, was
racy-clean after the call to unpack_trees() and write_index() but
before the GIT process ended.
And because of the call to show_local_changes() in builtin-checkout.c,
we ended up reading those files back into memory, doing a SHA1 to
check if the files was really different from the index. And, of
course, no file was different.
With this fix, 'git checkout' without the '-q' option should now be
almost as fast as with the '-q' option, but not quite, as we still do
some few lstat(2) calls more without the '-q' option.
Below is some average numbers for 10 checkout's to v2.6.27 and 10 to
v2.6.25 of the Linux kernel, to show the difference:
before (git version 1.6.2.rc1.256.g58a87):
7.860 user 2.427 sys 19.465 real 52.8% CPU faults: 0 major 95331 minor
after:
6.184 user 2.160 sys 17.619 real 47.4% CPU faults: 0 major 38994 minor
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-23 19:02:57 +01:00
|
|
|
struct stat st;
|
2019-02-19 01:05:24 +01:00
|
|
|
struct ondisk_cache_entry ondisk;
|
2012-04-04 18:12:43 +02:00
|
|
|
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
|
2018-01-07 23:30:14 +01:00
|
|
|
int drop_cache_tree = istate->drop_cache_tree;
|
2018-10-10 17:59:34 +02:00
|
|
|
off_t offset;
|
2018-10-10 17:59:38 +02:00
|
|
|
int ieot_entries = 1;
|
2018-10-10 17:59:37 +02:00
|
|
|
struct index_entry_offset_table *ieot = NULL;
|
|
|
|
int nr, nr_threads;
|
2005-06-10 10:32:37 +02:00
|
|
|
|
2008-10-01 06:04:01 +02:00
|
|
|
for (i = removed = extended = 0; i < entries; i++) {
|
2008-01-15 01:03:17 +01:00
|
|
|
if (cache[i]->ce_flags & CE_REMOVE)
|
2005-06-10 10:32:37 +02:00
|
|
|
removed++;
|
2005-04-09 21:09:27 +02:00
|
|
|
|
2008-10-01 06:04:01 +02:00
|
|
|
/* reduce extended entries if possible */
|
|
|
|
cache[i]->ce_flags &= ~CE_EXTENDED;
|
|
|
|
if (cache[i]->ce_flags & CE_EXTENDED_FLAGS) {
|
|
|
|
extended++;
|
|
|
|
cache[i]->ce_flags |= CE_EXTENDED;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-13 14:19:49 +02:00
|
|
|
if (!istate->version) {
|
2019-08-13 20:37:43 +02:00
|
|
|
istate->version = get_index_format_default(the_repository);
|
2018-04-14 17:34:59 +02:00
|
|
|
if (git_env_bool("GIT_TEST_SPLIT_INDEX", 0))
|
2014-06-13 14:19:49 +02:00
|
|
|
init_split_index(istate);
|
|
|
|
}
|
2012-04-04 18:12:43 +02:00
|
|
|
|
|
|
|
/* demote version 3 to version 2 when the latter suffices */
|
|
|
|
if (istate->version == 3 || istate->version == 2)
|
|
|
|
istate->version = extended ? 3 : 2;
|
|
|
|
|
|
|
|
hdr_version = istate->version;
|
|
|
|
|
2005-04-15 19:44:27 +02:00
|
|
|
hdr.hdr_signature = htonl(CACHE_SIGNATURE);
|
2012-04-04 18:12:43 +02:00
|
|
|
hdr.hdr_version = htonl(hdr_version);
|
2005-06-10 10:32:37 +02:00
|
|
|
hdr.hdr_entries = htonl(entries - removed);
|
2005-04-09 21:09:27 +02:00
|
|
|
|
2018-02-01 03:18:45 +01:00
|
|
|
the_hash_algo->init_fn(&c);
|
2005-04-20 21:36:41 +02:00
|
|
|
if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
|
2005-04-09 21:09:27 +02:00
|
|
|
return -1;
|
|
|
|
|
index: make index.threads=true enable ieot and eoie
If a user explicitly sets
[index]
threads = true
to read the index using multiple threads, ensure that index writes
include the offset table by default to make that possible. This
ensures that the user's intent of turning on threading is respected.
In other words, permit the following configurations:
- index.threads and index.recordOffsetTable unspecified: do not write
the offset table yet (to avoid alarming the user with "ignoring IEOT
extension" messages when an older version of Git accesses the
repository) but do make use of multiple threads to read the index if
the supporting offset table is present.
This can also be requested explicitly by setting index.threads=true,
0, or >1 and index.recordOffsetTable=false.
- index.threads=false or 1: do not write the offset table, and do not
make use of the offset table.
One can set index.recordOffsetTable=false as well, to be more
explicit.
- index.threads=true, 0, or >1 and index.recordOffsetTable unspecified:
write the offset table and make use of threads at read time.
This can also be requested by setting index.threads=true, 0, >1, or
unspecified and index.recordOffsetTable=true.
Fortunately the complication is temporary: once most Git installations
have upgraded to a version with support for the IEOT and EOIE
extensions, we can flip the defaults for index.recordEndOfIndexEntries
and index.recordOffsetTable to true and eliminate the settings.
Helped-by: Ben Peart <benpeart@microsoft.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-20 07:14:26 +01:00
|
|
|
if (!HAVE_THREADS || git_config_get_index_threads(&nr_threads))
|
2018-11-03 09:48:48 +01:00
|
|
|
nr_threads = 1;
|
2018-11-03 09:48:47 +01:00
|
|
|
|
2018-11-20 07:12:22 +01:00
|
|
|
if (nr_threads != 1 && record_ieot()) {
|
2018-10-10 17:59:37 +02:00
|
|
|
int ieot_blocks, cpus;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ensure default number of ieot blocks maps evenly to the
|
|
|
|
* default number of threads that will process them leaving
|
|
|
|
* room for the thread to load the index extensions.
|
|
|
|
*/
|
|
|
|
if (!nr_threads) {
|
|
|
|
ieot_blocks = istate->cache_nr / THREAD_COST;
|
|
|
|
cpus = online_cpus();
|
|
|
|
if (ieot_blocks > cpus - 1)
|
|
|
|
ieot_blocks = cpus - 1;
|
|
|
|
} else {
|
|
|
|
ieot_blocks = nr_threads;
|
2018-10-10 17:59:38 +02:00
|
|
|
if (ieot_blocks > istate->cache_nr)
|
|
|
|
ieot_blocks = istate->cache_nr;
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* no reason to write out the IEOT extension if we don't
|
|
|
|
* have enough blocks to utilize multi-threading
|
|
|
|
*/
|
|
|
|
if (ieot_blocks > 1) {
|
|
|
|
ieot = xcalloc(1, sizeof(struct index_entry_offset_table)
|
|
|
|
+ (ieot_blocks * sizeof(struct index_entry_offset)));
|
2018-10-10 17:59:38 +02:00
|
|
|
ieot_entries = DIV_ROUND_UP(entries, ieot_blocks);
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-10 17:59:34 +02:00
|
|
|
offset = lseek(newfd, 0, SEEK_CUR);
|
2018-10-10 17:59:37 +02:00
|
|
|
if (offset < 0) {
|
|
|
|
free(ieot);
|
2018-10-10 17:59:34 +02:00
|
|
|
return -1;
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|
2018-10-10 17:59:34 +02:00
|
|
|
offset += write_buffer_len;
|
2018-10-10 17:59:37 +02:00
|
|
|
nr = 0;
|
2012-04-04 18:12:43 +02:00
|
|
|
previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
|
2017-08-21 23:24:32 +02:00
|
|
|
|
2005-04-09 21:09:27 +02:00
|
|
|
for (i = 0; i < entries; i++) {
|
|
|
|
struct cache_entry *ce = cache[i];
|
2008-01-15 01:03:17 +01:00
|
|
|
if (ce->ce_flags & CE_REMOVE)
|
2005-06-10 00:34:04 +02:00
|
|
|
continue;
|
2008-03-30 18:25:52 +02:00
|
|
|
if (!ce_uptodate(ce) && is_racy_timestamp(istate, ce))
|
2018-09-21 17:57:31 +02:00
|
|
|
ce_smudge_racily_clean_entry(istate, ce);
|
2016-09-05 22:07:52 +02:00
|
|
|
if (is_null_oid(&ce->oid)) {
|
write_index: optionally allow broken null sha1s
Commit 4337b58 (do not write null sha1s to on-disk index,
2012-07-28) added a safety check preventing git from writing
null sha1s into the index. The intent was to catch errors in
other parts of the code that might let such an entry slip
into the index (or worse, a tree).
Some existing repositories may have invalid trees that
contain null sha1s already, though. Until 4337b58, a common
way to clean this up would be to use git-filter-branch's
index-filter to repair such broken entries. That now fails
when filter-branch tries to write out the index.
Introduce a GIT_ALLOW_NULL_SHA1 environment variable to
relax this check and make it easier to recover from such a
history.
It is tempting to not involve filter-branch in this commit
at all, and instead require the user to manually invoke
GIT_ALLOW_NULL_SHA1=1 git filter-branch ...
to perform an index-filter on a history with trees with null
sha1s. That would be slightly safer, but requires some
specialized knowledge from the user. So let's set the
GIT_ALLOW_NULL_SHA1 variable automatically when checking out
the to-be-filtered trees. Advice on using filter-branch to
remove such entries already exists on places like
stackoverflow, and this patch makes it Just Work again on
recent versions of git.
Further commands that touch the index will still notice and
fail, unless they actually remove the broken entries. A
filter-branch whose filters do not touch the index at all
will not error out (since we complain of the null sha1 only
on writing, not when making a tree out of the index), but
this is acceptable, as we still print a loud warning, so the
problem is unlikely to go unnoticed.
Signed-off-by: Jeff King <peff@peff.net>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-08-27 22:41:12 +02:00
|
|
|
static const char msg[] = "cache entry has null sha1: %s";
|
|
|
|
static int allow = -1;
|
|
|
|
|
|
|
|
if (allow < 0)
|
|
|
|
allow = git_env_bool("GIT_ALLOW_NULL_SHA1", 0);
|
|
|
|
if (allow)
|
|
|
|
warning(msg, ce->name);
|
|
|
|
else
|
2017-08-21 23:24:31 +02:00
|
|
|
err = error(msg, ce->name);
|
cache-tree: reject entries with null sha1
We generally disallow null sha1s from entering the index,
due to 4337b5856 (do not write null sha1s to on-disk index,
2012-07-28). However, we loosened that in 83bd7437c
(write_index: optionally allow broken null sha1s,
2013-08-27) so that tools like filter-branch could be used
to repair broken history.
However, we should make sure that these broken entries do
not get propagated into new trees. For most entries, we'd
catch them with the missing-object check (since presumably
the null sha1 does not exist in our object database). But
gitlink entries do not need reachability, so we may blindly
copy the entry into a bogus tree.
This patch rejects all null sha1s (with the same "invalid
entry" message that missing objects get) when building trees
from the index. It does so even for non-gitlinks, and even
when "write-tree" is given the --missing-ok flag. The null
sha1 is a special sentinel value that is already rejected in
trees by fsck; whether the object exists or not, it is an
error to put it in a tree.
Note that for this to work, we must also avoid reusing an
existing cache-tree that contains the null sha1. This patch
does so by just refusing to write out any cache tree when
the index contains a null sha1. This is blunter than we need
to be; we could just reject the subtree that contains the
offending entry. But it's not worth the complexity. The
behavior is unchanged unless you have a broken index entry,
and even then we'd refuse the whole index write unless the
emergency GIT_ALLOW_NULL_SHA1 is in use. And even then the
end result is only a performance drop (any write-tree will
have to generate the whole cache-tree from scratch).
The tests bear some explanation.
The existing test in t7009 doesn't catch this problem,
because our index-filter runs "git rm --cached", which will
try to rewrite the updated index and barf on the bogus
entry. So we never even make it to write-tree. The new test
there adds a noop index-filter, which does show the problem.
The new tests in t1601 are slightly redundant with what
filter-branch is doing under the hood in t7009. But as
they're much more direct, they're easier to reason about.
And should filter-branch ever change or go away, we'd want
to make sure that these plumbing commands behave sanely.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-04-21 20:46:17 +02:00
|
|
|
|
|
|
|
drop_cache_tree = 1;
|
write_index: optionally allow broken null sha1s
Commit 4337b58 (do not write null sha1s to on-disk index,
2012-07-28) added a safety check preventing git from writing
null sha1s into the index. The intent was to catch errors in
other parts of the code that might let such an entry slip
into the index (or worse, a tree).
Some existing repositories may have invalid trees that
contain null sha1s already, though. Until 4337b58, a common
way to clean this up would be to use git-filter-branch's
index-filter to repair such broken entries. That now fails
when filter-branch tries to write out the index.
Introduce a GIT_ALLOW_NULL_SHA1 environment variable to
relax this check and make it easier to recover from such a
history.
It is tempting to not involve filter-branch in this commit
at all, and instead require the user to manually invoke
GIT_ALLOW_NULL_SHA1=1 git filter-branch ...
to perform an index-filter on a history with trees with null
sha1s. That would be slightly safer, but requires some
specialized knowledge from the user. So let's set the
GIT_ALLOW_NULL_SHA1 variable automatically when checking out
the to-be-filtered trees. Advice on using filter-branch to
remove such entries already exists on places like
stackoverflow, and this patch makes it Just Work again on
recent versions of git.
Further commands that touch the index will still notice and
fail, unless they actually remove the broken entries. A
filter-branch whose filters do not touch the index at all
will not error out (since we complain of the null sha1 only
on writing, not when making a tree out of the index), but
this is acceptable, as we still print a loud warning, so the
problem is unlikely to go unnoticed.
Signed-off-by: Jeff King <peff@peff.net>
Reviewed-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-08-27 22:41:12 +02:00
|
|
|
}
|
2018-10-10 17:59:38 +02:00
|
|
|
if (ieot && i && (i % ieot_entries == 0)) {
|
2018-10-10 17:59:37 +02:00
|
|
|
ieot->entries[ieot->nr].nr = nr;
|
|
|
|
ieot->entries[ieot->nr].offset = offset;
|
|
|
|
ieot->nr++;
|
|
|
|
/*
|
|
|
|
* If we have a V4 index, set the first byte to an invalid
|
|
|
|
* character to ensure there is nothing common with the previous
|
|
|
|
* entry
|
|
|
|
*/
|
|
|
|
if (previous_name)
|
|
|
|
previous_name->buf[0] = 0;
|
|
|
|
nr = 0;
|
|
|
|
offset = lseek(newfd, 0, SEEK_CUR);
|
|
|
|
if (offset < 0) {
|
|
|
|
free(ieot);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
offset += write_buffer_len;
|
|
|
|
}
|
2017-08-21 23:24:32 +02:00
|
|
|
if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
|
2017-08-21 23:24:31 +02:00
|
|
|
err = -1;
|
|
|
|
|
|
|
|
if (err)
|
|
|
|
break;
|
2018-10-10 17:59:37 +02:00
|
|
|
nr++;
|
|
|
|
}
|
|
|
|
if (ieot && nr) {
|
|
|
|
ieot->entries[ieot->nr].nr = nr;
|
|
|
|
ieot->entries[ieot->nr].offset = offset;
|
|
|
|
ieot->nr++;
|
2005-04-09 21:09:27 +02:00
|
|
|
}
|
2012-04-04 18:12:43 +02:00
|
|
|
strbuf_release(&previous_name_buf);
|
2006-04-24 01:52:08 +02:00
|
|
|
|
2018-10-10 17:59:37 +02:00
|
|
|
if (err) {
|
|
|
|
free(ieot);
|
2017-08-21 23:24:31 +02:00
|
|
|
return err;
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|
2017-08-21 23:24:31 +02:00
|
|
|
|
2006-04-25 06:18:58 +02:00
|
|
|
/* Write extension data here */
|
2018-10-10 17:59:34 +02:00
|
|
|
offset = lseek(newfd, 0, SEEK_CUR);
|
2018-10-10 17:59:37 +02:00
|
|
|
if (offset < 0) {
|
|
|
|
free(ieot);
|
2018-10-10 17:59:34 +02:00
|
|
|
return -1;
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|
2018-10-10 17:59:34 +02:00
|
|
|
offset += write_buffer_len;
|
|
|
|
the_hash_algo->init_fn(&eoie_c);
|
|
|
|
|
2018-10-10 17:59:37 +02:00
|
|
|
/*
|
|
|
|
* Lets write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
|
|
|
|
* can minimize the number of extensions we have to scan through to
|
|
|
|
* find it during load. Write it out regardless of the
|
|
|
|
* strip_extensions parameter as we need it when loading the shared
|
|
|
|
* index.
|
|
|
|
*/
|
|
|
|
if (ieot) {
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
write_ieot_extension(&sb, ieot);
|
|
|
|
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
|
|
|
|
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
free(ieot);
|
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2019-02-13 10:51:29 +01:00
|
|
|
if (!strip_extensions && istate->split_index &&
|
|
|
|
!is_null_oid(&istate->split_index->base_oid)) {
|
2014-06-13 14:19:36 +02:00
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
err = write_link_extension(&sb, istate) < 0 ||
|
2018-10-10 17:59:34 +02:00
|
|
|
write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
|
2014-06-13 14:19:36 +02:00
|
|
|
sb.len) < 0 ||
|
|
|
|
ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
}
|
cache-tree: reject entries with null sha1
We generally disallow null sha1s from entering the index,
due to 4337b5856 (do not write null sha1s to on-disk index,
2012-07-28). However, we loosened that in 83bd7437c
(write_index: optionally allow broken null sha1s,
2013-08-27) so that tools like filter-branch could be used
to repair broken history.
However, we should make sure that these broken entries do
not get propagated into new trees. For most entries, we'd
catch them with the missing-object check (since presumably
the null sha1 does not exist in our object database). But
gitlink entries do not need reachability, so we may blindly
copy the entry into a bogus tree.
This patch rejects all null sha1s (with the same "invalid
entry" message that missing objects get) when building trees
from the index. It does so even for non-gitlinks, and even
when "write-tree" is given the --missing-ok flag. The null
sha1 is a special sentinel value that is already rejected in
trees by fsck; whether the object exists or not, it is an
error to put it in a tree.
Note that for this to work, we must also avoid reusing an
existing cache-tree that contains the null sha1. This patch
does so by just refusing to write out any cache tree when
the index contains a null sha1. This is blunter than we need
to be; we could just reject the subtree that contains the
offending entry. But it's not worth the complexity. The
behavior is unchanged unless you have a broken index entry,
and even then we'd refuse the whole index write unless the
emergency GIT_ALLOW_NULL_SHA1 is in use. And even then the
end result is only a performance drop (any write-tree will
have to generate the whole cache-tree from scratch).
The tests bear some explanation.
The existing test in t7009 doesn't catch this problem,
because our index-filter runs "git rm --cached", which will
try to rewrite the updated index and barf on the bogus
entry. So we never even make it to write-tree. The new test
there adds a noop index-filter, which does show the problem.
The new tests in t1601 are slightly redundant with what
filter-branch is doing under the hood in t7009. But as
they're much more direct, they're easier to reason about.
And should filter-branch ever change or go away, we'd want
to make sure that these plumbing commands behave sanely.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-04-21 20:46:17 +02:00
|
|
|
if (!strip_extensions && !drop_cache_tree && istate->cache_tree) {
|
2008-10-09 21:12:12 +02:00
|
|
|
struct strbuf sb = STRBUF_INIT;
|
2007-09-25 10:22:44 +02:00
|
|
|
|
|
|
|
cache_tree_write(&sb, istate->cache_tree);
|
2018-10-10 17:59:34 +02:00
|
|
|
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
|
2007-09-25 10:22:44 +02:00
|
|
|
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err)
|
2006-04-25 06:18:58 +02:00
|
|
|
return -1;
|
|
|
|
}
|
2014-06-13 14:19:44 +02:00
|
|
|
if (!strip_extensions && istate->resolve_undo) {
|
2009-12-25 09:30:51 +01:00
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
resolve_undo_write(&sb, istate->resolve_undo);
|
2018-10-10 17:59:34 +02:00
|
|
|
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
|
2009-12-25 09:30:51 +01:00
|
|
|
sb.len) < 0
|
|
|
|
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
}
|
2015-03-08 11:12:33 +01:00
|
|
|
if (!strip_extensions && istate->untracked) {
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
write_untracked_extension(&sb, istate->untracked);
|
2018-10-10 17:59:34 +02:00
|
|
|
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
|
2015-03-08 11:12:33 +01:00
|
|
|
sb.len) < 0 ||
|
|
|
|
ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
}
|
2017-09-22 18:35:40 +02:00
|
|
|
if (!strip_extensions && istate->fsmonitor_last_update) {
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
write_fsmonitor_extension(&sb, istate);
|
2018-10-10 17:59:34 +02:00
|
|
|
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
|
|
|
|
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
|
|
|
|
* so that it can be found and processed before all the index entries are
|
|
|
|
* read. Write it out regardless of the strip_extensions parameter as we need it
|
|
|
|
* when loading the shared index.
|
|
|
|
*/
|
2018-11-20 07:11:47 +01:00
|
|
|
if (offset && record_eoie()) {
|
2018-10-10 17:59:34 +02:00
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
|
|
|
|
write_eoie_extension(&sb, &eoie_c, offset);
|
|
|
|
err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
|
2017-09-22 18:35:40 +02:00
|
|
|
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err)
|
|
|
|
return -1;
|
|
|
|
}
|
write_index(): update index_state->timestamp after flushing to disk
Since this timestamp is used to check for racy-clean files, it is
important to keep it uptodate.
For the 'git checkout' command without the '-q' option, this make a
huge difference. Before, each and every file which was updated, was
racy-clean after the call to unpack_trees() and write_index() but
before the GIT process ended.
And because of the call to show_local_changes() in builtin-checkout.c,
we ended up reading those files back into memory, doing a SHA1 to
check if the files was really different from the index. And, of
course, no file was different.
With this fix, 'git checkout' without the '-q' option should now be
almost as fast as with the '-q' option, but not quite, as we still do
some few lstat(2) calls more without the '-q' option.
Below is some average numbers for 10 checkout's to v2.6.27 and 10 to
v2.6.25 of the Linux kernel, to show the difference:
before (git version 1.6.2.rc1.256.g58a87):
7.860 user 2.427 sys 19.465 real 52.8% CPU faults: 0 major 95331 minor
after:
6.184 user 2.160 sys 17.619 real 47.4% CPU faults: 0 major 38994 minor
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-23 19:02:57 +01:00
|
|
|
|
2018-05-02 02:25:44 +02:00
|
|
|
if (ce_flush(&c, newfd, istate->oid.hash))
|
2017-04-26 22:05:23 +02:00
|
|
|
return -1;
|
tempfile: do not delete tempfile on failed close
When close_tempfile() fails, we delete the tempfile and
reset the fields of the tempfile struct. This makes it
easier for callers to return without cleaning up, but it
also makes this common pattern:
if (close_tempfile(tempfile))
return error_errno("error closing %s", tempfile->filename.buf);
wrong, because the "filename" field has been reset after the
failed close. And it's not easy to fix, as in many cases we
don't have another copy of the filename (e.g., if it was
created via one of the mks_tempfile functions, and we just
have the original template string).
Let's drop the feature that a failed close automatically
deletes the file. This puts the burden on the caller to do
the deletion themselves, but this isn't that big a deal.
Callers which do:
if (write(...) || close_tempfile(...)) {
delete_tempfile(...);
return -1;
}
already had to call delete when the write() failed, and so
aren't affected. Likewise, any caller which just calls die()
in the error path is OK; we'll delete the tempfile during
the atexit handler.
Because this patch changes the semantics of close_tempfile()
without changing its signature, all callers need to be
manually checked and converted to the new scheme. This patch
covers all in-tree callers, but there may be others for
not-yet-merged topics. To catch these, we rename the
function to close_tempfile_gently(), which will attract
compile-time attention to new callers. (Technically the
original could be considered "gentle" already in that it
didn't die() on errors, but this one is even more so).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-05 14:14:30 +02:00
|
|
|
if (close_tempfile_gently(tempfile)) {
|
|
|
|
error(_("could not close '%s'"), tempfile->filename.buf);
|
|
|
|
return -1;
|
|
|
|
}
|
2017-04-26 22:05:23 +02:00
|
|
|
if (stat(tempfile->filename.buf, &st))
|
write_index(): update index_state->timestamp after flushing to disk
Since this timestamp is used to check for racy-clean files, it is
important to keep it uptodate.
For the 'git checkout' command without the '-q' option, this makes a
huge difference. Before, each and every file which was updated was
racy-clean after the call to unpack_trees() and write_index() but
before the GIT process ended.
And because of the call to show_local_changes() in builtin-checkout.c,
we ended up reading those files back into memory, doing a SHA1 to
check if the files were really different from the index. And, of
course, no file was different.
With this fix, 'git checkout' without the '-q' option should now be
almost as fast as with the '-q' option, but not quite, as we still do
a few more lstat(2) calls without the '-q' option.
Below are some average numbers for 10 checkouts to v2.6.27 and 10 to
v2.6.25 of the Linux kernel, to show the difference:
before (git version 1.6.2.rc1.256.g58a87):
7.860 user 2.427 sys 19.465 real 52.8% CPU faults: 0 major 95331 minor
after:
6.184 user 2.160 sys 17.619 real 47.4% CPU faults: 0 major 38994 minor
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-23 19:02:57 +01:00
|
|
|
return -1;
|
2009-03-15 12:38:55 +01:00
|
|
|
istate->timestamp.sec = (unsigned int)st.st_mtime;
|
|
|
|
istate->timestamp.nsec = ST_MTIME_NSEC(st);
|
2018-01-27 13:27:56 +01:00
|
|
|
trace_performance_since(start, "write index, changed mask = %x", istate->cache_changed);
|
2019-02-22 23:25:07 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* TODO trace2: replace "the_repository" with the actual repo instance
|
|
|
|
* that is associated with the given "istate".
|
|
|
|
*/
|
|
|
|
trace2_data_intmax("index", the_repository, "write/version",
|
|
|
|
istate->version);
|
|
|
|
trace2_data_intmax("index", the_repository, "write/cache_nr",
|
|
|
|
istate->cache_nr);
|
|
|
|
|
write_index(): update index_state->timestamp after flushing to disk
Since this timestamp is used to check for racy-clean files, it is
important to keep it uptodate.
For the 'git checkout' command without the '-q' option, this makes a
huge difference. Before, each and every file which was updated was
racy-clean after the call to unpack_trees() and write_index() but
before the GIT process ended.
And because of the call to show_local_changes() in builtin-checkout.c,
we ended up reading those files back into memory, doing a SHA1 to
check if the files were really different from the index. And, of
course, no file was different.
With this fix, 'git checkout' without the '-q' option should now be
almost as fast as with the '-q' option, but not quite, as we still do
a few more lstat(2) calls without the '-q' option.
Below are some average numbers for 10 checkouts to v2.6.27 and 10 to
v2.6.25 of the Linux kernel, to show the difference:
before (git version 1.6.2.rc1.256.g58a87):
7.860 user 2.427 sys 19.465 real 52.8% CPU faults: 0 major 95331 minor
after:
6.184 user 2.160 sys 17.619 real 47.4% CPU faults: 0 major 38994 minor
Signed-off-by: Kjetil Barvik <barvik@broadpark.no>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-02-23 19:02:57 +01:00
|
|
|
return 0;
|
2005-04-09 21:09:27 +02:00
|
|
|
}
|
2008-06-27 18:21:58 +02:00
|
|
|
|
2014-06-13 14:19:24 +02:00
|
|
|
void set_alternate_index_output(const char *name)
|
|
|
|
{
|
|
|
|
alternate_index_output = name;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int commit_locked_index(struct lock_file *lk)
|
|
|
|
{
|
2014-10-01 12:28:36 +02:00
|
|
|
if (alternate_index_output)
|
|
|
|
return commit_lock_file_to(lk, alternate_index_output);
|
|
|
|
else
|
2014-06-13 14:19:24 +02:00
|
|
|
return commit_lock_file(lk);
|
|
|
|
}
|
|
|
|
|
2014-06-13 14:19:23 +02:00
|
|
|
static int do_write_locked_index(struct index_state *istate, struct lock_file *lock,
|
|
|
|
unsigned flags)
|
|
|
|
{
|
2019-02-22 23:25:07 +01:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TODO trace2: replace "the_repository" with the actual repo instance
|
|
|
|
* that is associated with the given "istate".
|
|
|
|
*/
|
|
|
|
trace2_region_enter_printf("index", "do_write_index", the_repository,
|
|
|
|
"%s", lock->tempfile->filename.buf);
|
|
|
|
ret = do_write_index(istate, lock->tempfile, 0);
|
|
|
|
trace2_region_leave_printf("index", "do_write_index", the_repository,
|
|
|
|
"%s", lock->tempfile->filename.buf);
|
|
|
|
|
2014-06-13 14:19:23 +02:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
if (flags & COMMIT_LOCK)
|
2019-02-15 18:59:21 +01:00
|
|
|
ret = commit_locked_index(lock);
|
|
|
|
else
|
|
|
|
ret = close_lock_file_gently(lock);
|
|
|
|
|
|
|
|
run_hook_le(NULL, "post-index-change",
|
|
|
|
istate->updated_workdir ? "1" : "0",
|
|
|
|
istate->updated_skipworktree ? "1" : "0", NULL);
|
|
|
|
istate->updated_workdir = 0;
|
|
|
|
istate->updated_skipworktree = 0;
|
|
|
|
|
|
|
|
return ret;
|
2014-06-13 14:19:23 +02:00
|
|
|
}
|
|
|
|
|
2014-06-13 14:19:36 +02:00
|
|
|
/*
 * Write ISTATE as a split index: do_write_locked_index() is bracketed by
 * prepare_to_write_split_index() and finish_writing_split_index().  The
 * finish step runs even when the write failed; the write's result is what
 * gets returned.
 */
static int write_split_index(struct index_state *istate,
			     struct lock_file *lock,
			     unsigned flags)
{
	int result;

	prepare_to_write_split_index(istate);
	result = do_write_locked_index(istate, lock, flags);
	finish_writing_split_index(istate);

	return result;
}
|
|
|
|
|
2017-03-06 10:41:58 +01:00
|
|
|
/* Expiry specification used unless "splitindex.sharedindexexpire" overrides it. */
static const char *shared_index_expire = "2.weeks.ago";

/*
 * Return the date before which shared index files count as expired.
 *
 * The value is computed on the first call only and cached in
 * function-local statics: the expiry string is offered to
 * git_config_get_expiry() for "splitindex.sharedindexexpire" (its return
 * value is ignored) and then converted with approxidate().
 */
static unsigned long get_shared_index_expire_date(void)
{
	static unsigned long cached_date;
	static int cached;

	if (cached)
		return cached_date;

	git_config_get_expiry("splitindex.sharedindexexpire",
			      &shared_index_expire);
	cached_date = approxidate(shared_index_expire);
	cached = 1;

	return cached_date;
}
|
|
|
|
|
|
|
|
/*
 * Decide whether the shared index file at SHARED_INDEX_PATH has expired.
 *
 * Returns 1 when the file's mtime is not later than the expiry date, and
 * 0 when it is later or when the expiry date is 0.  If stat() fails, the
 * value of error_errno() is returned; the caller in this file only acts
 * on results greater than zero.
 */
static int should_delete_shared_index(const char *shared_index_path)
{
	struct stat sb;
	unsigned long cutoff = get_shared_index_expire_date();

	/* Check timestamp */
	if (!cutoff)
		return 0;
	if (stat(shared_index_path, &sb))
		return error_errno(_("could not stat '%s'"), shared_index_path);

	return sb.st_mtime <= cutoff;
}
|
|
|
|
|
|
|
|
/*
 * Scan the git directory for "sharedindex.<hex>" files and unlink those
 * that should_delete_shared_index() reports as expired.  The file whose
 * suffix equals CURRENT_HEX is always left alone.
 *
 * A failed unlink only produces a warning.  Returns 0 after the scan, or
 * the value of error_errno() when the git directory cannot be opened.
 */
static int clean_shared_index_files(const char *current_hex)
{
	struct dirent *entry;
	DIR *gitdir = opendir(get_git_dir());

	if (!gitdir)
		return error_errno(_("unable to open git dir: %s"), get_git_dir());

	for (entry = readdir(gitdir); entry != NULL; entry = readdir(gitdir)) {
		const char *hex;
		const char *path;

		/* Skip unrelated entries and the shared index currently in use. */
		if (!skip_prefix(entry->d_name, "sharedindex.", &hex) ||
		    !strcmp(hex, current_hex))
			continue;

		path = git_path("%s", entry->d_name);
		if (should_delete_shared_index(path) <= 0)
			continue;
		if (unlink(path))
			warning_errno(_("unable to unlink: %s"), path);
	}
	closedir(gitdir);

	return 0;
}
|
|
|
|
|
2014-06-13 14:19:45 +02:00
|
|
|
static int write_shared_index(struct index_state *istate,
|
2018-01-14 11:18:19 +01:00
|
|
|
struct tempfile **temp)
|
2014-06-13 14:19:44 +02:00
|
|
|
{
|
|
|
|
struct split_index *si = istate->split_index;
|
tempfile: auto-allocate tempfiles on heap
The previous commit taught the tempfile code to give up
ownership over tempfiles that have been renamed or deleted.
That makes it possible to use a stack variable like this:
struct tempfile t;
create_tempfile(&t, ...);
...
if (!err)
rename_tempfile(&t, ...);
else
delete_tempfile(&t);
But doing it this way has a high potential for creating
memory errors. The tempfile we pass to create_tempfile()
ends up on a global linked list, and it's not safe for it to
go out of scope until we've called one of those two
deactivation functions.
Imagine that we add an early return from the function that
forgets to call delete_tempfile(). With a static or heap
tempfile variable, the worst case is that the tempfile hangs
around until the program exits (and some functions like
setup_shallow_temporary rely on this intentionally, creating
a tempfile and then leaving it for later cleanup).
But with a stack variable as above, this is a serious memory
error: the variable goes out of scope and may be filled with
garbage by the time the tempfile code looks at it. Let's
see if we can make it harder to get this wrong.
Since many callers need to allocate arbitrary numbers of
tempfiles, we can't rely on static storage as a general
solution. So we need to turn to the heap. We could just ask
all callers to pass us a heap variable, but that puts the
burden on them to call free() at the right time.
Instead, let's have the tempfile code handle the heap
allocation _and_ the deallocation (when the tempfile is
deactivated and removed from the list).
This changes the return value of all of the creation
functions. For the cleanup functions (delete and rename),
we'll add one extra bit of safety: instead of taking a
tempfile pointer, we'll take a pointer-to-pointer and set it
to NULL after freeing the object. This makes it safe to
double-call functions like delete_tempfile(), as the second
call treats the NULL input as a noop. Several callsites
follow this pattern.
The resulting patch does have a fair bit of noise, as each
caller needs to be converted to handle:
1. Storing a pointer instead of the struct itself.
2. Passing the pointer instead of taking the struct
address.
3. Handling a "struct tempfile *" return instead of a file
descriptor.
We could play games to make this less noisy. For example, by
defining the tempfile like this:
struct tempfile {
struct heap_allocated_part_of_tempfile {
int fd;
...etc
} *actual_data;
}
Callers would continue to have a "struct tempfile", and it
would be "active" only when the inner pointer was non-NULL.
But that just makes things more awkward in the long run.
There aren't that many callers, so we can simply bite
the bullet and adjust all of them. And the compiler makes it
easy for us to find them all.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-05 14:15:08 +02:00
|
|
|
int ret;
|
2014-06-13 14:19:44 +02:00
|
|
|
|
|
|
|
move_cache_to_base_index(istate);
|
2019-02-22 23:25:07 +01:00
|
|
|
|
|
|
|
trace2_region_enter_printf("index", "shared/do_write_index",
|
|
|
|
the_repository, "%s", (*temp)->filename.buf);
|
2018-01-14 11:18:18 +01:00
|
|
|
ret = do_write_index(si->base, *temp, 1);
|
2019-05-10 15:37:38 +02:00
|
|
|
trace2_region_leave_printf("index", "shared/do_write_index",
|
2019-02-22 23:25:07 +01:00
|
|
|
the_repository, "%s", (*temp)->filename.buf);
|
|
|
|
|
2018-01-14 11:18:19 +01:00
|
|
|
if (ret)
|
2014-06-13 14:19:44 +02:00
|
|
|
return ret;
|
2018-01-14 11:18:18 +01:00
|
|
|
ret = adjust_shared_perm(get_tempfile_path(*temp));
|
2017-06-25 06:34:27 +02:00
|
|
|
if (ret) {
|
2018-11-10 06:16:05 +01:00
|
|
|
error(_("cannot fix permission bits on '%s'"), get_tempfile_path(*temp));
|
2017-06-25 06:34:27 +02:00
|
|
|
return ret;
|
|
|
|
}
|
2018-01-14 11:18:18 +01:00
|
|
|
ret = rename_tempfile(temp,
|
2018-05-02 02:25:44 +02:00
|
|
|
git_path("sharedindex.%s", oid_to_hex(&si->base->oid)));
|
2017-03-06 10:41:58 +01:00
|
|
|
if (!ret) {
|
2018-05-02 02:25:44 +02:00
|
|
|
oidcpy(&si->base_oid, &si->base->oid);
|
|
|
|
clean_shared_index_files(oid_to_hex(&si->base->oid));
|
2017-03-06 10:41:58 +01:00
|
|
|
}
|
|
|
|
|
2014-06-13 14:19:44 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-02-27 19:00:08 +01:00
|
|
|
static const int default_max_percent_split_change = 20;
|
|
|
|
|
|
|
|
static int too_many_not_shared_entries(struct index_state *istate)
|
|
|
|
{
|
|
|
|
int i, not_shared = 0;
|
|
|
|
int max_split = git_config_get_max_percent_split_change();
|
|
|
|
|
|
|
|
switch (max_split) {
|
|
|
|
case -1:
|
|
|
|
/* not or badly configured: use the default value */
|
|
|
|
max_split = default_max_percent_split_change;
|
|
|
|
break;
|
|
|
|
case 0:
|
|
|
|
return 1; /* 0% means always write a new shared index */
|
|
|
|
case 100:
|
|
|
|
return 0; /* 100% means never write a new shared index */
|
|
|
|
default:
|
|
|
|
break; /* just use the configured value */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Count not shared entries */
|
|
|
|
for (i = 0; i < istate->cache_nr; i++) {
|
|
|
|
struct cache_entry *ce = istate->cache[i];
|
|
|
|
if (!ce->index)
|
|
|
|
not_shared++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (int64_t)istate->cache_nr * max_split < (int64_t)not_shared * 100;
|
|
|
|
}
|
|
|
|
|
2014-06-13 14:19:23 +02:00
|
|
|
int write_locked_index(struct index_state *istate, struct lock_file *lock,
|
|
|
|
unsigned flags)
|
|
|
|
{
|
2017-02-27 19:00:12 +01:00
|
|
|
int new_shared_index, ret;
|
2014-06-13 14:19:36 +02:00
|
|
|
struct split_index *si = istate->split_index;
|
|
|
|
|
2018-08-18 16:41:28 +02:00
|
|
|
if (git_env_bool("GIT_TEST_CHECK_CACHE_TREE", 0))
|
2018-11-10 06:49:02 +01:00
|
|
|
cache_tree_verify(the_repository, istate);
|
2018-08-18 16:41:28 +02:00
|
|
|
|
2018-03-01 21:40:20 +01:00
|
|
|
if ((flags & SKIP_IF_UNCHANGED) && !istate->cache_changed) {
|
|
|
|
if (flags & COMMIT_LOCK)
|
|
|
|
rollback_lock_file(lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-11-09 20:58:10 +01:00
|
|
|
if (istate->fsmonitor_last_update)
|
|
|
|
fill_fsmonitor_bitmap(istate);
|
|
|
|
|
2014-06-13 14:19:47 +02:00
|
|
|
if (!si || alternate_index_output ||
|
|
|
|
(istate->cache_changed & ~EXTMASK)) {
|
2014-06-13 14:19:36 +02:00
|
|
|
if (si)
|
2018-05-02 02:25:43 +02:00
|
|
|
oidclr(&si->base_oid);
|
read-cache: leave lock in right state in `write_locked_index()`
If the original version of `write_locked_index()` returned with an
error, it didn't roll back the lockfile unless the error occured at the
very end, during closing/committing. See commit 03b866477 (read-cache:
new API write_locked_index instead of write_index/write_cache,
2014-06-13).
In commit 9f41c7a6b (read-cache: close index.lock in do_write_index,
2017-04-26), we learned to close the lock slightly earlier in the
callstack. That was mostly a side-effect of lockfiles being implemented
using temporary files, but didn't cause any real harm.
Recently, commit 076aa2cbd (tempfile: auto-allocate tempfiles on heap,
2017-09-05) introduced a subtle bug. If the temporary file is deleted
(i.e., the lockfile is rolled back), the tempfile-pointer in the `struct
lock_file` will be left dangling. Thus, an attempt to reuse the
lockfile, or even just to roll it back, will induce undefined behavior
-- most likely a crash.
Besides not crashing, we clearly want to make things consistent. The
guarantees which the lockfile-machinery itself provides is A) if we ask
to commit and it fails, roll back, and B) if we ask to close and it
fails, do _not_ roll back. Let's do the same for consistency.
Do not delete the temporary file in `do_write_index()`. One of its
callers, `write_locked_index()` will thereby avoid rolling back the
lock. The other caller, `write_shared_index()`, will delete its
temporary file anyway. Both of these callers will avoid undefined
behavior (crashing).
Teach `write_locked_index(..., COMMIT_LOCK)` to roll back the lock
before returning. If we have already succeeded and committed, it will be
a noop. Simplify the existing callers where we now have a superfluous
call to `rollback_lockfile()`. That should keep future readers from
wondering why the callers are inconsistent.
Signed-off-by: Martin Ågren <martin.agren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-10-06 22:12:13 +02:00
|
|
|
ret = do_write_locked_index(istate, lock, flags);
|
|
|
|
goto out;
|
2014-06-13 14:19:36 +02:00
|
|
|
}
|
|
|
|
|
2018-04-14 17:34:59 +02:00
|
|
|
if (git_env_bool("GIT_TEST_SPLIT_INDEX", 0)) {
|
2018-05-02 02:25:43 +02:00
|
|
|
int v = si->base_oid.hash[0];
|
2014-06-13 14:19:49 +02:00
|
|
|
if ((v & 15) < 6)
|
|
|
|
istate->cache_changed |= SPLIT_INDEX_ORDERED;
|
|
|
|
}
|
2017-02-27 19:00:08 +01:00
|
|
|
if (too_many_not_shared_entries(istate))
|
|
|
|
istate->cache_changed |= SPLIT_INDEX_ORDERED;
|
2017-02-27 19:00:12 +01:00
|
|
|
|
|
|
|
new_shared_index = istate->cache_changed & SPLIT_INDEX_ORDERED;
|
|
|
|
|
|
|
|
if (new_shared_index) {
|
2018-01-14 11:18:19 +01:00
|
|
|
struct tempfile *temp;
|
|
|
|
int saved_errno;
|
|
|
|
|
2018-11-18 20:04:29 +01:00
|
|
|
/* Same initial permissions as the main .git/index file */
|
|
|
|
temp = mks_tempfile_sm(git_path("sharedindex_XXXXXX"), 0, 0666);
|
2018-01-14 11:18:19 +01:00
|
|
|
if (!temp) {
|
2018-05-02 02:25:43 +02:00
|
|
|
oidclr(&si->base_oid);
|
2018-01-14 11:18:19 +01:00
|
|
|
ret = do_write_locked_index(istate, lock, flags);
|
2018-01-24 10:38:29 +01:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = write_shared_index(istate, &temp);
|
2018-01-14 11:18:19 +01:00
|
|
|
|
|
|
|
saved_errno = errno;
|
|
|
|
if (is_tempfile_active(temp))
|
|
|
|
delete_tempfile(&temp);
|
|
|
|
errno = saved_errno;
|
|
|
|
|
2014-06-13 14:19:44 +02:00
|
|
|
if (ret)
|
read-cache: leave lock in right state in `write_locked_index()`
If the original version of `write_locked_index()` returned with an
error, it didn't roll back the lockfile unless the error occured at the
very end, during closing/committing. See commit 03b866477 (read-cache:
new API write_locked_index instead of write_index/write_cache,
2014-06-13).
In commit 9f41c7a6b (read-cache: close index.lock in do_write_index,
2017-04-26), we learned to close the lock slightly earlier in the
callstack. That was mostly a side-effect of lockfiles being implemented
using temporary files, but didn't cause any real harm.
Recently, commit 076aa2cbd (tempfile: auto-allocate tempfiles on heap,
2017-09-05) introduced a subtle bug. If the temporary file is deleted
(i.e., the lockfile is rolled back), the tempfile-pointer in the `struct
lock_file` will be left dangling. Thus, an attempt to reuse the
lockfile, or even just to roll it back, will induce undefined behavior
-- most likely a crash.
Besides not crashing, we clearly want to make things consistent. The
guarantees which the lockfile-machinery itself provides is A) if we ask
to commit and it fails, roll back, and B) if we ask to close and it
fails, do _not_ roll back. Let's do the same for consistency.
Do not delete the temporary file in `do_write_index()`. One of its
callers, `write_locked_index()` will thereby avoid rolling back the
lock. The other caller, `write_shared_index()`, will delete its
temporary file anyway. Both of these callers will avoid undefined
behavior (crashing).
Teach `write_locked_index(..., COMMIT_LOCK)` to roll back the lock
before returning. If we have already succeeded and committed, it will be
a noop. Simplify the existing callers where we now have a superfluous
call to `rollback_lockfile()`. That should keep future readers from
wondering why the callers are inconsistent.
Signed-off-by: Martin Ågren <martin.agren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-10-06 22:12:13 +02:00
|
|
|
goto out;
|
2014-06-13 14:19:44 +02:00
|
|
|
}
|
|
|
|
|
2017-02-27 19:00:12 +01:00
|
|
|
ret = write_split_index(istate, lock, flags);
|
|
|
|
|
|
|
|
/* Freshen the shared index only if the split-index was written */
|
2019-02-13 10:51:29 +01:00
|
|
|
if (!ret && !new_shared_index && !is_null_oid(&si->base_oid)) {
|
read-cache: fix reading the shared index for other repos
read_index_from() takes a path argument for the location of the index
file. For reading the shared index in split index mode however it just
ignores that path argument, and reads it from the gitdir of the current
repository.
This works as long as an index in the_repository is read. Once that
changes, such as when we read the index of a submodule, or of a
different working tree than the current one, the gitdir of
the_repository will no longer contain the appropriate shared index,
and git will fail to read it.
For example t3007-ls-files-recurse-submodules.sh was broken with
GIT_TEST_SPLIT_INDEX set in 188dce131f ("ls-files: use repository
object", 2017-06-22), and t7814-grep-recurse-submodules.sh was also
broken in a similar manner, probably by introducing struct repository
there, although I didn't track down the exact commit for that.
be489d02d2 ("revision.c: --indexed-objects add objects from all
worktrees", 2017-08-23) breaks with split index mode in a similar
manner, not erroring out when it can't read the index, but instead
carrying on with pruning, without taking the index of the worktree into
account.
Fix this by passing an additional gitdir parameter to read_index_from,
to indicate where it should look for and read the shared index from.
read_cache_from() defaults to using the gitdir of the_repository. As it
is mostly a convenience macro, having to pass get_git_dir() for every
call seems overkill, and if necessary users can have more control by
using read_index_from().
Helped-by: Brandon Williams <bmwill@google.com>
Signed-off-by: Thomas Gummerer <t.gummerer@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-01-07 23:30:13 +01:00
|
|
|
const char *shared_index = git_path("sharedindex.%s",
|
2018-05-02 02:25:43 +02:00
|
|
|
oid_to_hex(&si->base_oid));
|
read-cache: fix reading the shared index for other repos
read_index_from() takes a path argument for the location of the index
file. For reading the shared index in split index mode however it just
ignores that path argument, and reads it from the gitdir of the current
repository.
This works as long as an index in the_repository is read. Once that
changes, such as when we read the index of a submodule, or of a
different working tree than the current one, the gitdir of
the_repository will no longer contain the appropriate shared index,
and git will fail to read it.
For example t3007-ls-files-recurse-submodules.sh was broken with
GIT_TEST_SPLIT_INDEX set in 188dce131f ("ls-files: use repository
object", 2017-06-22), and t7814-grep-recurse-submodules.sh was also
broken in a similar manner, probably by introducing struct repository
there, although I didn't track down the exact commit for that.
be489d02d2 ("revision.c: --indexed-objects add objects from all
worktrees", 2017-08-23) breaks with split index mode in a similar
manner, not erroring out when it can't read the index, but instead
carrying on with pruning, without taking the index of the worktree into
account.
Fix this by passing an additional gitdir parameter to read_index_from,
to indicate where it should look for and read the shared index from.
read_cache_from() defaults to using the gitdir of the_repository. As it
is mostly a convenience macro, having to pass get_git_dir() for every
call seems overkill, and if necessary users can have more control by
using read_index_from().
Helped-by: Brandon Williams <bmwill@google.com>
Signed-off-by: Thomas Gummerer <t.gummerer@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-01-07 23:30:13 +01:00
|
|
|
freshen_shared_index(shared_index, 1);
|
|
|
|
}
|
2017-02-27 19:00:12 +01:00
|
|
|
|
read-cache: leave lock in right state in `write_locked_index()`
If the original version of `write_locked_index()` returned with an
error, it didn't roll back the lockfile unless the error occured at the
very end, during closing/committing. See commit 03b866477 (read-cache:
new API write_locked_index instead of write_index/write_cache,
2014-06-13).
In commit 9f41c7a6b (read-cache: close index.lock in do_write_index,
2017-04-26), we learned to close the lock slightly earlier in the
callstack. That was mostly a side-effect of lockfiles being implemented
using temporary files, but didn't cause any real harm.
Recently, commit 076aa2cbd (tempfile: auto-allocate tempfiles on heap,
2017-09-05) introduced a subtle bug. If the temporary file is deleted
(i.e., the lockfile is rolled back), the tempfile-pointer in the `struct
lock_file` will be left dangling. Thus, an attempt to reuse the
lockfile, or even just to roll it back, will induce undefined behavior
-- most likely a crash.
Besides not crashing, we clearly want to make things consistent. The
guarantees which the lockfile-machinery itself provides is A) if we ask
to commit and it fails, roll back, and B) if we ask to close and it
fails, do _not_ roll back. Let's do the same for consistency.
Do not delete the temporary file in `do_write_index()`. One of its
callers, `write_locked_index()` will thereby avoid rolling back the
lock. The other caller, `write_shared_index()`, will delete its
temporary file anyway. Both of these callers will avoid undefined
behavior (crashing).
Teach `write_locked_index(..., COMMIT_LOCK)` to roll back the lock
before returning. If we have already succeeded and committed, it will be
a noop. Simplify the existing callers where we now have a superfluous
call to `rollback_lockfile()`. That should keep future readers from
wondering why the callers are inconsistent.
Signed-off-by: Martin Ågren <martin.agren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-10-06 22:12:13 +02:00
|
|
|
out:
|
|
|
|
if (flags & COMMIT_LOCK)
|
|
|
|
rollback_lock_file(lock);
|
2017-02-27 19:00:12 +01:00
|
|
|
return ret;
|
2014-06-13 14:19:23 +02:00
|
|
|
}
|
|
|
|
|
2008-06-27 18:21:58 +02:00
|
|
|
/*
|
|
|
|
* Read the index file that is potentially unmerged into given
|
read-cache: fix directory/file conflict handling in read_index_unmerged()
read_index_unmerged() has two intended purposes:
* return 1 if there are any unmerged entries, 0 otherwise
* drops any higher-stage entries down to stage #0
There are several callers of read_index_unmerged() that check the return
value to see if it is non-zero, all of which then die() if that condition
is met. For these callers, dropping higher-stage entries down to stage #0
is a waste of resources, and returning immediately on first unmerged entry
would be better. But it's probably only a very minor difference and isn't
the focus of this series.
The remaining callers ignore the return value and call this function for
the side effect of dropping higher-stage entries down to stage #0. As
mentioned in commit e11d7b596970 ("'reset --merge': fix unmerged case",
2009-12-31),
The _only_ reason we want to keep a previously unmerged entry in the
index at stage #0 is so that we don't forget the fact that we have
corresponding file in the work tree in order to be able to remove it
when the tree we are resetting to does not have the path.
In fact, prior to commit d1a43f2aa4bf ("reset --hard/read-tree --reset -u:
remove unmerged new paths", 2008-10-15), read_index_unmerged() did just
remove unmerged entries from the cache immediately but that had the
unwanted effect of leaving around new untracked files in the tree from
aborted merges.
So, that's the intended purpose of this function. The problem is that
when directory/files conflicts are present, trying to add the file to the
index at stage 0 fails (because there is still a directory in the way),
and the function returns early with a -1 return code to signify the error.
As noted above, none of the callers who want the drop-to-stage-0 behavior
check the return status, though, so this means all remaining unmerged
entries remain in the index and the callers proceed assuming otherwise.
Users then see errors of the form:
error: 'DIR-OR-FILE' appears as both a file and as a directory
error: DIR-OR-FILE: cannot drop to stage #0
and potentially also messages about other unmerged entries which came
lexicographically later than whatever pathname was both a file and a
directory. Google finds a few hits searching for those messages,
suggesting there were probably a couple people who hit this besides me.
Luckily, calling `git reset --hard` multiple times would work around
this bug.
Since the whole purpose here is to just put the entry *temporarily* into
the index so that any associated file in the working copy can be removed,
we can just skip the DFCHECK and allow both the file and directory to
appear in the index. The temporary simultaneous appearance of the
directory and file entries in the index will be removed by the callers
by calling unpack_trees(), which excludes these unmerged entries marked
with CE_CONFLICTED flag from the resulting index, before they attempt to
write the index anywhere.
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-31 19:12:05 +02:00
|
|
|
* index_state, dropping any unmerged entries to stage #0 (potentially
|
|
|
|
* resulting in a path appearing as both a file and a directory in the
|
|
|
|
* index; the caller is responsible to clear out the extra entries
|
|
|
|
* before writing the index to a tree). Returns true if the index is
|
|
|
|
* unmerged. Callers who want to refuse to work from an unmerged
|
|
|
|
* state can call this and check its return value, instead of calling
|
|
|
|
* read_cache().
|
2008-06-27 18:21:58 +02:00
|
|
|
*/
|
2019-01-12 03:13:26 +01:00
|
|
|
int repo_read_index_unmerged(struct repository *repo)
|
2008-06-27 18:21:58 +02:00
|
|
|
{
|
2019-01-12 03:13:26 +01:00
|
|
|
struct index_state *istate;
|
2008-06-27 18:21:58 +02:00
|
|
|
int i;
|
2008-10-16 01:00:06 +02:00
|
|
|
int unmerged = 0;
|
2008-06-27 18:21:58 +02:00
|
|
|
|
2019-01-12 03:13:26 +01:00
|
|
|
repo_read_index(repo);
|
|
|
|
istate = repo->index;
|
2008-06-27 18:21:58 +02:00
|
|
|
for (i = 0; i < istate->cache_nr; i++) {
|
|
|
|
struct cache_entry *ce = istate->cache[i];
|
2008-10-16 01:00:06 +02:00
|
|
|
struct cache_entry *new_ce;
|
2018-07-02 21:49:31 +02:00
|
|
|
int len;
|
2008-10-16 01:00:06 +02:00
|
|
|
|
|
|
|
if (!ce_stage(ce))
|
2008-06-27 18:21:58 +02:00
|
|
|
continue;
|
2008-10-16 01:00:06 +02:00
|
|
|
unmerged = 1;
|
2012-07-06 18:07:30 +02:00
|
|
|
len = ce_namelen(ce);
|
2018-07-02 21:49:31 +02:00
|
|
|
new_ce = make_empty_cache_entry(istate, len);
|
2008-10-16 01:00:06 +02:00
|
|
|
memcpy(new_ce->name, ce->name, len);
|
2012-07-11 11:22:37 +02:00
|
|
|
new_ce->ce_flags = create_ce_flags(0) | CE_CONFLICTED;
|
|
|
|
new_ce->ce_namelen = len;
|
2008-10-16 01:00:06 +02:00
|
|
|
new_ce->ce_mode = ce->ce_mode;
|
read-cache: fix directory/file conflict handling in read_index_unmerged()
read_index_unmerged() has two intended purposes:
* return 1 if there are any unmerged entries, 0 otherwise
* drops any higher-stage entries down to stage #0
There are several callers of read_index_unmerged() that check the return
value to see if it is non-zero, all of which then die() if that condition
is met. For these callers, dropping higher-stage entries down to stage #0
is a waste of resources, and returning immediately on first unmerged entry
would be better. But it's probably only a very minor difference and isn't
the focus of this series.
The remaining callers ignore the return value and call this function for
the side effect of dropping higher-stage entries down to stage #0. As
mentioned in commit e11d7b596970 ("'reset --merge': fix unmerged case",
2009-12-31),
The _only_ reason we want to keep a previously unmerged entry in the
index at stage #0 is so that we don't forget the fact that we have
corresponding file in the work tree in order to be able to remove it
when the tree we are resetting to does not have the path.
In fact, prior to commit d1a43f2aa4bf ("reset --hard/read-tree --reset -u:
remove unmerged new paths", 2008-10-15), read_index_unmerged() did just
remove unmerged entries from the cache immediately but that had the
unwanted effect of leaving around new untracked files in the tree from
aborted merges.
So, that's the intended purpose of this function. The problem is that
when directory/files conflicts are present, trying to add the file to the
index at stage 0 fails (because there is still a directory in the way),
and the function returns early with a -1 return code to signify the error.
As noted above, none of the callers who want the drop-to-stage-0 behavior
check the return status, though, so this means all remaining unmerged
entries remain in the index and the callers proceed assuming otherwise.
Users then see errors of the form:
error: 'DIR-OR-FILE' appears as both a file and as a directory
error: DIR-OR-FILE: cannot drop to stage #0
and potentially also messages about other unmerged entries which came
lexicographically later than whatever pathname was both a file and a
directory. Google finds a few hits searching for those messages,
suggesting there were probably a couple people who hit this besides me.
Luckily, calling `git reset --hard` multiple times would workaround
this bug.
Since the whole purpose here is to just put the entry *temporarily* into
the index so that any associated file in the working copy can be removed,
we can just skip the DFCHECK and allow both the file and directory to
appear in the index. The temporary simultaneous appearance of the
directory and file entries in the index will be removed by the callers
by calling unpack_trees(), which excludes these unmerged entries marked
with CE_CONFLICTED flag from the resulting index, before they attempt to
write the index anywhere.
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-31 19:12:05 +02:00
|
|
|
if (add_index_entry(istate, new_ce, ADD_CACHE_SKIP_DFCHECK))
|
2018-11-10 06:16:05 +01:00
|
|
|
return error(_("%s: cannot drop to stage #0"),
|
2013-11-14 20:24:37 +01:00
|
|
|
new_ce->name);
|
2008-06-27 18:21:58 +02:00
|
|
|
}
|
2008-10-16 01:00:06 +02:00
|
|
|
return unmerged;
|
2008-06-27 18:21:58 +02:00
|
|
|
}
|
2008-07-21 10:24:17 +02:00
|
|
|
|
2008-10-16 17:07:26 +02:00
|
|
|
/*
|
|
|
|
* Returns 1 if the path is an "other" path with respect to
|
|
|
|
* the index; that is, the path is not mentioned in the index at all,
|
|
|
|
* either as a file, a directory with some files in the index,
|
|
|
|
* or as an unmerged entry.
|
|
|
|
*
|
|
|
|
* We helpfully remove a trailing "/" from directories so that
|
|
|
|
* the output of read_directory can be used as-is.
|
|
|
|
*/
|
|
|
|
int index_name_is_other(const struct index_state *istate, const char *name,
|
|
|
|
int namelen)
|
|
|
|
{
|
|
|
|
int pos;
|
|
|
|
if (namelen && name[namelen - 1] == '/')
|
|
|
|
namelen--;
|
|
|
|
pos = index_name_pos(istate, name, namelen);
|
|
|
|
if (0 <= pos)
|
|
|
|
return 0; /* exact match */
|
|
|
|
pos = -pos - 1;
|
|
|
|
if (pos < istate->cache_nr) {
|
|
|
|
struct cache_entry *ce = istate->cache[pos];
|
|
|
|
if (ce_namelen(ce) == namelen &&
|
|
|
|
!memcmp(ce->name, name, namelen))
|
|
|
|
return 0; /* Yup, this one exists unmerged */
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
2013-04-13 15:28:30 +02:00
|
|
|
|
2017-01-10 21:06:10 +01:00
|
|
|
void *read_blob_data_from_index(const struct index_state *istate,
|
|
|
|
const char *path, unsigned long *size)
|
2013-04-13 15:28:30 +02:00
|
|
|
{
|
|
|
|
int pos, len;
|
|
|
|
unsigned long sz;
|
|
|
|
enum object_type type;
|
|
|
|
void *data;
|
|
|
|
|
|
|
|
len = strlen(path);
|
|
|
|
pos = index_name_pos(istate, path, len);
|
|
|
|
if (pos < 0) {
|
|
|
|
/*
|
|
|
|
* We might be in the middle of a merge, in which
|
|
|
|
* case we would read stage #2 (ours).
|
|
|
|
*/
|
|
|
|
int i;
|
|
|
|
for (i = -pos - 1;
|
|
|
|
(pos < 0 && i < istate->cache_nr &&
|
|
|
|
!strcmp(istate->cache[i]->name, path));
|
|
|
|
i++)
|
|
|
|
if (ce_stage(istate->cache[i]) == 2)
|
|
|
|
pos = i;
|
|
|
|
}
|
|
|
|
if (pos < 0)
|
|
|
|
return NULL;
|
sha1_file: convert read_sha1_file to struct object_id
Convert read_sha1_file to take a pointer to struct object_id and rename
it read_object_file. Do the same for read_sha1_file_extended.
Convert one use in grep.c to use the new function without any other code
change, since the pointer being passed is a void pointer that is already
initialized with a pointer to struct object_id. Update the declaration
and definitions of the modified functions, and apply the following
semantic patch to convert the remaining callers:
@@
expression E1, E2, E3;
@@
- read_sha1_file(E1.hash, E2, E3)
+ read_object_file(&E1, E2, E3)
@@
expression E1, E2, E3;
@@
- read_sha1_file(E1->hash, E2, E3)
+ read_object_file(E1, E2, E3)
@@
expression E1, E2, E3, E4;
@@
- read_sha1_file_extended(E1.hash, E2, E3, E4)
+ read_object_file_extended(&E1, E2, E3, E4)
@@
expression E1, E2, E3, E4;
@@
- read_sha1_file_extended(E1->hash, E2, E3, E4)
+ read_object_file_extended(E1, E2, E3, E4)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-03-12 03:27:53 +01:00
|
|
|
data = read_object_file(&istate->cache[pos]->oid, &type, &sz);
|
2013-04-13 15:28:30 +02:00
|
|
|
if (!data || type != OBJ_BLOB) {
|
|
|
|
free(data);
|
|
|
|
return NULL;
|
|
|
|
}
|
2013-04-13 15:28:31 +02:00
|
|
|
if (size)
|
|
|
|
*size = sz;
|
2013-04-13 15:28:30 +02:00
|
|
|
return data;
|
|
|
|
}
|
2013-06-20 10:37:51 +02:00
|
|
|
|
|
|
|
void stat_validity_clear(struct stat_validity *sv)
|
|
|
|
{
|
2017-06-16 01:15:46 +02:00
|
|
|
FREE_AND_NULL(sv->sd);
|
2013-06-20 10:37:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
int stat_validity_check(struct stat_validity *sv, const char *path)
|
|
|
|
{
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (stat(path, &st) < 0)
|
|
|
|
return sv->sd == NULL;
|
|
|
|
if (!sv->sd)
|
|
|
|
return 0;
|
|
|
|
return S_ISREG(st.st_mode) && !match_stat_data(sv->sd, &st);
|
|
|
|
}
|
|
|
|
|
|
|
|
void stat_validity_update(struct stat_validity *sv, int fd)
|
|
|
|
{
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (fstat(fd, &st) < 0 || !S_ISREG(st.st_mode))
|
|
|
|
stat_validity_clear(sv);
|
|
|
|
else {
|
|
|
|
if (!sv->sd)
|
|
|
|
sv->sd = xcalloc(1, sizeof(struct stat_data));
|
|
|
|
fill_stat_data(sv->sd, &st);
|
|
|
|
}
|
|
|
|
}
|
2017-05-08 11:41:42 +02:00
|
|
|
|
|
|
|
void move_index_extensions(struct index_state *dst, struct index_state *src)
|
|
|
|
{
|
|
|
|
dst->untracked = src->untracked;
|
|
|
|
src->untracked = NULL;
|
2018-08-18 16:41:26 +02:00
|
|
|
dst->cache_tree = src->cache_tree;
|
|
|
|
src->cache_tree = NULL;
|
2017-05-08 11:41:42 +02:00
|
|
|
}
|
2018-07-02 21:49:31 +02:00
|
|
|
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this state.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
struct cache_entry *dup_cache_entry(const struct cache_entry *ce,
|
|
|
|
struct index_state *istate)
|
|
|
|
{
|
|
|
|
unsigned int size = ce_size(ce);
|
|
|
|
int mem_pool_allocated;
|
|
|
|
struct cache_entry *new_entry = make_empty_cache_entry(istate, ce_namelen(ce));
|
|
|
|
mem_pool_allocated = new_entry->mem_pool_allocated;
|
|
|
|
|
|
|
|
memcpy(new_entry, ce, size);
|
|
|
|
new_entry->mem_pool_allocated = mem_pool_allocated;
|
|
|
|
return new_entry;
|
|
|
|
}
|
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
void discard_cache_entry(struct cache_entry *ce)
|
|
|
|
{
|
2018-07-02 21:49:39 +02:00
|
|
|
if (ce && should_validate_cache_entries())
|
|
|
|
memset(ce, 0xCD, cache_entry_size(ce->ce_namelen));
|
|
|
|
|
block alloc: allocate cache entries from mem_pool
When reading large indexes from disk, a portion of the time is
dominated in malloc() calls. This can be mitigated by allocating a
large block of memory and manage it ourselves via memory pools.
This change moves the cache entry allocation to be on top of memory
pools.
Design:
The index_state struct will gain a notion of an associated memory_pool
from which cache_entries will be allocated from. When reading in the
index from disk, we have information on the number of entries and
their size, which can guide us in deciding how large our initial
memory allocation should be. When an index is discarded, the
associated memory_pool will be discarded as well - so the lifetime of
a cache_entry is tied to the lifetime of the index_state that it was
allocated for.
In the case of a Split Index, the following rules are followed. 1st,
some terminology is defined:
Terminology:
- 'the_index': represents the logical view of the index
- 'split_index': represents the "base" cache entries. Read from the
split index file.
'the_index' can reference a single split_index, as well as
cache_entries from the split_index. `the_index` will be discarded
before the `split_index` is. This means that when we are allocating
cache_entries in the presence of a split index, we need to allocate
the entries from the `split_index`'s memory pool. This allows us to
follow the pattern that `the_index` can reference cache_entries from
the `split_index`, and that the cache_entries will not be freed while
they are still being referenced.
Managing transient cache_entry structs:
Cache entries are usually allocated for an index, but this is not always
the case. Cache entries are sometimes allocated because this is the
type that the existing checkout_entry function works with. Because of
this, the existing code needs to handle cache entries associated with an
index / memory pool, and those that only exist transiently. Several
strategies were contemplated around how to handle this:
Chosen approach:
An extra field was added to the cache_entry type to track whether the
cache_entry was allocated from a memory pool or not. This is currently
an int field, as there are no more available bits in the existing
ce_flags bit field. If / when more bits are needed, this new field can
be turned into a proper bit field.
Alternatives:
1) Do not include any information about how the cache_entry was
allocated. Calling code would be responsible for tracking whether the
cache_entry needed to be freed or not.
Pro: No extra memory overhead to track this state
Con: Extra complexity in callers to handle this correctly.
The extra complexity and burden to not regress this behavior in the
future was more than we wanted.
2) cache_entry would gain knowledge about which mem_pool allocated it
Pro: Could (potentially) do extra logic to know when a mem_pool no
longer had references to any cache_entry
Con: cache_entry would grow heavier by a pointer, instead of int
We didn't see a tangible benefit to this approach
3) Do not add any extra information to a cache_entry, but when freeing a
cache entry, check if the memory exists in a region managed by existing
mem_pools.
Pro: No extra memory overhead to track state
Con: Extra computation is performed when freeing cache entries
We decided tracking and iterating over known memory pool regions was
less desirable than adding an extra field to track this stae.
Signed-off-by: Jameson Miller <jamill@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-02 21:49:37 +02:00
|
|
|
if (ce && ce->mem_pool_allocated)
|
|
|
|
return;
|
|
|
|
|
2018-07-02 21:49:31 +02:00
|
|
|
free(ce);
|
|
|
|
}
|
2018-07-02 21:49:39 +02:00
|
|
|
|
|
|
|
/*
 * Report whether extra cache entry validation was requested, i.e.
 * whether GIT_TEST_VALIDATE_INDEX_CACHE_ENTRIES is present in the
 * environment.  The environment is consulted on the first call only;
 * later calls return the remembered answer (1 or 0).
 */
int should_validate_cache_entries(void)
{
	static int cached_answer = -1;

	if (cached_answer < 0)
		cached_answer =
			getenv("GIT_TEST_VALIDATE_INDEX_CACHE_ENTRIES") ? 1 : 0;

	return cached_answer;
}
|
2018-10-10 17:59:34 +02:00
|
|
|
|
|
|
|
#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
|
|
|
|
#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
|
|
|
|
|
|
|
|
static size_t read_eoie_extension(const char *mmap, size_t mmap_size)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The end of index entries (EOIE) extension is guaranteed to be last
|
|
|
|
* so that it can be found by scanning backwards from the EOF.
|
|
|
|
*
|
|
|
|
* "EOIE"
|
|
|
|
* <4-byte length>
|
|
|
|
* <4-byte offset>
|
|
|
|
* <20-byte hash>
|
|
|
|
*/
|
|
|
|
const char *index, *eoie;
|
|
|
|
uint32_t extsize;
|
|
|
|
size_t offset, src_offset;
|
|
|
|
unsigned char hash[GIT_MAX_RAWSZ];
|
|
|
|
git_hash_ctx c;
|
|
|
|
|
|
|
|
/* ensure we have an index big enough to contain an EOIE extension */
|
|
|
|
if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* validate the extension signature */
|
|
|
|
index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
|
|
|
|
if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
|
|
|
|
return 0;
|
|
|
|
index += sizeof(uint32_t);
|
|
|
|
|
|
|
|
/* validate the extension size */
|
|
|
|
extsize = get_be32(index);
|
|
|
|
if (extsize != EOIE_SIZE)
|
|
|
|
return 0;
|
|
|
|
index += sizeof(uint32_t);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Validate the offset we're going to look for the first extension
|
|
|
|
* signature is after the index header and before the eoie extension.
|
|
|
|
*/
|
|
|
|
offset = get_be32(index);
|
|
|
|
if (mmap + offset < mmap + sizeof(struct cache_header))
|
|
|
|
return 0;
|
|
|
|
if (mmap + offset >= eoie)
|
|
|
|
return 0;
|
|
|
|
index += sizeof(uint32_t);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hash is computed over extension types and their sizes (but not
|
|
|
|
* their contents). E.g. if we have "TREE" extension that is N-bytes
|
|
|
|
* long, "REUC" extension that is M-bytes long, followed by "EOIE",
|
|
|
|
* then the hash would be:
|
|
|
|
*
|
|
|
|
* SHA-1("TREE" + <binary representation of N> +
|
|
|
|
* "REUC" + <binary representation of M>)
|
|
|
|
*/
|
|
|
|
src_offset = offset;
|
|
|
|
the_hash_algo->init_fn(&c);
|
|
|
|
while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
|
|
|
|
/* After an array of active_nr index entries,
|
|
|
|
* there can be arbitrary number of extended
|
|
|
|
* sections, each of which is prefixed with
|
|
|
|
* extension name (4-byte) and section length
|
|
|
|
* in 4-byte network byte order.
|
|
|
|
*/
|
|
|
|
uint32_t extsize;
|
|
|
|
memcpy(&extsize, mmap + src_offset + 4, 4);
|
|
|
|
extsize = ntohl(extsize);
|
|
|
|
|
|
|
|
/* verify the extension size isn't so large it will wrap around */
|
|
|
|
if (src_offset + 8 + extsize < src_offset)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
the_hash_algo->update_fn(&c, mmap + src_offset, 8);
|
|
|
|
|
|
|
|
src_offset += 8;
|
|
|
|
src_offset += extsize;
|
|
|
|
}
|
|
|
|
the_hash_algo->final_fn(hash, &c);
|
|
|
|
if (!hasheq(hash, (const unsigned char *)index))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Validate that the extension offsets returned us back to the eoie extension. */
|
|
|
|
if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Append the payload of the EOIE extension to "sb": the given offset
 * as a 4-byte big-endian integer, followed by the hash obtained by
 * finalizing "eoie_context".
 */
static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, size_t offset)
{
	unsigned char digest[GIT_MAX_RAWSZ];
	uint32_t be_offset;

	/* offset */
	put_be32(&be_offset, offset);
	strbuf_add(sb, &be_offset, sizeof(be_offset));

	/* hash */
	the_hash_algo->final_fn(digest, eoie_context);
	strbuf_add(sb, digest, the_hash_algo->rawsz);
}
|
2018-10-10 17:59:37 +02:00
|
|
|
|
|
|
|
#define IEOT_VERSION (1)
|
|
|
|
|
|
|
|
static struct index_entry_offset_table *read_ieot_extension(const char *mmap, size_t mmap_size, size_t offset)
|
|
|
|
{
|
2018-12-06 16:42:06 +01:00
|
|
|
const char *index = NULL;
|
|
|
|
uint32_t extsize, ext_version;
|
|
|
|
struct index_entry_offset_table *ieot;
|
|
|
|
int i, nr;
|
|
|
|
|
|
|
|
/* find the IEOT extension */
|
|
|
|
if (!offset)
|
|
|
|
return NULL;
|
|
|
|
while (offset <= mmap_size - the_hash_algo->rawsz - 8) {
|
|
|
|
extsize = get_be32(mmap + offset + 4);
|
|
|
|
if (CACHE_EXT((mmap + offset)) == CACHE_EXT_INDEXENTRYOFFSETTABLE) {
|
|
|
|
index = mmap + offset + 4 + 4;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
offset += 8;
|
|
|
|
offset += extsize;
|
|
|
|
}
|
|
|
|
if (!index)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* validate the version is IEOT_VERSION */
|
|
|
|
ext_version = get_be32(index);
|
|
|
|
if (ext_version != IEOT_VERSION) {
|
|
|
|
error("invalid IEOT version %d", ext_version);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
index += sizeof(uint32_t);
|
|
|
|
|
|
|
|
/* extension size - version bytes / bytes per entry */
|
|
|
|
nr = (extsize - sizeof(uint32_t)) / (sizeof(uint32_t) + sizeof(uint32_t));
|
|
|
|
if (!nr) {
|
|
|
|
error("invalid number of IEOT entries %d", nr);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
ieot = xmalloc(sizeof(struct index_entry_offset_table)
|
|
|
|
+ (nr * sizeof(struct index_entry_offset)));
|
|
|
|
ieot->nr = nr;
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
|
|
ieot->entries[i].offset = get_be32(index);
|
|
|
|
index += sizeof(uint32_t);
|
|
|
|
ieot->entries[i].nr = get_be32(index);
|
|
|
|
index += sizeof(uint32_t);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ieot;
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void write_ieot_extension(struct strbuf *sb, struct index_entry_offset_table *ieot)
|
|
|
|
{
|
2018-12-06 16:42:06 +01:00
|
|
|
uint32_t buffer;
|
|
|
|
int i;
|
2018-10-10 17:59:37 +02:00
|
|
|
|
2018-12-06 16:42:06 +01:00
|
|
|
/* version */
|
|
|
|
put_be32(&buffer, IEOT_VERSION);
|
|
|
|
strbuf_add(sb, &buffer, sizeof(uint32_t));
|
2018-10-10 17:59:37 +02:00
|
|
|
|
2018-12-06 16:42:06 +01:00
|
|
|
/* ieot */
|
|
|
|
for (i = 0; i < ieot->nr; i++) {
|
2018-10-10 17:59:37 +02:00
|
|
|
|
2018-12-06 16:42:06 +01:00
|
|
|
/* offset */
|
|
|
|
put_be32(&buffer, ieot->entries[i].offset);
|
|
|
|
strbuf_add(sb, &buffer, sizeof(uint32_t));
|
2018-10-10 17:59:37 +02:00
|
|
|
|
2018-12-06 16:42:06 +01:00
|
|
|
/* count */
|
|
|
|
put_be32(&buffer, ieot->entries[i].nr);
|
|
|
|
strbuf_add(sb, &buffer, sizeof(uint32_t));
|
|
|
|
}
|
2018-10-10 17:59:37 +02:00
|
|
|
}
|