2006-01-07 10:33:54 +01:00
|
|
|
#include "cache.h"
|
2023-02-24 01:09:27 +01:00
|
|
|
#include "hex.h"
|
2005-04-18 20:39:48 +02:00
|
|
|
#include "object.h"
|
2018-04-12 02:21:06 +02:00
|
|
|
#include "replace-object.h"
|
2018-05-16 01:42:15 +02:00
|
|
|
#include "object-store.h"
|
2005-04-28 16:46:33 +02:00
|
|
|
#include "blob.h"
|
|
|
|
#include "tree.h"
|
|
|
|
#include "commit.h"
|
|
|
|
#include "tag.h"
|
2018-05-15 23:48:42 +02:00
|
|
|
#include "alloc.h"
|
2018-03-23 18:21:00 +01:00
|
|
|
#include "packfile.h"
|
2018-07-12 00:42:41 +02:00
|
|
|
#include "commit-graph.h"
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2006-06-30 06:38:55 +02:00
|
|
|
unsigned int get_max_object_index(void)
|
|
|
|
{
|
2018-05-08 21:37:24 +02:00
|
|
|
return the_repository->parsed_objects->obj_hash_size;
|
2006-06-30 06:38:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
struct object *get_indexed_object(unsigned int idx)
|
|
|
|
{
|
2018-05-08 21:37:24 +02:00
|
|
|
return the_repository->parsed_objects->obj_hash[idx];
|
2006-06-30 06:38:55 +02:00
|
|
|
}
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2007-02-26 20:55:58 +01:00
|
|
|
/*
 * Printable names of the object types, indexed by the numeric type
 * value.  Slot 0 (OBJ_NONE) deliberately has no name.
 */
static const char *object_type_strings[] = {
	NULL,		/* 0: OBJ_NONE */
	"commit",	/* 1: OBJ_COMMIT */
	"tree",		/* 2: OBJ_TREE */
	"blob",		/* 3: OBJ_BLOB */
	"tag",		/* 4: OBJ_TAG */
};
|
|
|
|
|
2018-02-14 19:59:24 +01:00
|
|
|
const char *type_name(unsigned int type)
|
2007-02-26 20:55:58 +01:00
|
|
|
{
|
|
|
|
if (type >= ARRAY_SIZE(object_type_strings))
|
|
|
|
return NULL;
|
|
|
|
return object_type_strings[type];
|
|
|
|
}
|
|
|
|
|
2014-09-10 15:52:44 +02:00
|
|
|
int type_from_string_gently(const char *str, ssize_t len, int gentle)
|
2007-02-26 20:55:58 +01:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2014-09-10 15:52:44 +02:00
|
|
|
if (len < 0)
|
|
|
|
len = strlen(str);
|
|
|
|
|
2007-02-26 20:55:58 +01:00
|
|
|
for (i = 1; i < ARRAY_SIZE(object_type_strings); i++)
|
2015-04-17 16:52:48 +02:00
|
|
|
if (!strncmp(str, object_type_strings[i], len) &&
|
|
|
|
object_type_strings[i][len] == '\0')
|
2007-02-26 20:55:58 +01:00
|
|
|
return i;
|
2014-09-10 15:52:44 +02:00
|
|
|
|
|
|
|
if (gentle)
|
|
|
|
return -1;
|
|
|
|
|
2018-07-21 09:49:33 +02:00
|
|
|
die(_("invalid object type \"%s\""), str);
|
2007-02-26 20:55:58 +01:00
|
|
|
}
|
|
|
|
|
2014-02-28 17:29:17 +01:00
|
|
|
/*
 * Reduce an object ID to a slot number in the range [0, n-1].
 *
 * `n` must be a power of 2, because the reduction is done by masking
 * with n-1.  The result is *not* stable across computer
 * architectures, so it must never be persisted.
 */
static unsigned int hash_obj(const struct object_id *oid, unsigned int n)
{
	unsigned int mask = n - 1;

	return oidhash(oid) & mask;
}
|
|
|
|
|
2014-02-28 17:29:17 +01:00
|
|
|
/*
|
|
|
|
* Insert obj into the hash table hash, which has length size (which
|
|
|
|
* must be a power of 2). On collisions, simply overflow to the next
|
|
|
|
* empty bucket.
|
|
|
|
*/
|
2006-06-30 20:20:33 +02:00
|
|
|
static void insert_obj_hash(struct object *obj, struct object **hash, unsigned int size)
|
|
|
|
{
|
2019-06-20 09:41:17 +02:00
|
|
|
unsigned int j = hash_obj(&obj->oid, size);
|
2006-06-30 20:20:33 +02:00
|
|
|
|
|
|
|
while (hash[j]) {
|
|
|
|
j++;
|
|
|
|
if (j >= size)
|
|
|
|
j = 0;
|
|
|
|
}
|
|
|
|
hash[j] = obj;
|
|
|
|
}
|
|
|
|
|
2014-02-28 17:29:17 +01:00
|
|
|
/*
|
|
|
|
* Look up the record for the given sha1 in the hash map stored in
|
|
|
|
* obj_hash. Return NULL if it was not found.
|
|
|
|
*/
|
2019-06-20 09:41:14 +02:00
|
|
|
struct object *lookup_object(struct repository *r, const struct object_id *oid)
|
2005-04-18 20:39:48 +02:00
|
|
|
{
|
lookup_object: prioritize recently found objects
The lookup_object function is backed by a hash table of all
objects we have seen in the program. We manage collisions
with a linear walk over the colliding entries, checking each
with hashcmp(). The main cost of lookup is in these
hashcmp() calls; finding our item in the first slot is
cheaper than finding it in the second slot, which is cheaper
than the third, and so on.
If we assume that there is some locality to the object
lookups (e.g., if X and Y collide, and we have just looked
up X, the next lookup is more likely to be for X than for
Y), then we can improve our average lookup speed by checking
X before Y.
This patch does so by swapping a found item to the front of
the collision chain. The p0001 perf test reveals that this
does indeed exploit locality in the case of "rev-list --all
--objects":
Test origin this tree
-------------------------------------------------------------------------
0001.1: rev-list --all 0.40(0.38+0.02) 0.40(0.36+0.03) +0.0%
0001.2: rev-list --all --objects 2.24(2.17+0.05) 1.86(1.79+0.05) -17.0%
This is not surprising, as the full object traversal will
hit the same tree entries over and over (e.g., for every
commit that doesn't change "Documentation/", we will have to
look up the same sha1 just to find out that we already
processed it).
The reason why this technique works (and does not violate
any properties of the hash table) is subtle and bears some
explanation. Let's imagine we get a lookup for sha1 `X`, and
it hashes to bucket `i` in our table. That stretch of the
table may look like:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | B | C | X | ...
-----------------------------------
We start our probe at i, see that B does not match, nor does
C, and finally find X. There may be multiple C's in the
middle, but we know that there are no empty slots (or else
we would not find X at all).
We do not know the original index of B; it may be `i`, or it
may be less than i (e.g., if it were `i-1`, it would collide
with A and spill over into the `i` bucket). So it is
acceptable for us to move it to the right of a contiguous
stretch of entries (because we will find it from a linear
walk starting anywhere at `i` or before), but never to the
left (if we moved it to `i-1`, we would miss it when
starting our walk at `i`).
We do know the original index of X; it is `i`, so it is safe
to place it anywhere in the contiguous stretch between `i`
and where we found it (`i+2` in the this case).
This patch does a pure swap; after finding X in the
situation above, we would end with:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | X | C | B | ...
-----------------------------------
We could instead bump X into the `i` slot, and then shift
the whole contiguous chain down by one, resulting in:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | X | B | C | ...
-----------------------------------
That puts our chain in true most-recently-used order.
However, experiments show that it is not any faster (and in
fact, is slightly slower due to the extra manipulation).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-01 22:34:50 +02:00
|
|
|
unsigned int i, first;
|
2006-06-30 20:20:33 +02:00
|
|
|
struct object *obj;
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2018-06-29 03:22:07 +02:00
|
|
|
if (!r->parsed_objects->obj_hash)
|
2006-06-30 20:20:33 +02:00
|
|
|
return NULL;
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2019-06-20 09:41:17 +02:00
|
|
|
first = i = hash_obj(oid, r->parsed_objects->obj_hash_size);
|
2018-06-29 03:22:07 +02:00
|
|
|
while ((obj = r->parsed_objects->obj_hash[i]) != NULL) {
|
2019-06-20 09:41:14 +02:00
|
|
|
if (oideq(oid, &obj->oid))
|
2006-06-30 20:20:33 +02:00
|
|
|
break;
|
2006-02-12 02:57:57 +01:00
|
|
|
i++;
|
2018-06-29 03:22:07 +02:00
|
|
|
if (i == r->parsed_objects->obj_hash_size)
|
2006-02-12 02:57:57 +01:00
|
|
|
i = 0;
|
|
|
|
}
|
lookup_object: prioritize recently found objects
The lookup_object function is backed by a hash table of all
objects we have seen in the program. We manage collisions
with a linear walk over the colliding entries, checking each
with hashcmp(). The main cost of lookup is in these
hashcmp() calls; finding our item in the first slot is
cheaper than finding it in the second slot, which is cheaper
than the third, and so on.
If we assume that there is some locality to the object
lookups (e.g., if X and Y collide, and we have just looked
up X, the next lookup is more likely to be for X than for
Y), then we can improve our average lookup speed by checking
X before Y.
This patch does so by swapping a found item to the front of
the collision chain. The p0001 perf test reveals that this
does indeed exploit locality in the case of "rev-list --all
--objects":
Test origin this tree
-------------------------------------------------------------------------
0001.1: rev-list --all 0.40(0.38+0.02) 0.40(0.36+0.03) +0.0%
0001.2: rev-list --all --objects 2.24(2.17+0.05) 1.86(1.79+0.05) -17.0%
This is not surprising, as the full object traversal will
hit the same tree entries over and over (e.g., for every
commit that doesn't change "Documentation/", we will have to
look up the same sha1 just to find out that we already
processed it).
The reason why this technique works (and does not violate
any properties of the hash table) is subtle and bears some
explanation. Let's imagine we get a lookup for sha1 `X`, and
it hashes to bucket `i` in our table. That stretch of the
table may look like:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | B | C | X | ...
-----------------------------------
We start our probe at i, see that B does not match, nor does
C, and finally find X. There may be multiple C's in the
middle, but we know that there are no empty slots (or else
we would not find X at all).
We do not know the original index of B; it may be `i`, or it
may be less than i (e.g., if it were `i-1`, it would collide
with A and spill over into the `i` bucket). So it is
acceptable for us to move it to the right of a contiguous
stretch of entries (because we will find it from a linear
walk starting anywhere at `i` or before), but never to the
left (if we moved it to `i-1`, we would miss it when
starting our walk at `i`).
We do know the original index of X; it is `i`, so it is safe
to place it anywhere in the contiguous stretch between `i`
and where we found it (`i+2` in the this case).
This patch does a pure swap; after finding X in the
situation above, we would end with:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | X | C | B | ...
-----------------------------------
We could instead bump X into the `i` slot, and then shift
the whole contiguous chain down by one, resulting in:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | X | B | C | ...
-----------------------------------
That puts our chain in true most-recently-used order.
However, experiments show that it is not any faster (and in
fact, is slightly slower due to the extra manipulation).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-01 22:34:50 +02:00
|
|
|
if (obj && i != first) {
|
|
|
|
/*
|
|
|
|
* Move object to where we started to look for it so
|
|
|
|
* that we do not need to walk the hash table the next
|
|
|
|
* time we look for it.
|
|
|
|
*/
|
2018-06-29 03:22:07 +02:00
|
|
|
SWAP(r->parsed_objects->obj_hash[i],
|
|
|
|
r->parsed_objects->obj_hash[first]);
|
lookup_object: prioritize recently found objects
The lookup_object function is backed by a hash table of all
objects we have seen in the program. We manage collisions
with a linear walk over the colliding entries, checking each
with hashcmp(). The main cost of lookup is in these
hashcmp() calls; finding our item in the first slot is
cheaper than finding it in the second slot, which is cheaper
than the third, and so on.
If we assume that there is some locality to the object
lookups (e.g., if X and Y collide, and we have just looked
up X, the next lookup is more likely to be for X than for
Y), then we can improve our average lookup speed by checking
X before Y.
This patch does so by swapping a found item to the front of
the collision chain. The p0001 perf test reveals that this
does indeed exploit locality in the case of "rev-list --all
--objects":
Test origin this tree
-------------------------------------------------------------------------
0001.1: rev-list --all 0.40(0.38+0.02) 0.40(0.36+0.03) +0.0%
0001.2: rev-list --all --objects 2.24(2.17+0.05) 1.86(1.79+0.05) -17.0%
This is not surprising, as the full object traversal will
hit the same tree entries over and over (e.g., for every
commit that doesn't change "Documentation/", we will have to
look up the same sha1 just to find out that we already
processed it).
The reason why this technique works (and does not violate
any properties of the hash table) is subtle and bears some
explanation. Let's imagine we get a lookup for sha1 `X`, and
it hashes to bucket `i` in our table. That stretch of the
table may look like:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | B | C | X | ...
-----------------------------------
We start our probe at i, see that B does not match, nor does
C, and finally find X. There may be multiple C's in the
middle, but we know that there are no empty slots (or else
we would not find X at all).
We do not know the original index of B; it may be `i`, or it
may be less than i (e.g., if it were `i-1`, it would collide
with A and spill over into the `i` bucket). So it is
acceptable for us to move it to the right of a contiguous
stretch of entries (because we will find it from a linear
walk starting anywhere at `i` or before), but never to the
left (if we moved it to `i-1`, we would miss it when
starting our walk at `i`).
We do know the original index of X; it is `i`, so it is safe
to place it anywhere in the contiguous stretch between `i`
and where we found it (`i+2` in the this case).
This patch does a pure swap; after finding X in the
situation above, we would end with:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | X | C | B | ...
-----------------------------------
We could instead bump X into the `i` slot, and then shift
the whole contiguous chain down by one, resulting in:
index | i-1 | i | i+1 | i+2 |
-----------------------------------
entry ... | A | X | B | C | ...
-----------------------------------
That puts our chain in true most-recently-used order.
However, experiments show that it is not any faster (and in
fact, is slightly slower due to the extra manipulation).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-01 22:34:50 +02:00
|
|
|
}
|
2006-06-30 20:20:33 +02:00
|
|
|
return obj;
|
2005-04-18 20:39:48 +02:00
|
|
|
}
|
|
|
|
|
2014-02-28 17:29:17 +01:00
|
|
|
/*
|
|
|
|
* Increase the size of the hash map stored in obj_hash to the next
|
|
|
|
* power of 2 (but at least 32). Copy the existing values to the new
|
|
|
|
* hash map.
|
|
|
|
*/
|
2018-05-08 21:37:34 +02:00
|
|
|
static void grow_object_hash(struct repository *r)
|
2005-04-18 20:39:48 +02:00
|
|
|
{
|
2006-06-30 20:20:33 +02:00
|
|
|
int i;
|
2013-09-11 00:17:12 +02:00
|
|
|
/*
|
|
|
|
* Note that this size must always be power-of-2 to match hash_obj
|
|
|
|
* above.
|
|
|
|
*/
|
2018-05-08 21:37:34 +02:00
|
|
|
int new_hash_size = r->parsed_objects->obj_hash_size < 32 ? 32 : 2 * r->parsed_objects->obj_hash_size;
|
2006-06-30 20:20:33 +02:00
|
|
|
struct object **new_hash;
|
|
|
|
|
2021-03-13 17:17:22 +01:00
|
|
|
CALLOC_ARRAY(new_hash, new_hash_size);
|
2018-05-08 21:37:34 +02:00
|
|
|
for (i = 0; i < r->parsed_objects->obj_hash_size; i++) {
|
|
|
|
struct object *obj = r->parsed_objects->obj_hash[i];
|
|
|
|
|
2006-06-30 20:20:33 +02:00
|
|
|
if (!obj)
|
|
|
|
continue;
|
|
|
|
insert_obj_hash(obj, new_hash, new_hash_size);
|
|
|
|
}
|
2018-05-08 21:37:34 +02:00
|
|
|
free(r->parsed_objects->obj_hash);
|
|
|
|
r->parsed_objects->obj_hash = new_hash;
|
|
|
|
r->parsed_objects->obj_hash_size = new_hash_size;
|
2005-04-18 20:39:48 +02:00
|
|
|
}
|
|
|
|
|
2019-06-20 09:41:21 +02:00
|
|
|
void *create_object(struct repository *r, const struct object_id *oid, void *o)
|
2005-04-18 20:39:48 +02:00
|
|
|
{
|
2007-04-17 07:11:43 +02:00
|
|
|
struct object *obj = o;
|
|
|
|
|
2005-04-18 20:39:48 +02:00
|
|
|
obj->parsed = 0;
|
2006-06-30 20:20:33 +02:00
|
|
|
obj->flags = 0;
|
2019-06-20 09:41:21 +02:00
|
|
|
oidcpy(&obj->oid, oid);
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2018-05-08 21:37:35 +02:00
|
|
|
if (r->parsed_objects->obj_hash_size - 1 <= r->parsed_objects->nr_objs * 2)
|
|
|
|
grow_object_hash(r);
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2018-05-08 21:37:35 +02:00
|
|
|
insert_obj_hash(obj, r->parsed_objects->obj_hash,
|
|
|
|
r->parsed_objects->obj_hash_size);
|
|
|
|
r->parsed_objects->nr_objs++;
|
2007-04-17 07:11:43 +02:00
|
|
|
return obj;
|
2005-04-18 20:39:48 +02:00
|
|
|
}
|
|
|
|
|
2020-06-17 11:14:08 +02:00
|
|
|
void *object_as_type(struct object *obj, enum object_type type, int quiet)
|
add object_as_type helper for casting objects
When we call lookup_commit, lookup_tree, etc, the logic goes
something like:
1. Look for an existing object struct. If we don't have
one, allocate and return a new one.
2. Double check that any object we have is the expected
type (and complain and return NULL otherwise).
3. Convert an object with type OBJ_NONE (from a prior
call to lookup_unknown_object) to the expected type.
We can encapsulate steps 2 and 3 in a helper function which
checks whether we have the expected object type, converts
OBJ_NONE as appropriate, and returns the object.
Not only does this shorten the code, but it also provides
one central location for converting OBJ_NONE objects into
objects of other types. Future patches will use that to
enforce type-specific invariants.
Since this is a refactoring, we would want it to behave
exactly as the current code. It takes a little reasoning to
see that this is the case:
- for lookup_{commit,tree,etc} functions, we are just
pulling steps 2 and 3 into a function that does the same
thing.
- for the call in peel_object, we currently only do step 3
(but we want to consolidate it with the others, as
mentioned above). However, step 2 is a noop here, as the
surrounding conditional makes sure we have OBJ_NONE
(which we want to keep to avoid an extraneous call to
sha1_object_info).
- for the call in lookup_commit_reference_gently, we are
currently doing step 2 but not step 3. However, step 3
is a noop here. The object we got will have just come
from deref_tag, which must have figured out the type for
each object in order to know when to stop peeling.
Therefore the type will never be OBJ_NONE.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-07-13 08:42:03 +02:00
|
|
|
{
|
|
|
|
if (obj->type == type)
|
|
|
|
return obj;
|
|
|
|
else if (obj->type == OBJ_NONE) {
|
2014-07-13 08:42:12 +02:00
|
|
|
if (type == OBJ_COMMIT)
|
2020-06-17 11:14:08 +02:00
|
|
|
init_commit_node((struct commit *) obj);
|
object_as_type: initialize commit-graph-related fields of 'struct commit'
When the commit graph and generation numbers were introduced in
commits 177722b344 (commit: integrate commit graph with commit
parsing, 2018-04-10) and 83073cc994 (commit: add generation number to
struct commit, 2018-04-25), they tried to make sure that the
corresponding 'graph_pos' and 'generation' fields of 'struct commit'
are initialized conservatively, as if the commit were not included in
the commit-graph file.
Alas, initializing those fields only in alloc_commit_node() missed the
case when an object that happens to be a commit is first looked up via
lookup_unknown_object(), and is then later converted to a 'struct
commit' via the object_as_type() helper function (either calling it
directly, or as part of a subsequent lookup_commit() call).
Consequently, both of those fields incorrectly remain set to zero,
which means e.g. that the commit is present in and is the first entry
of the commit-graph file. This will result in wrong timestamp, parent
and root tree hashes, if such a 'struct commit' instance is later
filled from the commit-graph.
Extract the initialization of 'struct commit's fields from
alloc_commit_node() into a helper function, and call it from
object_as_type() as well, to make sure that it properly initializes
the two commit-graph-related fields, too. With this helper function
it is hopefully less likely that any new fields added to 'struct
commit' in the future would remain uninitialized.
With this change alloc_commit_index() won't have any remaining callers
outside of 'alloc.c', so mark it as static.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-27 14:08:32 +01:00
|
|
|
else
|
|
|
|
obj->type = type;
|
add object_as_type helper for casting objects
When we call lookup_commit, lookup_tree, etc, the logic goes
something like:
1. Look for an existing object struct. If we don't have
one, allocate and return a new one.
2. Double check that any object we have is the expected
type (and complain and return NULL otherwise).
3. Convert an object with type OBJ_NONE (from a prior
call to lookup_unknown_object) to the expected type.
We can encapsulate steps 2 and 3 in a helper function which
checks whether we have the expected object type, converts
OBJ_NONE as appropriate, and returns the object.
Not only does this shorten the code, but it also provides
one central location for converting OBJ_NONE objects into
objects of other types. Future patches will use that to
enforce type-specific invariants.
Since this is a refactoring, we would want it to behave
exactly as the current code. It takes a little reasoning to
see that this is the case:
- for lookup_{commit,tree,etc} functions, we are just
pulling steps 2 and 3 into a function that does the same
thing.
- for the call in peel_object, we currently only do step 3
(but we want to consolidate it with the others, as
mentioned above). However, step 2 is a noop here, as the
surrounding conditional makes sure we have OBJ_NONE
(which we want to keep to avoid an extraneous call to
sha1_object_info).
- for the call in lookup_commit_reference_gently, we are
currently doing step 2 but not step 3. However, step 3
is a noop here. The object we got will have just come
from deref_tag, which must have figured out the type for
each object in order to know when to stop peeling.
Therefore the type will never be OBJ_NONE.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-07-13 08:42:03 +02:00
|
|
|
return obj;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (!quiet)
|
2018-07-21 09:49:33 +02:00
|
|
|
error(_("object %s is a %s, not a %s"),
|
2015-11-10 03:22:28 +01:00
|
|
|
oid_to_hex(&obj->oid),
|
2018-02-14 19:59:24 +01:00
|
|
|
type_name(obj->type), type_name(type));
|
add object_as_type helper for casting objects
When we call lookup_commit, lookup_tree, etc, the logic goes
something like:
1. Look for an existing object struct. If we don't have
one, allocate and return a new one.
2. Double check that any object we have is the expected
type (and complain and return NULL otherwise).
3. Convert an object with type OBJ_NONE (from a prior
call to lookup_unknown_object) to the expected type.
We can encapsulate steps 2 and 3 in a helper function which
checks whether we have the expected object type, converts
OBJ_NONE as appropriate, and returns the object.
Not only does this shorten the code, but it also provides
one central location for converting OBJ_NONE objects into
objects of other types. Future patches will use that to
enforce type-specific invariants.
Since this is a refactoring, we would want it to behave
exactly as the current code. It takes a little reasoning to
see that this is the case:
- for lookup_{commit,tree,etc} functions, we are just
pulling steps 2 and 3 into a function that does the same
thing.
- for the call in peel_object, we currently only do step 3
(but we want to consolidate it with the others, as
mentioned above). However, step 2 is a noop here, as the
surrounding conditional makes sure we have OBJ_NONE
(which we want to keep to avoid an extraneous call to
sha1_object_info).
- for the call in lookup_commit_reference_gently, we are
currently doing step 2 but not step 3. However, step 3
is a noop here. The object we got will have just come
from deref_tag, which must have figured out the type for
each object in order to know when to stop peeling.
Therefore the type will never be OBJ_NONE.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-07-13 08:42:03 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-13 09:16:36 +02:00
|
|
|
/*
 * Return the in-memory record for `oid` in repository `r`.  If none
 * is registered yet, a node from alloc_object_node() is registered
 * through create_object() and returned instead.
 */
struct object *lookup_unknown_object(struct repository *r, const struct object_id *oid)
{
	struct object *known = lookup_object(r, oid);

	if (known)
		return known;
	return create_object(r, oid, alloc_object_node(r));
}
|
|
|
|
|
object.h: add lookup_object_by_type() function
In some cases it's useful for efficiency reasons to get the type of an
object before deciding whether to parse it, but we still want an object
struct. E.g., in reachable.c, bitmaps give us the type, but we just want
to mark flags on each object. Likewise, we may loop over every object
and only parse tags in order to peel them; checking the type first lets
us avoid parsing the non-tags.
But our lookup_blob(), etc, functions make getting an object struct
annoying: we have to call the right function for every type. And we
cannot just use the generic lookup_object(), because it only returns an
already-seen object; it won't allocate a new object struct.
Let's provide a function that dispatches to the correct lookup_*
function based on a run-time type. In fact, reachable.c already has such
a helper, so we'll just make that public.
I did change the return type from "void *" to "struct object *". While
the former is a clever way to avoid casting inside the function, it's
less safe and less informative to people reading the function
declaration.
The next commit will add a new caller.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-22 18:06:41 +02:00
|
|
|
struct object *lookup_object_by_type(struct repository *r,
|
|
|
|
const struct object_id *oid,
|
|
|
|
enum object_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case OBJ_COMMIT:
|
|
|
|
return (struct object *)lookup_commit(r, oid);
|
|
|
|
case OBJ_TREE:
|
|
|
|
return (struct object *)lookup_tree(r, oid);
|
|
|
|
case OBJ_TAG:
|
|
|
|
return (struct object *)lookup_tag(r, oid);
|
|
|
|
case OBJ_BLOB:
|
|
|
|
return (struct object *)lookup_blob(r, oid);
|
|
|
|
default:
|
2021-12-07 12:05:54 +01:00
|
|
|
BUG("unknown object type %d", type);
|
object.h: add lookup_object_by_type() function
In some cases it's useful for efficiency reasons to get the type of an
object before deciding whether to parse it, but we still want an object
struct. E.g., in reachable.c, bitmaps give us the type, but we just want
to mark flags on each object. Likewise, we may loop over every object
and only parse tags in order to peel them; checking the type first lets
us avoid parsing the non-tags.
But our lookup_blob(), etc, functions make getting an object struct
annoying: we have to call the right function for every type. And we
cannot just use the generic lookup_object(), because it only returns an
already-seen object; it won't allocate a new object struct.
Let's provide a function that dispatches to the correct lookup_*
function based on a run-time type. In fact, reachable.c already has such
a helper, so we'll just make that public.
I did change the return type from "void *" to "struct object *". While
the former is a clever way to avoid casting inside the function, it's
less safe and less informative to people reading the function
declaration.
The next commit will add a new caller.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-22 18:06:41 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-29 03:22:18 +02:00
|
|
|
struct object *parse_object_buffer(struct repository *r, const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p)
|
2006-09-15 22:30:02 +02:00
|
|
|
{
|
|
|
|
struct object *obj;
|
2013-07-18 00:09:42 +02:00
|
|
|
*eaten_p = 0;
|
2006-09-15 22:30:02 +02:00
|
|
|
|
2007-12-21 11:56:32 +01:00
|
|
|
obj = NULL;
|
2007-02-26 20:55:59 +01:00
|
|
|
if (type == OBJ_BLOB) {
|
2018-06-29 03:22:18 +02:00
|
|
|
struct blob *blob = lookup_blob(r, oid);
|
2007-12-21 11:56:32 +01:00
|
|
|
if (blob) {
|
blob: drop unused parts of parse_blob_buffer()
Our parse_blob_buffer() takes a ptr/len combo, just like
parse_tree_buffer(), etc, and returns success or failure. But it doesn't
actually do anything with them; we just set the "parsed" flag in the
object and return success, without even looking at the contents.
There could be some value to keeping these unused parameters:
- it's consistent with the parse functions for other object types. But
we already lost that consistency in 837d395a5c (Replace parse_blob()
with an explanatory comment, 2010-01-18).
- As the comment from 837d395a5c explains, callers are supposed to
make sure they have the object content available. So in theory
asking for these parameters could serve as a signal. But there are
only two callers, and one of them always passes NULL (after doing a
streaming check of the object hash).
This shows that there aren't likely to be a lot of callers (since
everyone either uses the type-generic parse functions, or handles
blobs individually), and that they need to take special care anyway
(because we usually want to avoid loading whole blobs in memory if
we can avoid it).
So let's just drop these unused parameters, and likewise the useless
return value. While we're touching the header file, let's move the
declaration of parse_blob_buffer() right below that explanatory comment,
where it's more likely to be seen by people looking for the function.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-12-13 12:11:57 +01:00
|
|
|
parse_blob_buffer(blob);
|
2007-12-21 11:56:32 +01:00
|
|
|
obj = &blob->object;
|
|
|
|
}
|
2007-02-26 20:55:59 +01:00
|
|
|
} else if (type == OBJ_TREE) {
|
2018-06-29 03:22:18 +02:00
|
|
|
struct tree *tree = lookup_tree(r, oid);
|
2007-12-21 11:56:32 +01:00
|
|
|
if (tree) {
|
|
|
|
obj = &tree->object;
|
2011-11-17 07:04:13 +01:00
|
|
|
if (!tree->buffer)
|
|
|
|
tree->object.parsed = 0;
|
2007-12-21 11:56:32 +01:00
|
|
|
if (!tree->object.parsed) {
|
2008-02-03 22:22:39 +01:00
|
|
|
if (parse_tree_buffer(tree, buffer, size))
|
|
|
|
return NULL;
|
2013-07-18 00:09:42 +02:00
|
|
|
*eaten_p = 1;
|
2007-12-21 11:56:32 +01:00
|
|
|
}
|
2006-09-15 22:30:02 +02:00
|
|
|
}
|
2007-02-26 20:55:59 +01:00
|
|
|
} else if (type == OBJ_COMMIT) {
|
2018-06-29 03:22:18 +02:00
|
|
|
struct commit *commit = lookup_commit(r, oid);
|
2007-12-21 11:56:32 +01:00
|
|
|
if (commit) {
|
2018-06-29 03:22:18 +02:00
|
|
|
if (parse_commit_buffer(r, commit, buffer, size, 1))
|
2008-02-03 22:22:39 +01:00
|
|
|
return NULL;
|
parse_object_buffer(): respect save_commit_buffer
If the global variable "save_commit_buffer" is set to 0, then
parse_commit() will throw away the commit object data after parsing it,
rather than sticking it into a commit slab. This goes all the way back
to 60ab26de99 ([PATCH] Avoid wasting memory in git-rev-list,
2005-09-15).
But there's another code path which may similarly stash the buffer:
parse_object_buffer(). This is where we end up if we parse a commit via
parse_object(), and it's used directly in a few other code paths like
git-fsck.
The original goal of 60ab26de99 was avoiding extra memory usage for
rev-list. And there it's not all that important to catch parse_object().
We use that function only for looking at the tips of the traversal, and
the majority of the commits are parsed by following parent links, where
we use parse_commit() directly. So we were wasting some memory, but only
a small portion.
It's much easier to see the effect with fsck. Since we now turn off
save_commit_buffer by default there, we _should_ be able to drop the
freeing of the commit buffer in fsck_obj(). But if we do so (taking the
first hunk of this patch without the rest), then the peak heap of "git
fsck" in a clone of git.git goes from 136MB to 194MB. Teaching
parse_object_buffer() to respect save_commit_buffer brings that down to
134.5MB (it's hard to tell from massif's output, but I suspect the
savings comes from avoiding the overhead of the mostly-empty commit
slab).
Other programs should see a small improvement. Both "rev-list --all" and
"fsck --connectivity-only" improve by a few hundred kilobytes, as they'd
avoid loading the tip objects of their traversals.
Most importantly, no code should be hurt by doing this. Any program that
turns off save_commit_buffer is already making the assumption that any
commit it sees may need to have its object data loaded on demand, as it
doesn't know which ones were parsed by parse_commit() versus
parse_object(). Not to mention that anything parsed by the commit graph
may be in the same boat, even if save_commit_buffer was not disabled.
This should be the only spot that needs to be fixed. Grepping for
set_commit_buffer() shows that this and parse_commit() are the only
relevant calls.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-22 12:15:38 +02:00
|
|
|
if (save_commit_buffer &&
|
|
|
|
!get_cached_commit_buffer(r, commit, NULL)) {
|
2018-06-29 03:22:18 +02:00
|
|
|
set_commit_buffer(r, commit, buffer, size);
|
2013-07-18 00:09:42 +02:00
|
|
|
*eaten_p = 1;
|
2007-12-21 11:56:32 +01:00
|
|
|
}
|
|
|
|
obj = &commit->object;
|
2006-09-15 22:30:02 +02:00
|
|
|
}
|
2007-02-26 20:55:59 +01:00
|
|
|
} else if (type == OBJ_TAG) {
|
2018-06-29 03:22:18 +02:00
|
|
|
struct tag *tag = lookup_tag(r, oid);
|
2007-12-21 11:56:32 +01:00
|
|
|
if (tag) {
|
2018-06-29 03:22:18 +02:00
|
|
|
if (parse_tag_buffer(r, tag, buffer, size))
|
2008-02-03 22:22:39 +01:00
|
|
|
return NULL;
|
2007-12-21 11:56:32 +01:00
|
|
|
obj = &tag->object;
|
|
|
|
}
|
2006-09-15 22:30:02 +02:00
|
|
|
} else {
|
2018-07-21 09:49:33 +02:00
|
|
|
warning(_("object %s has unknown type id %d"), oid_to_hex(oid), type);
|
2006-09-15 22:30:02 +02:00
|
|
|
obj = NULL;
|
|
|
|
}
|
|
|
|
return obj;
|
|
|
|
}
|
|
|
|
|
object: convert parse_object* to take struct object_id
Make parse_object, parse_object_or_die, and parse_object_buffer take a
pointer to struct object_id. Remove the temporary variables inserted
earlier, since they are no longer necessary. Transform all of the
callers using the following semantic patch:
@@
expression E1;
@@
- parse_object(E1.hash)
+ parse_object(&E1)
@@
expression E1;
@@
- parse_object(E1->hash)
+ parse_object(E1)
@@
expression E1, E2;
@@
- parse_object_or_die(E1.hash, E2)
+ parse_object_or_die(&E1, E2)
@@
expression E1, E2;
@@
- parse_object_or_die(E1->hash, E2)
+ parse_object_or_die(E1, E2)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1.hash, E2, E3, E4, E5)
+ parse_object_buffer(&E1, E2, E3, E4, E5)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1->hash, E2, E3, E4, E5)
+ parse_object_buffer(E1, E2, E3, E4, E5)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-07 00:10:38 +02:00
|
|
|
struct object *parse_object_or_die(const struct object_id *oid,
|
2013-03-17 09:22:36 +01:00
|
|
|
const char *name)
|
|
|
|
{
|
2018-06-29 03:21:51 +02:00
|
|
|
struct object *o = parse_object(the_repository, oid);
|
2013-03-17 09:22:36 +01:00
|
|
|
if (o)
|
|
|
|
return o;
|
|
|
|
|
object: convert parse_object* to take struct object_id
Make parse_object, parse_object_or_die, and parse_object_buffer take a
pointer to struct object_id. Remove the temporary variables inserted
earlier, since they are no longer necessary. Transform all of the
callers using the following semantic patch:
@@
expression E1;
@@
- parse_object(E1.hash)
+ parse_object(&E1)
@@
expression E1;
@@
- parse_object(E1->hash)
+ parse_object(E1)
@@
expression E1, E2;
@@
- parse_object_or_die(E1.hash, E2)
+ parse_object_or_die(&E1, E2)
@@
expression E1, E2;
@@
- parse_object_or_die(E1->hash, E2)
+ parse_object_or_die(E1, E2)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1.hash, E2, E3, E4, E5)
+ parse_object_buffer(&E1, E2, E3, E4, E5)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1->hash, E2, E3, E4, E5)
+ parse_object_buffer(E1, E2, E3, E4, E5)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-07 00:10:38 +02:00
|
|
|
die(_("unable to parse object: %s"), name ? name : oid_to_hex(oid));
|
2013-03-17 09:22:36 +01:00
|
|
|
}
|
|
|
|
|
parse_object(): allow skipping hash check
The parse_object() function checks the object hash of any object it
parses. This is a nice feature, as it means we may catch bit corruption
during normal use, rather than waiting for specific fsck operations.
But it also can be slow. It's particularly noticeable for blobs, where
except for the hash check, we could return without loading the object
contents at all. Now one may wonder what is the point of calling
parse_object() on a blob in the first place then, but usually it's not
intentional: we were fed an oid from somewhere, don't know the type, and
want an object struct. For commits and trees, the parsing is usually
helpful; we're about to look at the contents anyway. But this is less
true for blobs, where we may be collecting them as part of a
reachability traversal, etc, and don't actually care what's in them. And
blobs, of course, tend to be larger.
We don't want to just throw out the hash-checks for blobs, though. We do
depend on them in some circumstances (e.g., rev-list --verify-objects
uses parse_object() to check them). It's only the callers that know
how they're going to use the result. And so we can help them by
providing a special flag to skip the hash check.
We could just apply this to blobs, as they're going to be the main
source of performance improvement. But if a caller doesn't care about
checking the hash, we might as well skip it for other object types, too.
Even though we can't avoid reading the object contents, we can still
skip the actual hash computation.
If this seems like it is making Git a little bit less safe against
corruption, it may be. But it's part of a series of tradeoffs we're
already making. For instance, "rev-list --objects" does not open the
contents of blobs it prints. And when a commit graph is present, we skip
opening most commits entirely. The important thing will be to use this
flag in cases where it's safe to skip the check. For instance, when
serving a pack for a fetch, we know the client will fully index the
objects and do a connectivity check itself. There's little to be gained
from the server side re-hashing a blob itself. And indeed, most of the
time we don't! The revision machinery won't open up a blob reached by
traversal, but only one requested directly with a "want" line. So
applied properly, this new feature shouldn't make anything less safe in
practice.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-07 01:01:34 +02:00
|
|
|
struct object *parse_object_with_flags(struct repository *r,
|
|
|
|
const struct object_id *oid,
|
|
|
|
enum parse_object_flags flags)
|
2005-04-28 16:46:33 +02:00
|
|
|
{
|
parse_object(): allow skipping hash check
The parse_object() function checks the object hash of any object it
parses. This is a nice feature, as it means we may catch bit corruption
during normal use, rather than waiting for specific fsck operations.
But it also can be slow. It's particularly noticeable for blobs, where
except for the hash check, we could return without loading the object
contents at all. Now one may wonder what is the point of calling
parse_object() on a blob in the first place then, but usually it's not
intentional: we were fed an oid from somewhere, don't know the type, and
want an object struct. For commits and trees, the parsing is usually
helpful; we're about to look at the contents anyway. But this is less
true for blobs, where we may be collecting them as part of a
reachability traversal, etc, and don't actually care what's in them. And
blobs, of course, tend to be larger.
We don't want to just throw out the hash-checks for blobs, though. We do
depend on them in some circumstances (e.g., rev-list --verify-objects
uses parse_object() to check them). It's only the callers that know
how they're going to use the result. And so we can help them by
providing a special flag to skip the hash check.
We could just apply this to blobs, as they're going to be the main
source of performance improvement. But if a caller doesn't care about
checking the hash, we might as well skip it for other object types, too.
Even though we can't avoid reading the object contents, we can still
skip the actual hash computation.
If this seems like it is making Git a little bit less safe against
corruption, it may be. But it's part of a series of tradeoffs we're
already making. For instance, "rev-list --objects" does not open the
contents of blobs it prints. And when a commit graph is present, we skip
opening most commits entirely. The important thing will be to use this
flag in cases where it's safe to skip the check. For instance, when
serving a pack for a fetch, we know the client will fully index the
objects and do a connectivity check itself. There's little to be gained
from the server side re-hashing a blob itself. And indeed, most of the
time we don't! The revision machinery won't open up a blob reached by
traversal, but only one requested directly with a "want" line. So
applied properly, this new feature shouldn't make anything less safe in
practice.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-07 01:01:34 +02:00
|
|
|
int skip_hash = !!(flags & PARSE_OBJECT_SKIP_HASH_CHECK);
|
2005-06-27 12:33:33 +02:00
|
|
|
unsigned long size;
|
2007-02-26 20:55:59 +01:00
|
|
|
enum object_type type;
|
2006-09-15 22:30:02 +02:00
|
|
|
int eaten;
|
2018-06-29 03:22:19 +02:00
|
|
|
const struct object_id *repl = lookup_replace_object(r, oid);
|
parse_object: try internal cache before reading object db
When parse_object is called, we do the following:
1. read the object data into a buffer via read_sha1_file
2. call parse_object_buffer, which then:
a. calls the appropriate lookup_{commit,tree,blob,tag}
to either create a new "struct object", or to find
an existing one. We know the appropriate type from
the lookup in step 1.
b. calls the appropriate parse_{commit,tree,blob,tag}
to parse the buffer for the new (or existing) object
In step 2b, all of the called functions are no-ops for
object "X" if "X->object.parsed" is set. I.e., when we have
already parsed an object, we end up going to a lot of work
just to find out at a low level that there is nothing left
for us to do (and we throw away the data from read_sha1_file
unread).
We can optimize this by moving the check for "do we have an
in-memory object" from 2a before the expensive call to
read_sha1_file in step 1.
This might seem circular, since step 2a uses the type
information determined in step 1 to call the appropriate
lookup function. However, we can notice that all of the
lookup_* functions are backed by lookup_object. In other
words, all of the objects are kept in a master hash table,
and we don't actually need the type to do the "do we have
it" part of the lookup, only to do the "and create it if it
doesn't exist" part.
This can save time whenever we call parse_object on the same
sha1 twice in a single program. Some code paths already
perform this optimization manually, with either:
if (!obj->parsed)
obj = parse_object(obj->sha1);
if you already have a "struct object", or:
struct object *obj = lookup_unknown_object(sha1);
if (!obj || !obj->parsed)
obj = parse_object(sha1);
if you don't. This patch moves the optimization into
parse_object itself.
Most git operations won't notice any impact. Either they
don't parse a lot of duplicate sha1s, or the calling code
takes special care not to re-parse objects. I timed two
code paths that do benefit (there may be more, but these two
were immediately obvious and easy to time).
The first is fast-export, which calls parse_object on each
object it outputs, like this:
object = parse_object(sha1);
if (!object)
die(...);
if (object->flags & SHOWN)
return;
which means that just to realize we have already shown an
object, we will read the whole object from disk!
With this patch, my best-of-five time for "fast-export --all" on
git.git dropped from 26.3s to 21.3s.
The second case is upload-pack, which will call parse_object
for each advertised ref (because it needs to peel tags to
show "^{}" entries). This doesn't matter for most
repositories, because they don't have a lot of refs pointing
to the same objects. However, if you have a big alternates
repository with a shared object db for a number of child
repositories, then the alternates repository will have
duplicated refs representing each of its children.
For example, GitHub's alternates repository for git.git has
~120,000 refs, of which only ~3200 are unique. The time for
upload-pack to print its list of advertised refs dropped
from 3.4s to 0.76s.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-01-05 22:00:01 +01:00
|
|
|
void *buffer;
|
|
|
|
struct object *obj;
|
|
|
|
|
2019-06-20 09:41:14 +02:00
|
|
|
obj = lookup_object(r, oid);
|
parse_object: try internal cache before reading object db
When parse_object is called, we do the following:
1. read the object data into a buffer via read_sha1_file
2. call parse_object_buffer, which then:
a. calls the appropriate lookup_{commit,tree,blob,tag}
to either create a new "struct object", or to find
an existing one. We know the appropriate type from
the lookup in step 1.
b. calls the appropriate parse_{commit,tree,blob,tag}
to parse the buffer for the new (or existing) object
In step 2b, all of the called functions are no-ops for
object "X" if "X->object.parsed" is set. I.e., when we have
already parsed an object, we end up going to a lot of work
just to find out at a low level that there is nothing left
for us to do (and we throw away the data from read_sha1_file
unread).
We can optimize this by moving the check for "do we have an
in-memory object" from 2a before the expensive call to
read_sha1_file in step 1.
This might seem circular, since step 2a uses the type
information determined in step 1 to call the appropriate
lookup function. However, we can notice that all of the
lookup_* functions are backed by lookup_object. In other
words, all of the objects are kept in a master hash table,
and we don't actually need the type to do the "do we have
it" part of the lookup, only to do the "and create it if it
doesn't exist" part.
This can save time whenever we call parse_object on the same
sha1 twice in a single program. Some code paths already
perform this optimization manually, with either:
if (!obj->parsed)
obj = parse_object(obj->sha1);
if you already have a "struct object", or:
struct object *obj = lookup_unknown_object(sha1);
if (!obj || !obj->parsed)
obj = parse_object(sha1);
if you don't. This patch moves the optimization into
parse_object itself.
Most git operations won't notice any impact. Either they
don't parse a lot of duplicate sha1s, or the calling code
takes special care not to re-parse objects. I timed two
code paths that do benefit (there may be more, but these two
were immediately obvious and easy to time).
The first is fast-export, which calls parse_object on each
object it outputs, like this:
object = parse_object(sha1);
if (!object)
die(...);
if (object->flags & SHOWN)
return;
which means that just to realize we have already shown an
object, we will read the whole object from disk!
With this patch, my best-of-five time for "fast-export --all" on
git.git dropped from 26.3s to 21.3s.
The second case is upload-pack, which will call parse_object
for each advertised ref (because it needs to peel tags to
show "^{}" entries). This doesn't matter for most
repositories, because they don't have a lot of refs pointing
to the same objects. However, if you have a big alternates
repository with a shared object db for a number of child
repositories, then the alternates repository will have
duplicated refs representing each of its children.
For example, GitHub's alternates repository for git.git has
~120,000 refs, of which only ~3200 are unique. The time for
upload-pack to print its list of advertised refs dropped
from 3.4s to 0.76s.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-01-05 22:00:01 +01:00
|
|
|
if (obj && obj->parsed)
|
|
|
|
return obj;
|
2006-09-15 22:30:02 +02:00
|
|
|
|
2022-09-07 01:06:25 +02:00
|
|
|
if (skip_hash) {
|
|
|
|
struct commit *commit = lookup_commit_in_graph(r, repl);
|
|
|
|
if (commit)
|
|
|
|
return &commit->object;
|
|
|
|
}
|
|
|
|
|
2022-11-21 20:26:55 +01:00
|
|
|
if ((!obj || obj->type == OBJ_BLOB) &&
|
parse_object(): check on-disk type of suspected blob
In parse_object(), we try to handle blobs by streaming rather than
loading them entirely into memory. The most common case here will be
that we haven't seen the object yet and check oid_object_info(), which
tells us we have a blob.
But we trigger this code on one other case: when we have an in-memory
object struct with type OBJ_BLOB (and without its "parsed" flag set,
since otherwise we'd return early from the function). This indicates
that some other part of the code suspected we have a blob (e.g., it was
mentioned by a tree or tag) but we haven't yet looked at the on-disk
copy.
In this case before hitting the streaming path, we check if we have the
object on-disk at all. This is mostly pointless extra work, as the
streaming path would complain if it couldn't open the object (albeit
with the message "hash mismatch", which is a little misleading).
But it's also insufficient to catch all problems. The streaming code
will only tell us "yes, the on-disk object matches the oid". But it
doesn't actually confirm that what we found was indeed a blob, and
neither does repo_has_object_file().
One way to improve this would be to teach stream_object_signature() to
check the type (either by returning it to us to check, or taking an
"expected" type). But there's an even simpler fix here: if we suspect
the object is a blob, just call oid_object_info() to confirm that we
have it on-disk, and that it really is a blob.
This is slightly less efficient than teaching stream_object_signature()
to do it (since it has to open the object already). But this case very
rarely comes up. In practice, we usually don't have any clue what the
type is, in which case we already call oid_object_info(). This
"suspected" case happens only when some other code created an object
struct but didn't actually parse the blob, which is actually tricky to
trigger at all (see the discussion of the test below).
I reworked the conditional a bit so that instead of:
if ((suspected_blob && oid_object_info() == OBJ_BLOB)
(no_clue && oid_object_info() == OBJ_BLOB)
we have the simpler:
if ((suspected_blob || no_clue) && oid_object_info() == OBJ_BLOB)
This is shorter, but also reflects what we really want say, which is
"have we ruled out this being a blob; if not, check it on-disk".
In either case, if oid_object_info() fails to tell us it's a blob, we'll
skip the streaming code path and call repo_read_object_file(), just as
before. And if we really do have a mismatch with the existing object
struct, we'll eventually call lookup_commit(), etc, via
parse_object_buffer(), which will complain that it doesn't match our
existing obj->type.
So this fixes one of the lingering expect_failure cases from 0616617c7e
(t: introduce tests for unexpected object types, 2019-04-09). That test
works by peeling a tag that claims to point to a blob (triggering us to
create the struct), but really points to something else, which we later
discover when we call parse_object() as part of the actual traversal).
Prior to this commit, we'd quietly check the sha1 and mark the blob as
"parsed". Now we correctly complain about the mismatch.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
2022-11-17 23:41:16 +01:00
|
|
|
oid_object_info(r, oid, NULL) == OBJ_BLOB) {
|
parse_object(): allow skipping hash check
The parse_object() function checks the object hash of any object it
parses. This is a nice feature, as it means we may catch bit corruption
during normal use, rather than waiting for specific fsck operations.
But it also can be slow. It's particularly noticeable for blobs, where
except for the hash check, we could return without loading the object
contents at all. Now one may wonder what is the point of calling
parse_object() on a blob in the first place then, but usually it's not
intentional: we were fed an oid from somewhere, don't know the type, and
want an object struct. For commits and trees, the parsing is usually
helpful; we're about to look at the contents anyway. But this is less
true for blobs, where we may be collecting them as part of a
reachability traversal, etc, and don't actually care what's in them. And
blobs, of course, tend to be larger.
We don't want to just throw out the hash-checks for blobs, though. We do
depend on them in some circumstances (e.g., rev-list --verify-objects
uses parse_object() to check them). It's only the callers that know
how they're going to use the result. And so we can help them by
providing a special flag to skip the hash check.
We could just apply this to blobs, as they're going to be the main
source of performance improvement. But if a caller doesn't care about
checking the hash, we might as well skip it for other object types, too.
Even though we can't avoid reading the object contents, we can still
skip the actual hash computation.
If this seems like it is making Git a little bit less safe against
corruption, it may be. But it's part of a series of tradeoffs we're
already making. For instance, "rev-list --objects" does not open the
contents of blobs it prints. And when a commit graph is present, we skip
opening most commits entirely. The important thing will be to use this
flag in cases where it's safe to skip the check. For instance, when
serving a pack for a fetch, we know the client will fully index the
objects and do a connectivity check itself. There's little to be gained
from the server side re-hashing a blob itself. And indeed, most of the
time we don't! The revision machinery won't open up a blob reached by
traversal, but only one requested directly with a "want" line. So
applied properly, this new feature shouldn't make anything less safe in
practice.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-07 01:01:34 +02:00
|
|
|
if (!skip_hash && stream_object_signature(r, repl) < 0) {
|
2019-01-07 09:40:34 +01:00
|
|
|
error(_("hash mismatch %s"), oid_to_hex(oid));
|
2012-03-07 11:54:18 +01:00
|
|
|
return NULL;
|
|
|
|
}
|
blob: drop unused parts of parse_blob_buffer()
Our parse_blob_buffer() takes a ptr/len combo, just like
parse_tree_buffer(), etc, and returns success or failure. But it doesn't
actually do anything with them; we just set the "parsed" flag in the
object and return success, without even looking at the contents.
There could be some value to keeping these unused parameters:
- it's consistent with the parse functions for other object types. But
we already lost that consistency in 837d395a5c (Replace parse_blob()
with an explanatory comment, 2010-01-18).
- As the comment from 837d395a5c explains, callers are supposed to
make sure they have the object content available. So in theory
asking for these parameters could serve as a signal. But there are
only two callers, and one of them always passes NULL (after doing a
streaming check of the object hash).
This shows that there aren't likely to be a lot of callers (since
everyone either uses the type-generic parse functions, or handles
blobs individually), and that they need to take special care anyway
(because we usually want to avoid loading whole blobs in memory if
we can avoid it).
So let's just drop these unused parameters, and likewise the useless
return value. While we're touching the header file, let's move the
declaration of parse_blob_buffer() right below that explanatory comment,
where it's more likely to be seen by people looking for the function.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-12-13 12:11:57 +01:00
|
|
|
parse_blob_buffer(lookup_blob(r, oid));
|
2019-06-20 09:41:14 +02:00
|
|
|
return lookup_object(r, oid);
|
2012-03-07 11:54:18 +01:00
|
|
|
}
|
|
|
|
|
2018-11-14 01:12:49 +01:00
|
|
|
buffer = repo_read_object_file(r, oid, &type, &size);
|
2005-06-27 12:33:33 +02:00
|
|
|
if (buffer) {
|
parse_object(): allow skipping hash check
The parse_object() function checks the object hash of any object it
parses. This is a nice feature, as it means we may catch bit corruption
during normal use, rather than waiting for specific fsck operations.
But it also can be slow. It's particularly noticeable for blobs, where
except for the hash check, we could return without loading the object
contents at all. Now one may wonder what is the point of calling
parse_object() on a blob in the first place then, but usually it's not
intentional: we were fed an oid from somewhere, don't know the type, and
want an object struct. For commits and trees, the parsing is usually
helpful; we're about to look at the contents anyway. But this is less
true for blobs, where we may be collecting them as part of a
reachability traversal, etc, and don't actually care what's in them. And
blobs, of course, tend to be larger.
We don't want to just throw out the hash-checks for blobs, though. We do
depend on them in some circumstances (e.g., rev-list --verify-objects
uses parse_object() to check them). It's only the callers that know
how they're going to use the result. And so we can help them by
providing a special flag to skip the hash check.
We could just apply this to blobs, as they're going to be the main
source of performance improvement. But if a caller doesn't care about
checking the hash, we might as well skip it for other object types, too.
Even though we can't avoid reading the object contents, we can still
skip the actual hash computation.
If this seems like it is making Git a little bit less safe against
corruption, it may be. But it's part of a series of tradeoffs we're
already making. For instance, "rev-list --objects" does not open the
contents of blobs it prints. And when a commit graph is present, we skip
opening most commits entirely. The important thing will be to use this
flag in cases where it's safe to skip the check. For instance, when
serving a pack for a fetch, we know the client will fully index the
objects and do a connectivity check itself. There's little to be gained
from the server side re-hashing a blob itself. And indeed, most of the
time we don't! The revision machinery won't open up a blob reached by
traversal, but only one requested directly with a "want" line. So
applied properly, this new feature shouldn't make anything less safe in
practice.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-07 01:01:34 +02:00
|
|
|
if (!skip_hash &&
|
|
|
|
check_object_signature(r, repl, buffer, size, type) < 0) {
|
2007-05-25 03:46:22 +02:00
|
|
|
free(buffer);
|
2019-01-07 09:40:34 +01:00
|
|
|
error(_("hash mismatch %s"), oid_to_hex(repl));
|
2007-03-20 18:05:20 +01:00
|
|
|
return NULL;
|
|
|
|
}
|
2006-09-15 22:30:02 +02:00
|
|
|
|
2018-06-29 03:22:19 +02:00
|
|
|
obj = parse_object_buffer(r, oid, type, size,
|
2018-06-29 03:21:53 +02:00
|
|
|
buffer, &eaten);
|
2006-09-15 22:30:02 +02:00
|
|
|
if (!eaten)
|
|
|
|
free(buffer);
|
2005-05-06 19:48:34 +02:00
|
|
|
return obj;
|
2005-04-28 16:46:33 +02:00
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-08-03 01:45:48 +02:00
|
|
|
|
parse_object(): allow skipping hash check
The parse_object() function checks the object hash of any object it
parses. This is a nice feature, as it means we may catch bit corruption
during normal use, rather than waiting for specific fsck operations.
But it also can be slow. It's particularly noticeable for blobs, where
except for the hash check, we could return without loading the object
contents at all. Now one may wonder what is the point of calling
parse_object() on a blob in the first place then, but usually it's not
intentional: we were fed an oid from somewhere, don't know the type, and
want an object struct. For commits and trees, the parsing is usually
helpful; we're about to look at the contents anyway. But this is less
true for blobs, where we may be collecting them as part of a
reachability traversal, etc, and don't actually care what's in them. And
blobs, of course, tend to be larger.
We don't want to just throw out the hash-checks for blobs, though. We do
depend on them in some circumstances (e.g., rev-list --verify-objects
uses parse_object() to check them). It's only the callers that know
how they're going to use the result. And so we can help them by
providing a special flag to skip the hash check.
We could just apply this to blobs, as they're going to be the main
source of performance improvement. But if a caller doesn't care about
checking the hash, we might as well skip it for other object types, too.
Even though we can't avoid reading the object contents, we can still
skip the actual hash computation.
If this seems like it is making Git a little bit less safe against
corruption, it may be. But it's part of a series of tradeoffs we're
already making. For instance, "rev-list --objects" does not open the
contents of blobs it prints. And when a commit graph is present, we skip
opening most commits entirely. The important thing will be to use this
flag in cases where it's safe to skip the check. For instance, when
serving a pack for a fetch, we know the client will fully index the
objects and do a connectivity check itself. There's little to be gained
from the server side re-hashing a blob itself. And indeed, most of the
time we don't! The revision machinery won't open up a blob reached by
traversal, but only one requested directly with a "want" line. So
applied properly, this new feature shouldn't make anything less safe in
practice.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-07 01:01:34 +02:00
|
|
|
/*
 * Parse the object named by `oid` in repository `r`.
 *
 * This is the flag-less entry point: it forwards to
 * parse_object_with_flags() with a flags value of 0 and hands back
 * whatever that call returns.
 */
struct object *parse_object(struct repository *r, const struct object_id *oid)
{
	struct object *parsed = parse_object_with_flags(r, oid, 0);

	return parsed;
}
|
|
|
|
|
2005-08-03 01:45:48 +02:00
|
|
|
struct object_list *object_list_insert(struct object *item,
|
|
|
|
struct object_list **list_p)
|
|
|
|
{
|
|
|
|
struct object_list *new_list = xmalloc(sizeof(struct object_list));
|
2010-09-05 21:36:33 +02:00
|
|
|
new_list->item = item;
|
|
|
|
new_list->next = *list_p;
|
|
|
|
*list_p = new_list;
|
|
|
|
return new_list;
|
2005-08-03 01:45:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
int object_list_contains(struct object_list *list, struct object *obj)
|
|
|
|
{
|
|
|
|
while (list) {
|
|
|
|
if (list->item == obj)
|
|
|
|
return 1;
|
|
|
|
list = list->next;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
|
2020-02-13 03:16:33 +01:00
|
|
|
void object_list_free(struct object_list **list)
|
|
|
|
{
|
|
|
|
while (*list) {
|
|
|
|
struct object_list *p = *list;
|
|
|
|
*list = p->next;
|
|
|
|
free(p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
object_array_entry: fix memory handling of the name field
Previously, the memory management of the object_array_entry::name
field was inconsistent and undocumented. object_array_entries are
ultimately created by a single function, add_object_array_with_mode(),
which has an argument "const char *name". This function used to
simply set the name field to reference the string pointed to by the
name parameter, and nobody on the object_array side ever freed the
memory. Thus, it assumed that the memory for the name field would be
managed by the caller, and that the lifetime of that string would be
at least as long as the lifetime of the object_array_entry. But
callers were inconsistent:
* Some passed pointers to constant strings or argv entries, which was
OK.
* Some passed pointers to newly-allocated memory, but didn't arrange
for the memory ever to be freed.
* Some passed the return value of sha1_to_hex(), which is a pointer to
a statically-allocated buffer that can be overwritten at any time.
* Some passed pointers to refnames that they received from a
for_each_ref()-type iteration, but the lifetimes of such refnames is
not guaranteed by the refs API.
Bring consistency to this mess by changing object_array to make its
own copy for the object_array_entry::name field and free this memory
when an object_array_entry is deleted from the array.
Many callers were passing the empty string as the name parameter, so
as a performance optimization, treat the empty string specially.
Instead of making a copy, store a pointer to a statically-allocated
empty string to object_array_entry::name. When deleting such an
entry, skip the free().
Change the callers that were already passing copies to
add_object_array_with_mode() to either skip the copy, or (if the
memory needed to be allocated anyway) freeing the memory itself.
A part of this commit effectively reverts
70d26c6e76 read_revisions_from_stdin: make copies for handle_revision_arg
because the copying introduced by that commit (which is still
necessary) is now done at a deeper level.
Signed-off-by: Michael Haggerty <mhagger@alum.mit.edu>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-25 11:08:14 +02:00
|
|
|
/*
 * A one-byte buffer holding the empty string.  object_array_entry::name
 * can be pointed at it, so that an empty name needs neither a malloc nor
 * a matching free.
 */
static char object_array_slopbuf[1] = { '\0' };
|
|
|
|
|
2014-10-16 00:42:57 +02:00
|
|
|
void add_object_array_with_path(struct object *obj, const char *name,
|
|
|
|
struct object_array *array,
|
|
|
|
unsigned mode, const char *path)
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
{
|
|
|
|
unsigned nr = array->nr;
|
|
|
|
unsigned alloc = array->alloc;
|
|
|
|
struct object_array_entry *objects = array->objects;
|
object_array_entry: fix memory handling of the name field
Previously, the memory management of the object_array_entry::name
field was inconsistent and undocumented. object_array_entries are
ultimately created by a single function, add_object_array_with_mode(),
which has an argument "const char *name". This function used to
simply set the name field to reference the string pointed to by the
name parameter, and nobody on the object_array side ever freed the
memory. Thus, it assumed that the memory for the name field would be
managed by the caller, and that the lifetime of that string would be
at least as long as the lifetime of the object_array_entry. But
callers were inconsistent:
* Some passed pointers to constant strings or argv entries, which was
OK.
* Some passed pointers to newly-allocated memory, but didn't arrange
for the memory ever to be freed.
* Some passed the return value of sha1_to_hex(), which is a pointer to
a statically-allocated buffer that can be overwritten at any time.
* Some passed pointers to refnames that they received from a
for_each_ref()-type iteration, but the lifetimes of such refnames is
not guaranteed by the refs API.
Bring consistency to this mess by changing object_array to make its
own copy for the object_array_entry::name field and free this memory
when an object_array_entry is deleted from the array.
Many callers were passing the empty string as the name parameter, so
as a performance optimization, treat the empty string specially.
Instead of making a copy, store a pointer to a statically-allocated
empty string to object_array_entry::name. When deleting such an
entry, skip the free().
Change the callers that were already passing copies to
add_object_array_with_mode() to either skip the copy, or (if the
memory needed to be allocated anyway) freeing the memory itself.
A part of this commit effectively reverts
70d26c6e76 read_revisions_from_stdin: make copies for handle_revision_arg
because the copying introduced by that commit (which is still
necessary) is now done at a deeper level.
Signed-off-by: Michael Haggerty <mhagger@alum.mit.edu>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-25 11:08:14 +02:00
|
|
|
struct object_array_entry *entry;
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
|
|
|
|
if (nr >= alloc) {
|
|
|
|
alloc = (alloc + 32) * 2;
|
2014-09-16 20:56:57 +02:00
|
|
|
REALLOC_ARRAY(objects, alloc);
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
array->alloc = alloc;
|
|
|
|
array->objects = objects;
|
|
|
|
}
|
object_array_entry: fix memory handling of the name field
Previously, the memory management of the object_array_entry::name
field was inconsistent and undocumented. object_array_entries are
ultimately created by a single function, add_object_array_with_mode(),
which has an argument "const char *name". This function used to
simply set the name field to reference the string pointed to by the
name parameter, and nobody on the object_array side ever freed the
memory. Thus, it assumed that the memory for the name field would be
managed by the caller, and that the lifetime of that string would be
at least as long as the lifetime of the object_array_entry. But
callers were inconsistent:
* Some passed pointers to constant strings or argv entries, which was
OK.
* Some passed pointers to newly-allocated memory, but didn't arrange
for the memory ever to be freed.
* Some passed the return value of sha1_to_hex(), which is a pointer to
a statically-allocated buffer that can be overwritten at any time.
* Some passed pointers to refnames that they received from a
for_each_ref()-type iteration, but the lifetimes of such refnames is
not guaranteed by the refs API.
Bring consistency to this mess by changing object_array to make its
own copy for the object_array_entry::name field and free this memory
when an object_array_entry is deleted from the array.
Many callers were passing the empty string as the name parameter, so
as a performance optimization, treat the empty string specially.
Instead of making a copy, store a pointer to a statically-allocated
empty string to object_array_entry::name. When deleting such an
entry, skip the free().
Change the callers that were already passing copies to
add_object_array_with_mode() to either skip the copy, or (if the
memory needed to be allocated anyway) freeing the memory itself.
A part of this commit effectively reverts
70d26c6e76 read_revisions_from_stdin: make copies for handle_revision_arg
because the copying introduced by that commit (which is still
necessary) is now done at a deeper level.
Signed-off-by: Michael Haggerty <mhagger@alum.mit.edu>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-25 11:08:14 +02:00
|
|
|
entry = &objects[nr];
|
|
|
|
entry->item = obj;
|
|
|
|
if (!name)
|
|
|
|
entry->name = NULL;
|
|
|
|
else if (!*name)
|
|
|
|
/* Use our own empty string instead of allocating one: */
|
|
|
|
entry->name = object_array_slopbuf;
|
|
|
|
else
|
|
|
|
entry->name = xstrdup(name);
|
|
|
|
entry->mode = mode;
|
2014-10-16 00:42:57 +02:00
|
|
|
if (path)
|
|
|
|
entry->path = xstrdup(path);
|
|
|
|
else
|
|
|
|
entry->path = NULL;
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
array->nr = ++nr;
|
|
|
|
}
|
2009-01-18 07:27:08 +01:00
|
|
|
|
2013-05-10 17:10:16 +02:00
|
|
|
void add_object_array(struct object *obj, const char *name, struct object_array *array)
|
|
|
|
{
|
2014-10-19 04:03:19 +02:00
|
|
|
add_object_array_with_path(obj, name, array, S_IFINVALID, NULL);
|
2013-05-10 17:10:16 +02:00
|
|
|
}
|
|
|
|
|
2014-10-16 00:34:19 +02:00
|
|
|
/*
|
|
|
|
* Free all memory associated with an entry; the result is
|
|
|
|
* in an unspecified state and should not be examined.
|
|
|
|
*/
|
|
|
|
static void object_array_release_entry(struct object_array_entry *ent)
|
|
|
|
{
|
|
|
|
if (ent->name != object_array_slopbuf)
|
|
|
|
free(ent->name);
|
2014-10-16 00:42:57 +02:00
|
|
|
free(ent->path);
|
2014-10-16 00:34:19 +02:00
|
|
|
}
|
|
|
|
|
object_array: add and use `object_array_pop()`
In a couple of places, we pop objects off an object array `foo` by
decreasing `foo.nr`. We access `foo.nr` in many places, but most if not
all other times we do so read-only, e.g., as we iterate over the array.
But when we change `foo.nr` behind the array's back, it feels a bit
nasty and looks like it might leak memory.
Leaks happen if the popped element has an allocated `name` or `path`.
At the moment, that is not the case. Still, 1) the object array might
gain more fields that want to be freed, 2) a code path where we pop
might start using names or paths, 3) one of these code paths might be
copied to somewhere where we do, and 4) using a dedicated function for
popping is conceptually cleaner.
Introduce and use `object_array_pop()` instead. Release memory in the
new function. Document that popping an object leaves the associated
elements in limbo.
The converted places were identified by grepping for "\.nr\>" and
looking for "--".
Make the new function return NULL on an empty array. This is consistent
with `pop_commit()` and allows the following:
while ((o = object_array_pop(&foo)) != NULL) {
// do something
}
But as noted above, we don't need to go out of our way to avoid reading
`foo.nr`. This is probably more readable:
while (foo.nr) {
... o = object_array_pop(&foo);
// do something
}
The name of `object_array_pop()` does not quite align with
`add_object_array()`. That is unfortunate. On the other hand, it matches
`object_array_clear()`. Arguably it's `add_...` that is the odd one out,
since it reads like it's used to "add" an "object array". For that
reason, side with `object_array_clear()`.
Signed-off-by: Martin Ågren <martin.agren@gmail.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-23 01:34:53 +02:00
|
|
|
struct object *object_array_pop(struct object_array *array)
|
|
|
|
{
|
|
|
|
struct object *ret;
|
|
|
|
|
|
|
|
if (!array->nr)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
ret = array->objects[array->nr - 1].item;
|
|
|
|
object_array_release_entry(&array->objects[array->nr - 1]);
|
|
|
|
array->nr--;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-05-25 11:08:08 +02:00
|
|
|
void object_array_filter(struct object_array *array,
|
|
|
|
object_array_each_func_t want, void *cb_data)
|
2009-01-18 07:27:08 +01:00
|
|
|
{
|
2013-05-25 11:08:08 +02:00
|
|
|
unsigned nr = array->nr, src, dst;
|
2009-01-18 07:27:08 +01:00
|
|
|
struct object_array_entry *objects = array->objects;
|
|
|
|
|
2013-05-25 11:08:08 +02:00
|
|
|
for (src = dst = 0; src < nr; src++) {
|
|
|
|
if (want(&objects[src], cb_data)) {
|
2009-01-18 07:27:08 +01:00
|
|
|
if (src != dst)
|
|
|
|
objects[dst] = objects[src];
|
|
|
|
dst++;
|
object_array_entry: fix memory handling of the name field
Previously, the memory management of the object_array_entry::name
field was inconsistent and undocumented. object_array_entries are
ultimately created by a single function, add_object_array_with_mode(),
which has an argument "const char *name". This function used to
simply set the name field to reference the string pointed to by the
name parameter, and nobody on the object_array side ever freed the
memory. Thus, it assumed that the memory for the name field would be
managed by the caller, and that the lifetime of that string would be
at least as long as the lifetime of the object_array_entry. But
callers were inconsistent:
* Some passed pointers to constant strings or argv entries, which was
OK.
* Some passed pointers to newly-allocated memory, but didn't arrange
for the memory ever to be freed.
* Some passed the return value of sha1_to_hex(), which is a pointer to
a statically-allocated buffer that can be overwritten at any time.
* Some passed pointers to refnames that they received from a
for_each_ref()-type iteration, but the lifetimes of such refnames is
not guaranteed by the refs API.
Bring consistency to this mess by changing object_array to make its
own copy for the object_array_entry::name field and free this memory
when an object_array_entry is deleted from the array.
Many callers were passing the empty string as the name parameter, so
as a performance optimization, treat the empty string specially.
Instead of making a copy, store a pointer to a statically-allocated
empty string to object_array_entry::name. When deleting such an
entry, skip the free().
Change the callers that were already passing copies to
add_object_array_with_mode() to either skip the copy, or (if the
memory needed to be allocated anyway) freeing the memory itself.
A part of this commit effectively reverts
70d26c6e76 read_revisions_from_stdin: make copies for handle_revision_arg
because the copying introduced by that commit (which is still
necessary) is now done at a deeper level.
Signed-off-by: Michael Haggerty <mhagger@alum.mit.edu>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-25 11:08:14 +02:00
|
|
|
} else {
|
2014-10-16 00:34:19 +02:00
|
|
|
object_array_release_entry(&objects[src]);
|
2013-05-25 11:08:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
array->nr = dst;
|
|
|
|
}
|
|
|
|
|
2014-10-16 00:34:34 +02:00
|
|
|
void object_array_clear(struct object_array *array)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < array->nr; i++)
|
|
|
|
object_array_release_entry(&array->objects[i]);
|
2017-06-16 01:15:46 +02:00
|
|
|
FREE_AND_NULL(array->objects);
|
2014-10-16 00:34:34 +02:00
|
|
|
array->nr = array->alloc = 0;
|
|
|
|
}
|
|
|
|
|
2013-05-25 11:08:10 +02:00
|
|
|
/*
|
bundle: lost objects when removing duplicate pendings
`git rev-list` will list one commit for the following command:
$ git rev-list 'main^!'
<tip-commit-of-main-branch>
But providing the same rev-list args to `git bundle`, fail to create
a bundle file.
$ git bundle create - 'main^!'
# v2 git bundle
-<OID> <one-line-message>
fatal: Refusing to create empty bundle.
This is because when removing duplicate objects in function
`object_array_remove_duplicates()`, one unique pending object which has
the same name is deleted by mistake. The revision arg 'main^!' in the
above example is parsed by `handle_revision_arg()`, and at lease two
different objects will be appended to `revs.pending`, one points to the
parent commit of the "main" branch, and the other points to the tip
commit of the "main" branch. These two objects have the same name
"main". Only one object is left with the name "main" after calling the
function `object_array_remove_duplicates()`.
And what's worse, when adding boundary commits into pending list, we use
one-line commit message as names, and the arbitory names may surprise
git-bundle.
Only comparing objects themselves (".item") is also not good enough,
because user may want to create a bundle with two identical objects but
with different reference names, such as: "HEAD" and "refs/heads/main".
Add new function `contains_object()` which compare both the address and
the name of the object.
Signed-off-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-12 03:27:02 +01:00
|
|
|
* Return true if array already contains an entry.
|
2013-05-25 11:08:10 +02:00
|
|
|
*/
|
bundle: lost objects when removing duplicate pendings
`git rev-list` will list one commit for the following command:
$ git rev-list 'main^!'
<tip-commit-of-main-branch>
But providing the same rev-list args to `git bundle`, fail to create
a bundle file.
$ git bundle create - 'main^!'
# v2 git bundle
-<OID> <one-line-message>
fatal: Refusing to create empty bundle.
This is because when removing duplicate objects in function
`object_array_remove_duplicates()`, one unique pending object which has
the same name is deleted by mistake. The revision arg 'main^!' in the
above example is parsed by `handle_revision_arg()`, and at lease two
different objects will be appended to `revs.pending`, one points to the
parent commit of the "main" branch, and the other points to the tip
commit of the "main" branch. These two objects have the same name
"main". Only one object is left with the name "main" after calling the
function `object_array_remove_duplicates()`.
And what's worse, when adding boundary commits into pending list, we use
one-line commit message as names, and the arbitory names may surprise
git-bundle.
Only comparing objects themselves (".item") is also not good enough,
because user may want to create a bundle with two identical objects but
with different reference names, such as: "HEAD" and "refs/heads/main".
Add new function `contains_object()` which compare both the address and
the name of the object.
Signed-off-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-12 03:27:02 +01:00
|
|
|
static int contains_object(struct object_array *array,
|
|
|
|
const struct object *item, const char *name)
|
2013-05-25 11:08:10 +02:00
|
|
|
{
|
|
|
|
unsigned nr = array->nr, i;
|
|
|
|
struct object_array_entry *object = array->objects;
|
|
|
|
|
|
|
|
for (i = 0; i < nr; i++, object++)
|
bundle: lost objects when removing duplicate pendings
`git rev-list` will list one commit for the following command:
$ git rev-list 'main^!'
<tip-commit-of-main-branch>
But providing the same rev-list args to `git bundle`, fail to create
a bundle file.
$ git bundle create - 'main^!'
# v2 git bundle
-<OID> <one-line-message>
fatal: Refusing to create empty bundle.
This is because when removing duplicate objects in function
`object_array_remove_duplicates()`, one unique pending object which has
the same name is deleted by mistake. The revision arg 'main^!' in the
above example is parsed by `handle_revision_arg()`, and at lease two
different objects will be appended to `revs.pending`, one points to the
parent commit of the "main" branch, and the other points to the tip
commit of the "main" branch. These two objects have the same name
"main". Only one object is left with the name "main" after calling the
function `object_array_remove_duplicates()`.
And what's worse, when adding boundary commits into pending list, we use
one-line commit message as names, and the arbitory names may surprise
git-bundle.
Only comparing objects themselves (".item") is also not good enough,
because user may want to create a bundle with two identical objects but
with different reference names, such as: "HEAD" and "refs/heads/main".
Add new function `contains_object()` which compare both the address and
the name of the object.
Signed-off-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-12 03:27:02 +01:00
|
|
|
if (item == object->item && !strcmp(object->name, name))
|
2013-05-25 11:08:10 +02:00
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-01-18 07:27:08 +01:00
|
|
|
void object_array_remove_duplicates(struct object_array *array)
|
|
|
|
{
|
2013-05-25 11:08:10 +02:00
|
|
|
unsigned nr = array->nr, src;
|
2009-01-18 07:27:08 +01:00
|
|
|
struct object_array_entry *objects = array->objects;
|
|
|
|
|
2013-05-25 11:08:10 +02:00
|
|
|
array->nr = 0;
|
|
|
|
for (src = 0; src < nr; src++) {
|
bundle: lost objects when removing duplicate pendings
`git rev-list` will list one commit for the following command:
$ git rev-list 'main^!'
<tip-commit-of-main-branch>
But providing the same rev-list args to `git bundle` fails to create
a bundle file.
$ git bundle create - 'main^!'
# v2 git bundle
-<OID> <one-line-message>
fatal: Refusing to create empty bundle.
This is because when removing duplicate objects in function
`object_array_remove_duplicates()`, one unique pending object which has
the same name is deleted by mistake. The revision arg 'main^!' in the
above example is parsed by `handle_revision_arg()`, and at least two
different objects will be appended to `revs.pending`, one points to the
parent commit of the "main" branch, and the other points to the tip
commit of the "main" branch. These two objects have the same name
"main". Only one object is left with the name "main" after calling the
function `object_array_remove_duplicates()`.
And what's worse, when adding boundary commits into pending list, we use
one-line commit message as names, and the arbitory names may surprise
git-bundle.
Only comparing objects themselves (".item") is also not good enough,
because user may want to create a bundle with two identical objects but
with different reference names, such as: "HEAD" and "refs/heads/main".
Add new function `contains_object()` which compare both the address and
the name of the object.
Signed-off-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-12 03:27:02 +01:00
|
|
|
if (!contains_object(array, objects[src].item,
|
|
|
|
objects[src].name)) {
|
2013-05-25 11:08:10 +02:00
|
|
|
if (src != array->nr)
|
|
|
|
objects[array->nr] = objects[src];
|
|
|
|
array->nr++;
|
object_array_entry: fix memory handling of the name field
Previously, the memory management of the object_array_entry::name
field was inconsistent and undocumented. object_array_entries are
ultimately created by a single function, add_object_array_with_mode(),
which has an argument "const char *name". This function used to
simply set the name field to reference the string pointed to by the
name parameter, and nobody on the object_array side ever freed the
memory. Thus, it assumed that the memory for the name field would be
managed by the caller, and that the lifetime of that string would be
at least as long as the lifetime of the object_array_entry. But
callers were inconsistent:
* Some passed pointers to constant strings or argv entries, which was
OK.
* Some passed pointers to newly-allocated memory, but didn't arrange
for the memory ever to be freed.
* Some passed the return value of sha1_to_hex(), which is a pointer to
a statically-allocated buffer that can be overwritten at any time.
* Some passed pointers to refnames that they received from a
for_each_ref()-type iteration, but the lifetimes of such refnames is
not guaranteed by the refs API.
Bring consistency to this mess by changing object_array to make its
own copy for the object_array_entry::name field and free this memory
when an object_array_entry is deleted from the array.
Many callers were passing the empty string as the name parameter, so
as a performance optimization, treat the empty string specially.
Instead of making a copy, store a pointer to a statically-allocated
empty string to object_array_entry::name. When deleting such an
entry, skip the free().
Change the callers that were already passing copies to
add_object_array_with_mode() to either skip the copy, or (if the
memory needed to be allocated anyway) freeing the memory itself.
A part of this commit effectively reverts
70d26c6e76 read_revisions_from_stdin: make copies for handle_revision_arg
because the copying introduced by that commit (which is still
necessary) is now done at a deeper level.
Signed-off-by: Michael Haggerty <mhagger@alum.mit.edu>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-25 11:08:14 +02:00
|
|
|
} else {
|
2014-10-16 00:34:19 +02:00
|
|
|
object_array_release_entry(&objects[src]);
|
2009-01-18 07:27:08 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-03-29 09:21:21 +02:00
|
|
|
|
|
|
|
void clear_object_flags(unsigned flags)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2018-05-08 21:37:24 +02:00
|
|
|
for (i=0; i < the_repository->parsed_objects->obj_hash_size; i++) {
|
|
|
|
struct object *obj = the_repository->parsed_objects->obj_hash[i];
|
2012-03-29 09:21:21 +02:00
|
|
|
if (obj)
|
|
|
|
obj->flags &= ~flags;
|
|
|
|
}
|
|
|
|
}
|
2017-12-25 18:44:58 +01:00
|
|
|
|
2020-10-31 13:46:08 +01:00
|
|
|
void repo_clear_commit_marks(struct repository *r, unsigned int flags)
|
2017-12-25 18:44:58 +01:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2020-10-31 13:46:08 +01:00
|
|
|
for (i = 0; i < r->parsed_objects->obj_hash_size; i++) {
|
|
|
|
struct object *obj = r->parsed_objects->obj_hash[i];
|
2017-12-25 18:44:58 +01:00
|
|
|
if (obj && obj->type == OBJ_COMMIT)
|
|
|
|
obj->flags &= ~flags;
|
|
|
|
}
|
|
|
|
}
|
2018-03-23 18:20:55 +01:00
|
|
|
|
2018-05-08 21:37:24 +02:00
|
|
|
struct parsed_object_pool *parsed_object_pool_new(void)
|
|
|
|
{
|
|
|
|
struct parsed_object_pool *o = xmalloc(sizeof(*o));
|
|
|
|
memset(o, 0, sizeof(*o));
|
2018-05-15 23:48:42 +02:00
|
|
|
|
|
|
|
o->blob_state = allocate_alloc_state();
|
|
|
|
o->tree_state = allocate_alloc_state();
|
|
|
|
o->commit_state = allocate_alloc_state();
|
|
|
|
o->tag_state = allocate_alloc_state();
|
|
|
|
o->object_state = allocate_alloc_state();
|
|
|
|
|
2018-05-18 00:51:52 +02:00
|
|
|
o->is_shallow = -1;
|
2021-03-13 17:17:22 +01:00
|
|
|
CALLOC_ARRAY(o->shallow_stat, 1);
|
2018-05-18 00:51:52 +02:00
|
|
|
|
2018-06-29 03:22:15 +02:00
|
|
|
o->buffer_slab = allocate_commit_buffer_slab();
|
|
|
|
|
2018-05-08 21:37:24 +02:00
|
|
|
return o;
|
|
|
|
}
|
|
|
|
|
2018-03-23 18:20:55 +01:00
|
|
|
struct raw_object_store *raw_object_store_new(void)
|
|
|
|
{
|
|
|
|
struct raw_object_store *o = xmalloc(sizeof(*o));
|
|
|
|
|
|
|
|
memset(o, 0, sizeof(*o));
|
2018-03-23 18:20:59 +01:00
|
|
|
INIT_LIST_HEAD(&o->packed_git_mru);
|
2019-11-27 23:24:53 +01:00
|
|
|
hashmap_init(&o->pack_map, pack_map_entry_cmp, NULL, 0);
|
replace-object: make replace operations thread-safe
replace-object functions are very close to being thread-safe: the only
current racy section is the lazy initialization at
prepare_replace_object(). The following patches will protect some object
reading operations to be called threaded, but before that, replace
functions must be protected. To do so, add a mutex to struct
raw_object_store and acquire it before lazy initializing the
replace_map. This won't cause any noticeable performance drop as the
mutex will no longer be used after the replace_map is initialized.
Later, when the replace functions are called in parallel, thread
debuggers might point our use of the added replace_map_initialized flag
as a data race. However, as this boolean variable is initialized as
false and it's only updated once, there's no real harm. It's perfectly
fine if the value is updated right after a thread read it in
replace-map.h:lookup_replace_object() (there'll only be a performance
penalty for the affected threads at that moment). We could cease the
debugger warning protecting the variable reading at the said function.
However, this would negatively affect performance for all threads
calling it, at any time, so it's not really worthy since the warning
doesn't represent a real problem. Instead, to make sure we don't get
false positives (at ThreadSanitizer, at least) an entry for the
respective function is added to .tsan-suppressions.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:52 +01:00
|
|
|
pthread_mutex_init(&o->replace_mutex, NULL);
|
2018-03-23 18:20:55 +01:00
|
|
|
return o;
|
|
|
|
}
|
2018-03-23 18:20:58 +01:00
|
|
|
|
2021-12-06 23:05:04 +01:00
|
|
|
void free_object_directory(struct object_directory *odb)
|
2018-03-23 18:20:58 +01:00
|
|
|
{
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
free(odb->path);
|
2019-01-06 17:45:39 +01:00
|
|
|
odb_clear_loose_cache(odb);
|
2018-11-12 15:48:47 +01:00
|
|
|
free(odb);
|
2018-03-23 18:20:58 +01:00
|
|
|
}
|
|
|
|
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
static void free_object_directories(struct raw_object_store *o)
|
2018-03-23 18:20:58 +01:00
|
|
|
{
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
while (o->odb) {
|
2018-11-12 15:48:47 +01:00
|
|
|
struct object_directory *next;
|
2018-03-23 18:20:58 +01:00
|
|
|
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
next = o->odb->next;
|
|
|
|
free_object_directory(o->odb);
|
|
|
|
o->odb = next;
|
2018-03-23 18:20:58 +01:00
|
|
|
}
|
2021-07-08 01:10:15 +02:00
|
|
|
kh_destroy_odb_path_map(o->odb_by_path);
|
|
|
|
o->odb_by_path = NULL;
|
2018-03-23 18:20:58 +01:00
|
|
|
}
|
|
|
|
|
2018-03-23 18:20:55 +01:00
|
|
|
void raw_object_store_clear(struct raw_object_store *o)
|
|
|
|
{
|
|
|
|
FREE_AND_NULL(o->alternate_db);
|
2018-05-17 20:29:57 +02:00
|
|
|
|
|
|
|
oidmap_free(o->replace_map, 1);
|
2018-05-10 01:40:58 +02:00
|
|
|
FREE_AND_NULL(o->replace_map);
|
replace-object: make replace operations thread-safe
replace-object functions are very close to being thread-safe: the only
current racy section is the lazy initialization at
prepare_replace_object(). The following patches will protect some object
reading operations to be called threaded, but before that, replace
functions must be protected. To do so, add a mutex to struct
raw_object_store and acquire it before lazy initializing the
replace_map. This won't cause any noticeable performance drop as the
mutex will no longer be used after the replace_map is initialized.
Later, when the replace functions are called in parallel, thread
debuggers might point our use of the added replace_map_initialized flag
as a data race. However, as this boolean variable is initialized as
false and it's only updated once, there's no real harm. It's perfectly
fine if the value is updated right after a thread read it in
replace-map.h:lookup_replace_object() (there'll only be a performance
penalty for the affected threads at that moment). We could cease the
debugger warning protecting the variable reading at the said function.
However, this would negatively affect performance for all threads
calling it, at any time, so it's not really worthy since the warning
doesn't represent a real problem. Instead, to make sure we don't get
false positives (at ThreadSanitizer, at least) an entry for the
respective function is added to .tsan-suppressions.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 03:39:52 +01:00
|
|
|
pthread_mutex_destroy(&o->replace_mutex);
|
2018-03-23 18:20:58 +01:00
|
|
|
|
2018-07-12 00:42:41 +02:00
|
|
|
free_commit_graph(o->commit_graph);
|
|
|
|
o->commit_graph = NULL;
|
|
|
|
o->commit_graph_attempted = 0;
|
|
|
|
|
sha1-file: use an object_directory for the main object dir
Our handling of alternate object directories is needlessly different
from the main object directory. As a result, many places in the code
basically look like this:
do_something(r->objects->objdir);
for (odb = r->objects->alt_odb_list; odb; odb = odb->next)
do_something(odb->path);
That gets annoying when do_something() is non-trivial, and we've
resorted to gross hacks like creating fake alternates (see
find_short_object_filename()).
Instead, let's give each raw_object_store a unified list of
object_directory structs. The first will be the main store, and
everything after is an alternate. Very few callers even care about the
distinction, and can just loop over the whole list (and those who care
can just treat the first element differently).
A few observations:
- we don't need r->objects->objectdir anymore, and can just
mechanically convert that to r->objects->odb->path
- object_directory's path field needs to become a real pointer rather
than a FLEX_ARRAY, in order to fill it with expand_base_dir()
- we'll call prepare_alt_odb() earlier in many functions (i.e.,
outside of the loop). This may result in us calling it even when our
function would be satisfied looking only at the main odb.
But this doesn't matter in practice. It's not a very expensive
operation in the first place, and in the majority of cases it will
be a noop. We call it already (and cache its results) in
prepare_packed_git(), and we'll generally check packs before loose
objects. So essentially every program is going to call it
immediately once per program.
Arguably we should just prepare_alt_odb() immediately upon setting
up the repository's object directory, which would save us sprinkling
calls throughout the code base (and forgetting to do so has been a
source of subtle bugs in the past). But I've stopped short of that
here, since there are already a lot of other moving parts in this
patch.
- Most call sites just get shorter. The check_and_freshen() functions
are an exception, because they have entry points to handle local and
nonlocal directories separately.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-12 15:50:39 +01:00
|
|
|
free_object_directories(o);
|
|
|
|
o->odb_tail = NULL;
|
|
|
|
o->loaded_alternates = 0;
|
2018-03-23 18:20:59 +01:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&o->packed_git_mru);
|
2019-05-17 20:41:49 +02:00
|
|
|
close_object_store(o);
|
2018-03-23 18:21:00 +01:00
|
|
|
o->packed_git = NULL;
|
2019-11-27 23:24:53 +01:00
|
|
|
|
2020-11-02 19:55:05 +01:00
|
|
|
hashmap_clear(&o->pack_map);
|
2018-03-23 18:20:55 +01:00
|
|
|
}
|
2018-05-08 21:37:24 +02:00
|
|
|
|
|
|
|
void parsed_object_pool_clear(struct parsed_object_pool *o)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* As objects are allocated in slabs (see alloc.c), we do
|
|
|
|
* not need to free each object, but each slab instead.
|
2018-05-15 23:48:42 +02:00
|
|
|
*
|
|
|
|
* Before doing so, we need to free any additional memory
|
|
|
|
* the objects may hold.
|
2018-05-08 21:37:24 +02:00
|
|
|
*/
|
2018-05-15 23:48:42 +02:00
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 0; i < o->obj_hash_size; i++) {
|
|
|
|
struct object *obj = o->obj_hash[i];
|
|
|
|
|
|
|
|
if (!obj)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (obj->type == OBJ_TREE)
|
|
|
|
free_tree_buffer((struct tree*)obj);
|
|
|
|
else if (obj->type == OBJ_COMMIT)
|
2018-12-15 01:09:40 +01:00
|
|
|
release_commit_memory(o, (struct commit*)obj);
|
2018-05-15 23:48:42 +02:00
|
|
|
else if (obj->type == OBJ_TAG)
|
|
|
|
release_tag_memory((struct tag*)obj);
|
|
|
|
}
|
|
|
|
|
|
|
|
FREE_AND_NULL(o->obj_hash);
|
|
|
|
o->obj_hash_size = 0;
|
|
|
|
|
2018-06-29 03:22:15 +02:00
|
|
|
free_commit_buffer_slab(o->buffer_slab);
|
|
|
|
o->buffer_slab = NULL;
|
|
|
|
|
2018-05-15 23:48:42 +02:00
|
|
|
clear_alloc_state(o->blob_state);
|
|
|
|
clear_alloc_state(o->tree_state);
|
|
|
|
clear_alloc_state(o->commit_state);
|
|
|
|
clear_alloc_state(o->tag_state);
|
|
|
|
clear_alloc_state(o->object_state);
|
2019-02-07 21:05:54 +01:00
|
|
|
stat_validity_clear(o->shallow_stat);
|
2018-05-15 23:48:42 +02:00
|
|
|
FREE_AND_NULL(o->blob_state);
|
|
|
|
FREE_AND_NULL(o->tree_state);
|
|
|
|
FREE_AND_NULL(o->commit_state);
|
|
|
|
FREE_AND_NULL(o->tag_state);
|
|
|
|
FREE_AND_NULL(o->object_state);
|
2019-02-07 21:05:54 +01:00
|
|
|
FREE_AND_NULL(o->shallow_stat);
|
2018-05-08 21:37:24 +02:00
|
|
|
}
|