2005-04-18 20:39:48 +02:00
|
|
|
#ifndef OBJECT_H
|
|
|
|
#define OBJECT_H
|
|
|
|
|
2018-08-15 19:54:05 +02:00
|
|
|
#include "cache.h"
|
|
|
|
|
2018-06-29 03:22:15 +02:00
|
|
|
struct buffer_slab;
|
|
|
|
|
2018-05-08 21:37:24 +02:00
|
|
|
struct parsed_object_pool {
|
|
|
|
struct object **obj_hash;
|
|
|
|
int nr_objs, obj_hash_size;
|
2018-05-15 23:48:42 +02:00
|
|
|
|
|
|
|
/* TODO: migrate alloc_states to mem-pool? */
|
|
|
|
struct alloc_state *blob_state;
|
|
|
|
struct alloc_state *tree_state;
|
|
|
|
struct alloc_state *commit_state;
|
|
|
|
struct alloc_state *tag_state;
|
|
|
|
struct alloc_state *object_state;
|
|
|
|
unsigned commit_count;
|
2018-05-16 01:42:16 +02:00
|
|
|
|
|
|
|
/* parent substitutions from .git/info/grafts and .git/shallow */
|
|
|
|
struct commit_graft **grafts;
|
|
|
|
int grafts_alloc, grafts_nr;
|
2018-05-18 00:51:52 +02:00
|
|
|
|
|
|
|
int is_shallow;
|
|
|
|
struct stat_validity *shallow_stat;
|
|
|
|
char *alternate_shallow_file;
|
2018-05-18 00:51:53 +02:00
|
|
|
|
|
|
|
int commit_graft_prepared;
|
2018-06-29 03:22:15 +02:00
|
|
|
|
|
|
|
struct buffer_slab *buffer_slab;
|
2018-05-08 21:37:24 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
struct parsed_object_pool *parsed_object_pool_new(void);
|
|
|
|
void parsed_object_pool_clear(struct parsed_object_pool *o);
|
|
|
|
|
2005-04-18 20:39:48 +02:00
|
|
|
/*
 * Singly linked list node: "item" points at an object and "next" at
 * the following node of the list.
 */
struct object_list {
|
|
|
|
struct object *item;
|
|
|
|
struct object_list *next;
|
|
|
|
};
|
|
|
|
|
Add "named object array" concept
We've had this notion of an "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
struct object_array {
|
|
|
|
unsigned int nr;
|
|
|
|
unsigned int alloc;
|
|
|
|
struct object_array_entry {
|
|
|
|
struct object *item;
|
object_array_entry: fix memory handling of the name field
Previously, the memory management of the object_array_entry::name
field was inconsistent and undocumented. object_array_entries are
ultimately created by a single function, add_object_array_with_mode(),
which has an argument "const char *name". This function used to
simply set the name field to reference the string pointed to by the
name parameter, and nobody on the object_array side ever freed the
memory. Thus, it assumed that the memory for the name field would be
managed by the caller, and that the lifetime of that string would be
at least as long as the lifetime of the object_array_entry. But
callers were inconsistent:
* Some passed pointers to constant strings or argv entries, which was
OK.
* Some passed pointers to newly-allocated memory, but didn't arrange
for the memory ever to be freed.
* Some passed the return value of sha1_to_hex(), which is a pointer to
a statically-allocated buffer that can be overwritten at any time.
* Some passed pointers to refnames that they received from a
for_each_ref()-type iteration, but the lifetimes of such refnames is
not guaranteed by the refs API.
Bring consistency to this mess by changing object_array to make its
own copy for the object_array_entry::name field and free this memory
when an object_array_entry is deleted from the array.
Many callers were passing the empty string as the name parameter, so
as a performance optimization, treat the empty string specially.
Instead of making a copy, store a pointer to a statically-allocated
empty string to object_array_entry::name. When deleting such an
entry, skip the free().
Change the callers that were already passing copies to
add_object_array_with_mode() to either skip the copy, or (if the
memory needed to be allocated anyway) freeing the memory itself.
A part of this commit effectively reverts
70d26c6e76 read_revisions_from_stdin: make copies for handle_revision_arg
because the copying introduced by that commit (which is still
necessary) is now done at a deeper level.
Signed-off-by: Michael Haggerty <mhagger@alum.mit.edu>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-05-25 11:08:14 +02:00
|
|
|
/*
|
|
|
|
* name or NULL. If non-NULL, the memory pointed to
|
|
|
|
* is owned by this object *except* if it points at
|
|
|
|
* object_array_slopbuf, which is a static copy of the
|
|
|
|
* empty string.
|
|
|
|
*/
|
|
|
|
char *name;
|
2014-10-16 00:42:57 +02:00
|
|
|
char *path;
|
2007-04-22 18:43:58 +02:00
|
|
|
unsigned mode;
|
Add "named object array" concept
We've had this notion of an "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
} *objects;
|
|
|
|
};
|
|
|
|
|
2010-08-29 04:04:17 +02:00
|
|
|
/* Initializer for an empty struct object_array: nr = 0, alloc = 0, objects = NULL. */
#define OBJECT_ARRAY_INIT { 0, 0, NULL }
|
|
|
|
|
2014-03-25 14:23:26 +01:00
|
|
|
/*
|
|
|
|
* object flag allocation:
|
revision.c: generation-based topo-order algorithm
The current --topo-order algorithm requires walking all
reachable commits up front, topo-sorting them, all before
outputting the first value. This patch introduces a new
algorithm which uses stored generation numbers to
incrementally walk in topo-order, outputting commits as
we go. This can dramatically reduce the computation time
to write a fixed number of commits, such as when limiting
with "-n <N>" or filling the first page of a pager.
When running a command like 'git rev-list --topo-order HEAD',
Git performed the following steps:
1. Run limit_list(), which parses all reachable commits,
adds them to a linked list, and distributes UNINTERESTING
flags. If all unprocessed commits are UNINTERESTING, then
it may terminate without walking all reachable commits.
This does not occur if we do not specify UNINTERESTING
commits.
2. Run sort_in_topological_order(), which is an implementation
of Kahn's algorithm. It first iterates through the entire
set of important commits and computes the in-degree of each
(plus one, as we use 'zero' as a special value here). Then,
we walk the commits in priority order, adding them to the
priority queue if and only if their in-degree is one. As
we remove commits from this priority queue, we decrement the
in-degree of their parents.
3. While we are peeling commits for output, get_revision_1()
uses pop_commit on the full list of commits computed by
sort_in_topological_order().
In the new algorithm, these three steps correspond to three
different commit walks. We run these walks simultaneously,
and advance each only as far as necessary to satisfy the
requirements of the 'higher order' walk. We know when we can
pause each walk by using generation numbers from the commit-
graph feature.
Recall that the generation number of a commit satisfies:
* If the commit has at least one parent, then the generation
number is one more than the maximum generation number among
its parents.
* If the commit has no parent, then the generation number is one.
There are two special generation numbers:
* GENERATION_NUMBER_INFINITY: this value is 0xffffffff and
indicates that the commit is not stored in the commit-graph and
the generation number was not previously calculated.
* GENERATION_NUMBER_ZERO: this value (0) is a special indicator
to say that the commit-graph was generated by a version of Git
that does not compute generation numbers (such as v2.18.0).
Since we use generation_numbers_enabled() before using the new
algorithm, we do not need to worry about GENERATION_NUMBER_ZERO.
However, the existence of GENERATION_NUMBER_INFINITY implies the
following weaker statement than the usual we expect from
generation numbers:
If A and B are commits with generation numbers gen(A) and
gen(B) and gen(A) < gen(B), then A cannot reach B.
Thus, we will walk in each of our stages until the "maximum
unexpanded generation number" is strictly lower than the
generation number of a commit we are about to use.
The walks are as follows:
1. EXPLORE: using the explore_queue priority queue (ordered by
maximizing the generation number), parse each reachable
commit until all commits in the queue have generation
number strictly lower than needed. During this walk, update
the UNINTERESTING flags as necessary.
2. INDEGREE: using the indegree_queue priority queue (ordered
by maximizing the generation number), add one to the in-
degree of each parent for each commit that is walked. Since
we walk in order of decreasing generation number, we know
that discovering an in-degree value of 0 means the value for
that commit was not initialized, so should be initialized to
two. (Recall that in-degree value "1" is what we use to say a
commit is ready for output.) As we iterate the parents of a
commit during this walk, ensure the EXPLORE walk has walked
beyond their generation numbers.
3. TOPO: using the topo_queue priority queue (ordered based on
the sort_order given, which could be commit-date, author-
date, or typical topo-order which treats the queue as a LIFO
stack), remove a commit from the queue and decrement the
in-degree of each parent. If a parent has an in-degree of
one, then we add it to the topo_queue. Before we decrement
the in-degree, however, ensure the INDEGREE walk has walked
beyond that generation number.
The implementations of these walks are in the following methods:
* explore_walk_step and explore_to_depth
* indegree_walk_step and compute_indegrees_to_depth
* next_topo_commit and expand_topo_walk
These methods have some patterns that may seem strange at first,
but they are probably carry-overs from their equivalents in
limit_list and sort_in_topological_order.
One thing that is missing from this implementation is a proper
way to stop walking when the entire queue is UNINTERESTING, so
this implementation is not enabled by comparisons, such as in
'git rev-list --topo-order A..B'. This can be updated in the
future.
In my local testing, I used the following Git commands on the
Linux repository in three modes: HEAD~1 with no commit-graph,
HEAD~1 with a commit-graph, and HEAD with a commit-graph. This
allows comparing the benefits we get from parsing commits from
the commit-graph and then again the benefits we get by
restricting the set of commits we walk.
Test: git rev-list --topo-order -100 HEAD
HEAD~1, no commit-graph: 6.80 s
HEAD~1, w/ commit-graph: 0.77 s
HEAD, w/ commit-graph: 0.02 s
Test: git rev-list --topo-order -100 HEAD -- tools
HEAD~1, no commit-graph: 9.63 s
HEAD~1, w/ commit-graph: 6.06 s
HEAD, w/ commit-graph: 0.06 s
This speedup is due to a few things. First, the new generation-
number-enabled algorithm walks commits on order of the number of
results output (subject to some branching structure expectations).
Since we limit to 100 results, we are running a query similar to
filling a single page of results. Second, when specifying a path,
we must parse the root tree object for each commit we walk. The
previous benefits from the commit-graph are entirely from reading
the commit-graph instead of parsing commits. Since we need to
parse trees for the same number of commits as before, we slow
down significantly from the non-path-based query.
For the test above, I specifically selected a path that is changed
frequently, including by merge commits. A less-frequently-changed
path (such as 'README') has similar end-to-end time since we need
to walk the same number of commits (before determining we do not
have 100 hits). However, we get the benefit that the output is
presented to the user as it is discovered, much the same as a
normal 'git log' command (no '--topo-order'). This is an improved
user experience, even if the command has the same runtime.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-01 14:46:22 +01:00
|
|
|
* revision.h: 0---------10 25----28
|
2018-06-15 00:54:28 +02:00
|
|
|
* fetch-pack.c: 01
|
|
|
|
* negotiator/default.c: 2--5
|
2018-03-06 11:16:15 +01:00
|
|
|
* walker.c: 0-2
|
2018-07-20 18:33:13 +02:00
|
|
|
* upload-pack.c: 4 11-----14 16-----19
|
2018-03-06 11:16:15 +01:00
|
|
|
* builtin/blame.c: 12-13
|
|
|
|
* bisect.c: 16
|
|
|
|
* bundle.c: 16
|
|
|
|
* http-push.c: 16-----19
|
commit-graph: fix writing first commit-graph during fetch
The previous commit includes a failing test for an issue around
fetch.writeCommitGraph and fetching in a repo with a submodule. Here, we
fix that bug and set the test to "test_expect_success".
The problem arises with this set of commands when the remote repo at
<url> has a submodule. Note that --recurse-submodules is not needed to
demonstrate the bug.
$ git clone <url> test
$ cd test
$ git -c fetch.writeCommitGraph=true fetch origin
Computing commit graph generation numbers: 100% (12/12), done.
BUG: commit-graph.c:886: missing parent <hash1> for commit <hash2>
Aborted (core dumped)
As an initial fix, I converted the code in builtin/fetch.c that calls
write_commit_graph_reachable() to instead launch a "git commit-graph
write --reachable --split" process. That code worked, but is not how we
want the feature to work long-term.
That test did demonstrate that the issue must be something to do with
internal state of the 'git fetch' process.
The write_commit_graph() method in commit-graph.c ensures the commits we
plan to write are "closed under reachability" using close_reachable().
This method walks from the input commits, and uses the UNINTERESTING
flag to mark which commits have already been visited. This allows the
walk to take O(N) time, where N is the number of commits, instead of
O(P) time, where P is the number of paths. (The number of paths can be
exponential in the number of commits.)
However, the UNINTERESTING flag is used in lots of places in the
codebase. This flag usually means some barrier to stop a commit walk,
such as in revision-walking to compare histories. It is not often
cleared after the walk completes because the starting points of those
walks do not have the UNINTERESTING flag, and clear_commit_marks() would
stop immediately.
This is happening during a 'git fetch' call with a remote. The fetch
negotiation is comparing the remote refs with the local refs and marking
some commits as UNINTERESTING.
I tested running clear_commit_marks_many() to clear the UNINTERESTING
flag inside close_reachable(), but the tips did not have the flag, so
that did nothing.
It turns out that the calculate_changed_submodule_paths() method is at
fault. Thanks, Peff, for pointing out this detail! More specifically,
for each submodule, the collect_changed_submodules() runs a revision
walk to essentially do file-history on the list of submodules. That
revision walk marks commits UNINTERESTING if they are simplified away
by not changing the submodule.
Instead, I finally arrived on the conclusion that I should use a flag
that is not used in any other part of the code. In commit-reach.c, a
number of flags were defined for commit walk algorithms. The REACHABLE
flag seemed like it made the most sense, and it seems it was not
actually used in the file. The REACHABLE flag was used in early versions
of commit-reach.c, but was removed by 4fbcca4 (commit-reach: make
can_all_from_reach... linear, 2018-07-20).
Add the REACHABLE flag to commit-graph.c and use it instead of
UNINTERESTING in close_reachable(). This fixes the bug in manual
testing.
Reported-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Helped-by: Jeff King <peff@peff.net>
Helped-by: Szeder Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-10-24 15:40:42 +02:00
|
|
|
* commit-graph.c: 15
|
|
|
|
* commit-reach.c: 16-----19
|
2018-04-10 23:26:19 +02:00
|
|
|
* sha1-name.c: 20
|
2018-03-06 11:16:15 +01:00
|
|
|
* list-objects-filter.c: 21
|
|
|
|
* builtin/fsck.c: 0--3
|
|
|
|
* builtin/index-pack.c: 2021
|
|
|
|
* builtin/pack-objects.c: 20
|
|
|
|
* builtin/reflog.c: 10--12
|
2018-05-19 07:28:28 +02:00
|
|
|
* builtin/show-branch.c: 0-------------------------------------------26
|
2018-03-06 11:16:15 +01:00
|
|
|
* builtin/unpack-objects.c: 2021
|
2014-03-25 14:23:26 +01:00
|
|
|
*/
|
revision.c: generation-based topo-order algorithm
The current --topo-order algorithm requires walking all
reachable commits up front, topo-sorting them, all before
outputting the first value. This patch introduces a new
algorithm which uses stored generation numbers to
incrementally walk in topo-order, outputting commits as
we go. This can dramatically reduce the computation time
to write a fixed number of commits, such as when limiting
with "-n <N>" or filling the first page of a pager.
When running a command like 'git rev-list --topo-order HEAD',
Git performed the following steps:
1. Run limit_list(), which parses all reachable commits,
adds them to a linked list, and distributes UNINTERESTING
flags. If all unprocessed commits are UNINTERESTING, then
it may terminate without walking all reachable commits.
This does not occur if we do not specify UNINTERESTING
commits.
2. Run sort_in_topological_order(), which is an implementation
of Kahn's algorithm. It first iterates through the entire
set of important commits and computes the in-degree of each
(plus one, as we use 'zero' as a special value here). Then,
we walk the commits in priority order, adding them to the
priority queue if and only if their in-degree is one. As
we remove commits from this priority queue, we decrement the
in-degree of their parents.
3. While we are peeling commits for output, get_revision_1()
uses pop_commit on the full list of commits computed by
sort_in_topological_order().
In the new algorithm, these three steps correspond to three
different commit walks. We run these walks simultaneously,
and advance each only as far as necessary to satisfy the
requirements of the 'higher order' walk. We know when we can
pause each walk by using generation numbers from the commit-
graph feature.
Recall that the generation number of a commit satisfies:
* If the commit has at least one parent, then the generation
number is one more than the maximum generation number among
its parents.
* If the commit has no parent, then the generation number is one.
There are two special generation numbers:
* GENERATION_NUMBER_INFINITY: this value is 0xffffffff and
indicates that the commit is not stored in the commit-graph and
the generation number was not previously calculated.
* GENERATION_NUMBER_ZERO: this value (0) is a special indicator
to say that the commit-graph was generated by a version of Git
that does not compute generation numbers (such as v2.18.0).
Since we use generation_numbers_enabled() before using the new
algorithm, we do not need to worry about GENERATION_NUMBER_ZERO.
However, the existence of GENERATION_NUMBER_INFINITY implies the
following weaker statement than the usual we expect from
generation numbers:
If A and B are commits with generation numbers gen(A) and
gen(B) and gen(A) < gen(B), then A cannot reach B.
Thus, we will walk in each of our stages until the "maximum
unexpanded generation number" is strictly lower than the
generation number of a commit we are about to use.
The walks are as follows:
1. EXPLORE: using the explore_queue priority queue (ordered by
maximizing the generation number), parse each reachable
commit until all commits in the queue have generation
number strictly lower than needed. During this walk, update
the UNINTERESTING flags as necessary.
2. INDEGREE: using the indegree_queue priority queue (ordered
by maximizing the generation number), add one to the in-
degree of each parent for each commit that is walked. Since
we walk in order of decreasing generation number, we know
that discovering an in-degree value of 0 means the value for
that commit was not initialized, so should be initialized to
two. (Recall that in-degree value "1" is what we use to say a
commit is ready for output.) As we iterate the parents of a
commit during this walk, ensure the EXPLORE walk has walked
beyond their generation numbers.
3. TOPO: using the topo_queue priority queue (ordered based on
the sort_order given, which could be commit-date, author-
date, or typical topo-order which treats the queue as a LIFO
stack), remove a commit from the queue and decrement the
in-degree of each parent. If a parent has an in-degree of
one, then we add it to the topo_queue. Before we decrement
the in-degree, however, ensure the INDEGREE walk has walked
beyond that generation number.
The implementations of these walks are in the following methods:
* explore_walk_step and explore_to_depth
* indegree_walk_step and compute_indegrees_to_depth
* next_topo_commit and expand_topo_walk
These methods have some patterns that may seem strange at first,
but they are probably carry-overs from their equivalents in
limit_list and sort_in_topological_order.
One thing that is missing from this implementation is a proper
way to stop walking when the entire queue is UNINTERESTING, so
this implementation is not enabled by comparisons, such as in
'git rev-list --topo-order A..B'. This can be updated in the
future.
In my local testing, I used the following Git commands on the
Linux repository in three modes: HEAD~1 with no commit-graph,
HEAD~1 with a commit-graph, and HEAD with a commit-graph. This
allows comparing the benefits we get from parsing commits from
the commit-graph and then again the benefits we get by
restricting the set of commits we walk.
Test: git rev-list --topo-order -100 HEAD
HEAD~1, no commit-graph: 6.80 s
HEAD~1, w/ commit-graph: 0.77 s
HEAD, w/ commit-graph: 0.02 s
Test: git rev-list --topo-order -100 HEAD -- tools
HEAD~1, no commit-graph: 9.63 s
HEAD~1, w/ commit-graph: 6.06 s
HEAD, w/ commit-graph: 0.06 s
This speedup is due to a few things. First, the new generation-
number-enabled algorithm walks commits on order of the number of
results output (subject to some branching structure expectations).
Since we limit to 100 results, we are running a query similar to
filling a single page of results. Second, when specifying a path,
we must parse the root tree object for each commit we walk. The
previous benefits from the commit-graph are entirely from reading
the commit-graph instead of parsing commits. Since we need to
parse trees for the same number of commits as before, we slow
down significantly from the non-path-based query.
For the test above, I specifically selected a path that is changed
frequently, including by merge commits. A less-frequently-changed
path (such as 'README') has similar end-to-end time since we need
to walk the same number of commits (before determining we do not
have 100 hits). However, we get the benefit that the output is
presented to the user as it is discovered, much the same as a
normal 'git log' command (no '--topo-order'). This is an improved
user experience, even if the command has the same runtime.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-01 14:46:22 +01:00
|
|
|
#define FLAG_BITS 29
|
Shrink "struct object" a bit
This shrinks "struct object" by a small amount, by getting rid of the
"struct type *" pointer and replacing it with a 3-bit bitfield instead.
In addition, we merge the bitfields and the "flags" field, which
incidentally should also remove a useless 4-byte padding from the object
when in 64-bit mode.
Now, our "struct object" is still too damn large, but it's now less
obviously bloated, and of the remaining fields, only the "util" (which is
not used by most things) is clearly something that should be eventually
discarded.
This shrinks the "git-rev-list --all" memory use by about 2.5% on the
kernel archive (and, perhaps more importantly, on the larger mozilla
archive). That may not sound like much, but I suspect it's more on a
64-bit platform.
There are other remaining inefficiencies (the parent lists, for example,
probably have horrible malloc overhead), but this was pretty obvious.
Most of the patch is just changing the comparison of the "type" pointer
from one of the constant string pointers to the appropriate new TYPE_xxx
small integer constant.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-15 01:45:13 +02:00
|
|
|
|
2006-07-12 05:45:31 +02:00
|
|
|
/*
|
|
|
|
* The object type is stored in 3 bits.
|
|
|
|
*/
|
2005-04-18 20:39:48 +02:00
|
|
|
struct object {
|
|
|
|
unsigned parsed : 1;
|
Shrink "struct object" a bit
This shrinks "struct object" by a small amount, by getting rid of the
"struct type *" pointer and replacing it with a 3-bit bitfield instead.
In addition, we merge the bitfields and the "flags" field, which
incidentally should also remove a useless 4-byte padding from the object
when in 64-bit mode.
Now, our "struct object" is still too damn large, but it's now less
obviously bloated, and of the remaining fields, only the "util" (which is
not used by most things) is clearly something that should be eventually
discarded.
This shrinks the "git-rev-list --all" memory use by about 2.5% on the
kernel archive (and, perhaps more importantly, on the larger mozilla
archive). That may not sound like much, but I suspect it's more on a
64-bit platform.
There are other remaining inefficiencies (the parent lists, for example,
probably have horrible malloc overhead), but this was pretty obvious.
Most of the patch is just changing the comparison of the "type" pointer
from one of the constant string pointers to the appropriate new TYPE_xxx
small integer constant.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-15 01:45:13 +02:00
|
|
|
unsigned type : TYPE_BITS;
|
|
|
|
unsigned flags : FLAG_BITS;
|
2015-11-10 03:22:28 +01:00
|
|
|
struct object_id oid;
|
2005-04-18 20:39:48 +02:00
|
|
|
};
|
|
|
|
|
2019-04-29 10:28:14 +02:00
|
|
|
const char *type_name(unsigned int type);
|
|
|
|
int type_from_string_gently(const char *str, ssize_t, int gentle);
|
2014-09-10 15:52:44 +02:00
|
|
|
#define type_from_string(str) type_from_string_gently(str, -1, 0)
|
Shrink "struct object" a bit
This shrinks "struct object" by a small amount, by getting rid of the
"struct type *" pointer and replacing it with a 3-bit bitfield instead.
In addition, we merge the bitfields and the "flags" field, which
incidentally should also remove a useless 4-byte padding from the object
when in 64-bit mode.
Now, our "struct object" is still too damn large, but it's now less
obviously bloated, and of the remaining fields, only the "util" (which is
not used by most things) is clearly something that should be eventually
discarded.
This shrinks the "git-rev-list --all" memory use by about 2.5% on the
kernel archive (and, perhaps more importantly, on the larger mozilla
archive). That may not sound like much, but I suspect it's more on a
64-bit platform.
There are other remaining inefficiencies (the parent lists, for example,
probably have horrible malloc overhead), but this was pretty obvious.
Most of the patch is just changing the comparison of the "type" pointer
from one of the constant string pointers to the appropriate new TYPE_xxx
small integer constant.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-15 01:45:13 +02:00
|
|
|
|
2014-02-28 17:29:17 +01:00
|
|
|
/*
|
|
|
|
* Return the current number of buckets in the object hashmap.
|
|
|
|
*/
|
2019-04-29 10:28:14 +02:00
|
|
|
unsigned int get_max_object_index(void);
|
2014-02-28 17:29:17 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the object from the specified bucket in the object hashmap.
|
|
|
|
*/
|
2019-04-29 10:28:14 +02:00
|
|
|
struct object *get_indexed_object(unsigned int);
|
2006-06-18 20:45:02 +02:00
|
|
|
|
2008-09-10 21:22:35 +02:00
|
|
|
/*
|
|
|
|
* This can be used to see if we have heard of the object before, but
|
|
|
|
* it can return "yes we have, and here is a half-initialised object"
|
|
|
|
* for an object that we haven't loaded/parsed yet.
|
|
|
|
*
|
|
|
|
* When parsing a commit to create an in-core commit object, its
|
|
|
|
* parents list holds commit objects that represent its parents, but
|
|
|
|
* they are expected to be lazily initialized and do not know what
|
|
|
|
* their trees or parents are yet. When this function returns such a
|
|
|
|
* half-initialised object, the caller is expected to initialize it
|
|
|
|
* by calling parse_object() on it.
|
|
|
|
*/
|
2019-06-20 09:41:14 +02:00
|
|
|
struct object *lookup_object(struct repository *r, const struct object_id *oid);
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2019-06-20 09:41:21 +02:00
|
|
|
void *create_object(struct repository *r, const struct object_id *oid, void *obj);
|
2005-04-18 20:39:48 +02:00
|
|
|
|
2018-06-29 03:22:06 +02:00
|
|
|
void *object_as_type(struct repository *r, struct object *obj, enum object_type type, int quiet);
|
add object_as_type helper for casting objects
When we call lookup_commit, lookup_tree, etc, the logic goes
something like:
1. Look for an existing object struct. If we don't have
one, allocate and return a new one.
2. Double check that any object we have is the expected
type (and complain and return NULL otherwise).
3. Convert an object with type OBJ_NONE (from a prior
call to lookup_unknown_object) to the expected type.
We can encapsulate steps 2 and 3 in a helper function which
checks whether we have the expected object type, converts
OBJ_NONE as appropriate, and returns the object.
Not only does this shorten the code, but it also provides
one central location for converting OBJ_NONE objects into
objects of other types. Future patches will use that to
enforce type-specific invariants.
Since this is a refactoring, we would want it to behave
exactly as the current code. It takes a little reasoning to
see that this is the case:
- for lookup_{commit,tree,etc} functions, we are just
pulling steps 2 and 3 into a function that does the same
thing.
- for the call in peel_object, we currently only do step 3
(but we want to consolidate it with the others, as
mentioned above). However, step 2 is a noop here, as the
surrounding conditional makes sure we have OBJ_NONE
(which we want to keep to avoid an extraneous call to
sha1_object_info).
- for the call in lookup_commit_reference_gently, we are
currently doing step 2 but not step 3. However, step 3
is a noop here. The object we got will have just come
from deref_tag, which must have figured out the type for
each object in order to know when to stop peeling.
Therefore the type will never be OBJ_NONE.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-07-13 08:42:03 +02:00
|
|
|
|
2013-03-17 09:22:36 +01:00
|
|
|
/*
|
|
|
|
* Returns the object, having parsed it to find out what it is.
|
|
|
|
*
|
|
|
|
* Returns NULL if the object is missing or corrupt.
|
|
|
|
*/
|
2018-06-29 03:22:19 +02:00
|
|
|
struct object *parse_object(struct repository *r, const struct object_id *oid);
|
2005-04-28 16:46:33 +02:00
|
|
|
|
2013-03-17 09:22:36 +01:00
|
|
|
/*
|
|
|
|
* Like parse_object, but will die() instead of returning NULL. If the
|
|
|
|
* "name" parameter is not NULL, it is included in the error message
|
object: convert parse_object* to take struct object_id
Make parse_object, parse_object_or_die, and parse_object_buffer take a
pointer to struct object_id. Remove the temporary variables inserted
earlier, since they are no longer necessary. Transform all of the
callers using the following semantic patch:
@@
expression E1;
@@
- parse_object(E1.hash)
+ parse_object(&E1)
@@
expression E1;
@@
- parse_object(E1->hash)
+ parse_object(E1)
@@
expression E1, E2;
@@
- parse_object_or_die(E1.hash, E2)
+ parse_object_or_die(&E1, E2)
@@
expression E1, E2;
@@
- parse_object_or_die(E1->hash, E2)
+ parse_object_or_die(E1, E2)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1.hash, E2, E3, E4, E5)
+ parse_object_buffer(&E1, E2, E3, E4, E5)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1->hash, E2, E3, E4, E5)
+ parse_object_buffer(E1, E2, E3, E4, E5)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-07 00:10:38 +02:00
|
|
|
* (otherwise, the hex object ID is given).
|
2013-03-17 09:22:36 +01:00
|
|
|
*/
|
object: convert parse_object* to take struct object_id
Make parse_object, parse_object_or_die, and parse_object_buffer take a
pointer to struct object_id. Remove the temporary variables inserted
earlier, since they are no longer necessary. Transform all of the
callers using the following semantic patch:
@@
expression E1;
@@
- parse_object(E1.hash)
+ parse_object(&E1)
@@
expression E1;
@@
- parse_object(E1->hash)
+ parse_object(E1)
@@
expression E1, E2;
@@
- parse_object_or_die(E1.hash, E2)
+ parse_object_or_die(&E1, E2)
@@
expression E1, E2;
@@
- parse_object_or_die(E1->hash, E2)
+ parse_object_or_die(E1, E2)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1.hash, E2, E3, E4, E5)
+ parse_object_buffer(&E1, E2, E3, E4, E5)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1->hash, E2, E3, E4, E5)
+ parse_object_buffer(E1, E2, E3, E4, E5)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-07 00:10:38 +02:00
|
|
|
struct object *parse_object_or_die(const struct object_id *oid, const char *name);
|
2013-03-17 09:22:36 +01:00
|
|
|
|
2006-09-15 22:30:02 +02:00
|
|
|
/* Given the result of read_sha1_file(), returns the object after
|
|
|
|
* parsing it. eaten_p indicates if the object has a borrowed copy
|
|
|
|
* of buffer and the caller should not free() it.
|
|
|
|
*/
|
2018-06-29 03:22:18 +02:00
|
|
|
struct object *parse_object_buffer(struct repository *r,
				   const struct object_id *oid,
				   enum object_type type, unsigned long size,
				   void *buffer, int *eaten_p);
|
2006-09-15 22:30:02 +02:00
|
|
|
|
2005-08-03 01:45:48 +02:00
|
|
|
/** Returns the object, with potentially excess memory allocated. **/
|
2019-06-20 09:41:10 +02:00
|
|
|
struct object *lookup_unknown_object(const struct object_id *oid);
|
2005-08-03 01:45:48 +02:00
|
|
|
|
2007-06-07 09:04:01 +02:00
|
|
|
/*
 * NOTE(review): presumably allocates a new list node holding item, links
 * it at the head of *list_p, and returns the new node -- confirm against
 * the implementation.
 */
struct object_list *object_list_insert(struct object *item,
|
2005-08-03 01:45:48 +02:00
|
|
|
struct object_list **list_p);
|
|
|
|
|
|
|
|
/*
 * NOTE(review): presumably returns nonzero when obj is an item of list
 * and 0 otherwise -- confirm against the implementation.
 */
int object_list_contains(struct object_list *list, struct object *obj);
|
|
|
|
|
2020-02-13 03:16:33 +01:00
|
|
|
/*
 * NOTE(review): presumably frees every node reachable from *list and
 * resets *list (the double pointer suggests the caller's head pointer
 * is updated) -- confirm against the implementation.
 */
void object_list_free(struct object_list **list);
|
|
|
|
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
/* Object array handling .. */
|
|
|
|
/*
 * Add obj to the extensible array, with name as the name of the new
 * entry.
 * NOTE(review): whether name is copied or borrowed by the entry is not
 * visible from this header -- confirm against the implementation.
 */
void add_object_array(struct object *obj, const char *name, struct object_array *array);
|
2014-10-16 00:42:57 +02:00
|
|
|
/*
 * NOTE(review): presumably behaves like add_object_array() and also
 * stores mode and path on the new entry -- confirm against the
 * implementation, including whether path is copied or borrowed.
 */
void add_object_array_with_path(struct object *obj, const char *name,
				struct object_array *array,
				unsigned mode, const char *path);
|
2013-05-25 11:08:08 +02:00
|
|
|
|
object_array: add and use `object_array_pop()`
In a couple of places, we pop objects off an object array `foo` by
decreasing `foo.nr`. We access `foo.nr` in many places, but most if not
all other times we do so read-only, e.g., as we iterate over the array.
But when we change `foo.nr` behind the array's back, it feels a bit
nasty and looks like it might leak memory.
Leaks happen if the popped element has an allocated `name` or `path`.
At the moment, that is not the case. Still, 1) the object array might
gain more fields that want to be freed, 2) a code path where we pop
might start using names or paths, 3) one of these code paths might be
copied to somewhere where we do, and 4) using a dedicated function for
popping is conceptually cleaner.
Introduce and use `object_array_pop()` instead. Release memory in the
new function. Document that popping an object leaves the associated
elements in limbo.
The converted places were identified by grepping for "\.nr\>" and
looking for "--".
Make the new function return NULL on an empty array. This is consistent
with `pop_commit()` and allows the following:
while ((o = object_array_pop(&foo)) != NULL) {
// do something
}
But as noted above, we don't need to go out of our way to avoid reading
`foo.nr`. This is probably more readable:
while (foo.nr) {
... o = object_array_pop(&foo);
// do something
}
The name of `object_array_pop()` does not quite align with
`add_object_array()`. That is unfortunate. On the other hand, it matches
`object_array_clear()`. Arguably it's `add_...` that is the odd one out,
since it reads like it's used to "add" an "object array". For that
reason, side with `object_array_clear()`.
Signed-off-by: Martin Ågren <martin.agren@gmail.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-23 01:34:53 +02:00
|
|
|
/*
|
|
|
|
* Returns NULL if the array is empty. Otherwise, returns the last object
|
|
|
|
* after removing its entry from the array. Other resources associated
|
|
|
|
* with that object are left in an unspecified state and should not be
|
|
|
|
* examined.
|
|
|
|
*/
|
|
|
|
struct object *object_array_pop(struct object_array *array);
|
|
|
|
|
2013-05-25 11:08:08 +02:00
|
|
|
/*
 * Callback type taken by object_array_filter(): it is applied to each
 * entry of the array, and a true (nonzero) return keeps that entry.
 * NOTE(review): cb_data is presumably the pointer the caller handed to
 * object_array_filter() -- confirm.
 */
typedef int (*object_array_each_func_t)(struct object_array_entry *entry,
					void *cb_data);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Apply want to each entry in array, retaining only the entries for
|
|
|
|
* which the function returns true. Preserve the order of the entries
|
|
|
|
* that are retained.
|
|
|
|
*/
|
|
|
|
void object_array_filter(struct object_array *array,
|
|
|
|
object_array_each_func_t want, void *cb_data);
|
|
|
|
|
2013-05-25 11:08:10 +02:00
|
|
|
/*
|
|
|
|
* Remove from array all but the first entry with a given name.
|
|
|
|
* Warning: this function uses an O(N^2) algorithm.
|
|
|
|
*/
|
|
|
|
void object_array_remove_duplicates(struct object_array *array);
|
Add "named object array" concept
We've had this notion of a "object_list" for a long time, which eventually
grew a "name" member because some users (notably git-rev-list) wanted to
name each object as it is generated.
That object_list is great for some things, but it isn't all that wonderful
for others, and the "name" member is generally not used by everybody.
This patch splits the users of the object_list array up into two: the
traditional list users, who want the list-like format, and who don't
actually use or want the name. And another class of users that really used
the list as an extensible array, and generally wanted to name the objects.
The patch is fairly straightforward, but it's also biggish. Most of it
really just cleans things up: switching the revision parsing and listing
over to the array makes things like the builtin-diff usage much simpler
(we now see exactly how many members the array has, and we don't get the
objects reversed from the order they were on the command line).
One of the main reasons for doing this at all is that the malloc overhead
of the simple object list was actually pretty high, and the array is just
a lot denser. So this patch brings down memory usage by git-rev-list by
just under 3% (on top of all the other memory use optimizations) on the
mozilla archive.
It does add more lines than it removes, and more importantly, it adds a
whole new infrastructure for maintaining lists of objects, but on the
other hand, the new dynamic array code is pretty obvious. The change to
builtin-diff-tree.c shows a fairly good example of why an array interface
is sometimes more natural, and just much simpler for everybody.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-20 02:42:35 +02:00
|
|
|
|
2014-10-16 00:34:34 +02:00
|
|
|
/*
|
|
|
|
* Remove any objects from the array, freeing all used memory; afterwards
|
|
|
|
* the array is ready to store more objects with add_object_array().
|
|
|
|
*/
|
|
|
|
void object_array_clear(struct object_array *array);
|
|
|
|
|
2012-03-29 09:21:21 +02:00
|
|
|
/*
 * NOTE(review): presumably clears the given flag bits on every in-core
 * object rather than only on commits (compare clear_commit_marks_all())
 * -- confirm against the implementation.
 */
void clear_object_flags(unsigned flags);
|
|
|
|
|
2017-12-25 18:44:58 +01:00
|
|
|
/*
|
|
|
|
* Clear the specified object flags from all in-core commit objects.
|
|
|
|
*/
|
2019-04-29 10:28:14 +02:00
|
|
|
void clear_commit_marks_all(unsigned int flags);
|
2017-12-25 18:44:58 +01:00
|
|
|
|
2005-04-18 20:39:48 +02:00
|
|
|
#endif /* OBJECT_H */
|