2006-08-03 17:24:36 +02:00
|
|
|
#include "builtin.h"
|
2005-06-25 23:42:43 +02:00
|
|
|
#include "cache.h"
|
|
|
|
#include "object.h"
|
2006-04-02 14:44:09 +02:00
|
|
|
#include "blob.h"
|
|
|
|
#include "commit.h"
|
|
|
|
#include "tag.h"
|
|
|
|
#include "tree.h"
|
2005-06-25 23:42:43 +02:00
|
|
|
#include "delta.h"
|
2005-06-28 23:21:02 +02:00
|
|
|
#include "pack.h"
|
2005-06-27 05:27:56 +02:00
|
|
|
#include "csum-file.h"
|
2006-03-30 08:55:43 +02:00
|
|
|
#include "tree-walk.h"
|
2006-09-05 08:47:39 +02:00
|
|
|
#include "diff.h"
|
|
|
|
#include "revision.h"
|
|
|
|
#include "list-objects.h"
|
2005-06-25 23:42:43 +02:00
|
|
|
|
2006-11-07 16:51:23 +01:00
|
|
|
/* Synopsis string for git-pack-objects, listing the options it accepts. */
static const char pack_usage[] = "\
git-pack-objects [{ -q | --progress | --all-progress }] \n\
[--local] [--incremental] [--window=N] [--depth=N] \n\
[--no-reuse-delta] [--delta-base-offset] [--non-empty] \n\
[--revs [--unpacked | --all]*] [--reflog] [--stdout | base-name] \n\
[<ref-list | <object-list]";
|
2005-06-25 23:42:43 +02:00
|
|
|
|
|
|
|
struct object_entry {
|
|
|
|
unsigned char sha1[20];
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
unsigned long size; /* uncompressed size */
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t offset; /* offset into the final pack file;
|
2006-02-18 05:58:45 +01:00
|
|
|
* nonzero if already written.
|
|
|
|
*/
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
unsigned int depth; /* delta depth */
|
|
|
|
unsigned int hash; /* name hint hash */
|
2005-06-28 23:21:02 +02:00
|
|
|
enum object_type type;
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
enum object_type in_pack_type; /* could be delta */
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
unsigned long delta_size; /* delta data size (uncompressed) */
|
2006-09-23 03:25:04 +02:00
|
|
|
#define in_pack_header_size delta_size /* only when reusing pack data */
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
struct object_entry *delta; /* delta base object */
|
|
|
|
struct packed_git *in_pack; /* already in pack */
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t in_pack_offset;
|
2006-07-10 07:50:18 +02:00
|
|
|
struct object_entry *delta_child; /* deltified objects who bases me */
|
2006-02-18 05:58:45 +01:00
|
|
|
struct object_entry *delta_sibling; /* other deltified objects who
|
|
|
|
* uses the same base as me
|
|
|
|
*/
|
2006-02-19 23:47:21 +01:00
|
|
|
int preferred_base; /* we do not pack this, but is encouraged to
|
|
|
|
* be used as the base objectto delta huge
|
|
|
|
* objects against.
|
|
|
|
*/
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2007-04-09 07:06:31 +02:00
|
|
|
uint32_t crc32; /* crc of raw pack data for this object */
|
2005-06-25 23:42:43 +02:00
|
|
|
};
|
|
|
|
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
/*
|
2006-07-10 07:50:18 +02:00
|
|
|
* Objects we are going to pack are collected in objects array (dynamically
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
* expanded). nr_objects & nr_alloc control this array. They are stored
|
|
|
|
* in the order we see -- typically rev-list --objects order that gives us
|
|
|
|
* nice "minimum seek" order.
|
|
|
|
*
|
|
|
|
* sorted-by-sha and sorted-by-type are arrays of pointers that point at
|
|
|
|
* elements in the objects array. The former is used to build the pack
|
|
|
|
* index (lists object names in the ascending order to help offset lookup),
|
|
|
|
* and the latter is used to group similar things together by try_delta()
|
|
|
|
* heuristics.
|
|
|
|
*/
|
|
|
|
|
2005-07-04 00:34:04 +02:00
|
|
|
static unsigned char object_list_sha1[20];

/*
 * Mode flags.  NOTE(review): these presumably mirror the like-named
 * command line options (--non-empty, --no-reuse-delta, --local,
 * --incremental, --delta-base-offset); confirm against the option parser.
 */
static int non_empty;
static int no_reuse_delta;
static int local;
static int incremental;
static int allow_ofs_delta;

/* arrays of pointers that point at elements of the objects array */
static struct object_entry **sorted_by_sha, **sorted_by_type;
/* dynamically expanded array of the objects we are going to pack */
static struct object_entry *objects;
/* nr_objects and nr_alloc control the objects array */
static uint32_t nr_objects, nr_alloc, nr_result;
static const char *base_name;
static unsigned char pack_file_sha1[20];
static int progress = 1;	/* enabled by default */
static volatile sig_atomic_t progress_update;
static int window = 10;		/* default value */
static int pack_to_stdout;
static int num_preferred_base;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
/*
|
|
|
|
* The object names in objects array are hashed with this hashtable,
|
|
|
|
* to help looking up the entry by object name. Binary search from
|
|
|
|
* sorted_by_sha is also possible but this was easier to code and faster.
|
|
|
|
* This hashtable is built after all the objects are seen.
|
|
|
|
*/
|
2006-08-15 19:23:48 +02:00
|
|
|
/* hashtable used to look up an entry of the objects array by object name */
static int *object_ix;
/* NOTE(review): presumably the number of slots in object_ix -- confirm */
static int object_ix_hashsz;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Pack index for existing packs give us easy access to the offsets into
|
|
|
|
* corresponding pack file where each object's data starts, but the entries
|
|
|
|
* do not store the size of the compressed representation (uncompressed
|
2006-09-23 03:25:04 +02:00
|
|
|
* size is easily available by examining the pack entry header). It is
|
|
|
|
* also rather expensive to find the sha1 for an object given its offset.
|
|
|
|
*
|
|
|
|
* We build a hashtable of existing packs (pack_revindex), and keep reverse
|
|
|
|
* index here -- pack index file is sorted by object name mapping to offset;
|
|
|
|
* this pack_revindex[].revindex array is a list of offset/index_nr pairs
|
|
|
|
* ordered by offset, so if you know the offset of an object, next offset
|
|
|
|
* is where its packed representation ends and the index_nr can be used to
|
|
|
|
* get the object sha1 from the main index.
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
*/
|
2006-09-23 03:25:04 +02:00
|
|
|
/*
 * One element of a pack's reverse index: an offset/index_nr pair.  The
 * array of these is kept ordered by offset.
 */
struct revindex_entry {
	off_t offset;		/* where the object's data starts in the pack */
	unsigned int nr;	/* index_nr: position in the main pack index */
};
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
/* One slot of the hashtable of existing packs. */
struct pack_revindex {
	struct packed_git *p;		/* the pack; NULL marks an unused slot */
	struct revindex_entry *revindex; /* offset/index_nr pairs ordered by
					  * offset; lazily initialized
					  */
};
|
|
|
|
/* hashtable of existing packs, probed via pack_revindex_ix() */
static struct pack_revindex *pack_revindex;
/* number of slots in pack_revindex */
static int pack_revindex_hashsz;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* stats
|
|
|
|
*/
|
2007-03-07 02:44:24 +01:00
|
|
|
/* counters reported as statistics; all start at zero */
static uint32_t written, written_delta;
static uint32_t reused, reused_delta;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
|
|
|
static int pack_revindex_ix(struct packed_git *p)
|
|
|
|
{
|
2006-03-02 00:01:53 +01:00
|
|
|
unsigned long ui = (unsigned long)p;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
int i;
|
|
|
|
|
|
|
|
ui = ui ^ (ui >> 16); /* defeat structure alignment */
|
|
|
|
i = (int)(ui % pack_revindex_hashsz);
|
|
|
|
while (pack_revindex[i].p) {
|
|
|
|
if (pack_revindex[i].p == p)
|
|
|
|
return i;
|
|
|
|
if (++i == pack_revindex_hashsz)
|
|
|
|
i = 0;
|
|
|
|
}
|
|
|
|
return -1 - i;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void prepare_pack_ix(void)
|
|
|
|
{
|
|
|
|
int num;
|
|
|
|
struct packed_git *p;
|
|
|
|
for (num = 0, p = packed_git; p; p = p->next)
|
|
|
|
num++;
|
|
|
|
if (!num)
|
|
|
|
return;
|
|
|
|
pack_revindex_hashsz = num * 11;
|
|
|
|
pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz);
|
|
|
|
for (p = packed_git; p; p = p->next) {
|
|
|
|
num = pack_revindex_ix(p);
|
|
|
|
num = - 1 - num;
|
|
|
|
pack_revindex[num].p = p;
|
|
|
|
}
|
|
|
|
/* revindex elements are lazily initialized */
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cmp_offset(const void *a_, const void *b_)
|
|
|
|
{
|
2006-09-23 03:25:04 +02:00
|
|
|
const struct revindex_entry *a = a_;
|
|
|
|
const struct revindex_entry *b = b_;
|
|
|
|
return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ordered list of offsets of objects in the pack.
|
|
|
|
*/
|
|
|
|
static void prepare_pack_revindex(struct pack_revindex *rix)
|
|
|
|
{
|
|
|
|
struct packed_git *p = rix->p;
|
2007-04-09 07:06:28 +02:00
|
|
|
int num_ent = p->num_objects;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
int i;
|
2007-03-16 21:42:50 +01:00
|
|
|
const char *index = p->index_data;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
2006-09-23 03:25:04 +02:00
|
|
|
rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
|
2007-04-09 07:06:33 +02:00
|
|
|
index += 4 * 256;
|
|
|
|
|
|
|
|
if (p->index_version > 1) {
|
|
|
|
const uint32_t *off_32 =
|
|
|
|
(uint32_t *)(index + 8 + p->num_objects * (20 + 4));
|
|
|
|
const uint32_t *off_64 = off_32 + p->num_objects;
|
|
|
|
for (i = 0; i < num_ent; i++) {
|
|
|
|
uint32_t off = ntohl(*off_32++);
|
|
|
|
if (!(off & 0x80000000)) {
|
|
|
|
rix->revindex[i].offset = off;
|
|
|
|
} else {
|
|
|
|
rix->revindex[i].offset =
|
|
|
|
((uint64_t)ntohl(*off_64++)) << 32;
|
|
|
|
rix->revindex[i].offset |=
|
|
|
|
ntohl(*off_64++);
|
|
|
|
}
|
|
|
|
rix->revindex[i].nr = i;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (i = 0; i < num_ent; i++) {
|
|
|
|
uint32_t hl = *((uint32_t *)(index + 24 * i));
|
|
|
|
rix->revindex[i].offset = ntohl(hl);
|
|
|
|
rix->revindex[i].nr = i;
|
|
|
|
}
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
2007-04-09 07:06:33 +02:00
|
|
|
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
/* This knows the pack format -- the 20-byte trailer
|
|
|
|
* follows immediately after the last object data.
|
|
|
|
*/
|
2006-09-23 03:25:04 +02:00
|
|
|
rix->revindex[num_ent].offset = p->pack_size - 20;
|
|
|
|
rix->revindex[num_ent].nr = -1;
|
|
|
|
qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
|
|
|
|
2006-09-23 03:25:04 +02:00
|
|
|
static struct revindex_entry * find_packed_object(struct packed_git *p,
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t ofs)
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
{
|
|
|
|
int num;
|
|
|
|
int lo, hi;
|
|
|
|
struct pack_revindex *rix;
|
2006-09-23 03:25:04 +02:00
|
|
|
struct revindex_entry *revindex;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
num = pack_revindex_ix(p);
|
|
|
|
if (num < 0)
|
|
|
|
die("internal error: pack revindex uninitialized");
|
|
|
|
rix = &pack_revindex[num];
|
|
|
|
if (!rix->revindex)
|
|
|
|
prepare_pack_revindex(rix);
|
|
|
|
revindex = rix->revindex;
|
|
|
|
lo = 0;
|
2007-04-09 07:06:28 +02:00
|
|
|
hi = p->num_objects + 1;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
do {
|
|
|
|
int mi = (lo + hi) / 2;
|
2006-09-23 03:25:04 +02:00
|
|
|
if (revindex[mi].offset == ofs) {
|
|
|
|
return revindex + mi;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
2006-09-23 03:25:04 +02:00
|
|
|
else if (ofs < revindex[mi].offset)
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
hi = mi;
|
|
|
|
else
|
|
|
|
lo = mi + 1;
|
|
|
|
} while (lo < hi);
|
|
|
|
die("internal error: pack revindex corrupt");
|
|
|
|
}
|
|
|
|
|
2007-03-16 21:42:50 +01:00
|
|
|
static const unsigned char *find_packed_object_name(struct packed_git *p,
|
|
|
|
off_t ofs)
|
2006-09-23 03:25:04 +02:00
|
|
|
{
|
|
|
|
struct revindex_entry *entry = find_packed_object(p, ofs);
|
2007-04-04 22:49:04 +02:00
|
|
|
return nth_packed_object_sha1(p, entry->nr);
|
2006-09-23 03:25:04 +02:00
|
|
|
}
|
|
|
|
|
2005-06-25 23:42:43 +02:00
|
|
|
static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
|
|
|
|
{
|
|
|
|
unsigned long othersize, delta_size;
|
2007-02-26 20:55:59 +01:00
|
|
|
enum object_type type;
|
|
|
|
void *otherbuf = read_sha1_file(entry->delta->sha1, &type, &othersize);
|
2005-06-25 23:42:43 +02:00
|
|
|
void *delta_buf;
|
|
|
|
|
|
|
|
if (!otherbuf)
|
|
|
|
die("unable to read %s", sha1_to_hex(entry->delta->sha1));
|
2005-06-26 13:29:18 +02:00
|
|
|
delta_buf = diff_delta(otherbuf, othersize,
|
2005-06-29 08:49:56 +02:00
|
|
|
buf, size, &delta_size, 0);
|
2005-06-26 04:30:20 +02:00
|
|
|
if (!delta_buf || delta_size != entry->delta_size)
|
2005-06-25 23:42:43 +02:00
|
|
|
die("delta size changed");
|
|
|
|
free(buf);
|
|
|
|
free(otherbuf);
|
|
|
|
return delta_buf;
|
|
|
|
}
|
|
|
|
|
2005-06-28 23:21:02 +02:00
|
|
|
/*
|
|
|
|
* The per-object header is a pretty dense thing, which is
|
|
|
|
* - first byte: low four bits are "size", then three bits of "type",
|
|
|
|
* and the high bit is "size continues".
|
|
|
|
* - each byte afterwards: low seven bits are size continuation,
|
|
|
|
* with the high bit being "size continues"
|
|
|
|
*/
|
|
|
|
static int encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
|
|
|
|
{
|
2005-06-29 07:15:57 +02:00
|
|
|
int n = 1;
|
2005-06-28 23:21:02 +02:00
|
|
|
unsigned char c;
|
|
|
|
|
2006-09-21 06:06:49 +02:00
|
|
|
if (type < OBJ_COMMIT || type > OBJ_REF_DELTA)
|
2005-06-28 23:21:02 +02:00
|
|
|
die("bad type %d", type);
|
|
|
|
|
2005-06-29 07:15:57 +02:00
|
|
|
c = (type << 4) | (size & 15);
|
|
|
|
size >>= 4;
|
|
|
|
while (size) {
|
2005-06-28 23:21:02 +02:00
|
|
|
*hdr++ = c | 0x80;
|
2005-06-29 07:15:57 +02:00
|
|
|
c = size & 0x7f;
|
|
|
|
size >>= 7;
|
|
|
|
n++;
|
2005-06-28 23:21:02 +02:00
|
|
|
}
|
|
|
|
*hdr = c;
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
2006-09-23 03:25:04 +02:00
|
|
|
/*
|
|
|
|
* we are going to reuse the existing object data as is. make
|
|
|
|
* sure it is not corrupt.
|
|
|
|
*/
|
2006-12-23 08:34:13 +01:00
|
|
|
static int check_pack_inflate(struct packed_git *p,
|
|
|
|
struct pack_window **w_curs,
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t offset,
|
|
|
|
off_t len,
|
2006-12-23 08:34:13 +01:00
|
|
|
unsigned long expect)
|
|
|
|
{
|
|
|
|
z_stream stream;
|
|
|
|
unsigned char fakebuf[4096], *in;
|
|
|
|
int st;
|
|
|
|
|
|
|
|
memset(&stream, 0, sizeof(stream));
|
|
|
|
inflateInit(&stream);
|
|
|
|
do {
|
|
|
|
in = use_pack(p, w_curs, offset, &stream.avail_in);
|
|
|
|
stream.next_in = in;
|
|
|
|
stream.next_out = fakebuf;
|
|
|
|
stream.avail_out = sizeof(fakebuf);
|
|
|
|
st = inflate(&stream, Z_FINISH);
|
|
|
|
offset += stream.next_in - in;
|
|
|
|
} while (st == Z_OK || st == Z_BUF_ERROR);
|
|
|
|
inflateEnd(&stream);
|
|
|
|
return (st == Z_STREAM_END &&
|
|
|
|
stream.total_out == expect &&
|
|
|
|
stream.total_in == len) ? 0 : -1;
|
|
|
|
}
|
|
|
|
|
2007-04-10 06:15:41 +02:00
|
|
|
static int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
|
|
|
|
off_t offset, off_t len, unsigned int nr)
|
|
|
|
{
|
|
|
|
const uint32_t *index_crc;
|
|
|
|
uint32_t data_crc = crc32(0, Z_NULL, 0);
|
|
|
|
|
|
|
|
do {
|
|
|
|
unsigned int avail;
|
|
|
|
void *data = use_pack(p, w_curs, offset, &avail);
|
|
|
|
if (avail > len)
|
|
|
|
avail = len;
|
|
|
|
data_crc = crc32(data_crc, data, avail);
|
|
|
|
offset += avail;
|
|
|
|
len -= avail;
|
|
|
|
} while (len);
|
|
|
|
|
|
|
|
index_crc = p->index_data;
|
|
|
|
index_crc += 2 + 256 + p->num_objects * (20/4) + nr;
|
|
|
|
|
|
|
|
return data_crc != ntohl(*index_crc);
|
|
|
|
}
|
|
|
|
|
2006-12-23 08:34:13 +01:00
|
|
|
/*
 * Copy "len" raw bytes starting at "offset" in pack "p" to the output
 * file "f", one mapped chunk at a time.
 */
static void copy_pack_data(struct sha1file *f,
		struct packed_git *p,
		struct pack_window **w_curs,
		off_t offset,
		off_t len)
{
	while (len != 0) {
		unsigned int chunk;
		unsigned char *src = use_pack(p, w_curs, offset, &chunk);

		/* do not copy past the end of the requested range */
		if (chunk > len)
			chunk = (unsigned int)len;
		sha1write(f, src, chunk);
		len -= chunk;
		offset += chunk;
	}
}
|
|
|
|
|
|
|
|
static int check_loose_inflate(unsigned char *data, unsigned long len, unsigned long expect)
|
2006-09-02 00:05:12 +02:00
|
|
|
{
|
2006-09-04 06:09:18 +02:00
|
|
|
z_stream stream;
|
|
|
|
unsigned char fakebuf[4096];
|
|
|
|
int st;
|
|
|
|
|
|
|
|
memset(&stream, 0, sizeof(stream));
|
|
|
|
stream.next_in = data;
|
|
|
|
stream.avail_in = len;
|
|
|
|
stream.next_out = fakebuf;
|
|
|
|
stream.avail_out = sizeof(fakebuf);
|
|
|
|
inflateInit(&stream);
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
st = inflate(&stream, Z_FINISH);
|
|
|
|
if (st == Z_STREAM_END || st == Z_OK) {
|
|
|
|
st = (stream.total_out == expect &&
|
|
|
|
stream.total_in == len) ? 0 : -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (st != Z_BUF_ERROR) {
|
|
|
|
st = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
stream.next_out = fakebuf;
|
|
|
|
stream.avail_out = sizeof(fakebuf);
|
|
|
|
}
|
|
|
|
inflateEnd(&stream);
|
|
|
|
return st;
|
2006-09-02 00:05:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static int revalidate_loose_object(struct object_entry *entry,
|
|
|
|
unsigned char *map,
|
|
|
|
unsigned long mapsize)
|
|
|
|
{
|
|
|
|
/* we already know this is a loose object with new type header. */
|
2006-09-04 06:09:18 +02:00
|
|
|
enum object_type type;
|
|
|
|
unsigned long size, used;
|
2006-09-02 00:05:12 +02:00
|
|
|
|
|
|
|
if (pack_to_stdout)
|
|
|
|
return 0;
|
|
|
|
|
2006-09-04 06:09:18 +02:00
|
|
|
used = unpack_object_header_gently(map, mapsize, &type, &size);
|
|
|
|
if (!used)
|
|
|
|
return -1;
|
|
|
|
map += used;
|
|
|
|
mapsize -= used;
|
2006-12-23 08:34:13 +01:00
|
|
|
return check_loose_inflate(map, mapsize, size);
|
2006-09-02 00:05:12 +02:00
|
|
|
}
|
|
|
|
|
2007-04-09 07:06:30 +02:00
|
|
|
static unsigned long write_object(struct sha1file *f,
|
2006-02-19 23:47:21 +01:00
|
|
|
struct object_entry *entry)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
|
|
|
unsigned long size;
|
2007-02-26 20:55:59 +01:00
|
|
|
enum object_type type;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
void *buf;
|
2005-06-28 23:21:02 +02:00
|
|
|
unsigned char header[10];
|
2007-03-07 02:44:34 +01:00
|
|
|
unsigned hdrlen;
|
|
|
|
off_t datalen;
|
2005-06-28 23:21:02 +02:00
|
|
|
enum object_type obj_type;
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
int to_reuse = 0;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is deemed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2007-04-09 07:06:31 +02:00
|
|
|
if (!pack_to_stdout)
|
|
|
|
crc32_begin(f);
|
|
|
|
|
2005-06-28 23:21:02 +02:00
|
|
|
obj_type = entry->type;
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
if (! entry->in_pack)
|
|
|
|
to_reuse = 0; /* can't reuse what we don't have */
|
2006-09-23 03:25:04 +02:00
|
|
|
else if (obj_type == OBJ_REF_DELTA || obj_type == OBJ_OFS_DELTA)
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
to_reuse = 1; /* check_object() decided it for us */
|
|
|
|
else if (obj_type != entry->in_pack_type)
|
|
|
|
to_reuse = 0; /* pack has delta which is unusable */
|
|
|
|
else if (entry->delta)
|
|
|
|
to_reuse = 0; /* we want to pack afresh */
|
|
|
|
else
|
|
|
|
to_reuse = 1; /* we have it in-pack undeltified,
|
|
|
|
* and we do not need to deltify it.
|
|
|
|
*/
|
|
|
|
|
2006-07-18 00:06:23 +02:00
|
|
|
if (!entry->in_pack && !entry->delta) {
|
|
|
|
unsigned char *map;
|
|
|
|
unsigned long mapsize;
|
|
|
|
map = map_sha1_file(entry->sha1, &mapsize);
|
|
|
|
if (map && !legacy_loose_object(map)) {
|
|
|
|
/* We can copy straight into the pack file */
|
2006-09-02 00:05:12 +02:00
|
|
|
if (revalidate_loose_object(entry, map, mapsize))
|
|
|
|
die("corrupt loose object %s",
|
|
|
|
sha1_to_hex(entry->sha1));
|
2006-07-18 00:06:23 +02:00
|
|
|
sha1write(f, map, mapsize);
|
|
|
|
munmap(map, mapsize);
|
|
|
|
written++;
|
|
|
|
reused++;
|
|
|
|
return mapsize;
|
|
|
|
}
|
|
|
|
if (map)
|
|
|
|
munmap(map, mapsize);
|
|
|
|
}
|
|
|
|
|
2006-09-02 00:05:12 +02:00
|
|
|
if (!to_reuse) {
|
2007-02-26 20:55:59 +01:00
|
|
|
buf = read_sha1_file(entry->sha1, &type, &size);
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
if (!buf)
|
|
|
|
die("unable to read %s", sha1_to_hex(entry->sha1));
|
|
|
|
if (size != entry->size)
|
|
|
|
die("object %s size inconsistency (%lu vs %lu)",
|
|
|
|
sha1_to_hex(entry->sha1), size, entry->size);
|
|
|
|
if (entry->delta) {
|
|
|
|
buf = delta_against(buf, size, entry);
|
|
|
|
size = entry->delta_size;
|
2006-09-21 06:09:44 +02:00
|
|
|
obj_type = (allow_ofs_delta && entry->delta->offset) ?
|
|
|
|
OBJ_OFS_DELTA : OBJ_REF_DELTA;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The object header is a byte of 'type' followed by zero or
|
2006-09-21 06:09:44 +02:00
|
|
|
* more bytes of length.
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
*/
|
|
|
|
hdrlen = encode_header(obj_type, size, header);
|
|
|
|
sha1write(f, header, hdrlen);
|
|
|
|
|
2006-09-21 06:09:44 +02:00
|
|
|
if (obj_type == OBJ_OFS_DELTA) {
|
|
|
|
/*
|
|
|
|
* Deltas with relative base contain an additional
|
|
|
|
* encoding of the relative offset for the delta
|
|
|
|
* base from this object's position in the pack.
|
|
|
|
*/
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t ofs = entry->offset - entry->delta->offset;
|
2006-09-21 06:09:44 +02:00
|
|
|
unsigned pos = sizeof(header) - 1;
|
|
|
|
header[pos] = ofs & 127;
|
|
|
|
while (ofs >>= 7)
|
|
|
|
header[--pos] = 128 | (--ofs & 127);
|
|
|
|
sha1write(f, header + pos, sizeof(header) - pos);
|
|
|
|
hdrlen += sizeof(header) - pos;
|
|
|
|
} else if (obj_type == OBJ_REF_DELTA) {
|
|
|
|
/*
|
|
|
|
* Deltas with a base reference contain
|
|
|
|
* an additional 20 bytes for the base sha1.
|
|
|
|
*/
|
|
|
|
sha1write(f, entry->delta->sha1, 20);
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
hdrlen += 20;
|
|
|
|
}
|
|
|
|
datalen = sha1write_compressed(f, buf, size);
|
|
|
|
free(buf);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
else {
|
|
|
|
struct packed_git *p = entry->in_pack;
|
2006-12-23 08:34:08 +01:00
|
|
|
struct pack_window *w_curs = NULL;
|
2007-04-10 06:15:41 +02:00
|
|
|
struct revindex_entry *revidx;
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t offset;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
2006-09-23 03:25:04 +02:00
|
|
|
if (entry->delta) {
|
|
|
|
obj_type = (allow_ofs_delta && entry->delta->offset) ?
|
|
|
|
OBJ_OFS_DELTA : OBJ_REF_DELTA;
|
|
|
|
reused_delta++;
|
|
|
|
}
|
|
|
|
hdrlen = encode_header(obj_type, entry->size, header);
|
|
|
|
sha1write(f, header, hdrlen);
|
|
|
|
if (obj_type == OBJ_OFS_DELTA) {
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t ofs = entry->offset - entry->delta->offset;
|
2006-09-23 03:25:04 +02:00
|
|
|
unsigned pos = sizeof(header) - 1;
|
|
|
|
header[pos] = ofs & 127;
|
|
|
|
while (ofs >>= 7)
|
|
|
|
header[--pos] = 128 | (--ofs & 127);
|
|
|
|
sha1write(f, header + pos, sizeof(header) - pos);
|
|
|
|
hdrlen += sizeof(header) - pos;
|
|
|
|
} else if (obj_type == OBJ_REF_DELTA) {
|
|
|
|
sha1write(f, entry->delta->sha1, 20);
|
|
|
|
hdrlen += 20;
|
|
|
|
}
|
2006-09-02 00:05:12 +02:00
|
|
|
|
2007-04-10 06:15:41 +02:00
|
|
|
offset = entry->in_pack_offset;
|
|
|
|
revidx = find_packed_object(p, offset);
|
|
|
|
datalen = revidx[1].offset - offset;
|
|
|
|
if (!pack_to_stdout && p->index_version > 1 &&
|
|
|
|
check_pack_crc(p, &w_curs, offset, datalen, revidx->nr))
|
|
|
|
die("bad packed object CRC for %s", sha1_to_hex(entry->sha1));
|
|
|
|
offset += entry->in_pack_header_size;
|
|
|
|
datalen -= entry->in_pack_header_size;
|
|
|
|
if (!pack_to_stdout && p->index_version == 1 &&
|
|
|
|
check_pack_inflate(p, &w_curs, offset, datalen, entry->size))
|
|
|
|
die("corrupt packed object for %s", sha1_to_hex(entry->sha1));
|
2006-12-23 08:34:13 +01:00
|
|
|
copy_pack_data(f, p, &w_curs, offset, datalen);
|
2006-12-23 08:34:08 +01:00
|
|
|
unuse_pack(&w_curs);
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
reused++;
|
2005-06-28 23:21:02 +02:00
|
|
|
}
|
2006-09-21 06:09:44 +02:00
|
|
|
if (entry->delta)
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
written_delta++;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
written++;
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is deemed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2007-04-09 07:06:31 +02:00
|
|
|
if (!pack_to_stdout)
|
|
|
|
entry->crc32 = crc32_end(f);
|
2005-06-25 23:42:43 +02:00
|
|
|
return hdrlen + datalen;
|
|
|
|
}
|
|
|
|
|
2007-03-07 02:44:34 +01:00
|
|
|
static off_t write_one(struct sha1file *f,
|
2005-06-29 02:49:27 +02:00
|
|
|
struct object_entry *e,
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t offset)
|
2005-06-29 02:49:27 +02:00
|
|
|
{
|
2007-04-09 07:06:30 +02:00
|
|
|
unsigned long size;
|
|
|
|
|
|
|
|
/* offset is non zero if object is written already. */
|
2006-09-21 06:09:44 +02:00
|
|
|
if (e->offset || e->preferred_base)
|
2005-06-29 02:49:27 +02:00
|
|
|
return offset;
|
2007-04-09 07:06:30 +02:00
|
|
|
|
|
|
|
/* if we are deltified, write out base object first. */
|
2005-06-29 02:49:27 +02:00
|
|
|
if (e->delta)
|
|
|
|
offset = write_one(f, e->delta, offset);
|
2007-04-09 07:06:30 +02:00
|
|
|
|
2006-09-21 06:09:44 +02:00
|
|
|
e->offset = offset;
|
2007-04-09 07:06:30 +02:00
|
|
|
size = write_object(f, e);
|
|
|
|
|
|
|
|
/* make sure off_t is sufficiently large not to wrap */
|
|
|
|
if (offset > offset + size)
|
|
|
|
die("pack too large for current definition of off_t");
|
|
|
|
return offset + size;
|
2005-06-29 02:49:27 +02:00
|
|
|
}
|
|
|
|
|
2007-04-09 07:06:33 +02:00
|
|
|
static off_t write_pack_file(void)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i;
|
2005-06-28 20:10:48 +02:00
|
|
|
struct sha1file *f;
|
2007-04-09 07:06:33 +02:00
|
|
|
off_t offset, last_obj_offset = 0;
|
2005-06-28 23:21:02 +02:00
|
|
|
struct pack_header hdr;
|
2006-02-23 01:02:59 +01:00
|
|
|
unsigned last_percent = 999;
|
2006-10-31 22:58:32 +01:00
|
|
|
int do_progress = progress;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
2006-10-31 22:58:32 +01:00
|
|
|
if (!base_name) {
|
2005-06-28 20:10:48 +02:00
|
|
|
f = sha1fd(1, "<stdout>");
|
2006-10-31 22:58:32 +01:00
|
|
|
do_progress >>= 1;
|
|
|
|
}
|
|
|
|
else
|
2006-02-23 01:02:59 +01:00
|
|
|
f = sha1create("%s-%s.%s", base_name,
|
|
|
|
sha1_to_hex(object_list_sha1), "pack");
|
|
|
|
if (do_progress)
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Writing %u objects.\n", nr_result);
|
2006-02-23 01:02:59 +01:00
|
|
|
|
2005-06-28 23:21:02 +02:00
|
|
|
hdr.hdr_signature = htonl(PACK_SIGNATURE);
|
2005-06-29 07:15:57 +02:00
|
|
|
hdr.hdr_version = htonl(PACK_VERSION);
|
2006-02-19 23:47:21 +01:00
|
|
|
hdr.hdr_entries = htonl(nr_result);
|
2005-06-28 23:21:02 +02:00
|
|
|
sha1write(f, &hdr, sizeof(hdr));
|
|
|
|
offset = sizeof(hdr);
|
2006-02-25 06:55:23 +01:00
|
|
|
if (!nr_result)
|
|
|
|
goto done;
|
2006-02-22 23:41:32 +01:00
|
|
|
for (i = 0; i < nr_objects; i++) {
|
2007-04-09 07:06:33 +02:00
|
|
|
last_obj_offset = offset;
|
2005-06-29 02:49:27 +02:00
|
|
|
offset = write_one(f, objects + i, offset);
|
2006-02-23 01:02:59 +01:00
|
|
|
if (do_progress) {
|
2006-02-25 06:55:23 +01:00
|
|
|
unsigned percent = written * 100 / nr_result;
|
2006-02-23 01:02:59 +01:00
|
|
|
if (progress_update || percent != last_percent) {
|
|
|
|
fprintf(stderr, "%4u%% (%u/%u) done\r",
|
2006-02-25 06:55:23 +01:00
|
|
|
percent, written, nr_result);
|
2006-02-23 01:02:59 +01:00
|
|
|
progress_update = 0;
|
|
|
|
last_percent = percent;
|
|
|
|
}
|
2006-02-22 23:41:32 +01:00
|
|
|
}
|
|
|
|
}
|
2006-02-23 01:02:59 +01:00
|
|
|
if (do_progress)
|
|
|
|
fputc('\n', stderr);
|
2006-02-25 06:55:23 +01:00
|
|
|
done:
|
2006-11-29 23:15:48 +01:00
|
|
|
if (written != nr_result)
|
2007-03-07 02:44:24 +01:00
|
|
|
die("wrote %u objects while expecting %u", written, nr_result);
|
2005-06-27 07:01:46 +02:00
|
|
|
sha1close(f, pack_file_sha1, 1);
|
2007-04-09 07:06:33 +02:00
|
|
|
|
|
|
|
return last_obj_offset;
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
2007-04-09 23:32:03 +02:00
|
|
|
/* index format version used unless the pack's offsets force version 2 */
static uint32_t index_default_version = 1;

/* largest offset stored directly in the 32-bit offset table of a v2 index */
static uint32_t index_off32_limit = 0x7fffffff;
|
|
|
|
|
2007-04-09 07:06:33 +02:00
|
|
|
static void write_index_file(off_t last_obj_offset)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i;
|
2006-02-19 23:47:21 +01:00
|
|
|
struct sha1file *f = sha1create("%s-%s.%s", base_name,
|
|
|
|
sha1_to_hex(object_list_sha1), "idx");
|
2005-06-25 23:42:43 +02:00
|
|
|
struct object_entry **list = sorted_by_sha;
|
2006-02-19 23:47:21 +01:00
|
|
|
struct object_entry **last = list + nr_result;
|
2007-01-18 08:17:28 +01:00
|
|
|
uint32_t array[256];
|
2007-04-09 07:06:33 +02:00
|
|
|
uint32_t index_version;
|
|
|
|
|
|
|
|
/* if last object's offset is >= 2^31 we should use index V2 */
|
2007-04-09 23:32:03 +02:00
|
|
|
index_version = (last_obj_offset >> 31) ? 2 : index_default_version;
|
2007-04-09 07:06:33 +02:00
|
|
|
|
|
|
|
/* index versions 2 and above need a header */
|
|
|
|
if (index_version >= 2) {
|
|
|
|
struct pack_idx_header hdr;
|
|
|
|
hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
|
|
|
|
hdr.idx_version = htonl(index_version);
|
|
|
|
sha1write(f, &hdr, sizeof(hdr));
|
|
|
|
}
|
2005-06-25 23:42:43 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the first-level table (the list is sorted,
|
|
|
|
* but we use a 256-entry lookup to be able to avoid
|
2005-06-27 07:01:46 +02:00
|
|
|
* having to do eight extra binary search iterations).
|
2005-06-25 23:42:43 +02:00
|
|
|
*/
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
|
|
struct object_entry **next = list;
|
|
|
|
while (next < last) {
|
|
|
|
struct object_entry *entry = *next;
|
|
|
|
if (entry->sha1[0] != i)
|
|
|
|
break;
|
|
|
|
next++;
|
|
|
|
}
|
|
|
|
array[i] = htonl(next - sorted_by_sha);
|
|
|
|
list = next;
|
|
|
|
}
|
2007-01-18 08:17:28 +01:00
|
|
|
sha1write(f, array, 256 * 4);
|
2005-06-25 23:42:43 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the actual SHA1 entries..
|
|
|
|
*/
|
|
|
|
list = sorted_by_sha;
|
2006-02-19 23:47:21 +01:00
|
|
|
for (i = 0; i < nr_result; i++) {
|
2005-06-25 23:42:43 +02:00
|
|
|
struct object_entry *entry = *list++;
|
2007-04-09 07:06:33 +02:00
|
|
|
if (index_version < 2) {
|
|
|
|
uint32_t offset = htonl(entry->offset);
|
|
|
|
sha1write(f, &offset, 4);
|
|
|
|
}
|
2005-06-27 05:27:56 +02:00
|
|
|
sha1write(f, entry->sha1, 20);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
2007-04-09 07:06:33 +02:00
|
|
|
|
|
|
|
if (index_version >= 2) {
|
|
|
|
unsigned int nr_large_offset = 0;
|
|
|
|
|
|
|
|
/* write the crc32 table */
|
|
|
|
list = sorted_by_sha;
|
|
|
|
for (i = 0; i < nr_objects; i++) {
|
|
|
|
struct object_entry *entry = *list++;
|
|
|
|
uint32_t crc32_val = htonl(entry->crc32);
|
|
|
|
sha1write(f, &crc32_val, 4);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* write the 32-bit offset table */
|
|
|
|
list = sorted_by_sha;
|
|
|
|
for (i = 0; i < nr_objects; i++) {
|
|
|
|
struct object_entry *entry = *list++;
|
2007-04-09 23:32:03 +02:00
|
|
|
uint32_t offset = (entry->offset <= index_off32_limit) ?
|
2007-04-09 07:06:33 +02:00
|
|
|
entry->offset : (0x80000000 | nr_large_offset++);
|
|
|
|
offset = htonl(offset);
|
|
|
|
sha1write(f, &offset, 4);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* write the large offset table */
|
|
|
|
list = sorted_by_sha;
|
|
|
|
while (nr_large_offset) {
|
|
|
|
struct object_entry *entry = *list++;
|
|
|
|
uint64_t offset = entry->offset;
|
2007-04-09 23:32:03 +02:00
|
|
|
if (offset > index_off32_limit) {
|
2007-04-09 07:06:33 +02:00
|
|
|
uint32_t split[2];
|
|
|
|
split[0] = htonl(offset >> 32);
|
|
|
|
split[1] = htonl(offset & 0xffffffff);
|
|
|
|
sha1write(f, split, 8);
|
|
|
|
nr_large_offset--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-06-27 07:01:46 +02:00
|
|
|
sha1write(f, pack_file_sha1, 20);
|
|
|
|
sha1close(f, NULL, 1);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
2006-02-19 23:47:21 +01:00
|
|
|
static int locate_object_entry_hash(const unsigned char *sha1)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
unsigned int ui;
|
|
|
|
memcpy(&ui, sha1, sizeof(unsigned int));
|
|
|
|
i = ui % object_ix_hashsz;
|
|
|
|
while (0 < object_ix[i]) {
|
2006-08-17 20:54:57 +02:00
|
|
|
if (!hashcmp(sha1, objects[object_ix[i] - 1].sha1))
|
2006-02-19 23:47:21 +01:00
|
|
|
return i;
|
|
|
|
if (++i == object_ix_hashsz)
|
|
|
|
i = 0;
|
|
|
|
}
|
|
|
|
return -1 - i;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct object_entry *locate_object_entry(const unsigned char *sha1)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!object_ix_hashsz)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
i = locate_object_entry_hash(sha1);
|
|
|
|
if (0 <= i)
|
|
|
|
return &objects[object_ix[i]-1];
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void rehash_objects(void)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i;
|
2006-02-19 23:47:21 +01:00
|
|
|
struct object_entry *oe;
|
|
|
|
|
|
|
|
object_ix_hashsz = nr_objects * 3;
|
|
|
|
if (object_ix_hashsz < 1024)
|
|
|
|
object_ix_hashsz = 1024;
|
|
|
|
object_ix = xrealloc(object_ix, sizeof(int) * object_ix_hashsz);
|
2006-04-06 08:24:57 +02:00
|
|
|
memset(object_ix, 0, sizeof(int) * object_ix_hashsz);
|
2006-02-19 23:47:21 +01:00
|
|
|
for (i = 0, oe = objects; i < nr_objects; i++, oe++) {
|
|
|
|
int ix = locate_object_entry_hash(oe->sha1);
|
|
|
|
if (0 <= ix)
|
|
|
|
continue;
|
|
|
|
ix = -1 - ix;
|
|
|
|
object_ix[ix] = i + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in simplification for computing the sorting number
computation for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-05 21:03:31 +02:00
|
|
|
/*
 * Compute a "sorting number" for a path name.
 *
 * name - NUL-terminated path
 *
 * Returns a value in which each non-whitespace character is placed in
 * the top byte while everything accumulated before it is shifted down
 * by two bits.  Later characters therefore weigh more, and names that
 * end the same way (e.g. in ".c") get close values.  With a 32-bit
 * unsigned only the last sixteen non-whitespace characters contribute.
 * An empty name yields 0.
 */
static unsigned name_hash(const char *name)
{
	unsigned char c;
	unsigned hash = 0;

	/*
	 * This effectively just creates a sortable number from the
	 * last sixteen non-whitespace characters. Last characters
	 * count "most", so things that end in ".c" sort together.
	 */
	while ((c = *name++) != 0) {
		if (isspace(c))
			continue;
		/*
		 * Shift as unsigned: "c" would otherwise be promoted to
		 * int, and shifting a value >= 0x80 left by 24 overflows
		 * a signed int.
		 */
		hash = (hash >> 2) + ((unsigned)c << 24);
	}
	return hash;
}
|
|
|
|
|
|
|
|
static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
|
|
|
struct object_entry *entry;
|
2007-04-11 04:54:36 +02:00
|
|
|
struct packed_git *p, *found_pack = NULL;
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t found_offset = 0;
|
2007-04-11 04:54:36 +02:00
|
|
|
int ix;
|
|
|
|
|
|
|
|
ix = nr_objects ? locate_object_entry_hash(sha1) : -1;
|
|
|
|
if (ix >= 0) {
|
|
|
|
if (exclude) {
|
|
|
|
entry = objects + object_ix[ix] - 1;
|
|
|
|
entry->preferred_base = 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2005-06-25 23:42:43 +02:00
|
|
|
|
2006-02-19 23:47:21 +01:00
|
|
|
if (!exclude) {
|
2005-10-14 00:38:28 +02:00
|
|
|
for (p = packed_git; p; p = p->next) {
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t offset = find_pack_entry_one(sha1, p);
|
2006-09-21 06:05:37 +02:00
|
|
|
if (offset) {
|
2005-10-14 00:38:28 +02:00
|
|
|
if (incremental)
|
|
|
|
return 0;
|
|
|
|
if (local && !p->pack_local)
|
|
|
|
return 0;
|
2006-02-19 23:47:21 +01:00
|
|
|
if (!found_pack) {
|
2006-09-21 06:05:37 +02:00
|
|
|
found_offset = offset;
|
|
|
|
found_pack = p;
|
2006-02-19 23:47:21 +01:00
|
|
|
}
|
2005-10-14 00:38:28 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2005-07-03 22:08:40 +02:00
|
|
|
|
2007-04-11 04:54:36 +02:00
|
|
|
if (nr_objects >= nr_alloc) {
|
|
|
|
nr_alloc = (nr_alloc + 1024) * 3 / 2;
|
2007-03-07 02:44:24 +01:00
|
|
|
objects = xrealloc(objects, nr_alloc * sizeof(*entry));
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
2007-04-11 04:54:36 +02:00
|
|
|
|
|
|
|
entry = objects + nr_objects++;
|
2005-06-25 23:42:43 +02:00
|
|
|
memset(entry, 0, sizeof(*entry));
|
2006-08-23 08:49:00 +02:00
|
|
|
hashcpy(entry->sha1, sha1);
|
2005-06-27 00:27:28 +02:00
|
|
|
entry->hash = hash;
|
2007-04-11 04:54:36 +02:00
|
|
|
if (exclude)
|
|
|
|
entry->preferred_base = 1;
|
|
|
|
if (found_pack) {
|
|
|
|
entry->in_pack = found_pack;
|
|
|
|
entry->in_pack_offset = found_offset;
|
|
|
|
}
|
2006-02-19 23:47:21 +01:00
|
|
|
|
|
|
|
if (object_ix_hashsz * 3 <= nr_objects * 4)
|
|
|
|
rehash_objects();
|
2007-04-11 04:54:36 +02:00
|
|
|
else
|
|
|
|
object_ix[-1 - ix] = nr_objects;
|
2006-02-19 23:47:21 +01:00
|
|
|
|
2006-02-25 06:55:23 +01:00
|
|
|
if (progress_update) {
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Counting objects...%u\r", nr_objects);
|
2006-02-25 06:55:23 +01:00
|
|
|
progress_update = 0;
|
|
|
|
}
|
2007-04-11 04:54:36 +02:00
|
|
|
|
|
|
|
return 1;
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
2006-04-06 08:24:57 +02:00
|
|
|
/* One cached tree object used while looking up preferred bases. */
struct pbase_tree_cache {
	unsigned char sha1[20];		/* name of the cached tree */
	int ref;			/* taken by pbase_tree_get(), dropped by pbase_tree_put() */
	int temporary;			/* non-zero: not stored in the cache, freed on put */
	void *tree_data;		/* tree object contents */
	unsigned long tree_size;	/* size of tree_data */
};
|
|
|
|
|
|
|
|
/* fixed-size table of cached trees, indexed starting from the first SHA1 byte */
static struct pbase_tree_cache *pbase_tree_cache[256];
|
|
|
|
static int pbase_tree_cache_ix(const unsigned char *sha1)
|
|
|
|
{
|
|
|
|
return sha1[0] % ARRAY_SIZE(pbase_tree_cache);
|
|
|
|
}
|
|
|
|
static int pbase_tree_cache_ix_incr(int ix)
|
|
|
|
{
|
|
|
|
return (ix+1) % ARRAY_SIZE(pbase_tree_cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct pbase_tree {
|
|
|
|
struct pbase_tree *next;
|
|
|
|
/* This is a phony "cache" entry; we are not
|
|
|
|
* going to evict it nor find it through _get()
|
|
|
|
* mechanism -- this is for the toplevel node that
|
|
|
|
* would almost always change with any commit.
|
|
|
|
*/
|
|
|
|
struct pbase_tree_cache pcache;
|
|
|
|
} *pbase_tree;
|
|
|
|
|
|
|
|
static struct pbase_tree_cache *pbase_tree_get(const unsigned char *sha1)
|
|
|
|
{
|
|
|
|
struct pbase_tree_cache *ent, *nent;
|
|
|
|
void *data;
|
|
|
|
unsigned long size;
|
2007-02-26 20:55:59 +01:00
|
|
|
enum object_type type;
|
2006-04-06 08:24:57 +02:00
|
|
|
int neigh;
|
|
|
|
int my_ix = pbase_tree_cache_ix(sha1);
|
|
|
|
int available_ix = -1;
|
|
|
|
|
|
|
|
/* pbase-tree-cache acts as a limited hashtable.
|
|
|
|
* your object will be found at your index or within a few
|
|
|
|
* slots after that slot if it is cached.
|
|
|
|
*/
|
|
|
|
for (neigh = 0; neigh < 8; neigh++) {
|
|
|
|
ent = pbase_tree_cache[my_ix];
|
2006-08-17 20:54:57 +02:00
|
|
|
if (ent && !hashcmp(ent->sha1, sha1)) {
|
2006-04-06 08:24:57 +02:00
|
|
|
ent->ref++;
|
|
|
|
return ent;
|
|
|
|
}
|
|
|
|
else if (((available_ix < 0) && (!ent || !ent->ref)) ||
|
|
|
|
((0 <= available_ix) &&
|
|
|
|
(!ent && pbase_tree_cache[available_ix])))
|
|
|
|
available_ix = my_ix;
|
|
|
|
if (!ent)
|
|
|
|
break;
|
|
|
|
my_ix = pbase_tree_cache_ix_incr(my_ix);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Did not find one. Either we got a bogus request or
|
|
|
|
* we need to read and perhaps cache.
|
|
|
|
*/
|
2007-02-26 20:55:59 +01:00
|
|
|
data = read_sha1_file(sha1, &type, &size);
|
2006-04-06 08:24:57 +02:00
|
|
|
if (!data)
|
|
|
|
return NULL;
|
2007-02-26 20:55:59 +01:00
|
|
|
if (type != OBJ_TREE) {
|
2006-04-06 08:24:57 +02:00
|
|
|
free(data);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We need to either cache or return a throwaway copy */
|
|
|
|
|
|
|
|
if (available_ix < 0)
|
|
|
|
ent = NULL;
|
|
|
|
else {
|
|
|
|
ent = pbase_tree_cache[available_ix];
|
|
|
|
my_ix = available_ix;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!ent) {
|
|
|
|
nent = xmalloc(sizeof(*nent));
|
|
|
|
nent->temporary = (available_ix < 0);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* evict and reuse */
|
|
|
|
free(ent->tree_data);
|
|
|
|
nent = ent;
|
|
|
|
}
|
2006-08-23 08:49:00 +02:00
|
|
|
hashcpy(nent->sha1, sha1);
|
2006-04-06 08:24:57 +02:00
|
|
|
nent->tree_data = data;
|
|
|
|
nent->tree_size = size;
|
|
|
|
nent->ref = 1;
|
|
|
|
if (!nent->temporary)
|
|
|
|
pbase_tree_cache[my_ix] = nent;
|
|
|
|
return nent;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pbase_tree_put(struct pbase_tree_cache *cache)
|
|
|
|
{
|
|
|
|
if (!cache->temporary) {
|
|
|
|
cache->ref--;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
free(cache->tree_data);
|
|
|
|
free(cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Length of the leading path component of name: the number of
 * characters before the first '/', newline or terminating NUL.
 */
static int name_cmp_len(const char *name)
{
	int len = 0;

	while (name[len] != '\0') {
		if (name[len] == '\n' || name[len] == '/')
			break;
		len++;
	}
	return len;
}
|
|
|
|
|
|
|
|
static void add_pbase_object(struct tree_desc *tree,
|
|
|
|
const char *name,
|
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in simplification for computing the sorting number
computation for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-05 21:03:31 +02:00
|
|
|
int cmplen,
|
|
|
|
const char *fullname)
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
{
|
tree_entry(): new tree-walking helper function
This adds a "tree_entry()" function that combines the common operation of
doing a "tree_entry_extract()" + "update_tree_entry()".
It also has a simplified calling convention, designed for simple loops
that traverse over a whole tree: the arguments are pointers to the tree
descriptor and a name_entry structure to fill in, and it returns a boolean
"true" if there was an entry left to be gotten in the tree.
This allows tree traversal with
struct tree_desc desc;
struct name_entry entry;
desc.buf = tree->buffer;
desc.size = tree->size;
while (tree_entry(&desc, &entry) {
... use "entry.{path, sha1, mode, pathlen}" ...
}
which is not only shorter than writing it out in full, it's hopefully less
error prone too.
[ It's actually a tad faster too - we don't need to recalculate the entry
pathlength in both extract and update, but need to do it only once.
Also, some callers can avoid doing a "strlen()" on the result, since
it's returned as part of the name_entry structure.
However, by now we're talking just 1% speedup on "git-rev-list --objects
--all", and we're definitely at the point where tree walking is no
longer the issue any more. ]
NOTE! Not everybody wants to use this new helper function, since some of
the tree walkers very much on purpose do the descriptor update separately
from the entry extraction. So the "extract + update" sequence still
remains as the core sequence, this is just a simplified interface.
We should probably add a silly two-line inline helper function for
initializing the descriptor from the "struct tree" too, just to cut down
on the noise from that common "desc" initializer.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-05-30 18:45:45 +02:00
|
|
|
struct name_entry entry;
|
2007-04-16 18:28:10 +02:00
|
|
|
int cmp;
|
tree_entry(): new tree-walking helper function
This adds a "tree_entry()" function that combines the common operation of
doing a "tree_entry_extract()" + "update_tree_entry()".
It also has a simplified calling convention, designed for simple loops
that traverse over a whole tree: the arguments are pointers to the tree
descriptor and a name_entry structure to fill in, and it returns a boolean
"true" if there was an entry left to be gotten in the tree.
This allows tree traversal with
struct tree_desc desc;
struct name_entry entry;
desc.buf = tree->buffer;
desc.size = tree->size;
while (tree_entry(&desc, &entry) {
... use "entry.{path, sha1, mode, pathlen}" ...
}
which is not only shorter than writing it out in full, it's hopefully less
error prone too.
[ It's actually a tad faster too - we don't need to recalculate the entry
pathlength in both extract and update, but need to do it only once.
Also, some callers can avoid doing a "strlen()" on the result, since
it's returned as part of the name_entry structure.
However, by now we're talking just 1% speedup on "git-rev-list --objects
--all", and we're definitely at the point where tree walking is no
longer the issue any more. ]
NOTE! Not everybody wants to use this new helper function, since some of
the tree walkers very much on purpose do the descriptor update separately
from the entry extraction. So the "extract + update" sequence still
remains as the core sequence, this is just a simplified interface.
We should probably add a silly two-line inline helper function for
initializing the descriptor from the "struct tree" too, just to cut down
on the noise from that common "desc" initializer.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-05-30 18:45:45 +02:00
|
|
|
|
|
|
|
while (tree_entry(tree,&entry)) {
|
2007-04-16 18:28:10 +02:00
|
|
|
cmp = tree_entry_len(entry.path, entry.sha1) != cmplen ? 1 :
|
|
|
|
memcmp(name, entry.path, cmplen);
|
|
|
|
if (cmp > 0)
|
2006-02-19 23:47:21 +01:00
|
|
|
continue;
|
2007-04-16 18:28:10 +02:00
|
|
|
if (cmp < 0)
|
|
|
|
return;
|
2006-04-06 08:24:57 +02:00
|
|
|
if (name[cmplen] != '/') {
|
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in simplification for computing the sorting number
computation for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-05 21:03:31 +02:00
|
|
|
unsigned hash = name_hash(fullname);
|
tree_entry(): new tree-walking helper function
This adds a "tree_entry()" function that combines the common operation of
doing a "tree_entry_extract()" + "update_tree_entry()".
It also has a simplified calling convention, designed for simple loops
that traverse over a whole tree: the arguments are pointers to the tree
descriptor and a name_entry structure to fill in, and it returns a boolean
"true" if there was an entry left to be gotten in the tree.
This allows tree traversal with
struct tree_desc desc;
struct name_entry entry;
desc.buf = tree->buffer;
desc.size = tree->size;
while (tree_entry(&desc, &entry) {
... use "entry.{path, sha1, mode, pathlen}" ...
}
which is not only shorter than writing it out in full, it's hopefully less
error prone too.
[ It's actually a tad faster too - we don't need to recalculate the entry
pathlength in both extract and update, but need to do it only once.
Also, some callers can avoid doing a "strlen()" on the result, since
it's returned as part of the name_entry structure.
However, by now we're talking just 1% speedup on "git-rev-list --objects
--all", and we're definitely at the point where tree walking is no
longer the issue any more. ]
NOTE! Not everybody wants to use this new helper function, since some of
the tree walkers very much on purpose do the descriptor update separately
from the entry extraction. So the "extract + update" sequence still
remains as the core sequence, this is just a simplified interface.
We should probably add a silly two-line inline helper function for
initializing the descriptor from the "struct tree" too, just to cut down
on the noise from that common "desc" initializer.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-05-30 18:45:45 +02:00
|
|
|
add_object_entry(entry.sha1, hash, 1);
|
2006-04-06 08:24:57 +02:00
|
|
|
return;
|
|
|
|
}
|
2007-04-16 18:28:10 +02:00
|
|
|
if (S_ISDIR(entry.mode)) {
|
2006-02-19 23:47:21 +01:00
|
|
|
struct tree_desc sub;
|
2006-04-06 08:24:57 +02:00
|
|
|
struct pbase_tree_cache *tree;
|
|
|
|
const char *down = name+cmplen+1;
|
|
|
|
int downlen = name_cmp_len(down);
|
|
|
|
|
tree_entry(): new tree-walking helper function
This adds a "tree_entry()" function that combines the common operation of
doing a "tree_entry_extract()" + "update_tree_entry()".
It also has a simplified calling convention, designed for simple loops
that traverse over a whole tree: the arguments are pointers to the tree
descriptor and a name_entry structure to fill in, and it returns a boolean
"true" if there was an entry left to be gotten in the tree.
This allows tree traversal with
struct tree_desc desc;
struct name_entry entry;
desc.buf = tree->buffer;
desc.size = tree->size;
while (tree_entry(&desc, &entry)) {
... use "entry.{path, sha1, mode, pathlen}" ...
}
which is not only shorter than writing it out in full, it's hopefully less
error prone too.
[ It's actually a tad faster too - we don't need to recalculate the entry
pathlength in both extract and update, but need to do it only once.
Also, some callers can avoid doing a "strlen()" on the result, since
it's returned as part of the name_entry structure.
However, by now we're talking just 1% speedup on "git-rev-list --objects
--all", and we're definitely at the point where tree walking is no
longer the issue any more. ]
NOTE! Not everybody wants to use this new helper function, since some of
the tree walkers very much on purpose do the descriptor update separately
from the entry extraction. So the "extract + update" sequence still
remains as the core sequence, this is just a simplified interface.
We should probably add a silly two-line inline helper function for
initializing the descriptor from the "struct tree" too, just to cut down
on the noise from that common "desc" initializer.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-05-30 18:45:45 +02:00
|
|
|
tree = pbase_tree_get(entry.sha1);
|
2006-04-06 08:24:57 +02:00
|
|
|
if (!tree)
|
|
|
|
return;
|
2007-03-21 18:08:25 +01:00
|
|
|
init_tree_desc(&sub, tree->tree_data, tree->tree_size);
|
2006-04-06 08:24:57 +02:00
|
|
|
|
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in simplification for computing the sorting number
for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-05 21:03:31 +02:00
|
|
|
add_pbase_object(&sub, down, downlen, fullname);
|
2006-04-06 08:24:57 +02:00
|
|
|
pbase_tree_put(tree);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-02-23 07:10:24 +01:00
|
|
|
|
2006-04-06 08:24:57 +02:00
|
|
|
/*
 * Table of hash values already recorded by check_pbase_path().
 * done_pbase_paths_num is the number of entries in use and
 * done_pbase_paths_alloc the number of entries allocated.  The search
 * below relies on the entries being sorted in descending order.
 */
static unsigned *done_pbase_paths;
static int done_pbase_paths_num;
static int done_pbase_paths_alloc;

/*
 * Binary-search done_pbase_paths[] for "hash".
 *
 * Returns the index of the matching entry (>= 0) when found.  When not
 * found, returns (-pos - 1) where "pos" is the index at which "hash"
 * would have to be inserted to keep the table in descending order; the
 * result is therefore always negative in that case.
 *
 * The table is not dereferenced when done_pbase_paths_num is 0.
 */
static int done_pbase_path_pos(unsigned hash)
{
	int lo = 0;
	int hi = done_pbase_paths_num;

	while (lo < hi) {
		/*
		 * Written as lo + (hi - lo) / 2 rather than (hi + lo) / 2
		 * so that the sum cannot overflow a signed int.
		 */
		int mi = lo + (hi - lo) / 2;

		if (done_pbase_paths[mi] == hash)
			return mi;
		if (done_pbase_paths[mi] < hash)
			hi = mi;	/* larger values live to the left */
		else
			lo = mi + 1;
	}
	return -lo-1;
}
|
|
|
|
|
|
|
|
static int check_pbase_path(unsigned hash)
|
|
|
|
{
|
|
|
|
int pos = (!done_pbase_paths) ? -1 : done_pbase_path_pos(hash);
|
|
|
|
if (0 <= pos)
|
|
|
|
return 1;
|
|
|
|
pos = -pos - 1;
|
|
|
|
if (done_pbase_paths_alloc <= done_pbase_paths_num) {
|
|
|
|
done_pbase_paths_alloc = alloc_nr(done_pbase_paths_alloc);
|
|
|
|
done_pbase_paths = xrealloc(done_pbase_paths,
|
|
|
|
done_pbase_paths_alloc *
|
|
|
|
sizeof(unsigned));
|
|
|
|
}
|
|
|
|
done_pbase_paths_num++;
|
|
|
|
if (pos < done_pbase_paths_num)
|
|
|
|
memmove(done_pbase_paths + pos + 1,
|
|
|
|
done_pbase_paths + pos,
|
|
|
|
(done_pbase_paths_num - pos - 1) * sizeof(unsigned));
|
|
|
|
done_pbase_paths[pos] = hash;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-09-06 10:42:23 +02:00
|
|
|
static void add_preferred_base_object(const char *name, unsigned hash)
|
2006-04-06 08:24:57 +02:00
|
|
|
{
|
|
|
|
struct pbase_tree *it;
|
2007-04-16 18:28:10 +02:00
|
|
|
int cmplen;
|
2006-04-06 08:24:57 +02:00
|
|
|
|
2007-04-16 18:28:10 +02:00
|
|
|
if (!num_preferred_base || check_pbase_path(hash))
|
2006-04-06 08:24:57 +02:00
|
|
|
return;
|
|
|
|
|
2007-04-16 18:28:10 +02:00
|
|
|
cmplen = name_cmp_len(name);
|
2006-04-06 08:24:57 +02:00
|
|
|
for (it = pbase_tree; it; it = it->next) {
|
|
|
|
if (cmplen == 0) {
|
2007-04-16 18:28:10 +02:00
|
|
|
add_object_entry(it->pcache.sha1, 0, 1);
|
2006-04-06 08:24:57 +02:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
struct tree_desc tree;
|
2007-03-21 18:08:25 +01:00
|
|
|
init_tree_desc(&tree, it->pcache.tree_data, it->pcache.tree_size);
|
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in simplification for computing the sorting number
computation for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-05 21:03:31 +02:00
|
|
|
add_pbase_object(&tree, name, cmplen, name);
|
2006-02-19 23:47:21 +01:00
|
|
|
}
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-02-19 23:47:21 +01:00
|
|
|
static void add_preferred_base(unsigned char *sha1)
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
{
|
2006-04-06 08:24:57 +02:00
|
|
|
struct pbase_tree *it;
|
|
|
|
void *data;
|
|
|
|
unsigned long size;
|
|
|
|
unsigned char tree_sha1[20];
|
2006-02-23 07:10:24 +01:00
|
|
|
|
2006-09-06 10:42:23 +02:00
|
|
|
if (window <= num_preferred_base++)
|
|
|
|
return;
|
|
|
|
|
2006-04-06 08:24:57 +02:00
|
|
|
data = read_object_with_reference(sha1, tree_type, &size, tree_sha1);
|
|
|
|
if (!data)
|
2006-02-19 23:47:21 +01:00
|
|
|
return;
|
2006-04-06 08:24:57 +02:00
|
|
|
|
|
|
|
for (it = pbase_tree; it; it = it->next) {
|
2006-08-17 20:54:57 +02:00
|
|
|
if (!hashcmp(it->pcache.sha1, tree_sha1)) {
|
2006-04-06 08:24:57 +02:00
|
|
|
free(data);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
it = xcalloc(1, sizeof(*it));
|
|
|
|
it->next = pbase_tree;
|
|
|
|
pbase_tree = it;
|
|
|
|
|
2006-08-23 08:49:00 +02:00
|
|
|
hashcpy(it->pcache.sha1, tree_sha1);
|
2006-04-06 08:24:57 +02:00
|
|
|
it->pcache.tree_data = data;
|
|
|
|
it->pcache.tree_size = size;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
|
|
|
|
2005-06-25 23:42:43 +02:00
|
|
|
static void check_object(struct object_entry *entry)
|
|
|
|
{
|
2006-02-19 23:47:21 +01:00
|
|
|
if (entry->in_pack && !entry->preferred_base) {
|
2006-09-23 03:25:04 +02:00
|
|
|
struct packed_git *p = entry->in_pack;
|
2006-12-23 08:34:08 +01:00
|
|
|
struct pack_window *w_curs = NULL;
|
2006-09-23 03:25:04 +02:00
|
|
|
unsigned long size, used;
|
2007-03-07 02:44:34 +01:00
|
|
|
unsigned int avail;
|
2006-09-23 03:25:04 +02:00
|
|
|
unsigned char *buf;
|
|
|
|
struct object_entry *base_entry = NULL;
|
|
|
|
|
2007-03-07 02:44:34 +01:00
|
|
|
buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
|
|
|
/* We want in_pack_type even if we do not reuse delta.
|
|
|
|
* There is no point not reusing non-delta representations.
|
|
|
|
*/
|
2007-03-07 02:44:34 +01:00
|
|
|
used = unpack_object_header_gently(buf, avail,
|
2006-09-23 03:25:04 +02:00
|
|
|
&entry->in_pack_type, &size);
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly matches
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
/* Check if it is delta, and the base is also an object
|
|
|
|
* we are going to pack. If so we will reuse the existing
|
|
|
|
* delta.
|
|
|
|
*/
|
2006-09-23 03:25:04 +02:00
|
|
|
if (!no_reuse_delta) {
|
2007-03-16 21:42:50 +01:00
|
|
|
unsigned char c;
|
|
|
|
const unsigned char *base_name;
|
2007-03-07 02:44:34 +01:00
|
|
|
off_t ofs;
|
2006-12-29 07:29:06 +01:00
|
|
|
unsigned long used_0;
|
2006-09-23 03:25:04 +02:00
|
|
|
/* there is at least 20 bytes left in the pack */
|
|
|
|
switch (entry->in_pack_type) {
|
|
|
|
case OBJ_REF_DELTA:
|
2006-12-27 08:46:23 +01:00
|
|
|
base_name = use_pack(p, &w_curs,
|
|
|
|
entry->in_pack_offset + used, NULL);
|
2006-09-23 03:25:04 +02:00
|
|
|
used += 20;
|
|
|
|
break;
|
|
|
|
case OBJ_OFS_DELTA:
|
2006-12-27 08:46:23 +01:00
|
|
|
buf = use_pack(p, &w_curs,
|
|
|
|
entry->in_pack_offset + used, NULL);
|
2006-12-29 07:29:06 +01:00
|
|
|
used_0 = 0;
|
|
|
|
c = buf[used_0++];
|
2006-09-23 03:25:04 +02:00
|
|
|
ofs = c & 127;
|
|
|
|
while (c & 128) {
|
|
|
|
ofs += 1;
|
2007-04-09 07:06:29 +02:00
|
|
|
if (!ofs || MSB(ofs, 7))
|
2006-09-23 03:25:04 +02:00
|
|
|
die("delta base offset overflow in pack for %s",
|
|
|
|
sha1_to_hex(entry->sha1));
|
2006-12-29 07:29:06 +01:00
|
|
|
c = buf[used_0++];
|
2006-09-23 03:25:04 +02:00
|
|
|
ofs = (ofs << 7) + (c & 127);
|
|
|
|
}
|
|
|
|
if (ofs >= entry->in_pack_offset)
|
|
|
|
die("delta base offset out of bound for %s",
|
|
|
|
sha1_to_hex(entry->sha1));
|
|
|
|
ofs = entry->in_pack_offset - ofs;
|
|
|
|
base_name = find_packed_object_name(p, ofs);
|
2006-12-29 07:29:06 +01:00
|
|
|
used += used_0;
|
2006-09-23 03:25:04 +02:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
base_name = NULL;
|
|
|
|
}
|
|
|
|
if (base_name)
|
|
|
|
base_entry = locate_object_entry(base_name);
|
|
|
|
}
|
2006-12-23 08:34:08 +01:00
|
|
|
unuse_pack(&w_curs);
|
2006-09-23 03:25:04 +02:00
|
|
|
entry->in_pack_header_size = used;
|
|
|
|
|
2006-09-27 21:42:16 +02:00
|
|
|
if (base_entry) {
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
|
|
|
/* Depth value does not matter - find_deltas()
|
|
|
|
* will never consider reused delta as the
|
|
|
|
* base object to deltify other objects
|
|
|
|
* against, in order to avoid circular deltas.
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly matches
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
*/
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
|
|
|
/* uncompressed size of the delta data */
|
2006-09-23 03:25:04 +02:00
|
|
|
entry->size = size;
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly matches
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
entry->delta = base_entry;
|
2006-09-23 03:25:04 +02:00
|
|
|
entry->type = entry->in_pack_type;
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
2006-02-18 05:58:45 +01:00
|
|
|
entry->delta_sibling = base_entry->delta_child;
|
|
|
|
base_entry->delta_child = entry;
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Otherwise we would do the usual */
|
2005-06-27 12:34:06 +02:00
|
|
|
}
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
2007-02-26 20:55:59 +01:00
|
|
|
entry->type = sha1_object_info(entry->sha1, &entry->size);
|
|
|
|
if (entry->type < 0)
|
2005-06-27 12:34:06 +02:00
|
|
|
die("unable to get type of object %s",
|
|
|
|
sha1_to_hex(entry->sha1));
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
}
|
|
|
|
|
2005-06-25 23:42:43 +02:00
|
|
|
static void get_object_details(void)
|
|
|
|
{
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i;
|
2006-02-18 05:58:45 +01:00
|
|
|
struct object_entry *entry;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
prepare_pack_ix();
|
2006-02-18 05:58:45 +01:00
|
|
|
for (i = 0, entry = objects; i < nr_objects; i++, entry++)
|
|
|
|
check_object(entry);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Three-way comparison between two object entries. */
typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);

/* The comparison that sort_comparator() forwards to; set before qsort(). */
static entry_sort_t current_sort;

/*
 * qsort() callback for arrays whose elements are "struct object_entry *".
 * Each argument points at one array slot; the slot is dereferenced and
 * the two entries are handed to current_sort.
 */
static int sort_comparator(const void *_a, const void *_b)
{
	struct object_entry *const *left_slot = _a;
	struct object_entry *const *right_slot = _b;

	return current_sort(*left_slot, *right_slot);
}
|
|
|
|
|
|
|
|
/*
 * Build a freshly allocated array holding a pointer to every entry of
 * objects[] (nr_objects of them), ordered by the given comparison.
 *
 * The comparison is installed in current_sort as a side effect.
 * The returned array comes from xmalloc().
 */
static struct object_entry **create_sorted_list(entry_sort_t sort)
{
	struct object_entry **sorted = xmalloc(nr_objects * sizeof(*sorted));
	uint32_t pos = 0;

	while (pos < nr_objects) {
		sorted[pos] = &objects[pos];
		pos++;
	}

	current_sort = sort;
	qsort(sorted, nr_objects, sizeof(*sorted), sort_comparator);
	return sorted;
}
|
|
|
|
|
|
|
|
static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
|
|
|
|
{
|
2006-08-17 20:54:57 +02:00
|
|
|
return hashcmp(a->sha1, b->sha1);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
2006-02-26 16:13:46 +01:00
|
|
|
static struct object_entry **create_final_object_list(void)
|
2006-02-19 23:47:21 +01:00
|
|
|
{
|
|
|
|
struct object_entry **list;
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i, j;
|
2006-02-19 23:47:21 +01:00
|
|
|
|
|
|
|
for (i = nr_result = 0; i < nr_objects; i++)
|
|
|
|
if (!objects[i].preferred_base)
|
|
|
|
nr_result++;
|
|
|
|
list = xmalloc(nr_result * sizeof(struct object_entry *));
|
|
|
|
for (i = j = 0; i < nr_objects; i++) {
|
|
|
|
if (!objects[i].preferred_base)
|
|
|
|
list[j++] = objects + i;
|
|
|
|
}
|
|
|
|
current_sort = sha1_sort;
|
|
|
|
qsort(list, nr_result, sizeof(struct object_entry *), sort_comparator);
|
|
|
|
return list;
|
|
|
|
}
|
|
|
|
|
2005-06-25 23:42:43 +02:00
|
|
|
static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
|
|
|
|
{
|
|
|
|
if (a->type < b->type)
|
|
|
|
return -1;
|
|
|
|
if (a->type > b->type)
|
|
|
|
return 1;
|
2005-06-27 00:27:28 +02:00
|
|
|
if (a->hash < b->hash)
|
|
|
|
return -1;
|
|
|
|
if (a->hash > b->hash)
|
|
|
|
return 1;
|
2006-02-19 23:47:21 +01:00
|
|
|
if (a->preferred_base < b->preferred_base)
|
|
|
|
return -1;
|
|
|
|
if (a->preferred_base > b->preferred_base)
|
|
|
|
return 1;
|
2005-06-25 23:42:43 +02:00
|
|
|
if (a->size < b->size)
|
|
|
|
return -1;
|
|
|
|
if (a->size > b->size)
|
|
|
|
return 1;
|
2007-04-16 18:28:52 +02:00
|
|
|
return a > b ? -1 : (a < b); /* newest last */
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * One slot of the delta search window used by find_deltas().
 */
struct unpacked {
	/* Object occupying this slot; NULL while the slot is unused. */
	struct object_entry *entry;

	/* Object contents, loaded on demand by try_delta(); may be NULL. */
	void *data;

	/* Delta index over data, built on demand by try_delta(); may be NULL. */
	struct delta_index *index;
};
|
|
|
|
|
|
|
|
/*
|
2005-06-26 22:43:41 +02:00
|
|
|
* We search for deltas _backwards_ in a list sorted by type and
|
|
|
|
* by size, so that we see progressively smaller and smaller files.
|
|
|
|
* That's because we prefer deltas to be from the bigger file
|
|
|
|
* to the smaller - deletes are potentially cheaper, but perhaps
|
|
|
|
* more importantly, the bigger file is likely the more recent
|
|
|
|
* one.
|
2005-06-25 23:42:43 +02:00
|
|
|
*/
|
2006-04-27 05:58:00 +02:00
|
|
|
static int try_delta(struct unpacked *trg, struct unpacked *src,
|
2006-07-01 04:55:30 +02:00
|
|
|
unsigned max_depth)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
2006-04-27 05:58:00 +02:00
|
|
|
struct object_entry *trg_entry = trg->entry;
|
|
|
|
struct object_entry *src_entry = src->entry;
|
2006-07-01 04:55:30 +02:00
|
|
|
unsigned long trg_size, src_size, delta_size, sizediff, max_size, sz;
|
2007-02-26 20:55:59 +01:00
|
|
|
enum object_type type;
|
2005-06-25 23:42:43 +02:00
|
|
|
void *delta_buf;
|
|
|
|
|
|
|
|
/* Don't bother doing diffs between different types */
|
2006-04-27 05:58:00 +02:00
|
|
|
if (trg_entry->type != src_entry->type)
|
2005-06-25 23:42:43 +02:00
|
|
|
return -1;
|
|
|
|
|
2006-02-19 23:47:21 +01:00
|
|
|
/* We do not compute delta to *create* objects we are not
|
|
|
|
* going to pack.
|
|
|
|
*/
|
2006-04-27 05:58:00 +02:00
|
|
|
if (trg_entry->preferred_base)
|
2005-06-26 04:30:20 +02:00
|
|
|
return -1;
|
2006-02-19 23:47:21 +01:00
|
|
|
|
2006-06-29 23:04:01 +02:00
|
|
|
/*
|
|
|
|
* We do not bother to try a delta that we discarded
|
2006-06-30 05:44:52 +02:00
|
|
|
* on an earlier try, but only when reusing delta data.
|
2006-06-29 23:04:01 +02:00
|
|
|
*/
|
2006-06-30 05:44:52 +02:00
|
|
|
if (!no_reuse_delta && trg_entry->in_pack &&
|
2006-11-15 07:18:31 +01:00
|
|
|
trg_entry->in_pack == src_entry->in_pack &&
|
|
|
|
trg_entry->in_pack_type != OBJ_REF_DELTA &&
|
|
|
|
trg_entry->in_pack_type != OBJ_OFS_DELTA)
|
2006-06-29 23:04:01 +02:00
|
|
|
return 0;
|
|
|
|
|
2007-04-16 18:29:16 +02:00
|
|
|
/* Let's not bust the allowed depth. */
|
2006-04-27 05:58:00 +02:00
|
|
|
if (src_entry->depth >= max_depth)
|
2005-06-26 05:17:59 +02:00
|
|
|
return 0;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
2006-05-16 22:29:14 +02:00
|
|
|
/* Now some size filtering heuristics. */
|
2006-07-01 04:55:30 +02:00
|
|
|
trg_size = trg_entry->size;
|
|
|
|
max_size = trg_size/2 - 20;
|
2006-05-16 22:29:14 +02:00
|
|
|
max_size = max_size * (max_depth - src_entry->depth) / max_depth;
|
|
|
|
if (max_size == 0)
|
|
|
|
return 0;
|
2006-05-15 17:40:05 +02:00
|
|
|
if (trg_entry->delta && trg_entry->delta_size <= max_size)
|
2006-04-27 05:58:00 +02:00
|
|
|
max_size = trg_entry->delta_size-1;
|
|
|
|
src_size = src_entry->size;
|
2006-07-01 04:55:30 +02:00
|
|
|
sizediff = src_size < trg_size ? trg_size - src_size : 0;
|
2005-06-27 00:27:28 +02:00
|
|
|
if (sizediff >= max_size)
|
2006-04-21 08:36:22 +02:00
|
|
|
return 0;
|
2006-04-27 05:58:00 +02:00
|
|
|
|
2006-07-01 04:55:30 +02:00
|
|
|
/* Load data if not already done */
|
|
|
|
if (!trg->data) {
|
2007-02-26 20:55:59 +01:00
|
|
|
trg->data = read_sha1_file(trg_entry->sha1, &type, &sz);
|
2006-07-01 04:55:30 +02:00
|
|
|
if (sz != trg_size)
|
|
|
|
die("object %s inconsistent object length (%lu vs %lu)",
|
|
|
|
sha1_to_hex(trg_entry->sha1), sz, trg_size);
|
|
|
|
}
|
|
|
|
if (!src->data) {
|
2007-02-26 20:55:59 +01:00
|
|
|
src->data = read_sha1_file(src_entry->sha1, &type, &sz);
|
2006-07-01 04:55:30 +02:00
|
|
|
if (sz != src_size)
|
|
|
|
die("object %s inconsistent object length (%lu vs %lu)",
|
|
|
|
sha1_to_hex(src_entry->sha1), sz, src_size);
|
|
|
|
}
|
|
|
|
if (!src->index) {
|
|
|
|
src->index = create_delta_index(src->data, src_size);
|
|
|
|
if (!src->index)
|
|
|
|
die("out of memory");
|
|
|
|
}
|
|
|
|
|
|
|
|
delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
|
2005-06-25 23:42:43 +02:00
|
|
|
if (!delta_buf)
|
2005-06-26 04:30:20 +02:00
|
|
|
return 0;
|
2006-04-27 05:58:00 +02:00
|
|
|
|
|
|
|
trg_entry->delta = src_entry;
|
|
|
|
trg_entry->delta_size = delta_size;
|
|
|
|
trg_entry->depth = src_entry->depth + 1;
|
2005-06-25 23:42:43 +02:00
|
|
|
free(delta_buf);
|
2006-04-27 05:58:00 +02:00
|
|
|
return 1;
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
2007-04-16 18:29:16 +02:00
|
|
|
static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
|
2006-02-22 22:00:08 +01:00
|
|
|
{
|
2007-04-16 18:29:16 +02:00
|
|
|
struct object_entry *child = me->delta_child;
|
|
|
|
unsigned int m = n;
|
|
|
|
while (child) {
|
|
|
|
unsigned int c = check_delta_limit(child, n + 1);
|
|
|
|
if (m < c)
|
|
|
|
m = c;
|
|
|
|
child = child->delta_sibling;
|
|
|
|
}
|
|
|
|
return m;
|
2006-02-22 22:00:08 +01:00
|
|
|
}
|
|
|
|
|
2005-06-26 05:17:59 +02:00
|
|
|
static void find_deltas(struct object_entry **list, int window, int depth)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i = nr_objects, idx = 0, processed = 0;
|
2005-06-25 23:42:43 +02:00
|
|
|
unsigned int array_size = window * sizeof(struct unpacked);
|
2007-03-07 02:44:24 +01:00
|
|
|
struct unpacked *array;
|
2006-02-23 01:02:59 +01:00
|
|
|
unsigned last_percent = 999;
|
2007-04-16 18:29:16 +02:00
|
|
|
int max_depth;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
2007-03-07 02:44:24 +01:00
|
|
|
if (!nr_objects)
|
|
|
|
return;
|
|
|
|
array = xmalloc(array_size);
|
2005-06-25 23:42:43 +02:00
|
|
|
memset(array, 0, array_size);
|
2006-02-23 01:02:59 +01:00
|
|
|
if (progress)
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Deltifying %u objects.\n", nr_result);
|
2006-02-12 02:54:18 +01:00
|
|
|
|
2007-03-07 02:44:24 +01:00
|
|
|
do {
|
|
|
|
struct object_entry *entry = list[--i];
|
2005-06-25 23:42:43 +02:00
|
|
|
struct unpacked *n = array + idx;
|
|
|
|
int j;
|
|
|
|
|
2006-02-25 06:55:23 +01:00
|
|
|
if (!entry->preferred_base)
|
|
|
|
processed++;
|
|
|
|
|
2006-02-23 01:02:59 +01:00
|
|
|
if (progress) {
|
2006-02-25 06:55:23 +01:00
|
|
|
unsigned percent = processed * 100 / nr_result;
|
2006-02-23 01:02:59 +01:00
|
|
|
if (percent != last_percent || progress_update) {
|
|
|
|
fprintf(stderr, "%4u%% (%u/%u) done\r",
|
2006-02-25 06:55:23 +01:00
|
|
|
percent, processed, nr_result);
|
2006-02-23 01:02:59 +01:00
|
|
|
progress_update = 0;
|
|
|
|
last_percent = percent;
|
|
|
|
}
|
2006-02-12 02:54:18 +01:00
|
|
|
}
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
|
|
|
|
if (entry->delta)
|
|
|
|
/* This happens if we decided to reuse existing
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
* delta from a pack. "!no_reuse_delta &&" is implied.
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
|
2006-04-28 04:31:46 +02:00
|
|
|
if (entry->size < 50)
|
|
|
|
continue;
|
2006-05-15 19:47:16 +02:00
|
|
|
free_delta_index(n->index);
|
|
|
|
n->index = NULL;
|
2005-06-25 23:42:43 +02:00
|
|
|
free(n->data);
|
2006-07-01 04:55:30 +02:00
|
|
|
n->data = NULL;
|
2005-06-25 23:42:43 +02:00
|
|
|
n->entry = entry;
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
|
2007-04-16 18:29:16 +02:00
|
|
|
/*
|
|
|
|
* If the current object is at pack edge, take the depth the
|
|
|
|
* objects that depend on the current object into account
|
|
|
|
* otherwise they would become too deep.
|
|
|
|
*/
|
|
|
|
max_depth = depth;
|
|
|
|
if (entry->delta_child) {
|
|
|
|
max_depth -= check_delta_limit(entry, 0);
|
|
|
|
if (max_depth <= 0)
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
2005-06-26 03:29:23 +02:00
|
|
|
j = window;
|
|
|
|
while (--j > 0) {
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t other_idx = idx + j;
|
2005-06-25 23:42:43 +02:00
|
|
|
struct unpacked *m;
|
2005-06-26 03:29:23 +02:00
|
|
|
if (other_idx >= window)
|
|
|
|
other_idx -= window;
|
2005-06-25 23:42:43 +02:00
|
|
|
m = array + other_idx;
|
|
|
|
if (!m->entry)
|
|
|
|
break;
|
2007-04-16 18:29:16 +02:00
|
|
|
if (try_delta(n, m, max_depth) < 0)
|
2005-06-25 23:42:43 +02:00
|
|
|
break;
|
|
|
|
}
|
2007-04-16 18:29:16 +02:00
|
|
|
|
2006-03-05 20:22:57 +01:00
|
|
|
/* if we made n a delta, and if n is already at max
|
|
|
|
* depth, leaving it in the window is pointless. we
|
|
|
|
* should evict it first.
|
|
|
|
*/
|
|
|
|
if (entry->delta && depth <= entry->depth)
|
|
|
|
continue;
|
2006-05-15 19:47:16 +02:00
|
|
|
|
2007-04-16 18:29:16 +02:00
|
|
|
next:
|
2005-06-26 22:43:41 +02:00
|
|
|
idx++;
|
|
|
|
if (idx >= window)
|
|
|
|
idx = 0;
|
2007-03-07 02:44:24 +01:00
|
|
|
} while (i > 0);
|
2005-08-08 20:46:58 +02:00
|
|
|
|
2006-02-22 22:00:08 +01:00
|
|
|
if (progress)
|
|
|
|
fputc('\n', stderr);
|
|
|
|
|
2006-04-27 05:58:00 +02:00
|
|
|
for (i = 0; i < window; ++i) {
|
2006-05-15 19:47:16 +02:00
|
|
|
free_delta_index(array[i].index);
|
2005-08-08 20:46:58 +02:00
|
|
|
free(array[i].data);
|
2006-04-27 05:58:00 +02:00
|
|
|
}
|
2005-08-08 20:46:58 +02:00
|
|
|
free(array);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
|
|
|
|
2005-10-22 10:28:13 +02:00
|
|
|
static void prepare_pack(int window, int depth)
|
|
|
|
{
|
pack-objects: reuse data from existing packs.
When generating a new pack, notice if we have already needed
objects in existing packs. If an object is stored deltified,
and its base object is also what we are going to pack, then
reuse the existing deltified representation unconditionally,
bypassing all the expensive find_deltas() and try_deltas()
calls.
Also, notice if what we are going to write out exactly match
what is already in an existing pack (either deltified or just
compressed). In such a case, we can just copy it instead of
going through the usual uncompressing & recompressing cycle.
Without this patch, in linux-2.6 repository with about 1500
loose objects and a single mega pack:
$ git-rev-list --objects v2.6.16-rc3 >RL
$ wc -l RL
184141 RL
$ time git-pack-objects p <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
real 12m4.323s
user 11m2.560s
sys 0m55.950s
With this patch, the same input:
$ time ../git.junio/git-pack-objects q <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
a1fc7b3e537fcb9b3c46b7505df859f0a11e79d2
Total 184141, written 184141, reused 182441
real 1m2.608s
user 0m55.090s
sys 0m1.830s
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 02:34:29 +01:00
|
|
|
get_object_details();
|
2005-10-22 10:28:13 +02:00
|
|
|
sorted_by_type = create_sorted_list(type_size_sort);
|
|
|
|
if (window && depth)
|
|
|
|
find_deltas(sorted_by_type, window+1, depth);
|
|
|
|
}
|
|
|
|
|
2006-09-02 00:05:12 +02:00
|
|
|
static int reuse_cached_pack(unsigned char *sha1)
|
2005-10-22 10:28:13 +02:00
|
|
|
{
|
|
|
|
static const char cache[] = "pack-cache/pack-%s.%s";
|
|
|
|
char *cached_pack, *cached_idx;
|
|
|
|
int ifd, ofd, ifd_ix = -1;
|
|
|
|
|
|
|
|
cached_pack = git_path(cache, sha1_to_hex(sha1), "pack");
|
|
|
|
ifd = open(cached_pack, O_RDONLY);
|
|
|
|
if (ifd < 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!pack_to_stdout) {
|
|
|
|
cached_idx = git_path(cache, sha1_to_hex(sha1), "idx");
|
|
|
|
ifd_ix = open(cached_idx, O_RDONLY);
|
|
|
|
if (ifd_ix < 0) {
|
|
|
|
close(ifd);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
if (progress)
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Reusing %u objects pack %s\n", nr_objects,
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
sha1_to_hex(sha1));
|
2005-10-22 10:28:13 +02:00
|
|
|
|
|
|
|
if (pack_to_stdout) {
|
|
|
|
if (copy_fd(ifd, 1))
|
|
|
|
exit(1);
|
|
|
|
close(ifd);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
char name[PATH_MAX];
|
|
|
|
snprintf(name, sizeof(name),
|
|
|
|
"%s-%s.%s", base_name, sha1_to_hex(sha1), "pack");
|
|
|
|
ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
|
|
|
|
if (ofd < 0)
|
|
|
|
die("unable to open %s (%s)", name, strerror(errno));
|
|
|
|
if (copy_fd(ifd, ofd))
|
|
|
|
exit(1);
|
|
|
|
close(ifd);
|
|
|
|
|
|
|
|
snprintf(name, sizeof(name),
|
|
|
|
"%s-%s.%s", base_name, sha1_to_hex(sha1), "idx");
|
|
|
|
ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
|
|
|
|
if (ofd < 0)
|
|
|
|
die("unable to open %s (%s)", name, strerror(errno));
|
|
|
|
if (copy_fd(ifd_ix, ofd))
|
|
|
|
exit(1);
|
|
|
|
close(ifd_ix);
|
|
|
|
puts(sha1_to_hex(sha1));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2007-04-16 18:29:16 +02:00
|
|
|
static void progress_interval(int signum)
|
|
|
|
{
|
|
|
|
progress_update = 1;
|
|
|
|
}
|
|
|
|
|
2006-04-02 22:28:27 +02:00
|
|
|
static void setup_progress_signal(void)
|
|
|
|
{
|
|
|
|
struct sigaction sa;
|
|
|
|
struct itimerval v;
|
|
|
|
|
|
|
|
memset(&sa, 0, sizeof(sa));
|
|
|
|
sa.sa_handler = progress_interval;
|
|
|
|
sigemptyset(&sa.sa_mask);
|
|
|
|
sa.sa_flags = SA_RESTART;
|
|
|
|
sigaction(SIGALRM, &sa, NULL);
|
|
|
|
|
|
|
|
v.it_interval.tv_sec = 1;
|
|
|
|
v.it_interval.tv_usec = 0;
|
|
|
|
v.it_value = v.it_interval;
|
|
|
|
setitimer(ITIMER_REAL, &v, NULL);
|
|
|
|
}
|
|
|
|
|
2006-07-23 07:50:30 +02:00
|
|
|
static int git_pack_config(const char *k, const char *v)
|
|
|
|
{
|
|
|
|
if(!strcmp(k, "pack.window")) {
|
|
|
|
window = git_config_int(k, v);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return git_default_config(k, v);
|
|
|
|
}
|
|
|
|
|
2006-09-05 08:47:39 +02:00
|
|
|
static void read_object_list_from_stdin(void)
|
2005-06-25 23:42:43 +02:00
|
|
|
{
|
2006-09-05 08:47:39 +02:00
|
|
|
char line[40 + 1 + PATH_MAX + 2];
|
|
|
|
unsigned char sha1[20];
|
|
|
|
unsigned hash;
|
2006-02-22 22:00:08 +01:00
|
|
|
|
2006-04-02 22:31:54 +02:00
|
|
|
for (;;) {
|
|
|
|
if (!fgets(line, sizeof(line), stdin)) {
|
|
|
|
if (feof(stdin))
|
|
|
|
break;
|
|
|
|
if (!ferror(stdin))
|
|
|
|
die("fgets returned NULL, not EOF, not error!");
|
2006-04-04 08:41:09 +02:00
|
|
|
if (errno != EINTR)
|
|
|
|
die("fgets: %s", strerror(errno));
|
|
|
|
clearerr(stdin);
|
|
|
|
continue;
|
2006-04-02 22:31:54 +02:00
|
|
|
}
|
2006-02-19 23:47:21 +01:00
|
|
|
if (line[0] == '-') {
|
|
|
|
if (get_sha1_hex(line+1, sha1))
|
|
|
|
die("expected edge sha1, got garbage:\n %s",
|
2006-09-05 08:47:39 +02:00
|
|
|
line);
|
2006-09-06 10:42:23 +02:00
|
|
|
add_preferred_base(sha1);
|
2006-02-19 23:47:21 +01:00
|
|
|
continue;
|
2006-02-12 02:54:18 +01:00
|
|
|
}
|
2005-06-25 23:42:43 +02:00
|
|
|
if (get_sha1_hex(line, sha1))
|
2005-11-21 21:38:31 +01:00
|
|
|
die("expected sha1, got garbage:\n %s", line);
|
2006-09-05 08:47:39 +02:00
|
|
|
|
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually
improves packing for both git and the kernel.
The git archive pack shrinks from 6824090->6622627 bytes (a 3%
improvement), and the kernel pack shrinks from 108756213 to 108219021 (a
mere 0.5% improvement, but still, it's an improvement from making the
hashing much simpler!)
We just create a 32-bit hash, where we "age" previous characters by two
bits, so the last characters in a filename count most. So when we then
compare the hashes in the sort routine, filenames that end the same way
sort the same way.
It takes the subdirectory into account (unless the filename is > 16
characters), but files with the same name within the same subdirectory
will obviously sort closer than files in different subdirectories.
And, incidentally (which is why I tried the hash change in the first
place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.
And no, it's not a "good hash" in the sense of being secure or unique, but
that's not what we're looking for. The whole "hash" thing is misnamed
here. It's not so much a hash as a "sorting number".
[jc: rolled in simplification for computing the sorting number
computation for thin pack base objects]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-06-05 21:03:31 +02:00
|
|
|
hash = name_hash(line+41);
|
2006-04-06 08:24:57 +02:00
|
|
|
add_preferred_base_object(line+41, hash);
|
|
|
|
add_object_entry(sha1, hash, 0);
|
2005-06-25 23:42:43 +02:00
|
|
|
}
|
2006-09-05 08:47:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void show_commit(struct commit *commit)
|
|
|
|
{
|
2007-04-16 18:28:10 +02:00
|
|
|
add_object_entry(commit->object.sha1, 0, 0);
|
2006-09-05 08:47:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void show_object(struct object_array_entry *p)
|
|
|
|
{
|
|
|
|
unsigned hash = name_hash(p->name);
|
2006-09-06 10:42:23 +02:00
|
|
|
add_preferred_base_object(p->name, hash);
|
2006-09-05 08:47:39 +02:00
|
|
|
add_object_entry(p->item->sha1, hash, 0);
|
|
|
|
}
|
|
|
|
|
2006-09-06 10:42:23 +02:00
|
|
|
static void show_edge(struct commit *commit)
|
|
|
|
{
|
|
|
|
add_preferred_base(commit->object.sha1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void get_object_list(int ac, const char **av)
|
2006-09-05 08:47:39 +02:00
|
|
|
{
|
|
|
|
struct rev_info revs;
|
|
|
|
char line[1000];
|
|
|
|
int flags = 0;
|
|
|
|
|
|
|
|
init_revisions(&revs, NULL);
|
|
|
|
save_commit_buffer = 0;
|
|
|
|
track_object_refs = 0;
|
|
|
|
setup_revisions(ac, av, &revs, NULL);
|
|
|
|
|
|
|
|
while (fgets(line, sizeof(line), stdin) != NULL) {
|
|
|
|
int len = strlen(line);
|
|
|
|
if (line[len - 1] == '\n')
|
|
|
|
line[--len] = 0;
|
|
|
|
if (!len)
|
|
|
|
break;
|
|
|
|
if (*line == '-') {
|
|
|
|
if (!strcmp(line, "--not")) {
|
|
|
|
flags ^= UNINTERESTING;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
die("not a rev '%s'", line);
|
|
|
|
}
|
|
|
|
if (handle_revision_arg(line, &revs, flags, 1))
|
|
|
|
die("bad revision '%s'", line);
|
|
|
|
}
|
|
|
|
|
|
|
|
prepare_revision_walk(&revs);
|
2006-09-06 10:42:23 +02:00
|
|
|
mark_edges_uninteresting(revs.commits, &revs, show_edge);
|
2006-09-05 08:47:39 +02:00
|
|
|
traverse_commit_list(&revs, show_commit, show_object);
|
|
|
|
}
|
|
|
|
|
|
|
|
int cmd_pack_objects(int argc, const char **argv, const char *prefix)
|
|
|
|
{
|
|
|
|
SHA_CTX ctx;
|
|
|
|
int depth = 10;
|
|
|
|
struct object_entry **list;
|
|
|
|
int use_internal_rev_list = 0;
|
2006-09-06 10:42:23 +02:00
|
|
|
int thin = 0;
|
2007-03-07 02:44:24 +01:00
|
|
|
uint32_t i;
|
2007-02-25 18:34:27 +01:00
|
|
|
const char **rp_av;
|
|
|
|
int rp_ac_alloc = 64;
|
2006-09-06 10:42:23 +02:00
|
|
|
int rp_ac;
|
|
|
|
|
2007-02-25 18:34:27 +01:00
|
|
|
rp_av = xcalloc(rp_ac_alloc, sizeof(*rp_av));
|
|
|
|
|
2006-09-06 10:42:23 +02:00
|
|
|
rp_av[0] = "pack-objects";
|
|
|
|
rp_av[1] = "--objects"; /* --thin will make it --objects-edge */
|
|
|
|
rp_ac = 2;
|
2006-09-05 08:47:39 +02:00
|
|
|
|
|
|
|
git_config(git_pack_config);
|
|
|
|
|
|
|
|
progress = isatty(2);
|
|
|
|
for (i = 1; i < argc; i++) {
|
|
|
|
const char *arg = argv[i];
|
|
|
|
|
|
|
|
if (*arg != '-')
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!strcmp("--non-empty", arg)) {
|
|
|
|
non_empty = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!strcmp("--local", arg)) {
|
|
|
|
local = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!strcmp("--incremental", arg)) {
|
|
|
|
incremental = 1;
|
|
|
|
continue;
|
|
|
|
}
|
2007-02-20 10:54:00 +01:00
|
|
|
if (!prefixcmp(arg, "--window=")) {
|
2006-09-05 08:47:39 +02:00
|
|
|
char *end;
|
|
|
|
window = strtoul(arg+9, &end, 0);
|
|
|
|
if (!arg[9] || *end)
|
|
|
|
usage(pack_usage);
|
|
|
|
continue;
|
|
|
|
}
|
2007-02-20 10:54:00 +01:00
|
|
|
if (!prefixcmp(arg, "--depth=")) {
|
2006-09-05 08:47:39 +02:00
|
|
|
char *end;
|
|
|
|
depth = strtoul(arg+8, &end, 0);
|
|
|
|
if (!arg[8] || *end)
|
|
|
|
usage(pack_usage);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!strcmp("--progress", arg)) {
|
|
|
|
progress = 1;
|
|
|
|
continue;
|
|
|
|
}
|
2006-11-07 16:51:23 +01:00
|
|
|
if (!strcmp("--all-progress", arg)) {
|
|
|
|
progress = 2;
|
|
|
|
continue;
|
|
|
|
}
|
2006-09-05 08:47:39 +02:00
|
|
|
if (!strcmp("-q", arg)) {
|
|
|
|
progress = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!strcmp("--no-reuse-delta", arg)) {
|
|
|
|
no_reuse_delta = 1;
|
|
|
|
continue;
|
|
|
|
}
|
2006-09-21 06:09:44 +02:00
|
|
|
if (!strcmp("--delta-base-offset", arg)) {
|
2006-09-23 03:25:04 +02:00
|
|
|
allow_ofs_delta = 1;
|
2006-09-21 06:09:44 +02:00
|
|
|
continue;
|
|
|
|
}
|
2006-09-05 08:47:39 +02:00
|
|
|
if (!strcmp("--stdout", arg)) {
|
|
|
|
pack_to_stdout = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!strcmp("--revs", arg)) {
|
|
|
|
use_internal_rev_list = 1;
|
|
|
|
continue;
|
|
|
|
}
|
2006-09-06 10:42:23 +02:00
|
|
|
if (!strcmp("--unpacked", arg) ||
|
2007-02-20 10:54:00 +01:00
|
|
|
!prefixcmp(arg, "--unpacked=") ||
|
2006-12-19 02:25:28 +01:00
|
|
|
!strcmp("--reflog", arg) ||
|
2006-09-06 10:42:23 +02:00
|
|
|
!strcmp("--all", arg)) {
|
|
|
|
use_internal_rev_list = 1;
|
2007-02-25 18:34:27 +01:00
|
|
|
if (rp_ac >= rp_ac_alloc - 1) {
|
|
|
|
rp_ac_alloc = alloc_nr(rp_ac_alloc);
|
|
|
|
rp_av = xrealloc(rp_av,
|
|
|
|
rp_ac_alloc * sizeof(*rp_av));
|
|
|
|
}
|
2006-09-06 10:42:23 +02:00
|
|
|
rp_av[rp_ac++] = arg;
|
2006-09-05 08:47:39 +02:00
|
|
|
continue;
|
|
|
|
}
|
2006-09-06 10:42:23 +02:00
|
|
|
if (!strcmp("--thin", arg)) {
|
|
|
|
use_internal_rev_list = 1;
|
|
|
|
thin = 1;
|
|
|
|
rp_av[1] = "--objects-edge";
|
2006-09-05 08:47:39 +02:00
|
|
|
continue;
|
2007-04-09 23:32:03 +02:00
|
|
|
}
|
|
|
|
if (!prefixcmp(arg, "--index-version=")) {
|
|
|
|
char *c;
|
|
|
|
index_default_version = strtoul(arg + 16, &c, 10);
|
|
|
|
if (index_default_version > 2)
|
|
|
|
die("bad %s", arg);
|
|
|
|
if (*c == ',')
|
|
|
|
index_off32_limit = strtoul(c+1, &c, 0);
|
|
|
|
if (*c || index_off32_limit & 0x80000000)
|
|
|
|
die("bad %s", arg);
|
|
|
|
continue;
|
2006-09-05 08:47:39 +02:00
|
|
|
}
|
|
|
|
usage(pack_usage);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Traditionally "pack-objects [options] base extra" failed;
|
|
|
|
* we would however want to take refs parameter that would
|
|
|
|
* have been given to upstream rev-list ourselves, which means
|
|
|
|
* we somehow want to say what the base name is. So the
|
|
|
|
* syntax would be:
|
|
|
|
*
|
|
|
|
* pack-objects [options] base <refs...>
|
|
|
|
*
|
|
|
|
* in other words, we would treat the first non-option as the
|
|
|
|
* base_name and send everything else to the internal revision
|
|
|
|
* walker.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (!pack_to_stdout)
|
|
|
|
base_name = argv[i++];
|
|
|
|
|
|
|
|
if (pack_to_stdout != !base_name)
|
|
|
|
usage(pack_usage);
|
|
|
|
|
2006-09-06 10:42:23 +02:00
|
|
|
if (!pack_to_stdout && thin)
|
|
|
|
die("--thin cannot be used to build an indexable pack.");
|
2006-09-05 08:47:39 +02:00
|
|
|
|
|
|
|
prepare_packed_git();
|
|
|
|
|
|
|
|
if (progress) {
|
|
|
|
fprintf(stderr, "Generating pack...\n");
|
|
|
|
setup_progress_signal();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!use_internal_rev_list)
|
|
|
|
read_object_list_from_stdin();
|
2006-09-06 10:42:23 +02:00
|
|
|
else {
|
|
|
|
rp_av[rp_ac] = NULL;
|
|
|
|
get_object_list(rp_ac, rp_av);
|
|
|
|
}
|
2006-09-05 08:47:39 +02:00
|
|
|
|
2006-02-12 02:54:18 +01:00
|
|
|
if (progress)
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Done counting %u objects.\n", nr_objects);
|
2006-02-25 06:55:23 +01:00
|
|
|
sorted_by_sha = create_final_object_list();
|
|
|
|
if (non_empty && !nr_result)
|
2005-07-03 22:36:58 +02:00
|
|
|
return 0;
|
2005-06-25 23:42:43 +02:00
|
|
|
|
2005-10-13 01:54:19 +02:00
|
|
|
SHA1_Init(&ctx);
|
|
|
|
list = sorted_by_sha;
|
2006-02-19 23:47:21 +01:00
|
|
|
for (i = 0; i < nr_result; i++) {
|
2005-10-13 01:54:19 +02:00
|
|
|
struct object_entry *entry = *list++;
|
|
|
|
SHA1_Update(&ctx, entry->sha1, 20);
|
|
|
|
}
|
|
|
|
SHA1_Final(object_list_sha1, &ctx);
|
2006-02-19 23:47:21 +01:00
|
|
|
if (progress && (nr_objects != nr_result))
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Result has %u objects.\n", nr_result);
|
2005-10-13 01:54:19 +02:00
|
|
|
|
2006-09-02 00:05:12 +02:00
|
|
|
if (reuse_cached_pack(object_list_sha1))
|
2005-10-22 10:28:13 +02:00
|
|
|
;
|
|
|
|
else {
|
2007-04-09 07:06:33 +02:00
|
|
|
off_t last_obj_offset;
|
2006-02-25 06:55:23 +01:00
|
|
|
if (nr_result)
|
|
|
|
prepare_pack(window, depth);
|
2006-10-31 22:58:32 +01:00
|
|
|
if (progress == 1 && pack_to_stdout) {
|
2006-02-22 23:41:32 +01:00
|
|
|
/* the other end usually displays progress itself */
|
|
|
|
struct itimerval v = {{0,},};
|
|
|
|
setitimer(ITIMER_REAL, &v, NULL);
|
|
|
|
signal(SIGALRM, SIG_IGN );
|
|
|
|
progress_update = 0;
|
|
|
|
}
|
2007-04-09 07:06:33 +02:00
|
|
|
last_obj_offset = write_pack_file();
|
2005-10-22 10:28:13 +02:00
|
|
|
if (!pack_to_stdout) {
|
2007-04-09 07:06:33 +02:00
|
|
|
write_index_file(last_obj_offset);
|
2005-10-22 10:28:13 +02:00
|
|
|
puts(sha1_to_hex(object_list_sha1));
|
|
|
|
}
|
2005-07-04 00:34:04 +02:00
|
|
|
}
|
pack-objects: finishing touches.
This introduces --no-reuse-delta option to disable reusing of
existing delta, which is a large part of the optimization
introduced by this series. This may become necessary if
repeated repacking makes delta chain too long. With this, the
output of the command becomes identical to that of the older
implementation. But the performance suffers greatly.
It still allows reusing non-deltified representations; there is
no point uncompressing and recompressing the whole text.
It also adds a couple more statistics output, while squelching
it under -q flag, which the last round forgot to do.
$ time old-git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects....................
real 12m8.530s user 11m1.450s sys 0m57.920s
$ time git-pack-objects --stdout >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 138297), reused 178833 (delta 134081)
real 0m59.549s user 0m56.670s sys 0m2.400s
$ time git-pack-objects --stdout --no-reuse-delta >/dev/null <RL
Generating pack...
Done counting 184141 objects.
Packing 184141 objects.....................
Total 184141, written 184141 (delta 134833), reused 47904 (delta 0)
real 11m13.830s user 9m45.240s sys 0m44.330s
There is one remaining issue when --no-reuse-delta option is not
used. It can create delta chains that are deeper than specified.
A<--B<--C<--D E F G
Suppose we have a delta chain A to D (A is stored in full either
in a pack or as a loose object. B is depth1 delta relative to A,
C is depth2 delta relative to B...) with loose objects E, F, G.
And we are going to pack all of them.
B, C and D are left as delta against A, B and C respectively.
So A, E, F, and G are examined for deltification, and let's say
we decided to keep E expanded, and store the rest as deltas like
this:
E<--F<--G<--A
Oops. We ended up making D a bit too deep, didn't we? B, C and
D form a chain on top of A!
This is because we did not know what the final depth of A would
be, when we checked objects and decided to keep the existing
delta. Unfortunately, deferring the decision until just before
the deltification is not an option. To be able to make B, C,
and D candidates for deltification with the rest, we need to
know the type and final unexpanded size of them, but the major
part of the optimization comes from the fact that we do not read
the delta data to do so -- getting the final size is quite an
expensive operation.
To prevent this from happening, we should keep A from being
deltified. But how would we tell that, cheaply?
To do this most precisely, after check_object() runs, each
object that is used as the base object of some existing delta
needs to be marked with the maximum depth of the objects we
decided to keep deltified (in this case, D is depth 3 relative
to A, so if no other delta chain that is longer than 3 based on
A exists, mark A with 3). Then when attempting to deltify A, we
would take that number into account to see if the final delta
chain that leads to D becomes too deep.
However, this is a bit cumbersome to compute, so we would cheat
and reduce the maximum depth for A arbitrarily to depth/4 in
this implementation.
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-02-16 20:55:51 +01:00
|
|
|
if (progress)
|
2007-03-07 02:44:24 +01:00
|
|
|
fprintf(stderr, "Total %u (delta %u), reused %u (delta %u)\n",
|
2006-11-29 23:15:48 +01:00
|
|
|
written, written_delta, reused, reused_delta);
|
2005-06-25 23:42:43 +02:00
|
|
|
return 0;
|
|
|
|
}
|