From 3449f8c4cb93a0ec445db22ee7549d0b051446d6 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 28 Feb 2008 00:25:17 -0500 Subject: [PATCH 1/4] factorize revindex code out of builtin-pack-objects.c No functional change. This is needed to fix verify-pack in a later patch. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- Makefile | 5 +- builtin-pack-objects.c | 160 +++-------------------------------------- pack-revindex.c | 142 ++++++++++++++++++++++++++++++++++++ pack-revindex.h | 12 ++++ 4 files changed, 167 insertions(+), 152 deletions(-) create mode 100644 pack-revindex.c create mode 100644 pack-revindex.h diff --git a/Makefile b/Makefile index a5b6eebf1a..1acac02766 100644 --- a/Makefile +++ b/Makefile @@ -304,7 +304,8 @@ LIB_H = \ run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \ tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \ utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \ - mailmap.h remote.h parse-options.h transport.h diffcore.h hash.h + mailmap.h remote.h parse-options.h transport.h diffcore.h hash.h \ + pack-revindex.h DIFF_OBJS = \ diff.o diff-lib.o diffcore-break.o diffcore-order.o \ @@ -328,7 +329,7 @@ LIB_OBJS = \ color.o wt-status.o archive-zip.o archive-tar.o shallow.o utf8.o \ convert.o attr.o decorate.o progress.o mailmap.o symlinks.o remote.o \ transport.o bundle.o walker.o parse-options.o ws.o archive.o branch.o \ - alias.o + alias.o pack-revindex.o BUILTIN_OBJS = \ builtin-add.o \ diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 1bba6e6a64..c7834bb368 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -8,6 +8,7 @@ #include "tree.h" #include "delta.h" #include "pack.h" +#include "pack-revindex.h" #include "csum-file.h" #include "tree-walk.h" #include "diff.h" @@ -92,158 +93,12 @@ static unsigned long window_memory_limit = 0; static int *object_ix; static int object_ix_hashsz; -/* - * Pack index for existing packs give us easy access to the offsets into - * corresponding pack file where each object's data starts, but the entries - * do not store the size of the compressed representation (uncompressed - * size is easily available by examining the pack entry header). It is - * also rather expensive to find the sha1 for an object given its offset. - * - * We build a hashtable of existing packs (pack_revindex), and keep reverse - * index here -- pack index file is sorted by object name mapping to offset; - * this pack_revindex[].revindex array is a list of offset/index_nr pairs - * ordered by offset, so if you know the offset of an object, next offset - * is where its packed representation ends and the index_nr can be used to - * get the object sha1 from the main index. - */ -struct revindex_entry { - off_t offset; - unsigned int nr; -}; -struct pack_revindex { - struct packed_git *p; - struct revindex_entry *revindex; -}; -static struct pack_revindex *pack_revindex; -static int pack_revindex_hashsz; - /* * stats */ static uint32_t written, written_delta; static uint32_t reused, reused_delta; -static int pack_revindex_ix(struct packed_git *p) -{ - unsigned long ui = (unsigned long)p; - int i; - - ui = ui ^ (ui >> 16); /* defeat structure alignment */ - i = (int)(ui % pack_revindex_hashsz); - while (pack_revindex[i].p) { - if (pack_revindex[i].p == p) - return i; - if (++i == pack_revindex_hashsz) - i = 0; - } - return -1 - i; -} - -static void prepare_pack_ix(void) -{ - int num; - struct packed_git *p; - for (num = 0, p = packed_git; p; p = p->next) - num++; - if (!num) - return; - pack_revindex_hashsz = num * 11; - pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz); - for (p = packed_git; p; p = p->next) { - num = pack_revindex_ix(p); - num = - 1 - num; - pack_revindex[num].p = p; - } - /* revindex elements are lazily initialized */ -} - -static int cmp_offset(const void *a_, const void *b_) -{ - const struct revindex_entry *a = a_; - const struct revindex_entry *b = b_; - return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0; -} - -/* - * Ordered list of offsets of objects in the pack. - */ -static void prepare_pack_revindex(struct pack_revindex *rix) -{ - struct packed_git *p = rix->p; - int num_ent = p->num_objects; - int i; - const char *index = p->index_data; - - rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1)); - index += 4 * 256; - - if (p->index_version > 1) { - const uint32_t *off_32 = - (uint32_t *)(index + 8 + p->num_objects * (20 + 4)); - const uint32_t *off_64 = off_32 + p->num_objects; - for (i = 0; i < num_ent; i++) { - uint32_t off = ntohl(*off_32++); - if (!(off & 0x80000000)) { - rix->revindex[i].offset = off; - } else { - rix->revindex[i].offset = - ((uint64_t)ntohl(*off_64++)) << 32; - rix->revindex[i].offset |= - ntohl(*off_64++); - } - rix->revindex[i].nr = i; - } - } else { - for (i = 0; i < num_ent; i++) { - uint32_t hl = *((uint32_t *)(index + 24 * i)); - rix->revindex[i].offset = ntohl(hl); - rix->revindex[i].nr = i; - } - } - - /* This knows the pack format -- the 20-byte trailer - * follows immediately after the last object data. - */ - rix->revindex[num_ent].offset = p->pack_size - 20; - rix->revindex[num_ent].nr = -1; - qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset); -} - -static struct revindex_entry * find_packed_object(struct packed_git *p, - off_t ofs) -{ - int num; - int lo, hi; - struct pack_revindex *rix; - struct revindex_entry *revindex; - num = pack_revindex_ix(p); - if (num < 0) - die("internal error: pack revindex uninitialized"); - rix = &pack_revindex[num]; - if (!rix->revindex) - prepare_pack_revindex(rix); - revindex = rix->revindex; - lo = 0; - hi = p->num_objects + 1; - do { - int mi = (lo + hi) / 2; - if (revindex[mi].offset == ofs) { - return revindex + mi; - } - else if (ofs < revindex[mi].offset) - hi = mi; - else - lo = mi + 1; - } while (lo < hi); - die("internal error: pack revindex corrupt"); -} - -static const unsigned char *find_packed_object_name(struct packed_git *p, - off_t ofs) -{ - struct revindex_entry *entry = find_packed_object(p, ofs); - return nth_packed_object_sha1(p, entry->nr); -} static void *delta_against(void *buf, unsigned long size, struct object_entry *entry) { @@ -510,7 +365,7 @@ static unsigned long write_object(struct sha1file *f, } hdrlen = encode_header(obj_type, entry->size, header); offset = entry->in_pack_offset; - revidx = find_packed_object(p, offset); + revidx = find_pack_revindex(p, offset); datalen = revidx[1].offset - offset; if (!pack_to_stdout && p->index_version > 1 && check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) @@ -1162,8 +1017,11 @@ static void check_object(struct object_entry *entry) die("delta base offset out of bound for %s", sha1_to_hex(entry->idx.sha1)); ofs = entry->in_pack_offset - ofs; - if (!no_reuse_delta && !entry->preferred_base) - base_ref = find_packed_object_name(p, ofs); + if (!no_reuse_delta && !entry->preferred_base) { + struct revindex_entry *revidx; + revidx = find_pack_revindex(p, ofs); + base_ref = nth_packed_object_sha1(p, revidx->nr); + } entry->in_pack_header_size = used + used_0; break; } @@ -1240,9 +1098,11 @@ static void get_object_details(void) sorted_by_offset[i] = objects + i; qsort(sorted_by_offset, nr_objects, sizeof(*sorted_by_offset), pack_offset_sort); - prepare_pack_ix(); + init_pack_revindex(); + for (i = 0; i < nr_objects; i++) check_object(sorted_by_offset[i]); + free(sorted_by_offset); } diff --git a/pack-revindex.c b/pack-revindex.c new file mode 100644 index 0000000000..a8aa2cd6ca --- /dev/null +++ b/pack-revindex.c @@ -0,0 +1,142 @@ +#include "cache.h" +#include "pack-revindex.h" + +/* + * Pack index for existing packs give us easy access to the offsets into + * corresponding pack file where each object's data starts, but the entries + * do not store the size of the compressed representation (uncompressed + * size is easily available by examining the pack entry header). It is + * also rather expensive to find the sha1 for an object given its offset. + * + * We build a hashtable of existing packs (pack_revindex), and keep reverse + * index here -- pack index file is sorted by object name mapping to offset; + * this pack_revindex[].revindex array is a list of offset/index_nr pairs + * ordered by offset, so if you know the offset of an object, next offset + * is where its packed representation ends and the index_nr can be used to + * get the object sha1 from the main index. + */ + +struct pack_revindex { + struct packed_git *p; + struct revindex_entry *revindex; +}; + +static struct pack_revindex *pack_revindex; +static int pack_revindex_hashsz; + +static int pack_revindex_ix(struct packed_git *p) +{ + unsigned long ui = (unsigned long)p; + int i; + + ui = ui ^ (ui >> 16); /* defeat structure alignment */ + i = (int)(ui % pack_revindex_hashsz); + while (pack_revindex[i].p) { + if (pack_revindex[i].p == p) + return i; + if (++i == pack_revindex_hashsz) + i = 0; + } + return -1 - i; +} + +void init_pack_revindex(void) +{ + int num; + struct packed_git *p; + + for (num = 0, p = packed_git; p; p = p->next) + num++; + if (!num) + return; + pack_revindex_hashsz = num * 11; + pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz); + for (p = packed_git; p; p = p->next) { + num = pack_revindex_ix(p); + num = - 1 - num; + pack_revindex[num].p = p; + } + /* revindex elements are lazily initialized */ +} + +static int cmp_offset(const void *a_, const void *b_) +{ + const struct revindex_entry *a = a_; + const struct revindex_entry *b = b_; + return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0; +} + +/* + * Ordered list of offsets of objects in the pack. + */ +static void create_pack_revindex(struct pack_revindex *rix) +{ + struct packed_git *p = rix->p; + int num_ent = p->num_objects; + int i; + const char *index = p->index_data; + + rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1)); + index += 4 * 256; + + if (p->index_version > 1) { + const uint32_t *off_32 = + (uint32_t *)(index + 8 + p->num_objects * (20 + 4)); + const uint32_t *off_64 = off_32 + p->num_objects; + for (i = 0; i < num_ent; i++) { + uint32_t off = ntohl(*off_32++); + if (!(off & 0x80000000)) { + rix->revindex[i].offset = off; + } else { + rix->revindex[i].offset = + ((uint64_t)ntohl(*off_64++)) << 32; + rix->revindex[i].offset |= + ntohl(*off_64++); + } + rix->revindex[i].nr = i; + } + } else { + for (i = 0; i < num_ent; i++) { + uint32_t hl = *((uint32_t *)(index + 24 * i)); + rix->revindex[i].offset = ntohl(hl); + rix->revindex[i].nr = i; + } + } + + /* This knows the pack format -- the 20-byte trailer + * follows immediately after the last object data. + */ + rix->revindex[num_ent].offset = p->pack_size - 20; + rix->revindex[num_ent].nr = -1; + qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset); +} + +struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs) +{ + int num; + int lo, hi; + struct pack_revindex *rix; + struct revindex_entry *revindex; + + num = pack_revindex_ix(p); + if (num < 0) + die("internal error: pack revindex uninitialized"); + + rix = &pack_revindex[num]; + if (!rix->revindex) + create_pack_revindex(rix); + revindex = rix->revindex; + + lo = 0; + hi = p->num_objects + 1; + do { + int mi = (lo + hi) / 2; + if (revindex[mi].offset == ofs) { + return revindex + mi; + } else if (ofs < revindex[mi].offset) + hi = mi; + else + lo = mi + 1; + } while (lo < hi); + die("internal error: pack revindex corrupt"); +} diff --git a/pack-revindex.h b/pack-revindex.h new file mode 100644 index 0000000000..c3527a7565 --- /dev/null +++ b/pack-revindex.h @@ -0,0 +1,12 @@ +#ifndef PACK_REVINDEX_H +#define PACK_REVINDEX_H + +struct revindex_entry { + off_t offset; + unsigned int nr; +}; + +void init_pack_revindex(void); +struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs); + +#endif From 340814636dde3cbd2e461b12f9ae832d2100766a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 28 Feb 2008 00:25:18 -0500 Subject: [PATCH 2/4] make verify_one_pack() a bit less wrong wrt packed_git structure Simply freeing it is wrong. There are many things attached to this structure that are not cleaned up. In practice this doesn't matter much since this happens just before the program exits, but it is still a bit more "correct" to leak it implicitly rather than explicitly. And therefore it is also a good idea to register it with install_packed_git(). Not only might it have better chance of being properly cleaned up if such functionality is implemented for the general case, but some functions like init_revindex() expect all packed_git instances to be globally accessible. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-verify-pack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin-verify-pack.c b/builtin-verify-pack.c index 4e31c273f4..4958bbbf11 100644 --- a/builtin-verify-pack.c +++ b/builtin-verify-pack.c @@ -40,8 +40,8 @@ static int verify_one_pack(const char *path, int verbose) if (!pack) return error("packfile %s not found.", arg); + install_packed_git(pack); err = verify_pack(pack, verbose); - free(pack); return err; } From 70f5d5d31cbf0ba273083805124ad67135142527 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 28 Feb 2008 00:25:19 -0500 Subject: [PATCH 3/4] fix unimplemented packed_object_info_detail() features Since commit eb32d236df0c16b936b04f0c5402addb61cdb311, there was a TODO comment in packed_object_info_detail() about the SHA1 of base object to OBJ_OFS_DELTA objects. So here it is at last. While at it, providing the actual storage size information as well is now trivial. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-check.c | 3 +++ sha1_file.c | 10 +++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pack-check.c b/pack-check.c index d7dd62bb83..ae25685036 100644 --- a/pack-check.c +++ b/pack-check.c @@ -1,5 +1,6 @@ #include "cache.h" #include "pack.h" +#include "pack-revindex.h" struct idx_entry { @@ -101,8 +102,10 @@ static int verify_packfile(struct packed_git *p, static void show_pack_info(struct packed_git *p) { uint32_t nr_objects, i, chain_histogram[MAX_CHAIN+1]; + nr_objects = p->num_objects; memset(chain_histogram, 0, sizeof(chain_histogram)); + init_pack_revindex(); for (i = 0; i < nr_objects; i++) { const unsigned char *sha1; diff --git a/sha1_file.c b/sha1_file.c index 1ddb96bb82..445a871db3 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -14,6 +14,7 @@ #include "tag.h" #include "tree.h" #include "refs.h" +#include "pack-revindex.h" #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -1367,11 +1368,15 @@ const char *packed_object_info_detail(struct packed_git *p, unsigned long dummy; unsigned char *next_sha1; enum object_type type; + struct revindex_entry *revidx; *delta_chain_length = 0; curpos = obj_offset; type = unpack_object_header(p, &w_curs, &curpos, size); + revidx = find_pack_revindex(p, obj_offset); + *store_size = revidx[1].offset - obj_offset; + for (;;) { switch (type) { default: @@ -1381,14 +1386,13 @@ const char *packed_object_info_detail(struct packed_git *p, case OBJ_TREE: case OBJ_BLOB: case OBJ_TAG: - *store_size = 0; /* notyet */ unuse_pack(&w_curs); return typename(type); case OBJ_OFS_DELTA: obj_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset); if (*delta_chain_length == 0) { - /* TODO: find base_sha1 as pointed by curpos */ - hashclr(base_sha1); + revidx = find_pack_revindex(p, obj_offset); + hashcpy(base_sha1, nth_packed_object_sha1(p, revidx->nr)); } break; case OBJ_REF_DELTA: From 5f4347bba39ddb147b06913ac263fc46954d2d0b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 28 Feb 2008 00:25:20 -0500 Subject: [PATCH 4/4] add storage size output to 'git verify-pack -v' This can possibly break external scripts that depend on the previous output, but those script can't possibly be critical to Git usage, and fixing them should be trivial. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- Documentation/git-verify-pack.txt | 4 ++-- contrib/stats/packinfo.pl | 2 +- pack-check.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/git-verify-pack.txt b/Documentation/git-verify-pack.txt index db019a2b8d..ba2a157299 100644 --- a/Documentation/git-verify-pack.txt +++ b/Documentation/git-verify-pack.txt @@ -32,11 +32,11 @@ OUTPUT FORMAT ------------- When specifying the -v option the format used is: - SHA1 type size offset-in-packfile + SHA1 type size size-in-pack-file offset-in-packfile for objects that are not deltified in the pack, and - SHA1 type size offset-in-packfile depth base-SHA1 + SHA1 type size size-in-packfile offset-in-packfile depth base-SHA1 for objects that are deltified. diff --git a/contrib/stats/packinfo.pl b/contrib/stats/packinfo.pl index aab501ea08..f4a7b62cd9 100755 --- a/contrib/stats/packinfo.pl +++ b/contrib/stats/packinfo.pl @@ -93,7 +93,7 @@ my %depths; my @depths; while () { - my ($sha1, $type, $size, $offset, $depth, $parent) = split(/\s+/, $_); + my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $_); next unless ($sha1 =~ /^[0-9a-f]{40}$/); $depths{$sha1} = $depth || 0; push(@depths, $depth || 0); diff --git a/pack-check.c b/pack-check.c index ae25685036..0f8ad2c00f 100644 --- a/pack-check.c +++ b/pack-check.c @@ -128,11 +128,11 @@ static void show_pack_info(struct packed_git *p) base_sha1); printf("%s ", sha1_to_hex(sha1)); if (!delta_chain_length) - printf("%-6s %lu %"PRIuMAX"\n", - type, size, (uintmax_t)offset); + printf("%-6s %lu %lu %"PRIuMAX"\n", + type, size, store_size, (uintmax_t)offset); else { - printf("%-6s %lu %"PRIuMAX" %u %s\n", - type, size, (uintmax_t)offset, + printf("%-6s %lu %lu %"PRIuMAX" %u %s\n", + type, size, store_size, (uintmax_t)offset, delta_chain_length, sha1_to_hex(base_sha1)); if (delta_chain_length <= MAX_CHAIN) chain_histogram[delta_chain_length]++;