Merge branch 'jk/for-each-object-iteration'

The API to iterate over all objects learned to optionally list
objects in the order they appear in packfiles, which helps locality
of access if the caller accesses these objects as they are
enumerated.

* jk/for-each-object-iteration:
  for_each_*_object: move declarations to object-store.h
  cat-file: use a single strbuf for all output
  cat-file: split batch "buf" into two variables
  cat-file: use oidset check-and-insert
  cat-file: support "unordered" output for --batch-all-objects
  cat-file: rename batch_{loose,packed}_object callbacks
  t1006: test cat-file --batch-all-objects with duplicates
  for_each_packed_object: support iterating in pack-order
  for_each_*_object: give more comprehensive docstrings
  for_each_*_object: take flag arguments as enum
  for_each_*_object: store flag definitions in a single location
This commit is contained in:
Junio C Hamano 2018-08-20 11:33:52 -07:00
commit 0c54cdaf65
10 changed files with 219 additions and 112 deletions

View File

@ -104,6 +104,16 @@ OPTIONS
buffering; this is much more efficient when invoking
`--batch-check` on a large number of objects.
--unordered::
When `--batch-all-objects` is in use, visit objects in an
order which may be more efficient for accessing the object
contents than hash order. The exact details of the order are
unspecified, but if you do not require a specific order, this
should generally result in faster output, especially with
`--batch`. Note that `cat-file` will still show each object
only once, even if it is stored multiple times in the
repository.
--allow-unknown-type::
Allow -s or -t to query broken/corrupt objects of unknown type.

View File

@ -21,6 +21,7 @@ struct batch_options {
int print_contents;
int buffer_output;
int all_objects;
int unordered;
int cmdmode; /* may be 'w' or 'c' for --filters or --textconv */
const char *format;
};
@ -337,11 +338,11 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
}
}
static void batch_object_write(const char *obj_name, struct batch_options *opt,
static void batch_object_write(const char *obj_name,
struct strbuf *scratch,
struct batch_options *opt,
struct expand_data *data)
{
struct strbuf buf = STRBUF_INIT;
if (!data->skip_object_info &&
oid_object_info_extended(the_repository, &data->oid, &data->info,
OBJECT_INFO_LOOKUP_REPLACE) < 0) {
@ -351,10 +352,10 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt,
return;
}
strbuf_expand(&buf, opt->format, expand_format, data);
strbuf_addch(&buf, '\n');
batch_write(opt, buf.buf, buf.len);
strbuf_release(&buf);
strbuf_reset(scratch);
strbuf_expand(scratch, opt->format, expand_format, data);
strbuf_addch(scratch, '\n');
batch_write(opt, scratch->buf, scratch->len);
if (opt->print_contents) {
print_object_or_die(opt, data);
@ -362,7 +363,9 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt,
}
}
static void batch_one_object(const char *obj_name, struct batch_options *opt,
static void batch_one_object(const char *obj_name,
struct strbuf *scratch,
struct batch_options *opt,
struct expand_data *data)
{
struct object_context ctx;
@ -404,42 +407,70 @@ static void batch_one_object(const char *obj_name, struct batch_options *opt,
return;
}
batch_object_write(obj_name, opt, data);
batch_object_write(obj_name, scratch, opt, data);
}
/*
 * Context handed, via the "void *data" callback argument, to the
 * callbacks that drive --batch-all-objects output.
 */
struct object_cb_data {
struct batch_options *opt; /* forwarded unchanged to batch_object_write() */
struct expand_data *expand; /* its oid is overwritten for each object visited */
struct oidset *seen; /* oids already visited; only set up for the unordered path */
struct strbuf *scratch; /* output buffer handed to batch_object_write() for every object */
};
static int batch_object_cb(const struct object_id *oid, void *vdata)
{
struct object_cb_data *data = vdata;
oidcpy(&data->expand->oid, oid);
batch_object_write(NULL, data->opt, data->expand);
batch_object_write(NULL, data->scratch, data->opt, data->expand);
return 0;
}
static int batch_loose_object(const struct object_id *oid,
const char *path,
void *data)
static int collect_loose_object(const struct object_id *oid,
const char *path,
void *data)
{
oid_array_append(data, oid);
return 0;
}
static int batch_packed_object(const struct object_id *oid,
struct packed_git *pack,
uint32_t pos,
void *data)
static int collect_packed_object(const struct object_id *oid,
struct packed_git *pack,
uint32_t pos,
void *data)
{
oid_array_append(data, oid);
return 0;
}
/*
 * Shared worker for the unordered loose/packed callbacks.
 *
 * Records "oid" in the caller's "seen" set and writes the object out via
 * batch_object_cb() only when oidset_insert() reports zero; a non-zero
 * result is treated as "already handled" and the object is skipped.
 * Returns 0 for a skipped object, otherwise whatever batch_object_cb()
 * returns.
 */
static int batch_unordered_object(const struct object_id *oid, void *vdata)
{
	struct object_cb_data *cb = vdata;

	if (!oidset_insert(cb->seen, oid))
		return batch_object_cb(oid, cb);
	return 0;
}
/*
 * Loose-object callback used with for_each_loose_object() when
 * --unordered is given. "path" is not used; the oid and the callback
 * data are passed straight to batch_unordered_object().
 */
static int batch_unordered_loose(const struct object_id *oid,
const char *path,
void *data)
{
return batch_unordered_object(oid, data);
}
/*
 * Packed-object callback used with for_each_packed_object() when
 * --unordered is given. "pack" and "pos" are not used; the oid and the
 * callback data are passed straight to batch_unordered_object().
 */
static int batch_unordered_packed(const struct object_id *oid,
struct packed_git *pack,
uint32_t pos,
void *data)
{
return batch_unordered_object(oid, data);
}
static int batch_objects(struct batch_options *opt)
{
struct strbuf buf = STRBUF_INIT;
struct strbuf input = STRBUF_INIT;
struct strbuf output = STRBUF_INIT;
struct expand_data data;
int save_warning;
int retval = 0;
@ -454,8 +485,9 @@ static int batch_objects(struct batch_options *opt)
*/
memset(&data, 0, sizeof(data));
data.mark_query = 1;
strbuf_expand(&buf, opt->format, expand_format, &data);
strbuf_expand(&output, opt->format, expand_format, &data);
data.mark_query = 0;
strbuf_release(&output);
if (opt->cmdmode)
data.split_on_whitespace = 1;
@ -473,19 +505,37 @@ static int batch_objects(struct batch_options *opt)
data.info.typep = &data.type;
if (opt->all_objects) {
struct oid_array sa = OID_ARRAY_INIT;
struct object_cb_data cb;
for_each_loose_object(batch_loose_object, &sa, 0);
for_each_packed_object(batch_packed_object, &sa, 0);
if (repository_format_partial_clone)
warning("This repository has extensions.partialClone set. Some objects may not be loaded.");
cb.opt = opt;
cb.expand = &data;
oid_array_for_each_unique(&sa, batch_object_cb, &cb);
cb.scratch = &output;
oid_array_clear(&sa);
if (opt->unordered) {
struct oidset seen = OIDSET_INIT;
cb.seen = &seen;
for_each_loose_object(batch_unordered_loose, &cb, 0);
for_each_packed_object(batch_unordered_packed, &cb,
FOR_EACH_OBJECT_PACK_ORDER);
oidset_clear(&seen);
} else {
struct oid_array sa = OID_ARRAY_INIT;
for_each_loose_object(collect_loose_object, &sa, 0);
for_each_packed_object(collect_packed_object, &sa, 0);
oid_array_for_each_unique(&sa, batch_object_cb, &cb);
oid_array_clear(&sa);
}
strbuf_release(&output);
return 0;
}
@ -499,14 +549,14 @@ static int batch_objects(struct batch_options *opt)
save_warning = warn_on_object_refname_ambiguity;
warn_on_object_refname_ambiguity = 0;
while (strbuf_getline(&buf, stdin) != EOF) {
while (strbuf_getline(&input, stdin) != EOF) {
if (data.split_on_whitespace) {
/*
* Split at first whitespace, tying off the beginning
* of the string and saving the remainder (or NULL) in
* data.rest.
*/
char *p = strpbrk(buf.buf, " \t");
char *p = strpbrk(input.buf, " \t");
if (p) {
while (*p && strchr(" \t", *p))
*p++ = '\0';
@ -514,10 +564,11 @@ static int batch_objects(struct batch_options *opt)
data.rest = p;
}
batch_one_object(buf.buf, opt, &data);
batch_one_object(input.buf, &output, opt, &data);
}
strbuf_release(&buf);
strbuf_release(&input);
strbuf_release(&output);
warn_on_object_refname_ambiguity = save_warning;
return retval;
}
@ -586,6 +637,8 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix)
N_("follow in-tree symlinks (used with --batch or --batch-check)")),
OPT_BOOL(0, "batch-all-objects", &batch.all_objects,
N_("show all objects with --batch or --batch-check")),
OPT_BOOL(0, "unordered", &batch.unordered,
N_("do not order --batch-all-objects output")),
OPT_END()
};

View File

@ -3,6 +3,7 @@
#include "progress.h"
#include "parse-options.h"
#include "packfile.h"
#include "object-store.h"
static const char * const prune_packed_usage[] = {
N_("git prune-packed [-n | --dry-run] [-q | --quiet]"),

56
cache.h
View File

@ -1575,62 +1575,6 @@ extern int odb_mkstemp(struct strbuf *temp_filename, const char *pattern);
*/
extern int odb_pack_keep(const char *name);
/*
* Iterate over the files in the loose-object parts of the object
* directory "path", triggering the following callbacks:
*
* - loose_object is called for each loose object we find.
*
* - loose_cruft is called for any files that do not appear to be
* loose objects. Note that we only look in the loose object
* directories "objects/[0-9a-f]{2}/", so we will not report
* "objects/foobar" as cruft.
*
* - loose_subdir is called for each top-level hashed subdirectory
* of the object directory (e.g., "$OBJDIR/f0"). It is called
* after the objects in the directory are processed.
*
* Any callback that is NULL will be ignored. Callbacks returning non-zero
* will end the iteration.
*
* In the "buf" variant, "path" is a strbuf which will also be used as a
* scratch buffer, but restored to its original contents before
* the function returns.
*/
typedef int each_loose_object_fn(const struct object_id *oid,
const char *path,
void *data);
typedef int each_loose_cruft_fn(const char *basename,
const char *path,
void *data);
typedef int each_loose_subdir_fn(unsigned int nr,
const char *path,
void *data);
int for_each_file_in_obj_subdir(unsigned int subdir_nr,
struct strbuf *path,
each_loose_object_fn obj_cb,
each_loose_cruft_fn cruft_cb,
each_loose_subdir_fn subdir_cb,
void *data);
int for_each_loose_file_in_objdir(const char *path,
each_loose_object_fn obj_cb,
each_loose_cruft_fn cruft_cb,
each_loose_subdir_fn subdir_cb,
void *data);
int for_each_loose_file_in_objdir_buf(struct strbuf *path,
each_loose_object_fn obj_cb,
each_loose_cruft_fn cruft_cb,
each_loose_subdir_fn subdir_cb,
void *data);
/*
* Iterate over loose objects in both the local
* repository and any alternates repositories (unless the
* LOCAL_ONLY flag is set).
*/
#define FOR_EACH_OBJECT_LOCAL_ONLY 0x1
extern int for_each_loose_object(each_loose_object_fn, void *, unsigned flags);
/*
* Set this to 0 to prevent sha1_object_info_extended() from fetching missing
* blobs. This has a difference only if extensions.partialClone is set.

View File

@ -730,7 +730,7 @@ void write_commit_graph(const char *obj_dir,
die(_("error adding pack %s"), packname.buf);
if (open_pack_index(p))
die(_("error opening index for %s"), packname.buf);
for_each_object_in_pack(p, add_packed_commits, &oids);
for_each_object_in_pack(p, add_packed_commits, &oids, 0);
close_pack(p);
}
strbuf_release(&packname);

View File

@ -262,4 +262,94 @@ int oid_object_info_extended(struct repository *r,
const struct object_id *,
struct object_info *, unsigned flags);
/*
* Iterate over the files in the loose-object parts of the object
* directory "path", triggering the following callbacks:
*
* - obj_cb is called for each loose object we find.
*
* - cruft_cb is called for any files that do not appear to be
* loose objects. Note that we only look in the loose object
* directories "objects/[0-9a-f]{2}/", so we will not report
* "objects/foobar" as cruft.
*
* - subdir_cb is called for each top-level hashed subdirectory
* of the object directory (e.g., "$OBJDIR/f0"). It is called
* after the objects in the directory are processed.
*
* Any callback that is NULL will be ignored. Callbacks returning non-zero
* will end the iteration.
*
* In the "buf" variant, "path" is a strbuf which will also be used as a
* scratch buffer, but restored to its original contents before
* the function returns.
*/
typedef int each_loose_object_fn(const struct object_id *oid,
const char *path,
void *data);
typedef int each_loose_cruft_fn(const char *basename,
const char *path,
void *data);
typedef int each_loose_subdir_fn(unsigned int nr,
const char *path,
void *data);
int for_each_file_in_obj_subdir(unsigned int subdir_nr,
struct strbuf *path,
each_loose_object_fn obj_cb,
each_loose_cruft_fn cruft_cb,
each_loose_subdir_fn subdir_cb,
void *data);
int for_each_loose_file_in_objdir(const char *path,
each_loose_object_fn obj_cb,
each_loose_cruft_fn cruft_cb,
each_loose_subdir_fn subdir_cb,
void *data);
int for_each_loose_file_in_objdir_buf(struct strbuf *path,
each_loose_object_fn obj_cb,
each_loose_cruft_fn cruft_cb,
each_loose_subdir_fn subdir_cb,
void *data);
/*
 * Flags for for_each_*_object() below. Each value occupies its own bit,
 * so the iterators test them individually with a bitwise AND.
 */
enum for_each_object_flags {
/* Iterate only over local objects, not alternates. */
FOR_EACH_OBJECT_LOCAL_ONLY = (1<<0),
/* Only iterate over packs obtained from the promisor remote. */
FOR_EACH_OBJECT_PROMISOR_ONLY = (1<<1),
/*
* Visit objects within a pack in packfile order rather than .idx order.
* This is a pack-specific flag, so for_each_loose_object() ignores it.
*/
FOR_EACH_OBJECT_PACK_ORDER = (1<<2),
};
/*
* Iterate over all accessible loose objects without respect to
* reachability. By default, this includes both local and alternate objects.
* The order in which objects are visited is unspecified.
*
* Any flags specific to packs are ignored.
*/
int for_each_loose_object(each_loose_object_fn, void *,
enum for_each_object_flags flags);
/*
* Iterate over all accessible packed objects without respect to reachability.
* By default, this includes both local and alternate packs.
*
* Note that some objects may appear twice if they are found in multiple packs.
* Each pack is visited in an unspecified order. By default, objects within a
* pack are visited in pack-idx order (i.e., sorted by oid).
*/
typedef int each_packed_object_fn(const struct object_id *oid,
struct packed_git *pack,
uint32_t pos,
void *data);
int for_each_object_in_pack(struct packed_git *p,
each_packed_object_fn, void *data,
enum for_each_object_flags flags);
int for_each_packed_object(each_packed_object_fn, void *,
enum for_each_object_flags flags);
#endif /* OBJECT_STORE_H */

View File

@ -1885,26 +1885,38 @@ int has_pack_index(const unsigned char *sha1)
return 1;
}
int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn cb, void *data)
int for_each_object_in_pack(struct packed_git *p,
each_packed_object_fn cb, void *data,
enum for_each_object_flags flags)
{
uint32_t i;
int r = 0;
if (flags & FOR_EACH_OBJECT_PACK_ORDER)
load_pack_revindex(p);
for (i = 0; i < p->num_objects; i++) {
uint32_t pos;
struct object_id oid;
if (!nth_packed_object_oid(&oid, p, i))
return error("unable to get sha1 of object %u in %s",
i, p->pack_name);
if (flags & FOR_EACH_OBJECT_PACK_ORDER)
pos = p->revindex[i].nr;
else
pos = i;
r = cb(&oid, p, i, data);
if (!nth_packed_object_oid(&oid, p, pos))
return error("unable to get sha1 of object %u in %s",
pos, p->pack_name);
r = cb(&oid, p, pos, data);
if (r)
break;
}
return r;
}
int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags)
int for_each_packed_object(each_packed_object_fn cb, void *data,
enum for_each_object_flags flags)
{
struct packed_git *p;
int r = 0;
@ -1921,7 +1933,7 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags)
pack_errors = 1;
continue;
}
r = for_each_object_in_pack(p, cb, data);
r = for_each_object_in_pack(p, cb, data, flags);
if (r)
break;
}

View File

@ -148,23 +148,6 @@ extern int has_object_pack(const struct object_id *oid);
extern int has_pack_index(const unsigned char *sha1);
/*
* Only iterate over packs obtained from the promisor remote.
*/
#define FOR_EACH_OBJECT_PROMISOR_ONLY 2
/*
* Iterate over packed objects in both the local
* repository and any alternates repositories (unless the
* FOR_EACH_OBJECT_LOCAL_ONLY flag, defined in cache.h, is set).
*/
typedef int each_packed_object_fn(const struct object_id *oid,
struct packed_git *pack,
uint32_t pos,
void *data);
extern int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn, void *data);
extern int for_each_packed_object(each_packed_object_fn, void *, unsigned flags);
/*
* Return 1 if an object in a promisor packfile is or refers to the given
* object, 0 otherwise.

View File

@ -2146,7 +2146,8 @@ static int loose_from_alt_odb(struct alternate_object_database *alt,
return r;
}
int for_each_loose_object(each_loose_object_fn cb, void *data, unsigned flags)
int for_each_loose_object(each_loose_object_fn cb, void *data,
enum for_each_object_flags flags)
{
struct loose_alt_odb_data alt;
int r;

View File

@ -550,8 +550,8 @@ test_expect_success 'git cat-file --batch --follow-symlink returns correct sha a
test_expect_success 'cat-file --batch-all-objects shows all objects' '
# make new repos so we know the full set of objects; we will
# also make sure that there are some packed and some loose
# objects, some referenced and some not, and that there are
# some available only via alternates.
# objects, some referenced and some not, some duplicates, and that
# there are some available only via alternates.
git init all-one &&
(
cd all-one &&
@ -567,10 +567,23 @@ test_expect_success 'cat-file --batch-all-objects shows all objects' '
cd all-two &&
echo local-unref | git hash-object -w --stdin
) >>expect.unsorted &&
git -C all-two rev-parse HEAD:file |
git -C all-two pack-objects .git/objects/pack/pack &&
sort <expect.unsorted >expect &&
git -C all-two cat-file --batch-all-objects \
--batch-check="%(objectname)" >actual &&
test_cmp expect actual
'
# The only user-visible difference is that the objects are no longer sorted,
# and the resulting sort order is undefined. So we can only check that it
# produces the same objects as the ordered case, but that at least exercises
# the code.
test_expect_success 'cat-file --unordered works' '
# The "expect" file and the all-two repository are left behind by
# the previous test; sort our output so it can be compared against
# that already-sorted list.
git -C all-two cat-file --batch-all-objects --unordered \
--batch-check="%(objectname)" >actual.unsorted &&
sort <actual.unsorted >actual &&
test_cmp expect actual
'
test_done