git-commit-vandalism/list-objects-filter.c

715 lines
19 KiB
C
Raw Normal View History

#include "cache.h"
#include "dir.h"
#include "tag.h"
#include "commit.h"
#include "tree.h"
#include "blob.h"
#include "diff.h"
#include "tree-walk.h"
#include "revision.h"
#include "list-objects.h"
#include "list-objects-filter.h"
#include "list-objects-filter-options.h"
#include "oidmap.h"
#include "oidset.h"
#include "object-store.h"
/* Remember to update object flag allocation in object.h */
/*
* FILTER_SHOWN_BUT_REVISIT -- we set this bit on tree objects
* that have already been shown, but that should be revisited if they
* appear again in the traversal (until we mark them SEEN). This is a
* way to let us silently de-dup calls to show() in the caller. It is
* subtly different from the "revision.h:SHOWN" and the
* "sha1-name.c:ONELINE_SEEN" bits, and also different from
* the non-de-dup usage in pack-bitmap.c.
*/
#define FILTER_SHOWN_BUT_REVISIT (1<<21)
/*
* Per-subfilter bookkeeping kept by the combine filter.
*/
struct subfilter {
/* The wrapped filter that is asked about each object. */
struct filter *filter;
/* OIDs for which this subfilter has returned LOFR_MARK_SEEN. */
struct oidset seen;
/*
* Omit set handed to the subfilter when the combine filter itself
* collects omits; folded into the combined set when omits are
* finalized.
*/
struct oidset omits;
/* OID of the tree for which the subfilter returned LOFR_SKIP_TREE. */
struct object_id skip_tree;
/*
* Set when the subfilter returns LOFR_SKIP_TREE; cleared again when
* LOFS_END_TREE arrives for the OID stored in skip_tree.
*/
unsigned is_skipping_tree : 1;
};
/*
* A filter instance: a set of callbacks plus the private state
* (filter_data) that is passed back to them.
*/
struct filter {
/*
* Decides what to do with one object in one traversal situation.
* It is invoked with this filter's omits and filter_data, and only
* for objects that carry the NOT_USER_GIVEN flag.
*/
enum list_objects_filter_result (*filter_object_fn)(
struct repository *r,
enum list_objects_filter_situation filter_situation,
struct object *obj,
const char *pathname,
const char *filename,
struct oidset *omits,
void *filter_data);
/*
* Optional. If this function is supplied and the filter needs
* to collect omits, then this function is called once before
* free_fn is called.
*
* This is required because the following two conditions hold:
*
* a. A tree filter can add and remove objects as an object
* graph is traversed.
* b. A combine filter's omit set is the union of all its
* subfilters, which may include tree: filters.
*
* As such, the omits sets must be separate sets, and can only
* be unioned after the traversal is completed.
*/
void (*finalize_omits_fn)(struct oidset *omits, void *filter_data);
/* Releases filter_data when the filter is freed. */
void (*free_fn)(void *filter_data);
/* Filter-specific state; may be NULL for filters that keep none. */
void *filter_data;
/* If non-NULL, the filter collects a list of the omitted OIDs here. */
struct oidset *omits;
};
/*
 * Filter callback that omits every blob and keeps every tree.
 *
 * Trees are shown and marked SEEN when entered; leaving a tree needs no
 * action. Each blob is marked SEEN without being shown (a hard omit) and,
 * when an omits set is supplied, its OID is recorded there.
 */
static enum list_objects_filter_result filter_blobs_none(
	struct repository *r,
	enum list_objects_filter_situation filter_situation,
	struct object *obj,
	const char *pathname,
	const char *filename,
	struct oidset *omits,
	void *filter_data_)
{
	switch (filter_situation) {
	case LOFS_BLOB:
		assert(obj->type == OBJ_BLOB);
		assert((obj->flags & SEEN) == 0);

		if (omits)
			oidset_insert(omits, &obj->oid);
		/* SEEN but deliberately not DO_SHOW: the blob is dropped. */
		return LOFR_MARK_SEEN;

	case LOFS_BEGIN_TREE:
		assert(obj->type == OBJ_TREE);
		/* Tree objects are never filtered out. */
		return LOFR_MARK_SEEN | LOFR_DO_SHOW;

	case LOFS_END_TREE:
		assert(obj->type == OBJ_TREE);
		return LOFR_ZERO;

	default:
		BUG("unknown filter_situation: %d", filter_situation);
	}
}
/*
 * Set up the "omit all blobs" filter. It keeps no private state, so
 * filter_data is left untouched and free_fn is plain free().
 */
static void filter_blobs_none__init(
	struct list_objects_filter_options *filter_options,
	struct filter *filter)
{
	filter->free_fn = free;
	filter->filter_object_fn = filter_blobs_none;
}
/*
* A filter for list-objects to omit trees and blobs that are only
* reachable at or beyond a given depth (exclude_depth).
* Can OPTIONALLY collect a list of the omitted OIDs.
*/
struct filter_trees_depth_data {
/*
* Maps trees to the minimum depth at which they were seen. It is not
* necessary to re-traverse a tree at deeper or equal depths than it has
* already been traversed.
*
* We can't use LOFR_MARK_SEEN for tree objects since this will prevent
* it from being traversed at shallower depths.
*/
struct oidmap seen_at_depth;
/* Objects are included only while current_depth < exclude_depth. */
unsigned long exclude_depth;
/* Incremented on LOFS_BEGIN_TREE, decremented on LOFS_END_TREE. */
unsigned long current_depth;
};
/*
* Entry stored in filter_trees_depth_data.seen_at_depth; keyed by the
* tree's OID (base.oid).
*/
struct seen_map_entry {
struct oidmap_entry base;
/* Shallowest depth at which this tree has been traversed so far. */
size_t depth;
};
/*
 * Synchronize the omits set with the include/exclude decision for obj:
 * an included object is removed from the set, an excluded one is added.
 *
 * Returns 0 when no omits set is being collected; otherwise returns the
 * result of the oidset operation, i.e. 1 if the oid was in the omits set
 * before this call.
 */
static int filter_trees_update_omits(
	struct object *obj,
	struct oidset *omits,
	int include_it)
{
	if (!omits)
		return 0;

	return include_it ? oidset_remove(omits, &obj->oid)
			  : oidset_insert(omits, &obj->oid);
}
/*
 * Filter callback that includes objects only while the traversal depth is
 * below exclude_depth.
 *
 * _MARK_SEEN is never used for trees here, so that a tree (or a blob below
 * it) can be traversed again if it later turns up at a shallower depth.
 */
static enum list_objects_filter_result filter_trees_depth(
	struct repository *r,
	enum list_objects_filter_situation filter_situation,
	struct object *obj,
	const char *pathname,
	const char *filename,
	struct oidset *omits,
	void *filter_data_)
{
	struct filter_trees_depth_data *state = filter_data_;
	const int within_depth = state->current_depth < state->exclude_depth;
	struct seen_map_entry *entry;
	int verdict;

	switch (filter_situation) {
	case LOFS_BEGIN_TREE:
		entry = oidmap_get(&state->seen_at_depth, &obj->oid);

		if (entry && state->current_depth >= entry->depth) {
			/* Already handled at this depth or shallower. */
			verdict = LOFR_SKIP_TREE;
		} else {
			int was_omitted;

			if (!entry) {
				/* First encounter: start tracking this tree. */
				entry = xcalloc(1, sizeof(*entry));
				oidcpy(&entry->base.oid, &obj->oid);
				oidmap_put(&state->seen_at_depth, entry);
			}
			entry->depth = state->current_depth;

			was_omitted = filter_trees_update_omits(
				obj, omits, within_depth);

			if (within_depth) {
				verdict = LOFR_DO_SHOW;
			} else if (omits && !was_omitted) {
				/*
				 * The children have not been recorded as
				 * omitted yet, so descend in order to update
				 * their omit information recursively.
				 */
				verdict = LOFR_ZERO;
			} else {
				verdict = LOFR_SKIP_TREE;
			}
		}

		state->current_depth++;
		return verdict;

	case LOFS_END_TREE:
		assert(obj->type == OBJ_TREE);
		state->current_depth--;
		return LOFR_ZERO;

	case LOFS_BLOB:
		filter_trees_update_omits(obj, omits, within_depth);
		if (within_depth)
			return LOFR_MARK_SEEN | LOFR_DO_SHOW;
		return LOFR_ZERO;

	default:
		BUG("unknown filter_situation: %d", filter_situation);
	}
}
/*
 * Release the state of a tree-depth filter, including the entries stored
 * in its seen_at_depth map. A NULL pointer is accepted and ignored.
 */
static void filter_trees_free(void *filter_data)
{
	struct filter_trees_depth_data *state = filter_data;

	if (state) {
		oidmap_free(&state->seen_at_depth, 1);
		free(state);
	}
}
/*
 * Set up a tree-depth filter: allocate its state, take the exclusion depth
 * from filter_options->tree_exclude_depth and start the traversal at
 * depth 0.
 */
static void filter_trees_depth__init(
	struct list_objects_filter_options *filter_options,
	struct filter *filter)
{
	struct filter_trees_depth_data *state = xcalloc(1, sizeof(*state));

	state->current_depth = 0;
	state->exclude_depth = filter_options->tree_exclude_depth;
	oidmap_init(&state->seen_at_depth, 0);

	filter->filter_object_fn = filter_trees_depth;
	filter->free_fn = filter_trees_free;
	filter->filter_data = state;
}
/*
* A filter for list-objects to omit large blobs.
* And to OPTIONALLY collect a list of the omitted OIDs.
*/
struct filter_blobs_limit_data {
/*
* Size threshold: a blob is shown only if its size is strictly less
* than max_bytes (or if its size cannot be determined locally).
*/
unsigned long max_bytes;
};
static enum list_objects_filter_result filter_blobs_limit(
struct repository *r,
enum list_objects_filter_situation filter_situation,
struct object *obj,
const char *pathname,
const char *filename,
struct oidset *omits,
void *filter_data_)
{
struct filter_blobs_limit_data *filter_data = filter_data_;
unsigned long object_length;
enum object_type t;
switch (filter_situation) {
default:
BUG("unknown filter_situation: %d", filter_situation);
case LOFS_BEGIN_TREE:
assert(obj->type == OBJ_TREE);
/* always include all tree objects */
return LOFR_MARK_SEEN | LOFR_DO_SHOW;
case LOFS_END_TREE:
assert(obj->type == OBJ_TREE);
return LOFR_ZERO;
case LOFS_BLOB:
assert(obj->type == OBJ_BLOB);
assert((obj->flags & SEEN) == 0);
t = oid_object_info(r, &obj->oid, &object_length);
if (t != OBJ_BLOB) { /* probably OBJ_NONE */
/*
* We DO NOT have the blob locally, so we cannot
* apply the size filter criteria. Be conservative
* and force show it (and let the caller deal with
* the ambiguity).
*/
goto include_it;
}
if (object_length < filter_data->max_bytes)
goto include_it;
if (omits)
oidset_insert(omits, &obj->oid);
return LOFR_MARK_SEEN; /* but not LOFR_DO_SHOW (hard omit) */
}
include_it:
if (omits)
oidset_remove(omits, &obj->oid);
return LOFR_MARK_SEEN | LOFR_DO_SHOW;
}
/*
 * Set up a blob-size filter whose threshold comes from
 * filter_options->blob_limit_value. Its state is a single allocation, so
 * plain free() releases it.
 */
static void filter_blobs_limit__init(
	struct list_objects_filter_options *filter_options,
	struct filter *filter)
{
	struct filter_blobs_limit_data *limit = xcalloc(1, sizeof(*limit));

	limit->max_bytes = filter_options->blob_limit_value;

	filter->free_fn = free;
	filter->filter_object_fn = filter_blobs_limit;
	filter->filter_data = limit;
}
/*
* A filter driven by a sparse-checkout specification to only
* include blobs that a sparse checkout would populate.
*
* The sparse-checkout spec can be loaded from a blob with the
* given OID or from a local pathname. We allow an OID because
* the repo may be bare or we may be doing the filtering on the
* server.
*/
/*
* One frame is kept per directory level that is currently open in the
* traversal.
*/
struct frame {
/*
* default_match is the usual default include/exclude value that
* should be inherited as we recurse into directories based
* upon pattern matching of the directory itself or of a
* containing directory. It is used whenever a path's own
* pattern match comes back UNDECIDED.
*/
enum pattern_match_result default_match;
/*
* 1 if the directory (recursively) contains any provisionally
* omitted objects.
*
* 0 if everything (recursively) contained in this directory
* has been explicitly included (SHOWN) in the result and
* the directory may be short-cut later in the traversal.
*/
unsigned child_prov_omit : 1;
};
/*
* State of the sparse filter.
*/
struct filter_sparse_data {
/* Patterns that paths are matched against. */
struct pattern_list pl;
/*
* Stack of frames (grown with ALLOC_GROW): nr entries are in use,
* alloc is the allocated capacity. A frame is pushed on
* LOFS_BEGIN_TREE and popped on LOFS_END_TREE; the bottom frame is
* pushed when the filter is initialized.
*/
size_t nr, alloc;
struct frame *array_frame;
};
static enum list_objects_filter_result filter_sparse(
struct repository *r,
enum list_objects_filter_situation filter_situation,
struct object *obj,
const char *pathname,
const char *filename,
struct oidset *omits,
void *filter_data_)
{
struct filter_sparse_data *filter_data = filter_data_;
int dtype;
struct frame *frame;
enum pattern_match_result match;
switch (filter_situation) {
default:
BUG("unknown filter_situation: %d", filter_situation);
case LOFS_BEGIN_TREE:
assert(obj->type == OBJ_TREE);
dtype = DT_DIR;
match = path_matches_pattern_list(pathname, strlen(pathname),
filename, &dtype, &filter_data->pl,
r->index);
if (match == UNDECIDED)
match = filter_data->array_frame[filter_data->nr - 1].default_match;
ALLOC_GROW(filter_data->array_frame, filter_data->nr + 1,
filter_data->alloc);
filter_data->array_frame[filter_data->nr].default_match = match;
filter_data->array_frame[filter_data->nr].child_prov_omit = 0;
filter_data->nr++;
/*
* A directory with this tree OID may appear in multiple
* places in the tree. (Think of a directory move or copy,
* with no other changes, so the OID is the same, but the
* full pathnames of objects within this directory are new
* and may match is_excluded() patterns differently.)
* So we cannot mark this directory as SEEN (yet), since
* that will prevent process_tree() from revisiting this
* tree object with other pathname prefixes.
*
* Only _DO_SHOW the tree object the first time we visit
* this tree object.
*
* We always show all tree objects. A future optimization
* may want to attempt to narrow this.
*/
if (obj->flags & FILTER_SHOWN_BUT_REVISIT)
return LOFR_ZERO;
obj->flags |= FILTER_SHOWN_BUT_REVISIT;
return LOFR_DO_SHOW;
case LOFS_END_TREE:
assert(obj->type == OBJ_TREE);
assert(filter_data->nr > 1);
frame = &filter_data->array_frame[--filter_data->nr];
/*
* Tell our parent directory if any of our children were
* provisionally omitted.
*/
filter_data->array_frame[filter_data->nr - 1].child_prov_omit |=
frame->child_prov_omit;
/*
* If there are NO provisionally omitted child objects (ALL child
* objects in this folder were INCLUDED), then we can mark the
* folder as SEEN (so we will not have to revisit it again).
*/
if (!frame->child_prov_omit)
return LOFR_MARK_SEEN;
return LOFR_ZERO;
case LOFS_BLOB:
assert(obj->type == OBJ_BLOB);
assert((obj->flags & SEEN) == 0);
frame = &filter_data->array_frame[filter_data->nr - 1];
dtype = DT_REG;
match = path_matches_pattern_list(pathname, strlen(pathname),
filename, &dtype, &filter_data->pl,
r->index);
if (match == UNDECIDED)
match = frame->default_match;
if (match == MATCHED) {
if (omits)
oidset_remove(omits, &obj->oid);
return LOFR_MARK_SEEN | LOFR_DO_SHOW;
}
/*
* Provisionally omit it. We've already established that
* this pathname is not in the sparse-checkout specification
* with the CURRENT pathname, so we *WANT* to omit this blob.
*
* However, a pathname elsewhere in the tree may also
* reference this same blob, so we cannot reject it yet.
* Leave the LOFR_ bits unset so that if the blob appears
* again in the traversal, we will be asked again.
*/
if (omits)
oidset_insert(omits, &obj->oid);
/*
* Remember that at least 1 blob in this tree was
* provisionally omitted. This prevents us from short
* cutting the tree in future iterations.
*/
frame->child_prov_omit = 1;
return LOFR_ZERO;
}
}
/*
 * Release the state of a sparse filter: the loaded pattern list, the frame
 * stack, and the state struct itself. A NULL pointer is accepted and
 * ignored.
 */
static void filter_sparse_free(void *filter_data)
{
	struct filter_sparse_data *d = filter_data;

	if (!d)
		return;
	/*
	 * The patterns were loaded into d->pl when the filter was set up;
	 * without clearing the list here that memory would be leaked.
	 */
	clear_pattern_list(&d->pl);
	free(d->array_frame);
	free(d);
}
static void filter_sparse_oid__init(
struct list_objects_filter_options *filter_options,
struct filter *filter)
{
struct filter_sparse_data *d = xcalloc(1, sizeof(*d));
list-objects-filter: delay parsing of sparse oid The list-objects-filter code has two steps to its initialization: 1. parse_list_objects_filter() makes sure the spec is a filter we know about and is syntactically correct. This step is done by "rev-list" or "upload-pack" that is going to apply a filter, but also by "git clone" or "git fetch" before they send the spec across the wire. 2. list_objects_filter__init() runs the type-specific initialization (using function pointers established in step 1). This happens at the start of traverse_commit_list_filtered(), when we're about to actually use the filter. It's a good idea to parse as much as we can in step 1, in order to catch problems early (e.g., a blob size limit that isn't a number). But one thing we _shouldn't_ do is resolve any oids at that step (e.g., for sparse-file contents specified by oid). In the case of a fetch, the oid has to be resolved on the remote side. The current code does resolve the oid during the parse phase, but ignores any error (which we must do, because we might just be sending the spec across the wire). This leads to two bugs: - if we're not in a repository (e.g., because it's git-clone parsing the spec), then we trigger a BUG() trying to resolve the name - if we did hit the error case, we still have to notice that later and bail. The code path in rev-list handles this, but the one in upload-pack does not, leading to a segfault. We can fix both by moving the oid resolution into the sparse-oid init function. At that point we know we have a repository (because we're about to traverse), and handling the error there fixes the segfault. As a bonus, we can drop the NULL sparse_oid_value check in rev-list, since this is now handled in the sparse-oid-filter init function. Signed-off-by: Jeff King <peff@peff.net> Acked-by: Jeff Hostetler <jeffhost@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-09-15 18:12:44 +02:00
struct object_context oc;
struct object_id sparse_oid;
if (get_oid_with_context(the_repository,
filter_options->sparse_oid_name,
GET_OID_BLOB, &sparse_oid, &oc))
die(_("unable to access sparse blob in '%s'"),
list-objects-filter: delay parsing of sparse oid The list-objects-filter code has two steps to its initialization: 1. parse_list_objects_filter() makes sure the spec is a filter we know about and is syntactically correct. This step is done by "rev-list" or "upload-pack" that is going to apply a filter, but also by "git clone" or "git fetch" before they send the spec across the wire. 2. list_objects_filter__init() runs the type-specific initialization (using function pointers established in step 1). This happens at the start of traverse_commit_list_filtered(), when we're about to actually use the filter. It's a good idea to parse as much as we can in step 1, in order to catch problems early (e.g., a blob size limit that isn't a number). But one thing we _shouldn't_ do is resolve any oids at that step (e.g., for sparse-file contents specified by oid). In the case of a fetch, the oid has to be resolved on the remote side. The current code does resolve the oid during the parse phase, but ignores any error (which we must do, because we might just be sending the spec across the wire). This leads to two bugs: - if we're not in a repository (e.g., because it's git-clone parsing the spec), then we trigger a BUG() trying to resolve the name - if we did hit the error case, we still have to notice that later and bail. The code path in rev-list handles this, but the one in upload-pack does not, leading to a segfault. We can fix both by moving the oid resolution into the sparse-oid init function. At that point we know we have a repository (because we're about to traverse), and handling the error there fixes the segfault. As a bonus, we can drop the NULL sparse_oid_value check in rev-list, since this is now handled in the sparse-oid-filter init function. Signed-off-by: Jeff King <peff@peff.net> Acked-by: Jeff Hostetler <jeffhost@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-09-15 18:12:44 +02:00
filter_options->sparse_oid_name);
if (add_patterns_from_blob_to_list(&sparse_oid, "", 0, &d->pl) < 0)
die(_("unable to parse sparse filter data in %s"),
oid_to_hex(&sparse_oid));
ALLOC_GROW(d->array_frame, d->nr + 1, d->alloc);
d->array_frame[d->nr].default_match = 0; /* default to include */
d->array_frame[d->nr].child_prov_omit = 0;
d->nr++;
filter->filter_data = d;
filter->filter_object_fn = filter_sparse;
filter->free_fn = filter_sparse_free;
}
/* A filter which only shows objects shown by all sub-filters. */
struct combine_filter_data {
/* Array of nr sub-filters together with their per-filter state. */
struct subfilter *sub;
/* Number of entries in sub. */
size_t nr;
};
static enum list_objects_filter_result process_subfilter(
struct repository *r,
enum list_objects_filter_situation filter_situation,
struct object *obj,
const char *pathname,
const char *filename,
struct subfilter *sub)
{
enum list_objects_filter_result result;
/*
* Check and update is_skipping_tree before oidset_contains so
* that is_skipping_tree gets unset even when the object is
* marked as seen. As of this writing, no filter uses
* LOFR_MARK_SEEN on trees that also uses LOFR_SKIP_TREE, so the
* ordering is only theoretically important. Be cautious if you
* change the order of the below checks and more filters have
* been added!
*/
if (sub->is_skipping_tree) {
if (filter_situation == LOFS_END_TREE &&
oideq(&obj->oid, &sub->skip_tree))
sub->is_skipping_tree = 0;
else
return LOFR_ZERO;
}
if (oidset_contains(&sub->seen, &obj->oid))
return LOFR_ZERO;
result = list_objects_filter__filter_object(
r, filter_situation, obj, pathname, filename, sub->filter);
if (result & LOFR_MARK_SEEN)
oidset_insert(&sub->seen, &obj->oid);
if (result & LOFR_SKIP_TREE) {
sub->is_skipping_tree = 1;
sub->skip_tree = obj->oid;
}
return result;
}
static enum list_objects_filter_result filter_combine(
struct repository *r,
enum list_objects_filter_situation filter_situation,
struct object *obj,
const char *pathname,
const char *filename,
struct oidset *omits,
void *filter_data)
{
struct combine_filter_data *d = filter_data;
enum list_objects_filter_result combined_result =
LOFR_DO_SHOW | LOFR_MARK_SEEN | LOFR_SKIP_TREE;
size_t sub;
for (sub = 0; sub < d->nr; sub++) {
enum list_objects_filter_result sub_result = process_subfilter(
r, filter_situation, obj, pathname, filename,
&d->sub[sub]);
if (!(sub_result & LOFR_DO_SHOW))
combined_result &= ~LOFR_DO_SHOW;
if (!(sub_result & LOFR_MARK_SEEN))
combined_result &= ~LOFR_MARK_SEEN;
if (!d->sub[sub].is_skipping_tree)
combined_result &= ~LOFR_SKIP_TREE;
}
return combined_result;
}
/*
 * Release the state of a combine filter: every sub-filter and its SEEN
 * set, the sub-filter array, and the state struct itself.
 *
 * The per-sub-filter omits sets are expected to have been emptied already
 * (by the finalize-omits step); a non-empty one is reported as a BUG.
 */
static void filter_combine__free(void *filter_data)
{
	struct combine_filter_data *d = filter_data;
	size_t sub;

	for (sub = 0; sub < d->nr; sub++) {
		list_objects_filter__free(d->sub[sub].filter);
		oidset_clear(&d->sub[sub].seen);
		if (d->sub[sub].omits.set.size)
			BUG("expected oidset to be cleared already");
	}
	free(d->sub);
	/* The state struct is heap-allocated too and must not be leaked. */
	free(d);
}
static void add_all(struct oidset *dest, struct oidset *src) {
struct oidset_iter iter;
struct object_id *src_oid;
oidset_iter_init(src, &iter);
while ((src_oid = oidset_iter_next(&iter)) != NULL)
oidset_insert(dest, src_oid);
}
/*
 * Fold the omits set of every sub-filter into the combined omits set, and
 * empty each per-sub-filter set afterwards.
 */
static void filter_combine__finalize_omits(
	struct oidset *omits,
	void *filter_data)
{
	struct combine_filter_data *combo = filter_data;
	size_t i;

	for (i = 0; i < combo->nr; i++) {
		struct oidset *partial = &combo->sub[i].omits;

		add_all(omits, partial);
		oidset_clear(partial);
	}
}
static void filter_combine__init(
struct list_objects_filter_options *filter_options,
struct filter* filter)
{
struct combine_filter_data *d = xcalloc(1, sizeof(*d));
size_t sub;
d->nr = filter_options->sub_nr;
d->sub = xcalloc(d->nr, sizeof(*d->sub));
for (sub = 0; sub < d->nr; sub++)
d->sub[sub].filter = list_objects_filter__init(
filter->omits ? &d->sub[sub].omits : NULL,
&filter_options->sub[sub]);
filter->filter_data = d;
filter->filter_object_fn = filter_combine;
filter->free_fn = filter_combine__free;
filter->finalize_omits_fn = filter_combine__finalize_omits;
}
/*
* Signature of a per-filter-type initializer: it fills in the callbacks
* (and, where needed, filter_data) of an already allocated filter.
*/
typedef void (*filter_init_fn)(
struct list_objects_filter_options *filter_options,
struct filter *filter);
/*
* Initializers indexed by filter choice.
* Must match "enum list_objects_filter_choice".
* A NULL entry means that no filter is created for that choice.
*/
static filter_init_fn s_filters[] = {
NULL,
filter_blobs_none__init,
filter_blobs_limit__init,
filter_trees_depth__init,
filter_sparse_oid__init,
filter_combine__init,
};
struct filter *list_objects_filter__init(
struct oidset *omitted,
struct list_objects_filter_options *filter_options)
{
struct filter *filter;
filter_init_fn init_fn;
assert((sizeof(s_filters) / sizeof(s_filters[0])) == LOFC__COUNT);
if (!filter_options)
return NULL;
if (filter_options->choice >= LOFC__COUNT)
BUG("invalid list-objects filter choice: %d",
filter_options->choice);
init_fn = s_filters[filter_options->choice];
if (!init_fn)
return NULL;
filter = xcalloc(1, sizeof(*filter));
filter->omits = omitted;
init_fn(filter_options, filter);
return filter;
}
enum list_objects_filter_result list_objects_filter__filter_object(
struct repository *r,
enum list_objects_filter_situation filter_situation,
struct object *obj,
const char *pathname,
const char *filename,
struct filter *filter)
{
if (filter && (obj->flags & NOT_USER_GIVEN))
return filter->filter_object_fn(r, filter_situation, obj,
pathname, filename,
filter->omits,
filter->filter_data);
/*
* No filter is active or user gave object explicitly. In this case,
* always show the object (except when LOFS_END_TREE, since this tree
* had already been shown when LOFS_BEGIN_TREE).
*/
if (filter_situation == LOFS_END_TREE)
return 0;
return LOFR_MARK_SEEN | LOFR_DO_SHOW;
}
/*
 * Destroy a filter created by list_objects_filter__init(). NULL is
 * accepted and ignored.
 *
 * If the filter collects omits and supplies a finalize callback, that
 * callback runs first, before the filter's private data is released.
 */
void list_objects_filter__free(struct filter *filter)
{
	if (filter == NULL)
		return;

	if (filter->omits && filter->finalize_omits_fn)
		filter->finalize_omits_fn(filter->omits, filter->filter_data);

	filter->free_fn(filter->filter_data);
	free(filter);
}