Merge branch 'gs/commit-graph-path-filter'
Introduce an extension to the commit-graph to make it efficient to check for the paths that were modified at each commit using Bloom filters. * gs/commit-graph-path-filter: bloom: ignore renames when computing changed paths commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag t4216: add end to end tests for git log with Bloom filters revision.c: add trace2 stats around Bloom filter usage revision.c: use Bloom filters to speed up path based revision walks commit-graph: add --changed-paths option to write subcommand commit-graph: reuse existing Bloom filters during write commit-graph: write Bloom filters to commit graph file commit-graph: examine commits by generation number commit-graph: examine changed-path objects in pack order commit-graph: compute Bloom filters for changed paths diff: halt tree-diff early after max_changes bloom.c: core Bloom filter implementation for changed paths. bloom.c: introduce core Bloom filter constructs bloom.c: add the murmur3 hash implementation commit-graph: define and use MAX_NUM_CHUNKS
This commit is contained in:
commit
9b6606f43d
@ -57,6 +57,11 @@ or `--stdin-packs`.)
|
||||
With the `--append` option, include all commits that are present in the
|
||||
existing commit-graph file.
|
||||
+
|
||||
With the `--changed-paths` option, compute and write information about the
|
||||
paths changed between a commit and its first parent. This operation can
|
||||
take a while on large repositories. It provides significant performance gains
|
||||
for getting history of a directory or a file with `git log -- <path>`.
|
||||
+
|
||||
With the `--split[=<strategy>]` option, write the commit-graph as a
|
||||
chain of multiple commit-graph files stored in
|
||||
`<dir>/info/commit-graphs`. Commit-graph layers are merged based on the
|
||||
|
@ -17,6 +17,9 @@ metadata, including:
|
||||
- The parents of the commit, stored using positional references within
|
||||
the graph file.
|
||||
|
||||
- The Bloom filter of the commit carrying the paths that were changed between
|
||||
the commit and its first parent, if requested.
|
||||
|
||||
These positional references are stored as unsigned 32-bit integers
|
||||
corresponding to the array position within the list of commit OIDs. Due
|
||||
to some special constants we use to track parents, we can store at most
|
||||
@ -93,6 +96,33 @@ CHUNK DATA:
|
||||
positions for the parents until reaching a value with the most-significant
|
||||
bit on. The other bits correspond to the position of the last parent.
|
||||
|
||||
Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional]
|
||||
* The ith entry, BIDX[i], stores the number of 8-bit word blocks in all
|
||||
Bloom filters from commit 0 to commit i (inclusive) in lexicographic
|
||||
order. The Bloom filter for the i-th commit spans from BIDX[i-1] to
|
||||
BIDX[i] (plus header length), where BIDX[-1] is 0.
|
||||
* The BIDX chunk is ignored if the BDAT chunk is not present.
|
||||
|
||||
Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
|
||||
* It starts with a header consisting of three unsigned 32-bit integers:
|
||||
- Version of the hash algorithm being used. We currently only support
|
||||
value 1 which corresponds to the 32-bit version of the murmur3 hash
|
||||
implemented exactly as described in
|
||||
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
|
||||
hashing technique using seed values 0x293ae76f and 0x7e646e2c as
|
||||
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
|
||||
in Probabilistic Verification"
|
||||
- The number of times a path is hashed and hence the number of bit positions
|
||||
that cumulatively determine whether a file is present in the commit.
|
||||
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter
|
||||
contains 'n' entries, then the filter size is the minimum number of 8-bit
|
||||
words that contain n*b bits.
|
||||
* The rest of the chunk is the concatenation of all the computed Bloom
|
||||
filters for the commits in lexicographic order.
|
||||
* Note: Commits with no changes or more than 512 changes have Bloom filters
|
||||
of length zero.
|
||||
* The BDAT chunk is present if and only if BIDX is present.
|
||||
|
||||
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]
|
||||
This list of H-byte hashes describes a set of B commit-graph files that
|
||||
form a commit-graph chain. The graph position for the ith commit in this
|
||||
|
2
Makefile
2
Makefile
@ -689,6 +689,7 @@ X =
|
||||
PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
|
||||
|
||||
TEST_BUILTINS_OBJS += test-advise.o
|
||||
TEST_BUILTINS_OBJS += test-bloom.o
|
||||
TEST_BUILTINS_OBJS += test-chmtime.o
|
||||
TEST_BUILTINS_OBJS += test-config.o
|
||||
TEST_BUILTINS_OBJS += test-ctype.o
|
||||
@ -834,6 +835,7 @@ LIB_OBJS += base85.o
|
||||
LIB_OBJS += bisect.o
|
||||
LIB_OBJS += blame.o
|
||||
LIB_OBJS += blob.o
|
||||
LIB_OBJS += bloom.o
|
||||
LIB_OBJS += branch.o
|
||||
LIB_OBJS += bulk-checkin.o
|
||||
LIB_OBJS += bundle.o
|
||||
|
276
bloom.c
Normal file
276
bloom.c
Normal file
@ -0,0 +1,276 @@
|
||||
#include "git-compat-util.h"
|
||||
#include "bloom.h"
|
||||
#include "diff.h"
|
||||
#include "diffcore.h"
|
||||
#include "revision.h"
|
||||
#include "hashmap.h"
|
||||
#include "commit-graph.h"
|
||||
#include "commit.h"
|
||||
|
||||
define_commit_slab(bloom_filter_slab, struct bloom_filter);
|
||||
|
||||
struct bloom_filter_slab bloom_filters;
|
||||
|
||||
struct pathmap_hash_entry {
|
||||
struct hashmap_entry entry;
|
||||
const char path[FLEX_ARRAY];
|
||||
};
|
||||
|
||||
/*
 * Rotate the 32-bit 'value' left by 'count' bit positions.
 *
 * Only the low five bits of 'count' are used, so the effective rotation
 * is always in the range 0..31 and neither shift below can reach the
 * full width of the type.
 */
static uint32_t rotate_left(uint32_t value, int32_t count)
{
	const uint32_t width_mask = 8 * sizeof(uint32_t) - 1;
	const uint32_t left = (uint32_t)count & width_mask;
	/* Complementary shift; evaluates to 0 when 'left' is 0. */
	const uint32_t right = (0u - left) & width_mask;

	return (value << left) | (value >> right);
}
|
||||
|
||||
static inline unsigned char get_bitmask(uint32_t pos)
|
||||
{
|
||||
return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
|
||||
}
|
||||
|
||||
static int load_bloom_filter_from_graph(struct commit_graph *g,
|
||||
struct bloom_filter *filter,
|
||||
struct commit *c)
|
||||
{
|
||||
uint32_t lex_pos, start_index, end_index;
|
||||
|
||||
while (c->graph_pos < g->num_commits_in_base)
|
||||
g = g->base_graph;
|
||||
|
||||
/* The commit graph commit 'c' lives in doesn't carry bloom filters. */
|
||||
if (!g->chunk_bloom_indexes)
|
||||
return 0;
|
||||
|
||||
lex_pos = c->graph_pos - g->num_commits_in_base;
|
||||
|
||||
end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
|
||||
|
||||
if (lex_pos > 0)
|
||||
start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
|
||||
else
|
||||
start_index = 0;
|
||||
|
||||
filter->len = end_index - start_index;
|
||||
filter->data = (unsigned char *)(g->chunk_bloom_data +
|
||||
sizeof(unsigned char) * start_index +
|
||||
BLOOMDATA_CHUNK_HEADER_SIZE);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 *
 * 'data' points to 'len' bytes of input. The bytes are always read as
 * unsigned octets: reading them through plain 'char' would sign-extend
 * every byte >= 0x80 on platforms where 'char' is signed, smearing one
 * bits over the rest of the word and producing a hash that differs
 * between platforms and from the reference algorithm.
 *
 * Example: with seed 0, the seven bytes 99 aa bb cc dd ee ff hash to
 * 0xa183ccfd.
 */
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	const unsigned char *bytes = (const unsigned char *)data;
	/* Number of complete 4-byte blocks; size_t so long inputs don't truncate. */
	const size_t len4 = len / sizeof(uint32_t);
	const unsigned char *tail = bytes + len4 * sizeof(uint32_t);
	uint32_t k1 = 0;
	size_t i;

	for (i = 0; i < len4; i++) {
		/* Assemble the block as a little-endian 32-bit word. */
		uint32_t byte1 = (uint32_t)bytes[4*i];
		uint32_t byte2 = ((uint32_t)bytes[4*i + 1]) << 8;
		uint32_t byte3 = ((uint32_t)bytes[4*i + 2]) << 16;
		uint32_t byte4 = ((uint32_t)bytes[4*i + 3]) << 24;
		uint32_t k = byte1 | byte2 | byte3 | byte4;

		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Mix in the 1-3 bytes that did not fill a whole block. */
	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)tail[2]) << 16;
		/*-fallthrough*/
	case 2:
		k1 ^= ((uint32_t)tail[1]) << 8;
		/*-fallthrough*/
	case 1:
		k1 ^= ((uint32_t)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Finalization: fold in the length, then avalanche the bits. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
|
||||
|
||||
void fill_bloom_key(const char *data,
|
||||
size_t len,
|
||||
struct bloom_key *key,
|
||||
const struct bloom_filter_settings *settings)
|
||||
{
|
||||
int i;
|
||||
const uint32_t seed0 = 0x293ae76f;
|
||||
const uint32_t seed1 = 0x7e646e2c;
|
||||
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
|
||||
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
|
||||
|
||||
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
|
||||
for (i = 0; i < settings->num_hashes; i++)
|
||||
key->hashes[i] = hash0 + i * hash1;
|
||||
}
|
||||
|
||||
void add_key_to_filter(const struct bloom_key *key,
|
||||
struct bloom_filter *filter,
|
||||
const struct bloom_filter_settings *settings)
|
||||
{
|
||||
int i;
|
||||
uint64_t mod = filter->len * BITS_PER_WORD;
|
||||
|
||||
for (i = 0; i < settings->num_hashes; i++) {
|
||||
uint64_t hash_mod = key->hashes[i] % mod;
|
||||
uint64_t block_pos = hash_mod / BITS_PER_WORD;
|
||||
|
||||
filter->data[block_pos] |= get_bitmask(hash_mod);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Initialize the file-global 'bloom_filters' commit slab.
 *
 * get_bloom_filter() returns NULL for as long as that slab's slab_size
 * is 0, so this is expected to run before any filter is requested.
 * NOTE(review): presumably init_bloom_filter_slab() is what makes
 * slab_size non-zero -- confirm against commit-slab.
 */
void init_bloom_filters(void)
|
||||
{
|
||||
init_bloom_filter_slab(&bloom_filters);
|
||||
}
|
||||
|
||||
struct bloom_filter *get_bloom_filter(struct repository *r,
|
||||
struct commit *c,
|
||||
int compute_if_not_present)
|
||||
{
|
||||
struct bloom_filter *filter;
|
||||
struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
|
||||
int i;
|
||||
struct diff_options diffopt;
|
||||
int max_changes = 512;
|
||||
|
||||
if (bloom_filters.slab_size == 0)
|
||||
return NULL;
|
||||
|
||||
filter = bloom_filter_slab_at(&bloom_filters, c);
|
||||
|
||||
if (!filter->data) {
|
||||
load_commit_graph_info(r, c);
|
||||
if (c->graph_pos != COMMIT_NOT_FROM_GRAPH &&
|
||||
r->objects->commit_graph->chunk_bloom_indexes) {
|
||||
if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
|
||||
return filter;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (filter->data || !compute_if_not_present)
|
||||
return filter;
|
||||
|
||||
repo_diff_setup(r, &diffopt);
|
||||
diffopt.flags.recursive = 1;
|
||||
diffopt.detect_rename = 0;
|
||||
diffopt.max_changes = max_changes;
|
||||
diff_setup_done(&diffopt);
|
||||
|
||||
if (c->parents)
|
||||
diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
|
||||
else
|
||||
diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
|
||||
diffcore_std(&diffopt);
|
||||
|
||||
if (diff_queued_diff.nr <= max_changes) {
|
||||
struct hashmap pathmap;
|
||||
struct pathmap_hash_entry *e;
|
||||
struct hashmap_iter iter;
|
||||
hashmap_init(&pathmap, NULL, NULL, 0);
|
||||
|
||||
for (i = 0; i < diff_queued_diff.nr; i++) {
|
||||
const char *path = diff_queued_diff.queue[i]->two->path;
|
||||
|
||||
/*
|
||||
* Add each leading directory of the changed file, i.e. for
|
||||
* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
|
||||
* the Bloom filter could be used to speed up commands like
|
||||
* 'git log dir/subdir', too.
|
||||
*
|
||||
* Note that directories are added without the trailing '/'.
|
||||
*/
|
||||
do {
|
||||
char *last_slash = strrchr(path, '/');
|
||||
|
||||
FLEX_ALLOC_STR(e, path, path);
|
||||
hashmap_entry_init(&e->entry, strhash(path));
|
||||
hashmap_add(&pathmap, &e->entry);
|
||||
|
||||
if (!last_slash)
|
||||
last_slash = (char*)path;
|
||||
*last_slash = '\0';
|
||||
|
||||
} while (*path);
|
||||
|
||||
diff_free_filepair(diff_queued_diff.queue[i]);
|
||||
}
|
||||
|
||||
filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
|
||||
filter->data = xcalloc(filter->len, sizeof(unsigned char));
|
||||
|
||||
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
|
||||
struct bloom_key key;
|
||||
fill_bloom_key(e->path, strlen(e->path), &key, &settings);
|
||||
add_key_to_filter(&key, filter, &settings);
|
||||
}
|
||||
|
||||
hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
|
||||
} else {
|
||||
for (i = 0; i < diff_queued_diff.nr; i++)
|
||||
diff_free_filepair(diff_queued_diff.queue[i]);
|
||||
filter->data = NULL;
|
||||
filter->len = 0;
|
||||
}
|
||||
|
||||
free(diff_queued_diff.queue);
|
||||
DIFF_QUEUE_CLEAR(&diff_queued_diff);
|
||||
|
||||
return filter;
|
||||
}
|
||||
|
||||
int bloom_filter_contains(const struct bloom_filter *filter,
|
||||
const struct bloom_key *key,
|
||||
const struct bloom_filter_settings *settings)
|
||||
{
|
||||
int i;
|
||||
uint64_t mod = filter->len * BITS_PER_WORD;
|
||||
|
||||
if (!mod)
|
||||
return -1;
|
||||
|
||||
for (i = 0; i < settings->num_hashes; i++) {
|
||||
uint64_t hash_mod = key->hashes[i] % mod;
|
||||
uint64_t block_pos = hash_mod / BITS_PER_WORD;
|
||||
if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
90
bloom.h
Normal file
90
bloom.h
Normal file
@ -0,0 +1,90 @@
|
||||
#ifndef BLOOM_H
|
||||
#define BLOOM_H
|
||||
|
||||
struct commit;
|
||||
struct repository;
|
||||
|
||||
struct bloom_filter_settings {
|
||||
/*
|
||||
* The version of the hashing technique being used.
|
||||
* We currently only support version = 1 which is
|
||||
* the seeded murmur3 hashing technique implemented
|
||||
* in bloom.c.
|
||||
*/
|
||||
uint32_t hash_version;
|
||||
|
||||
/*
|
||||
* The number of times a path is hashed, i.e. the
|
||||
* number of bit positions that cumulatively
|
||||
* determine whether a path is present in the
|
||||
* Bloom filter.
|
||||
*/
|
||||
uint32_t num_hashes;
|
||||
|
||||
/*
|
||||
* The minimum number of bits per entry in the Bloom
|
||||
* filter. If the filter contains 'n' entries, then
|
||||
* filter size is the minimum number of 8-bit words
|
||||
* that contain n*b bits.
|
||||
*/
|
||||
uint32_t bits_per_entry;
|
||||
};
|
||||
|
||||
/*
 * Initializer for struct bloom_filter_settings, in field order:
 * hash_version 1, num_hashes 7, bits_per_entry 10.
 */
#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
/* Number of bits in one filter word (filter data is unsigned char). */
#define BITS_PER_WORD 8
/*
 * Size in bytes of the three 32-bit integers that start the Bloom data
 * chunk. Parenthesized so the macro expands as a single operand inside
 * any surrounding expression (e.g. division or modulo).
 */
#define BLOOMDATA_CHUNK_HEADER_SIZE (3 * sizeof(uint32_t))
|
||||
|
||||
/*
|
||||
* A bloom_filter struct represents a data segment to
|
||||
* use when testing hash values. The 'len' member
|
||||
* dictates how many entries are stored in
|
||||
* 'data'.
|
||||
*/
|
||||
struct bloom_filter {
|
||||
unsigned char *data;
|
||||
size_t len;
|
||||
};
|
||||
|
||||
/*
|
||||
* A bloom_key represents the k hash values for a
|
||||
* given string. These can be precomputed and
|
||||
* stored in a bloom_key for re-use when testing
|
||||
* against a bloom_filter. The number of hashes is
|
||||
* given by the Bloom filter settings and is the same
|
||||
* for all Bloom filters and keys interacting with
|
||||
* the loaded version of the commit graph file and
|
||||
* the Bloom data chunks.
|
||||
*/
|
||||
struct bloom_key {
|
||||
uint32_t *hashes;
|
||||
};
|
||||
|
||||
/*
|
||||
* Calculate the murmur3 32-bit hash value for the given data
|
||||
* using the given seed.
|
||||
* Produces a uniformly distributed hash value.
|
||||
* Not considered to be cryptographically secure.
|
||||
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
|
||||
*/
|
||||
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
|
||||
|
||||
void fill_bloom_key(const char *data,
|
||||
size_t len,
|
||||
struct bloom_key *key,
|
||||
const struct bloom_filter_settings *settings);
|
||||
|
||||
void add_key_to_filter(const struct bloom_key *key,
|
||||
struct bloom_filter *filter,
|
||||
const struct bloom_filter_settings *settings);
|
||||
|
||||
void init_bloom_filters(void);
|
||||
|
||||
struct bloom_filter *get_bloom_filter(struct repository *r,
|
||||
struct commit *c,
|
||||
int compute_if_not_present);
|
||||
|
||||
int bloom_filter_contains(const struct bloom_filter *filter,
|
||||
const struct bloom_key *key,
|
||||
const struct bloom_filter_settings *settings);
|
||||
|
||||
#endif
|
@ -11,7 +11,7 @@ static char const * const builtin_commit_graph_usage[] = {
|
||||
N_("git commit-graph verify [--object-dir <objdir>] [--shallow] [--[no-]progress]"),
|
||||
N_("git commit-graph write [--object-dir <objdir>] [--append] "
|
||||
"[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
|
||||
"[--[no-]progress] <split options>"),
|
||||
"[--changed-paths] [--[no-]progress] <split options>"),
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -23,7 +23,7 @@ static const char * const builtin_commit_graph_verify_usage[] = {
|
||||
static const char * const builtin_commit_graph_write_usage[] = {
|
||||
N_("git commit-graph write [--object-dir <objdir>] [--append] "
|
||||
"[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
|
||||
"[--[no-]progress] <split options>"),
|
||||
"[--changed-paths] [--[no-]progress] <split options>"),
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -36,6 +36,7 @@ static struct opts_commit_graph {
|
||||
int split;
|
||||
int shallow;
|
||||
int progress;
|
||||
int enable_changed_paths;
|
||||
} opts;
|
||||
|
||||
static struct object_directory *find_odb(struct repository *r,
|
||||
@ -158,6 +159,8 @@ static int graph_write(int argc, const char **argv)
|
||||
N_("start walk at commits listed by stdin")),
|
||||
OPT_BOOL(0, "append", &opts.append,
|
||||
N_("include all commits already in the commit-graph file")),
|
||||
OPT_BOOL(0, "changed-paths", &opts.enable_changed_paths,
|
||||
N_("enable computation for changed paths")),
|
||||
OPT_BOOL(0, "progress", &opts.progress, N_("force progress reporting")),
|
||||
OPT_CALLBACK_F(0, "split", &split_opts.flags, NULL,
|
||||
N_("allow writing an incremental commit-graph file"),
|
||||
@ -193,6 +196,9 @@ static int graph_write(int argc, const char **argv)
|
||||
flags |= COMMIT_GRAPH_WRITE_SPLIT;
|
||||
if (opts.progress)
|
||||
flags |= COMMIT_GRAPH_WRITE_PROGRESS;
|
||||
if (opts.enable_changed_paths ||
|
||||
git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0))
|
||||
flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS;
|
||||
|
||||
read_replace_refs = 0;
|
||||
odb = find_odb(the_repository, opts.obj_dir);
|
||||
|
@ -19,6 +19,7 @@ linux-gcc)
|
||||
export GIT_TEST_OE_SIZE=10
|
||||
export GIT_TEST_OE_DELTA_SIZE=5
|
||||
export GIT_TEST_COMMIT_GRAPH=1
|
||||
export GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1
|
||||
export GIT_TEST_MULTI_PACK_INDEX=1
|
||||
export GIT_TEST_ADD_I_USE_BUILTIN=1
|
||||
make test
|
||||
|
213
commit-graph.c
213
commit-graph.c
@ -16,13 +16,18 @@
|
||||
#include "hashmap.h"
|
||||
#include "replace-object.h"
|
||||
#include "progress.h"
|
||||
#include "bloom.h"
|
||||
#include "commit-slab.h"
|
||||
|
||||
#define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
|
||||
#define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
|
||||
#define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
|
||||
#define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
|
||||
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
|
||||
#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
|
||||
#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
|
||||
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
|
||||
#define MAX_NUM_CHUNKS 7
|
||||
|
||||
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
|
||||
|
||||
@ -44,9 +49,51 @@
|
||||
/* Remember to update object flag allocation in object.h */
|
||||
#define REACHABLE (1u<<15)
|
||||
|
||||
char *get_commit_graph_filename(struct object_directory *odb)
|
||||
/* Keep track of the order in which commits are added to our list. */
|
||||
define_commit_slab(commit_pos, int);
|
||||
static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
|
||||
|
||||
static void set_commit_pos(struct repository *r, const struct object_id *oid)
|
||||
{
|
||||
return xstrfmt("%s/info/commit-graph", odb->path);
|
||||
static int32_t max_pos;
|
||||
struct commit *commit = lookup_commit(r, oid);
|
||||
|
||||
if (!commit)
|
||||
return; /* should never happen, but be lenient */
|
||||
|
||||
*commit_pos_at(&commit_pos, commit) = max_pos++;
|
||||
}
|
||||
|
||||
static int commit_pos_cmp(const void *va, const void *vb)
|
||||
{
|
||||
const struct commit *a = *(const struct commit **)va;
|
||||
const struct commit *b = *(const struct commit **)vb;
|
||||
return commit_pos_at(&commit_pos, a) -
|
||||
commit_pos_at(&commit_pos, b);
|
||||
}
|
||||
|
||||
static int commit_gen_cmp(const void *va, const void *vb)
|
||||
{
|
||||
const struct commit *a = *(const struct commit **)va;
|
||||
const struct commit *b = *(const struct commit **)vb;
|
||||
|
||||
/* lower generation commits first */
|
||||
if (a->generation < b->generation)
|
||||
return -1;
|
||||
else if (a->generation > b->generation)
|
||||
return 1;
|
||||
|
||||
/* use date as a heuristic when generations are equal */
|
||||
if (a->date < b->date)
|
||||
return -1;
|
||||
else if (a->date > b->date)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Build the path of the commit-graph file for an object directory:
 * "<obj_dir->path>/info/commit-graph", formatted with xstrfmt().
 * NOTE(review): presumably the result is newly allocated and owned by
 * the caller -- confirm against xstrfmt()'s contract.
 */
char *get_commit_graph_filename(struct object_directory *obj_dir)
|
||||
{
|
||||
return xstrfmt("%s/info/commit-graph", obj_dir->path);
|
||||
}
|
||||
|
||||
static char *get_split_graph_filename(struct object_directory *odb,
|
||||
@ -270,6 +317,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, size_t graph_size)
|
||||
chunk_repeated = 1;
|
||||
else
|
||||
graph->chunk_base_graphs = data + chunk_offset;
|
||||
break;
|
||||
|
||||
case GRAPH_CHUNKID_BLOOMINDEXES:
|
||||
if (graph->chunk_bloom_indexes)
|
||||
chunk_repeated = 1;
|
||||
else
|
||||
graph->chunk_bloom_indexes = data + chunk_offset;
|
||||
break;
|
||||
|
||||
case GRAPH_CHUNKID_BLOOMDATA:
|
||||
if (graph->chunk_bloom_data)
|
||||
chunk_repeated = 1;
|
||||
else {
|
||||
uint32_t hash_version;
|
||||
graph->chunk_bloom_data = data + chunk_offset;
|
||||
hash_version = get_be32(data + chunk_offset);
|
||||
|
||||
if (hash_version != 1)
|
||||
break;
|
||||
|
||||
graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
|
||||
graph->bloom_filter_settings->hash_version = hash_version;
|
||||
graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
|
||||
graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (chunk_repeated) {
|
||||
@ -288,6 +361,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, size_t graph_size)
|
||||
last_chunk_offset = chunk_offset;
|
||||
}
|
||||
|
||||
if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
|
||||
init_bloom_filters();
|
||||
} else {
|
||||
/* We need both the bloom chunks to exist together. Else ignore the data */
|
||||
graph->chunk_bloom_indexes = NULL;
|
||||
graph->chunk_bloom_data = NULL;
|
||||
graph->bloom_filter_settings = NULL;
|
||||
}
|
||||
|
||||
hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
|
||||
|
||||
if (verify_commit_graph_lite(graph)) {
|
||||
@ -784,9 +866,12 @@ struct write_commit_graph_context {
|
||||
unsigned append:1,
|
||||
report_progress:1,
|
||||
split:1,
|
||||
check_oids:1;
|
||||
check_oids:1,
|
||||
changed_paths:1,
|
||||
order_by_pack:1;
|
||||
|
||||
const struct split_commit_graph_opts *split_opts;
|
||||
size_t total_bloom_filter_data_size;
|
||||
};
|
||||
|
||||
static void write_graph_chunk_fanout(struct hashfile *f,
|
||||
@ -982,6 +1067,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
|
||||
}
|
||||
}
|
||||
|
||||
static void write_graph_chunk_bloom_indexes(struct hashfile *f,
|
||||
struct write_commit_graph_context *ctx)
|
||||
{
|
||||
struct commit **list = ctx->commits.list;
|
||||
struct commit **last = ctx->commits.list + ctx->commits.nr;
|
||||
uint32_t cur_pos = 0;
|
||||
struct progress *progress = NULL;
|
||||
int i = 0;
|
||||
|
||||
if (ctx->report_progress)
|
||||
progress = start_delayed_progress(
|
||||
_("Writing changed paths Bloom filters index"),
|
||||
ctx->commits.nr);
|
||||
|
||||
while (list < last) {
|
||||
struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
|
||||
cur_pos += filter->len;
|
||||
display_progress(progress, ++i);
|
||||
hashwrite_be32(f, cur_pos);
|
||||
list++;
|
||||
}
|
||||
|
||||
stop_progress(&progress);
|
||||
}
|
||||
|
||||
static void write_graph_chunk_bloom_data(struct hashfile *f,
|
||||
struct write_commit_graph_context *ctx,
|
||||
const struct bloom_filter_settings *settings)
|
||||
{
|
||||
struct commit **list = ctx->commits.list;
|
||||
struct commit **last = ctx->commits.list + ctx->commits.nr;
|
||||
struct progress *progress = NULL;
|
||||
int i = 0;
|
||||
|
||||
if (ctx->report_progress)
|
||||
progress = start_delayed_progress(
|
||||
_("Writing changed paths Bloom filters data"),
|
||||
ctx->commits.nr);
|
||||
|
||||
hashwrite_be32(f, settings->hash_version);
|
||||
hashwrite_be32(f, settings->num_hashes);
|
||||
hashwrite_be32(f, settings->bits_per_entry);
|
||||
|
||||
while (list < last) {
|
||||
struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
|
||||
display_progress(progress, ++i);
|
||||
hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
|
||||
list++;
|
||||
}
|
||||
|
||||
stop_progress(&progress);
|
||||
}
|
||||
|
||||
static int oid_compare(const void *_a, const void *_b)
|
||||
{
|
||||
const struct object_id *a = (const struct object_id *)_a;
|
||||
@ -1013,6 +1151,8 @@ static int add_packed_commits(const struct object_id *oid,
|
||||
oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
|
||||
ctx->oids.nr++;
|
||||
|
||||
set_commit_pos(ctx->r, oid);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1132,6 +1272,38 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
|
||||
stop_progress(&ctx->progress);
|
||||
}
|
||||
|
||||
static void compute_bloom_filters(struct write_commit_graph_context *ctx)
|
||||
{
|
||||
int i;
|
||||
struct progress *progress = NULL;
|
||||
struct commit **sorted_commits;
|
||||
|
||||
init_bloom_filters();
|
||||
|
||||
if (ctx->report_progress)
|
||||
progress = start_delayed_progress(
|
||||
_("Computing commit changed paths Bloom filters"),
|
||||
ctx->commits.nr);
|
||||
|
||||
ALLOC_ARRAY(sorted_commits, ctx->commits.nr);
|
||||
COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr);
|
||||
|
||||
if (ctx->order_by_pack)
|
||||
QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp);
|
||||
else
|
||||
QSORT(sorted_commits, ctx->commits.nr, commit_gen_cmp);
|
||||
|
||||
for (i = 0; i < ctx->commits.nr; i++) {
|
||||
struct commit *c = sorted_commits[i];
|
||||
struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1);
|
||||
ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
|
||||
display_progress(progress, i + 1);
|
||||
}
|
||||
|
||||
free(sorted_commits);
|
||||
stop_progress(&progress);
|
||||
}
|
||||
|
||||
static int add_ref_to_set(const char *refname,
|
||||
const struct object_id *oid,
|
||||
int flags, void *cb_data)
|
||||
@ -1361,12 +1533,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
|
||||
int fd;
|
||||
struct hashfile *f;
|
||||
struct lock_file lk = LOCK_INIT;
|
||||
uint32_t chunk_ids[6];
|
||||
uint64_t chunk_offsets[6];
|
||||
uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
|
||||
uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
|
||||
const unsigned hashsz = the_hash_algo->rawsz;
|
||||
struct strbuf progress_title = STRBUF_INIT;
|
||||
int num_chunks = 3;
|
||||
struct object_id file_hash;
|
||||
const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
|
||||
|
||||
if (ctx->split) {
|
||||
struct strbuf tmp_file = STRBUF_INIT;
|
||||
@ -1411,6 +1584,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
|
||||
chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
|
||||
num_chunks++;
|
||||
}
|
||||
if (ctx->changed_paths) {
|
||||
chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
|
||||
num_chunks++;
|
||||
chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
|
||||
num_chunks++;
|
||||
}
|
||||
if (ctx->num_commit_graphs_after > 1) {
|
||||
chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
|
||||
num_chunks++;
|
||||
@ -1429,6 +1608,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
|
||||
4 * ctx->num_extra_edges;
|
||||
num_chunks++;
|
||||
}
|
||||
if (ctx->changed_paths) {
|
||||
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
|
||||
sizeof(uint32_t) * ctx->commits.nr;
|
||||
num_chunks++;
|
||||
|
||||
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
|
||||
sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
|
||||
num_chunks++;
|
||||
}
|
||||
if (ctx->num_commit_graphs_after > 1) {
|
||||
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
|
||||
hashsz * (ctx->num_commit_graphs_after - 1);
|
||||
@ -1466,6 +1654,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
|
||||
write_graph_chunk_data(f, hashsz, ctx);
|
||||
if (ctx->num_extra_edges)
|
||||
write_graph_chunk_extra_edges(f, ctx);
|
||||
if (ctx->changed_paths) {
|
||||
write_graph_chunk_bloom_indexes(f, ctx);
|
||||
write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
|
||||
}
|
||||
if (ctx->num_commit_graphs_after > 1 &&
|
||||
write_graph_chunk_base(f, ctx)) {
|
||||
return -1;
|
||||
@ -1804,6 +1996,8 @@ int write_commit_graph(struct object_directory *odb,
|
||||
ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
|
||||
ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
|
||||
ctx->split_opts = split_opts;
|
||||
ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
|
||||
ctx->total_bloom_filter_data_size = 0;
|
||||
|
||||
if (ctx->split) {
|
||||
struct commit_graph *g;
|
||||
@ -1856,6 +2050,7 @@ int write_commit_graph(struct object_directory *odb,
|
||||
}
|
||||
|
||||
if (pack_indexes) {
|
||||
ctx->order_by_pack = 1;
|
||||
if ((res = fill_oids_from_packs(ctx, pack_indexes)))
|
||||
goto cleanup;
|
||||
}
|
||||
@ -1865,8 +2060,10 @@ int write_commit_graph(struct object_directory *odb,
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (!pack_indexes && !commits)
|
||||
if (!pack_indexes && !commits) {
|
||||
ctx->order_by_pack = 1;
|
||||
fill_oids_from_all_packs(ctx);
|
||||
}
|
||||
|
||||
close_reachable(ctx);
|
||||
|
||||
@ -1902,6 +2099,9 @@ int write_commit_graph(struct object_directory *odb,
|
||||
|
||||
compute_generation_numbers(ctx);
|
||||
|
||||
if (ctx->changed_paths)
|
||||
compute_bloom_filters(ctx);
|
||||
|
||||
res = write_commit_graph_file(ctx);
|
||||
|
||||
if (ctx->split)
|
||||
@ -2126,6 +2326,7 @@ void free_commit_graph(struct commit_graph *g)
|
||||
g->data = NULL;
|
||||
}
|
||||
free(g->filename);
|
||||
free(g->bloom_filter_settings);
|
||||
free(g);
|
||||
}
|
||||
|
||||
|
@ -10,8 +10,10 @@
|
||||
|
||||
#define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
|
||||
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
|
||||
#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
|
||||
|
||||
struct commit;
|
||||
struct bloom_filter_settings;
|
||||
|
||||
char *get_commit_graph_filename(struct object_directory *odb);
|
||||
int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
|
||||
@ -58,6 +60,10 @@ struct commit_graph {
|
||||
const unsigned char *chunk_commit_data;
|
||||
const unsigned char *chunk_extra_edges;
|
||||
const unsigned char *chunk_base_graphs;
|
||||
const unsigned char *chunk_bloom_indexes;
|
||||
const unsigned char *chunk_bloom_data;
|
||||
|
||||
struct bloom_filter_settings *bloom_filter_settings;
|
||||
};
|
||||
|
||||
struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st,
|
||||
@ -77,7 +83,8 @@ enum commit_graph_write_flags {
|
||||
COMMIT_GRAPH_WRITE_PROGRESS = (1 << 1),
|
||||
COMMIT_GRAPH_WRITE_SPLIT = (1 << 2),
|
||||
/* Make sure that each OID in the input is a valid commit OID. */
|
||||
COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3)
|
||||
COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3),
|
||||
COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4),
|
||||
};
|
||||
|
||||
enum commit_graph_split_flags {
|
||||
|
5
diff.h
5
diff.h
@ -285,6 +285,11 @@ struct diff_options {
|
||||
/* Number of hexdigits to abbreviate raw format output to. */
|
||||
int abbrev;
|
||||
|
||||
/* If non-zero, then stop computing after this many changes. */
|
||||
int max_changes;
|
||||
/* For internal use only. */
|
||||
int num_changes;
|
||||
|
||||
int ita_invisible_in_index;
|
||||
/* white-space error highlighting */
|
||||
#define WSEH_NEW (1<<12)
|
||||
|
126
revision.c
126
revision.c
@ -29,6 +29,8 @@
|
||||
#include "prio-queue.h"
|
||||
#include "hashmap.h"
|
||||
#include "utf8.h"
|
||||
#include "bloom.h"
|
||||
#include "json-writer.h"
|
||||
|
||||
volatile show_early_output_fn_t show_early_output;
|
||||
|
||||
@ -624,11 +626,116 @@ static void file_change(struct diff_options *options,
|
||||
options->flags.has_changes = 1;
|
||||
}
|
||||
|
||||
static int bloom_filter_atexit_registered;
|
||||
static unsigned int count_bloom_filter_maybe;
|
||||
static unsigned int count_bloom_filter_definitely_not;
|
||||
static unsigned int count_bloom_filter_false_positive;
|
||||
static unsigned int count_bloom_filter_not_present;
|
||||
static unsigned int count_bloom_filter_length_zero;
|
||||
|
||||
static void trace2_bloom_filter_statistics_atexit(void)
|
||||
{
|
||||
struct json_writer jw = JSON_WRITER_INIT;
|
||||
|
||||
jw_object_begin(&jw, 0);
|
||||
jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present);
|
||||
jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero);
|
||||
jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe);
|
||||
jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not);
|
||||
jw_object_intmax(&jw, "false_positive", count_bloom_filter_false_positive);
|
||||
jw_end(&jw);
|
||||
|
||||
trace2_data_json("bloom", the_repository, "statistics", &jw);
|
||||
|
||||
jw_release(&jw);
|
||||
}
|
||||
|
||||
static void prepare_to_use_bloom_filter(struct rev_info *revs)
|
||||
{
|
||||
struct pathspec_item *pi;
|
||||
char *path_alloc = NULL;
|
||||
const char *path;
|
||||
int last_index;
|
||||
int len;
|
||||
|
||||
if (!revs->commits)
|
||||
return;
|
||||
|
||||
repo_parse_commit(revs->repo, revs->commits->item);
|
||||
|
||||
if (!revs->repo->objects->commit_graph)
|
||||
return;
|
||||
|
||||
revs->bloom_filter_settings = revs->repo->objects->commit_graph->bloom_filter_settings;
|
||||
if (!revs->bloom_filter_settings)
|
||||
return;
|
||||
|
||||
pi = &revs->pruning.pathspec.items[0];
|
||||
last_index = pi->len - 1;
|
||||
|
||||
/* remove single trailing slash from path, if needed */
|
||||
if (pi->match[last_index] == '/') {
|
||||
path_alloc = xstrdup(pi->match);
|
||||
path_alloc[last_index] = '\0';
|
||||
path = path_alloc;
|
||||
} else
|
||||
path = pi->match;
|
||||
|
||||
len = strlen(path);
|
||||
|
||||
revs->bloom_key = xmalloc(sizeof(struct bloom_key));
|
||||
fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
|
||||
|
||||
if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
|
||||
atexit(trace2_bloom_filter_statistics_atexit);
|
||||
bloom_filter_atexit_registered = 1;
|
||||
}
|
||||
|
||||
free(path_alloc);
|
||||
}
|
||||
|
||||
static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
|
||||
struct commit *commit)
|
||||
{
|
||||
struct bloom_filter *filter;
|
||||
int result;
|
||||
|
||||
if (!revs->repo->objects->commit_graph)
|
||||
return -1;
|
||||
|
||||
if (commit->generation == GENERATION_NUMBER_INFINITY)
|
||||
return -1;
|
||||
|
||||
filter = get_bloom_filter(revs->repo, commit, 0);
|
||||
|
||||
if (!filter) {
|
||||
count_bloom_filter_not_present++;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!filter->len) {
|
||||
count_bloom_filter_length_zero++;
|
||||
return -1;
|
||||
}
|
||||
|
||||
result = bloom_filter_contains(filter,
|
||||
revs->bloom_key,
|
||||
revs->bloom_filter_settings);
|
||||
|
||||
if (result)
|
||||
count_bloom_filter_maybe++;
|
||||
else
|
||||
count_bloom_filter_definitely_not++;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int rev_compare_tree(struct rev_info *revs,
|
||||
struct commit *parent, struct commit *commit)
|
||||
struct commit *parent, struct commit *commit, int nth_parent)
|
||||
{
|
||||
struct tree *t1 = get_commit_tree(parent);
|
||||
struct tree *t2 = get_commit_tree(commit);
|
||||
int bloom_ret = 1;
|
||||
|
||||
if (!t1)
|
||||
return REV_TREE_NEW;
|
||||
@ -653,11 +760,23 @@ static int rev_compare_tree(struct rev_info *revs,
|
||||
return REV_TREE_SAME;
|
||||
}
|
||||
|
||||
if (revs->bloom_key && !nth_parent) {
|
||||
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
|
||||
|
||||
if (bloom_ret == 0)
|
||||
return REV_TREE_SAME;
|
||||
}
|
||||
|
||||
tree_difference = REV_TREE_SAME;
|
||||
revs->pruning.flags.has_changes = 0;
|
||||
if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
|
||||
&revs->pruning) < 0)
|
||||
return REV_TREE_DIFFERENT;
|
||||
|
||||
if (!nth_parent)
|
||||
if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
|
||||
count_bloom_filter_false_positive++;
|
||||
|
||||
return tree_difference;
|
||||
}
|
||||
|
||||
@ -855,7 +974,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
|
||||
die("cannot simplify commit %s (because of %s)",
|
||||
oid_to_hex(&commit->object.oid),
|
||||
oid_to_hex(&p->object.oid));
|
||||
switch (rev_compare_tree(revs, p, commit)) {
|
||||
switch (rev_compare_tree(revs, p, commit, nth_parent)) {
|
||||
case REV_TREE_SAME:
|
||||
if (!revs->simplify_history || !relevant_commit(p)) {
|
||||
/* Even if a merge with an uninteresting
|
||||
@ -3385,6 +3504,8 @@ int prepare_revision_walk(struct rev_info *revs)
|
||||
FOR_EACH_OBJECT_PROMISOR_ONLY);
|
||||
}
|
||||
|
||||
if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info)
|
||||
prepare_to_use_bloom_filter(revs);
|
||||
if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED)
|
||||
commit_list_sort_by_date(&revs->commits);
|
||||
if (revs->no_walk)
|
||||
@ -3402,6 +3523,7 @@ int prepare_revision_walk(struct rev_info *revs)
|
||||
simplify_merges(revs);
|
||||
if (revs->children.name)
|
||||
set_children(revs);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
11
revision.h
11
revision.h
@ -59,6 +59,8 @@ struct repository;
|
||||
struct rev_info;
|
||||
struct string_list;
|
||||
struct saved_parents;
|
||||
struct bloom_key;
|
||||
struct bloom_filter_settings;
|
||||
define_shared_commit_slab(revision_sources, char *);
|
||||
|
||||
struct rev_cmdline_info {
|
||||
@ -296,6 +298,15 @@ struct rev_info {
|
||||
struct revision_sources *sources;
|
||||
|
||||
struct topo_walk_info *topo_walk_info;
|
||||
|
||||
/* Commit graph bloom filter fields */
|
||||
/* The bloom filter key for the pathspec */
|
||||
struct bloom_key *bloom_key;
|
||||
/*
|
||||
* The bloom filter settings used to generate the key.
|
||||
* This is loaded from the commit-graph being used.
|
||||
*/
|
||||
struct bloom_filter_settings *bloom_filter_settings;
|
||||
};
|
||||
|
||||
int ref_excluded(struct string_list *, const char *path);
|
||||
|
5
t/README
5
t/README
@ -379,6 +379,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
|
||||
be written after every 'git commit' command, and overrides the
|
||||
'core.commitGraph' setting to true.
|
||||
|
||||
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=<boolean>, when true, forces
|
||||
commit-graph write to compute and write changed path Bloom filters for
|
||||
every 'git commit-graph write', as if the `--changed-paths` option was
|
||||
passed in.
|
||||
|
||||
GIT_TEST_FSMONITOR=$PWD/t7519/fsmonitor-all exercises the fsmonitor
|
||||
code path for utilizing a file system monitor to speed up detecting
|
||||
new or changed files.
|
||||
|
81
t/helper/test-bloom.c
Normal file
81
t/helper/test-bloom.c
Normal file
@ -0,0 +1,81 @@
|
||||
#include "git-compat-util.h"
|
||||
#include "bloom.h"
|
||||
#include "test-tool.h"
|
||||
#include "commit.h"
|
||||
|
||||
struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
|
||||
|
||||
static void add_string_to_filter(const char *data, struct bloom_filter *filter) {
|
||||
struct bloom_key key;
|
||||
int i;
|
||||
|
||||
fill_bloom_key(data, strlen(data), &key, &settings);
|
||||
printf("Hashes:");
|
||||
for (i = 0; i < settings.num_hashes; i++){
|
||||
printf("0x%08x|", key.hashes[i]);
|
||||
}
|
||||
printf("\n");
|
||||
add_key_to_filter(&key, filter, &settings);
|
||||
}
|
||||
|
||||
static void print_bloom_filter(struct bloom_filter *filter) {
|
||||
int i;
|
||||
|
||||
if (!filter) {
|
||||
printf("No filter.\n");
|
||||
return;
|
||||
}
|
||||
printf("Filter_Length:%d\n", (int)filter->len);
|
||||
printf("Filter_Data:");
|
||||
for (i = 0; i < filter->len; i++){
|
||||
printf("%02x|", filter->data[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
|
||||
{
|
||||
struct commit *c;
|
||||
struct bloom_filter *filter;
|
||||
setup_git_directory();
|
||||
c = lookup_commit(the_repository, commit_oid);
|
||||
filter = get_bloom_filter(the_repository, c, 1);
|
||||
print_bloom_filter(filter);
|
||||
}
|
||||
|
||||
int cmd__bloom(int argc, const char **argv)
|
||||
{
|
||||
if (!strcmp(argv[1], "get_murmur3")) {
|
||||
uint32_t hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
|
||||
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
|
||||
}
|
||||
|
||||
if (!strcmp(argv[1], "generate_filter")) {
|
||||
struct bloom_filter filter;
|
||||
int i = 2;
|
||||
filter.len = (settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
|
||||
filter.data = xcalloc(filter.len, sizeof(unsigned char));
|
||||
|
||||
if (!argv[2]){
|
||||
die("at least one input string expected");
|
||||
}
|
||||
|
||||
while (argv[i]) {
|
||||
add_string_to_filter(argv[i], &filter);
|
||||
i++;
|
||||
}
|
||||
|
||||
print_bloom_filter(&filter);
|
||||
}
|
||||
|
||||
if (!strcmp(argv[1], "get_filter_for_commit")) {
|
||||
struct object_id oid;
|
||||
const char *end;
|
||||
if (parse_oid_hex(argv[2], &oid, &end))
|
||||
die("cannot parse oid '%s'", argv[2]);
|
||||
init_bloom_filters();
|
||||
get_bloom_filter_for_commit(&oid);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -34,6 +34,10 @@ int cmd__read_graph(int argc, const char **argv)
|
||||
printf(" commit_metadata");
|
||||
if (graph->chunk_extra_edges)
|
||||
printf(" extra_edges");
|
||||
if (graph->chunk_bloom_indexes)
|
||||
printf(" bloom_indexes");
|
||||
if (graph->chunk_bloom_data)
|
||||
printf(" bloom_data");
|
||||
printf("\n");
|
||||
|
||||
UNLEAK(graph);
|
||||
|
@ -15,6 +15,7 @@ struct test_cmd {
|
||||
|
||||
static struct test_cmd cmds[] = {
|
||||
{ "advise", cmd__advise_if_enabled },
|
||||
{ "bloom", cmd__bloom },
|
||||
{ "chmtime", cmd__chmtime },
|
||||
{ "config", cmd__config },
|
||||
{ "ctype", cmd__ctype },
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "git-compat-util.h"
|
||||
|
||||
int cmd__advise_if_enabled(int argc, const char **argv);
|
||||
int cmd__bloom(int argc, const char **argv);
|
||||
int cmd__chmtime(int argc, const char **argv);
|
||||
int cmd__config(int argc, const char **argv);
|
||||
int cmd__ctype(int argc, const char **argv);
|
||||
|
117
t/t0095-bloom.sh
Executable file
117
t/t0095-bloom.sh
Executable file
@ -0,0 +1,117 @@
|
||||
#!/bin/sh
|
||||
|
||||
test_description='Testing the various Bloom filter computations in bloom.c'
|
||||
. ./test-lib.sh
|
||||
|
||||
test_expect_success 'compute unseeded murmur3 hash for empty string' '
|
||||
cat >expect <<-\EOF &&
|
||||
Murmur3 Hash with seed=0:0x00000000
|
||||
EOF
|
||||
test-tool bloom get_murmur3 "" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'compute unseeded murmur3 hash for test string 1' '
|
||||
cat >expect <<-\EOF &&
|
||||
Murmur3 Hash with seed=0:0x627b0c2c
|
||||
EOF
|
||||
test-tool bloom get_murmur3 "Hello world!" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'compute unseeded murmur3 hash for test string 2' '
|
||||
cat >expect <<-\EOF &&
|
||||
Murmur3 Hash with seed=0:0x2e4ff723
|
||||
EOF
|
||||
test-tool bloom get_murmur3 "The quick brown fox jumps over the lazy dog" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'compute bloom key for empty string' '
|
||||
cat >expect <<-\EOF &&
|
||||
Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004|
|
||||
Filter_Length:2
|
||||
Filter_Data:11|11|
|
||||
EOF
|
||||
test-tool bloom generate_filter "" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'compute bloom key for whitespace' '
|
||||
cat >expect <<-\EOF &&
|
||||
Hashes:0xf178874c|0x5f3d6eb6|0xcd025620|0x3ac73d8a|0xa88c24f4|0x16510c5e|0x8415f3c8|
|
||||
Filter_Length:2
|
||||
Filter_Data:51|55|
|
||||
EOF
|
||||
test-tool bloom generate_filter " " >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'compute bloom key for test string 1' '
|
||||
cat >expect <<-\EOF &&
|
||||
Hashes:0xb270de9b|0x1bb6f26e|0x84fd0641|0xee431a14|0x57892de7|0xc0cf41ba|0x2a15558d|
|
||||
Filter_Length:2
|
||||
Filter_Data:92|6c|
|
||||
EOF
|
||||
test-tool bloom generate_filter "Hello world!" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'compute bloom key for test string 2' '
|
||||
cat >expect <<-\EOF &&
|
||||
Hashes:0x20ab385b|0xf5237fe2|0xc99bc769|0x9e140ef0|0x728c5677|0x47049dfe|0x1b7ce585|
|
||||
Filter_Length:2
|
||||
Filter_Data:a5|4a|
|
||||
EOF
|
||||
test-tool bloom generate_filter "file.txt" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
test_expect_success 'get bloom filters for commit with no changes' '
|
||||
git init &&
|
||||
git commit --allow-empty -m "c0" &&
|
||||
cat >expect <<-\EOF &&
|
||||
Filter_Length:0
|
||||
Filter_Data:
|
||||
EOF
|
||||
test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
|
||||
test_cmp expect actual
|
||||
'
|
||||
|
||||
# Ten new files plus their directory: expect a 25-byte filter.
# The "|| return 1" makes a failed write inside the loop fail the test;
# without it only the status of the last iteration is seen by "&&".
test_expect_success 'get bloom filter for commit with 10 changes' '
	rm actual &&
	rm expect &&
	mkdir smallDir &&
	for i in $(test_seq 0 9)
	do
		echo $i >smallDir/$i || return 1
	done &&
	git add smallDir &&
	git commit -m "commit with 10 changes" &&
	cat >expect <<-\EOF &&
	Filter_Length:25
	Filter_Data:82|a0|65|47|0c|92|90|c0|a1|40|02|a0|e2|40|e0|04|0a|9a|66|cf|80|19|85|42|23|
	EOF
	test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
	test_cmp expect actual
'
|
||||
|
||||
# A commit touching 513 files is expected to yield a zero-length filter.
# NOTE(review): presumably 513 paths exceeds the changed-path limit used
# when computing filters -- confirm against bloom.c.
# The "|| return 1" makes a failed write inside the loop fail the test;
# without it only the status of the last iteration is seen by "&&".
test_expect_success EXPENSIVE 'get bloom filter for commit with 513 changes' '
	rm actual &&
	rm expect &&
	mkdir bigDir &&
	for i in $(test_seq 0 512)
	do
		echo $i >bigDir/$i || return 1
	done &&
	git add bigDir &&
	git commit -m "commit with 513 changes" &&
	cat >expect <<-\EOF &&
	Filter_Length:0
	Filter_Data:
	EOF
	test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
	test_cmp expect actual
'
|
||||
|
||||
test_done
|
155
t/t4216-log-bloom.sh
Executable file
155
t/t4216-log-bloom.sh
Executable file
@ -0,0 +1,155 @@
|
||||
#!/bin/sh
|
||||
|
||||
test_description='git log for a path with Bloom filters'
|
||||
. ./test-lib.sh
|
||||
|
||||
GIT_TEST_COMMIT_GRAPH=0
|
||||
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
|
||||
|
||||
test_expect_success 'setup test - repo, commits, commit graph, log outputs' '
|
||||
git init &&
|
||||
mkdir A A/B A/B/C &&
|
||||
test_commit c1 A/file1 &&
|
||||
test_commit c2 A/B/file2 &&
|
||||
test_commit c3 A/B/C/file3 &&
|
||||
test_commit c4 A/file1 &&
|
||||
test_commit c5 A/B/file2 &&
|
||||
test_commit c6 A/B/C/file3 &&
|
||||
test_commit c7 A/file1 &&
|
||||
test_commit c8 A/B/file2 &&
|
||||
test_commit c9 A/B/C/file3 &&
|
||||
test_commit c10 file_to_be_deleted &&
|
||||
git checkout -b side HEAD~4 &&
|
||||
test_commit side-1 file4 &&
|
||||
git checkout master &&
|
||||
git merge side &&
|
||||
test_commit c11 file5 &&
|
||||
mv file5 file5_renamed &&
|
||||
git add file5_renamed &&
|
||||
git commit -m "rename" &&
|
||||
rm file_to_be_deleted &&
|
||||
git add . &&
|
||||
git commit -m "file removed" &&
|
||||
git commit-graph write --reachable --changed-paths
|
||||
'
|
||||
# Usage: graph_read_expect <num_commits>
# Writes the expected "test-tool read-graph" output for a commit-graph
# with <num_commits> commits and the five chunks listed below to
# "expect", then compares it with the actual output.
graph_read_expect () {
	NUM_CHUNKS=5
	# Joined with "&&" so a failure to write "expect" is not ignored.
	cat >expect <<-EOF &&
	header: 43475048 1 1 $NUM_CHUNKS 0
	num_commits: $1
	chunks: oid_fanout oid_lookup commit_metadata bloom_indexes bloom_data
	EOF
	test-tool read-graph >actual &&
	test_cmp expect actual
}
|
||||
|
||||
test_expect_success 'commit-graph write wrote out the bloom chunks' '
|
||||
graph_read_expect 15
|
||||
'
|
||||
|
||||
# Turn off any inherited trace2 settings for this test.
|
||||
sane_unset GIT_TRACE2 GIT_TRACE2_PERF GIT_TRACE2_EVENT
|
||||
sane_unset GIT_TRACE2_PERF_BRIEF
|
||||
sane_unset GIT_TRACE2_CONFIG_PARAMS
|
||||
|
||||
# Usage: setup "<git log arguments>"
# Runs "git log" twice with the given arguments: once with the
# commit-graph disabled (output in log_wo_bloom) and once with it enabled
# while writing a trace2 perf log to $TRASH_DIRECTORY/trace.perf (output
# in log_w_bloom).
setup () {
	# "-f": the trace does not exist before the first run, and a stale
	# one must never survive into this run.
	rm -f "$TRASH_DIRECTORY/trace.perf" &&
	# $1 is left unquoted on purpose so it splits into separate arguments.
	git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom &&
	GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom
}
|
||||
|
||||
# Usage: test_bloom_filters_used "<git log arguments>"
# Succeeds when the traced "git log" run reported Bloom filter statistics
# with no missing and no zero-length filters, and its output matches the
# run made without the commit-graph.
test_bloom_filters_used () {
	log_args=$1
	bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"zero_length_filter\":0,\"maybe\""
	setup "$log_args" &&
	# Check for the trace before grepping it, so a missing file is
	# reported as such rather than as a failed match.
	test_path_is_file "$TRASH_DIRECTORY/trace.perf" &&
	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
	test_cmp log_wo_bloom log_w_bloom
}
|
||||
|
||||
# Usage: test_bloom_filters_not_used "<git log arguments>"
# Succeeds when the traced "git log" run emitted no Bloom filter
# statistics and its output matches the run made without the
# commit-graph.
test_bloom_filters_not_used () {
	log_args=$1
	setup "$log_args" || return
	if grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf"
	then
		return 1
	fi
	test_cmp log_wo_bloom log_w_bloom
}
|
||||
|
||||
for path in A A/B A/B/C A/file1 A/B/file2 A/B/C/file3 file4 file5 file5_renamed file_to_be_deleted
|
||||
do
|
||||
for option in "" \
|
||||
"--all" \
|
||||
"--full-history" \
|
||||
"--full-history --simplify-merges" \
|
||||
"--simplify-merges" \
|
||||
"--simplify-by-decoration" \
|
||||
"--follow" \
|
||||
"--first-parent" \
|
||||
"--topo-order" \
|
||||
"--date-order" \
|
||||
"--author-date-order" \
|
||||
"--ancestry-path side..master"
|
||||
do
|
||||
test_expect_success "git log option: $option for path: $path" '
|
||||
test_bloom_filters_used "$option -- $path"
|
||||
'
|
||||
done
|
||||
done
|
||||
|
||||
test_expect_success 'git log -- folder works with and without the trailing slash' '
|
||||
test_bloom_filters_used "-- A" &&
|
||||
test_bloom_filters_used "-- A/"
|
||||
'
|
||||
|
||||
test_expect_success 'git log for path that does not exist. ' '
|
||||
test_bloom_filters_used "-- path_does_not_exist"
|
||||
'
|
||||
|
||||
test_expect_success 'git log with --walk-reflogs does not use Bloom filters' '
|
||||
test_bloom_filters_not_used "--walk-reflogs -- A"
|
||||
'
|
||||
|
||||
test_expect_success 'git log -- multiple path specs does not use Bloom filters' '
|
||||
test_bloom_filters_not_used "-- file4 A/file1"
|
||||
'
|
||||
|
||||
test_expect_success 'git log with wildcard that resolves to a single path uses Bloom filters' '
|
||||
test_bloom_filters_used "-- *4" &&
|
||||
test_bloom_filters_used "-- *renamed"
|
||||
'
|
||||
|
||||
test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' '
|
||||
test_bloom_filters_not_used "-- *" &&
|
||||
test_bloom_filters_not_used "-- file*"
|
||||
'
|
||||
|
||||
test_expect_success 'setup - add commit-graph to the chain without Bloom filters' '
|
||||
test_commit c14 A/anotherFile2 &&
|
||||
test_commit c15 A/B/anotherFile2 &&
|
||||
test_commit c16 A/B/C/anotherFile2 &&
|
||||
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split &&
|
||||
test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain
|
||||
'
|
||||
|
||||
test_expect_success 'Do not use Bloom filters if the latest graph does not have Bloom filters.' '
|
||||
test_bloom_filters_not_used "-- A/B"
|
||||
'
|
||||
|
||||
test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
|
||||
test_commit c17 A/anotherFile3 &&
|
||||
git commit-graph write --reachable --changed-paths --split &&
|
||||
test_line_count = 3 .git/objects/info/commit-graphs/commit-graph-chain
|
||||
'
|
||||
|
||||
# Usage: test_bloom_filters_used_when_some_filters_are_missing "<git log arguments>"
# Succeeds when the traced "git log" run reported exactly the statistics
# prefix below (3 filters not present, 8 "maybe", 6 "definitely_not") and
# its output matches the run made without the commit-graph.
test_bloom_filters_used_when_some_filters_are_missing () {
	log_args=$1
	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
	setup "$log_args" || return
	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" || return
	test_cmp log_wo_bloom log_w_bloom
}
|
||||
|
||||
test_expect_success 'Use Bloom filters if they exist in the latest but not all commit graphs in the chain.' '
|
||||
test_bloom_filters_used_when_some_filters_are_missing "-- A/B"
|
||||
'
|
||||
|
||||
test_done
|
@ -3,6 +3,8 @@
|
||||
test_description='commit graph'
|
||||
. ./test-lib.sh
|
||||
|
||||
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
|
||||
|
||||
test_expect_success 'setup full repo' '
|
||||
mkdir full &&
|
||||
cd "$TRASH_DIRECTORY/full" &&
|
||||
|
@ -4,6 +4,7 @@ test_description='split commit graph'
|
||||
. ./test-lib.sh
|
||||
|
||||
GIT_TEST_COMMIT_GRAPH=0
|
||||
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
|
||||
|
||||
test_expect_success 'setup repo' '
|
||||
git init &&
|
||||
|
@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
|
||||
if (diff_can_quit_early(opt))
|
||||
break;
|
||||
|
||||
if (opt->max_changes && opt->num_changes > opt->max_changes)
|
||||
break;
|
||||
|
||||
if (opt->pathspec.nr) {
|
||||
skip_uninteresting(&t, base, opt);
|
||||
for (i = 0; i < nparent; i++)
|
||||
@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
|
||||
|
||||
/* t↓ */
|
||||
update_tree_entry(&t);
|
||||
opt->num_changes++;
|
||||
}
|
||||
|
||||
/* t > p[imin] */
|
||||
@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
|
||||
skip_emit_tp:
|
||||
/* ∀ pi=p[imin] pi↓ */
|
||||
update_tp_entries(tp, nparent);
|
||||
opt->num_changes++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths(
|
||||
const struct object_id **parents_oid, int nparent,
|
||||
struct strbuf *base, struct diff_options *opt)
|
||||
{
|
||||
opt->num_changes = 0;
|
||||
p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user