Merge branch 'en/ort-perf-batch-10'

Various rename detection optimization to help "ort" merge strategy
backend.

* en/ort-perf-batch-10:
  diffcore-rename: determine which relevant_sources are no longer relevant
  merge-ort: record the reason that we want a rename for a file
  diffcore-rename: add computation of number of unknown renames
  diffcore-rename: check if we have enough renames for directories early on
  diffcore-rename: only compute dir_rename_count for relevant directories
  merge-ort: record the reason that we want a rename for a directory
  merge-ort, diffcore-rename: tweak dirs_removed and relevant_source type
  diffcore-rename: take advantage of "majority rules" to skip more renames
This commit is contained in:
Junio C Hamano 2021-04-16 13:53:33 -07:00
commit e2e1a03f6b
3 changed files with 281 additions and 47 deletions

View File

@ -371,7 +371,7 @@ struct dir_rename_info {
struct strintmap idx_map;
struct strmap dir_rename_guess;
struct strmap *dir_rename_count;
struct strset *relevant_source_dirs;
struct strintmap *relevant_source_dirs;
unsigned setup;
};
@ -407,6 +407,28 @@ static const char *get_highest_rename_path(struct strintmap *counts)
return highest_destination_dir;
}
static char *UNKNOWN_DIR = "/"; /* placeholder -- short, illegal directory */
static int dir_rename_already_determinable(struct strintmap *counts)
{
struct hashmap_iter iter;
struct strmap_entry *entry;
int first = 0, second = 0, unknown = 0;
strintmap_for_each_entry(counts, &iter, entry) {
const char *destination_dir = entry->key;
intptr_t count = (intptr_t)entry->value;
if (!strcmp(destination_dir, UNKNOWN_DIR)) {
unknown = count;
} else if (count >= first) {
second = first;
first = count;
} else if (count >= second) {
second = count;
}
}
return first > second + unknown;
}
static void increment_count(struct dir_rename_info *info,
char *old_dir,
char *new_dir)
@ -429,7 +451,7 @@ static void increment_count(struct dir_rename_info *info,
}
static void update_dir_rename_counts(struct dir_rename_info *info,
struct strset *dirs_removed,
struct strintmap *dirs_removed,
const char *oldname,
const char *newname)
{
@ -461,10 +483,12 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
return;
while (1) {
int drd_flag = NOT_RELEVANT;
/* Get old_dir, skip if its directory isn't relevant. */
dirname_munge(old_dir);
if (info->relevant_source_dirs &&
!strset_contains(info->relevant_source_dirs, old_dir))
!strintmap_contains(info->relevant_source_dirs, old_dir))
break;
/* Get new_dir */
@ -509,16 +533,31 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
}
}
if (strset_contains(dirs_removed, old_dir))
/*
* Above we suggested that we'd keep recording renames for
* all ancestor directories where the trailing directories
* matched, i.e. for
* "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
* we'd increment rename counts for each of
* a/b/c/d/e/ => a/b/some/thing/else/e/
* a/b/c/d/ => a/b/some/thing/else/
* However, we only need the rename counts for directories
* in dirs_removed whose value is RELEVANT_FOR_SELF.
* However, we add one special case of also recording it for
* first_time_in_loop because find_basename_matches() can
* use that as a hint to find a good pairing.
*/
if (dirs_removed)
drd_flag = strintmap_get(dirs_removed, old_dir);
if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop)
increment_count(info, old_dir, new_dir);
else
break;
first_time_in_loop = 0;
if (drd_flag == NOT_RELEVANT)
break;
/* If we hit toplevel directory ("") for old or new dir, quit */
if (!*old_dir || !*new_dir)
break;
first_time_in_loop = 0;
}
/* Free resources we don't need anymore */
@ -527,8 +566,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
}
static void initialize_dir_rename_info(struct dir_rename_info *info,
struct strset *relevant_sources,
struct strset *dirs_removed,
struct strintmap *relevant_sources,
struct strintmap *dirs_removed,
struct strmap *dir_rename_count)
{
struct hashmap_iter iter;
@ -555,12 +594,13 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
info->relevant_source_dirs = dirs_removed; /* might be NULL */
} else {
info->relevant_source_dirs = xmalloc(sizeof(struct strintmap));
strset_init(info->relevant_source_dirs);
strset_for_each_entry(relevant_sources, &iter, entry) {
strintmap_init(info->relevant_source_dirs, 0 /* unused */);
strintmap_for_each_entry(relevant_sources, &iter, entry) {
char *dirname = get_dirname(entry->key);
if (!dirs_removed ||
strset_contains(dirs_removed, dirname))
strset_add(info->relevant_source_dirs, dirname);
strintmap_contains(dirs_removed, dirname))
strintmap_set(info->relevant_source_dirs,
dirname, 0 /* value irrelevant */);
free(dirname);
}
}
@ -624,7 +664,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
}
static void cleanup_dir_rename_info(struct dir_rename_info *info,
struct strset *dirs_removed,
struct strintmap *dirs_removed,
int keep_dir_rename_count)
{
struct hashmap_iter iter;
@ -644,7 +684,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
/* relevant_source_dirs */
if (info->relevant_source_dirs &&
info->relevant_source_dirs != dirs_removed) {
strset_clear(info->relevant_source_dirs);
strintmap_clear(info->relevant_source_dirs);
FREE_AND_NULL(info->relevant_source_dirs);
}
@ -659,18 +699,22 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
/*
* Although dir_rename_count was passed in
* diffcore_rename_extended() and we want to keep it around and
* return it to that caller, we first want to remove any data
* return it to that caller, we first want to remove any counts in
* the maps associated with UNKNOWN_DIR entries and any data
* associated with directories that weren't renamed.
*/
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
const char *source_dir = entry->key;
struct strintmap *counts = entry->value;
if (!strset_contains(dirs_removed, source_dir)) {
if (!strintmap_get(dirs_removed, source_dir)) {
string_list_append(&to_remove, source_dir);
strintmap_clear(counts);
continue;
}
if (strintmap_contains(counts, UNKNOWN_DIR))
strintmap_remove(counts, UNKNOWN_DIR);
}
for (i = 0; i < to_remove.nr; ++i)
strmap_remove(info->dir_rename_count,
@ -770,8 +814,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info)
static int find_basename_matches(struct diff_options *options,
int minimum_score,
struct dir_rename_info *info,
struct strset *relevant_sources,
struct strset *dirs_removed)
struct strintmap *relevant_sources,
struct strintmap *dirs_removed)
{
/*
* When I checked in early 2020, over 76% of file renames in linux
@ -863,7 +907,7 @@ static int find_basename_matches(struct diff_options *options,
/* Skip irrelevant sources */
if (relevant_sources &&
!strset_contains(relevant_sources, filename))
!strintmap_contains(relevant_sources, filename))
continue;
/*
@ -994,7 +1038,7 @@ static int find_renames(struct diff_score *mx,
int minimum_score,
int copies,
struct dir_rename_info *info,
struct strset *dirs_removed)
struct strintmap *dirs_removed)
{
int count = 0, i;
@ -1019,7 +1063,7 @@ static int find_renames(struct diff_score *mx,
}
static void remove_unneeded_paths_from_src(int detecting_copies,
struct strset *interesting)
struct strintmap *interesting)
{
int i, new_num_src;
@ -1061,7 +1105,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
continue;
/* If we don't care about the source path, skip it */
if (interesting && !strset_contains(interesting, one->path))
if (interesting && !strintmap_contains(interesting, one->path))
continue;
if (new_num_src < i)
@ -1073,9 +1117,136 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
rename_src_nr = new_num_src;
}
static void handle_early_known_dir_renames(struct dir_rename_info *info,
struct strintmap *relevant_sources,
struct strintmap *dirs_removed)
{
/*
* Directory renames are determined via an aggregate of all renames
* under them and using a "majority wins" rule. The fact that
* "majority wins", though, means we don't need all the renames
* under the given directory, we only need enough to ensure we have
* a majority.
*/
int i, new_num_src;
struct hashmap_iter iter;
struct strmap_entry *entry;
if (!dirs_removed || !relevant_sources)
return; /* nothing to cull */
if (break_idx)
return; /* culling incompatbile with break detection */
/*
* Supplement dir_rename_count with number of potential renames,
* marking all potential rename sources as mapping to UNKNOWN_DIR.
*/
for (i = 0; i < rename_src_nr; i++) {
char *old_dir;
struct diff_filespec *one = rename_src[i].p->one;
/*
* sources that are part of a rename will have already been
* removed by a prior call to remove_unneeded_paths_from_src()
*/
assert(!one->rename_used);
old_dir = get_dirname(one->path);
while (*old_dir != '\0' &&
NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) {
char *freeme = old_dir;
increment_count(info, old_dir, UNKNOWN_DIR);
old_dir = get_dirname(old_dir);
/* Free resources we don't need anymore */
free(freeme);
}
/*
* old_dir and new_dir free'd in increment_count, but
* get_dirname() gives us a new pointer we need to free for
* old_dir. Also, if the loop runs 0 times we need old_dir
* to be freed.
*/
free(old_dir);
}
/*
* For any directory which we need a potential rename detected for
* (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check
* whether we have enough renames to satisfy the "majority rules"
* requirement such that detecting any more renames of files under
* it won't change the result. For any such directory, mark that
* we no longer need to detect a rename for it. However, since we
* might need to still detect renames for an ancestor of that
* directory, use RELEVANT_FOR_ANCESTOR.
*/
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
/* entry->key is source_dir */
struct strintmap *counts = entry->value;
if (strintmap_get(dirs_removed, entry->key) ==
RELEVANT_FOR_SELF &&
dir_rename_already_determinable(counts)) {
strintmap_set(dirs_removed, entry->key,
RELEVANT_FOR_ANCESTOR);
}
}
for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
struct diff_filespec *one = rename_src[i].p->one;
int val;
val = strintmap_get(relevant_sources, one->path);
/*
* sources that were not found in relevant_sources should
* have already been removed by a prior call to
* remove_unneeded_paths_from_src()
*/
assert(val != -1);
if (val == RELEVANT_LOCATION) {
int removable = 1;
char *dir = get_dirname(one->path);
while (1) {
char *freeme = dir;
int res = strintmap_get(dirs_removed, dir);
/* Quit if not found or irrelevant */
if (res == NOT_RELEVANT)
break;
/* If RELEVANT_FOR_SELF, can't remove */
if (res == RELEVANT_FOR_SELF) {
removable = 0;
break;
}
/* Else continue searching upwards */
assert(res == RELEVANT_FOR_ANCESTOR);
dir = get_dirname(dir);
free(freeme);
}
free(dir);
if (removable) {
strintmap_set(relevant_sources, one->path,
RELEVANT_NO_MORE);
continue;
}
}
if (new_num_src < i)
memcpy(&rename_src[new_num_src], &rename_src[i],
sizeof(struct diff_rename_src));
new_num_src++;
}
rename_src_nr = new_num_src;
}
void diffcore_rename_extended(struct diff_options *options,
struct strset *relevant_sources,
struct strset *dirs_removed,
struct strintmap *relevant_sources,
struct strintmap *dirs_removed,
struct strmap *dir_rename_count)
{
int detect_rename = options->detect_rename;
@ -1208,9 +1379,16 @@ void diffcore_rename_extended(struct diff_options *options,
* Cull sources, again:
* - remove ones involved in renames (found via basenames)
* - remove ones not found in relevant_sources
* and
* - remove ones in relevant_sources which are needed only
* for directory renames IF no ancestory directory
* actually needs to know any more individual path
* renames under them
*/
trace2_region_enter("diff", "cull basename", options->repo);
remove_unneeded_paths_from_src(want_copies, relevant_sources);
handle_early_known_dir_renames(&info, relevant_sources,
dirs_removed);
trace2_region_leave("diff", "cull basename", options->repo);
}

View File

@ -8,8 +8,8 @@
struct diff_options;
struct repository;
struct strintmap;
struct strmap;
struct strset;
struct userdiff_driver;
/* This header file is internal between diff.c and its diff transformers
@ -161,13 +161,26 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *,
struct diff_filespec *);
void diff_q(struct diff_queue_struct *, struct diff_filepair *);
/* dir_rename_relevance: the reason we want rename information for a dir */
enum dir_rename_relevance {
NOT_RELEVANT = 0,
RELEVANT_FOR_ANCESTOR = 1,
RELEVANT_FOR_SELF = 2
};
/* file_rename_relevance: the reason(s) we want rename information for a file */
enum file_rename_relevance {
RELEVANT_NO_MORE = 0, /* i.e. NOT relevant */
RELEVANT_CONTENT = 1,
RELEVANT_LOCATION = 2
};
void partial_clear_dir_rename_count(struct strmap *dir_rename_count);
void diffcore_break(struct repository *, int);
void diffcore_rename(struct diff_options *);
void diffcore_rename_extended(struct diff_options *options,
struct strset *relevant_sources,
struct strset *dirs_removed,
struct strintmap *relevant_sources,
struct strintmap *dirs_removed,
struct strmap *dir_rename_count);
void diffcore_merge_broken(void);
void diffcore_pickaxe(struct diff_options *);

View File

@ -73,8 +73,12 @@ struct rename_info {
/*
* dirs_removed: directories removed on a given side of history.
*
* The keys of dirs_removed[side] are the directories that were removed
* on the given side of history. The value of the strintmap for each
* directory is a value from enum dir_rename_relevance.
*/
struct strset dirs_removed[3];
struct strintmap dirs_removed[3];
/*
* dir_rename_count: tracking where parts of a directory were renamed to
@ -95,18 +99,20 @@ struct rename_info {
struct strmap dir_renames[3];
/*
* relevant_sources: deleted paths for which we need rename detection
* relevant_sources: deleted paths wanted in rename detection, and why
*
* relevant_sources is a set of deleted paths on each side of
* history for which we need rename detection. If a path is deleted
* on one side of history, we need to detect if it is part of a
* rename if either
* * we need to detect renames for an ancestor directory
* * the file is modified/deleted on the other side of history
* * we need to detect renames for an ancestor directory
* If neither of those are true, we can skip rename detection for
* that path.
* that path. The reason is stored as a value from enum
* file_rename_relevance, as the reason can inform the algorithm in
* diffcore_rename_extended().
*/
struct strset relevant_sources[3];
struct strintmap relevant_sources[3];
/*
* dir_rename_mask:
@ -362,8 +368,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
int i;
void (*strmap_func)(struct strmap *, int) =
reinitialize ? strmap_partial_clear : strmap_clear;
void (*strset_func)(struct strset *) =
reinitialize ? strset_partial_clear : strset_clear;
void (*strintmap_func)(struct strintmap *) =
reinitialize ? strintmap_partial_clear : strintmap_clear;
/*
* We marked opti->paths with strdup_strings = 0, so that we
@ -395,7 +401,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
/* Free memory used by various renames maps */
for (i = MERGE_SIDE1; i <= MERGE_SIDE2; ++i) {
strset_func(&renames->dirs_removed[i]);
strintmap_func(&renames->dirs_removed[i]);
partial_clear_dir_rename_count(&renames->dir_rename_count[i]);
if (!reinitialize)
@ -403,7 +409,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
strmap_func(&renames->dir_renames[i], 0);
strset_func(&renames->relevant_sources[i]);
strintmap_func(&renames->relevant_sources[i]);
}
if (!reinitialize) {
@ -673,8 +679,11 @@ static void add_pair(struct merge_options *opt,
unsigned content_relevant = (match_mask == 0);
unsigned location_relevant = (dir_rename_mask == 0x07);
if (content_relevant || location_relevant)
strset_add(&renames->relevant_sources[side], pathname);
if (content_relevant || location_relevant) {
/* content_relevant trumps location_relevant */
strintmap_set(&renames->relevant_sources[side], pathname,
content_relevant ? RELEVANT_CONTENT : RELEVANT_LOCATION);
}
}
one = alloc_filespec(pathname);
@ -729,10 +738,41 @@ static void collect_rename_info(struct merge_options *opt,
if (dirmask == 1 || dirmask == 3 || dirmask == 5) {
/* absent_mask = 0x07 - dirmask; sides = absent_mask/2 */
unsigned sides = (0x07 - dirmask)/2;
unsigned relevance = (renames->dir_rename_mask == 0x07) ?
RELEVANT_FOR_ANCESTOR : NOT_RELEVANT;
/*
* Record relevance of this directory. However, note that
* when collect_merge_info_callback() recurses into this
* directory and calls collect_rename_info() on paths
* within that directory, if we find a path that was added
* to this directory on the other side of history, we will
* upgrade this value to RELEVANT_FOR_SELF; see below.
*/
if (sides & 1)
strset_add(&renames->dirs_removed[1], fullname);
strintmap_set(&renames->dirs_removed[1], fullname,
relevance);
if (sides & 2)
strset_add(&renames->dirs_removed[2], fullname);
strintmap_set(&renames->dirs_removed[2], fullname,
relevance);
}
/*
* Here's the block that potentially upgrades to RELEVANT_FOR_SELF.
* When we run across a file added to a directory. In such a case,
* find the directory of the file and upgrade its relevance.
*/
if (renames->dir_rename_mask == 0x07 &&
(filemask == 2 || filemask == 4)) {
/*
* Need directory rename for parent directory on other side
* of history from added file. Thus
* side = (~filemask & 0x06) >> 1
* or
* side = 3 - (filemask/2).
*/
unsigned side = 3 - (filemask >> 1);
strintmap_set(&renames->dirs_removed[side], dirname,
RELEVANT_FOR_SELF);
}
if (filemask == 0 || filemask == 7)
@ -1511,6 +1551,9 @@ static void get_provisional_directory_renames(struct merge_options *opt,
}
}
if (max == 0)
continue;
if (bad_max == max) {
path_msg(opt, source_dir, 0,
_("CONFLICT (directory rename split): "
@ -2160,7 +2203,7 @@ static inline int possible_side_renames(struct rename_info *renames,
unsigned side_index)
{
return renames->pairs[side_index].nr > 0 &&
!strset_empty(&renames->relevant_sources[side_index]);
!strintmap_empty(&renames->relevant_sources[side_index]);
}
static inline int possible_renames(struct rename_info *renames)
@ -3444,14 +3487,14 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
/* Initialization of various renames fields */
renames = &opt->priv->renames;
for (i = MERGE_SIDE1; i <= MERGE_SIDE2; i++) {
strset_init_with_options(&renames->dirs_removed[i],
NULL, 0);
strintmap_init_with_options(&renames->dirs_removed[i],
NOT_RELEVANT, NULL, 0);
strmap_init_with_options(&renames->dir_rename_count[i],
NULL, 1);
strmap_init_with_options(&renames->dir_renames[i],
NULL, 0);
strset_init_with_options(&renames->relevant_sources[i],
NULL, 0);
strintmap_init_with_options(&renames->relevant_sources[i],
0, NULL, 0);
}
/*