combine-diff: coalesce lost lines optimally

This replaces the greedy implementation to coalesce lost lines by using
dynamic programming to find the Longest Common Subsequence.

The O(n²) time complexity is obviously bigger than previous
implementation but it can produce shorter diff results (and most likely
easier to read).

List of lost lines is now doubly-linked because we reverse-read it when
reading the direction matrix.

Signed-off-by: Antoine Pelisse <apelisse@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Antoine Pelisse 2013-03-23 18:23:28 +01:00 committed by Junio C Hamano
parent fa04ae0be8
commit 99d3206010
2 changed files with 320 additions and 64 deletions

View File

@ -74,16 +74,24 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr,
/* Lines lost from parent */
struct lline {
struct lline *next;
struct lline *next, *prev;
int len;
unsigned long parent_map;
char line[FLEX_ARRAY];
};
/* Lines lost from current parent (before coalescing) */
struct plost {
struct lline *lost_head, *lost_tail;
int len;
};
/* Lines surviving in the merge result */
struct sline {
struct lline *lost_head, **lost_tail;
struct lline *next_lost;
/* Accumulated and coalesced lost lines */
struct lline *lost;
int lenlost;
struct plost plost;
char *bol;
int len;
/* bit 0 up to (N-1) are on if the parent has this line (i.e.
@ -95,34 +103,6 @@ struct sline {
unsigned long *p_lno;
};
static char *grab_blob(const unsigned char *sha1, unsigned int mode,
unsigned long *size, struct userdiff_driver *textconv,
const char *path)
{
char *blob;
enum object_type type;
if (S_ISGITLINK(mode)) {
blob = xmalloc(100);
*size = snprintf(blob, 100,
"Subproject commit %s\n", sha1_to_hex(sha1));
} else if (is_null_sha1(sha1)) {
/* deleted blob */
*size = 0;
return xcalloc(1, 1);
} else if (textconv) {
struct diff_filespec *df = alloc_filespec(path);
fill_filespec(df, sha1, 1, mode);
*size = fill_textconv(textconv, df, &blob);
free_filespec(df);
} else {
blob = read_sha1_file(sha1, &type, size);
if (type != OBJ_BLOB)
die("object '%s' is not a blob!", sha1_to_hex(sha1));
}
return blob;
}
static int match_string_spaces(const char *line1, int len1,
const char *line2, int len2,
long flags)
@ -163,36 +143,180 @@ static int match_string_spaces(const char *line1, int len1,
return 0;
}
static void append_lost(struct sline *sline, int n, const char *line, int len, long flags)
enum coalesce_direction { MATCH, BASE, NEW };
/* Coalesce new lines into base by finding LCS */
static struct lline *coalesce_lines(struct lline *base, int *lenbase,
struct lline *new, int lennew,
unsigned long parent, long flags)
{
int **lcs;
enum coalesce_direction **directions;
struct lline *baseend, *newend = NULL;
int i, j, origbaselen = *lenbase;
if (new == NULL)
return base;
if (base == NULL) {
*lenbase = lennew;
return new;
}
/*
* Coalesce new lines into base by finding the LCS
* - Create the table to run dynamic programing
* - Compute the LCS
* - Then reverse read the direction structure:
* - If we have MATCH, assign parent to base flag, and consume
* both baseend and newend
* - Else if we have BASE, consume baseend
* - Else if we have NEW, insert newend lline into base and
* consume newend
*/
lcs = xcalloc(origbaselen + 1, sizeof(int*));
directions = xcalloc(origbaselen + 1, sizeof(enum coalesce_direction*));
for (i = 0; i < origbaselen + 1; i++) {
lcs[i] = xcalloc(lennew + 1, sizeof(int));
directions[i] = xcalloc(lennew + 1, sizeof(enum coalesce_direction));
directions[i][0] = BASE;
}
for (j = 1; j < lennew + 1; j++)
directions[0][j] = NEW;
for (i = 1, baseend = base; i < origbaselen + 1; i++) {
for (j = 1, newend = new; j < lennew + 1; j++) {
if (match_string_spaces(baseend->line, baseend->len,
newend->line, newend->len, flags)) {
lcs[i][j] = lcs[i - 1][j - 1] + 1;
directions[i][j] = MATCH;
} else if (lcs[i][j - 1] >= lcs[i - 1][j]) {
lcs[i][j] = lcs[i][j - 1];
directions[i][j] = NEW;
} else {
lcs[i][j] = lcs[i - 1][j];
directions[i][j] = BASE;
}
if (newend->next)
newend = newend->next;
}
if (baseend->next)
baseend = baseend->next;
}
for (i = 0; i < origbaselen + 1; i++)
free(lcs[i]);
free(lcs);
/* At this point, baseend and newend point to the end of each lists */
i--;
j--;
while (i != 0 || j != 0) {
if (directions[i][j] == MATCH) {
baseend->parent_map |= 1<<parent;
baseend = baseend->prev;
newend = newend->prev;
i--;
j--;
} else if (directions[i][j] == NEW) {
struct lline *lline;
lline = newend;
/* Remove lline from new list and update newend */
if (lline->prev)
lline->prev->next = lline->next;
else
new = lline->next;
if (lline->next)
lline->next->prev = lline->prev;
newend = lline->prev;
j--;
/* Add lline to base list */
if (baseend) {
lline->next = baseend->next;
lline->prev = baseend;
if (lline->prev)
lline->prev->next = lline;
}
else {
lline->next = base;
base = lline;
}
(*lenbase)++;
if (lline->next)
lline->next->prev = lline;
} else {
baseend = baseend->prev;
i--;
}
}
newend = new;
while (newend) {
struct lline *lline = newend;
newend = newend->next;
free(lline);
}
for (i = 0; i < origbaselen + 1; i++)
free(directions[i]);
free(directions);
return base;
}
static char *grab_blob(const unsigned char *sha1, unsigned int mode,
unsigned long *size, struct userdiff_driver *textconv,
const char *path)
{
char *blob;
enum object_type type;
if (S_ISGITLINK(mode)) {
blob = xmalloc(100);
*size = snprintf(blob, 100,
"Subproject commit %s\n", sha1_to_hex(sha1));
} else if (is_null_sha1(sha1)) {
/* deleted blob */
*size = 0;
return xcalloc(1, 1);
} else if (textconv) {
struct diff_filespec *df = alloc_filespec(path);
fill_filespec(df, sha1, 1, mode);
*size = fill_textconv(textconv, df, &blob);
free_filespec(df);
} else {
blob = read_sha1_file(sha1, &type, size);
if (type != OBJ_BLOB)
die("object '%s' is not a blob!", sha1_to_hex(sha1));
}
return blob;
}
static void append_lost(struct sline *sline, int n, const char *line, int len)
{
struct lline *lline;
unsigned long this_mask = (1UL<<n);
if (line[len-1] == '\n')
len--;
/* Check to see if we can squash things */
if (sline->lost_head) {
lline = sline->next_lost;
while (lline) {
if (match_string_spaces(lline->line, lline->len,
line, len, flags)) {
lline->parent_map |= this_mask;
sline->next_lost = lline->next;
return;
}
lline = lline->next;
}
}
lline = xmalloc(sizeof(*lline) + len + 1);
lline->len = len;
lline->next = NULL;
lline->prev = sline->plost.lost_tail;
if (lline->prev)
lline->prev->next = lline;
else
sline->plost.lost_head = lline;
sline->plost.lost_tail = lline;
sline->plost.len++;
lline->parent_map = this_mask;
memcpy(lline->line, line, len);
lline->line[len] = 0;
*sline->lost_tail = lline;
sline->lost_tail = &lline->next;
sline->next_lost = NULL;
}
struct combine_diff_state {
@ -203,7 +327,6 @@ struct combine_diff_state {
int n;
struct sline *sline;
struct sline *lost_bucket;
long flags;
};
static void consume_line(void *state_, char *line, unsigned long len)
@ -236,14 +359,13 @@ static void consume_line(void *state_, char *line, unsigned long len)
xcalloc(state->num_parent,
sizeof(unsigned long));
state->sline[state->nb-1].p_lno[state->n] = state->ob;
state->lost_bucket->next_lost = state->lost_bucket->lost_head;
return;
}
if (!state->lost_bucket)
return; /* not in any hunk yet */
switch (line[0]) {
case '-':
append_lost(state->lost_bucket, state->n, line+1, len-1, state->flags);
append_lost(state->lost_bucket, state->n, line+1, len-1);
break;
case '+':
state->sline[state->lno-1].flag |= state->nmask;
@ -276,7 +398,6 @@ static void combine_diff(const unsigned char *parent, unsigned int mode,
xpp.flags = flags;
memset(&xecfg, 0, sizeof(xecfg));
memset(&state, 0, sizeof(state));
state.flags = flags;
state.nmask = nmask;
state.sline = sline;
state.lno = 1;
@ -298,8 +419,18 @@ static void combine_diff(const unsigned char *parent, unsigned int mode,
struct lline *ll;
sline[lno].p_lno[n] = p_lno;
/* Coalesce new lines */
if (sline[lno].plost.lost_head) {
struct sline *sl = &sline[lno];
sl->lost = coalesce_lines(sl->lost, &sl->lenlost,
sl->plost.lost_head,
sl->plost.len, n, flags);
sl->plost.lost_head = sl->plost.lost_tail = NULL;
sl->plost.len = 0;
}
/* How many lines would this sline advance the p_lno? */
ll = sline[lno].lost_head;
ll = sline[lno].lost;
while (ll) {
if (ll->parent_map & nmask)
p_lno++; /* '-' means parent had it */
@ -319,7 +450,7 @@ static int interesting(struct sline *sline, unsigned long all_mask)
/* If some parents lost lines here, or if we have added to
* some parent, it is interesting.
*/
return ((sline->flag & all_mask) || sline->lost_head);
return ((sline->flag & all_mask) || sline->lost);
}
static unsigned long adjust_hunk_tail(struct sline *sline,
@ -502,7 +633,7 @@ static int make_hunks(struct sline *sline, unsigned long cnt,
has_interesting = 0;
for (j = i; j < hunk_end && !has_interesting; j++) {
unsigned long this_diff = sline[j].flag & all_mask;
struct lline *ll = sline[j].lost_head;
struct lline *ll = sline[j].lost;
if (this_diff) {
/* This has some changes. Is it the
* same as others?
@ -656,7 +787,7 @@ static void dump_sline(struct sline *sline, const char *line_prefix,
int j;
unsigned long p_mask;
struct sline *sl = &sline[lno++];
ll = (sl->flag & no_pre_delete) ? NULL : sl->lost_head;
ll = (sl->flag & no_pre_delete) ? NULL : sl->lost;
while (ll) {
printf("%s%s", line_prefix, c_old);
for (j = 0; j < num_parent; j++) {
@ -707,7 +838,7 @@ static void reuse_combine_diff(struct sline *sline, unsigned long cnt,
jmask = (1UL<<j);
for (lno = 0; lno <= cnt; lno++) {
struct lline *ll = sline->lost_head;
struct lline *ll = sline->lost;
sline->p_lno[i] = sline->p_lno[j];
while (ll) {
if (ll->parent_map & jmask)
@ -966,10 +1097,6 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,
sline = xcalloc(cnt+2, sizeof(*sline));
sline[0].bol = result;
for (lno = 0; lno <= cnt + 1; lno++) {
sline[lno].lost_tail = &sline[lno].lost_head;
sline[lno].flag = 0;
}
for (lno = 0, cp = result; cp < result + result_size; cp++) {
if (*cp == '\n') {
sline[lno].len = cp - sline[lno].bol;
@ -1019,8 +1146,8 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,
free(result);
for (lno = 0; lno < cnt; lno++) {
if (sline[lno].lost_head) {
struct lline *ll = sline[lno].lost_head;
if (sline[lno].lost) {
struct lline *ll = sline[lno].lost;
while (ll) {
struct lline *tmp = ll;
ll = ll->next;

View File

@ -224,4 +224,133 @@ test_expect_success 'check combined output (ignore all spaces)' '
compare_diff_patch expected actual
'
test_expect_success 'combine diff coalesce simple' '
>test &&
git add test &&
git commit -m initial &&
test_seq 4 >test &&
git commit -a -m empty1 &&
git branch side1 &&
git checkout HEAD^ &&
test_seq 5 >test &&
git commit -a -m empty2 &&
test_must_fail git merge side1 &&
>test &&
git commit -a -m merge &&
git show >actual.tmp &&
sed -e "1,/^@@@/d" < actual.tmp >actual &&
tr -d Q <<-\EOF >expected &&
--1
--2
--3
--4
- 5
EOF
compare_diff_patch expected actual
'
test_expect_success 'combine diff coalesce tricky' '
>test &&
git add test &&
git commit -m initial --allow-empty &&
cat <<-\EOF >test &&
3
1
2
3
4
EOF
git commit -a -m empty1 &&
git branch -f side1 &&
git checkout HEAD^ &&
cat <<-\EOF >test &&
1
3
5
4
EOF
git commit -a -m empty2 &&
git branch -f side2 &&
test_must_fail git merge side1 &&
>test &&
git commit -a -m merge &&
git show >actual.tmp &&
sed -e "1,/^@@@/d" < actual.tmp >actual &&
tr -d Q <<-\EOF >expected &&
-3
--1
-2
--3
- 5
--4
EOF
compare_diff_patch expected actual &&
git checkout -f side1 &&
test_must_fail git merge side2 &&
>test &&
git commit -a -m merge &&
git show >actual.tmp &&
sed -e "1,/^@@@/d" < actual.tmp >actual &&
tr -d Q <<-\EOF >expected &&
- 3
--1
- 2
--3
-5
--4
EOF
compare_diff_patch expected actual
'
test_expect_failure 'combine diff coalesce three parents' '
>test &&
git add test &&
git commit -m initial --allow-empty &&
cat <<-\EOF >test &&
3
1
2
3
4
EOF
git commit -a -m empty1 &&
git checkout -B side1 &&
git checkout HEAD^ &&
cat <<-\EOF >test &&
1
3
7
5
4
EOF
git commit -a -m empty2 &&
git branch -f side2 &&
git checkout HEAD^ &&
cat <<-\EOF >test &&
3
1
6
5
4
EOF
git commit -a -m empty3 &&
>test &&
git add test &&
TREE=$(git write-tree) &&
COMMIT=$(git commit-tree -p HEAD -p side1 -p side2 -m merge $TREE) &&
git show $COMMIT >actual.tmp &&
sed -e "1,/^@@@/d" < actual.tmp >actual &&
tr -d Q <<-\EOF >expected &&
-- 3
---1
- 6
- 2
--3
-7
- -5
---4
EOF
compare_diff_patch expected actual
'
test_done