0ed49a3ed9
Compiling this module gave the following warnings (some double dutch!): xdiff/xdiffi.c: In functie 'xdl_recs_cmp': xdiff/xdiffi.c:298: let op: 'spl.i1' may be used uninitialized in this function xdiff/xdiffi.c:298: let op: 'spl.i2' may be used uninitialized in this function xdiff/xdiffi.c:219: let op: 'fbest1' may be used uninitialized in this function xdiff/xdiffi.c:219: let op: 'bbest1' may be used uninitialized in this function A superficial tracking of their usage, without deeper knowledge about the algorithm, indeed confirms that there are code paths on which these variables will be used uninitialized. In practice these code paths might never be reached, but then these fixes will not change the algorithm. If these code paths are ever reached we now at least have a predictable outcome. And should the very small performance impact of these initializations be noticeable, then they should at least be replaced by comments why certain code paths will never be reached. Some extra initializations in this patch now fix the warnings.
466 lines
11 KiB
C
466 lines
11 KiB
C
/*
 *  LibXDiff by Davide Libenzi ( File Differential Library )
 *  Copyright (C) 2003  Davide Libenzi
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */
|
|
|
|
#include "xinclude.h"
|
|
|
|
|
|
|
|
/* Lower bound applied to the maximum edit cost computed in xdl_do_diff(). */
#define XDL_MAX_COST_MIN 256
/* Edit cost above which xdl_split() starts sampling for heuristic splits. */
#define XDL_HEUR_MIN_COST 256
/*
 * Largest value representable in a long; used as the "not reached yet"
 * marker for the backward K vector. The whole expansion is parenthesized
 * so the macro behaves as a single primary expression in every context
 * (e.g. as the operand of sizeof).
 */
#define XDL_LINE_MAX ((long)((1UL << (8 * sizeof(long) - 1)) - 1))
/* Number of consecutive matching records that counts as a "good" snake. */
#define XDL_SNAKE_CNT 20
/* Factor applied to the edit cost when scoring heuristic split candidates. */
#define XDL_K_HEUR 4
|
|
|
|
|
|
|
|
/*
 * Result of one box split performed by xdl_split().
 */
typedef struct s_xdpsplit {
	/* Split point: index into the first record sequence. */
	long i1;
	/* Split point: index into the second record sequence. */
	long i2;
	/* Passed as need_min when recursing into the box below the split. */
	int min_lo;
	/* Passed as need_min when recursing into the box above the split. */
	int min_hi;
} xdpsplit_t;
|
|
|
|
|
|
|
|
|
|
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
|
|
unsigned long const *ha2, long off2, long lim2,
|
|
long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
|
|
xdalgoenv_t *xenv);
|
|
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);
|
|
|
|
|
|
|
|
|
|
/*
|
|
* See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
|
|
* Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
|
|
* the forward diagonal starting from (off1, off2) and the backward diagonal
|
|
* starting from (lim1, lim2). If the K values on the same diagonal crosses
|
|
* returns the furthest point of reach. We might end up having to expensive
|
|
* cases using this algorithm is full, so a little bit of heuristic is needed
|
|
* to cut the search and to return a suboptimal point.
|
|
*/
|
|
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
|
|
unsigned long const *ha2, long off2, long lim2,
|
|
long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
|
|
xdalgoenv_t *xenv) {
|
|
long dmin = off1 - lim2, dmax = lim1 - off2;
|
|
long fmid = off1 - off2, bmid = lim1 - lim2;
|
|
long odd = (fmid - bmid) & 1;
|
|
long fmin = fmid, fmax = fmid;
|
|
long bmin = bmid, bmax = bmid;
|
|
long ec, d, i1, i2, prev1, best, dd, v, k;
|
|
|
|
/*
|
|
* Set initial diagonal values for both forward and backward path.
|
|
*/
|
|
kvdf[fmid] = off1;
|
|
kvdb[bmid] = lim1;
|
|
|
|
for (ec = 1;; ec++) {
|
|
int got_snake = 0;
|
|
|
|
/*
|
|
* We need to extent the diagonal "domain" by one. If the next
|
|
* values exits the box boundaries we need to change it in the
|
|
* opposite direction because (max - min) must be a power of two.
|
|
* Also we initialize the extenal K value to -1 so that we can
|
|
* avoid extra conditions check inside the core loop.
|
|
*/
|
|
if (fmin > dmin)
|
|
kvdf[--fmin - 1] = -1;
|
|
else
|
|
++fmin;
|
|
if (fmax < dmax)
|
|
kvdf[++fmax + 1] = -1;
|
|
else
|
|
--fmax;
|
|
|
|
for (d = fmax; d >= fmin; d -= 2) {
|
|
if (kvdf[d - 1] >= kvdf[d + 1])
|
|
i1 = kvdf[d - 1] + 1;
|
|
else
|
|
i1 = kvdf[d + 1];
|
|
prev1 = i1;
|
|
i2 = i1 - d;
|
|
for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
|
|
if (i1 - prev1 > xenv->snake_cnt)
|
|
got_snake = 1;
|
|
kvdf[d] = i1;
|
|
if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
|
|
spl->i1 = i1;
|
|
spl->i2 = i2;
|
|
spl->min_lo = spl->min_hi = 1;
|
|
return ec;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* We need to extent the diagonal "domain" by one. If the next
|
|
* values exits the box boundaries we need to change it in the
|
|
* opposite direction because (max - min) must be a power of two.
|
|
* Also we initialize the extenal K value to -1 so that we can
|
|
* avoid extra conditions check inside the core loop.
|
|
*/
|
|
if (bmin > dmin)
|
|
kvdb[--bmin - 1] = XDL_LINE_MAX;
|
|
else
|
|
++bmin;
|
|
if (bmax < dmax)
|
|
kvdb[++bmax + 1] = XDL_LINE_MAX;
|
|
else
|
|
--bmax;
|
|
|
|
for (d = bmax; d >= bmin; d -= 2) {
|
|
if (kvdb[d - 1] < kvdb[d + 1])
|
|
i1 = kvdb[d - 1];
|
|
else
|
|
i1 = kvdb[d + 1] - 1;
|
|
prev1 = i1;
|
|
i2 = i1 - d;
|
|
for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
|
|
if (prev1 - i1 > xenv->snake_cnt)
|
|
got_snake = 1;
|
|
kvdb[d] = i1;
|
|
if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
|
|
spl->i1 = i1;
|
|
spl->i2 = i2;
|
|
spl->min_lo = spl->min_hi = 1;
|
|
return ec;
|
|
}
|
|
}
|
|
|
|
if (need_min)
|
|
continue;
|
|
|
|
/*
|
|
* If the edit cost is above the heuristic trigger and if
|
|
* we got a good snake, we sample current diagonals to see
|
|
* if some of the, have reached an "interesting" path. Our
|
|
* measure is a function of the distance from the diagonal
|
|
* corner (i1 + i2) penalized with the distance from the
|
|
* mid diagonal itself. If this value is above the current
|
|
* edit cost times a magic factor (XDL_K_HEUR) we consider
|
|
* it interesting.
|
|
*/
|
|
if (got_snake && ec > xenv->heur_min) {
|
|
for (best = 0, d = fmax; d >= fmin; d -= 2) {
|
|
dd = d > fmid ? d - fmid: fmid - d;
|
|
i1 = kvdf[d];
|
|
i2 = i1 - d;
|
|
v = (i1 - off1) + (i2 - off2) - dd;
|
|
|
|
if (v > XDL_K_HEUR * ec && v > best &&
|
|
off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
|
|
off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
|
|
for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
|
|
if (k == xenv->snake_cnt) {
|
|
best = v;
|
|
spl->i1 = i1;
|
|
spl->i2 = i2;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (best > 0) {
|
|
spl->min_lo = 1;
|
|
spl->min_hi = 0;
|
|
return ec;
|
|
}
|
|
|
|
for (best = 0, d = bmax; d >= bmin; d -= 2) {
|
|
dd = d > bmid ? d - bmid: bmid - d;
|
|
i1 = kvdb[d];
|
|
i2 = i1 - d;
|
|
v = (lim1 - i1) + (lim2 - i2) - dd;
|
|
|
|
if (v > XDL_K_HEUR * ec && v > best &&
|
|
off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
|
|
off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
|
|
for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
|
|
if (k == xenv->snake_cnt - 1) {
|
|
best = v;
|
|
spl->i1 = i1;
|
|
spl->i2 = i2;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (best > 0) {
|
|
spl->min_lo = 0;
|
|
spl->min_hi = 1;
|
|
return ec;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Enough is enough. We spent too much time here and now we collect
|
|
* the furthest reaching path using the (i1 + i2) measure.
|
|
*/
|
|
if (ec >= xenv->mxcost) {
|
|
long fbest, fbest1, bbest, bbest1;
|
|
|
|
fbest = fbest1 = -1;
|
|
for (d = fmax; d >= fmin; d -= 2) {
|
|
i1 = XDL_MIN(kvdf[d], lim1);
|
|
i2 = i1 - d;
|
|
if (lim2 < i2)
|
|
i1 = lim2 + d, i2 = lim2;
|
|
if (fbest < i1 + i2) {
|
|
fbest = i1 + i2;
|
|
fbest1 = i1;
|
|
}
|
|
}
|
|
|
|
bbest = bbest1 = XDL_LINE_MAX;
|
|
for (d = bmax; d >= bmin; d -= 2) {
|
|
i1 = XDL_MAX(off1, kvdb[d]);
|
|
i2 = i1 - d;
|
|
if (i2 < off2)
|
|
i1 = off2 + d, i2 = off2;
|
|
if (i1 + i2 < bbest) {
|
|
bbest = i1 + i2;
|
|
bbest1 = i1;
|
|
}
|
|
}
|
|
|
|
if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
|
|
spl->i1 = fbest1;
|
|
spl->i2 = fbest - fbest1;
|
|
spl->min_lo = 1;
|
|
spl->min_hi = 0;
|
|
} else {
|
|
spl->i1 = bbest1;
|
|
spl->i2 = bbest - bbest1;
|
|
spl->min_lo = 0;
|
|
spl->min_hi = 1;
|
|
}
|
|
return ec;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
|
|
/*
|
|
* Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
|
|
* the box splitting function. Note that the real job (marking changed lines)
|
|
* is done in the two boundary reaching checks.
|
|
*/
|
|
int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
|
|
diffdata_t *dd2, long off2, long lim2,
|
|
long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
|
|
unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;
|
|
|
|
/*
|
|
* Shrink the box by walking through each diagonal snake (SW and NE).
|
|
*/
|
|
for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
|
|
for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);
|
|
|
|
/*
|
|
* If one dimension is empty, then all records on the other one must
|
|
* be obviously changed.
|
|
*/
|
|
if (off1 == lim1) {
|
|
char *rchg2 = dd2->rchg;
|
|
long *rindex2 = dd2->rindex;
|
|
|
|
for (; off2 < lim2; off2++)
|
|
rchg2[rindex2[off2]] = 1;
|
|
} else if (off2 == lim2) {
|
|
char *rchg1 = dd1->rchg;
|
|
long *rindex1 = dd1->rindex;
|
|
|
|
for (; off1 < lim1; off1++)
|
|
rchg1[rindex1[off1]] = 1;
|
|
} else {
|
|
long ec;
|
|
xdpsplit_t spl;
|
|
spl.i1 = spl.i2 = 0;
|
|
|
|
/*
|
|
* Divide ...
|
|
*/
|
|
if ((ec = xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
|
|
need_min, &spl, xenv)) < 0) {
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* ... et Impera.
|
|
*/
|
|
if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
|
|
kvdf, kvdb, spl.min_lo, xenv) < 0 ||
|
|
xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
|
|
kvdf, kvdb, spl.min_hi, xenv) < 0) {
|
|
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
|
xdfenv_t *xe) {
|
|
long ndiags;
|
|
long *kvd, *kvdf, *kvdb;
|
|
xdalgoenv_t xenv;
|
|
diffdata_t dd1, dd2;
|
|
|
|
if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Allocate and setup K vectors to be used by the differential algorithm.
|
|
* One is to store the forward path and one to store the backward path.
|
|
*/
|
|
ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
|
|
if (!(kvd = (long *) xdl_malloc((2 * ndiags + 2) * sizeof(long)))) {
|
|
|
|
xdl_free_env(xe);
|
|
return -1;
|
|
}
|
|
kvdf = kvd;
|
|
kvdb = kvdf + ndiags;
|
|
kvdf += xe->xdf2.nreff + 1;
|
|
kvdb += xe->xdf2.nreff + 1;
|
|
|
|
xenv.mxcost = xdl_bogosqrt(ndiags);
|
|
if (xenv.mxcost < XDL_MAX_COST_MIN)
|
|
xenv.mxcost = XDL_MAX_COST_MIN;
|
|
xenv.snake_cnt = XDL_SNAKE_CNT;
|
|
xenv.heur_min = XDL_HEUR_MIN_COST;
|
|
|
|
dd1.nrec = xe->xdf1.nreff;
|
|
dd1.ha = xe->xdf1.ha;
|
|
dd1.rchg = xe->xdf1.rchg;
|
|
dd1.rindex = xe->xdf1.rindex;
|
|
dd2.nrec = xe->xdf2.nreff;
|
|
dd2.ha = xe->xdf2.ha;
|
|
dd2.rchg = xe->xdf2.rchg;
|
|
dd2.rindex = xe->xdf2.rindex;
|
|
|
|
if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
|
|
kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {
|
|
|
|
xdl_free(kvd);
|
|
xdl_free_env(xe);
|
|
return -1;
|
|
}
|
|
|
|
xdl_free(kvd);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
 * Allocates a new change record describing chg1 records changed at i1 in
 * the first file and chg2 records changed at i2 in the second one, and
 * links it in front of the list xscr.
 *
 * Returns the new list head, or NULL if the allocation fails (xscr is left
 * untouched in that case).
 */
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
	xdchange_t *node = (xdchange_t *) xdl_malloc(sizeof(xdchange_t));

	if (node == NULL) {
		return NULL;
	}

	node->i1 = i1;
	node->i2 = i2;
	node->chg1 = chg1;
	node->chg2 = chg2;
	node->next = xscr;

	return node;
}
|
|
|
|
|
|
/*
 * Builds the edit script for the environment xe from its rchg arrays and
 * stores the head of the resulting list in *xscr (NULL when no record is
 * marked as changed).
 *
 * Returns 0 on success. Returns -1 if a change record cannot be allocated;
 * the partially built script is freed and *xscr is not written.
 */
int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
	xdchange_t *script = NULL, *head;
	char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
	long i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec;
	long end1, end2;

	/*
	 * Trivial. Walks both files backwards, collects "groups" of changed
	 * records and creates one script entry per group. Since each entry
	 * is linked in front of the list, the finished script is ordered by
	 * increasing position.
	 *
	 * NOTE(review): the flags are read at index i - 1 down to i == 0,
	 * so this presumably relies on each rchg array having a zeroed guard
	 * element before index 0 -- confirm against the code that allocates
	 * the arrays.
	 */
	for (; i1 >= 0 || i2 >= 0; i1--, i2--) {
		if (!rchg1[i1 - 1] && !rchg2[i2 - 1]) {
			continue;
		}

		/* Extend the group over every adjacent changed record. */
		end1 = i1;
		while (rchg1[i1 - 1]) {
			i1--;
		}
		end2 = i2;
		while (rchg2[i2 - 1]) {
			i2--;
		}

		head = xdl_add_change(script, i1, i2, end1 - i1, end2 - i2);
		if (head == NULL) {
			xdl_free_script(script);
			return -1;
		}
		script = head;
	}

	*xscr = script;

	return 0;
}
|
|
|
|
|
|
/*
 * Frees every change record of the edit script xscr. Passing NULL is
 * allowed and does nothing.
 */
void xdl_free_script(xdchange_t *xscr) {
	while (xscr != NULL) {
		/* Fetch the link before the node holding it is released. */
		xdchange_t *next = xscr->next;

		xdl_free(xscr);
		xscr = next;
	}
}
|
|
|
|
|
|
int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
|
|
xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
|
|
xdchange_t *xscr;
|
|
xdfenv_t xe;
|
|
|
|
if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {
|
|
|
|
return -1;
|
|
}
|
|
|
|
if (xdl_build_script(&xe, &xscr) < 0) {
|
|
|
|
xdl_free_env(&xe);
|
|
return -1;
|
|
}
|
|
|
|
if (xscr) {
|
|
if (xdl_emit_diff(&xe, xscr, ecb, xecfg) < 0) {
|
|
|
|
xdl_free_script(xscr);
|
|
xdl_free_env(&xe);
|
|
return -1;
|
|
}
|
|
|
|
xdl_free_script(xscr);
|
|
}
|
|
|
|
xdl_free_env(&xe);
|
|
|
|
return 0;
|
|
}
|
|
|