Merge branch 'jc/diff' into next
* jc/diff: diffcore-rename: similarity estimator fix. diffcore-delta: stop using deltifier for packing.
This commit is contained in:
commit
5aad948f65
139
diffcore-delta.c
139
diffcore-delta.c
@ -1,43 +1,128 @@
|
||||
#include "cache.h"
|
||||
#include "diff.h"
|
||||
#include "diffcore.h"
|
||||
#include "delta.h"
|
||||
#include "count-delta.h"
|
||||
|
||||
static int diffcore_count_changes_1(void *src, unsigned long src_size,
|
||||
void *dst, unsigned long dst_size,
|
||||
unsigned long delta_limit,
|
||||
unsigned long *src_copied,
|
||||
unsigned long *literal_added)
|
||||
struct linehash {
|
||||
unsigned long bytes;
|
||||
unsigned long hash;
|
||||
};
|
||||
|
||||
static unsigned long hash_extended_line(const unsigned char **buf_p,
|
||||
unsigned long left)
|
||||
{
|
||||
void *delta;
|
||||
unsigned long delta_size;
|
||||
|
||||
delta = diff_delta(src, src_size,
|
||||
dst, dst_size,
|
||||
&delta_size, delta_limit);
|
||||
if (!delta)
|
||||
/* If delta_limit is exceeded, we have too much differences */
|
||||
return -1;
|
||||
|
||||
/* Estimate the edit size by interpreting delta. */
|
||||
if (count_delta(delta, delta_size, src_copied, literal_added)) {
|
||||
free(delta);
|
||||
return -1;
|
||||
/* An extended line is zero or more whitespace letters (including LF)
|
||||
* followed by one non whitespace letter followed by zero or more
|
||||
* non LF, and terminated with by a LF (or EOF).
|
||||
*/
|
||||
const unsigned char *bol = *buf_p;
|
||||
const unsigned char *buf = bol;
|
||||
unsigned long hashval = 0;
|
||||
while (left) {
|
||||
unsigned c = *buf++;
|
||||
if (!c)
|
||||
goto binary;
|
||||
left--;
|
||||
if (' ' < c) {
|
||||
hashval = c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(delta);
|
||||
while (left) {
|
||||
unsigned c = *buf++;
|
||||
if (!c)
|
||||
goto binary;
|
||||
left--;
|
||||
if (c == '\n')
|
||||
break;
|
||||
if (' ' < c)
|
||||
hashval = hashval * 11 + c;
|
||||
}
|
||||
*buf_p = buf;
|
||||
return hashval;
|
||||
|
||||
binary:
|
||||
*buf_p = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int linehash_compare(const void *a_, const void *b_)
|
||||
{
|
||||
struct linehash *a = (struct linehash *) a_;
|
||||
struct linehash *b = (struct linehash *) b_;
|
||||
if (a->hash < b->hash) return -1;
|
||||
if (a->hash > b->hash) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct linehash *hash_lines(const unsigned char *buf,
|
||||
unsigned long size)
|
||||
{
|
||||
const unsigned char *eobuf = buf + size;
|
||||
struct linehash *line = NULL;
|
||||
int alloc = 0, used = 0;
|
||||
|
||||
while (buf < eobuf) {
|
||||
const unsigned char *ptr = buf;
|
||||
unsigned long hash = hash_extended_line(&buf, eobuf-ptr);
|
||||
if (!buf) {
|
||||
free(line);
|
||||
return NULL;
|
||||
}
|
||||
if (alloc <= used) {
|
||||
alloc = alloc_nr(alloc);
|
||||
line = xrealloc(line, sizeof(*line) * alloc);
|
||||
}
|
||||
line[used].bytes = buf - ptr;
|
||||
line[used].hash = hash;
|
||||
used++;
|
||||
}
|
||||
qsort(line, used, sizeof(*line), linehash_compare);
|
||||
|
||||
/* Terminate the list */
|
||||
if (alloc <= used)
|
||||
line = xrealloc(line, sizeof(*line) * (used+1));
|
||||
line[used].bytes = line[used].hash = 0;
|
||||
return line;
|
||||
}
|
||||
|
||||
int diffcore_count_changes(void *src, unsigned long src_size,
|
||||
void *dst, unsigned long dst_size,
|
||||
unsigned long delta_limit,
|
||||
unsigned long *src_copied,
|
||||
unsigned long *literal_added)
|
||||
{
|
||||
return diffcore_count_changes_1(src, src_size,
|
||||
dst, dst_size,
|
||||
delta_limit,
|
||||
src_copied,
|
||||
literal_added);
|
||||
struct linehash *src_lines, *dst_lines;
|
||||
unsigned long sc, la;
|
||||
|
||||
src_lines = hash_lines(src, src_size);
|
||||
if (!src_lines)
|
||||
return -1;
|
||||
dst_lines = hash_lines(dst, dst_size);
|
||||
if (!dst_lines) {
|
||||
free(src_lines);
|
||||
return -1;
|
||||
}
|
||||
sc = la = 0;
|
||||
while (src_lines->bytes && dst_lines->bytes) {
|
||||
int cmp = linehash_compare(src_lines, dst_lines);
|
||||
if (!cmp) {
|
||||
sc += src_lines->bytes;
|
||||
src_lines++;
|
||||
dst_lines++;
|
||||
continue;
|
||||
}
|
||||
if (cmp < 0) {
|
||||
src_lines++;
|
||||
continue;
|
||||
}
|
||||
la += dst_lines->bytes;
|
||||
dst_lines++;
|
||||
}
|
||||
while (dst_lines->bytes) {
|
||||
la += dst_lines->bytes;
|
||||
dst_lines++;
|
||||
}
|
||||
*src_copied = sc;
|
||||
*literal_added = la;
|
||||
return 0;
|
||||
}
|
||||
|
@ -170,19 +170,15 @@ static int estimate_similarity(struct diff_filespec *src,
|
||||
&src_copied, &literal_added))
|
||||
return 0;
|
||||
|
||||
/* Extent of damage */
|
||||
if (src->size + literal_added < src_copied)
|
||||
delta_size = 0;
|
||||
else
|
||||
delta_size = (src->size - src_copied) + literal_added;
|
||||
|
||||
/*
|
||||
* Now we will give some score to it. 100% edit gets 0 points
|
||||
* and 0% edit gets MAX_SCORE points.
|
||||
/* How similar are they?
|
||||
* what percentage of material in dst are from source?
|
||||
*/
|
||||
score = MAX_SCORE - (MAX_SCORE * delta_size / base_size);
|
||||
if (score < 0) return 0;
|
||||
if (MAX_SCORE < score) return MAX_SCORE;
|
||||
if (dst->size < src_copied)
|
||||
score = MAX_SCORE;
|
||||
else if (!dst->size)
|
||||
score = 0; /* should not happen */
|
||||
else
|
||||
score = src_copied * MAX_SCORE / dst->size;
|
||||
return score;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user