Geert's similarity

Define a function to compute a similarity score in the range 0.0 <= score <= 1.0.

Signed-off-by: Junio C Hamano <junkio@cox.net>
This commit is contained in:
Junio C Hamano 2006-04-16 21:07:32 -07:00
parent fd2bbdd238
commit 9a305b67f8
3 changed files with 39 additions and 24 deletions

29
gsimm.c
View File

@ -1,3 +1,4 @@
#include <string.h>
#include "rabinpoly.h"
#include "gsimm.h"
@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq)
bzero (freq, sizeof(freq[0]) * MD_BITS);
}
/* Hamming distance between two digests: the number of bit positions
   in which the first MD_LENGTH bytes of l and r differ. */
static int dist (u_char *l, u_char *r)
{
	int differing = 0;
	int i;

	for (i = 0; i < MD_LENGTH; i++) {
		u_char x = l[i] ^ r[i];	/* set bits mark the mismatches */

		/* Each pass clears the lowest set bit, so the loop runs
		   exactly once per differing bit in this byte. */
		while (x) {
			x &= x - 1;
			differing++;
		}
	}
	return differing;
}
/* Turn the bit distance between two digests into a similarity score.
   The distance is scaled by (MD_LENGTH * 4 - 1), i.e. just under half
   of the MD_LENGTH * 8 bits compared.  Identical digests score 1.0;
   the score falls linearly with the distance and is clamped to 0 once
   the scaled distance exceeds 1.0. */
double gb_simm_score(u_char *l, u_char *r)
{
	double scaled = (double) dist(l, r) / (MD_LENGTH * 4 - 1);

	/* Beyond full scale the digests are treated as wholly dissimilar. */
	if (scaled > 1.0)
		return 0;

	return 1.0 - scaled;
}
void gb_simm_process(u_char *data, unsigned len, u_char *md)
{ size_t j = 0;
u_int32_t ofs;
@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md)
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
int freq[MD_BITS];
if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
memset(md, 0, MD_LENGTH);
return;
}
bzero (freq, sizeof(freq[0]) * MD_BITS);
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));

View File

@ -15,14 +15,15 @@
In order to get at least an average of 12 samples
per bit in the final message digest, require at least 3 * MD_LENGTH
complete windows in the file. */
#define MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
#define GB_SIMM_MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
/* Limit matching algorithm to files less than 256 MB, so we can use
32 bit integers everywhere without fear of overflow. For larger
files we should add logic to mmap the file by piece and accumulate
the frequency counts. */
#define MAX_FILE_SIZE (256*1024*1024 - 1)
#define GB_SIMM_MAX_FILE_SIZE (256*1024*1024 - 1)
/* Process len bytes at data and store the result in md (presumably the
   similarity digest -- the full body is not visible here; confirm).
   When len is below GB_SIMM_MIN_FILE_SIZE or above GB_SIMM_MAX_FILE_SIZE
   the function only zero-fills MD_LENGTH bytes of md and returns. */
void gb_simm_process(u_char *data, unsigned len, u_char *md);
/* Similarity of two digests: 1.0 when they are bit-identical, decreasing
   linearly with the number of differing bits, and clamped to 0. */
double gb_simm_score(u_char *l, u_char *r);
#endif

View File

@ -58,19 +58,6 @@ void usage()
exit (1);
}
/* Hamming distance between two digests: the number of bit positions
   in which the first MD_LENGTH bytes of l and r differ. */
int dist (u_char *l, u_char *r)
{
	int differing = 0;
	int i;

	for (i = 0; i < MD_LENGTH; i++) {
		u_char x = l[i] ^ r[i];	/* set bits mark the mismatches */

		/* Each pass clears the lowest set bit, so the loop runs
		   exactly once per differing bit in this byte. */
		while (x) {
			x &= x - 1;
			differing++;
		}
	}
	return differing;
}
char *md_to_str(u_char *md)
{ int j;
@ -102,8 +89,8 @@ void process_file (char *name)
exit (2);
}
if (fs.st_size >= MIN_FILE_SIZE
&& fs.st_size <= MAX_FILE_SIZE)
if (fs.st_size >= GB_SIMM_MIN_FILE_SIZE
&& fs.st_size <= GB_SIMM_MAX_FILE_SIZE)
{ fi->length = fs.st_size;
fi->name = name;
@ -116,13 +103,11 @@ void process_file (char *name)
gb_simm_process (data, fs.st_size, fi->md);
if (flag_relative)
{ int d = dist (fi->md, relative_md);
double sim = 1.0 - MIN (1.0, (double) (d) / (MD_LENGTH * 4 - 1));
fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
md_to_str (fi->md), (long long unsigned) 0,
(unsigned) fs.st_size, name,
d, 100.0 * sim);
}
(unsigned) 0,
100.0 * gb_simm_score(fi->md, relative_md));
else
{
fprintf (stdout, "%s %llu %u %s\n",