Geert's similarity
Define a function to compute a similarity score in the range 0.0 <= score <= 1.0.
Signed-off-by: Junio C Hamano <junkio@cox.net>
This commit is contained in:
parent
fd2bbdd238
commit
9a305b67f8
29
gsimm.c
29
gsimm.c
@ -1,3 +1,4 @@
|
||||
#include <string.h>
|
||||
#include "rabinpoly.h"
|
||||
#include "gsimm.h"
|
||||
|
||||
@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq)
|
||||
bzero (freq, sizeof(freq[0]) * MD_BITS);
|
||||
}
|
||||
|
||||
/* Return the number of bit positions at which the two digests differ.
   Only the first MD_LENGTH elements of l and r are examined, and only
   the low eight bits of each element are compared. */
static int dist (u_char *l, u_char *r)
{
  int differing = 0;
  int idx;

  for (idx = 0; idx < MD_LENGTH; idx++) {
    /* A set bit in the XOR marks a position where the inputs disagree. */
    u_char diff = l[idx] ^ r[idx];
    unsigned mask;

    /* Walk the eight bit positions from high to low; the order does
       not affect the total. */
    for (mask = 0x80u; mask != 0; mask >>= 1) {
      if (diff & mask)
        differing++;
    }
  }

  return differing;
}
|
||||
|
||||
/* Turn the bit distance between two digests into a similarity score.
   The distance reported by dist() is divided by (MD_LENGTH * 4 - 1);
   the result is 1.0 minus that ratio, or 0 when the ratio exceeds 1.0,
   so the returned value always lies in [0.0, 1.0]. */
double gb_simm_score(u_char *l, u_char *r)
{
  const int differing_bits = dist(l, r);
  const double ratio = (double) differing_bits / (MD_LENGTH * 4 - 1);

  /* Clamp: a ratio above 1.0 would otherwise give a negative score. */
  return (ratio > 1.0) ? 0 : 1.0 - ratio;
}
|
||||
|
||||
void gb_simm_process(u_char *data, unsigned len, u_char *md)
|
||||
{ size_t j = 0;
|
||||
u_int32_t ofs;
|
||||
@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md)
|
||||
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
|
||||
int freq[MD_BITS];
|
||||
|
||||
if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
|
||||
memset(md, 0, MD_LENGTH);
|
||||
return;
|
||||
}
|
||||
|
||||
bzero (freq, sizeof(freq[0]) * MD_BITS);
|
||||
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
|
||||
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
|
||||
|
5
gsimm.h
5
gsimm.h
@ -15,14 +15,15 @@
|
||||
In order to get at least an average of 12 samples
|
||||
per bit in the final message digest, require at least 3 * MD_LENGTH
|
||||
complete windows in the file. */
|
||||
#define MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
|
||||
#define GB_SIMM_MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
|
||||
|
||||
/* Limit matching algorithm to files less than 256 MB, so we can use
|
||||
32 bit integers everywhere without fear of overflow. For larger
|
||||
files we should add logic to mmap the file by piece and accumulate
|
||||
the frequency counts. */
|
||||
#define MAX_FILE_SIZE (256*1024*1024 - 1)
|
||||
#define GB_SIMM_MAX_FILE_SIZE (256*1024*1024 - 1)
|
||||
|
||||
void gb_simm_process(u_char *data, unsigned len, u_char *md);
|
||||
double gb_simm_score(u_char *l, u_char *r);
|
||||
|
||||
#endif
|
||||
|
29
test-gsimm.c
29
test-gsimm.c
@ -58,19 +58,6 @@ void usage()
|
||||
exit (1);
|
||||
}
|
||||
|
||||
/* Count how many bits differ between the first MD_LENGTH elements of
   l and r (low eight bits of each element only). */
int dist (u_char *l, u_char *r)
{
  int total = 0;
  int pos = 0;

  while (pos < MD_LENGTH) {
    /* XOR leaves a 1 in every bit position where the two inputs differ. */
    u_char x = l[pos] ^ r[pos];
    int shift;

    /* Add each of the eight low bits to the running total. */
    for (shift = 0; shift < 8; shift++)
      total += (x >> shift) & 1;

    pos++;
  }

  return total;
}
|
||||
|
||||
char *md_to_str(u_char *md)
|
||||
{ int j;
|
||||
|
||||
@ -102,8 +89,8 @@ void process_file (char *name)
|
||||
exit (2);
|
||||
}
|
||||
|
||||
if (fs.st_size >= MIN_FILE_SIZE
|
||||
&& fs.st_size <= MAX_FILE_SIZE)
|
||||
if (fs.st_size >= GB_SIMM_MIN_FILE_SIZE
|
||||
&& fs.st_size <= GB_SIMM_MAX_FILE_SIZE)
|
||||
{ fi->length = fs.st_size;
|
||||
fi->name = name;
|
||||
|
||||
@ -116,13 +103,11 @@ void process_file (char *name)
|
||||
|
||||
gb_simm_process (data, fs.st_size, fi->md);
|
||||
if (flag_relative)
|
||||
{ int d = dist (fi->md, relative_md);
|
||||
double sim = 1.0 - MIN (1.0, (double) (d) / (MD_LENGTH * 4 - 1));
|
||||
fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
|
||||
md_to_str (fi->md), (long long unsigned) 0,
|
||||
(unsigned) fs.st_size, name,
|
||||
d, 100.0 * sim);
|
||||
}
|
||||
fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
|
||||
md_to_str (fi->md), (long long unsigned) 0,
|
||||
(unsigned) fs.st_size, name,
|
||||
(unsigned) 0,
|
||||
100.0 * gb_simm_score(fi->md, relative_md));
|
||||
else
|
||||
{
|
||||
fprintf (stdout, "%s %llu %u %s\n",
|
||||
|
Loading…
Reference in New Issue
Block a user