Clean-up Geert's similarity fingerprint code.

This splits Geert's similarity fingerprint code into main
program and fingerprinting function.  The next step would be to
try using this in pack-objects.c::try_delta() -- which would be
a good evaluation.

Signed-off-by: Junio C Hamano <junkio@cox.net>
This commit is contained in:
Junio C Hamano 2006-04-15 01:32:39 -07:00
parent 42b5c78845
commit fd2bbdd238
7 changed files with 521 additions and 0 deletions

1
.gitignore vendored
View File

@ -123,6 +123,7 @@ git-write-tree
git-core-*/?*
test-date
test-delta
test-gsimm
common-cmds.h
*.tar.gz
*.dsc

View File

@ -606,6 +606,9 @@ test-date$X: test-date.c date.o ctype.o
test-delta$X: test-delta.c diff-delta.o patch-delta.o
$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $^ -lz
test-gsimm$X: test-gsimm.c gsimm.o rabinpoly.o
$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $^
check:
for i in *.c; do sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; done
@ -654,6 +657,7 @@ clean:
rm -f *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \
$(LIB_FILE) $(XDIFF_LIB)
rm -f $(ALL_PROGRAMS) git$X
rm -f test-date$X test-delta$X test-gsimm$X
rm -f *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags
rm -rf $(GIT_TARNAME)
rm -f $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz

94
gsimm.c Normal file
View File

@ -0,0 +1,94 @@
#include "rabinpoly.h"
#include "gsimm.h"
/* Has to be power of two. Since the Rabin hash only has 63
usable bits, the number of hashes is limited to 32.
Lower powers of two could be used for speeding up processing
of very large files. */
#define NUM_HASHES_PER_CHAR 32
/* Size of cache used to eliminate duplicate substrings.
Make small enough to comfortably fit in L1 cache. */
#define DUP_CACHE_SIZE 256
/* For the final counting, do not count each bit individually, but
group them. Must be power of two, at most NUM_HASHES_PER_CHAR.
However, larger sizes result in higher cache usage. Use 8 bits
per group for efficient processing of large files on fast machines
with decent caches, or 4 bits for faster processing of small files
and for machines with small caches. */
#define GROUP_BITS 4
#define GROUP_COUNTERS (1<<GROUP_BITS)
static void freq_to_md(u_char *md, int *freq)
{ int j, k;
for (j = 0; j < MD_LENGTH; j++)
{ u_char ch = 0;
for (k = 0; k < 8; k++) ch = 2*ch + (freq[8*j+k] > 0);
md[j] = ch;
}
bzero (freq, sizeof(freq[0]) * MD_BITS);
}
void gb_simm_process(u_char *data, unsigned len, u_char *md)
{ size_t j = 0;
u_int32_t ofs;
u_int32_t dup_cache[DUP_CACHE_SIZE];
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
int freq[MD_BITS];
bzero (freq, sizeof(freq[0]) * MD_BITS);
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
/* Ignore incomplete substrings */
while (j < len && j < RABIN_WINDOW_SIZE) rabin_slide8 (data[j++]);
while (j < len)
{ u_int64_t hash;
u_int32_t ofs, sum;
u_char idx;
int k;
hash = rabin_slide8 (data[j++]);
/* In order to update a much larger frequency table
with only 32 bits of checksum, randomly select a
part of the table to update. The selection should
only depend on the content of the represented data,
and be independent of the bits used for the update.
Instead of updating 32 individual counters, process
the checksum in MD_BITS / GROUP_BITS groups of
GROUP_BITS bits, and count the frequency of each bit pattern.
*/
idx = (hash >> 32);
sum = (u_int32_t) hash;
ofs = idx % (MD_BITS / NUM_HASHES_PER_CHAR) * NUM_HASHES_PER_CHAR;
idx %= DUP_CACHE_SIZE;
if (dup_cache[idx] != sum)
{ dup_cache[idx] = sum;
for (k = 0; k < NUM_HASHES_PER_CHAR / GROUP_BITS; k++)
{ count[ofs * GROUP_COUNTERS / GROUP_BITS + (sum % GROUP_COUNTERS)]++;
ofs += GROUP_BITS;
sum >>= GROUP_BITS;
} } }
/* Distribute the occurrences of each bit group over the frequency table. */
for (ofs = 0; ofs < MD_BITS; ofs += GROUP_BITS)
{ int j;
for (j = 0; j < GROUP_COUNTERS; j++)
{ int k;
for (k = 0; k < GROUP_BITS; k++)
{ freq[ofs + k] += ((1<<k) & j)
? count[ofs * GROUP_COUNTERS / GROUP_BITS + j]
: -count[ofs * GROUP_COUNTERS / GROUP_BITS + j];
} } }
if (md)
{ rabin_reset();
freq_to_md (md, freq);
} }

28
gsimm.h Normal file
View File

@ -0,0 +1,28 @@
#ifndef GSIMM_H
#define GSIMM_H
/* Length of file message digest (MD) in bytes. Longer MD's are
better, but increase processing time for diminishing returns.
Must be multiple of NUM_HASHES_PER_CHAR / 8, and at least 24
for good results
*/
#define MD_LENGTH 32
#define MD_BITS (MD_LENGTH * 8)
/* The MIN_FILE_SIZE indicates the absolute minimal file size that
can be processed. As indicated above, the first and last
RABIN_WINDOW_SIZE - 1 bytes are skipped.
In order to get at least an average of 12 samples
per bit in the final message digest, require at least 3 * MD_LENGTH
complete windows in the file. */
#define MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
/* Limit matching algorithm to files less than 256 MB, so we can use
32 bit integers everywhere without fear of overflow. For larger
files we should add logic to mmap the file by piece and accumulate
the frequency counts. */
#define MAX_FILE_SIZE (256*1024*1024 - 1)
void gb_simm_process(u_char *data, unsigned len, u_char *md);
#endif

154
rabinpoly.c Normal file
View File

@ -0,0 +1,154 @@
/*
*
* Copyright (C) 1999 David Mazieres (dm@uun.org)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
*/
/* Faster generic_fls */
/* (c) 2002, D.Phillips and Sistina Software */
#include "rabinpoly.h"
#define MSB64 0x8000000000000000ULL
static inline unsigned fls8(unsigned n)
{
return n & 0xf0?
n & 0xc0? (n >> 7) + 7: (n >> 5) + 5:
n & 0x0c? (n >> 3) + 3: n - ((n + 1) >> 2);
}
static inline unsigned fls16(unsigned n)
{
return n & 0xff00? fls8(n >> 8) + 8: fls8(n);
}
static inline unsigned fls32(unsigned n)
{
return n & 0xffff0000? fls16(n >> 16) + 16: fls16(n);
}
static inline unsigned fls64(unsigned long long n) /* should be u64 */
{
return n & 0xffffffff00000000ULL? fls32(n >> 32) + 32: fls32(n);
}
static u_int64_t polymod (u_int64_t nh, u_int64_t nl, u_int64_t d);
static void polymult (u_int64_t *php, u_int64_t *plp,
u_int64_t x, u_int64_t y);
static u_int64_t polymmult (u_int64_t x, u_int64_t y, u_int64_t d);
static u_int64_t poly = 0xb15e234bd3792f63ull; // Actual polynomial
static u_int64_t T[256]; // Lookup table for mod
static int shift;
u_int64_t append8 (u_int64_t p, u_char m)
{ return ((p << 8) | m) ^ T[p >> shift];
}
static u_int64_t
polymod (u_int64_t nh, u_int64_t nl, u_int64_t d)
{ int i, k;
assert (d);
k = fls64 (d) - 1;
d <<= 63 - k;
if (nh) {
if (nh & MSB64)
nh ^= d;
for (i = 62; i >= 0; i--)
if (nh & 1ULL << i) {
nh ^= d >> (63 - i);
nl ^= d << (i + 1);
}
}
for (i = 63; i >= k; i--)
if (nl & 1ULL << i)
nl ^= d >> (63 - i);
return nl;
}
static void
polymult (u_int64_t *php, u_int64_t *plp, u_int64_t x, u_int64_t y)
{ int i;
u_int64_t ph = 0, pl = 0;
if (x & 1)
pl = y;
for (i = 1; i < 64; i++)
if (x & (1ULL << i)) {
ph ^= y >> (64 - i);
pl ^= y << i;
}
if (php)
*php = ph;
if (plp)
*plp = pl;
}
static u_int64_t
polymmult (u_int64_t x, u_int64_t y, u_int64_t d)
{
u_int64_t h, l;
polymult (&h, &l, x, y);
return polymod (h, l, d);
}
static int size = RABIN_WINDOW_SIZE;
static u_int64_t fingerprint = 0;
static int bufpos = -1;
static u_int64_t U[256];
static u_char buf[RABIN_WINDOW_SIZE];
void rabin_init ()
{ u_int64_t sizeshift = 1;
u_int64_t T1;
int xshift;
int i, j;
assert (poly >= 0x100);
xshift = fls64 (poly) - 1;
shift = xshift - 8;
T1 = polymod (0, 1ULL << xshift, poly);
for (j = 0; j < 256; j++)
T[j] = polymmult (j, T1, poly) | ((u_int64_t) j << xshift);
for (i = 1; i < size; i++)
sizeshift = append8 (sizeshift, 0);
for (i = 0; i < 256; i++)
U[i] = polymmult (i, sizeshift, poly);
bzero (buf, sizeof (buf));
}
void
rabin_reset ()
{ rabin_init();
fingerprint = 0;
bzero (buf, sizeof (buf));
}
u_int64_t
rabin_slide8 (u_char m)
{ u_char om;
if (++bufpos >= size) bufpos = 0;
om = buf[bufpos];
buf[bufpos] = m;
fingerprint = append8 (fingerprint ^ U[om], m);
return fingerprint;
}

31
rabinpoly.h Normal file
View File

@ -0,0 +1,31 @@
/*
*
* Copyright (C) 2000 David Mazieres (dm@uun.org)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* Translated to C and simplified by Geert Bosch (bosch@gnat.com)
*/
#include <assert.h>
#include <strings.h>
#include <sys/types.h>
#ifndef RABIN_WINDOW_SIZE
#define RABIN_WINDOW_SIZE 48
#endif
void rabin_reset();
u_int64_t rabin_slide8(u_char c);

209
test-gsimm.c Normal file
View File

@ -0,0 +1,209 @@
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "rabinpoly.h"
#include "gsimm.h"
#define MIN(x,y) ((y)<(x) ? (y) : (x))
#define MAX(x,y) ((y)>(x) ? (y) : (x))
/* The RABIN_WINDOW_SIZE is the size of fingerprint window used by
Rabin algorithm. This is not a modifiable parameter.
The first RABIN_WINDOW_SIZE - 1 bytes are skipped, in order to ensure
fingerprints are good hashes. This does somewhat reduce the
influence of the first few bytes in the file (they're part of
fewer windows, like the last few bytes), but that actually isn't
so bad as files often start with fixed content that may bias comparisons.
*/
typedef struct fileinfo
{ char *name;
size_t length;
u_char md[MD_LENGTH];
int match;
} File;
int flag_verbose = 0;
int flag_debug = 0;
char *flag_relative = 0;
char cmd[12] = " ...";
char md_strbuf[MD_LENGTH * 2 + 1];
u_char relative_md [MD_LENGTH];
File *file;
int file_count;
size_t file_bytes;
char hex[17] = "0123456789abcdef";
void usage()
{ fprintf (stderr, "usage: %s [-dhvw] [-r fingerprint] file ...\n", cmd);
fprintf (stderr, " -d\tdebug output, repeate for more verbosity\n");
fprintf (stderr, " -h\tshow this usage information\n");
fprintf (stderr, " -r\tshow distance relative to fingerprint "
"(%u hex digits)\n", MD_LENGTH * 2);
fprintf (stderr, " -v\tverbose output, repeat for even more verbosity\n");
fprintf (stderr, " -w\tenable warnings for suspect statistics\n");
exit (1);
}
int dist (u_char *l, u_char *r)
{ int j, k;
int d = 0;
for (j = 0; j < MD_LENGTH; j++)
{ u_char ch = l[j] ^ r[j];
for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0);
}
return d;
}
char *md_to_str(u_char *md)
{ int j;
for (j = 0; j < MD_LENGTH; j++)
{ u_char ch = md[j];
md_strbuf[j*2] = hex[ch >> 4];
md_strbuf[j*2+1] = hex[ch & 0xF];
}
md_strbuf[j*2] = 0;
return md_strbuf;
}
void process_file (char *name)
{ int fd;
struct stat fs;
u_char *data;
File *fi = file+file_count;;
fd = open (name, O_RDONLY, 0);
if (fd < 0)
{ perror (name);
exit (2);
}
if (fstat (fd, &fs))
{ perror (name);
exit (2);
}
if (fs.st_size >= MIN_FILE_SIZE
&& fs.st_size <= MAX_FILE_SIZE)
{ fi->length = fs.st_size;
fi->name = name;
data = (u_char *) mmap (0, fs.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == (u_char *) -1)
{ perror (name);
exit (2);
}
gb_simm_process (data, fs.st_size, fi->md);
if (flag_relative)
{ int d = dist (fi->md, relative_md);
double sim = 1.0 - MIN (1.0, (double) (d) / (MD_LENGTH * 4 - 1));
fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
md_to_str (fi->md), (long long unsigned) 0,
(unsigned) fs.st_size, name,
d, 100.0 * sim);
}
else
{
fprintf (stdout, "%s %llu %u %s\n",
md_to_str (fi->md), (long long unsigned) 0,
(unsigned) fs.st_size, name);
}
munmap (data, fs.st_size);
file_bytes += fs.st_size;
file_count++;
} else if (flag_verbose)
{ fprintf (stdout, "skipping %s (size %llu)\n", name, (long long unsigned) fs.st_size); }
close (fd);
}
u_char *str_to_md(char *str, u_char *md)
{ int j;
if (!md || !str) return 0;
bzero (md, MD_LENGTH);
for (j = 0; j < MD_LENGTH * 2; j++)
{ char ch = str[j];
if (ch >= '0' && ch <= '9')
{ md [j/2] = (md [j/2] << 4) + (ch - '0');
}
else
{ ch |= 32;
if (ch < 'a' || ch > 'f') break;
md [j/2] = (md[j/2] << 4) + (ch - 'a' + 10);
} }
return (j != MD_LENGTH * 2 || str[j] != 0) ? 0 : md;
}
int main (int argc, char *argv[])
{ int ch, j;
strncpy (cmd, basename (argv[0]), 8);
while ((ch = getopt(argc, argv, "dhr:vw")) != -1)
{ switch (ch)
{ case 'd': flag_debug++;
break;
case 'r': if (!optarg)
{ fprintf (stderr, "%s: missing argument for -r\n", cmd);
return 1;
}
if (str_to_md (optarg, relative_md)) flag_relative = optarg;
else
{ fprintf (stderr, "%s: not a valid fingerprint\n", optarg);
return 1;
}
break;
case 'v': flag_verbose++;
break;
case 'w': break;
default : usage();
return (ch != 'h');
} }
argc -= optind;
argv += optind;
if (argc == 0) usage();
rabin_reset ();
if (flag_verbose && flag_relative)
{ fprintf (stdout, "distances are relative to %s\n", flag_relative);
}
file = (File *) calloc (argc, sizeof (File));
for (j = 0; j < argc; j++) process_file (argv[j]);
if (flag_verbose)
{ fprintf (stdout, "%li bytes in %i files\n", (long) file_bytes, file_count);
}
return 0;
}