e1273106f6
EWAH is a word-aligned compressed variant of a bitset (i.e. a data structure that acts as a 0-indexed boolean array for many entries). It uses a 64-bit run-length encoding (RLE) compression scheme, trading some compression for better processing speed. The goal of this word-aligned implementation is not to achieve the best compression, but rather to improve query processing time. As it stands right now, this EWAH implementation will always be more efficient storage-wise than its uncompressed alternative. EWAH arrays will be used as the on-disk format to store reachability bitmaps for all objects in a repository while keeping reasonable sizes, in the same way that JGit does. This EWAH implementation is a mostly straightforward port of the original `javaewah` library that JGit currently uses. The library is self-contained and has been embedded whole (4 files) inside the `ewah` folder to ease redistribution. The library is re-licensed under the GPLv2 with the permission of Daniel Lemire, the original author. The source code for the C version can be found on GitHub: https://github.com/vmg/libewok The original Java implementation can also be found on GitHub: https://github.com/lemire/javaewah [jc: stripped debug-only code per Peff's $gmane/239768] Signed-off-by: Vicent Marti <tanoku@gmail.com> Signed-off-by: Jeff King <peff@peff.net> Helped-by: Ramsay Jones <ramsay@ramsay1.demon.co.uk> Signed-off-by: Junio C Hamano <gitster@pobox.com>
194 lines
4.9 KiB
C
194 lines
4.9 KiB
C
/**
|
|
* Copyright 2013, GitHub, Inc
|
|
* Copyright 2009-2013, Daniel Lemire, Cliff Moon,
|
|
* David McIntosh, Robert Becho, Google Inc. and Veronika Zenz
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version 2
|
|
* of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
*/
|
|
#include "git-compat-util.h"
|
|
#include "ewok.h"
|
|
|
|
int ewah_serialize_native(struct ewah_bitmap *self, int fd)
|
|
{
|
|
uint32_t write32;
|
|
size_t to_write = self->buffer_size * 8;
|
|
|
|
/* 32 bit -- bit size for the map */
|
|
write32 = (uint32_t)self->bit_size;
|
|
if (write(fd, &write32, 4) != 4)
|
|
return -1;
|
|
|
|
/** 32 bit -- number of compressed 64-bit words */
|
|
write32 = (uint32_t)self->buffer_size;
|
|
if (write(fd, &write32, 4) != 4)
|
|
return -1;
|
|
|
|
if (write(fd, self->buffer, to_write) != to_write)
|
|
return -1;
|
|
|
|
/** 32 bit -- position for the RLW */
|
|
write32 = self->rlw - self->buffer;
|
|
if (write(fd, &write32, 4) != 4)
|
|
return -1;
|
|
|
|
return (3 * 4) + to_write;
|
|
}
|
|
|
|
int ewah_serialize_to(struct ewah_bitmap *self,
|
|
int (*write_fun)(void *, const void *, size_t),
|
|
void *data)
|
|
{
|
|
size_t i;
|
|
eword_t dump[2048];
|
|
const size_t words_per_dump = sizeof(dump) / sizeof(eword_t);
|
|
uint32_t bitsize, word_count, rlw_pos;
|
|
|
|
const eword_t *buffer;
|
|
size_t words_left;
|
|
|
|
/* 32 bit -- bit size for the map */
|
|
bitsize = htonl((uint32_t)self->bit_size);
|
|
if (write_fun(data, &bitsize, 4) != 4)
|
|
return -1;
|
|
|
|
/** 32 bit -- number of compressed 64-bit words */
|
|
word_count = htonl((uint32_t)self->buffer_size);
|
|
if (write_fun(data, &word_count, 4) != 4)
|
|
return -1;
|
|
|
|
/** 64 bit x N -- compressed words */
|
|
buffer = self->buffer;
|
|
words_left = self->buffer_size;
|
|
|
|
while (words_left >= words_per_dump) {
|
|
for (i = 0; i < words_per_dump; ++i, ++buffer)
|
|
dump[i] = htonll(*buffer);
|
|
|
|
if (write_fun(data, dump, sizeof(dump)) != sizeof(dump))
|
|
return -1;
|
|
|
|
words_left -= words_per_dump;
|
|
}
|
|
|
|
if (words_left) {
|
|
for (i = 0; i < words_left; ++i, ++buffer)
|
|
dump[i] = htonll(*buffer);
|
|
|
|
if (write_fun(data, dump, words_left * 8) != words_left * 8)
|
|
return -1;
|
|
}
|
|
|
|
/** 32 bit -- position for the RLW */
|
|
rlw_pos = (uint8_t*)self->rlw - (uint8_t *)self->buffer;
|
|
rlw_pos = htonl(rlw_pos / sizeof(eword_t));
|
|
|
|
if (write_fun(data, &rlw_pos, 4) != 4)
|
|
return -1;
|
|
|
|
return (3 * 4) + (self->buffer_size * 8);
|
|
}
|
|
|
|
/*
 * Output callback handed to ewah_serialize_to() by ewah_serialize().
 *
 * `fd` is a file descriptor carried inside a void pointer (see the
 * intptr_t cast at the call site). write() is allowed to transfer
 * fewer bytes than requested, and ewah_serialize_to() treats any
 * result other than `len` as a failure, so keep writing until the
 * whole buffer has gone out.
 *
 * Returns `len` on success, -1 if write() fails or makes no progress.
 */
static int write_helper(void *fd, const void *buf, size_t len)
{
	const uint8_t *src = buf;
	size_t left = len;

	while (left) {
		ssize_t done = write((intptr_t)fd, src, left);

		/* 0 would mean no progress; bail out instead of spinning */
		if (done <= 0)
			return -1;

		src += done;
		left -= (size_t)done;
	}

	return (int)len;
}
|
|
|
|
int ewah_serialize(struct ewah_bitmap *self, int fd)
|
|
{
|
|
return ewah_serialize_to(self, write_helper, (void *)(intptr_t)fd);
|
|
}
|
|
|
|
int ewah_read_mmap(struct ewah_bitmap *self, void *map, size_t len)
|
|
{
|
|
uint32_t *read32 = map;
|
|
eword_t *read64;
|
|
size_t i;
|
|
|
|
self->bit_size = ntohl(*read32++);
|
|
self->buffer_size = self->alloc_size = ntohl(*read32++);
|
|
self->buffer = ewah_realloc(self->buffer,
|
|
self->alloc_size * sizeof(eword_t));
|
|
|
|
if (!self->buffer)
|
|
return -1;
|
|
|
|
for (i = 0, read64 = (void *)read32; i < self->buffer_size; ++i)
|
|
self->buffer[i] = ntohll(*read64++);
|
|
|
|
read32 = (void *)read64;
|
|
self->rlw = self->buffer + ntohl(*read32++);
|
|
|
|
return (3 * 4) + (self->buffer_size * 8);
|
|
}
|
|
|
|
int ewah_deserialize(struct ewah_bitmap *self, int fd)
|
|
{
|
|
size_t i;
|
|
eword_t dump[2048];
|
|
const size_t words_per_dump = sizeof(dump) / sizeof(eword_t);
|
|
uint32_t bitsize, word_count, rlw_pos;
|
|
|
|
eword_t *buffer = NULL;
|
|
size_t words_left;
|
|
|
|
ewah_clear(self);
|
|
|
|
/* 32 bit -- bit size for the map */
|
|
if (read(fd, &bitsize, 4) != 4)
|
|
return -1;
|
|
|
|
self->bit_size = (size_t)ntohl(bitsize);
|
|
|
|
/** 32 bit -- number of compressed 64-bit words */
|
|
if (read(fd, &word_count, 4) != 4)
|
|
return -1;
|
|
|
|
self->buffer_size = self->alloc_size = (size_t)ntohl(word_count);
|
|
self->buffer = ewah_realloc(self->buffer,
|
|
self->alloc_size * sizeof(eword_t));
|
|
|
|
if (!self->buffer)
|
|
return -1;
|
|
|
|
/** 64 bit x N -- compressed words */
|
|
buffer = self->buffer;
|
|
words_left = self->buffer_size;
|
|
|
|
while (words_left >= words_per_dump) {
|
|
if (read(fd, dump, sizeof(dump)) != sizeof(dump))
|
|
return -1;
|
|
|
|
for (i = 0; i < words_per_dump; ++i, ++buffer)
|
|
*buffer = ntohll(dump[i]);
|
|
|
|
words_left -= words_per_dump;
|
|
}
|
|
|
|
if (words_left) {
|
|
if (read(fd, dump, words_left * 8) != words_left * 8)
|
|
return -1;
|
|
|
|
for (i = 0; i < words_left; ++i, ++buffer)
|
|
*buffer = ntohll(dump[i]);
|
|
}
|
|
|
|
/** 32 bit -- position for the RLW */
|
|
if (read(fd, &rlw_pos, 4) != 4)
|
|
return -1;
|
|
|
|
self->rlw = self->buffer + ntohl(rlw_pos);
|
|
return 0;
|
|
}
|