Merge branch 'dk/diff-delta'

* dk/diff-delta: diff-delta.c: Rationalize culling of hash buckets diff-delta.c: pack the index structure
2007-09-14 18:33:15 -07:00 · 2007-09-14 18:33:15 -07:00 · eb6d54fc79
commit eb6d54fc79
parent 04222b245c 02e665ce49
1 changed files with 86 additions and 23 deletions
--- a/diff-delta.c
+++ b/diff-delta.c
@ -115,7 +115,11 @@ static const unsigned int U[256] = {
 struct index_entry {
 	const unsigned char *ptr;
 	unsigned int val;
-	struct index_entry *next;
+};
 struct unpacked_index_entry {
 	struct index_entry entry;
 	struct unpacked_index_entry *next;
 };
 struct delta_index {
@ -131,7 +135,8 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
-	struct index_entry *entry, **hash;
+	struct unpacked_index_entry *entry, **hash;
 	struct index_entry *packed_entry, **packed_hash;
 	void *mem;
 	unsigned long memsize;
@ -148,28 +153,21 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	hmask = hsize - 1;
 	/* allocate lookup index */
-	memsize = sizeof(*index) +
+	memsize = sizeof(*hash) * hsize +
 		  sizeof(*hash) * hsize +
 		  sizeof(*entry) * entries;
 	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
 	index = mem;
 	mem = index + 1;
 	hash = mem;
 	mem = hash + hsize;
 	entry = mem;
 	index->memsize = memsize;
 	index->src_buf = buf;
 	index->src_size = bufsize;
 	index->hash_mask = hmask;
 	memset(hash, 0, hsize * sizeof(*hash));
 	/* allocate an array to count hash entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
-		free(index);
+		free(hash);
 		return NULL;
 	}
@ -183,12 +181,13 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
-			entry[-1].ptr = data + RABIN_WINDOW;
+			entry[-1].entry.ptr = data + RABIN_WINDOW;
 			--entries;
 		} else {
 			prev_val = val;
 			i = val & hmask;
-			entry->ptr = data + RABIN_WINDOW;
+			entry->entry.ptr = data + RABIN_WINDOW;
-			entry->val = val;
+			entry->entry.val = val;
 			entry->next = hash[i];
 			hash[i] = entry++;
 			hash_count[i]++;
@ -208,20 +207,84 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	 * the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
-		if (hash_count[i] < HASH_LIMIT)
+		int acc;
 		if (hash_count[i] <= HASH_LIMIT)
 			continue;
 		entries -= hash_count[i] - HASH_LIMIT;
 		/* We leave exactly HASH_LIMIT entries in the bucket */
 		entry = hash[i];
 		acc = 0;
 		do {
-			struct index_entry *keep = entry;
+			acc += hash_count[i] - HASH_LIMIT;
-			int skip = hash_count[i] / HASH_LIMIT;
+			if (acc > 0) {
 				struct unpacked_index_entry *keep = entry;
 				do {
 					entry = entry->next;
-			} while(--skip && entry);
+					acc -= HASH_LIMIT;
-			keep->next = entry;
+				} while (acc > 0);
 				keep->next = entry->next;
 			}
 			entry = entry->next;
 		} while (entry);
 		/* Assume that this loop is gone through exactly
 		 * HASH_LIMIT times and is entered and left with
 		 * acc==0.  So the first statement in the loop
 		 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
 		 * to the accumulator, and the inner loop consequently
 		 * is run (hash_count[i]-HASH_LIMIT) times, removing
 		 * one element from the list each time.  Since acc
 		 * balances out to 0 at the final run, the inner loop
 		 * body can't be left with entry==NULL.  So we indeed
 		 * encounter entry==NULL in the outer loop only.
 		 */
 	}
 	free(hash_count);
 	/* Now create the packed index in array form rather than
 	 * linked lists */
 	memsize = sizeof(*index)
 		+ sizeof(*packed_hash) * (hsize+1)
 		+ sizeof(*packed_entry) * entries;
 	mem = malloc(memsize);
 	if (!mem) {
 		free(hash);
 		return NULL;
 	}
 	index = mem;
 	index->memsize = memsize;
 	index->src_buf = buf;
 	index->src_size = bufsize;
 	index->hash_mask = hmask;
 	mem = index + 1;
 	packed_hash = mem;
 	mem = packed_hash + (hsize+1);
 	packed_entry = mem;
 	/* Coalesce all entries belonging to one linked list into
 	 * consecutive array entries */
 	for (i = 0; i < hsize; i++) {
 		packed_hash[i] = packed_entry;
 		for (entry = hash[i]; entry; entry = entry->next)
 			*packed_entry++ = entry->entry;
 	}
 	/* Sentinel value to indicate the length of the last hash
 	 * bucket */
 	packed_hash[hsize] = packed_entry;
 	assert(packed_entry - (struct index_entry *)mem == entries);
 	free(hash);
 	return index;
 }
@ -302,7 +365,7 @@ create_delta(const struct delta_index *index,
 			val ^= U[data[-RABIN_WINDOW]];
 			val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 			i = val & index->hash_mask;
-			for (entry = index->hash[i]; entry; entry = entry->next) {
+			for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
 				unsigned int ref_size = ref_top - ref;