Merge branch 'rs/mergesort'

The mergesort implementation used to sort linked list has been
optimized.

* rs/mergesort:
  test-mergesort: use repeatable random numbers
  mergesort: use ranks stack
  p0071: test performance of llist_mergesort()
  p0071: measure sorting of already sorted and reversed files
  test-mergesort: add unriffle_skewed mode
  test-mergesort: add unriffle mode
  test-mergesort: add generate subcommand
  test-mergesort: add test subcommand
  test-mergesort: add sort subcommand
  test-mergesort: use strbuf_getline()
This commit is contained in:
Junio C Hamano 2021-10-18 15:47:56 -07:00
commit 0ef08090d2
4 changed files with 474 additions and 68 deletions

View File

@ -1,73 +1,84 @@
#include "cache.h"
#include "mergesort.h"
struct mergesort_sublist {
void *ptr;
unsigned long len;
};
static void *get_nth_next(void *list, unsigned long n,
void *(*get_next_fn)(const void *))
/* Combine two sorted lists. Take from `list` on equality. */
static void *llist_merge(void *list, void *other,
void *(*get_next_fn)(const void *),
void (*set_next_fn)(void *, void *),
int (*compare_fn)(const void *, const void *))
{
while (n-- && list)
list = get_next_fn(list);
return list;
}
static void *pop_item(struct mergesort_sublist *l,
void *(*get_next_fn)(const void *))
{
void *p = l->ptr;
l->ptr = get_next_fn(l->ptr);
l->len = l->ptr ? (l->len - 1) : 0;
return p;
void *result = list, *tail;
if (compare_fn(list, other) > 0) {
result = other;
goto other;
}
for (;;) {
do {
tail = list;
list = get_next_fn(list);
if (!list) {
set_next_fn(tail, other);
return result;
}
} while (compare_fn(list, other) <= 0);
set_next_fn(tail, other);
other:
do {
tail = other;
other = get_next_fn(other);
if (!other) {
set_next_fn(tail, list);
return result;
}
} while (compare_fn(list, other) > 0);
set_next_fn(tail, list);
}
}
/*
* Perform an iterative mergesort using an array of sublists.
*
* n is the number of items.
* ranks[i] is undefined if n & 2^i == 0, and assumed empty.
* ranks[i] contains a sublist of length 2^i otherwise.
*
* The number of bits in a void pointer limits the number of objects
* that can be created, and thus the number of array elements necessary
* to be able to sort any valid list.
*
* Adding an item to this array is like incrementing a binary number;
* positional values for set bits correspond to sublist lengths.
*/
void *llist_mergesort(void *list,
void *(*get_next_fn)(const void *),
void (*set_next_fn)(void *, void *),
int (*compare_fn)(const void *, const void *))
{
unsigned long l;
void *ranks[bitsizeof(void *)];
size_t n = 0;
int i;
if (!list)
return NULL;
for (l = 1; ; l *= 2) {
void *curr;
struct mergesort_sublist p, q;
while (list) {
void *next = get_next_fn(list);
if (next)
set_next_fn(list, NULL);
for (i = 0; n & (1 << i); i++)
list = llist_merge(ranks[i], list, get_next_fn,
set_next_fn, compare_fn);
n++;
ranks[i] = list;
list = next;
}
p.ptr = list;
q.ptr = get_nth_next(p.ptr, l, get_next_fn);
if (!q.ptr)
break;
p.len = q.len = l;
if (compare_fn(p.ptr, q.ptr) > 0)
list = curr = pop_item(&q, get_next_fn);
for (i = 0; n; i++, n >>= 1) {
if (!(n & 1))
continue;
if (list)
list = llist_merge(ranks[i], list, get_next_fn,
set_next_fn, compare_fn);
else
list = curr = pop_item(&p, get_next_fn);
while (p.ptr) {
while (p.len || q.len) {
void *prev = curr;
if (!p.len)
curr = pop_item(&q, get_next_fn);
else if (!q.len)
curr = pop_item(&p, get_next_fn);
else if (compare_fn(p.ptr, q.ptr) > 0)
curr = pop_item(&q, get_next_fn);
else
curr = pop_item(&p, get_next_fn);
set_next_fn(prev, curr);
}
p.ptr = q.ptr;
p.len = l;
q.ptr = get_nth_next(p.ptr, l, get_next_fn);
q.len = q.ptr ? l : 0;
}
set_next_fn(curr, NULL);
list = ranks[i];
}
return list;
}

View File

@ -2,6 +2,12 @@
#include "cache.h"
#include "mergesort.h"
static uint32_t minstd_rand(uint32_t *state)
{
*state = (uint64_t)*state * 48271 % 2147483647;
return *state;
}
struct line {
char *text;
struct line *next;
@ -23,14 +29,12 @@ static int compare_strings(const void *a, const void *b)
return strcmp(x->text, y->text);
}
int cmd__mergesort(int argc, const char **argv)
static int sort_stdin(void)
{
struct line *line, *p = NULL, *lines = NULL;
struct strbuf sb = STRBUF_INIT;
for (;;) {
if (strbuf_getwholeline(&sb, stdin, '\n'))
break;
while (!strbuf_getline(&sb, stdin)) {
line = xmalloc(sizeof(struct line));
line->text = strbuf_detach(&sb, NULL);
if (p) {
@ -46,8 +50,362 @@ int cmd__mergesort(int argc, const char **argv)
lines = llist_mergesort(lines, get_next, set_next, compare_strings);
while (lines) {
printf("%s", lines->text);
puts(lines->text);
lines = lines->next;
}
return 0;
}
static void dist_sawtooth(int *arr, int n, int m)
{
int i;
for (i = 0; i < n; i++)
arr[i] = i % m;
}
static void dist_rand(int *arr, int n, int m)
{
int i;
uint32_t seed = 1;
for (i = 0; i < n; i++)
arr[i] = minstd_rand(&seed) % m;
}
static void dist_stagger(int *arr, int n, int m)
{
int i;
for (i = 0; i < n; i++)
arr[i] = (i * m + i) % n;
}
static void dist_plateau(int *arr, int n, int m)
{
int i;
for (i = 0; i < n; i++)
arr[i] = (i < m) ? i : m;
}
static void dist_shuffle(int *arr, int n, int m)
{
int i, j, k;
uint32_t seed = 1;
for (i = j = 0, k = 1; i < n; i++)
arr[i] = minstd_rand(&seed) % m ? (j += 2) : (k += 2);
}
#define DIST(name) { #name, dist_##name }
static struct dist {
const char *name;
void (*fn)(int *arr, int n, int m);
} dist[] = {
DIST(sawtooth),
DIST(rand),
DIST(stagger),
DIST(plateau),
DIST(shuffle),
};
static const struct dist *get_dist_by_name(const char *name)
{
int i;
for (i = 0; i < ARRAY_SIZE(dist); i++) {
if (!strcmp(dist[i].name, name))
return &dist[i];
}
return NULL;
}
static void mode_copy(int *arr, int n)
{
/* nothing */
}
static void mode_reverse(int *arr, int n)
{
int i, j;
for (i = 0, j = n - 1; i < j; i++, j--)
SWAP(arr[i], arr[j]);
}
static void mode_reverse_1st_half(int *arr, int n)
{
mode_reverse(arr, n / 2);
}
static void mode_reverse_2nd_half(int *arr, int n)
{
int half = n / 2;
mode_reverse(arr + half, n - half);
}
static int compare_ints(const void *av, const void *bv)
{
const int *ap = av, *bp = bv;
int a = *ap, b = *bp;
return (a > b) - (a < b);
}
static void mode_sort(int *arr, int n)
{
QSORT(arr, n, compare_ints);
}
static void mode_dither(int *arr, int n)
{
int i;
for (i = 0; i < n; i++)
arr[i] += i % 5;
}
static void unriffle(int *arr, int n, int *tmp)
{
int i, j;
COPY_ARRAY(tmp, arr, n);
for (i = j = 0; i < n; i += 2)
arr[j++] = tmp[i];
for (i = 1; i < n; i += 2)
arr[j++] = tmp[i];
}
static void unriffle_recursively(int *arr, int n, int *tmp)
{
if (n > 1) {
int half = n / 2;
unriffle(arr, n, tmp);
unriffle_recursively(arr, half, tmp);
unriffle_recursively(arr + half, n - half, tmp);
}
}
static void mode_unriffle(int *arr, int n)
{
int *tmp;
ALLOC_ARRAY(tmp, n);
unriffle_recursively(arr, n, tmp);
free(tmp);
}
static unsigned int prev_pow2(unsigned int n)
{
unsigned int pow2 = 1;
while (pow2 * 2 < n)
pow2 *= 2;
return pow2;
}
static void unriffle_recursively_skewed(int *arr, int n, int *tmp)
{
if (n > 1) {
int pow2 = prev_pow2(n);
int rest = n - pow2;
unriffle(arr + pow2 - rest, rest * 2, tmp);
unriffle_recursively_skewed(arr, pow2, tmp);
unriffle_recursively_skewed(arr + pow2, rest, tmp);
}
}
static void mode_unriffle_skewed(int *arr, int n)
{
int *tmp;
ALLOC_ARRAY(tmp, n);
unriffle_recursively_skewed(arr, n, tmp);
free(tmp);
}
#define MODE(name) { #name, mode_##name }
static struct mode {
const char *name;
void (*fn)(int *arr, int n);
} mode[] = {
MODE(copy),
MODE(reverse),
MODE(reverse_1st_half),
MODE(reverse_2nd_half),
MODE(sort),
MODE(dither),
MODE(unriffle),
MODE(unriffle_skewed),
};
static const struct mode *get_mode_by_name(const char *name)
{
int i;
for (i = 0; i < ARRAY_SIZE(mode); i++) {
if (!strcmp(mode[i].name, name))
return &mode[i];
}
return NULL;
}
static int generate(int argc, const char **argv)
{
const struct dist *dist = NULL;
const struct mode *mode = NULL;
int i, n, m, *arr;
if (argc != 4)
return 1;
dist = get_dist_by_name(argv[0]);
mode = get_mode_by_name(argv[1]);
n = strtol(argv[2], NULL, 10);
m = strtol(argv[3], NULL, 10);
if (!dist || !mode)
return 1;
ALLOC_ARRAY(arr, n);
dist->fn(arr, n, m);
mode->fn(arr, n);
for (i = 0; i < n; i++)
printf("%08x\n", arr[i]);
free(arr);
return 0;
}
static struct stats {
int get_next, set_next, compare;
} stats;
struct number {
int value, rank;
struct number *next;
};
static void *get_next_number(const void *a)
{
stats.get_next++;
return ((const struct number *)a)->next;
}
static void set_next_number(void *a, void *b)
{
stats.set_next++;
((struct number *)a)->next = b;
}
static int compare_numbers(const void *av, const void *bv)
{
const struct number *an = av, *bn = bv;
int a = an->value, b = bn->value;
stats.compare++;
return (a > b) - (a < b);
}
static void clear_numbers(struct number *list)
{
while (list) {
struct number *next = list->next;
free(list);
list = next;
}
}
static int test(const struct dist *dist, const struct mode *mode, int n, int m)
{
int *arr;
size_t i;
struct number *curr, *list, **tail;
int is_sorted = 1;
int is_stable = 1;
const char *verdict;
int result = -1;
ALLOC_ARRAY(arr, n);
dist->fn(arr, n, m);
mode->fn(arr, n);
for (i = 0, tail = &list; i < n; i++) {
curr = xmalloc(sizeof(*curr));
curr->value = arr[i];
curr->rank = i;
*tail = curr;
tail = &curr->next;
}
*tail = NULL;
stats.get_next = stats.set_next = stats.compare = 0;
list = llist_mergesort(list, get_next_number, set_next_number,
compare_numbers);
QSORT(arr, n, compare_ints);
for (i = 0, curr = list; i < n && curr; i++, curr = curr->next) {
if (arr[i] != curr->value)
is_sorted = 0;
if (curr->next && curr->value == curr->next->value &&
curr->rank >= curr->next->rank)
is_stable = 0;
}
if (i < n) {
verdict = "too short";
} else if (curr) {
verdict = "too long";
} else if (!is_sorted) {
verdict = "not sorted";
} else if (!is_stable) {
verdict = "unstable";
} else {
verdict = "OK";
result = 0;
}
printf("%-9s %-16s %8d %8d %8d %8d %8d %s\n",
dist->name, mode->name, n, m, stats.get_next, stats.set_next,
stats.compare, verdict);
clear_numbers(list);
free(arr);
return result;
}
/*
* A version of the qsort certification program from "Engineering a Sort
* Function" by Bentley and McIlroy, Software—Practice and Experience,
* Volume 23, Issue 11, 12491265 (November 1993).
*/
static int run_tests(int argc, const char **argv)
{
const char *argv_default[] = { "100", "1023", "1024", "1025" };
if (!argc)
return run_tests(ARRAY_SIZE(argv_default), argv_default);
printf("%-9s %-16s %8s %8s %8s %8s %8s %s\n",
"distribut", "mode", "n", "m", "get_next", "set_next",
"compare", "verdict");
while (argc--) {
int i, j, m, n = strtol(*argv++, NULL, 10);
for (i = 0; i < ARRAY_SIZE(dist); i++) {
for (j = 0; j < ARRAY_SIZE(mode); j++) {
for (m = 1; m < 2 * n; m *= 2) {
if (test(&dist[i], &mode[j], n, m))
return 1;
}
}
}
}
return 0;
}
int cmd__mergesort(int argc, const char **argv)
{
int i;
const char *sep;
if (argc == 6 && !strcmp(argv[1], "generate"))
return generate(argc - 2, argv + 2);
if (argc == 2 && !strcmp(argv[1], "sort"))
return sort_stdin();
if (argc > 1 && !strcmp(argv[1], "test"))
return run_tests(argc - 2, argv + 2);
fprintf(stderr, "usage: test-tool mergesort generate <distribution> <mode> <n> <m>\n");
fprintf(stderr, " or: test-tool mergesort sort\n");
fprintf(stderr, " or: test-tool mergesort test [<n>...]\n");
fprintf(stderr, "\n");
for (i = 0, sep = "distributions: "; i < ARRAY_SIZE(dist); i++, sep = ", ")
fprintf(stderr, "%s%s", sep, dist[i].name);
fprintf(stderr, "\n");
for (i = 0, sep = "modes: "; i < ARRAY_SIZE(mode); i++, sep = ", ")
fprintf(stderr, "%s%s", sep, mode[i].name);
fprintf(stderr, "\n");
return 129;
}

View File

@ -11,16 +11,42 @@ test_expect_success 'setup' '
git cat-file --batch >unsorted
'
test_perf 'sort(1)' '
sort <unsorted >expect
test_perf 'sort(1) unsorted' '
sort <unsorted >sorted
'
test_perf 'string_list_sort()' '
test-tool string-list sort <unsorted >actual
test_expect_success 'reverse' '
sort -r <unsorted >reversed
'
test_expect_success 'string_list_sort() sorts like sort(1)' '
test_cmp_bin expect actual
'
for file in sorted reversed
do
test_perf "sort(1) $file" "
sort <$file >actual
"
done
for file in unsorted sorted reversed
do
test_perf "string_list_sort() $file" "
test-tool string-list sort <$file >actual
"
test_expect_success "string_list_sort() $file sorts like sort(1)" "
test_cmp_bin sorted actual
"
done
for file in unsorted sorted reversed
do
test_perf "llist_mergesort() $file" "
test-tool mergesort sort <$file >actual
"
test_expect_success "llist_mergesort() $file sorts like sort(1)" "
test_cmp_bin sorted actual
"
done
test_done

11
t/t0071-sort.sh Executable file
View File

@ -0,0 +1,11 @@
#!/bin/sh
test_description='verify sort functions'
. ./test-lib.sh
test_expect_success 'llist_mergesort()' '
test-tool mergesort test
'
test_done