hashmap: add string interning API
Interning short strings with high probability of duplicates can reduce the memory footprint and speed up comparisons. Add strintern() and memintern() APIs that use a hashmap to manage the pool of unique, interned strings. Note: strintern(getenv()) could be used to sanitize git's use of getenv(), in case we ever encounter a platform where a call to getenv() invalidates previous getenv() results (which is allowed by POSIX). Signed-off-by: Karsten Blees <blees@dcon.de> Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
parent
ab73a9d119
commit
7b64d42d22
@ -193,6 +193,21 @@ more entries.
|
||||
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
|
||||
and returns the first entry, if any).
|
||||
|
||||
`const char *strintern(const char *string)`::
|
||||
`const void *memintern(const void *data, size_t len)`::
|
||||
|
||||
Returns the unique, interned version of the specified string or data,
|
||||
similar to the `String.intern` API in Java and .NET, respectively.
|
||||
Interned strings remain valid for the entire lifetime of the process.
|
||||
+
|
||||
Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
|
||||
strings / data must not be modified or freed.
|
||||
+
|
||||
Interned strings are best used for short strings with high probability of
|
||||
duplicates.
|
||||
+
|
||||
Uses a hashmap to store the pool of interned strings.
|
||||
|
||||
Usage example
|
||||
-------------
|
||||
|
||||
|
38
hashmap.c
38
hashmap.c
@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
|
||||
current = iter->map->table[iter->tablepos++];
|
||||
}
|
||||
}
|
||||
|
||||
struct pool_entry {
|
||||
struct hashmap_entry ent;
|
||||
size_t len;
|
||||
unsigned char data[FLEX_ARRAY];
|
||||
};
|
||||
|
||||
static int pool_entry_cmp(const struct pool_entry *e1,
|
||||
const struct pool_entry *e2,
|
||||
const unsigned char *keydata)
|
||||
{
|
||||
return e1->data != keydata &&
|
||||
(e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
|
||||
}
|
||||
|
||||
const void *memintern(const void *data, size_t len)
|
||||
{
|
||||
static struct hashmap map;
|
||||
struct pool_entry key, *e;
|
||||
|
||||
/* initialize string pool hashmap */
|
||||
if (!map.tablesize)
|
||||
hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
|
||||
|
||||
/* lookup interned string in pool */
|
||||
hashmap_entry_init(&key, memhash(data, len));
|
||||
key.len = len;
|
||||
e = hashmap_get(&map, &key, data);
|
||||
if (!e) {
|
||||
/* not found: create it */
|
||||
e = xmallocz(sizeof(struct pool_entry) + len);
|
||||
hashmap_entry_init(e, key.ent.hash);
|
||||
e->len = len;
|
||||
memcpy(e->data, data, len);
|
||||
hashmap_add(&map, e);
|
||||
}
|
||||
return e->data;
|
||||
}
|
||||
|
@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
|
||||
return hashmap_iter_next(iter);
|
||||
}
|
||||
|
||||
/* string interning */
|
||||
|
||||
extern const void *memintern(const void *data, size_t len);
|
||||
static inline const char *strintern(const char *string)
|
||||
{
|
||||
return memintern(string, strlen(string));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
|
||||
|
||||
'
|
||||
|
||||
test_expect_success 'string interning' '
|
||||
|
||||
test_hashmap "intern value1
|
||||
intern Value1
|
||||
intern value2
|
||||
intern value2
|
||||
" "value1
|
||||
Value1
|
||||
value2
|
||||
value2"
|
||||
|
||||
'
|
||||
|
||||
test_done
|
||||
|
@ -234,6 +234,20 @@ int main(int argc, char *argv[])
|
||||
/* print table sizes */
|
||||
printf("%u %u\n", map.tablesize, map.size);
|
||||
|
||||
} else if (!strcmp("intern", cmd) && l1) {
|
||||
|
||||
/* test that strintern works */
|
||||
const char *i1 = strintern(p1);
|
||||
const char *i2 = strintern(p1);
|
||||
if (strcmp(i1, p1))
|
||||
printf("strintern(%s) returns %s\n", p1, i1);
|
||||
else if (i1 == p1)
|
||||
printf("strintern(%s) returns input pointer\n", p1);
|
||||
else if (i1 != i2)
|
||||
printf("strintern(%s) != strintern(%s)", i1, i2);
|
||||
else
|
||||
printf("%s\n", i1);
|
||||
|
||||
} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
|
||||
|
||||
perf_hashmap(atoi(p1), atoi(p2));
|
||||
|
Loading…
Reference in New Issue
Block a user