From 6a364ced497e407ab3ffb2554d4ef2c78f801832 Mon Sep 17 00:00:00 2001 From: Karsten Blees Date: Thu, 14 Nov 2013 20:17:54 +0100 Subject: add a hashtable implementation that supports O(1) removal The existing hashtable implementation (in hash.[ch]) uses open addressing (i.e. resolve hash collisions by distributing entries across the table). Thus, removal is difficult to implement with less than O(n) complexity. Resolving collisions of entries with identical hashes (e.g. via chaining) is left to the client code. Add a hashtable implementation that supports O(1) removal and is slightly easier to use due to builtin entry chaining. Supports all basic operations init, free, get, add, remove and iteration. Also includes ready-to-use hash functions based on the public domain FNV-1 algorithm (http://www.isthe.com/chongo/tech/comp/fnv). The per-entry data structure (hashmap_entry) is piggybacked in front of the client's data structure to save memory. See test-hashmap.c for usage examples. The hashtable is resized by a factor of four when 80% full. With these settings, average memory consumption is about 2/3 of hash.[ch], and insertion is about twice as fast due to less frequent resizing. Lookups are also slightly faster, because entries are strictly confined to their bucket (i.e. no data of other buckets needs to be traversed). Signed-off-by: Karsten Blees Signed-off-by: Junio C Hamano --- test-hashmap.c | 340 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 test-hashmap.c (limited to 'test-hashmap.c') diff --git a/test-hashmap.c b/test-hashmap.c new file mode 100644 index 0000000000..581d2964e4 --- /dev/null +++ b/test-hashmap.c @@ -0,0 +1,340 @@ +#include "cache.h" +#include "hashmap.h" +#include + +struct test_entry +{ + struct hashmap_entry ent; + /* key and value as two \0-terminated strings */ + char key[FLEX_ARRAY]; +}; + +static const char *get_value(const struct test_entry *e) +{ + return e->key + strlen(e->key) + 1; +} + +static int test_entry_cmp(const struct test_entry *e1, + const struct test_entry *e2, const char* key) +{ + return strcmp(e1->key, key ? key : e2->key); +} + +static int test_entry_cmp_icase(const struct test_entry *e1, + const struct test_entry *e2, const char* key) +{ + return strcasecmp(e1->key, key ? key : e2->key); +} + +static struct test_entry *alloc_test_entry(int hash, char *key, int klen, + char *value, int vlen) +{ + struct test_entry *entry = malloc(sizeof(struct test_entry) + klen + + vlen + 2); + hashmap_entry_init(entry, hash); + memcpy(entry->key, key, klen + 1); + memcpy(entry->key + klen + 1, value, vlen + 1); + return entry; +} + +#define HASH_METHOD_FNV 0 +#define HASH_METHOD_I 1 +#define HASH_METHOD_IDIV10 2 +#define HASH_METHOD_0 3 +#define HASH_METHOD_X2 4 +#define TEST_SPARSE 8 +#define TEST_ADD 16 +#define TEST_SIZE 100000 + +static unsigned int hash(unsigned int method, unsigned int i, const char *key) +{ + unsigned int hash; + switch (method & 3) + { + case HASH_METHOD_FNV: + hash = strhash(key); + break; + case HASH_METHOD_I: + hash = i; + break; + case HASH_METHOD_IDIV10: + hash = i / 10; + break; + case HASH_METHOD_0: + hash = 0; + break; + } + + if (method & HASH_METHOD_X2) + hash = 2 * hash; + return hash; +} + +/* + * Test performance of hashmap.[ch] + * Usage: time echo "perfhashmap method rounds" | test-hashmap + */ +static void perf_hashmap(unsigned int method, unsigned int rounds) +{ + struct hashmap map; + char buf[16]; + struct test_entry **entries; + unsigned int *hashes; + unsigned int i, j; + + entries = malloc(TEST_SIZE * sizeof(struct test_entry *)); + hashes = malloc(TEST_SIZE * sizeof(int)); + for (i = 0; i < TEST_SIZE; i++) { + snprintf(buf, sizeof(buf), "%i", i); + entries[i] = alloc_test_entry(0, buf, strlen(buf), "", 0); + hashes[i] = hash(method, i, entries[i]->key); + } + + if (method & TEST_ADD) { + /* test adding to the map */ + for (j = 0; j < rounds; j++) { + hashmap_init(&map, (hashmap_cmp_fn) test_entry_cmp, 0); + + /* add entries */ + for (i = 0; i < TEST_SIZE; i++) { + hashmap_entry_init(entries[i], hashes[i]); + hashmap_add(&map, entries[i]); + } + + hashmap_free(&map, 0); + } + } else { + /* test map lookups */ + hashmap_init(&map, (hashmap_cmp_fn) test_entry_cmp, 0); + + /* fill the map (sparsely if specified) */ + j = (method & TEST_SPARSE) ? TEST_SIZE / 10 : TEST_SIZE; + for (i = 0; i < j; i++) { + hashmap_entry_init(entries[i], hashes[i]); + hashmap_add(&map, entries[i]); + } + + for (j = 0; j < rounds; j++) { + for (i = 0; i < TEST_SIZE; i++) { + struct hashmap_entry key; + hashmap_entry_init(&key, hashes[i]); + hashmap_get(&map, &key, entries[i]->key); + } + } + + hashmap_free(&map, 0); + } +} + +struct hash_entry +{ + struct hash_entry *next; + char key[FLEX_ARRAY]; +}; + +/* + * Test performance of hash.[ch] + * Usage: time echo "perfhash method rounds" | test-hashmap + */ +static void perf_hash(unsigned int method, unsigned int rounds) +{ + struct hash_table map; + char buf[16]; + struct hash_entry **entries, **res, *entry; + unsigned int *hashes; + unsigned int i, j; + + entries = malloc(TEST_SIZE * sizeof(struct hash_entry *)); + hashes = malloc(TEST_SIZE * sizeof(int)); + for (i = 0; i < TEST_SIZE; i++) { + snprintf(buf, sizeof(buf), "%i", i); + entries[i] = malloc(sizeof(struct hash_entry) + strlen(buf) + 1); + strcpy(entries[i]->key, buf); + hashes[i] = hash(method, i, entries[i]->key); + } + + if (method & TEST_ADD) { + /* test adding to the map */ + for (j = 0; j < rounds; j++) { + init_hash(&map); + + /* add entries */ + for (i = 0; i < TEST_SIZE; i++) { + res = (struct hash_entry **) insert_hash( + hashes[i], entries[i], &map); + if (res) { + entries[i]->next = *res; + *res = entries[i]; + } else { + entries[i]->next = NULL; + } + } + + free_hash(&map); + } + } else { + /* test map lookups */ + init_hash(&map); + + /* fill the map (sparsely if specified) */ + j = (method & TEST_SPARSE) ? TEST_SIZE / 10 : TEST_SIZE; + for (i = 0; i < j; i++) { + res = (struct hash_entry **) insert_hash(hashes[i], + entries[i], &map); + if (res) { + entries[i]->next = *res; + *res = entries[i]; + } else { + entries[i]->next = NULL; + } + } + + for (j = 0; j < rounds; j++) { + for (i = 0; i < TEST_SIZE; i++) { + entry = lookup_hash(hashes[i], &map); + while (entry) { + if (!strcmp(entries[i]->key, entry->key)) + break; + entry = entry->next; + } + } + } + + free_hash(&map); + + } +} + +#define DELIM " \t\r\n" + +/* + * Read stdin line by line and print result of commands to stdout: + * + * hash key -> strhash(key) memhash(key) strihash(key) memihash(key) + * put key value -> NULL / old value + * get key -> NULL / value + * remove key -> NULL / old value + * iterate -> key1 value1\nkey2 value2\n... + * size -> tablesize numentries + * + * perfhashmap method rounds -> test hashmap.[ch] performance + * perfhash method rounds -> test hash.[ch] performance + */ +int main(int argc, char *argv[]) +{ + char line[1024]; + struct hashmap map; + int icase; + + /* init hash map */ + icase = argc > 1 && !strcmp("ignorecase", argv[1]); + hashmap_init(&map, (hashmap_cmp_fn) (icase ? test_entry_cmp_icase + : test_entry_cmp), 0); + + /* process commands from stdin */ + while (fgets(line, sizeof(line), stdin)) { + char *cmd, *p1 = NULL, *p2 = NULL; + int l1 = 0, l2 = 0, hash = 0; + struct test_entry *entry; + + /* break line into command and up to two parameters */ + cmd = strtok(line, DELIM); + /* ignore empty lines */ + if (!cmd || *cmd == '#') + continue; + + p1 = strtok(NULL, DELIM); + if (p1) { + l1 = strlen(p1); + hash = icase ? strihash(p1) : strhash(p1); + p2 = strtok(NULL, DELIM); + if (p2) + l2 = strlen(p2); + } + + if (!strcmp("hash", cmd) && l1) { + + /* print results of different hash functions */ + printf("%u %u %u %u\n", strhash(p1), memhash(p1, l1), + strihash(p1), memihash(p1, l1)); + + } else if (!strcmp("add", cmd) && l1 && l2) { + + /* create entry with key = p1, value = p2 */ + entry = alloc_test_entry(hash, p1, l1, p2, l2); + + /* add to hashmap */ + hashmap_add(&map, entry); + + } else if (!strcmp("put", cmd) && l1 && l2) { + + /* create entry with key = p1, value = p2 */ + entry = alloc_test_entry(hash, p1, l1, p2, l2); + + /* add / replace entry */ + entry = hashmap_put(&map, entry); + + /* print and free replaced entry, if any */ + puts(entry ? get_value(entry) : "NULL"); + free(entry); + + } else if (!strcmp("get", cmd) && l1) { + + /* setup static key */ + struct hashmap_entry key; + hashmap_entry_init(&key, hash); + + /* lookup entry in hashmap */ + entry = hashmap_get(&map, &key, p1); + + /* print result */ + if (!entry) + puts("NULL"); + while (entry) { + puts(get_value(entry)); + entry = hashmap_get_next(&map, entry); + } + + } else if (!strcmp("remove", cmd) && l1) { + + /* setup static key */ + struct hashmap_entry key; + hashmap_entry_init(&key, hash); + + /* remove entry from hashmap */ + entry = hashmap_remove(&map, &key, p1); + + /* print result and free entry*/ + puts(entry ? get_value(entry) : "NULL"); + free(entry); + + } else if (!strcmp("iterate", cmd)) { + + struct hashmap_iter iter; + hashmap_iter_init(&map, &iter); + while ((entry = hashmap_iter_next(&iter))) + printf("%s %s\n", entry->key, get_value(entry)); + + } else if (!strcmp("size", cmd)) { + + /* print table sizes */ + printf("%u %u\n", map.tablesize, map.size); + + } else if (!strcmp("perfhashmap", cmd) && l1 && l2) { + + perf_hashmap(atoi(p1), atoi(p2)); + + } else if (!strcmp("perfhash", cmd) && l1 && l2) { + + perf_hash(atoi(p1), atoi(p2)); + + } else { + + printf("Unknown command %s\n", cmd); + + } + } + + hashmap_free(&map, 1); + return 0; +} -- cgit v1.3 From efc684245b81ae0fb8f0afbd06dc1c3101c4e5a0 Mon Sep 17 00:00:00 2001 From: Karsten Blees Date: Thu, 14 Nov 2013 20:23:12 +0100 Subject: remove old hash.[ch] implementation Signed-off-by: Karsten Blees Signed-off-by: Junio C Hamano --- Documentation/technical/api-hash.txt | 52 ----------------- Makefile | 2 - cache.h | 1 - hash.c | 110 ----------------------------------- hash.h | 50 ---------------- test-hashmap.c | 84 -------------------------- 6 files changed, 299 deletions(-) delete mode 100644 Documentation/technical/api-hash.txt delete mode 100644 hash.c delete mode 100644 hash.h (limited to 'test-hashmap.c') diff --git a/Documentation/technical/api-hash.txt b/Documentation/technical/api-hash.txt deleted file mode 100644 index e5061e0677..0000000000 --- a/Documentation/technical/api-hash.txt +++ /dev/null @@ -1,52 +0,0 @@ -hash API -======== - -The hash API is a collection of simple hash table functions. Users are expected -to implement their own hashing. - -Data Structures ---------------- - -`struct hash_table`:: - - The hash table structure. The `array` member points to the hash table - entries. The `size` member counts the total number of valid and invalid - entries in the table. The `nr` member keeps track of the number of - valid entries. - -`struct hash_table_entry`:: - - An opaque structure representing an entry in the hash table. The `hash` - member is the entry's hash key and the `ptr` member is the entry's - value. - -Functions ---------- - -`init_hash`:: - - Initialize the hash table. - -`free_hash`:: - - Release memory associated with the hash table. - -`insert_hash`:: - - Insert a pointer into the hash table. If an entry with that hash - already exists, a pointer to the existing entry's value is returned. - Otherwise NULL is returned. This allows callers to implement - chaining, etc. - -`lookup_hash`:: - - Lookup an entry in the hash table. If an entry with that hash exists - the entry's value is returned. Otherwise NULL is returned. - -`for_each_hash`:: - - Call a function for each entry in the hash table. The function is - expected to take the entry's value as its only argument and return an - int. If the function returns a negative int the loop is aborted - immediately. Otherwise, the return value is accumulated and the sum - returned upon completion of the loop. diff --git a/Makefile b/Makefile index d8d3d6705b..f495dd4c13 100644 --- a/Makefile +++ b/Makefile @@ -677,7 +677,6 @@ LIB_H += git-compat-util.h LIB_H += gpg-interface.h LIB_H += graph.h LIB_H += grep.h -LIB_H += hash.h LIB_H += hashmap.h LIB_H += help.h LIB_H += http.h @@ -809,7 +808,6 @@ LIB_OBJS += gettext.o LIB_OBJS += gpg-interface.o LIB_OBJS += graph.o LIB_OBJS += grep.o -LIB_OBJS += hash.o LIB_OBJS += hashmap.o LIB_OBJS += help.o LIB_OBJS += hex.o diff --git a/cache.h b/cache.h index 1f11e24cd0..407145c364 100644 --- a/cache.h +++ b/cache.h @@ -3,7 +3,6 @@ #include "git-compat-util.h" #include "strbuf.h" -#include "hash.h" #include "hashmap.h" #include "advice.h" #include "gettext.h" diff --git a/hash.c b/hash.c deleted file mode 100644 index 749ecfe484..0000000000 --- a/hash.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Some generic hashing helpers. - */ -#include "cache.h" -#include "hash.h" - -/* - * Look up a hash entry in the hash table. Return the pointer to - * the existing entry, or the empty slot if none existed. The caller - * can then look at the (*ptr) to see whether it existed or not. - */ -static struct hash_table_entry *lookup_hash_entry(unsigned int hash, const struct hash_table *table) -{ - unsigned int size = table->size, nr = hash % size; - struct hash_table_entry *array = table->array; - - while (array[nr].ptr) { - if (array[nr].hash == hash) - break; - nr++; - if (nr >= size) - nr = 0; - } - return array + nr; -} - - -/* - * Insert a new hash entry pointer into the table. - * - * If that hash entry already existed, return the pointer to - * the existing entry (and the caller can create a list of the - * pointers or do anything else). If it didn't exist, return - * NULL (and the caller knows the pointer has been inserted). - */ -static void **insert_hash_entry(unsigned int hash, void *ptr, struct hash_table *table) -{ - struct hash_table_entry *entry = lookup_hash_entry(hash, table); - - if (!entry->ptr) { - entry->ptr = ptr; - entry->hash = hash; - table->nr++; - return NULL; - } - return &entry->ptr; -} - -static void grow_hash_table(struct hash_table *table) -{ - unsigned int i; - unsigned int old_size = table->size, new_size; - struct hash_table_entry *old_array = table->array, *new_array; - - new_size = alloc_nr(old_size); - new_array = xcalloc(sizeof(struct hash_table_entry), new_size); - table->size = new_size; - table->array = new_array; - table->nr = 0; - for (i = 0; i < old_size; i++) { - unsigned int hash = old_array[i].hash; - void *ptr = old_array[i].ptr; - if (ptr) - insert_hash_entry(hash, ptr, table); - } - free(old_array); -} - -void *lookup_hash(unsigned int hash, const struct hash_table *table) -{ - if (!table->array) - return NULL; - return lookup_hash_entry(hash, table)->ptr; -} - -void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table) -{ - unsigned int nr = table->nr; - if (nr >= table->size/2) - grow_hash_table(table); - return insert_hash_entry(hash, ptr, table); -} - -int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data) -{ - int sum = 0; - unsigned int i; - unsigned int size = table->size; - struct hash_table_entry *array = table->array; - - for (i = 0; i < size; i++) { - void *ptr = array->ptr; - array++; - if (ptr) { - int val = fn(ptr, data); - if (val < 0) - return val; - sum += val; - } - } - return sum; -} - -void free_hash(struct hash_table *table) -{ - free(table->array); - table->array = NULL; - table->size = 0; - table->nr = 0; -} diff --git a/hash.h b/hash.h deleted file mode 100644 index 1d43ac0ba0..0000000000 --- a/hash.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef HASH_H -#define HASH_H - -/* - * These are some simple generic hash table helper functions. - * Not necessarily suitable for all users, but good for things - * where you want to just keep track of a list of things, and - * have a good hash to use on them. - * - * It keeps the hash table at roughly 50-75% free, so the memory - * cost of the hash table itself is roughly - * - * 3 * 2*sizeof(void *) * nr_of_objects - * - * bytes. - * - * FIXME: on 64-bit architectures, we waste memory. It would be - * good to have just 32-bit pointers, requiring a special allocator - * for hashed entries or something. - */ -struct hash_table_entry { - unsigned int hash; - void *ptr; -}; - -struct hash_table { - unsigned int size, nr; - struct hash_table_entry *array; -}; - -extern void *lookup_hash(unsigned int hash, const struct hash_table *table); -extern void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table); -extern int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data); -extern void free_hash(struct hash_table *table); - -static inline void init_hash(struct hash_table *table) -{ - table->size = 0; - table->nr = 0; - table->array = NULL; -} - -static inline void preallocate_hash(struct hash_table *table, unsigned int elts) -{ - assert(table->size == 0 && table->nr == 0 && table->array == NULL); - table->size = elts * 2; - table->array = xcalloc(sizeof(struct hash_table_entry), table->size); -} - -#endif diff --git a/test-hashmap.c b/test-hashmap.c index 581d2964e4..7e86f886d8 100644 --- a/test-hashmap.c +++ b/test-hashmap.c @@ -126,85 +126,6 @@ static void perf_hashmap(unsigned int method, unsigned int rounds) } } -struct hash_entry -{ - struct hash_entry *next; - char key[FLEX_ARRAY]; -}; - -/* - * Test performance of hash.[ch] - * Usage: time echo "perfhash method rounds" | test-hashmap - */ -static void perf_hash(unsigned int method, unsigned int rounds) -{ - struct hash_table map; - char buf[16]; - struct hash_entry **entries, **res, *entry; - unsigned int *hashes; - unsigned int i, j; - - entries = malloc(TEST_SIZE * sizeof(struct hash_entry *)); - hashes = malloc(TEST_SIZE * sizeof(int)); - for (i = 0; i < TEST_SIZE; i++) { - snprintf(buf, sizeof(buf), "%i", i); - entries[i] = malloc(sizeof(struct hash_entry) + strlen(buf) + 1); - strcpy(entries[i]->key, buf); - hashes[i] = hash(method, i, entries[i]->key); - } - - if (method & TEST_ADD) { - /* test adding to the map */ - for (j = 0; j < rounds; j++) { - init_hash(&map); - - /* add entries */ - for (i = 0; i < TEST_SIZE; i++) { - res = (struct hash_entry **) insert_hash( - hashes[i], entries[i], &map); - if (res) { - entries[i]->next = *res; - *res = entries[i]; - } else { - entries[i]->next = NULL; - } - } - - free_hash(&map); - } - } else { - /* test map lookups */ - init_hash(&map); - - /* fill the map (sparsely if specified) */ - j = (method & TEST_SPARSE) ? TEST_SIZE / 10 : TEST_SIZE; - for (i = 0; i < j; i++) { - res = (struct hash_entry **) insert_hash(hashes[i], - entries[i], &map); - if (res) { - entries[i]->next = *res; - *res = entries[i]; - } else { - entries[i]->next = NULL; - } - } - - for (j = 0; j < rounds; j++) { - for (i = 0; i < TEST_SIZE; i++) { - entry = lookup_hash(hashes[i], &map); - while (entry) { - if (!strcmp(entries[i]->key, entry->key)) - break; - entry = entry->next; - } - } - } - - free_hash(&map); - - } -} - #define DELIM " \t\r\n" /* @@ -218,7 +139,6 @@ static void perf_hash(unsigned int method, unsigned int rounds) * size -> tablesize numentries * * perfhashmap method rounds -> test hashmap.[ch] performance - * perfhash method rounds -> test hash.[ch] performance */ int main(int argc, char *argv[]) { @@ -324,10 +244,6 @@ int main(int argc, char *argv[]) perf_hashmap(atoi(p1), atoi(p2)); - } else if (!strcmp("perfhash", cmd) && l1 && l2) { - - perf_hash(atoi(p1), atoi(p2)); - } else { printf("Unknown command %s\n", cmd); -- cgit v1.3 From e8fa59b9081d90299aa5b41412bf93e8856a612b Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Fri, 13 Dec 2013 18:06:40 -0800 Subject: test-hashmap.c: drop unnecessary #includes Per Documentation/CodingGuidelines most C files in git start with a #include of git-compat-util.h or another header file that includes it, such as cache.h or builtin.h. This file doesn't need anything beyond "git-compat-util.h", so use that. Remove a #include of the system header since it is already included by "git-compat-util.h". Signed-off-by: Jonathan Nieder Signed-off-by: Junio C Hamano --- test-hashmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'test-hashmap.c') diff --git a/test-hashmap.c b/test-hashmap.c index 7e86f886d8..f5183fb9e8 100644 --- a/test-hashmap.c +++ b/test-hashmap.c @@ -1,6 +1,5 @@ -#include "cache.h" +#include "git-compat-util.h" #include "hashmap.h" -#include struct test_entry { -- cgit v1.3