Klib是一个独立的轻量级c通用库,里面大多数组件除了C标准库外不包含外部库,想用对应组件直接拷贝对应文件即可使用。
该库致力于高效和较小的内存占用,其中部分组件(如khash、kbtree、ksort、kvec),无论是内存还是速度方面,都是所有编程语言中相似算法或数据结构最高效的实现之一。
源代码在这里
核心代码相对较少但定义的API和宏挺多
#include "khash.h"
KHASH_MAP_INIT_INT(32, char) // 定义key是int的宏,设定value为char类型
int main() {
int ret, is_missing;
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
k = kh_get(32, h, 5);
kh_del(32, h, k);
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) kh_value(h, k) = 1;
kh_destroy(32, h);
return 0;
}
类似c++的set或map
众所周知,这类结构最重要的就是hash函数以及冲突解决机制,所以本文着重解读这两方面
可喜可贺的是,源代码中大部分API宏都有注释了,所以本文就只列举不赘述了
#define __KHASH_TYPE(name, khkey_t, khval_t) \
typedef struct kh_##name##_s { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t;
#define khash_t(name) kh_##name##_t
n_buckets
:桶个数,或者说容量(必定是2的整数次方)size
:元素个数n_occupied
:占用桶个数upper_bound
:最多占用桶个数上限flags
:key的标记keys
:key数组(大小为n_buckets
)vals
:value数组(大小与keys
相同,初始化为set时不使用该结构)一个标记有32bit
,一个hash值用2bit
,所以一个标记能记录16个hash的相关信息,所以flags个数为容量/16
这2bit
,低位记录是否被删除(有元素但被删除),高位记录是否为空(没元素)
下面的API中输入的i
为key的hash值经过mask的结果,mask = n_buckets - 1
,还记得n_buckets = 2^n
吧,所以这里的mask
一定是高位为0低n位为1
i
的低n位也分为两部分:前面记录其所属flag的偏移位置,最后四位记录在flag中的偏移位置
// get
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
// set
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
x
为插入位置,如果对应位置为空__ac_isempty
则可直接设置x
位置插入元素,并返回x
ret
:额外返回码:-1
表示更新table出错了,1
表示x
对应位置为空,2
表示该位置为被删除的元素,0
表示key已经存在且未被删除
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
... \
SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
/* clear "deleted" elements or expand the hash table */ \
... \
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
{ \
khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
i = (i + (++step)) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
else x = i; \
} \
} \
} \
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
... \
*ret = 1; \
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
... \
*ret = 2; \
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \
} ...
并未真正删除元素,只是标记一下flag
#define kh_del(name, h, k) kh_del_##name(h, k)
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
... \
SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
--h->size; \
} \
}
搜索过程:
key
计算hash并mask一下得到起始搜索位置i
不为空,则递增式地往后搜索i = (i + (++step)) & mask
,直到满足下面的条件之一
__ac_isempty(h->flags, i)
__ac_isdel(h->flags, i)
且相同的key__hash_equal(h->keys[i], key)
i == last
(不一定遍历所有的位置)i
,否则返回n_buckets
#define kh_get(name, h, k) kh_get_##name(h, k)
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
... \
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khint_t k, i, last, mask, step = 0; \
mask = h->n_buckets - 1; \
k = __hash_func(key); i = k & mask; \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
i = (i + (++step)) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} ...
更新过程:
keys
进行realloc
(map的话还有vals
)new_n_buckets
设置得太小则resize不会成功且返回0(第12行),否则都进行rehashkey
的插入位置
key
keys
进行realloc
(map还有vals
)flags
、n_buckets
、n_occupied
、upper_bound
等正常返回0
,出错时返回-1
#define kh_resize(name, h, s) kh_resize_##name(h, s)
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
... \
SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
kroundup32(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
... /* init new_flags */ \
if (h->n_buckets < new_n_buckets) { /* expand */ \
... \
} /* otherwise shrink */ \
} \
} \
if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
khint_t new_mask; \
new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
... \
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
... \
__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
} else { /* write the element and jump out of the loop */ \
... \
break; \
} \
} \
} \
} \
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
... \
} \
kfree(h->flags); /* free the working space */ \
... \
} \
return 0; \
}
#define kh_init(name) kh_init_##name()
#define kh_destroy(name, h) kh_destroy_##name(h)
#define kh_clear(name, h) kh_clear_##name(h)
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
#define kh_key(h, x) ((h)->keys[x])
#define kh_val(h, x) ((h)->vals[x])
#define kh_value(h, x) ((h)->vals[x]) // alias of kh_val
#define kh_begin(h) (khint_t)(0)
#define kh_end(h) ((h)->n_buckets)
#define kh_n_buckets(h) ((h)->n_buckets)
#define kh_size(h) ((h)->size)
一般用这些即可,KHASH_INIT
自由度最高,其他都是为特定key类型准备的
另外代码中还提供了KHASH_DECLARE
,用户需要实现所有函数,如果想更换某些函数可以自行定义
// generic
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
// key = int32
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
// key = int64
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
// key = const char*
#define KHASH_SET_INIT_STR(name) \
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
#define KHASH_MAP_INIT_STR(name, khval_t) \
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
hash函数的目的是尽量让每个key计算出来的值都独一无二,这样能尽量避免冲突,更加省时间
我以为大神会有什么特别的hash函数,结果没想到int型直接取值本身,难道这就是大道至简?
// key = int32
#define kh_int_hash_func(key) (khint32_t)(key)
#define kh_int_hash_equal(a, b) ((a) == (b))
// key = int64
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
#define kh_int64_hash_equal(a, b) ((a) == (b))
// key = const char*
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
khint_t h = (khint_t)*s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h;
}
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
另外,还有个int的hash函数,大神说在特定非随机输入下更robust
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
key += ~(key << 15);
key ^= (key >> 10);
key += (key << 3);
key ^= (key >> 6);
key += ~(key << 11);
key ^= (key >> 16);
return key;
}
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)