xref: /xnu-11215.41.3/bsd/skywalk/lib/cuckoo_hashtable.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2018-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <skywalk/os_skywalk_private.h>
29 
30 #include "cuckoo_hashtable.h"
31 
32 #define CUCKOO_TAG "com.apple.skywalk.libcuckoo"
33 SKMEM_TAG_DEFINE(cuckoo_tag, CUCKOO_TAG);
34 
35 
36 SYSCTL_NODE(_kern_skywalk, OID_AUTO, libcuckoo, CTLFLAG_RW | CTLFLAG_LOCKED,
37     0, "Skywalk Cuckoo Hashtable Library");
38 
39 uint32_t cuckoo_verbose = 0;
40 #if (DEVELOPMENT || DEBUG)
41 SYSCTL_UINT(_kern_skywalk_libcuckoo, OID_AUTO, verbose,
42     CTLFLAG_RW | CTLFLAG_LOCKED, &cuckoo_verbose, 0, "");
43 #endif /* DEVELOPMENT || DEBUG */
44 
/* verbosity levels for cht_log(); compared against the cuckoo_verbose sysctl */
typedef enum cht_verb {
	CHTV_ERR = 0,
	CHTV_WARN = 1,
	CHTV_INFO = 2,
	CHTV_DEBUG = 3,
} cht_verb_t;
51 
52 static LCK_GRP_DECLARE(cht_lock_group, "CHT_LOCK");
53 static LCK_ATTR_DECLARE(cht_lock_attr, 0, 0);
54 
55 #if SK_LOG
56 #define cht_log(level, _fmt, ...)       \
57 	do {    \
58 	        if (level <= cuckoo_verbose) {  \
59 	                kprintf("Cuckoo: thread %p %-30s " _fmt "\n",   \
60 	                    current_thread(), __FUNCTION__, ##__VA_ARGS__);     \
61 	        }       \
62 	} while (0);
63 #else  /* !SK_LOG */
64 #define cht_log(_flag, _fmt, ...) do { ((void)0); } while (0)
65 #endif /* !SK_LOG */
66 
67 #define cht_err(_fmt, ...) cht_log(CHTV_ERR, _fmt, ##__VA_ARGS__)
68 #define cht_warn(_fmt, ...) cht_log(CHTV_WARN, _fmt, ##__VA_ARGS__)
69 #define cht_info(_fmt, ...) cht_log(CHTV_INFO, _fmt, ##__VA_ARGS__)
70 #define cht_debug(_fmt, ...) cht_log(CHTV_DEBUG, _fmt, ##__VA_ARGS__)
71 
72 static inline int
cuckoo_node_chain(struct cuckoo_node * node,struct cuckoo_node * new_node)73 cuckoo_node_chain(struct cuckoo_node *node,
74     struct cuckoo_node *new_node)
75 {
76 	struct cuckoo_node *prev_node = node;
77 
78 	/* new node must be zero initialized */
79 	ASSERT(new_node->next == NULL);
80 
81 	/* use tail insert to check for duplicate along list */
82 	while (__improbable(node != NULL)) {
83 		if (node == new_node) {
84 			return EEXIST;
85 		}
86 		prev_node = node;
87 		node = node->next;
88 	}
89 
90 	prev_node->next = new_node;
91 
92 	return 0;
93 }
94 
95 static inline bool
cuckoo_node_del(struct cuckoo_node ** pnode,struct cuckoo_node * del_node)96 cuckoo_node_del(struct cuckoo_node **pnode,
97     struct cuckoo_node *del_node)
98 {
99 	ASSERT(pnode != NULL);
100 
101 	struct cuckoo_node *node = *pnode;
102 	while (node != NULL && node != del_node) {
103 		pnode = &node->next;
104 		node = node->next;
105 	}
106 	if (__probable(node != NULL)) {
107 		*pnode = node->next;
108 		node->next = NULL;
109 		return true;
110 	}
111 
112 	return false;
113 }
114 
/* Link next_node after node; also used with NULL to neutralize a link. */
static inline void
cuckoo_node_set_next(struct cuckoo_node *node, struct cuckoo_node *next_node)
{
	node->next = next_node;
}
120 
/* We probably won't add RCU soon so use simple pointer reference for now */
/* Return the next node on the collision chain (NULL at the tail). */
static inline struct cuckoo_node *
cuckoo_node_next(struct cuckoo_node *node)
{
	return node->next;
}
127 
128 #define _CHT_MAX_LOAD_SHRINK 40       /* at least below 40% load to shrink */
129 #define _CHT_MIN_LOAD_EXPAND 85       /* cuckoo could hold 85% full table */
130 
/* direction of a cuckoo_resize() operation (doubles or halves capacity) */
enum cuckoo_resize_ops {
	_CHT_RESIZE_EXPAND = 0,
	_CHT_RESIZE_SHRINK = 1,
};
135 
136 /*
137  * Following classic Cuckoo hash table design, cuckoo_hashtable use k hash
138  * functions to derive multiple candidate hash table bucket indexes.
139  * Here cuckoo_hashtable use k=2.
140  *     prim_bkt_idx = bkt_idx[1] = hash[1](key) % N_BUCKETS
141  *     alt_bkt_idx  = bkt_idx[2] = hash[2](key) % N_BUCKETS
142  *
143  * Currently, we let the caller pass in the actual key's hash value, because
 * in most use cases the caller has probably already calculated the hash
 * value of the actual key (e.g. using hardware offloading or copy+hash). This
 * also saves us from storing the key in the table (or any side data
 * structure). So
147  *
148  *     hash[1] = hash    // hash(hash value) passed in from caller
149  *     hash[2] = __alt_hash(hash[1])
150  *
151  * __alt_hash derives h2 using h1's high bits, since calculating primary
152  * bucket index uses its low bits. So alt_hash is still a uniformly distributed
153  * random variable (but not independent of h1, but is fine for hashtable usage).
154  *
 * There is an option to store h2 in the table bucket as well, but
 * cuckoo_hashtable does not do so, trading a few extra cpu cycles during
 * add/del operations for lower memory usage. Assuming that the hashtable is
158  * read-heavy rather than write-heavy, this is reasonable.
159  *
160  * In the rare case of full hash value collision, where
161  *     hash[1] == hash[1]'
162  * , there is no way for the hash table to differentiate two objects, thus we
163  * need to chain the fully collided objects under the same bucket slot.
164  * The caller need to walk the chain to explicitly compare the full length key
165  * to find the correct object.
166  *
167  * Reference Counting
168  * The hashtable assumes all objects are reference counted. It takes function
169  * pointers that retain and release the object.
170  * Adding to the table will call its retain function.
171  * Deleting from the table will call its release function.
172  *
173  */
174 
/* hash might be zero, so always use _node == NULL to test empty slot */
struct _slot {
	uint32_t                _hash;  /* full 32-bit hash of chained entries */
	struct cuckoo_node      *_node; /* collision chain head; NULL == empty */
};
180 
181 /*
182  * Cuckoo hashtable cache line awareness:
183  *   - ARM platform has 128B CPU cache line.
184  *   - Intel platform has 64B CPU cache line. However, hardware prefetcher
185  *     treats cache lines as 128B chunk and prefetch the other 64B cache line.
186  *
187  * Thus cuckoo_hashtable use 128B as bucket size to make best use CPU cache
188  * resource.
189  */
190 #define _CHT_CACHELINE_CHUNK 128
191 #define _CHT_SLOT_INVAL UINT8_MAX
/* as many slots as fit in one 128B chunk next to the lock and _inuse count */
static const uint8_t _CHT_BUCKET_SLOTS =
    ((_CHT_CACHELINE_CHUNK - sizeof(lck_mtx_t) - sizeof(uint8_t)) /
    sizeof(struct _slot));

struct _bucket {
	struct _slot            _slots[_CHT_BUCKET_SLOTS];
	decl_lck_mtx_data(, _lock);     /* serializes slot access in this bucket */
	uint8_t                 _inuse; /* number of occupied slots */
} __attribute__((aligned(_CHT_CACHELINE_CHUNK)));
201 
struct cuckoo_hashtable {
	uint32_t        _bitmask;       /* 1s' mask for quick MOD */
	uint32_t        _n_buckets;     /* number of buckets (power of two) */

	volatile uint32_t _n_entries;   /* number of entries in table */
	uint32_t          _capacity;    /* max number of entries */
	uint32_t          _rcapacity;   /* requested capacity (shrink floor) */

	bool            _busy;          /* a resize is in progress */
	uint32_t        _resize_waiters; /* threads sleeping in __resize_begin() */
	decl_lck_rw_data(, _resize_lock); /* shared: find/add/del, excl: resize */
	decl_lck_mtx_data(, _lock);     /* protects _busy/_resize_waiters */

	struct _bucket  *__counted_by(_n_buckets) _buckets;

	/* caller-supplied object callbacks (see header comment above) */
	int (*_obj_cmp)(struct cuckoo_node *node, void *key);
	void (*_obj_retain)(struct cuckoo_node *);
	void (*_obj_release)(struct cuckoo_node *);
} __attribute__((aligned(_CHT_CACHELINE_CHUNK)));
221 
222 static_assert(sizeof(struct _bucket) <= _CHT_CACHELINE_CHUNK);
223 
/* Publish (hash, node) into slot slt; caller holds the bucket lock. */
static inline void
__slot_set(struct _slot *slt, uint32_t hash, struct cuckoo_node *node)
{
	slt->_hash = hash;
	slt->_node = node;
}
230 
/* Mark slot slt empty; caller holds the bucket lock. */
static inline void
__slot_reset(struct _slot *slt)
{
	slt->_hash = 0;
	slt->_node = NULL;
}
237 
/*
 * Derive the alternate hash from the primary one.  The mix is driven
 * by the high 16 bits, since the primary bucket index consumes the
 * low bits; the result stays uniformly distributed (though not
 * independent of the input, which is fine for bucket selection).
 */
static inline uint32_t
__alt_hash(uint32_t hash)
{
#define _CHT_ALT_HASH_MIX       0x5bd1e995      /* Murmur hash mix */
	const uint32_t high_tag = hash >> 16;

	return hash ^ ((high_tag + 1) * _CHT_ALT_HASH_MIX);
}
246 
/* Return the bucket at index b_i; b_i must be < _n_buckets. */
static inline struct _bucket *
__get_bucket(struct cuckoo_hashtable *h, uint32_t b_i)
{
	return &h->_buckets[b_i];
}
252 
/* Primary candidate bucket: low bits of the caller-supplied hash. */
static inline struct _bucket *
__prim_bucket(struct cuckoo_hashtable *h, uint32_t hash)
{
	return __get_bucket(h, hash & h->_bitmask);
}
258 
/* Alternate candidate bucket: low bits of the derived alternate hash. */
static inline struct _bucket *
__alt_bucket(struct cuckoo_hashtable *h, uint32_t hash)
{
	return __get_bucket(h, __alt_hash(hash) & h->_bitmask);
}
264 
265 #if SK_LOG
266 static inline size_t
__bucket_idx(struct cuckoo_hashtable * h,struct _bucket * b)267 __bucket_idx(struct cuckoo_hashtable *h, struct _bucket *b)
268 {
269 	return ((uintptr_t)b - (uintptr_t)&h->_buckets[0]) / sizeof(struct _bucket);
270 }
271 #endif /* SK_LOG */
272 
/* Return slot slot_idx of bucket b; slot_idx must be < _CHT_BUCKET_SLOTS. */
static inline struct _slot *
__bucket_slot(struct _bucket *b, uint32_t slot_idx)
{
	return &b->_slots[slot_idx];
}
278 
/* A slot is empty iff its node pointer is NULL (hash 0 is a valid hash). */
static inline bool
__slot_empty(struct _slot *s)
{
	return s->_node == NULL;
}
284 
/*
 * Round v up to the next power of two by smearing the top set bit
 * into every lower bit, then adding one.  v == 0 yields 0, and
 * values above 2^31 wrap to 0; callers bound the argument.
 */
static inline uint32_t
__align32pow2(uint32_t v)
{
	v--;
	for (uint32_t shift = 1; shift < 32; shift <<= 1) {
		v |= v >> shift;
	}
	return v + 1;
}
298 
/* Table occupancy as a percentage (0-100) of total slot capacity. */
uint32_t
cuckoo_hashtable_load_factor(struct cuckoo_hashtable *h)
{
	return (100 * h->_n_entries) / (h->_n_buckets * _CHT_BUCKET_SLOTS);
}
304 
305 /*
306  * Cuckoo hashtable uses regular mutex.  Most operations(find/add) should
307  * finish faster than a context switch.  It avoids using the spin lock since
308  * it might cause issues on certain platforms (e.g. x86_64) where the trap
309  * handler for dealing with FP/SIMD use would be invoked to perform thread-
310  * specific allocations; the use of FP/SIMD here is related to the memory
311  * compare with mask routines.  Even in case of another thread holding a
312  * bucket lock and went asleep, cuckoo path search would try to find another
313  * path without blockers.
314  *
315  * The only exception is table expansion, which could take a long time, we use
316  * read/write lock to protect the whole table against any read/write in that
317  * case.
318  */
319 
320 /* find/add only acquires table rlock, and serialize with bucket lock */
321 #define __lock_bucket(b)        lck_mtx_lock(&b->_lock)
322 #define __unlock_bucket(b)      lck_mtx_unlock(&b->_lock)
323 
324 #define _CHT_DEADLOCK_THRESHOLD 20
325 static inline bool
__lock_bucket_with_backoff(struct _bucket * b)326 __lock_bucket_with_backoff(struct _bucket *b)
327 {
328 	uint32_t try_counter = 0;
329 	while (!lck_mtx_try_lock(&b->_lock)) {
330 		if (try_counter++ > _CHT_DEADLOCK_THRESHOLD) {
331 			return false;
332 		}
333 	}
334 	return true;
335 }
336 
337 #define __rlock_table(h)        lck_rw_lock_shared(&h->_resize_lock)
338 #define __unrlock_table(h)      lck_rw_unlock_shared(&h->_resize_lock)
339 #define __r2wlock_table(h)      lck_rw_lock_shared_to_exclusive(&h->_resize_lock)
340 #define __wlock_table(h)        lck_rw_lock_exclusive(&h->_resize_lock)
341 #define __unwlock_table(h)      lck_rw_unlock_exclusive(&h->_resize_lock)
342 
/*
 * Serialize resizers: only one thread may resize at a time.
 * Returns 0 with the table write-locked on success; EAGAIN if a peer
 * resize was in flight while we waited (caller should re-evaluate);
 * EINTR if the sleep was interrupted.
 */
static inline int
__resize_begin(struct cuckoo_hashtable *h)
{
	// takes care of concurrent resize
	lck_mtx_lock(&h->_lock);
	while (h->_busy) {
		/* bump waiter count, skipping 0 which means "no waiters" */
		if (++(h->_resize_waiters) == 0) {   /* wraparound */
			h->_resize_waiters++;
		}
		int error = msleep(&h->_resize_waiters, &h->_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		if (error == EINTR) {
			cht_warn("resize waiter was interrupted");
			ASSERT(h->_resize_waiters > 0);
			h->_resize_waiters--;
			lck_mtx_unlock(&h->_lock);
			return EINTR;
		}
		// resizer finished
		lck_mtx_unlock(&h->_lock);
		return EAGAIN;
	}

	h->_busy = true;
	lck_mtx_unlock(&h->_lock);

	// takes other readers offline
	__wlock_table(h);
	return 0;
}
373 
/*
 * Release the table write lock, clear the busy flag and wake every
 * thread blocked in __resize_begin().
 */
static inline void
__resize_end(struct cuckoo_hashtable *h)
{
	__unwlock_table(h);
	lck_mtx_lock(&h->_lock);
	h->_busy = false;
	if (__improbable(h->_resize_waiters > 0)) {
		h->_resize_waiters = 0;
		wakeup(&h->_resize_waiters);
	}
	lck_mtx_unlock(&h->_lock);
}
386 
/*
 * Allocate and initialize a table sized to hold p->cht_capacity
 * entries; the bucket count is (capacity / slots-per-bucket) rounded
 * up to a power of two so bucket indexing reduces to a mask.
 * Returns NULL if the requested capacity is out of bounds or bucket
 * allocation fails.  The cmp/retain/release callbacks are mandatory.
 */
struct cuckoo_hashtable *
cuckoo_hashtable_create(struct cuckoo_hashtable_params *p)
{
	struct cuckoo_hashtable *h = NULL;
	uint32_t n = 0;
	uint32_t n_buckets = 0;
	struct _bucket *buckets = NULL;
	uint32_t i;

	if (p->cht_capacity > CUCKOO_HASHTABLE_ENTRIES_MAX ||
	    p->cht_capacity < _CHT_BUCKET_SLOTS) {
		return NULL;
	}

	ASSERT(p->cht_capacity < UINT32_MAX);
	n = (uint32_t)p->cht_capacity;
	h = sk_alloc_type(struct cuckoo_hashtable, Z_WAITOK | Z_NOFAIL, cuckoo_tag);

	n_buckets = __align32pow2(n / _CHT_BUCKET_SLOTS);
	buckets = sk_alloc_type_array(struct _bucket, n_buckets, Z_WAITOK, cuckoo_tag);
	if (buckets == NULL) {
		sk_free_type(struct cuckoo_hashtable, h);
		return NULL;
	}

	for (i = 0; i < n_buckets; i++) {
		lck_mtx_init(&buckets[i]._lock, &cht_lock_group, &cht_lock_attr);
	}

	lck_mtx_init(&h->_lock, &cht_lock_group, &cht_lock_attr);

	h->_n_entries = 0;
	h->_buckets = buckets;
	h->_n_buckets = n_buckets;
	h->_capacity = h->_rcapacity = h->_n_buckets * _CHT_BUCKET_SLOTS;
	/* power-of-two bucket count: bucket index == hash & _bitmask */
	h->_bitmask = n_buckets - 1;
	lck_rw_init(&h->_resize_lock, &cht_lock_group, &cht_lock_attr);
	h->_busy = false;
	h->_resize_waiters = 0;

	ASSERT(p->cht_obj_retain != NULL);
	ASSERT(p->cht_obj_release != NULL);
	ASSERT(p->cht_obj_cmp != NULL);
	h->_obj_cmp = p->cht_obj_cmp;
	h->_obj_retain = p->cht_obj_retain;
	h->_obj_release = p->cht_obj_release;

	return h;
}
436 
437 void
cuckoo_hashtable_free(struct cuckoo_hashtable * h)438 cuckoo_hashtable_free(struct cuckoo_hashtable *h)
439 {
440 	uint32_t i;
441 
442 	if (h == NULL) {
443 		return;
444 	}
445 
446 	ASSERT(h->_n_entries == 0);
447 
448 	if (h->_buckets != NULL) {
449 		for (i = 0; i < h->_n_buckets; i++) {
450 			lck_mtx_destroy(&h->_buckets[i]._lock, &cht_lock_group);
451 		}
452 		sk_free_type_array_counted_by(struct _bucket, h->_n_buckets, h->_buckets);
453 	}
454 	sk_free_type(struct cuckoo_hashtable, h);
455 }
456 
/* Current number of entries; read without synchronization (racy snapshot). */
size_t
cuckoo_hashtable_entries(struct cuckoo_hashtable *h)
{
	return h->_n_entries;
}
462 
/* Current maximum entry count: buckets times slots per bucket. */
size_t
cuckoo_hashtable_capacity(struct cuckoo_hashtable *h)
{
	return h->_n_buckets * _CHT_BUCKET_SLOTS;
}
468 
/* Memory used by the table itself: header plus the bucket array. */
size_t
cuckoo_hashtable_memory_footprint(struct cuckoo_hashtable *h)
{
	size_t total_meminuse = sizeof(struct cuckoo_hashtable) +
	    (h->_n_buckets * sizeof(struct _bucket));
	return total_meminuse;
}
476 
477 static inline struct cuckoo_node *
__find_in_bucket(struct cuckoo_hashtable * h,struct _bucket * b,void * key,uint32_t hash)478 __find_in_bucket(struct cuckoo_hashtable *h, struct _bucket *b, void *key,
479     uint32_t hash)
480 {
481 	uint32_t i;
482 	struct cuckoo_node *node = NULL;
483 
484 	__lock_bucket(b);
485 	if (b->_inuse == 0) {
486 		goto done;
487 	}
488 	for (i = 0; i < _CHT_BUCKET_SLOTS; i++) {
489 		if (b->_slots[i]._hash == hash) {
490 			node = b->_slots[i]._node;
491 			while (node != NULL) {
492 				if (h->_obj_cmp(node, key) == 0) {
493 					h->_obj_retain(node);
494 					goto done;
495 				}
496 				node = cuckoo_node_next(node);
497 			}
498 		}
499 	}
500 
501 done:
502 	__unlock_bucket(b);
503 	return node;
504 }
505 
506 /* will return node retained */
507 struct cuckoo_node *
cuckoo_hashtable_find_with_hash(struct cuckoo_hashtable * h,void * key,uint32_t hash)508 cuckoo_hashtable_find_with_hash(struct cuckoo_hashtable *h, void *key,
509     uint32_t hash)
510 {
511 	struct _bucket *b1, *b2;
512 	struct cuckoo_node *node = NULL;
513 
514 	__rlock_table(h);
515 
516 	b1 = __prim_bucket(h, hash);
517 	if ((node = __find_in_bucket(h, b1, key, hash)) != NULL) {
518 		goto done;
519 	}
520 
521 	b2 = __alt_bucket(h, hash);
522 	if ((node = __find_in_bucket(h, b2, key, hash)) != NULL) {
523 		goto done;
524 	}
525 
526 done:
527 	__unrlock_table(h);
528 	return node;
529 }
530 
531 /*
532  * To add a key into cuckoo_hashtable:
533  *   1. First it searches the key's two candidate buckets b1, b2
534  *   2. If there are slots available in b1 or b2, we place the key there
535  *   3. Otherwise cuckoo_hashtable will have to probe and make space
536  *
 * To move keys around (open addressing hash table), cuckoo_hashtable needs to
 * first find an available slot via Cuckoo search. Here it uses breadth-first
 * search to find the shortest path towards an empty bucket slot.
540  *
541  */
/*
 * Try to place (hash, node) into bucket b.  Returns 0 on success
 * (node retained, entry count bumped), EEXIST if node is already
 * chained under a matching slot, -1 if the bucket has no usable slot
 * (caller falls back to the alternate bucket or cuckoo_probe()).
 */
static inline int
__add_to_bucket(struct cuckoo_hashtable *h, struct _bucket *b,
    struct cuckoo_node *node, uint32_t hash)
{
	int ret = -1;
	uint8_t avail_i = _CHT_SLOT_INVAL;

	__lock_bucket(b);
	if (b->_inuse == _CHT_BUCKET_SLOTS) {
		goto done;
	}
	for (uint8_t i = 0; i < _CHT_BUCKET_SLOTS; i++) {
		struct _slot *s = __bucket_slot(b, i);
		if (__slot_empty(s)) {
			/* remember the first free slot, but keep scanning:
			 * a same-hash slot takes precedence */
			if (avail_i == _CHT_SLOT_INVAL) {
				avail_i = i;
			}
		} else {
			/* chain to existing slot with same hash */
			if (__improbable(s->_hash == hash)) {
				ASSERT(s->_node != NULL);
				ret = cuckoo_node_chain(s->_node, node);
				if (ret != 0) {
					goto done;
				}
				cht_debug("hash %x node %p inserted [%zu][%d]",
				    hash, node, __bucket_idx(h, b), i);
				OSAddAtomic(1, &h->_n_entries);
				h->_obj_retain(node);
				goto done;
			}
		}
	}
	if (avail_i != _CHT_SLOT_INVAL) {
		h->_obj_retain(node);
		b->_slots[avail_i]._hash = hash;
		b->_slots[avail_i]._node = node;
		b->_inuse++;
		cht_debug("hash %x node %p inserted [%zu][%d]", hash, node,
		    __bucket_idx(h, b), avail_i);
		OSAddAtomic(1, &h->_n_entries);
		ret = 0;
	}
done:
	__unlock_bucket(b);
	return ret;
}
589 
590 #define _CHT_BFS_QUEUE_LEN      UINT8_MAX
591 #define _CHT_BFS_QUEUE_END      (_CHT_BFS_QUEUE_LEN - _CHT_BUCKET_SLOTS)
592 
/*
 * One node of the BFS over buckets in cuckoo_probe(): the bucket
 * visited plus back-links (indexes into the BFS queue) recording how
 * it was reached, so cuckoo_move() can replay the path backwards.
 */
struct _bfs_node {
	uint32_t        bkt_idx;        /* bucket visited */
	uint8_t         prev_node_idx;  /* parent queue index; _CHT_BFS_QUEUE_LEN at root */
	uint8_t         prev_slot_idx;  /* slot in parent bucket; _CHT_SLOT_INVAL at root */
};
598 
599 /*
600  * Move slots backwards on cuckoo path
601  *
602  * cuckoo_move would hold at most 2 locks at any time, moving from
603  * the end of cuckoo path toward the bucket where new keys should be
604  * stored. There could be chances of dead lock in case of multiple
605  * writers have overlapping cuckoo path. We could arrange the order of
606  * locking to avoid that but then we have to take all locks upfront,
607  * which is not friendly to concurrent readers. So instead, we try to
608  * take one by one(but still at most 2 locks holding at any time),
609  * with backoff in mind.
610  */
static int
cuckoo_move(struct cuckoo_hashtable *h, struct cuckoo_node *node,
    uint32_t hash, struct _bfs_node queue[_CHT_BFS_QUEUE_LEN], uint8_t leaf_node_idx,
    uint8_t leaf_slot)
{
	struct _bfs_node *prev_node, *curr_node;
	struct _bucket *from_bkt, *to_bkt, *alt_bkt;
	uint8_t from_slot, to_slot;

	/* start at the empty leaf slot discovered by the BFS */
	curr_node = &queue[leaf_node_idx];
	to_bkt = __get_bucket(h, curr_node->bkt_idx);
	to_slot = leaf_slot;

	__lock_bucket(to_bkt);

	/* follow BFS parent links back towards the root bucket */
	while (__probable(curr_node->prev_node_idx != _CHT_BFS_QUEUE_LEN)) {
		prev_node = &queue[curr_node->prev_node_idx];
		from_bkt = __get_bucket(h, prev_node->bkt_idx);
		from_slot = curr_node->prev_slot_idx;

		if (!__lock_bucket_with_backoff(from_bkt)) {
			/* a dead lock or a sleeping-thread holding the lock */
			__unlock_bucket(to_bkt);
			cht_warn("cuckoo move deadlock detected");
			return EINVAL;
		}

		/*
		 * Verify cuckoo path by checking:
		 * 1. from_bkt[from_slot]'s alternative bucket is still to_bkt
		 * 2. to_bkt[to_slot] is still vacant
		 */
		alt_bkt = __alt_bucket(h, from_bkt->_slots[from_slot]._hash);
		if (alt_bkt != to_bkt ||
		    !__slot_empty(__bucket_slot(to_bkt, to_slot))) {
			__unlock_bucket(from_bkt);
			__unlock_bucket(to_bkt);
			cht_warn("cuckoo move path invalid: %s %s",
			    alt_bkt != to_bkt ? "alt_bkt != to_bkt" : "",
			    !__slot_empty(__bucket_slot(to_bkt, to_slot)) ?
			    "!slot_empty(to_bkt, to_slot)" : "");
			return EINVAL;
		}

		cht_log(CHTV_DEBUG, "Move [0x%lx][%d] to [0x%lx][%d]",
		    from_bkt - h->_buckets, from_slot, to_bkt - h->_buckets,
		    to_slot);

		ASSERT(to_bkt->_slots[to_slot]._node == NULL);
		ASSERT(to_bkt->_slots[to_slot]._hash == 0);

		/* move entry backward */
		to_bkt->_slots[to_slot] = from_bkt->_slots[from_slot];
		to_bkt->_inuse++;
		__slot_reset(&from_bkt->_slots[from_slot]);
		from_bkt->_inuse--;

		/* keep from_bkt locked: it becomes the next destination */
		__unlock_bucket(to_bkt);

		curr_node = prev_node;
		to_bkt = from_bkt;
		to_slot = from_slot;
	}

	ASSERT(curr_node->prev_node_idx == _CHT_BFS_QUEUE_LEN);
	ASSERT(curr_node->prev_slot_idx == _CHT_SLOT_INVAL);

	/* if root slot is no longer valid */
	if (to_bkt->_slots[to_slot]._node != NULL) {
		__unlock_bucket(to_bkt);
		return EINVAL;
	}

	/* root slot freed up: install the new entry there */
	to_bkt->_inuse++;
	__slot_set(&to_bkt->_slots[to_slot], hash, node);
	h->_obj_retain(node);
	__unlock_bucket(to_bkt);

	OSAddAtomic(1, &h->_n_entries);

	cht_debug("hash %x node %p inserted at [%zu][%d]", hash, node,
	    __bucket_idx(h, to_bkt), to_slot);

	return 0;
}
696 
697 static int
cuckoo_probe(struct cuckoo_hashtable * h,struct cuckoo_node * node,uint32_t hash)698 cuckoo_probe(struct cuckoo_hashtable *h, struct cuckoo_node *node,
699     uint32_t hash)
700 {
701 	struct _bfs_node queue[_CHT_BFS_QUEUE_LEN];
702 	uint8_t head, tail;
703 	struct _bucket *b;
704 	uint8_t avail_i;
705 	int ret = ENOMEM;
706 
707 	/* probe starts from its primary bucket */
708 	queue[0].bkt_idx = hash & h->_bitmask;
709 	queue[0].prev_node_idx = _CHT_BFS_QUEUE_LEN;
710 	queue[0].prev_slot_idx = _CHT_SLOT_INVAL;
711 
712 	head = 0;
713 	tail = 1;
714 
715 	while (__probable(tail != head && tail < _CHT_BFS_QUEUE_END)) {
716 		b = __get_bucket(h, queue[head].bkt_idx);
717 		avail_i = _CHT_SLOT_INVAL;
718 		for (uint8_t i = 0; i < _CHT_BUCKET_SLOTS; i++) {
719 			struct _slot *s = __bucket_slot(b, i);
720 			if (__slot_empty(s)) {
721 				if (avail_i == _CHT_SLOT_INVAL) {
722 					avail_i = i;
723 				}
724 				continue;
725 			}
726 
727 			/*
728 			 * Another node with same hash could have been probed
729 			 * into this bucket, chain to it.
730 			 */
731 			if (__improbable(s->_hash == hash)) {
732 				ASSERT(s->_node != NULL);
733 				ret = cuckoo_node_chain(s->_node, node);
734 				if (ret != 0) {
735 					goto done;
736 				}
737 				cht_debug("hash %x node %p inserted [%zu][%d]",
738 				    hash, node, __bucket_idx(h, b), i);
739 				OSAddAtomic(1, &h->_n_entries);
740 				h->_obj_retain(node);
741 				goto done;
742 			}
743 
744 			queue[tail].bkt_idx = __alt_hash(s->_hash) & h->_bitmask;
745 			queue[tail].prev_node_idx = head;
746 			queue[tail].prev_slot_idx = i;
747 			tail++;
748 		}
749 
750 		if (avail_i != _CHT_SLOT_INVAL) {
751 			ret = cuckoo_move(h, node, hash, queue, head, avail_i);
752 			if (ret == 0) {
753 				goto done;
754 			} else if (ret == EINVAL) {
755 				cht_warn("cukoo path invalidated");
756 				goto skip;
757 			} else {
758 				cht_err("faild: unknown err %d", ret);
759 				goto done;
760 			}
761 		}
762 skip:
763 		head++;
764 	}
765 
766 	if (tail == head || tail >= _CHT_BFS_QUEUE_END) {
767 		cht_warn("failed: cuckoo probe out of search space "
768 		    "head %d tail %d (%d/%d, load factor %d%%)", head, tail,
769 		    h->_n_entries, h->_capacity,
770 		    cuckoo_hashtable_load_factor(h));
771 		ret = ENOMEM;
772 	} else {
773 		cht_warn("failed: cuckoo probe path invalidated "
774 		    " (%d/%d, load factor %d%%)", h->_n_entries, h->_capacity,
775 		    cuckoo_hashtable_load_factor(h));
776 		ret = EAGAIN;
777 	}
778 done:
779 	return ret;
780 }
781 
/*
 * Invoke node_handler on every chained node in every bucket.  When
 * wlocked is true the caller already holds the table write lock
 * (resize path) and no further locking is taken; otherwise the walk
 * runs under the shared table lock plus each bucket's lock.
 */
static inline void
__foreach_node(struct cuckoo_hashtable *h, bool wlocked,
    void (^node_handler)(struct cuckoo_node *, uint32_t hash))
{
	if (!wlocked) {
		__rlock_table(h);
	}
	for (uint32_t i = 0; i < h->_n_buckets; i++) {
		struct _bucket *b = &h->_buckets[i];
		if (b->_inuse == 0) {
			continue;
		}
		if (!wlocked) {
			__lock_bucket(b);
		}
		for (uint32_t j = 0; j < _CHT_BUCKET_SLOTS; j++) {
			struct _slot *s = __bucket_slot(b, j);
			struct cuckoo_node *node = NULL, *next_node = NULL;
			node = s->_node;
			while (node != NULL) {
				/* sample next first: handler may unlink node */
				next_node = cuckoo_node_next(node);
				node_handler(node, s->_hash);
				node = next_node;
			}
		}
		if (!wlocked) {
			__unlock_bucket(b);
		}
	}
	if (!wlocked) {
		__unrlock_table(h);
	}
}
815 
/* Apply node_handler to every node, with full read-side locking. */
void
cuckoo_hashtable_foreach(struct cuckoo_hashtable *h,
    void (^node_handler)(struct cuckoo_node *, uint32_t hash))
{
	__foreach_node(h, false, node_handler);
}
822 
/* No-op retain used while rehashing into a temporary resize table. */
static void
cuckoo_dummy_retain(struct cuckoo_node *node)
{
#pragma unused(node)
}
828 
/* No-op release used while rehashing into a temporary resize table. */
static void
cuckoo_dummy_release(struct cuckoo_node *node)
{
#pragma unused(node)
}
834 
/*
 * Resize the table by a factor of two in the given direction.  A
 * temporary table is built, all nodes are rehashed into it (with
 * dummy retain/release so refcounts are untouched), then the new
 * bucket array is swapped in and the temporary table disposed of.
 * Runs with the table write-locked.  Returns 0 on success, EAGAIN if
 * a peer resized first, ENOMEM on allocation failure.
 */
static int
cuckoo_resize(struct cuckoo_hashtable *h, enum cuckoo_resize_ops option)
{
	int ret = 0;

	/* backoff from concurrent expansion */
	do {
		ret = __resize_begin(h);
		if (ret == EAGAIN) {
			cht_info("resize done by peer");
			return EAGAIN;
		}
	} while (ret == EINTR);

	uint32_t curr_capacity = h->_n_buckets * _CHT_BUCKET_SLOTS;
	uint32_t curr_load = (100 * h->_n_entries) / curr_capacity;
	uint32_t new_capacity;
	__block size_t add_called = 0;

	/* check load factor to ensure we are not hitting something else */
	if (option == _CHT_RESIZE_EXPAND) {
		if (curr_load < _CHT_MIN_LOAD_EXPAND) {
			cht_warn("Warning: early expand at %d load", curr_load);
		}
		new_capacity = curr_capacity * 2;
	} else {
		/* never shrink below the originally requested capacity */
		if (curr_load > _CHT_MAX_LOAD_SHRINK ||
		    curr_capacity == h->_rcapacity) {
			goto done;
		}
		new_capacity = curr_capacity / 2;
	}

	cht_info("resize %d/(%d -> %d)", h->_n_entries,
	    curr_capacity, new_capacity);

	/* dummy retain/release: moving nodes must not alter refcounts */
	struct cuckoo_hashtable_params new_p = {
		.cht_capacity = new_capacity,
		.cht_obj_cmp = h->_obj_cmp,
		.cht_obj_retain = cuckoo_dummy_retain,
		.cht_obj_release = cuckoo_dummy_release,
	};
	struct cuckoo_hashtable *tmp_h;
	tmp_h = cuckoo_hashtable_create(&new_p);
	if (tmp_h == NULL) {
		ret = ENOMEM;
		goto done;
	}

	/* write lock is held, so iterate without further locking */
	__foreach_node(h, true, ^(struct cuckoo_node *node, uint32_t hash) {
		int error = 0;
		cuckoo_node_set_next(node, NULL);
		error = cuckoo_hashtable_add_with_hash(tmp_h, node, hash);
		ASSERT(error == 0);
		add_called++;
	});

	if (__improbable(cuckoo_hashtable_entries(h) !=
	    cuckoo_hashtable_entries(tmp_h))) {
		panic("h %zu add_called %zu tmp_h %zu",
		    cuckoo_hashtable_entries(h), add_called,
		    cuckoo_hashtable_entries(tmp_h));
	}

	/* steal tmp_h's bucket array, free ours, then dispose of tmp_h */
	for (uint32_t i = 0; i < h->_n_buckets; i++) {
		lck_mtx_destroy(&h->_buckets[i]._lock, &cht_lock_group);
	}
	h->_capacity = tmp_h->_n_buckets * _CHT_BUCKET_SLOTS;
	h->_bitmask = tmp_h->_bitmask;
	sk_free_type_array_counted_by(struct _bucket, h->_n_buckets, h->_buckets);

	h->_buckets = tmp_h->_buckets;
	h->_n_buckets = tmp_h->_n_buckets;
	lck_rw_destroy(&tmp_h->_resize_lock, &cht_lock_group);
	lck_mtx_destroy(&tmp_h->_lock, &cht_lock_group);
	sk_free_type(struct cuckoo_hashtable, tmp_h);

done:
	__resize_end(h);

	return ret;
}
917 
918 static inline int
cuckoo_add_no_expand(struct cuckoo_hashtable * h,struct cuckoo_node * node,uint32_t hash)919 cuckoo_add_no_expand(struct cuckoo_hashtable *h,
920     struct cuckoo_node *node, uint32_t hash)
921 {
922 	struct _bucket *b1, *b2;
923 	int ret = -1;
924 
925 	__rlock_table(h);
926 
927 	b1 = __prim_bucket(h, hash);
928 	if ((ret = __add_to_bucket(h, b1, node, hash)) == 0) {
929 		goto done;
930 	}
931 
932 	b2 = __alt_bucket(h, hash);
933 	if ((ret = __add_to_bucket(h, b2, node, hash)) == 0) {
934 		goto done;
935 	}
936 
937 	ret = cuckoo_probe(h, node, hash);
938 done:
939 	__unrlock_table(h);
940 	return ret;
941 }
942 
/*
 * Insert (hash, node) into the table, expanding the table on demand.
 * The node is retained on success.  Returns 0 on success, EEXIST if
 * the node is already present, or an error from the resize path.
 */
int
cuckoo_hashtable_add_with_hash(struct cuckoo_hashtable *h,
    struct cuckoo_node *node, uint32_t hash)
{
	int ret;

	/* neutralize node to avoid non-terminating tail */
	ASSERT(cuckoo_node_next(node) == NULL);

	ret = cuckoo_add_no_expand(h, node, hash);
	if (ret == ENOMEM) {
		do {
			ret = cuckoo_resize(h, _CHT_RESIZE_EXPAND);
			if (ret != 0 && ret != EAGAIN) {
				break;
			}
			// this could still fail, when other threads added
			// enough objs that another resize is needed
			ret = cuckoo_add_no_expand(h, node, hash);
		} while (ret == ENOMEM);
	}

	return ret;
}
967 
968 static inline int
__del_from_bucket(struct cuckoo_hashtable * h,struct _bucket * b,struct cuckoo_node * node,uint32_t hash)969 __del_from_bucket(struct cuckoo_hashtable *h, struct _bucket *b,
970     struct cuckoo_node *node, uint32_t hash)
971 {
972 	uint32_t i;
973 
974 	__lock_bucket(b);
975 	for (i = 0; i < _CHT_BUCKET_SLOTS; i++) {
976 		if (b->_slots[i]._hash == hash) {
977 			if (cuckoo_node_del(&b->_slots[i]._node, node)) {
978 				h->_obj_release(node);
979 				OSAddAtomic(-1, &h->_n_entries);
980 				if (__slot_empty(__bucket_slot(b, i))) {
981 					b->_slots[i]._hash = 0;
982 					b->_inuse--;
983 				}
984 				__unlock_bucket(b);
985 				return 0;
986 			}
987 		}
988 	}
989 	__unlock_bucket(b);
990 	return ENOENT;
991 }
992 
993 int
cuckoo_hashtable_del(struct cuckoo_hashtable * h,struct cuckoo_node * node,uint32_t hash)994 cuckoo_hashtable_del(struct cuckoo_hashtable *h,
995     struct cuckoo_node *node, uint32_t hash)
996 {
997 	struct _bucket *b1, *b2;
998 	int ret = -1;
999 
1000 	__rlock_table(h);
1001 
1002 	b1 = __prim_bucket(h, hash);
1003 	if ((ret = __del_from_bucket(h, b1, node, hash)) == 0) {
1004 		goto done;
1005 	}
1006 
1007 	b2 = __alt_bucket(h, hash);
1008 	if ((ret = __del_from_bucket(h, b2, node, hash)) == 0) {
1009 		goto done;
1010 	}
1011 
1012 done:
1013 	if (ret == 0) {
1014 		cuckoo_node_set_next(node, NULL);
1015 	}
1016 	__unrlock_table(h);
1017 	return ret;
1018 }
1019 
/*
 * Opportunistically shrink the table.  Best effort: the return value
 * of cuckoo_resize() is deliberately ignored, so a shrink that cannot
 * proceed is silently skipped.
 */
void
cuckoo_hashtable_try_shrink(struct cuckoo_hashtable *h)
{
	cuckoo_resize(h, _CHT_RESIZE_SHRINK);
}
1025 
1026 #if (DEVELOPMENT || DEBUG)
1027 
1028 static inline bool
cuckoo_node_looped(struct cuckoo_node * node)1029 cuckoo_node_looped(struct cuckoo_node *node)
1030 {
1031 	struct cuckoo_node *runner = node;
1032 
1033 	if (node == NULL) {
1034 		return false;
1035 	}
1036 
1037 	while (runner->next && runner->next->next) {
1038 		runner = runner->next->next;
1039 		node = node->next;
1040 
1041 		if (runner == node) {
1042 			return true;
1043 		}
1044 	}
1045 	return false;
1046 }
1047 
/*
 * Debug-only invariant check over the entire table.
 *
 * With the table write-locked (so no add/del/resize can run
 * concurrently), walk every slot of every bucket and verify:
 *   - each chained node's hash maps to this bucket as either its
 *     primary or alternate location (no "stray" entries);
 *   - no slot chain contains a cycle;
 *   - each bucket's cached _inuse count matches its occupied slots;
 *   - the total node count seen equals h->_n_entries.
 * Any violation panics immediately; the "healthy" flag and the -1
 * return exist as a belt-and-braces fallback should panic() not
 * terminate.
 *
 * Returns 0 if all invariants hold, -1 otherwise.
 */
int
cuckoo_hashtable_health_check(struct cuckoo_hashtable *h)
{
	uint32_t hash;
	uint32_t i, j;
	struct _bucket *b;
	struct cuckoo_node *node;
	bool healthy = true;
	uint32_t seen = 0;        /* running total of nodes across all chains */

	__wlock_table(h);

	for (i = 0; i < h->_n_buckets; i++) {
		b = &h->_buckets[i];
		uint8_t inuse = 0;        /* recount of occupied slots in this bucket */
		for (j = 0; j < _CHT_BUCKET_SLOTS; j++) {
			hash = b->_slots[j]._hash;
			node = b->_slots[j]._node;
			if (node != NULL) {
				inuse++;
			}
			/* walk the chain of nodes sharing this slot's hash */
			while (node != NULL) {
				seen++;
				/* every node must belong to this bucket by hash */
				if ((__prim_bucket(h, hash) != b) &&
				    (__alt_bucket(h, hash) != b)) {
					panic("[%d][%d] stray hash %x node %p",
					    i, j, hash, node);
					healthy = false;
				}

				if (cuckoo_node_looped(node)) {
					panic("[%d][%d] looped hash %x node %p",
					    i, j, hash, node);
					healthy = false;
				}
				node = cuckoo_node_next(node);
			}
		}
		/* cached per-bucket occupancy must match the recount */
		ASSERT(inuse == b->_inuse);
	}

	/* global entry counter must agree with the walk */
	if (seen != h->_n_entries) {
		panic("seen %d != n_entries %d", seen, h->_n_entries);
	}

	__unwlock_table(h);

	if (!healthy) {
		cht_err("table unhealthy");
		return -1;
	} else {
		return 0;
	}
}
1102 
1103 void
cuckoo_hashtable_dump(struct cuckoo_hashtable * h)1104 cuckoo_hashtable_dump(struct cuckoo_hashtable *h)
1105 {
1106 	uint32_t hash;
1107 	struct cuckoo_node *node;
1108 	uint32_t i, j;
1109 	struct _bucket *b;
1110 
1111 	cuckoo_hashtable_health_check(h);
1112 
1113 	for (i = 0; i < h->_n_buckets; i++) {
1114 		printf("%d\t", i);
1115 		b = &h->_buckets[i];
1116 		for (j = 0; j < _CHT_BUCKET_SLOTS; j++) {
1117 			hash = b->_slots[j]._hash;
1118 			node = b->_slots[j]._node;
1119 			printf("0x%08x(%p) ", hash, node);
1120 		}
1121 		printf("\n");
1122 	}
1123 }
#endif /* DEVELOPMENT || DEBUG */
1125