1 // SPDX-License-Identifier: GPL-2.0
2 /* IPVS:	Maglev Hashing scheduling module
3  *
4  * Authors:	Inju Song <inju.song@navercorp.com>
5  *
6  */
7 
/* The mh algorithm assigns a preference list of all the lookup
 * table positions to each destination and populates the table with
 * the most-preferred position of each destination. A destination is
 * then selected by hashing the source IP address and looking the
 * result up in the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */
19 
/* Every format string passed through pr_fmt() gets the KMSG_COMPONENT
 * prefix, i.e. "IPVS: ".
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22 
23 #include <linux/ip.h>
24 #include <linux/slab.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/skbuff.h>
28 
29 #include <net/ip_vs.h>
30 
31 #include <linux/siphash.h>
32 #include <linux/bitops.h>
33 #include <linux/gcd.h>
34 
/* Service flags interpreted by this scheduler in ip_vs_mh_schedule():
 * - MH_FALLBACK: select through ip_vs_mh_get_fallback(), which keeps
 *   probing the table when the first destination is unavailable.
 * - MH_PORT: feed the port returned by ip_vs_mh_get_port() into the
 *   hash in addition to the address.
 */
#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 /* MH fallback */
#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 /* MH use port */
37 
/* One slot of the MH lookup table. The slot holds a reference on its
 * destination: taken in ip_vs_mh_populate(), dropped in ip_vs_mh_reset()
 * or when ip_vs_mh_populate() hands the slot to another destination.
 */
struct ip_vs_mh_lookup {
	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
};
41 
/* Per-destination scratch state used while (re)building the lookup table.
 * Filled by ip_vs_mh_permutate() and consumed by ip_vs_mh_populate().
 */
struct ip_vs_mh_dest_setup {
	unsigned int	offset; /* starting offset */
	unsigned int	skip;	/* skip */
	unsigned int	perm;	/* next_offset */
	int		turns;	/* weight / gcd() and rshift */
};
48 
/* Available prime numbers for MH table.
 * The table is only ever read (through IP_VS_MH_TAB_SIZE), so keep it const.
 */
static const int primes[] = {251, 509, 1021, 2039, 4093,
			     8191, 16381, 32749, 65521, 131071};
52 
/* For IPVS MH entry hash table.
 *
 * CONFIG_IP_VS_MH_TAB_INDEX selects the table size: it is rebased by 8 to
 * index primes[], so with the default of 12 the table has primes[4] = 4093
 * slots. primes[] has 10 entries, so the configured value has to stay
 * within 8..17.
 * IP_VS_MH_TAB_BITS is the bit budget ip_vs_mh_shift_weight() compares
 * weight / gcd against.
 */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif
#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
60 
/* Per-service scheduler state, stored in svc->sched_data. */
struct ip_vs_mh_state {
	struct rcu_head			rcu_head;	/* for call_rcu() in ip_vs_mh_done_svc() */
	struct ip_vs_mh_lookup		*lookup;	/* IP_VS_MH_TAB_SIZE slots */
	struct ip_vs_mh_dest_setup	*dest_setup;	/* only valid inside ip_vs_mh_reassign() */
	hsiphash_key_t			hash1, hash2;	/* set by generate_hash_secret() */
	int				gcd;		/* from ip_vs_mh_gcd_weight() */
	int				rshift;		/* from ip_vs_mh_shift_weight() */
};
69 
/* Load both hsiphash keys with fixed constants; each key uses the same
 * constant for both of its words.
 */
static inline void generate_hash_secret(hsiphash_key_t *hash1,
					hsiphash_key_t *hash2)
{
	hash1->key[0] = hash1->key[1] = 2654435761UL;
	hash2->key[0] = hash2->key[1] = 2654446892UL;
}
79 
80 /* Helper function to determine if server is unavailable */
is_unavailable(struct ip_vs_dest * dest)81 static inline bool is_unavailable(struct ip_vs_dest *dest)
82 {
83 	return atomic_read(&dest->weight) <= 0 ||
84 	       dest->flags & IP_VS_DEST_F_OVERLOAD;
85 }
86 
87 /* Returns hash value for IPVS MH entry */
88 static inline unsigned int
ip_vs_mh_hashkey(int af,const union nf_inet_addr * addr,__be16 port,hsiphash_key_t * key,unsigned int offset)89 ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
90 		 __be16 port, hsiphash_key_t *key, unsigned int offset)
91 {
92 	unsigned int v;
93 	__be32 addr_fold = addr->ip;
94 
95 #ifdef CONFIG_IP_VS_IPV6
96 	if (af == AF_INET6)
97 		addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
98 			    addr->ip6[2] ^ addr->ip6[3];
99 #endif
100 	v = (offset + ntohs(port) + ntohl(addr_fold));
101 	return hsiphash(&v, sizeof(v), key);
102 }
103 
104 /* Reset all the hash buckets of the specified table. */
ip_vs_mh_reset(struct ip_vs_mh_state * s)105 static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
106 {
107 	int i;
108 	struct ip_vs_mh_lookup *l;
109 	struct ip_vs_dest *dest;
110 
111 	l = &s->lookup[0];
112 	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
113 		dest = rcu_dereference_protected(l->dest, 1);
114 		if (dest) {
115 			ip_vs_dest_put(dest);
116 			RCU_INIT_POINTER(l->dest, NULL);
117 		}
118 		l++;
119 	}
120 }
121 
ip_vs_mh_permutate(struct ip_vs_mh_state * s,struct ip_vs_service * svc)122 static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
123 			      struct ip_vs_service *svc)
124 {
125 	struct list_head *p;
126 	struct ip_vs_mh_dest_setup *ds;
127 	struct ip_vs_dest *dest;
128 	int lw;
129 
130 	/* If gcd is smaller then 1, number of dests or
131 	 * all last_weight of dests are zero. So, skip
132 	 * permutation for the dests.
133 	 */
134 	if (s->gcd < 1)
135 		return 0;
136 
137 	/* Set dest_setup for the dests permutation */
138 	p = &svc->destinations;
139 	ds = &s->dest_setup[0];
140 	while ((p = p->next) != &svc->destinations) {
141 		dest = list_entry(p, struct ip_vs_dest, n_list);
142 
143 		ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
144 					      dest->port, &s->hash1, 0) %
145 					      IP_VS_MH_TAB_SIZE;
146 		ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
147 					    dest->port, &s->hash2, 0) %
148 					    (IP_VS_MH_TAB_SIZE - 1) + 1;
149 		ds->perm = ds->offset;
150 
151 		lw = atomic_read(&dest->last_weight);
152 		ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
153 		ds++;
154 	}
155 
156 	return 0;
157 }
158 
ip_vs_mh_populate(struct ip_vs_mh_state * s,struct ip_vs_service * svc)159 static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
160 			     struct ip_vs_service *svc)
161 {
162 	int n, c, dt_count;
163 	unsigned long *table;
164 	struct list_head *p;
165 	struct ip_vs_mh_dest_setup *ds;
166 	struct ip_vs_dest *dest, *new_dest;
167 
168 	/* If gcd is smaller then 1, number of dests or
169 	 * all last_weight of dests are zero. So, skip
170 	 * the population for the dests and reset lookup table.
171 	 */
172 	if (s->gcd < 1) {
173 		ip_vs_mh_reset(s);
174 		return 0;
175 	}
176 
177 	table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
178 			sizeof(unsigned long), GFP_KERNEL);
179 	if (!table)
180 		return -ENOMEM;
181 
182 	p = &svc->destinations;
183 	n = 0;
184 	dt_count = 0;
185 	while (n < IP_VS_MH_TAB_SIZE) {
186 		if (p == &svc->destinations)
187 			p = p->next;
188 
189 		ds = &s->dest_setup[0];
190 		while (p != &svc->destinations) {
191 			/* Ignore added server with zero weight */
192 			if (ds->turns < 1) {
193 				p = p->next;
194 				ds++;
195 				continue;
196 			}
197 
198 			c = ds->perm;
199 			while (test_bit(c, table)) {
200 				/* Add skip, mod IP_VS_MH_TAB_SIZE */
201 				ds->perm += ds->skip;
202 				if (ds->perm >= IP_VS_MH_TAB_SIZE)
203 					ds->perm -= IP_VS_MH_TAB_SIZE;
204 				c = ds->perm;
205 			}
206 
207 			__set_bit(c, table);
208 
209 			dest = rcu_dereference_protected(s->lookup[c].dest, 1);
210 			new_dest = list_entry(p, struct ip_vs_dest, n_list);
211 			if (dest != new_dest) {
212 				if (dest)
213 					ip_vs_dest_put(dest);
214 				ip_vs_dest_hold(new_dest);
215 				RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
216 			}
217 
218 			if (++n == IP_VS_MH_TAB_SIZE)
219 				goto out;
220 
221 			if (++dt_count >= ds->turns) {
222 				dt_count = 0;
223 				p = p->next;
224 				ds++;
225 			}
226 		}
227 	}
228 
229 out:
230 	kfree(table);
231 	return 0;
232 }
233 
234 /* Get ip_vs_dest associated with supplied parameters. */
235 static inline struct ip_vs_dest *
ip_vs_mh_get(struct ip_vs_service * svc,struct ip_vs_mh_state * s,const union nf_inet_addr * addr,__be16 port)236 ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
237 	     const union nf_inet_addr *addr, __be16 port)
238 {
239 	unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
240 					     % IP_VS_MH_TAB_SIZE;
241 	struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
242 
243 	return (!dest || is_unavailable(dest)) ? NULL : dest;
244 }
245 
246 /* As ip_vs_mh_get, but with fallback if selected server is unavailable */
247 static inline struct ip_vs_dest *
ip_vs_mh_get_fallback(struct ip_vs_service * svc,struct ip_vs_mh_state * s,const union nf_inet_addr * addr,__be16 port)248 ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
249 		      const union nf_inet_addr *addr, __be16 port)
250 {
251 	unsigned int offset, roffset;
252 	unsigned int hash, ihash;
253 	struct ip_vs_dest *dest;
254 
255 	/* First try the dest it's supposed to go to */
256 	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
257 				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
258 	dest = rcu_dereference(s->lookup[ihash].dest);
259 	if (!dest)
260 		return NULL;
261 	if (!is_unavailable(dest))
262 		return dest;
263 
264 	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
265 		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
266 
267 	/* If the original dest is unavailable, loop around the table
268 	 * starting from ihash to find a new dest
269 	 */
270 	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
271 		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
272 		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
273 					roffset) % IP_VS_MH_TAB_SIZE;
274 		dest = rcu_dereference(s->lookup[hash].dest);
275 		if (!dest)
276 			break;
277 		if (!is_unavailable(dest))
278 			return dest;
279 		IP_VS_DBG_BUF(6,
280 			      "MH: selected unavailable server %s:%u (offset %u), reselecting",
281 			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
282 			      ntohs(dest->port), roffset);
283 	}
284 
285 	return NULL;
286 }
287 
288 /* Assign all the hash buckets of the specified table with the service. */
ip_vs_mh_reassign(struct ip_vs_mh_state * s,struct ip_vs_service * svc)289 static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
290 			     struct ip_vs_service *svc)
291 {
292 	int ret;
293 
294 	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
295 		return -EINVAL;
296 
297 	if (svc->num_dests >= 1) {
298 		s->dest_setup = kcalloc(svc->num_dests,
299 					sizeof(struct ip_vs_mh_dest_setup),
300 					GFP_KERNEL);
301 		if (!s->dest_setup)
302 			return -ENOMEM;
303 	}
304 
305 	ip_vs_mh_permutate(s, svc);
306 
307 	ret = ip_vs_mh_populate(s, svc);
308 	if (ret < 0)
309 		goto out;
310 
311 	IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
312 		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
313 		      ntohs(svc->port));
314 
315 out:
316 	if (svc->num_dests >= 1) {
317 		kfree(s->dest_setup);
318 		s->dest_setup = NULL;
319 	}
320 	return ret;
321 }
322 
ip_vs_mh_gcd_weight(struct ip_vs_service * svc)323 static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
324 {
325 	struct ip_vs_dest *dest;
326 	int weight;
327 	int g = 0;
328 
329 	list_for_each_entry(dest, &svc->destinations, n_list) {
330 		weight = atomic_read(&dest->last_weight);
331 		if (weight > 0) {
332 			if (g > 0)
333 				g = gcd(weight, g);
334 			else
335 				g = weight;
336 		}
337 	}
338 	return g;
339 }
340 
341 /* To avoid assigning huge weight for the MH table,
342  * calculate shift value with gcd.
343  */
ip_vs_mh_shift_weight(struct ip_vs_service * svc,int gcd)344 static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
345 {
346 	struct ip_vs_dest *dest;
347 	int new_weight, weight = 0;
348 	int mw, shift;
349 
350 	/* If gcd is smaller then 1, number of dests or
351 	 * all last_weight of dests are zero. So, return
352 	 * shift value as zero.
353 	 */
354 	if (gcd < 1)
355 		return 0;
356 
357 	list_for_each_entry(dest, &svc->destinations, n_list) {
358 		new_weight = atomic_read(&dest->last_weight);
359 		if (new_weight > weight)
360 			weight = new_weight;
361 	}
362 
363 	/* Because gcd is greater than zero,
364 	 * the maximum weight and gcd are always greater than zero
365 	 */
366 	mw = weight / gcd;
367 
368 	/* shift = occupied bits of weight/gcd - MH highest bits */
369 	shift = fls(mw) - IP_VS_MH_TAB_BITS;
370 	return (shift >= 0) ? shift : 0;
371 }
372 
ip_vs_mh_state_free(struct rcu_head * head)373 static void ip_vs_mh_state_free(struct rcu_head *head)
374 {
375 	struct ip_vs_mh_state *s;
376 
377 	s = container_of(head, struct ip_vs_mh_state, rcu_head);
378 	kfree(s->lookup);
379 	kfree(s);
380 }
381 
ip_vs_mh_init_svc(struct ip_vs_service * svc)382 static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
383 {
384 	int ret;
385 	struct ip_vs_mh_state *s;
386 
387 	/* Allocate the MH table for this service */
388 	s = kzalloc(sizeof(*s), GFP_KERNEL);
389 	if (!s)
390 		return -ENOMEM;
391 
392 	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
393 			    GFP_KERNEL);
394 	if (!s->lookup) {
395 		kfree(s);
396 		return -ENOMEM;
397 	}
398 
399 	generate_hash_secret(&s->hash1, &s->hash2);
400 	s->gcd = ip_vs_mh_gcd_weight(svc);
401 	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
402 
403 	IP_VS_DBG(6,
404 		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
405 		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
406 
407 	/* Assign the lookup table with current dests */
408 	ret = ip_vs_mh_reassign(s, svc);
409 	if (ret < 0) {
410 		ip_vs_mh_reset(s);
411 		ip_vs_mh_state_free(&s->rcu_head);
412 		return ret;
413 	}
414 
415 	/* No more failures, attach state */
416 	svc->sched_data = s;
417 	return 0;
418 }
419 
ip_vs_mh_done_svc(struct ip_vs_service * svc)420 static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
421 {
422 	struct ip_vs_mh_state *s = svc->sched_data;
423 
424 	/* Got to clean up lookup entry here */
425 	ip_vs_mh_reset(s);
426 
427 	call_rcu(&s->rcu_head, ip_vs_mh_state_free);
428 	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
429 		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
430 }
431 
ip_vs_mh_dest_changed(struct ip_vs_service * svc,struct ip_vs_dest * dest)432 static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
433 				 struct ip_vs_dest *dest)
434 {
435 	struct ip_vs_mh_state *s = svc->sched_data;
436 
437 	s->gcd = ip_vs_mh_gcd_weight(svc);
438 	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
439 
440 	/* Assign the lookup table with the updated service */
441 	return ip_vs_mh_reassign(s, svc);
442 }
443 
444 /* Helper function to get port number */
445 static inline __be16
ip_vs_mh_get_port(const struct sk_buff * skb,struct ip_vs_iphdr * iph)446 ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
447 {
448 	__be16 _ports[2], *ports;
449 
450 	/* At this point we know that we have a valid packet of some kind.
451 	 * Because ICMP packets are only guaranteed to have the first 8
452 	 * bytes, let's just grab the ports.  Fortunately they're in the
453 	 * same position for all three of the protocols we care about.
454 	 */
455 	switch (iph->protocol) {
456 	case IPPROTO_TCP:
457 	case IPPROTO_UDP:
458 	case IPPROTO_SCTP:
459 		ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
460 					   &_ports);
461 		if (unlikely(!ports))
462 			return 0;
463 
464 		if (likely(!ip_vs_iph_inverse(iph)))
465 			return ports[0];
466 		else
467 			return ports[1];
468 	default:
469 		return 0;
470 	}
471 }
472 
473 /* Maglev Hashing scheduling */
474 static struct ip_vs_dest *
ip_vs_mh_schedule(struct ip_vs_service * svc,const struct sk_buff * skb,struct ip_vs_iphdr * iph)475 ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
476 		  struct ip_vs_iphdr *iph)
477 {
478 	struct ip_vs_dest *dest;
479 	struct ip_vs_mh_state *s;
480 	__be16 port = 0;
481 	const union nf_inet_addr *hash_addr;
482 
483 	hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
484 
485 	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
486 
487 	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
488 		port = ip_vs_mh_get_port(skb, iph);
489 
490 	s = (struct ip_vs_mh_state *)svc->sched_data;
491 
492 	if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
493 		dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
494 	else
495 		dest = ip_vs_mh_get(svc, s, hash_addr, port);
496 
497 	if (!dest) {
498 		ip_vs_scheduler_err(svc, "no destination available");
499 		return NULL;
500 	}
501 
502 	IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
503 		      IP_VS_DBG_ADDR(svc->af, hash_addr),
504 		      ntohs(port),
505 		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
506 		      ntohs(dest->port));
507 
508 	return dest;
509 }
510 
511 /* IPVS MH Scheduler structure */
512 static struct ip_vs_scheduler ip_vs_mh_scheduler = {
513 	.name =			"mh",
514 	.refcnt =		ATOMIC_INIT(0),
515 	.module =		THIS_MODULE,
516 	.n_list	 =		LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
517 	.init_service =		ip_vs_mh_init_svc,
518 	.done_service =		ip_vs_mh_done_svc,
519 	.add_dest =		ip_vs_mh_dest_changed,
520 	.del_dest =		ip_vs_mh_dest_changed,
521 	.upd_dest =		ip_vs_mh_dest_changed,
522 	.schedule =		ip_vs_mh_schedule,
523 };
524 
ip_vs_mh_init(void)525 static int __init ip_vs_mh_init(void)
526 {
527 	return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
528 }
529 
/* Module exit: unregister the scheduler, then wait for outstanding RCU
 * callbacks. ip_vs_mh_done_svc() queues ip_vs_mh_state_free() with
 * call_rcu(), so the barrier must come after unregistering.
 */
static void __exit ip_vs_mh_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
	rcu_barrier();
}
535 
/* Module entry/exit points and metadata */
module_init(ip_vs_mh_init);
module_exit(ip_vs_mh_cleanup);
MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
541