1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop rate
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_bulk_flow_count;
142 	u16 dsthost_bulk_flow_count;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum_prio;
177 	u16	tin_quantum_band;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
us_to_ns(u64 us)278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
get_cobalt_cb(const struct sk_buff * skb)283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
cobalt_get_enqueue_time(const struct sk_buff * skb)289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 5, 1, 2, 4, 2, 2, 2,
317 	0, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 2, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 0, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363 
364 #define REC_INV_SQRT_CACHE (16)
365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366 
367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369  *
370  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371  */
372 
cobalt_newton_step(struct cobalt_vars * vars)373 static void cobalt_newton_step(struct cobalt_vars *vars)
374 {
375 	u32 invsqrt, invsqrt2;
376 	u64 val;
377 
378 	invsqrt = vars->rec_inv_sqrt;
379 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381 
382 	val >>= 2; /* avoid overflow in following multiply */
383 	val = (val * invsqrt) >> (32 - 2 + 1);
384 
385 	vars->rec_inv_sqrt = val;
386 }
387 
cobalt_invsqrt(struct cobalt_vars * vars)388 static void cobalt_invsqrt(struct cobalt_vars *vars)
389 {
390 	if (vars->count < REC_INV_SQRT_CACHE)
391 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392 	else
393 		cobalt_newton_step(vars);
394 }
395 
396 /* There is a big difference in timing between the accurate values placed in
397  * the cache and the approximations given by a single Newton step for small
398  * count values, particularly when stepping from count 1 to 2 or vice versa.
399  * Above 16, a single Newton step gives sufficient accuracy in either
400  * direction, given the precision stored.
401  *
402  * The magnitude of the error when stepping up to count 2 is such as to give
403  * the value that *should* have been produced at count 4.
404  */
405 
cobalt_cache_init(void)406 static void cobalt_cache_init(void)
407 {
408 	struct cobalt_vars v;
409 
410 	memset(&v, 0, sizeof(v));
411 	v.rec_inv_sqrt = ~0U;
412 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413 
414 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 		cobalt_newton_step(&v);
419 
420 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421 	}
422 }
423 
cobalt_vars_init(struct cobalt_vars * vars)424 static void cobalt_vars_init(struct cobalt_vars *vars)
425 {
426 	memset(vars, 0, sizeof(*vars));
427 
428 	if (!cobalt_rec_inv_sqrt_cache[0]) {
429 		cobalt_cache_init();
430 		cobalt_rec_inv_sqrt_cache[0] = ~0;
431 	}
432 }
433 
434 /* CoDel control_law is t + interval/sqrt(count)
435  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436  * both sqrt() and divide operation.
437  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)438 static ktime_t cobalt_control(ktime_t t,
439 			      u64 interval,
440 			      u32 rec_inv_sqrt)
441 {
442 	return ktime_add_ns(t, reciprocal_scale(interval,
443 						rec_inv_sqrt));
444 }
445 
446 /* Call this when a packet had to be dropped due to queue overflow.  Returns
447  * true if the BLUE state was quiescent before but active after this call.
448  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)449 static bool cobalt_queue_full(struct cobalt_vars *vars,
450 			      struct cobalt_params *p,
451 			      ktime_t now)
452 {
453 	bool up = false;
454 
455 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456 		up = !vars->p_drop;
457 		vars->p_drop += p->p_inc;
458 		if (vars->p_drop < p->p_inc)
459 			vars->p_drop = ~0;
460 		vars->blue_timer = now;
461 	}
462 	vars->dropping = true;
463 	vars->drop_next = now;
464 	if (!vars->count)
465 		vars->count = 1;
466 
467 	return up;
468 }
469 
470 /* Call this when the queue was serviced but turned out to be empty.  Returns
471  * true if the BLUE state was active before but quiescent after this call.
472  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)473 static bool cobalt_queue_empty(struct cobalt_vars *vars,
474 			       struct cobalt_params *p,
475 			       ktime_t now)
476 {
477 	bool down = false;
478 
479 	if (vars->p_drop &&
480 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481 		if (vars->p_drop < p->p_dec)
482 			vars->p_drop = 0;
483 		else
484 			vars->p_drop -= p->p_dec;
485 		vars->blue_timer = now;
486 		down = !vars->p_drop;
487 	}
488 	vars->dropping = false;
489 
490 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491 		vars->count--;
492 		cobalt_invsqrt(vars);
493 		vars->drop_next = cobalt_control(vars->drop_next,
494 						 p->interval,
495 						 vars->rec_inv_sqrt);
496 	}
497 
498 	return down;
499 }
500 
501 /* Call this with a freshly dequeued packet for possible congestion marking.
502  * Returns true as an instruction to drop the packet, false for delivery.
503  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)504 static bool cobalt_should_drop(struct cobalt_vars *vars,
505 			       struct cobalt_params *p,
506 			       ktime_t now,
507 			       struct sk_buff *skb,
508 			       u32 bulk_flows)
509 {
510 	bool next_due, over_target, drop = false;
511 	ktime_t schedule;
512 	u64 sojourn;
513 
514 /* The 'schedule' variable records, in its sign, whether 'now' is before or
515  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516  * scheduling decision is actually branched, without destroying that
517  * information.  Similarly, the first 'schedule' value calculated is preserved
518  * in the boolean 'next_due'.
519  *
520  * As for 'drop_next', we take advantage of the fact that 'interval' is both
521  * the delay between first exceeding 'target' and the first signalling event,
522  * *and* the scaling factor for the signalling frequency.  It's therefore very
523  * natural to use a single mechanism for both purposes, and eliminates a
524  * significant amount of reference Codel's spaghetti code.  To help with this,
525  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526  * as possible to 1.0 in fixed-point.
527  */
528 
529 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530 	schedule = ktime_sub(now, vars->drop_next);
531 	over_target = sojourn > p->target &&
532 		      sojourn > p->mtu_time * bulk_flows * 2 &&
533 		      sojourn > p->mtu_time * 4;
534 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535 
536 	vars->ecn_marked = false;
537 
538 	if (over_target) {
539 		if (!vars->dropping) {
540 			vars->dropping = true;
541 			vars->drop_next = cobalt_control(now,
542 							 p->interval,
543 							 vars->rec_inv_sqrt);
544 		}
545 		if (!vars->count)
546 			vars->count = 1;
547 	} else if (vars->dropping) {
548 		vars->dropping = false;
549 	}
550 
551 	if (next_due && vars->dropping) {
552 		/* Use ECN mark if possible, otherwise drop */
553 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554 
555 		vars->count++;
556 		if (!vars->count)
557 			vars->count--;
558 		cobalt_invsqrt(vars);
559 		vars->drop_next = cobalt_control(vars->drop_next,
560 						 p->interval,
561 						 vars->rec_inv_sqrt);
562 		schedule = ktime_sub(now, vars->drop_next);
563 	} else {
564 		while (next_due) {
565 			vars->count--;
566 			cobalt_invsqrt(vars);
567 			vars->drop_next = cobalt_control(vars->drop_next,
568 							 p->interval,
569 							 vars->rec_inv_sqrt);
570 			schedule = ktime_sub(now, vars->drop_next);
571 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 		}
573 	}
574 
575 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
576 	if (vars->p_drop)
577 		drop |= (prandom_u32() < vars->p_drop);
578 
579 	/* Overload the drop_next field as an activity timeout */
580 	if (!vars->count)
581 		vars->drop_next = ktime_add_ns(now, p->interval);
582 	else if (ktime_to_ns(schedule) > 0 && !drop)
583 		vars->drop_next = now;
584 
585 	return drop;
586 }
587 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)588 static void cake_update_flowkeys(struct flow_keys *keys,
589 				 const struct sk_buff *skb)
590 {
591 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592 	struct nf_conntrack_tuple tuple = {};
593 	bool rev = !skb->_nfct;
594 
595 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
596 		return;
597 
598 	if (!nf_ct_get_tuple_skb(&tuple, skb))
599 		return;
600 
601 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
603 
604 	if (keys->ports.ports) {
605 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
606 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
607 	}
608 #endif
609 }
610 
611 /* Cake has several subtle multiple bit settings. In these cases you
612  *  would be matching triple isolate mode as well.
613  */
614 
cake_dsrc(int flow_mode)615 static bool cake_dsrc(int flow_mode)
616 {
617 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
618 }
619 
cake_ddst(int flow_mode)620 static bool cake_ddst(int flow_mode)
621 {
622 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
623 }
624 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)625 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
626 		     int flow_mode, u16 flow_override, u16 host_override)
627 {
628 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
629 	u16 reduced_hash, srchost_idx, dsthost_idx;
630 	struct flow_keys keys, host_keys;
631 
632 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
633 		return 0;
634 
635 	/* If both overrides are set we can skip packet dissection entirely */
636 	if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
637 	    (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
638 		goto skip_hash;
639 
640 	skb_flow_dissect_flow_keys(skb, &keys,
641 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
642 
643 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
644 		cake_update_flowkeys(&keys, skb);
645 
646 	/* flow_hash_from_keys() sorts the addresses by value, so we have
647 	 * to preserve their order in a separate data structure to treat
648 	 * src and dst host addresses as independently selectable.
649 	 */
650 	host_keys = keys;
651 	host_keys.ports.ports     = 0;
652 	host_keys.basic.ip_proto  = 0;
653 	host_keys.keyid.keyid     = 0;
654 	host_keys.tags.flow_label = 0;
655 
656 	switch (host_keys.control.addr_type) {
657 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
658 		host_keys.addrs.v4addrs.src = 0;
659 		dsthost_hash = flow_hash_from_keys(&host_keys);
660 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
661 		host_keys.addrs.v4addrs.dst = 0;
662 		srchost_hash = flow_hash_from_keys(&host_keys);
663 		break;
664 
665 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
666 		memset(&host_keys.addrs.v6addrs.src, 0,
667 		       sizeof(host_keys.addrs.v6addrs.src));
668 		dsthost_hash = flow_hash_from_keys(&host_keys);
669 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
670 		memset(&host_keys.addrs.v6addrs.dst, 0,
671 		       sizeof(host_keys.addrs.v6addrs.dst));
672 		srchost_hash = flow_hash_from_keys(&host_keys);
673 		break;
674 
675 	default:
676 		dsthost_hash = 0;
677 		srchost_hash = 0;
678 	}
679 
680 	/* This *must* be after the above switch, since as a
681 	 * side-effect it sorts the src and dst addresses.
682 	 */
683 	if (flow_mode & CAKE_FLOW_FLOWS)
684 		flow_hash = flow_hash_from_keys(&keys);
685 
686 skip_hash:
687 	if (flow_override)
688 		flow_hash = flow_override - 1;
689 	if (host_override) {
690 		dsthost_hash = host_override - 1;
691 		srchost_hash = host_override - 1;
692 	}
693 
694 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
695 		if (flow_mode & CAKE_FLOW_SRC_IP)
696 			flow_hash ^= srchost_hash;
697 
698 		if (flow_mode & CAKE_FLOW_DST_IP)
699 			flow_hash ^= dsthost_hash;
700 	}
701 
702 	reduced_hash = flow_hash % CAKE_QUEUES;
703 
704 	/* set-associative hashing */
705 	/* fast path if no hash collision (direct lookup succeeds) */
706 	if (likely(q->tags[reduced_hash] == flow_hash &&
707 		   q->flows[reduced_hash].set)) {
708 		q->way_directs++;
709 	} else {
710 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
711 		u32 outer_hash = reduced_hash - inner_hash;
712 		bool allocate_src = false;
713 		bool allocate_dst = false;
714 		u32 i, k;
715 
716 		/* check if any active queue in the set is reserved for
717 		 * this flow.
718 		 */
719 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
720 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
721 			if (q->tags[outer_hash + k] == flow_hash) {
722 				if (i)
723 					q->way_hits++;
724 
725 				if (!q->flows[outer_hash + k].set) {
726 					/* need to increment host refcnts */
727 					allocate_src = cake_dsrc(flow_mode);
728 					allocate_dst = cake_ddst(flow_mode);
729 				}
730 
731 				goto found;
732 			}
733 		}
734 
735 		/* no queue is reserved for this flow, look for an
736 		 * empty one.
737 		 */
738 		for (i = 0; i < CAKE_SET_WAYS;
739 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
740 			if (!q->flows[outer_hash + k].set) {
741 				q->way_misses++;
742 				allocate_src = cake_dsrc(flow_mode);
743 				allocate_dst = cake_ddst(flow_mode);
744 				goto found;
745 			}
746 		}
747 
748 		/* With no empty queues, default to the original
749 		 * queue, accept the collision, update the host tags.
750 		 */
751 		q->way_collisions++;
752 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
753 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
754 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
755 		}
756 		allocate_src = cake_dsrc(flow_mode);
757 		allocate_dst = cake_ddst(flow_mode);
758 found:
759 		/* reserve queue for future packets in same flow */
760 		reduced_hash = outer_hash + k;
761 		q->tags[reduced_hash] = flow_hash;
762 
763 		if (allocate_src) {
764 			srchost_idx = srchost_hash % CAKE_QUEUES;
765 			inner_hash = srchost_idx % CAKE_SET_WAYS;
766 			outer_hash = srchost_idx - inner_hash;
767 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
768 				i++, k = (k + 1) % CAKE_SET_WAYS) {
769 				if (q->hosts[outer_hash + k].srchost_tag ==
770 				    srchost_hash)
771 					goto found_src;
772 			}
773 			for (i = 0; i < CAKE_SET_WAYS;
774 				i++, k = (k + 1) % CAKE_SET_WAYS) {
775 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
776 					break;
777 			}
778 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
779 found_src:
780 			srchost_idx = outer_hash + k;
781 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
782 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
783 			q->flows[reduced_hash].srchost = srchost_idx;
784 		}
785 
786 		if (allocate_dst) {
787 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
788 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
789 			outer_hash = dsthost_idx - inner_hash;
790 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
791 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
792 				if (q->hosts[outer_hash + k].dsthost_tag ==
793 				    dsthost_hash)
794 					goto found_dst;
795 			}
796 			for (i = 0; i < CAKE_SET_WAYS;
797 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
798 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
799 					break;
800 			}
801 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
802 found_dst:
803 			dsthost_idx = outer_hash + k;
804 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
805 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
806 			q->flows[reduced_hash].dsthost = dsthost_idx;
807 		}
808 	}
809 
810 	return reduced_hash;
811 }
812 
813 /* helper functions : might be changed when/if skb use a standard list_head */
814 /* remove one skb from head of slot queue */
815 
dequeue_head(struct cake_flow * flow)816 static struct sk_buff *dequeue_head(struct cake_flow *flow)
817 {
818 	struct sk_buff *skb = flow->head;
819 
820 	if (skb) {
821 		flow->head = skb->next;
822 		skb_mark_not_on_list(skb);
823 	}
824 
825 	return skb;
826 }
827 
828 /* add skb to flow queue (tail add) */
829 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)830 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
831 {
832 	if (!flow->head)
833 		flow->head = skb;
834 	else
835 		flow->tail->next = skb;
836 	flow->tail = skb;
837 	skb->next = NULL;
838 }
839 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)840 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
841 				    struct ipv6hdr *buf)
842 {
843 	unsigned int offset = skb_network_offset(skb);
844 	struct iphdr *iph;
845 
846 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
847 
848 	if (!iph)
849 		return NULL;
850 
851 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
852 		return skb_header_pointer(skb, offset + iph->ihl * 4,
853 					  sizeof(struct ipv6hdr), buf);
854 
855 	else if (iph->version == 4)
856 		return iph;
857 
858 	else if (iph->version == 6)
859 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
860 					  buf);
861 
862 	return NULL;
863 }
864 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)865 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
866 				      void *buf, unsigned int bufsize)
867 {
868 	unsigned int offset = skb_network_offset(skb);
869 	const struct ipv6hdr *ipv6h;
870 	const struct tcphdr *tcph;
871 	const struct iphdr *iph;
872 	struct ipv6hdr _ipv6h;
873 	struct tcphdr _tcph;
874 
875 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
876 
877 	if (!ipv6h)
878 		return NULL;
879 
880 	if (ipv6h->version == 4) {
881 		iph = (struct iphdr *)ipv6h;
882 		offset += iph->ihl * 4;
883 
884 		/* special-case 6in4 tunnelling, as that is a common way to get
885 		 * v6 connectivity in the home
886 		 */
887 		if (iph->protocol == IPPROTO_IPV6) {
888 			ipv6h = skb_header_pointer(skb, offset,
889 						   sizeof(_ipv6h), &_ipv6h);
890 
891 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
892 				return NULL;
893 
894 			offset += sizeof(struct ipv6hdr);
895 
896 		} else if (iph->protocol != IPPROTO_TCP) {
897 			return NULL;
898 		}
899 
900 	} else if (ipv6h->version == 6) {
901 		if (ipv6h->nexthdr != IPPROTO_TCP)
902 			return NULL;
903 
904 		offset += sizeof(struct ipv6hdr);
905 	} else {
906 		return NULL;
907 	}
908 
909 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
910 	if (!tcph)
911 		return NULL;
912 
913 	return skb_header_pointer(skb, offset,
914 				  min(__tcp_hdrlen(tcph), bufsize), buf);
915 }
916 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)917 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
918 				   int code, int *oplen)
919 {
920 	/* inspired by tcp_parse_options in tcp_input.c */
921 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
922 	const u8 *ptr = (const u8 *)(tcph + 1);
923 
924 	while (length > 0) {
925 		int opcode = *ptr++;
926 		int opsize;
927 
928 		if (opcode == TCPOPT_EOL)
929 			break;
930 		if (opcode == TCPOPT_NOP) {
931 			length--;
932 			continue;
933 		}
934 		opsize = *ptr++;
935 		if (opsize < 2 || opsize > length)
936 			break;
937 
938 		if (opcode == code) {
939 			*oplen = opsize;
940 			return ptr;
941 		}
942 
943 		ptr += opsize - 2;
944 		length -= opsize;
945 	}
946 
947 	return NULL;
948 }
949 
950 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
951  * bytes than the other. In the case where both sequences ACKs bytes that the
952  * other doesn't, A is considered greater. DSACKs in A also makes A be
953  * considered greater.
954  *
955  * @return -1, 0 or 1 as normal compare functions
956  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)957 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
958 				  const struct tcphdr *tcph_b)
959 {
960 	const struct tcp_sack_block_wire *sack_a, *sack_b;
961 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
962 	u32 bytes_a = 0, bytes_b = 0;
963 	int oplen_a, oplen_b;
964 	bool first = true;
965 
966 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
967 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
968 
969 	/* pointers point to option contents */
970 	oplen_a -= TCPOLEN_SACK_BASE;
971 	oplen_b -= TCPOLEN_SACK_BASE;
972 
973 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
974 	    (!sack_b || oplen_b < sizeof(*sack_b)))
975 		return -1;
976 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
977 		 (!sack_a || oplen_a < sizeof(*sack_a)))
978 		return 1;
979 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
980 		 (!sack_b || oplen_b < sizeof(*sack_b)))
981 		return 0;
982 
983 	while (oplen_a >= sizeof(*sack_a)) {
984 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
985 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
986 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
987 		int oplen_tmp = oplen_b;
988 		bool found = false;
989 
990 		/* DSACK; always considered greater to prevent dropping */
991 		if (before(start_a, ack_seq_a))
992 			return -1;
993 
994 		bytes_a += end_a - start_a;
995 
996 		while (oplen_tmp >= sizeof(*sack_tmp)) {
997 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
998 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
999 
1000 			/* first time through we count the total size */
1001 			if (first)
1002 				bytes_b += end_b - start_b;
1003 
1004 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1005 				found = true;
1006 				if (!first)
1007 					break;
1008 			}
1009 			oplen_tmp -= sizeof(*sack_tmp);
1010 			sack_tmp++;
1011 		}
1012 
1013 		if (!found)
1014 			return -1;
1015 
1016 		oplen_a -= sizeof(*sack_a);
1017 		sack_a++;
1018 		first = false;
1019 	}
1020 
1021 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1022 	 * either the SACKs are equal, or B SACKs more bytes.
1023 	 */
1024 	return bytes_b > bytes_a ? 1 : 0;
1025 }
1026 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1027 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1028 				 u32 *tsval, u32 *tsecr)
1029 {
1030 	const u8 *ptr;
1031 	int opsize;
1032 
1033 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1034 
1035 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1036 		*tsval = get_unaligned_be32(ptr);
1037 		*tsecr = get_unaligned_be32(ptr + 4);
1038 	}
1039 }
1040 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1041 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1042 			       u32 tstamp_new, u32 tsecr_new)
1043 {
1044 	/* inspired by tcp_parse_options in tcp_input.c */
1045 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1046 	const u8 *ptr = (const u8 *)(tcph + 1);
1047 	u32 tstamp, tsecr;
1048 
1049 	/* 3 reserved flags must be unset to avoid future breakage
1050 	 * ACK must be set
1051 	 * ECE/CWR are handled separately
1052 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1053 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1054 	 * 0x00C00000 = CWR/ECE (handled separately)
1055 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1056 	 */
1057 	if (((tcp_flag_word(tcph) &
1058 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1059 		return false;
1060 
1061 	while (length > 0) {
1062 		int opcode = *ptr++;
1063 		int opsize;
1064 
1065 		if (opcode == TCPOPT_EOL)
1066 			break;
1067 		if (opcode == TCPOPT_NOP) {
1068 			length--;
1069 			continue;
1070 		}
1071 		opsize = *ptr++;
1072 		if (opsize < 2 || opsize > length)
1073 			break;
1074 
1075 		switch (opcode) {
1076 		case TCPOPT_MD5SIG: /* doesn't influence state */
1077 			break;
1078 
1079 		case TCPOPT_SACK: /* stricter checking performed later */
1080 			if (opsize % 8 != 2)
1081 				return false;
1082 			break;
1083 
1084 		case TCPOPT_TIMESTAMP:
1085 			/* only drop timestamps lower than new */
1086 			if (opsize != TCPOLEN_TIMESTAMP)
1087 				return false;
1088 			tstamp = get_unaligned_be32(ptr);
1089 			tsecr = get_unaligned_be32(ptr + 4);
1090 			if (after(tstamp, tstamp_new) ||
1091 			    after(tsecr, tsecr_new))
1092 				return false;
1093 			break;
1094 
1095 		case TCPOPT_MSS:  /* these should only be set on SYN */
1096 		case TCPOPT_WINDOW:
1097 		case TCPOPT_SACK_PERM:
1098 		case TCPOPT_FASTOPEN:
1099 		case TCPOPT_EXP:
1100 		default: /* don't drop if any unknown options are present */
1101 			return false;
1102 		}
1103 
1104 		ptr += opsize - 2;
1105 		length -= opsize;
1106 	}
1107 
1108 	return true;
1109 }
1110 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1111 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1112 				       struct cake_flow *flow)
1113 {
1114 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1115 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1116 	struct sk_buff *skb_check, *skb_prev = NULL;
1117 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1118 	unsigned char _tcph[64], _tcph_check[64];
1119 	const struct tcphdr *tcph, *tcph_check;
1120 	const struct iphdr *iph, *iph_check;
1121 	struct ipv6hdr _iph, _iph_check;
1122 	const struct sk_buff *skb;
1123 	int seglen, num_found = 0;
1124 	u32 tstamp = 0, tsecr = 0;
1125 	__be32 elig_flags = 0;
1126 	int sack_comp;
1127 
1128 	/* no other possible ACKs to filter */
1129 	if (flow->head == flow->tail)
1130 		return NULL;
1131 
1132 	skb = flow->tail;
1133 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1134 	iph = cake_get_iphdr(skb, &_iph);
1135 	if (!tcph)
1136 		return NULL;
1137 
1138 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1139 
1140 	/* the 'triggering' packet need only have the ACK flag set.
1141 	 * also check that SYN is not set, as there won't be any previous ACKs.
1142 	 */
1143 	if ((tcp_flag_word(tcph) &
1144 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1145 		return NULL;
1146 
1147 	/* the 'triggering' ACK is at the tail of the queue, we have already
1148 	 * returned if it is the only packet in the flow. loop through the rest
1149 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1150 	 * triggering one.
1151 	 */
1152 	for (skb_check = flow->head;
1153 	     skb_check && skb_check != skb;
1154 	     skb_prev = skb_check, skb_check = skb_check->next) {
1155 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1156 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1157 					     sizeof(_tcph_check));
1158 
1159 		/* only TCP packets with matching 5-tuple are eligible, and only
1160 		 * drop safe headers
1161 		 */
1162 		if (!tcph_check || iph->version != iph_check->version ||
1163 		    tcph_check->source != tcph->source ||
1164 		    tcph_check->dest != tcph->dest)
1165 			continue;
1166 
1167 		if (iph_check->version == 4) {
1168 			if (iph_check->saddr != iph->saddr ||
1169 			    iph_check->daddr != iph->daddr)
1170 				continue;
1171 
1172 			seglen = ntohs(iph_check->tot_len) -
1173 				       (4 * iph_check->ihl);
1174 		} else if (iph_check->version == 6) {
1175 			ipv6h = (struct ipv6hdr *)iph;
1176 			ipv6h_check = (struct ipv6hdr *)iph_check;
1177 
1178 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1179 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1180 				continue;
1181 
1182 			seglen = ntohs(ipv6h_check->payload_len);
1183 		} else {
1184 			WARN_ON(1);  /* shouldn't happen */
1185 			continue;
1186 		}
1187 
1188 		/* If the ECE/CWR flags changed from the previous eligible
1189 		 * packet in the same flow, we should no longer be dropping that
1190 		 * previous packet as this would lose information.
1191 		 */
1192 		if (elig_ack && (tcp_flag_word(tcph_check) &
1193 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1194 			elig_ack = NULL;
1195 			elig_ack_prev = NULL;
1196 			num_found--;
1197 		}
1198 
1199 		/* Check TCP options and flags, don't drop ACKs with segment
1200 		 * data, and don't drop ACKs with a higher cumulative ACK
1201 		 * counter than the triggering packet. Check ACK seqno here to
1202 		 * avoid parsing SACK options of packets we are going to exclude
1203 		 * anyway.
1204 		 */
1205 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1206 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1207 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1208 			continue;
1209 
1210 		/* Check SACK options. The triggering packet must SACK more data
1211 		 * than the ACK under consideration, or SACK the same range but
1212 		 * have a larger cumulative ACK counter. The latter is a
1213 		 * pathological case, but is contained in the following check
1214 		 * anyway, just to be safe.
1215 		 */
1216 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1217 
1218 		if (sack_comp < 0 ||
1219 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1220 		     sack_comp == 0))
1221 			continue;
1222 
1223 		/* At this point we have found an eligible pure ACK to drop; if
1224 		 * we are in aggressive mode, we are done. Otherwise, keep
1225 		 * searching unless this is the second eligible ACK we
1226 		 * found.
1227 		 *
1228 		 * Since we want to drop ACK closest to the head of the queue,
1229 		 * save the first eligible ACK we find, even if we need to loop
1230 		 * again.
1231 		 */
1232 		if (!elig_ack) {
1233 			elig_ack = skb_check;
1234 			elig_ack_prev = skb_prev;
1235 			elig_flags = (tcp_flag_word(tcph_check)
1236 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1237 		}
1238 
1239 		if (num_found++ > 0)
1240 			goto found;
1241 	}
1242 
1243 	/* We made it through the queue without finding two eligible ACKs . If
1244 	 * we found a single eligible ACK we can drop it in aggressive mode if
1245 	 * we can guarantee that this does not interfere with ECN flag
1246 	 * information. We ensure this by dropping it only if the enqueued
1247 	 * packet is consecutive with the eligible ACK, and their flags match.
1248 	 */
1249 	if (elig_ack && aggressive && elig_ack->next == skb &&
1250 	    (elig_flags == (tcp_flag_word(tcph) &
1251 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1252 		goto found;
1253 
1254 	return NULL;
1255 
1256 found:
1257 	if (elig_ack_prev)
1258 		elig_ack_prev->next = elig_ack->next;
1259 	else
1260 		flow->head = elig_ack->next;
1261 
1262 	skb_mark_not_on_list(elig_ack);
1263 
1264 	return elig_ack;
1265 }
1266 
cake_ewma(u64 avg,u64 sample,u32 shift)1267 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1268 {
1269 	avg -= avg >> shift;
1270 	avg += sample >> shift;
1271 	return avg;
1272 }
1273 
cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1274 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1275 {
1276 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1277 		len -= off;
1278 
1279 	if (q->max_netlen < len)
1280 		q->max_netlen = len;
1281 	if (q->min_netlen > len)
1282 		q->min_netlen = len;
1283 
1284 	len += q->rate_overhead;
1285 
1286 	if (len < q->rate_mpu)
1287 		len = q->rate_mpu;
1288 
1289 	if (q->atm_mode == CAKE_ATM_ATM) {
1290 		len += 47;
1291 		len /= 48;
1292 		len *= 53;
1293 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1294 		/* Add one byte per 64 bytes or part thereof.
1295 		 * This is conservative and easier to calculate than the
1296 		 * precise value.
1297 		 */
1298 		len += (len + 63) / 64;
1299 	}
1300 
1301 	if (q->max_adjlen < len)
1302 		q->max_adjlen = len;
1303 	if (q->min_adjlen > len)
1304 		q->min_adjlen = len;
1305 
1306 	return len;
1307 }
1308 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1309 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1310 {
1311 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1312 	unsigned int hdr_len, last_len = 0;
1313 	u32 off = skb_network_offset(skb);
1314 	u32 len = qdisc_pkt_len(skb);
1315 	u16 segs = 1;
1316 
1317 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1318 
1319 	if (!shinfo->gso_size)
1320 		return cake_calc_overhead(q, len, off);
1321 
1322 	/* borrowed from qdisc_pkt_len_init() */
1323 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1324 
1325 	/* + transport layer */
1326 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1327 						SKB_GSO_TCPV6))) {
1328 		const struct tcphdr *th;
1329 		struct tcphdr _tcphdr;
1330 
1331 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1332 					sizeof(_tcphdr), &_tcphdr);
1333 		if (likely(th))
1334 			hdr_len += __tcp_hdrlen(th);
1335 	} else {
1336 		struct udphdr _udphdr;
1337 
1338 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1339 				       sizeof(_udphdr), &_udphdr))
1340 			hdr_len += sizeof(struct udphdr);
1341 	}
1342 
1343 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1344 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1345 				    shinfo->gso_size);
1346 	else
1347 		segs = shinfo->gso_segs;
1348 
1349 	len = shinfo->gso_size + hdr_len;
1350 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1351 
1352 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1353 		cake_calc_overhead(q, last_len, off));
1354 }
1355 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1356 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1357 {
1358 	struct cake_heap_entry ii = q->overflow_heap[i];
1359 	struct cake_heap_entry jj = q->overflow_heap[j];
1360 
1361 	q->overflow_heap[i] = jj;
1362 	q->overflow_heap[j] = ii;
1363 
1364 	q->tins[ii.t].overflow_idx[ii.b] = j;
1365 	q->tins[jj.t].overflow_idx[jj.b] = i;
1366 }
1367 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1368 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1369 {
1370 	struct cake_heap_entry ii = q->overflow_heap[i];
1371 
1372 	return q->tins[ii.t].backlogs[ii.b];
1373 }
1374 
cake_heapify(struct cake_sched_data * q,u16 i)1375 static void cake_heapify(struct cake_sched_data *q, u16 i)
1376 {
1377 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1378 	u32 mb = cake_heap_get_backlog(q, i);
1379 	u32 m = i;
1380 
1381 	while (m < a) {
1382 		u32 l = m + m + 1;
1383 		u32 r = l + 1;
1384 
1385 		if (l < a) {
1386 			u32 lb = cake_heap_get_backlog(q, l);
1387 
1388 			if (lb > mb) {
1389 				m  = l;
1390 				mb = lb;
1391 			}
1392 		}
1393 
1394 		if (r < a) {
1395 			u32 rb = cake_heap_get_backlog(q, r);
1396 
1397 			if (rb > mb) {
1398 				m  = r;
1399 				mb = rb;
1400 			}
1401 		}
1402 
1403 		if (m != i) {
1404 			cake_heap_swap(q, i, m);
1405 			i = m;
1406 		} else {
1407 			break;
1408 		}
1409 	}
1410 }
1411 
cake_heapify_up(struct cake_sched_data * q,u16 i)1412 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1413 {
1414 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1415 		u16 p = (i - 1) >> 1;
1416 		u32 ib = cake_heap_get_backlog(q, i);
1417 		u32 pb = cake_heap_get_backlog(q, p);
1418 
1419 		if (ib > pb) {
1420 			cake_heap_swap(q, i, p);
1421 			i = p;
1422 		} else {
1423 			break;
1424 		}
1425 	}
1426 }
1427 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1428 static int cake_advance_shaper(struct cake_sched_data *q,
1429 			       struct cake_tin_data *b,
1430 			       struct sk_buff *skb,
1431 			       ktime_t now, bool drop)
1432 {
1433 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1434 
1435 	/* charge packet bandwidth to this tin
1436 	 * and to the global shaper.
1437 	 */
1438 	if (q->rate_ns) {
1439 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1440 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1441 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1442 
1443 		if (ktime_before(b->time_next_packet, now))
1444 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1445 							   tin_dur);
1446 
1447 		else if (ktime_before(b->time_next_packet,
1448 				      ktime_add_ns(now, tin_dur)))
1449 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1450 
1451 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1452 						   global_dur);
1453 		if (!drop)
1454 			q->failsafe_next_packet = \
1455 				ktime_add_ns(q->failsafe_next_packet,
1456 					     failsafe_dur);
1457 	}
1458 	return len;
1459 }
1460 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1461 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1462 {
1463 	struct cake_sched_data *q = qdisc_priv(sch);
1464 	ktime_t now = ktime_get();
1465 	u32 idx = 0, tin = 0, len;
1466 	struct cake_heap_entry qq;
1467 	struct cake_tin_data *b;
1468 	struct cake_flow *flow;
1469 	struct sk_buff *skb;
1470 
1471 	if (!q->overflow_timeout) {
1472 		int i;
1473 		/* Build fresh max-heap */
1474 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1475 			cake_heapify(q, i);
1476 	}
1477 	q->overflow_timeout = 65535;
1478 
1479 	/* select longest queue for pruning */
1480 	qq  = q->overflow_heap[0];
1481 	tin = qq.t;
1482 	idx = qq.b;
1483 
1484 	b = &q->tins[tin];
1485 	flow = &b->flows[idx];
1486 	skb = dequeue_head(flow);
1487 	if (unlikely(!skb)) {
1488 		/* heap has gone wrong, rebuild it next time */
1489 		q->overflow_timeout = 0;
1490 		return idx + (tin << 16);
1491 	}
1492 
1493 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1494 		b->unresponsive_flow_count++;
1495 
1496 	len = qdisc_pkt_len(skb);
1497 	q->buffer_used      -= skb->truesize;
1498 	b->backlogs[idx]    -= len;
1499 	b->tin_backlog      -= len;
1500 	sch->qstats.backlog -= len;
1501 	qdisc_tree_reduce_backlog(sch, 1, len);
1502 
1503 	flow->dropped++;
1504 	b->tin_dropped++;
1505 	sch->qstats.drops++;
1506 
1507 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1508 		cake_advance_shaper(q, b, skb, now, true);
1509 
1510 	__qdisc_drop(skb, to_free);
1511 	sch->q.qlen--;
1512 
1513 	cake_heapify(q, 0);
1514 
1515 	return idx + (tin << 16);
1516 }
1517 
cake_handle_diffserv(struct sk_buff * skb,u16 wash)1518 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1519 {
1520 	int wlen = skb_network_offset(skb);
1521 	u8 dscp;
1522 
1523 	switch (tc_skb_protocol(skb)) {
1524 	case htons(ETH_P_IP):
1525 		wlen += sizeof(struct iphdr);
1526 		if (!pskb_may_pull(skb, wlen) ||
1527 		    skb_try_make_writable(skb, wlen))
1528 			return 0;
1529 
1530 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1531 		if (wash && dscp)
1532 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1533 		return dscp;
1534 
1535 	case htons(ETH_P_IPV6):
1536 		wlen += sizeof(struct ipv6hdr);
1537 		if (!pskb_may_pull(skb, wlen) ||
1538 		    skb_try_make_writable(skb, wlen))
1539 			return 0;
1540 
1541 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1542 		if (wash && dscp)
1543 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1544 		return dscp;
1545 
1546 	case htons(ETH_P_ARP):
1547 		return 0x38;  /* CS7 - Net Control */
1548 
1549 	default:
1550 		/* If there is no Diffserv field, treat as best-effort */
1551 		return 0;
1552 	}
1553 }
1554 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1555 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1556 					     struct sk_buff *skb)
1557 {
1558 	struct cake_sched_data *q = qdisc_priv(sch);
1559 	u32 tin, mark;
1560 	u8 dscp;
1561 
1562 	/* Tin selection: Default to diffserv-based selection, allow overriding
1563 	 * using firewall marks or skb->priority.
1564 	 */
1565 	dscp = cake_handle_diffserv(skb,
1566 				    q->rate_flags & CAKE_FLAG_WASH);
1567 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1568 
1569 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1570 		tin = 0;
1571 
1572 	else if (mark && mark <= q->tin_cnt)
1573 		tin = q->tin_order[mark - 1];
1574 
1575 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1576 		 TC_H_MIN(skb->priority) > 0 &&
1577 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1578 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1579 
1580 	else {
1581 		tin = q->tin_index[dscp];
1582 
1583 		if (unlikely(tin >= q->tin_cnt))
1584 			tin = 0;
1585 	}
1586 
1587 	return &q->tins[tin];
1588 }
1589 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1590 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1591 			 struct sk_buff *skb, int flow_mode, int *qerr)
1592 {
1593 	struct cake_sched_data *q = qdisc_priv(sch);
1594 	struct tcf_proto *filter;
1595 	struct tcf_result res;
1596 	u16 flow = 0, host = 0;
1597 	int result;
1598 
1599 	filter = rcu_dereference_bh(q->filter_list);
1600 	if (!filter)
1601 		goto hash;
1602 
1603 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1604 	result = tcf_classify(skb, filter, &res, false);
1605 
1606 	if (result >= 0) {
1607 #ifdef CONFIG_NET_CLS_ACT
1608 		switch (result) {
1609 		case TC_ACT_STOLEN:
1610 		case TC_ACT_QUEUED:
1611 		case TC_ACT_TRAP:
1612 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1613 			/* fall through */
1614 		case TC_ACT_SHOT:
1615 			return 0;
1616 		}
1617 #endif
1618 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1619 			flow = TC_H_MIN(res.classid);
1620 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1621 			host = TC_H_MAJ(res.classid) >> 16;
1622 	}
1623 hash:
1624 	*t = cake_select_tin(sch, skb);
1625 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1626 }
1627 
1628 static void cake_reconfigure(struct Qdisc *sch);
1629 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1630 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1631 			struct sk_buff **to_free)
1632 {
1633 	struct cake_sched_data *q = qdisc_priv(sch);
1634 	int len = qdisc_pkt_len(skb);
1635 	int uninitialized_var(ret);
1636 	struct sk_buff *ack = NULL;
1637 	ktime_t now = ktime_get();
1638 	struct cake_tin_data *b;
1639 	struct cake_flow *flow;
1640 	u32 idx;
1641 
1642 	/* choose flow to insert into */
1643 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1644 	if (idx == 0) {
1645 		if (ret & __NET_XMIT_BYPASS)
1646 			qdisc_qstats_drop(sch);
1647 		__qdisc_drop(skb, to_free);
1648 		return ret;
1649 	}
1650 	idx--;
1651 	flow = &b->flows[idx];
1652 
1653 	/* ensure shaper state isn't stale */
1654 	if (!b->tin_backlog) {
1655 		if (ktime_before(b->time_next_packet, now))
1656 			b->time_next_packet = now;
1657 
1658 		if (!sch->q.qlen) {
1659 			if (ktime_before(q->time_next_packet, now)) {
1660 				q->failsafe_next_packet = now;
1661 				q->time_next_packet = now;
1662 			} else if (ktime_after(q->time_next_packet, now) &&
1663 				   ktime_after(q->failsafe_next_packet, now)) {
1664 				u64 next = \
1665 					min(ktime_to_ns(q->time_next_packet),
1666 					    ktime_to_ns(
1667 						   q->failsafe_next_packet));
1668 				sch->qstats.overlimits++;
1669 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1670 			}
1671 		}
1672 	}
1673 
1674 	if (unlikely(len > b->max_skblen))
1675 		b->max_skblen = len;
1676 
1677 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1678 		struct sk_buff *segs, *nskb;
1679 		netdev_features_t features = netif_skb_features(skb);
1680 		unsigned int slen = 0, numsegs = 0;
1681 
1682 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1683 		if (IS_ERR_OR_NULL(segs))
1684 			return qdisc_drop(skb, sch, to_free);
1685 
1686 		while (segs) {
1687 			nskb = segs->next;
1688 			skb_mark_not_on_list(segs);
1689 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1690 			cobalt_set_enqueue_time(segs, now);
1691 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1692 									  segs);
1693 			flow_queue_add(flow, segs);
1694 
1695 			sch->q.qlen++;
1696 			numsegs++;
1697 			slen += segs->len;
1698 			q->buffer_used += segs->truesize;
1699 			b->packets++;
1700 			segs = nskb;
1701 		}
1702 
1703 		/* stats */
1704 		b->bytes	    += slen;
1705 		b->backlogs[idx]    += slen;
1706 		b->tin_backlog      += slen;
1707 		sch->qstats.backlog += slen;
1708 		q->avg_window_bytes += slen;
1709 
1710 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1711 		consume_skb(skb);
1712 	} else {
1713 		/* not splitting */
1714 		cobalt_set_enqueue_time(skb, now);
1715 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1716 		flow_queue_add(flow, skb);
1717 
1718 		if (q->ack_filter)
1719 			ack = cake_ack_filter(q, flow);
1720 
1721 		if (ack) {
1722 			b->ack_drops++;
1723 			sch->qstats.drops++;
1724 			b->bytes += qdisc_pkt_len(ack);
1725 			len -= qdisc_pkt_len(ack);
1726 			q->buffer_used += skb->truesize - ack->truesize;
1727 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1728 				cake_advance_shaper(q, b, ack, now, true);
1729 
1730 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1731 			consume_skb(ack);
1732 		} else {
1733 			sch->q.qlen++;
1734 			q->buffer_used      += skb->truesize;
1735 		}
1736 
1737 		/* stats */
1738 		b->packets++;
1739 		b->bytes	    += len;
1740 		b->backlogs[idx]    += len;
1741 		b->tin_backlog      += len;
1742 		sch->qstats.backlog += len;
1743 		q->avg_window_bytes += len;
1744 	}
1745 
1746 	if (q->overflow_timeout)
1747 		cake_heapify_up(q, b->overflow_idx[idx]);
1748 
1749 	/* incoming bandwidth capacity estimate */
1750 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1751 		u64 packet_interval = \
1752 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1753 
1754 		if (packet_interval > NSEC_PER_SEC)
1755 			packet_interval = NSEC_PER_SEC;
1756 
1757 		/* filter out short-term bursts, eg. wifi aggregation */
1758 		q->avg_packet_interval = \
1759 			cake_ewma(q->avg_packet_interval,
1760 				  packet_interval,
1761 				  (packet_interval > q->avg_packet_interval ?
1762 					  2 : 8));
1763 
1764 		q->last_packet_time = now;
1765 
1766 		if (packet_interval > q->avg_packet_interval) {
1767 			u64 window_interval = \
1768 				ktime_to_ns(ktime_sub(now,
1769 						      q->avg_window_begin));
1770 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1771 
1772 			do_div(b, window_interval);
1773 			q->avg_peak_bandwidth =
1774 				cake_ewma(q->avg_peak_bandwidth, b,
1775 					  b > q->avg_peak_bandwidth ? 2 : 8);
1776 			q->avg_window_bytes = 0;
1777 			q->avg_window_begin = now;
1778 
1779 			if (ktime_after(now,
1780 					ktime_add_ms(q->last_reconfig_time,
1781 						     250))) {
1782 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1783 				cake_reconfigure(sch);
1784 			}
1785 		}
1786 	} else {
1787 		q->avg_window_bytes = 0;
1788 		q->last_packet_time = now;
1789 	}
1790 
1791 	/* flowchain */
1792 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1793 		struct cake_host *srchost = &b->hosts[flow->srchost];
1794 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1795 		u16 host_load = 1;
1796 
1797 		if (!flow->set) {
1798 			list_add_tail(&flow->flowchain, &b->new_flows);
1799 		} else {
1800 			b->decaying_flow_count--;
1801 			list_move_tail(&flow->flowchain, &b->new_flows);
1802 		}
1803 		flow->set = CAKE_SET_SPARSE;
1804 		b->sparse_flow_count++;
1805 
1806 		if (cake_dsrc(q->flow_mode))
1807 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1808 
1809 		if (cake_ddst(q->flow_mode))
1810 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1811 
1812 		flow->deficit = (b->flow_quantum *
1813 				 quantum_div[host_load]) >> 16;
1814 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1815 		struct cake_host *srchost = &b->hosts[flow->srchost];
1816 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1817 
1818 		/* this flow was empty, accounted as a sparse flow, but actually
1819 		 * in the bulk rotation.
1820 		 */
1821 		flow->set = CAKE_SET_BULK;
1822 		b->sparse_flow_count--;
1823 		b->bulk_flow_count++;
1824 
1825 		if (cake_dsrc(q->flow_mode))
1826 			srchost->srchost_bulk_flow_count++;
1827 
1828 		if (cake_ddst(q->flow_mode))
1829 			dsthost->dsthost_bulk_flow_count++;
1830 
1831 	}
1832 
1833 	if (q->buffer_used > q->buffer_max_used)
1834 		q->buffer_max_used = q->buffer_used;
1835 
1836 	if (q->buffer_used > q->buffer_limit) {
1837 		u32 dropped = 0;
1838 
1839 		while (q->buffer_used > q->buffer_limit) {
1840 			dropped++;
1841 			cake_drop(sch, to_free);
1842 		}
1843 		b->drop_overlimit += dropped;
1844 	}
1845 	return NET_XMIT_SUCCESS;
1846 }
1847 
cake_dequeue_one(struct Qdisc * sch)1848 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1849 {
1850 	struct cake_sched_data *q = qdisc_priv(sch);
1851 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1852 	struct cake_flow *flow = &b->flows[q->cur_flow];
1853 	struct sk_buff *skb = NULL;
1854 	u32 len;
1855 
1856 	if (flow->head) {
1857 		skb = dequeue_head(flow);
1858 		len = qdisc_pkt_len(skb);
1859 		b->backlogs[q->cur_flow] -= len;
1860 		b->tin_backlog		 -= len;
1861 		sch->qstats.backlog      -= len;
1862 		q->buffer_used		 -= skb->truesize;
1863 		sch->q.qlen--;
1864 
1865 		if (q->overflow_timeout)
1866 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1867 	}
1868 	return skb;
1869 }
1870 
1871 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1872 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1873 {
1874 	struct cake_sched_data *q = qdisc_priv(sch);
1875 	struct sk_buff *skb;
1876 
1877 	q->cur_tin = tin;
1878 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1879 		while (!!(skb = cake_dequeue_one(sch)))
1880 			kfree_skb(skb);
1881 }
1882 
cake_dequeue(struct Qdisc * sch)1883 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1884 {
1885 	struct cake_sched_data *q = qdisc_priv(sch);
1886 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1887 	struct cake_host *srchost, *dsthost;
1888 	ktime_t now = ktime_get();
1889 	struct cake_flow *flow;
1890 	struct list_head *head;
1891 	bool first_flow = true;
1892 	struct sk_buff *skb;
1893 	u16 host_load;
1894 	u64 delay;
1895 	u32 len;
1896 
1897 begin:
1898 	if (!sch->q.qlen)
1899 		return NULL;
1900 
1901 	/* global hard shaper */
1902 	if (ktime_after(q->time_next_packet, now) &&
1903 	    ktime_after(q->failsafe_next_packet, now)) {
1904 		u64 next = min(ktime_to_ns(q->time_next_packet),
1905 			       ktime_to_ns(q->failsafe_next_packet));
1906 
1907 		sch->qstats.overlimits++;
1908 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1909 		return NULL;
1910 	}
1911 
1912 	/* Choose a class to work on. */
1913 	if (!q->rate_ns) {
1914 		/* In unlimited mode, can't rely on shaper timings, just balance
1915 		 * with DRR
1916 		 */
1917 		bool wrapped = false, empty = true;
1918 
1919 		while (b->tin_deficit < 0 ||
1920 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1921 			if (b->tin_deficit <= 0)
1922 				b->tin_deficit += b->tin_quantum_band;
1923 			if (b->sparse_flow_count + b->bulk_flow_count)
1924 				empty = false;
1925 
1926 			q->cur_tin++;
1927 			b++;
1928 			if (q->cur_tin >= q->tin_cnt) {
1929 				q->cur_tin = 0;
1930 				b = q->tins;
1931 
1932 				if (wrapped) {
1933 					/* It's possible for q->qlen to be
1934 					 * nonzero when we actually have no
1935 					 * packets anywhere.
1936 					 */
1937 					if (empty)
1938 						return NULL;
1939 				} else {
1940 					wrapped = true;
1941 				}
1942 			}
1943 		}
1944 	} else {
1945 		/* In shaped mode, choose:
1946 		 * - Highest-priority tin with queue and meeting schedule, or
1947 		 * - The earliest-scheduled tin with queue.
1948 		 */
1949 		ktime_t best_time = KTIME_MAX;
1950 		int tin, best_tin = 0;
1951 
1952 		for (tin = 0; tin < q->tin_cnt; tin++) {
1953 			b = q->tins + tin;
1954 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1955 				ktime_t time_to_pkt = \
1956 					ktime_sub(b->time_next_packet, now);
1957 
1958 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1959 				    ktime_compare(time_to_pkt,
1960 						  best_time) <= 0) {
1961 					best_time = time_to_pkt;
1962 					best_tin = tin;
1963 				}
1964 			}
1965 		}
1966 
1967 		q->cur_tin = best_tin;
1968 		b = q->tins + best_tin;
1969 
1970 		/* No point in going further if no packets to deliver. */
1971 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1972 			return NULL;
1973 	}
1974 
1975 retry:
1976 	/* service this class */
1977 	head = &b->decaying_flows;
1978 	if (!first_flow || list_empty(head)) {
1979 		head = &b->new_flows;
1980 		if (list_empty(head)) {
1981 			head = &b->old_flows;
1982 			if (unlikely(list_empty(head))) {
1983 				head = &b->decaying_flows;
1984 				if (unlikely(list_empty(head)))
1985 					goto begin;
1986 			}
1987 		}
1988 	}
1989 	flow = list_first_entry(head, struct cake_flow, flowchain);
1990 	q->cur_flow = flow - b->flows;
1991 	first_flow = false;
1992 
1993 	/* triple isolation (modified DRR++) */
1994 	srchost = &b->hosts[flow->srchost];
1995 	dsthost = &b->hosts[flow->dsthost];
1996 	host_load = 1;
1997 
1998 	/* flow isolation (DRR++) */
1999 	if (flow->deficit <= 0) {
2000 		/* Keep all flows with deficits out of the sparse and decaying
2001 		 * rotations.  No non-empty flow can go into the decaying
2002 		 * rotation, so they can't get deficits
2003 		 */
2004 		if (flow->set == CAKE_SET_SPARSE) {
2005 			if (flow->head) {
2006 				b->sparse_flow_count--;
2007 				b->bulk_flow_count++;
2008 
2009 				if (cake_dsrc(q->flow_mode))
2010 					srchost->srchost_bulk_flow_count++;
2011 
2012 				if (cake_ddst(q->flow_mode))
2013 					dsthost->dsthost_bulk_flow_count++;
2014 
2015 				flow->set = CAKE_SET_BULK;
2016 			} else {
2017 				/* we've moved it to the bulk rotation for
2018 				 * correct deficit accounting but we still want
2019 				 * to count it as a sparse flow, not a bulk one.
2020 				 */
2021 				flow->set = CAKE_SET_SPARSE_WAIT;
2022 			}
2023 		}
2024 
2025 		if (cake_dsrc(q->flow_mode))
2026 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2027 
2028 		if (cake_ddst(q->flow_mode))
2029 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2030 
2031 		WARN_ON(host_load > CAKE_QUEUES);
2032 
2033 		/* The shifted prandom_u32() is a way to apply dithering to
2034 		 * avoid accumulating roundoff errors
2035 		 */
2036 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2037 				  (prandom_u32() >> 16)) >> 16;
2038 		list_move_tail(&flow->flowchain, &b->old_flows);
2039 
2040 		goto retry;
2041 	}
2042 
2043 	/* Retrieve a packet via the AQM */
2044 	while (1) {
2045 		skb = cake_dequeue_one(sch);
2046 		if (!skb) {
2047 			/* this queue was actually empty */
2048 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2049 				b->unresponsive_flow_count--;
2050 
2051 			if (flow->cvars.p_drop || flow->cvars.count ||
2052 			    ktime_before(now, flow->cvars.drop_next)) {
2053 				/* keep in the flowchain until the state has
2054 				 * decayed to rest
2055 				 */
2056 				list_move_tail(&flow->flowchain,
2057 					       &b->decaying_flows);
2058 				if (flow->set == CAKE_SET_BULK) {
2059 					b->bulk_flow_count--;
2060 
2061 					if (cake_dsrc(q->flow_mode))
2062 						srchost->srchost_bulk_flow_count--;
2063 
2064 					if (cake_ddst(q->flow_mode))
2065 						dsthost->dsthost_bulk_flow_count--;
2066 
2067 					b->decaying_flow_count++;
2068 				} else if (flow->set == CAKE_SET_SPARSE ||
2069 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2070 					b->sparse_flow_count--;
2071 					b->decaying_flow_count++;
2072 				}
2073 				flow->set = CAKE_SET_DECAYING;
2074 			} else {
2075 				/* remove empty queue from the flowchain */
2076 				list_del_init(&flow->flowchain);
2077 				if (flow->set == CAKE_SET_SPARSE ||
2078 				    flow->set == CAKE_SET_SPARSE_WAIT)
2079 					b->sparse_flow_count--;
2080 				else if (flow->set == CAKE_SET_BULK) {
2081 					b->bulk_flow_count--;
2082 
2083 					if (cake_dsrc(q->flow_mode))
2084 						srchost->srchost_bulk_flow_count--;
2085 
2086 					if (cake_ddst(q->flow_mode))
2087 						dsthost->dsthost_bulk_flow_count--;
2088 
2089 				} else
2090 					b->decaying_flow_count--;
2091 
2092 				flow->set = CAKE_SET_NONE;
2093 			}
2094 			goto begin;
2095 		}
2096 
2097 		/* Last packet in queue may be marked, shouldn't be dropped */
2098 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2099 					(b->bulk_flow_count *
2100 					 !!(q->rate_flags &
2101 					    CAKE_FLAG_INGRESS))) ||
2102 		    !flow->head)
2103 			break;
2104 
2105 		/* drop this packet, get another one */
2106 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2107 			len = cake_advance_shaper(q, b, skb,
2108 						  now, true);
2109 			flow->deficit -= len;
2110 			b->tin_deficit -= len;
2111 		}
2112 		flow->dropped++;
2113 		b->tin_dropped++;
2114 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2115 		qdisc_qstats_drop(sch);
2116 		kfree_skb(skb);
2117 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2118 			goto retry;
2119 	}
2120 
2121 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2122 	qdisc_bstats_update(sch, skb);
2123 
2124 	/* collect delay stats */
2125 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2126 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2127 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2128 				  delay > b->peak_delay ? 2 : 8);
2129 	b->base_delay = cake_ewma(b->base_delay, delay,
2130 				  delay < b->base_delay ? 2 : 8);
2131 
2132 	len = cake_advance_shaper(q, b, skb, now, false);
2133 	flow->deficit -= len;
2134 	b->tin_deficit -= len;
2135 
2136 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2137 		u64 next = min(ktime_to_ns(q->time_next_packet),
2138 			       ktime_to_ns(q->failsafe_next_packet));
2139 
2140 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2141 	} else if (!sch->q.qlen) {
2142 		int i;
2143 
2144 		for (i = 0; i < q->tin_cnt; i++) {
2145 			if (q->tins[i].decaying_flow_count) {
2146 				ktime_t next = \
2147 					ktime_add_ns(now,
2148 						     q->tins[i].cparams.target);
2149 
2150 				qdisc_watchdog_schedule_ns(&q->watchdog,
2151 							   ktime_to_ns(next));
2152 				break;
2153 			}
2154 		}
2155 	}
2156 
2157 	if (q->overflow_timeout)
2158 		q->overflow_timeout--;
2159 
2160 	return skb;
2161 }
2162 
cake_reset(struct Qdisc * sch)2163 static void cake_reset(struct Qdisc *sch)
2164 {
2165 	u32 c;
2166 
2167 	for (c = 0; c < CAKE_MAX_TINS; c++)
2168 		cake_clear_tin(sch, c);
2169 }
2170 
2171 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2172 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2173 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2174 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2175 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2176 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2177 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2178 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2179 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2180 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2181 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2182 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2183 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2184 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2185 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2186 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2187 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2188 };
2189 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2190 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2191 			  u64 target_ns, u64 rtt_est_ns)
2192 {
2193 	/* convert byte-rate into time-per-byte
2194 	 * so it will always unwedge in reasonable time.
2195 	 */
2196 	static const u64 MIN_RATE = 64;
2197 	u32 byte_target = mtu;
2198 	u64 byte_target_ns;
2199 	u8  rate_shft = 0;
2200 	u64 rate_ns = 0;
2201 
2202 	b->flow_quantum = 1514;
2203 	if (rate) {
2204 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2205 		rate_shft = 34;
2206 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2207 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2208 		while (!!(rate_ns >> 34)) {
2209 			rate_ns >>= 1;
2210 			rate_shft--;
2211 		}
2212 	} /* else unlimited, ie. zero delay */
2213 
2214 	b->tin_rate_bps  = rate;
2215 	b->tin_rate_ns   = rate_ns;
2216 	b->tin_rate_shft = rate_shft;
2217 
2218 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2219 
2220 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2221 	b->cparams.interval = max(rtt_est_ns +
2222 				     b->cparams.target - target_ns,
2223 				     b->cparams.target * 2);
2224 	b->cparams.mtu_time = byte_target_ns;
2225 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2226 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2227 }
2228 
cake_config_besteffort(struct Qdisc * sch)2229 static int cake_config_besteffort(struct Qdisc *sch)
2230 {
2231 	struct cake_sched_data *q = qdisc_priv(sch);
2232 	struct cake_tin_data *b = &q->tins[0];
2233 	u32 mtu = psched_mtu(qdisc_dev(sch));
2234 	u64 rate = q->rate_bps;
2235 
2236 	q->tin_cnt = 1;
2237 
2238 	q->tin_index = besteffort;
2239 	q->tin_order = normal_order;
2240 
2241 	cake_set_rate(b, rate, mtu,
2242 		      us_to_ns(q->target), us_to_ns(q->interval));
2243 	b->tin_quantum_band = 65535;
2244 	b->tin_quantum_prio = 65535;
2245 
2246 	return 0;
2247 }
2248 
cake_config_precedence(struct Qdisc * sch)2249 static int cake_config_precedence(struct Qdisc *sch)
2250 {
2251 	/* convert high-level (user visible) parameters into internal format */
2252 	struct cake_sched_data *q = qdisc_priv(sch);
2253 	u32 mtu = psched_mtu(qdisc_dev(sch));
2254 	u64 rate = q->rate_bps;
2255 	u32 quantum1 = 256;
2256 	u32 quantum2 = 256;
2257 	u32 i;
2258 
2259 	q->tin_cnt = 8;
2260 	q->tin_index = precedence;
2261 	q->tin_order = normal_order;
2262 
2263 	for (i = 0; i < q->tin_cnt; i++) {
2264 		struct cake_tin_data *b = &q->tins[i];
2265 
2266 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2267 			      us_to_ns(q->interval));
2268 
2269 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2270 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2271 
2272 		/* calculate next class's parameters */
2273 		rate  *= 7;
2274 		rate >>= 3;
2275 
2276 		quantum1  *= 3;
2277 		quantum1 >>= 1;
2278 
2279 		quantum2  *= 7;
2280 		quantum2 >>= 3;
2281 	}
2282 
2283 	return 0;
2284 }
2285 
2286 /*	List of known Diffserv codepoints:
2287  *
2288  *	Least Effort (CS1)
2289  *	Best Effort (CS0)
2290  *	Max Reliability & LLT "Lo" (TOS1)
2291  *	Max Throughput (TOS2)
2292  *	Min Delay (TOS4)
2293  *	LLT "La" (TOS5)
2294  *	Assured Forwarding 1 (AF1x) - x3
2295  *	Assured Forwarding 2 (AF2x) - x3
2296  *	Assured Forwarding 3 (AF3x) - x3
2297  *	Assured Forwarding 4 (AF4x) - x3
2298  *	Precedence Class 2 (CS2)
2299  *	Precedence Class 3 (CS3)
2300  *	Precedence Class 4 (CS4)
2301  *	Precedence Class 5 (CS5)
2302  *	Precedence Class 6 (CS6)
2303  *	Precedence Class 7 (CS7)
2304  *	Voice Admit (VA)
2305  *	Expedited Forwarding (EF)
2306 
2307  *	Total 25 codepoints.
2308  */
2309 
2310 /*	List of traffic classes in RFC 4594:
2311  *		(roughly descending order of contended priority)
2312  *		(roughly ascending order of uncontended throughput)
2313  *
2314  *	Network Control (CS6,CS7)      - routing traffic
2315  *	Telephony (EF,VA)         - aka. VoIP streams
2316  *	Signalling (CS5)               - VoIP setup
2317  *	Multimedia Conferencing (AF4x) - aka. video calls
2318  *	Realtime Interactive (CS4)     - eg. games
2319  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2320  *	Broadcast Video (CS3)
2321  *	Low Latency Data (AF2x,TOS4)      - eg. database
2322  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2323  *	Standard Service (CS0 & unrecognised codepoints)
2324  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2325  *	Low Priority Data (CS1)           - eg. BitTorrent
2326 
2327  *	Total 12 traffic classes.
2328  */
2329 
cake_config_diffserv8(struct Qdisc * sch)2330 static int cake_config_diffserv8(struct Qdisc *sch)
2331 {
2332 /*	Pruned list of traffic classes for typical applications:
2333  *
2334  *		Network Control          (CS6, CS7)
2335  *		Minimum Latency          (EF, VA, CS5, CS4)
2336  *		Interactive Shell        (CS2, TOS1)
2337  *		Low Latency Transactions (AF2x, TOS4)
2338  *		Video Streaming          (AF4x, AF3x, CS3)
2339  *		Bog Standard             (CS0 etc.)
2340  *		High Throughput          (AF1x, TOS2)
2341  *		Background Traffic       (CS1)
2342  *
2343  *		Total 8 traffic classes.
2344  */
2345 
2346 	struct cake_sched_data *q = qdisc_priv(sch);
2347 	u32 mtu = psched_mtu(qdisc_dev(sch));
2348 	u64 rate = q->rate_bps;
2349 	u32 quantum1 = 256;
2350 	u32 quantum2 = 256;
2351 	u32 i;
2352 
2353 	q->tin_cnt = 8;
2354 
2355 	/* codepoint to class mapping */
2356 	q->tin_index = diffserv8;
2357 	q->tin_order = normal_order;
2358 
2359 	/* class characteristics */
2360 	for (i = 0; i < q->tin_cnt; i++) {
2361 		struct cake_tin_data *b = &q->tins[i];
2362 
2363 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2364 			      us_to_ns(q->interval));
2365 
2366 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2367 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2368 
2369 		/* calculate next class's parameters */
2370 		rate  *= 7;
2371 		rate >>= 3;
2372 
2373 		quantum1  *= 3;
2374 		quantum1 >>= 1;
2375 
2376 		quantum2  *= 7;
2377 		quantum2 >>= 3;
2378 	}
2379 
2380 	return 0;
2381 }
2382 
cake_config_diffserv4(struct Qdisc * sch)2383 static int cake_config_diffserv4(struct Qdisc *sch)
2384 {
2385 /*  Further pruned list of traffic classes for four-class system:
2386  *
2387  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2388  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2389  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2390  *	    Background Traffic (CS1)
2391  *
2392  *		Total 4 traffic classes.
2393  */
2394 
2395 	struct cake_sched_data *q = qdisc_priv(sch);
2396 	u32 mtu = psched_mtu(qdisc_dev(sch));
2397 	u64 rate = q->rate_bps;
2398 	u32 quantum = 1024;
2399 
2400 	q->tin_cnt = 4;
2401 
2402 	/* codepoint to class mapping */
2403 	q->tin_index = diffserv4;
2404 	q->tin_order = bulk_order;
2405 
2406 	/* class characteristics */
2407 	cake_set_rate(&q->tins[0], rate, mtu,
2408 		      us_to_ns(q->target), us_to_ns(q->interval));
2409 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2410 		      us_to_ns(q->target), us_to_ns(q->interval));
2411 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2412 		      us_to_ns(q->target), us_to_ns(q->interval));
2413 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2414 		      us_to_ns(q->target), us_to_ns(q->interval));
2415 
2416 	/* priority weights */
2417 	q->tins[0].tin_quantum_prio = quantum;
2418 	q->tins[1].tin_quantum_prio = quantum >> 4;
2419 	q->tins[2].tin_quantum_prio = quantum << 2;
2420 	q->tins[3].tin_quantum_prio = quantum << 4;
2421 
2422 	/* bandwidth-sharing weights */
2423 	q->tins[0].tin_quantum_band = quantum;
2424 	q->tins[1].tin_quantum_band = quantum >> 4;
2425 	q->tins[2].tin_quantum_band = quantum >> 1;
2426 	q->tins[3].tin_quantum_band = quantum >> 2;
2427 
2428 	return 0;
2429 }
2430 
cake_config_diffserv3(struct Qdisc * sch)2431 static int cake_config_diffserv3(struct Qdisc *sch)
2432 {
2433 /*  Simplified Diffserv structure with 3 tins.
2434  *		Low Priority		(CS1)
2435  *		Best Effort
2436  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2437  */
2438 	struct cake_sched_data *q = qdisc_priv(sch);
2439 	u32 mtu = psched_mtu(qdisc_dev(sch));
2440 	u64 rate = q->rate_bps;
2441 	u32 quantum = 1024;
2442 
2443 	q->tin_cnt = 3;
2444 
2445 	/* codepoint to class mapping */
2446 	q->tin_index = diffserv3;
2447 	q->tin_order = bulk_order;
2448 
2449 	/* class characteristics */
2450 	cake_set_rate(&q->tins[0], rate, mtu,
2451 		      us_to_ns(q->target), us_to_ns(q->interval));
2452 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2453 		      us_to_ns(q->target), us_to_ns(q->interval));
2454 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2455 		      us_to_ns(q->target), us_to_ns(q->interval));
2456 
2457 	/* priority weights */
2458 	q->tins[0].tin_quantum_prio = quantum;
2459 	q->tins[1].tin_quantum_prio = quantum >> 4;
2460 	q->tins[2].tin_quantum_prio = quantum << 4;
2461 
2462 	/* bandwidth-sharing weights */
2463 	q->tins[0].tin_quantum_band = quantum;
2464 	q->tins[1].tin_quantum_band = quantum >> 4;
2465 	q->tins[2].tin_quantum_band = quantum >> 2;
2466 
2467 	return 0;
2468 }
2469 
cake_reconfigure(struct Qdisc * sch)2470 static void cake_reconfigure(struct Qdisc *sch)
2471 {
2472 	struct cake_sched_data *q = qdisc_priv(sch);
2473 	int c, ft;
2474 
2475 	switch (q->tin_mode) {
2476 	case CAKE_DIFFSERV_BESTEFFORT:
2477 		ft = cake_config_besteffort(sch);
2478 		break;
2479 
2480 	case CAKE_DIFFSERV_PRECEDENCE:
2481 		ft = cake_config_precedence(sch);
2482 		break;
2483 
2484 	case CAKE_DIFFSERV_DIFFSERV8:
2485 		ft = cake_config_diffserv8(sch);
2486 		break;
2487 
2488 	case CAKE_DIFFSERV_DIFFSERV4:
2489 		ft = cake_config_diffserv4(sch);
2490 		break;
2491 
2492 	case CAKE_DIFFSERV_DIFFSERV3:
2493 	default:
2494 		ft = cake_config_diffserv3(sch);
2495 		break;
2496 	}
2497 
2498 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2499 		cake_clear_tin(sch, c);
2500 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2501 	}
2502 
2503 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2504 	q->rate_shft = q->tins[ft].tin_rate_shft;
2505 
2506 	if (q->buffer_config_limit) {
2507 		q->buffer_limit = q->buffer_config_limit;
2508 	} else if (q->rate_bps) {
2509 		u64 t = q->rate_bps * q->interval;
2510 
2511 		do_div(t, USEC_PER_SEC / 4);
2512 		q->buffer_limit = max_t(u32, t, 4U << 20);
2513 	} else {
2514 		q->buffer_limit = ~0;
2515 	}
2516 
2517 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2518 
2519 	q->buffer_limit = min(q->buffer_limit,
2520 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2521 				  q->buffer_config_limit));
2522 }
2523 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2524 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2525 		       struct netlink_ext_ack *extack)
2526 {
2527 	struct cake_sched_data *q = qdisc_priv(sch);
2528 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2529 	int err;
2530 
2531 	if (!opt)
2532 		return -EINVAL;
2533 
2534 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2535 					  extack);
2536 	if (err < 0)
2537 		return err;
2538 
2539 	if (tb[TCA_CAKE_NAT]) {
2540 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2541 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2542 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2543 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2544 #else
2545 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2546 				    "No conntrack support in kernel");
2547 		return -EOPNOTSUPP;
2548 #endif
2549 	}
2550 
2551 	if (tb[TCA_CAKE_BASE_RATE64])
2552 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2553 
2554 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2555 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2556 
2557 	if (tb[TCA_CAKE_WASH]) {
2558 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2559 			q->rate_flags |= CAKE_FLAG_WASH;
2560 		else
2561 			q->rate_flags &= ~CAKE_FLAG_WASH;
2562 	}
2563 
2564 	if (tb[TCA_CAKE_FLOW_MODE])
2565 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2566 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2567 					CAKE_FLOW_MASK));
2568 
2569 	if (tb[TCA_CAKE_ATM])
2570 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2571 
2572 	if (tb[TCA_CAKE_OVERHEAD]) {
2573 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2574 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2575 
2576 		q->max_netlen = 0;
2577 		q->max_adjlen = 0;
2578 		q->min_netlen = ~0;
2579 		q->min_adjlen = ~0;
2580 	}
2581 
2582 	if (tb[TCA_CAKE_RAW]) {
2583 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2584 
2585 		q->max_netlen = 0;
2586 		q->max_adjlen = 0;
2587 		q->min_netlen = ~0;
2588 		q->min_adjlen = ~0;
2589 	}
2590 
2591 	if (tb[TCA_CAKE_MPU])
2592 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2593 
2594 	if (tb[TCA_CAKE_RTT]) {
2595 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2596 
2597 		if (!q->interval)
2598 			q->interval = 1;
2599 	}
2600 
2601 	if (tb[TCA_CAKE_TARGET]) {
2602 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2603 
2604 		if (!q->target)
2605 			q->target = 1;
2606 	}
2607 
2608 	if (tb[TCA_CAKE_AUTORATE]) {
2609 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2610 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2611 		else
2612 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2613 	}
2614 
2615 	if (tb[TCA_CAKE_INGRESS]) {
2616 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2617 			q->rate_flags |= CAKE_FLAG_INGRESS;
2618 		else
2619 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2620 	}
2621 
2622 	if (tb[TCA_CAKE_ACK_FILTER])
2623 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2624 
2625 	if (tb[TCA_CAKE_MEMORY])
2626 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2627 
2628 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2629 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2630 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2631 		else
2632 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2633 	}
2634 
2635 	if (tb[TCA_CAKE_FWMARK]) {
2636 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2637 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2638 	}
2639 
2640 	if (q->tins) {
2641 		sch_tree_lock(sch);
2642 		cake_reconfigure(sch);
2643 		sch_tree_unlock(sch);
2644 	}
2645 
2646 	return 0;
2647 }
2648 
cake_destroy(struct Qdisc * sch)2649 static void cake_destroy(struct Qdisc *sch)
2650 {
2651 	struct cake_sched_data *q = qdisc_priv(sch);
2652 
2653 	qdisc_watchdog_cancel(&q->watchdog);
2654 	tcf_block_put(q->block);
2655 	kvfree(q->tins);
2656 }
2657 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2658 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2659 		     struct netlink_ext_ack *extack)
2660 {
2661 	struct cake_sched_data *q = qdisc_priv(sch);
2662 	int i, j, err;
2663 
2664 	sch->limit = 10240;
2665 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2666 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2667 
2668 	q->rate_bps = 0; /* unlimited by default */
2669 
2670 	q->interval = 100000; /* 100ms default */
2671 	q->target   =   5000; /* 5ms: codel RFC argues
2672 			       * for 5 to 10% of interval
2673 			       */
2674 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2675 	q->cur_tin = 0;
2676 	q->cur_flow  = 0;
2677 
2678 	qdisc_watchdog_init(&q->watchdog, sch);
2679 
2680 	if (opt) {
2681 		int err = cake_change(sch, opt, extack);
2682 
2683 		if (err)
2684 			return err;
2685 	}
2686 
2687 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2688 	if (err)
2689 		return err;
2690 
2691 	quantum_div[0] = ~0;
2692 	for (i = 1; i <= CAKE_QUEUES; i++)
2693 		quantum_div[i] = 65535 / i;
2694 
2695 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2696 			   GFP_KERNEL);
2697 	if (!q->tins)
2698 		goto nomem;
2699 
2700 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2701 		struct cake_tin_data *b = q->tins + i;
2702 
2703 		INIT_LIST_HEAD(&b->new_flows);
2704 		INIT_LIST_HEAD(&b->old_flows);
2705 		INIT_LIST_HEAD(&b->decaying_flows);
2706 		b->sparse_flow_count = 0;
2707 		b->bulk_flow_count = 0;
2708 		b->decaying_flow_count = 0;
2709 
2710 		for (j = 0; j < CAKE_QUEUES; j++) {
2711 			struct cake_flow *flow = b->flows + j;
2712 			u32 k = j * CAKE_MAX_TINS + i;
2713 
2714 			INIT_LIST_HEAD(&flow->flowchain);
2715 			cobalt_vars_init(&flow->cvars);
2716 
2717 			q->overflow_heap[k].t = i;
2718 			q->overflow_heap[k].b = j;
2719 			b->overflow_idx[j] = k;
2720 		}
2721 	}
2722 
2723 	cake_reconfigure(sch);
2724 	q->avg_peak_bandwidth = q->rate_bps;
2725 	q->min_netlen = ~0;
2726 	q->min_adjlen = ~0;
2727 	return 0;
2728 
2729 nomem:
2730 	cake_destroy(sch);
2731 	return -ENOMEM;
2732 }
2733 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2734 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2735 {
2736 	struct cake_sched_data *q = qdisc_priv(sch);
2737 	struct nlattr *opts;
2738 
2739 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2740 	if (!opts)
2741 		goto nla_put_failure;
2742 
2743 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2744 			      TCA_CAKE_PAD))
2745 		goto nla_put_failure;
2746 
2747 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2748 			q->flow_mode & CAKE_FLOW_MASK))
2749 		goto nla_put_failure;
2750 
2751 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2752 		goto nla_put_failure;
2753 
2754 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2755 		goto nla_put_failure;
2756 
2757 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2758 		goto nla_put_failure;
2759 
2760 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2761 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2762 		goto nla_put_failure;
2763 
2764 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2765 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2766 		goto nla_put_failure;
2767 
2768 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2769 		goto nla_put_failure;
2770 
2771 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2772 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2773 		goto nla_put_failure;
2774 
2775 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2776 		goto nla_put_failure;
2777 
2778 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2779 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2780 		goto nla_put_failure;
2781 
2782 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2783 		goto nla_put_failure;
2784 
2785 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2786 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2787 			goto nla_put_failure;
2788 
2789 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2790 		goto nla_put_failure;
2791 
2792 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2793 		goto nla_put_failure;
2794 
2795 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2796 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2797 		goto nla_put_failure;
2798 
2799 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2800 		goto nla_put_failure;
2801 
2802 	return nla_nest_end(skb, opts);
2803 
2804 nla_put_failure:
2805 	return -1;
2806 }
2807 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2808 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2809 {
2810 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2811 	struct cake_sched_data *q = qdisc_priv(sch);
2812 	struct nlattr *tstats, *ts;
2813 	int i;
2814 
2815 	if (!stats)
2816 		return -1;
2817 
2818 #define PUT_STAT_U32(attr, data) do {				       \
2819 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2820 			goto nla_put_failure;			       \
2821 	} while (0)
2822 #define PUT_STAT_U64(attr, data) do {				       \
2823 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2824 					data, TCA_CAKE_STATS_PAD)) \
2825 			goto nla_put_failure;			       \
2826 	} while (0)
2827 
2828 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2829 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2830 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2831 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2832 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2833 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2834 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2835 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2836 
2837 #undef PUT_STAT_U32
2838 #undef PUT_STAT_U64
2839 
2840 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2841 	if (!tstats)
2842 		goto nla_put_failure;
2843 
2844 #define PUT_TSTAT_U32(attr, data) do {					\
2845 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2846 			goto nla_put_failure;				\
2847 	} while (0)
2848 #define PUT_TSTAT_U64(attr, data) do {					\
2849 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2850 					data, TCA_CAKE_TIN_STATS_PAD))	\
2851 			goto nla_put_failure;				\
2852 	} while (0)
2853 
2854 	for (i = 0; i < q->tin_cnt; i++) {
2855 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2856 
2857 		ts = nla_nest_start_noflag(d->skb, i + 1);
2858 		if (!ts)
2859 			goto nla_put_failure;
2860 
2861 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2862 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2863 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2864 
2865 		PUT_TSTAT_U32(TARGET_US,
2866 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2867 		PUT_TSTAT_U32(INTERVAL_US,
2868 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2869 
2870 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2871 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2872 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2873 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2874 
2875 		PUT_TSTAT_U32(PEAK_DELAY_US,
2876 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2877 		PUT_TSTAT_U32(AVG_DELAY_US,
2878 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2879 		PUT_TSTAT_U32(BASE_DELAY_US,
2880 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2881 
2882 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2883 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2884 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2885 
2886 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2887 					    b->decaying_flow_count);
2888 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2889 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2890 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2891 
2892 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2893 		nla_nest_end(d->skb, ts);
2894 	}
2895 
2896 #undef PUT_TSTAT_U32
2897 #undef PUT_TSTAT_U64
2898 
2899 	nla_nest_end(d->skb, tstats);
2900 	return nla_nest_end(d->skb, stats);
2901 
2902 nla_put_failure:
2903 	nla_nest_cancel(d->skb, stats);
2904 	return -1;
2905 }
2906 
cake_leaf(struct Qdisc * sch,unsigned long arg)2907 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2908 {
2909 	return NULL;
2910 }
2911 
cake_find(struct Qdisc * sch,u32 classid)2912 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2913 {
2914 	return 0;
2915 }
2916 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2917 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2918 			       u32 classid)
2919 {
2920 	return 0;
2921 }
2922 
cake_unbind(struct Qdisc * q,unsigned long cl)2923 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2924 {
2925 }
2926 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2927 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2928 					struct netlink_ext_ack *extack)
2929 {
2930 	struct cake_sched_data *q = qdisc_priv(sch);
2931 
2932 	if (cl)
2933 		return NULL;
2934 	return q->block;
2935 }
2936 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)2937 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2938 			   struct sk_buff *skb, struct tcmsg *tcm)
2939 {
2940 	tcm->tcm_handle |= TC_H_MIN(cl);
2941 	return 0;
2942 }
2943 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)2944 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2945 				 struct gnet_dump *d)
2946 {
2947 	struct cake_sched_data *q = qdisc_priv(sch);
2948 	const struct cake_flow *flow = NULL;
2949 	struct gnet_stats_queue qs = { 0 };
2950 	struct nlattr *stats;
2951 	u32 idx = cl - 1;
2952 
2953 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2954 		const struct cake_tin_data *b = \
2955 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2956 		const struct sk_buff *skb;
2957 
2958 		flow = &b->flows[idx % CAKE_QUEUES];
2959 
2960 		if (flow->head) {
2961 			sch_tree_lock(sch);
2962 			skb = flow->head;
2963 			while (skb) {
2964 				qs.qlen++;
2965 				skb = skb->next;
2966 			}
2967 			sch_tree_unlock(sch);
2968 		}
2969 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2970 		qs.drops = flow->dropped;
2971 	}
2972 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2973 		return -1;
2974 	if (flow) {
2975 		ktime_t now = ktime_get();
2976 
2977 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2978 		if (!stats)
2979 			return -1;
2980 
2981 #define PUT_STAT_U32(attr, data) do {				       \
2982 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2983 			goto nla_put_failure;			       \
2984 	} while (0)
2985 #define PUT_STAT_S32(attr, data) do {				       \
2986 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2987 			goto nla_put_failure;			       \
2988 	} while (0)
2989 
2990 		PUT_STAT_S32(DEFICIT, flow->deficit);
2991 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2992 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2993 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2994 		if (flow->cvars.p_drop) {
2995 			PUT_STAT_S32(BLUE_TIMER_US,
2996 				     ktime_to_us(
2997 					     ktime_sub(now,
2998 						     flow->cvars.blue_timer)));
2999 		}
3000 		if (flow->cvars.dropping) {
3001 			PUT_STAT_S32(DROP_NEXT_US,
3002 				     ktime_to_us(
3003 					     ktime_sub(now,
3004 						       flow->cvars.drop_next)));
3005 		}
3006 
3007 		if (nla_nest_end(d->skb, stats) < 0)
3008 			return -1;
3009 	}
3010 
3011 	return 0;
3012 
3013 nla_put_failure:
3014 	nla_nest_cancel(d->skb, stats);
3015 	return -1;
3016 }
3017 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3018 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3019 {
3020 	struct cake_sched_data *q = qdisc_priv(sch);
3021 	unsigned int i, j;
3022 
3023 	if (arg->stop)
3024 		return;
3025 
3026 	for (i = 0; i < q->tin_cnt; i++) {
3027 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3028 
3029 		for (j = 0; j < CAKE_QUEUES; j++) {
3030 			if (list_empty(&b->flows[j].flowchain) ||
3031 			    arg->count < arg->skip) {
3032 				arg->count++;
3033 				continue;
3034 			}
3035 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3036 				arg->stop = 1;
3037 				break;
3038 			}
3039 			arg->count++;
3040 		}
3041 	}
3042 }
3043 
3044 static const struct Qdisc_class_ops cake_class_ops = {
3045 	.leaf		=	cake_leaf,
3046 	.find		=	cake_find,
3047 	.tcf_block	=	cake_tcf_block,
3048 	.bind_tcf	=	cake_bind,
3049 	.unbind_tcf	=	cake_unbind,
3050 	.dump		=	cake_dump_class,
3051 	.dump_stats	=	cake_dump_class_stats,
3052 	.walk		=	cake_walk,
3053 };
3054 
3055 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3056 	.cl_ops		=	&cake_class_ops,
3057 	.id		=	"cake",
3058 	.priv_size	=	sizeof(struct cake_sched_data),
3059 	.enqueue	=	cake_enqueue,
3060 	.dequeue	=	cake_dequeue,
3061 	.peek		=	qdisc_peek_dequeued,
3062 	.init		=	cake_init,
3063 	.reset		=	cake_reset,
3064 	.destroy	=	cake_destroy,
3065 	.change		=	cake_change,
3066 	.dump		=	cake_dump,
3067 	.dump_stats	=	cake_dump_stats,
3068 	.owner		=	THIS_MODULE,
3069 };
3070 
cake_module_init(void)3071 static int __init cake_module_init(void)
3072 {
3073 	return register_qdisc(&cake_qdisc_ops);
3074 }
3075 
cake_module_exit(void)3076 static void __exit cake_module_exit(void)
3077 {
3078 	unregister_qdisc(&cake_qdisc_ops);
3079 }
3080 
3081 module_init(cake_module_init)
3082 module_exit(cake_module_exit)
3083 MODULE_AUTHOR("Jonathan Morton");
3084 MODULE_LICENSE("Dual BSD/GPL");
3085 MODULE_DESCRIPTION("The CAKE shaper.");
3086