1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_cbq.c	Class-Based Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/string.h>
13 #include <linux/errno.h>
14 #include <linux/skbuff.h>
15 #include <net/netlink.h>
16 #include <net/pkt_sched.h>
17 #include <net/pkt_cls.h>
18 
19 
20 /*	Class-Based Queueing (CBQ) algorithm.
21 	=======================================
22 
23 	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24 		 Management Models for Packet Networks",
25 		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26 
27 		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28 
29 		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30 		 Parameters", 1996
31 
32 		 [4] Sally Floyd and Michael Speer, "Experimental Results
33 		 for Class-Based Queueing", 1998, not published.
34 
35 	-----------------------------------------------------------------------
36 
37 	The algorithm skeleton was taken from the NS simulator's cbq.cc.
38 	If someone wants to check this code against the LBL version,
39 	they should take into account that ONLY the skeleton was borrowed;
40 	the implementation is different. In particular:
41 
42 	--- The WRR algorithm is different. Our version looks more
43 	reasonable (I hope) and works when quanta are allowed to be
44 	less than the MTU, which is always the case when real-time classes
45 	have small rates. Note that the statement of [3] is
46 	incomplete: delay may actually be estimated even if the class
47 	per-round allotment is less than the MTU. Namely, if the per-round
48 	allotment is W*r_i, and r_1+...+r_k = r < 1, then
49 
50 	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51 
52 	In the worst case we have IntServ estimate with D = W*r+k*MTU
53 	and C = MTU*r. The proof (if correct at all) is trivial.
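
	For illustration only (the numbers below are made up, not taken from
	[1]-[4]): with MTU = 1500 bytes, per-round allotment W*r_i = 500 bytes
	(so [MTU/(W*r_i)] = 3), total per-round allotment W*r = 2500 bytes,
	k = 5 classes and B = 1.25 Mbyte/s (10 Mbit/s), the bound gives

	delay_i <= (3*2500 + 2500 + 5*1500)/1250000 s = 17500/1250000 s = 14 ms.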
54 
55 
56 	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
57 	interpret some places, which look like wrong translations
58 	from NS. Anyone is advised to find these differences
59 	and explain to me why I am wrong 8).
60 
61 	--- Linux has no EOI event, so we cannot estimate true class
62 	idle time. The workaround is to treat the next dequeue event
63 	as a sign that the previous packet has finished. This is wrong because of
64 	internal device queueing, but on a permanently loaded link it is true.
65 	Moreover, combined with the clock integrator, this scheme looks
66 	very close to an ideal solution.  */
67 
68 struct cbq_sched_data;
69 
70 
71 struct cbq_class {
72 	struct Qdisc_class_common common;
73 	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
74 
75 /* Parameters */
76 	unsigned char		priority;	/* class priority */
77 	unsigned char		priority2;	/* priority to be used after overlimit */
78 	unsigned char		ewma_log;	/* time constant for idle time calculation */
79 
80 	u32			defmap;
81 
82 	/* Link-sharing scheduler parameters */
83 	long			maxidle;	/* Class parameters: see below. */
84 	long			offtime;
85 	long			minidle;
86 	u32			avpkt;
87 	struct qdisc_rate_table	*R_tab;
88 
89 	/* General scheduler (WRR) parameters */
90 	long			allot;
91 	long			quantum;	/* Allotment per WRR round */
92 	long			weight;		/* Relative allotment: see below */
93 
94 	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
95 	struct cbq_class	*split;		/* Ptr to split node */
96 	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
97 	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
98 	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
99 						   parent otherwise */
100 	struct cbq_class	*sibling;	/* Sibling chain */
101 	struct cbq_class	*children;	/* Pointer to children chain */
102 
103 	struct Qdisc		*q;		/* Elementary queueing discipline */
104 
105 
106 /* Variables */
107 	unsigned char		cpriority;	/* Effective priority */
108 	unsigned char		delayed;
109 	unsigned char		level;		/* level of the class in hierarchy:
110 						   0 for leaf classes, and maximal
111 						   level of children + 1 for nodes.
112 						 */
113 
114 	psched_time_t		last;		/* Last end of service */
115 	psched_time_t		undertime;
116 	long			avgidle;
117 	long			deficit;	/* Saved deficit for WRR */
118 	psched_time_t		penalized;
119 	struct gnet_stats_basic_sync bstats;
120 	struct gnet_stats_queue qstats;
121 	struct net_rate_estimator __rcu *rate_est;
122 	struct tc_cbq_xstats	xstats;
123 
124 	struct tcf_proto __rcu	*filter_list;
125 	struct tcf_block	*block;
126 
127 	int			filters;
128 
129 	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
130 };
131 
132 struct cbq_sched_data {
133 	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
134 	int			nclasses[TC_CBQ_MAXPRIO + 1];
135 	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
136 
137 	struct cbq_class	link;
138 
139 	unsigned int		activemask;
140 	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
141 								   with backlog */
142 
143 #ifdef CONFIG_NET_CLS_ACT
144 	struct cbq_class	*rx_class;
145 #endif
146 	struct cbq_class	*tx_class;
147 	struct cbq_class	*tx_borrowed;
148 	int			tx_len;
149 	psched_time_t		now;		/* Cached timestamp */
150 	unsigned int		pmask;
151 
152 	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
153 						   started when CBQ has
154 						   backlog, but cannot
155 						   transmit just now */
156 	psched_tdiff_t		wd_expires;
157 	int			toplevel;
158 	u32			hgenerator;
159 };
160 
161 
162 #define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
163 
164 static inline struct cbq_class *
165 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
166 {
167 	struct Qdisc_class_common *clc;
168 
169 	clc = qdisc_class_find(&q->clhash, classid);
170 	if (clc == NULL)
171 		return NULL;
172 	return container_of(clc, struct cbq_class, common);
173 }
174 
175 #ifdef CONFIG_NET_CLS_ACT
176 
177 static struct cbq_class *
178 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
179 {
180 	struct cbq_class *cl;
181 
182 	for (cl = this->tparent; cl; cl = cl->tparent) {
183 		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
184 
185 		if (new != NULL && new != this)
186 			return new;
187 	}
188 	return NULL;
189 }
190 
191 #endif
192 
193 /* Classify packet. The procedure is pretty complicated, but
194  * it allows us to combine link sharing and priority scheduling
195  * transparently.
196  *
197  * Namely, you can put link sharing rules (e.g. route-based) at the root of CBQ,
198  * so that classification resolves to split nodes. Then packets are classified
199  * by logical priority, or a more specific classifier may be attached
200  * to the split node.
201  */
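
/* Illustrative example (handles are hypothetical): with sch->handle = 1:0
 * (0x00010000) and skb->priority = 0x00010005, Step 1 below sees
 * TC_H_MAJ(prio ^ sch->handle) == 0 and looks up class 1:5 directly,
 * so no filters need to run for that packet.
 */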
202 
203 static struct cbq_class *
204 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
205 {
206 	struct cbq_sched_data *q = qdisc_priv(sch);
207 	struct cbq_class *head = &q->link;
208 	struct cbq_class **defmap;
209 	struct cbq_class *cl = NULL;
210 	u32 prio = skb->priority;
211 	struct tcf_proto *fl;
212 	struct tcf_result res;
213 
214 	/*
215 	 *  Step 1. If skb->priority points to one of our classes, use it.
216 	 */
217 	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
218 	    (cl = cbq_class_lookup(q, prio)) != NULL)
219 		return cl;
220 
221 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
222 	for (;;) {
223 		int result = 0;
224 		defmap = head->defaults;
225 
226 		fl = rcu_dereference_bh(head->filter_list);
227 		/*
228 		 * Step 2+n. Apply classifier.
229 		 */
230 		result = tcf_classify(skb, NULL, fl, &res, true);
231 		if (!fl || result < 0)
232 			goto fallback;
233 
234 		cl = (void *)res.class;
235 		if (!cl) {
236 			if (TC_H_MAJ(res.classid))
237 				cl = cbq_class_lookup(q, res.classid);
238 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
239 				cl = defmap[TC_PRIO_BESTEFFORT];
240 
241 			if (cl == NULL)
242 				goto fallback;
243 		}
244 		if (cl->level >= head->level)
245 			goto fallback;
246 #ifdef CONFIG_NET_CLS_ACT
247 		switch (result) {
248 		case TC_ACT_QUEUED:
249 		case TC_ACT_STOLEN:
250 		case TC_ACT_TRAP:
251 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252 			fallthrough;
253 		case TC_ACT_SHOT:
254 			return NULL;
255 		case TC_ACT_RECLASSIFY:
256 			return cbq_reclassify(skb, cl);
257 		}
258 #endif
259 		if (cl->level == 0)
260 			return cl;
261 
262 		/*
263 		 * Step 3+n. If the classifier selected a link sharing class,
264 		 *	   apply the agency-specific classifier.
265 		 *	   Repeat this procedure until we hit a leaf node.
266 		 */
267 		head = cl;
268 	}
269 
270 fallback:
271 	cl = head;
272 
273 	/*
274 	 * Step 4. No success...
275 	 */
276 	if (TC_H_MAJ(prio) == 0 &&
277 	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
278 	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
279 		return head;
280 
281 	return cl;
282 }
283 
284 /*
285  * A packet has just been enqueued on an empty class.
286  * cbq_activate_class adds the class to the tail of the active class
287  * list of its priority band.
288  */
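
/* A hedged walk-through with made-up classes: if the active ring for a band
 * is A -> B -> A with q->active[prio] == B (the tail), activating C sets
 * C->next_alive = A, B->next_alive = C and q->active[prio] = C, giving the
 * ring A -> B -> C -> A with C as the new tail.
 */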
289 
290 static inline void cbq_activate_class(struct cbq_class *cl)
291 {
292 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
293 	int prio = cl->cpriority;
294 	struct cbq_class *cl_tail;
295 
296 	cl_tail = q->active[prio];
297 	q->active[prio] = cl;
298 
299 	if (cl_tail != NULL) {
300 		cl->next_alive = cl_tail->next_alive;
301 		cl_tail->next_alive = cl;
302 	} else {
303 		cl->next_alive = cl;
304 		q->activemask |= (1<<prio);
305 	}
306 }
307 
308 /*
309  * Unlink class from active chain.
310  * Note that this same procedure is done directly in cbq_dequeue*
311  * during round-robin procedure.
312  */
313 
314 static void cbq_deactivate_class(struct cbq_class *this)
315 {
316 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
317 	int prio = this->cpriority;
318 	struct cbq_class *cl;
319 	struct cbq_class *cl_prev = q->active[prio];
320 
321 	do {
322 		cl = cl_prev->next_alive;
323 		if (cl == this) {
324 			cl_prev->next_alive = cl->next_alive;
325 			cl->next_alive = NULL;
326 
327 			if (cl == q->active[prio]) {
328 				q->active[prio] = cl_prev;
329 				if (cl == q->active[prio]) {
330 					q->active[prio] = NULL;
331 					q->activemask &= ~(1<<prio);
332 					return;
333 				}
334 			}
335 			return;
336 		}
337 	} while ((cl_prev = cl) != q->active[prio]);
338 }
339 
340 static void
341 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
342 {
343 	int toplevel = q->toplevel;
344 
345 	if (toplevel > cl->level) {
346 		psched_time_t now = psched_get_time();
347 
348 		do {
349 			if (cl->undertime < now) {
350 				q->toplevel = cl->level;
351 				return;
352 			}
353 		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
354 	}
355 }
356 
357 static int
358 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
359 	    struct sk_buff **to_free)
360 {
361 	struct cbq_sched_data *q = qdisc_priv(sch);
362 	int ret;
363 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
364 
365 #ifdef CONFIG_NET_CLS_ACT
366 	q->rx_class = cl;
367 #endif
368 	if (cl == NULL) {
369 		if (ret & __NET_XMIT_BYPASS)
370 			qdisc_qstats_drop(sch);
371 		__qdisc_drop(skb, to_free);
372 		return ret;
373 	}
374 
375 	ret = qdisc_enqueue(skb, cl->q, to_free);
376 	if (ret == NET_XMIT_SUCCESS) {
377 		sch->q.qlen++;
378 		cbq_mark_toplevel(q, cl);
379 		if (!cl->next_alive)
380 			cbq_activate_class(cl);
381 		return ret;
382 	}
383 
384 	if (net_xmit_drop_count(ret)) {
385 		qdisc_qstats_drop(sch);
386 		cbq_mark_toplevel(q, cl);
387 		cl->qstats.drops++;
388 	}
389 	return ret;
390 }
391 
392 /* Overlimit action: penalize leaf class by adding offtime */
393 static void cbq_overlimit(struct cbq_class *cl)
394 {
395 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
396 	psched_tdiff_t delay = cl->undertime - q->now;
397 
398 	if (!cl->delayed) {
399 		delay += cl->offtime;
400 
401 		/*
402 		 * The class goes to sleep, so that it will have no
403 		 * chance to work off avgidle. Let's forgive it 8)
404 		 *
405 		 * BTW cbq-2.0 has a bug in this
406 		 * place; apparently they forgot to shift it by cl->ewma_log.
407 		 */
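		/* Illustrative numbers (not from the sources): with avgidle = -3360
		 * and ewma_log = 5, the line below subtracts 3360 - (3360 >> 5)
		 * = 3360 - 105 = 3255 from the computed delay, i.e. most of the
		 * accumulated idle debt is forgiven.
		 */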
408 		if (cl->avgidle < 0)
409 			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
410 		if (cl->avgidle < cl->minidle)
411 			cl->avgidle = cl->minidle;
412 		if (delay <= 0)
413 			delay = 1;
414 		cl->undertime = q->now + delay;
415 
416 		cl->xstats.overactions++;
417 		cl->delayed = 1;
418 	}
419 	if (q->wd_expires == 0 || q->wd_expires > delay)
420 		q->wd_expires = delay;
421 
422 	/* Dirty work! We must schedule wakeups based on
423 	 * real available rate, rather than leaf rate,
424 	 * which may be tiny (even zero).
425 	 */
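	/* For example (hypothetical numbers): if the leaf's own undertime is
	 * 500 ms away but one of its borrow ancestors becomes underlimit in
	 * 3 ms, the loop below shrinks wd_expires to roughly 3 ms so the
	 * watchdog fires when borrowing can actually resume.
	 */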
426 	if (q->toplevel == TC_CBQ_MAXLEVEL) {
427 		struct cbq_class *b;
428 		psched_tdiff_t base_delay = q->wd_expires;
429 
430 		for (b = cl->borrow; b; b = b->borrow) {
431 			delay = b->undertime - q->now;
432 			if (delay < base_delay) {
433 				if (delay <= 0)
434 					delay = 1;
435 				base_delay = delay;
436 			}
437 		}
438 
439 		q->wd_expires = base_delay;
440 	}
441 }
442 
443 /*
444  * This is a mission-critical procedure.
445  *
446  * We "regenerate" the toplevel cutoff if the transmitting class
447  * has backlog and is not regulated. It is not part of the
448  * original CBQ description, but looks more reasonable.
449  * Probably it is wrong. This question needs further investigation.
450  */
451 
452 static inline void
453 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
454 		    struct cbq_class *borrowed)
455 {
456 	if (cl && q->toplevel >= borrowed->level) {
457 		if (cl->q->q.qlen > 1) {
458 			do {
459 				if (borrowed->undertime == PSCHED_PASTPERFECT) {
460 					q->toplevel = borrowed->level;
461 					return;
462 				}
463 			} while ((borrowed = borrowed->borrow) != NULL);
464 		}
465 #if 0
466 	/* It is not necessary now. Uncommenting it
467 	   will save CPU cycles, but decrease fairness.
468 	 */
469 		q->toplevel = TC_CBQ_MAXLEVEL;
470 #endif
471 	}
472 }
473 
474 static void
475 cbq_update(struct cbq_sched_data *q)
476 {
477 	struct cbq_class *this = q->tx_class;
478 	struct cbq_class *cl = this;
479 	int len = q->tx_len;
480 	psched_time_t now;
481 
482 	q->tx_class = NULL;
483 	/* Time integrator. We calculate the EOS (end of service) time
484 	 * by adding the expected packet transmission time.
485 	 */
486 	now = q->now + L2T(&q->link, len);
487 
488 	for ( ; cl; cl = cl->share) {
489 		long avgidle = cl->avgidle;
490 		long idle;
491 
492 		_bstats_update(&cl->bstats, len, 1);
493 
494 		/*
495 		 * (now - last) is total time between packet right edges.
496 		 * (last_pktlen/rate) is "virtual" busy time, so that
497 		 *
498 		 *	idle = (now - last) - last_pktlen/rate
499 		 */
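		/* Made-up numbers for illustration: if 400 us passed between
		 * right edges and this class's virtual busy time for the packet
		 * is 120 us, the true idle time is 400 - 120 = 280 us.
		 */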
500 
501 		idle = now - cl->last;
502 		if ((unsigned long)idle > 128*1024*1024) {
503 			avgidle = cl->maxidle;
504 		} else {
505 			idle -= L2T(cl, len);
506 
507 		/* true_avgidle := (1-W)*true_avgidle + W*idle,
508 		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
509 		 * cl->avgidle == true_avgidle/W,
510 		 * hence:
511 		 */
512 			avgidle += idle - (avgidle>>cl->ewma_log);
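			/* Sanity check with made-up numbers: ewma_log = 5 (W = 1/32),
			 * scaled avgidle = 3200 (true avgidle 100) and idle = 260 give
			 * 3200 + 260 - (3200 >> 5) = 3360, which is the true EWMA
			 * (31/32)*100 + (1/32)*260 = 105 rescaled by 1/W = 32.
			 */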
513 		}
514 
515 		if (avgidle <= 0) {
516 			/* Overlimit or at-limit */
517 
518 			if (avgidle < cl->minidle)
519 				avgidle = cl->minidle;
520 
521 			cl->avgidle = avgidle;
522 
523 			/* Calculate the expected time when this class
524 			 * will be allowed to send.
525 			 * It will occur when:
526 			 * (1-W)*true_avgidle + W*delay = 0, i.e.
527 			 * idle = (1/W - 1)*(-true_avgidle)
528 			 * or
529 			 * idle = (1 - W)*(-cl->avgidle);
530 			 */
531 			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
532 
533 			/*
534 			 * That is not all.
535 			 * To maintain the rate allocated to the class,
536 			 * we add to undertime the virtual clock time
537 			 * needed to complete the transmitted packet.
538 			 * (len/phys_bandwidth has already elapsed
539 			 * by the moment cbq_update is called.)
540 			 */
541 
542 			idle -= L2T(&q->link, len);
543 			idle += L2T(cl, len);
544 
545 			cl->undertime = now + idle;
546 		} else {
547 			/* Underlimit */
548 
549 			cl->undertime = PSCHED_PASTPERFECT;
550 			if (avgidle > cl->maxidle)
551 				cl->avgidle = cl->maxidle;
552 			else
553 				cl->avgidle = avgidle;
554 		}
555 		if ((s64)(now - cl->last) > 0)
556 			cl->last = now;
557 	}
558 
559 	cbq_update_toplevel(q, this, q->tx_borrowed);
560 }
561 
562 static inline struct cbq_class *
563 cbq_under_limit(struct cbq_class *cl)
564 {
565 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
566 	struct cbq_class *this_cl = cl;
567 
568 	if (cl->tparent == NULL)
569 		return cl;
570 
571 	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
572 		cl->delayed = 0;
573 		return cl;
574 	}
575 
576 	do {
577 		/* This is a very suspicious place. Currently the overlimit
578 		 * action is generated for non-bounded classes
579 		 * only if the link is completely congested.
580 		 * Though this agrees with the ancestor-only paradigm,
581 		 * it looks very dubious. In particular,
582 		 * it means that this chunk of code will either
583 		 * never be called or will result in strong amplification
584 		 * of burstiness. Dangerous, silly, and yet
585 		 * no other solution exists.
586 		 */
587 		cl = cl->borrow;
588 		if (!cl) {
589 			this_cl->qstats.overlimits++;
590 			cbq_overlimit(this_cl);
591 			return NULL;
592 		}
593 		if (cl->level > q->toplevel)
594 			return NULL;
595 	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
596 
597 	cl->delayed = 0;
598 	return cl;
599 }
600 
601 static inline struct sk_buff *
602 cbq_dequeue_prio(struct Qdisc *sch, int prio)
603 {
604 	struct cbq_sched_data *q = qdisc_priv(sch);
605 	struct cbq_class *cl_tail, *cl_prev, *cl;
606 	struct sk_buff *skb;
607 	int deficit;
608 
609 	cl_tail = cl_prev = q->active[prio];
610 	cl = cl_prev->next_alive;
611 
612 	do {
613 		deficit = 0;
614 
615 		/* Start round */
616 		do {
617 			struct cbq_class *borrow = cl;
618 
619 			if (cl->q->q.qlen &&
620 			    (borrow = cbq_under_limit(cl)) == NULL)
621 				goto skip_class;
622 
623 			if (cl->deficit <= 0) {
624 				/* Class exhausted its allotment per
625 				 * this round. Switch to the next one.
626 				 */
627 				deficit = 1;
628 				cl->deficit += cl->quantum;
629 				goto next_class;
630 			}
631 
632 			skb = cl->q->dequeue(cl->q);
633 
634 			/* Class did not give us any skb :-(
635 			 * It could occur even if cl->q->q.qlen != 0
636 			 * e.g. if cl->q == "tbf"
637 			 */
638 			if (skb == NULL)
639 				goto skip_class;
640 
641 			cl->deficit -= qdisc_pkt_len(skb);
642 			q->tx_class = cl;
643 			q->tx_borrowed = borrow;
644 			if (borrow != cl) {
645 #ifndef CBQ_XSTATS_BORROWS_BYTES
646 				borrow->xstats.borrows++;
647 				cl->xstats.borrows++;
648 #else
649 				borrow->xstats.borrows += qdisc_pkt_len(skb);
650 				cl->xstats.borrows += qdisc_pkt_len(skb);
651 #endif
652 			}
653 			q->tx_len = qdisc_pkt_len(skb);
654 
655 			if (cl->deficit <= 0) {
656 				q->active[prio] = cl;
657 				cl = cl->next_alive;
658 				cl->deficit += cl->quantum;
659 			}
660 			return skb;
661 
662 skip_class:
663 			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
664 				/* Class is empty or penalized.
665 				 * Unlink it from active chain.
666 				 */
667 				cl_prev->next_alive = cl->next_alive;
668 				cl->next_alive = NULL;
669 
670 				/* Did cl_tail point to it? */
671 				if (cl == cl_tail) {
672 					/* Repair it! */
673 					cl_tail = cl_prev;
674 
675 					/* Was it the last class in this band? */
676 					if (cl == cl_tail) {
677 						/* Kill the band! */
678 						q->active[prio] = NULL;
679 						q->activemask &= ~(1<<prio);
680 						if (cl->q->q.qlen)
681 							cbq_activate_class(cl);
682 						return NULL;
683 					}
684 
685 					q->active[prio] = cl_tail;
686 				}
687 				if (cl->q->q.qlen)
688 					cbq_activate_class(cl);
689 
690 				cl = cl_prev;
691 			}
692 
693 next_class:
694 			cl_prev = cl;
695 			cl = cl->next_alive;
696 		} while (cl_prev != cl_tail);
697 	} while (deficit);
698 
699 	q->active[prio] = cl_prev;
700 
701 	return NULL;
702 }
703 
704 static inline struct sk_buff *
705 cbq_dequeue_1(struct Qdisc *sch)
706 {
707 	struct cbq_sched_data *q = qdisc_priv(sch);
708 	struct sk_buff *skb;
709 	unsigned int activemask;
710 
711 	activemask = q->activemask & 0xFF;
712 	while (activemask) {
713 		int prio = ffz(~activemask);
714 		activemask &= ~(1<<prio);
715 		skb = cbq_dequeue_prio(sch, prio);
716 		if (skb)
717 			return skb;
718 	}
719 	return NULL;
720 }
721 
722 static struct sk_buff *
723 cbq_dequeue(struct Qdisc *sch)
724 {
725 	struct sk_buff *skb;
726 	struct cbq_sched_data *q = qdisc_priv(sch);
727 	psched_time_t now;
728 
729 	now = psched_get_time();
730 
731 	if (q->tx_class)
732 		cbq_update(q);
733 
734 	q->now = now;
735 
736 	for (;;) {
737 		q->wd_expires = 0;
738 
739 		skb = cbq_dequeue_1(sch);
740 		if (skb) {
741 			qdisc_bstats_update(sch, skb);
742 			sch->q.qlen--;
743 			return skb;
744 		}
745 
746 		/* All the classes are overlimit.
747 		 *
748 		 * This is possible if:
749 		 *
750 		 * 1. The scheduler is empty.
751 		 * 2. The toplevel cutoff inhibited borrowing.
752 		 * 3. The root class is overlimit.
753 		 *
754 		 * Reset conditions 2 and 3 and retry.
755 		 *
756 		 * Note that NS and cbq-2.0 are buggy: picking
757 		 * an arbitrary class is appropriate for ancestor-only
758 		 * sharing, but not for the toplevel algorithm.
759 		 *
760 		 * Our version is better but slower, because it requires
761 		 * two passes; this is unavoidable with toplevel sharing.
762 		 */
763 
764 		if (q->toplevel == TC_CBQ_MAXLEVEL &&
765 		    q->link.undertime == PSCHED_PASTPERFECT)
766 			break;
767 
768 		q->toplevel = TC_CBQ_MAXLEVEL;
769 		q->link.undertime = PSCHED_PASTPERFECT;
770 	}
771 
772 	/* No packets in the scheduler, or nobody wants to give them to us :-(
773 	 * Sigh... start the watchdog timer in the latter case.
774 	 */
775 
776 	if (sch->q.qlen) {
777 		qdisc_qstats_overlimit(sch);
778 		if (q->wd_expires)
779 			qdisc_watchdog_schedule(&q->watchdog,
780 						now + q->wd_expires);
781 	}
782 	return NULL;
783 }
784 
785 /* CBQ class maintenance routines */
786 
787 static void cbq_adjust_levels(struct cbq_class *this)
788 {
789 	if (this == NULL)
790 		return;
791 
792 	do {
793 		int level = 0;
794 		struct cbq_class *cl;
795 
796 		cl = this->children;
797 		if (cl) {
798 			do {
799 				if (cl->level > level)
800 					level = cl->level;
801 			} while ((cl = cl->sibling) != this->children);
802 		}
803 		this->level = level + 1;
804 	} while ((this = this->tparent) != NULL);
805 }
806 
807 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
808 {
809 	struct cbq_class *cl;
810 	unsigned int h;
811 
812 	if (q->quanta[prio] == 0)
813 		return;
814 
815 	for (h = 0; h < q->clhash.hashsize; h++) {
816 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
817 			/* BUGGGG... Beware! This expression suffers from
818 			 * arithmetic overflows!
819 			 */
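			/* Illustrative numbers only: three classes at this prio with
			 * weights 1, 2 and 3 (so q->quanta[prio] = 6) and allot = 1514
			 * give the weight-2 class a quantum of 2*1514*3/6 = 1514 bytes.
			 */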
820 			if (cl->priority == prio) {
821 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
822 					q->quanta[prio];
823 			}
824 			if (cl->quantum <= 0 ||
825 			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
826 				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
827 					cl->common.classid, cl->quantum);
828 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
829 			}
830 		}
831 	}
832 }
833 
834 static void cbq_sync_defmap(struct cbq_class *cl)
835 {
836 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
837 	struct cbq_class *split = cl->split;
838 	unsigned int h;
839 	int i;
840 
841 	if (split == NULL)
842 		return;
843 
844 	for (i = 0; i <= TC_PRIO_MAX; i++) {
845 		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
846 			split->defaults[i] = NULL;
847 	}
848 
849 	for (i = 0; i <= TC_PRIO_MAX; i++) {
850 		int level = split->level;
851 
852 		if (split->defaults[i])
853 			continue;
854 
855 		for (h = 0; h < q->clhash.hashsize; h++) {
856 			struct cbq_class *c;
857 
858 			hlist_for_each_entry(c, &q->clhash.hash[h],
859 					     common.hnode) {
860 				if (c->split == split && c->level < level &&
861 				    c->defmap & (1<<i)) {
862 					split->defaults[i] = c;
863 					level = c->level;
864 				}
865 			}
866 		}
867 	}
868 }
869 
870 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
871 {
872 	struct cbq_class *split = NULL;
873 
874 	if (splitid == 0) {
875 		split = cl->split;
876 		if (!split)
877 			return;
878 		splitid = split->common.classid;
879 	}
880 
881 	if (split == NULL || split->common.classid != splitid) {
882 		for (split = cl->tparent; split; split = split->tparent)
883 			if (split->common.classid == splitid)
884 				break;
885 	}
886 
887 	if (split == NULL)
888 		return;
889 
890 	if (cl->split != split) {
891 		cl->defmap = 0;
892 		cbq_sync_defmap(cl);
893 		cl->split = split;
894 		cl->defmap = def & mask;
895 	} else
896 		cl->defmap = (cl->defmap & ~mask) | (def & mask);
897 
898 	cbq_sync_defmap(cl);
899 }
900 
901 static void cbq_unlink_class(struct cbq_class *this)
902 {
903 	struct cbq_class *cl, **clp;
904 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
905 
906 	qdisc_class_hash_remove(&q->clhash, &this->common);
907 
908 	if (this->tparent) {
909 		clp = &this->sibling;
910 		cl = *clp;
911 		do {
912 			if (cl == this) {
913 				*clp = cl->sibling;
914 				break;
915 			}
916 			clp = &cl->sibling;
917 		} while ((cl = *clp) != this->sibling);
918 
919 		if (this->tparent->children == this) {
920 			this->tparent->children = this->sibling;
921 			if (this->sibling == this)
922 				this->tparent->children = NULL;
923 		}
924 	} else {
925 		WARN_ON(this->sibling != this);
926 	}
927 }
928 
929 static void cbq_link_class(struct cbq_class *this)
930 {
931 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
932 	struct cbq_class *parent = this->tparent;
933 
934 	this->sibling = this;
935 	qdisc_class_hash_insert(&q->clhash, &this->common);
936 
937 	if (parent == NULL)
938 		return;
939 
940 	if (parent->children == NULL) {
941 		parent->children = this;
942 	} else {
943 		this->sibling = parent->children->sibling;
944 		parent->children->sibling = this;
945 	}
946 }
947 
948 static void
949 cbq_reset(struct Qdisc *sch)
950 {
951 	struct cbq_sched_data *q = qdisc_priv(sch);
952 	struct cbq_class *cl;
953 	int prio;
954 	unsigned int h;
955 
956 	q->activemask = 0;
957 	q->pmask = 0;
958 	q->tx_class = NULL;
959 	q->tx_borrowed = NULL;
960 	qdisc_watchdog_cancel(&q->watchdog);
961 	q->toplevel = TC_CBQ_MAXLEVEL;
962 	q->now = psched_get_time();
963 
964 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
965 		q->active[prio] = NULL;
966 
967 	for (h = 0; h < q->clhash.hashsize; h++) {
968 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
969 			qdisc_reset(cl->q);
970 
971 			cl->next_alive = NULL;
972 			cl->undertime = PSCHED_PASTPERFECT;
973 			cl->avgidle = cl->maxidle;
974 			cl->deficit = cl->quantum;
975 			cl->cpriority = cl->priority;
976 		}
977 	}
978 }
979 
980 
981 static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
982 {
983 	if (lss->change & TCF_CBQ_LSS_FLAGS) {
984 		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
985 		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
986 	}
987 	if (lss->change & TCF_CBQ_LSS_EWMA)
988 		cl->ewma_log = lss->ewma_log;
989 	if (lss->change & TCF_CBQ_LSS_AVPKT)
990 		cl->avpkt = lss->avpkt;
991 	if (lss->change & TCF_CBQ_LSS_MINIDLE)
992 		cl->minidle = -(long)lss->minidle;
993 	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
994 		cl->maxidle = lss->maxidle;
995 		cl->avgidle = lss->maxidle;
996 	}
997 	if (lss->change & TCF_CBQ_LSS_OFFTIME)
998 		cl->offtime = lss->offtime;
999 }
1000 
1001 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1002 {
1003 	q->nclasses[cl->priority]--;
1004 	q->quanta[cl->priority] -= cl->weight;
1005 	cbq_normalize_quanta(q, cl->priority);
1006 }
1007 
1008 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1009 {
1010 	q->nclasses[cl->priority]++;
1011 	q->quanta[cl->priority] += cl->weight;
1012 	cbq_normalize_quanta(q, cl->priority);
1013 }
1014 
1015 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1016 {
1017 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1018 
1019 	if (wrr->allot)
1020 		cl->allot = wrr->allot;
1021 	if (wrr->weight)
1022 		cl->weight = wrr->weight;
1023 	if (wrr->priority) {
1024 		cl->priority = wrr->priority - 1;
1025 		cl->cpriority = cl->priority;
1026 		if (cl->priority >= cl->priority2)
1027 			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1028 	}
1029 
1030 	cbq_addprio(q, cl);
1031 	return 0;
1032 }
1033 
1034 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1035 {
1036 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1037 	return 0;
1038 }
1039 
1040 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1041 	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1042 	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1043 	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1044 	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1045 	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1046 	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1047 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1048 };
1049 
1050 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1051 			 struct nlattr *opt,
1052 			 struct netlink_ext_ack *extack)
1053 {
1054 	int err;
1055 
1056 	if (!opt) {
1057 		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1058 		return -EINVAL;
1059 	}
1060 
1061 	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1062 					  cbq_policy, extack);
1063 	if (err < 0)
1064 		return err;
1065 
1066 	if (tb[TCA_CBQ_WRROPT]) {
1067 		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1068 
1069 		if (wrr->priority > TC_CBQ_MAXPRIO) {
1070 			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1071 			err = -EINVAL;
1072 		}
1073 	}
1074 	return err;
1075 }
1076 
1077 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1078 		    struct netlink_ext_ack *extack)
1079 {
1080 	struct cbq_sched_data *q = qdisc_priv(sch);
1081 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1082 	struct tc_ratespec *r;
1083 	int err;
1084 
1085 	qdisc_watchdog_init(&q->watchdog, sch);
1086 
1087 	err = cbq_opt_parse(tb, opt, extack);
1088 	if (err < 0)
1089 		return err;
1090 
1091 	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1092 		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1093 		return -EINVAL;
1094 	}
1095 
1096 	r = nla_data(tb[TCA_CBQ_RATE]);
1097 
1098 	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1099 	if (!q->link.R_tab)
1100 		return -EINVAL;
1101 
1102 	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1103 	if (err)
1104 		goto put_rtab;
1105 
1106 	err = qdisc_class_hash_init(&q->clhash);
1107 	if (err < 0)
1108 		goto put_block;
1109 
1110 	q->link.sibling = &q->link;
1111 	q->link.common.classid = sch->handle;
1112 	q->link.qdisc = sch;
1113 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1114 				      sch->handle, NULL);
1115 	if (!q->link.q)
1116 		q->link.q = &noop_qdisc;
1117 	else
1118 		qdisc_hash_add(q->link.q, true);
1119 
1120 	q->link.priority = TC_CBQ_MAXPRIO - 1;
1121 	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1122 	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1123 	q->link.allot = psched_mtu(qdisc_dev(sch));
1124 	q->link.quantum = q->link.allot;
1125 	q->link.weight = q->link.R_tab->rate.rate;
1126 
1127 	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1128 	q->link.avpkt = q->link.allot/2;
1129 	q->link.minidle = -0x7FFFFFFF;
1130 
1131 	q->toplevel = TC_CBQ_MAXLEVEL;
1132 	q->now = psched_get_time();
1133 
1134 	cbq_link_class(&q->link);
1135 
1136 	if (tb[TCA_CBQ_LSSOPT])
1137 		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1138 
1139 	cbq_addprio(q, &q->link);
1140 	return 0;
1141 
1142 put_block:
1143 	tcf_block_put(q->link.block);
1144 
1145 put_rtab:
1146 	qdisc_put_rtab(q->link.R_tab);
1147 	return err;
1148 }
1149 
1150 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1151 {
1152 	unsigned char *b = skb_tail_pointer(skb);
1153 
1154 	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1155 		goto nla_put_failure;
1156 	return skb->len;
1157 
1158 nla_put_failure:
1159 	nlmsg_trim(skb, b);
1160 	return -1;
1161 }
1162 
1163 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1164 {
1165 	unsigned char *b = skb_tail_pointer(skb);
1166 	struct tc_cbq_lssopt opt;
1167 
1168 	opt.flags = 0;
1169 	if (cl->borrow == NULL)
1170 		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1171 	if (cl->share == NULL)
1172 		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1173 	opt.ewma_log = cl->ewma_log;
1174 	opt.level = cl->level;
1175 	opt.avpkt = cl->avpkt;
1176 	opt.maxidle = cl->maxidle;
1177 	opt.minidle = (u32)(-cl->minidle);
1178 	opt.offtime = cl->offtime;
1179 	opt.change = ~0;
1180 	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1181 		goto nla_put_failure;
1182 	return skb->len;
1183 
1184 nla_put_failure:
1185 	nlmsg_trim(skb, b);
1186 	return -1;
1187 }
1188 
1189 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1190 {
1191 	unsigned char *b = skb_tail_pointer(skb);
1192 	struct tc_cbq_wrropt opt;
1193 
1194 	memset(&opt, 0, sizeof(opt));
1195 	opt.flags = 0;
1196 	opt.allot = cl->allot;
1197 	opt.priority = cl->priority + 1;
1198 	opt.cpriority = cl->cpriority + 1;
1199 	opt.weight = cl->weight;
1200 	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1201 		goto nla_put_failure;
1202 	return skb->len;
1203 
1204 nla_put_failure:
1205 	nlmsg_trim(skb, b);
1206 	return -1;
1207 }
1208 
1209 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1210 {
1211 	unsigned char *b = skb_tail_pointer(skb);
1212 	struct tc_cbq_fopt opt;
1213 
1214 	if (cl->split || cl->defmap) {
1215 		opt.split = cl->split ? cl->split->common.classid : 0;
1216 		opt.defmap = cl->defmap;
1217 		opt.defchange = ~0;
1218 		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1219 			goto nla_put_failure;
1220 	}
1221 	return skb->len;
1222 
1223 nla_put_failure:
1224 	nlmsg_trim(skb, b);
1225 	return -1;
1226 }
1227 
1228 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1229 {
1230 	if (cbq_dump_lss(skb, cl) < 0 ||
1231 	    cbq_dump_rate(skb, cl) < 0 ||
1232 	    cbq_dump_wrr(skb, cl) < 0 ||
1233 	    cbq_dump_fopt(skb, cl) < 0)
1234 		return -1;
1235 	return 0;
1236 }
1237 
1238 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1239 {
1240 	struct cbq_sched_data *q = qdisc_priv(sch);
1241 	struct nlattr *nest;
1242 
1243 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1244 	if (nest == NULL)
1245 		goto nla_put_failure;
1246 	if (cbq_dump_attr(skb, &q->link) < 0)
1247 		goto nla_put_failure;
1248 	return nla_nest_end(skb, nest);
1249 
1250 nla_put_failure:
1251 	nla_nest_cancel(skb, nest);
1252 	return -1;
1253 }
1254 
1255 static int
1256 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1257 {
1258 	struct cbq_sched_data *q = qdisc_priv(sch);
1259 
1260 	q->link.xstats.avgidle = q->link.avgidle;
1261 	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1262 }
1263 
1264 static int
1265 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1266 	       struct sk_buff *skb, struct tcmsg *tcm)
1267 {
1268 	struct cbq_class *cl = (struct cbq_class *)arg;
1269 	struct nlattr *nest;
1270 
1271 	if (cl->tparent)
1272 		tcm->tcm_parent = cl->tparent->common.classid;
1273 	else
1274 		tcm->tcm_parent = TC_H_ROOT;
1275 	tcm->tcm_handle = cl->common.classid;
1276 	tcm->tcm_info = cl->q->handle;
1277 
1278 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1279 	if (nest == NULL)
1280 		goto nla_put_failure;
1281 	if (cbq_dump_attr(skb, cl) < 0)
1282 		goto nla_put_failure;
1283 	return nla_nest_end(skb, nest);
1284 
1285 nla_put_failure:
1286 	nla_nest_cancel(skb, nest);
1287 	return -1;
1288 }
1289 
1290 static int
1291 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1292 	struct gnet_dump *d)
1293 {
1294 	struct cbq_sched_data *q = qdisc_priv(sch);
1295 	struct cbq_class *cl = (struct cbq_class *)arg;
1296 	__u32 qlen;
1297 
1298 	cl->xstats.avgidle = cl->avgidle;
1299 	cl->xstats.undertime = 0;
1300 	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1301 
1302 	if (cl->undertime != PSCHED_PASTPERFECT)
1303 		cl->xstats.undertime = cl->undertime - q->now;
1304 
1305 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1306 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1307 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1308 		return -1;
1309 
1310 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1311 }
1312 
1313 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1314 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1315 {
1316 	struct cbq_class *cl = (struct cbq_class *)arg;
1317 
1318 	if (new == NULL) {
1319 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1320 					cl->common.classid, extack);
1321 		if (new == NULL)
1322 			return -ENOBUFS;
1323 	}
1324 
1325 	*old = qdisc_replace(sch, new, &cl->q);
1326 	return 0;
1327 }
1328 
1329 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1330 {
1331 	struct cbq_class *cl = (struct cbq_class *)arg;
1332 
1333 	return cl->q;
1334 }
1335 
1336 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1337 {
1338 	struct cbq_class *cl = (struct cbq_class *)arg;
1339 
1340 	cbq_deactivate_class(cl);
1341 }
1342 
1343 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1344 {
1345 	struct cbq_sched_data *q = qdisc_priv(sch);
1346 
1347 	return (unsigned long)cbq_class_lookup(q, classid);
1348 }
1349 
1350 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1351 {
1352 	struct cbq_sched_data *q = qdisc_priv(sch);
1353 
1354 	WARN_ON(cl->filters);
1355 
1356 	tcf_block_put(cl->block);
1357 	qdisc_put(cl->q);
1358 	qdisc_put_rtab(cl->R_tab);
1359 	gen_kill_estimator(&cl->rate_est);
1360 	if (cl != &q->link)
1361 		kfree(cl);
1362 }
1363 
1364 static void cbq_destroy(struct Qdisc *sch)
1365 {
1366 	struct cbq_sched_data *q = qdisc_priv(sch);
1367 	struct hlist_node *next;
1368 	struct cbq_class *cl;
1369 	unsigned int h;
1370 
1371 #ifdef CONFIG_NET_CLS_ACT
1372 	q->rx_class = NULL;
1373 #endif
1374 	/*
1375 	 * Filters must be destroyed first because we don't destroy the
1376 	 * classes from root to leaves, which means that filters can still
1377 	 * be bound to classes which have been destroyed already. --TGR '04
1378 	 */
1379 	for (h = 0; h < q->clhash.hashsize; h++) {
1380 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1381 			tcf_block_put(cl->block);
1382 			cl->block = NULL;
1383 		}
1384 	}
1385 	for (h = 0; h < q->clhash.hashsize; h++) {
1386 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1387 					  common.hnode)
1388 			cbq_destroy_class(sch, cl);
1389 	}
1390 	qdisc_class_hash_destroy(&q->clhash);
1391 }
1392 
1393 static int
1394 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1395 		 unsigned long *arg, struct netlink_ext_ack *extack)
1396 {
1397 	int err;
1398 	struct cbq_sched_data *q = qdisc_priv(sch);
1399 	struct cbq_class *cl = (struct cbq_class *)*arg;
1400 	struct nlattr *opt = tca[TCA_OPTIONS];
1401 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1402 	struct cbq_class *parent;
1403 	struct qdisc_rate_table *rtab = NULL;
1404 
1405 	err = cbq_opt_parse(tb, opt, extack);
1406 	if (err < 0)
1407 		return err;
1408 
1409 	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1410 		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1411 		return -EOPNOTSUPP;
1412 	}
1413 
1414 	if (cl) {
1415 		/* Check parent */
1416 		if (parentid) {
1417 			if (cl->tparent &&
1418 			    cl->tparent->common.classid != parentid) {
1419 				NL_SET_ERR_MSG(extack, "Invalid parent id");
1420 				return -EINVAL;
1421 			}
1422 			if (!cl->tparent && parentid != TC_H_ROOT) {
1423 				NL_SET_ERR_MSG(extack, "Parent must be root");
1424 				return -EINVAL;
1425 			}
1426 		}
1427 
1428 		if (tb[TCA_CBQ_RATE]) {
1429 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1430 					      tb[TCA_CBQ_RTAB], extack);
1431 			if (rtab == NULL)
1432 				return -EINVAL;
1433 		}
1434 
1435 		if (tca[TCA_RATE]) {
1436 			err = gen_replace_estimator(&cl->bstats, NULL,
1437 						    &cl->rate_est,
1438 						    NULL,
1439 						    true,
1440 						    tca[TCA_RATE]);
1441 			if (err) {
1442 				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1443 				qdisc_put_rtab(rtab);
1444 				return err;
1445 			}
1446 		}
1447 
1448 		/* Change class parameters */
1449 		sch_tree_lock(sch);
1450 
1451 		if (cl->next_alive != NULL)
1452 			cbq_deactivate_class(cl);
1453 
1454 		if (rtab) {
1455 			qdisc_put_rtab(cl->R_tab);
1456 			cl->R_tab = rtab;
1457 		}
1458 
1459 		if (tb[TCA_CBQ_LSSOPT])
1460 			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1461 
1462 		if (tb[TCA_CBQ_WRROPT]) {
1463 			cbq_rmprio(q, cl);
1464 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1465 		}
1466 
1467 		if (tb[TCA_CBQ_FOPT])
1468 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1469 
1470 		if (cl->q->q.qlen)
1471 			cbq_activate_class(cl);
1472 
1473 		sch_tree_unlock(sch);
1474 
1475 		return 0;
1476 	}
1477 
1478 	if (parentid == TC_H_ROOT)
1479 		return -EINVAL;
1480 
1481 	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1482 		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1483 		return -EINVAL;
1484 	}
1485 
1486 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1487 			      extack);
1488 	if (rtab == NULL)
1489 		return -EINVAL;
1490 
1491 	if (classid) {
1492 		err = -EINVAL;
1493 		if (TC_H_MAJ(classid ^ sch->handle) ||
1494 		    cbq_class_lookup(q, classid)) {
1495 			NL_SET_ERR_MSG(extack, "Specified class not found");
1496 			goto failure;
1497 		}
1498 	} else {
1499 		int i;
1500 		classid = TC_H_MAKE(sch->handle, 0x8000);
1501 
1502 		for (i = 0; i < 0x8000; i++) {
1503 			if (++q->hgenerator >= 0x8000)
1504 				q->hgenerator = 1;
1505 			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1506 				break;
1507 		}
1508 		err = -ENOSR;
1509 		if (i >= 0x8000) {
1510 			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1511 			goto failure;
1512 		}
1513 		classid = classid|q->hgenerator;
1514 	}
1515 
1516 	parent = &q->link;
1517 	if (parentid) {
1518 		parent = cbq_class_lookup(q, parentid);
1519 		err = -EINVAL;
1520 		if (!parent) {
1521 			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1522 			goto failure;
1523 		}
1524 	}
1525 
1526 	err = -ENOBUFS;
1527 	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1528 	if (cl == NULL)
1529 		goto failure;
1530 
1531 	gnet_stats_basic_sync_init(&cl->bstats);
1532 	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1533 	if (err) {
1534 		kfree(cl);
1535 		goto failure;
1536 	}
1537 
1538 	if (tca[TCA_RATE]) {
1539 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1540 					NULL, true, tca[TCA_RATE]);
1541 		if (err) {
1542 			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1543 			tcf_block_put(cl->block);
1544 			kfree(cl);
1545 			goto failure;
1546 		}
1547 	}
1548 
1549 	cl->R_tab = rtab;
1550 	rtab = NULL;
1551 	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1552 				  NULL);
1553 	if (!cl->q)
1554 		cl->q = &noop_qdisc;
1555 	else
1556 		qdisc_hash_add(cl->q, true);
1557 
1558 	cl->common.classid = classid;
1559 	cl->tparent = parent;
1560 	cl->qdisc = sch;
1561 	cl->allot = parent->allot;
1562 	cl->quantum = cl->allot;
1563 	cl->weight = cl->R_tab->rate.rate;
1564 
1565 	sch_tree_lock(sch);
1566 	cbq_link_class(cl);
1567 	cl->borrow = cl->tparent;
1568 	if (cl->tparent != &q->link)
1569 		cl->share = cl->tparent;
1570 	cbq_adjust_levels(parent);
1571 	cl->minidle = -0x7FFFFFFF;
1572 	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1573 	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1574 	if (cl->ewma_log == 0)
1575 		cl->ewma_log = q->link.ewma_log;
1576 	if (cl->maxidle == 0)
1577 		cl->maxidle = q->link.maxidle;
1578 	if (cl->avpkt == 0)
1579 		cl->avpkt = q->link.avpkt;
1580 	if (tb[TCA_CBQ_FOPT])
1581 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1582 	sch_tree_unlock(sch);
1583 
1584 	qdisc_class_hash_grow(sch, &q->clhash);
1585 
1586 	*arg = (unsigned long)cl;
1587 	return 0;
1588 
1589 failure:
1590 	qdisc_put_rtab(rtab);
1591 	return err;
1592 }
1593 
1594 static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1595 		      struct netlink_ext_ack *extack)
1596 {
1597 	struct cbq_sched_data *q = qdisc_priv(sch);
1598 	struct cbq_class *cl = (struct cbq_class *)arg;
1599 
1600 	if (cl->filters || cl->children || cl == &q->link)
1601 		return -EBUSY;
1602 
1603 	sch_tree_lock(sch);
1604 
1605 	qdisc_purge_queue(cl->q);
1606 
1607 	if (cl->next_alive)
1608 		cbq_deactivate_class(cl);
1609 
1610 	if (q->tx_borrowed == cl)
1611 		q->tx_borrowed = q->tx_class;
1612 	if (q->tx_class == cl) {
1613 		q->tx_class = NULL;
1614 		q->tx_borrowed = NULL;
1615 	}
1616 #ifdef CONFIG_NET_CLS_ACT
1617 	if (q->rx_class == cl)
1618 		q->rx_class = NULL;
1619 #endif
1620 
1621 	cbq_unlink_class(cl);
1622 	cbq_adjust_levels(cl->tparent);
1623 	cl->defmap = 0;
1624 	cbq_sync_defmap(cl);
1625 
1626 	cbq_rmprio(q, cl);
1627 	sch_tree_unlock(sch);
1628 
1629 	cbq_destroy_class(sch, cl);
1630 	return 0;
1631 }
1632 
1633 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1634 				       struct netlink_ext_ack *extack)
1635 {
1636 	struct cbq_sched_data *q = qdisc_priv(sch);
1637 	struct cbq_class *cl = (struct cbq_class *)arg;
1638 
1639 	if (cl == NULL)
1640 		cl = &q->link;
1641 
1642 	return cl->block;
1643 }
1644 
1645 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1646 				     u32 classid)
1647 {
1648 	struct cbq_sched_data *q = qdisc_priv(sch);
1649 	struct cbq_class *p = (struct cbq_class *)parent;
1650 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1651 
1652 	if (cl) {
1653 		if (p && p->level <= cl->level)
1654 			return 0;
1655 		cl->filters++;
1656 		return (unsigned long)cl;
1657 	}
1658 	return 0;
1659 }
1660 
1661 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1662 {
1663 	struct cbq_class *cl = (struct cbq_class *)arg;
1664 
1665 	cl->filters--;
1666 }
1667 
1668 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1669 {
1670 	struct cbq_sched_data *q = qdisc_priv(sch);
1671 	struct cbq_class *cl;
1672 	unsigned int h;
1673 
1674 	if (arg->stop)
1675 		return;
1676 
1677 	for (h = 0; h < q->clhash.hashsize; h++) {
1678 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1679 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
1680 				return;
1681 		}
1682 	}
1683 }
1684 
1685 static const struct Qdisc_class_ops cbq_class_ops = {
1686 	.graft		=	cbq_graft,
1687 	.leaf		=	cbq_leaf,
1688 	.qlen_notify	=	cbq_qlen_notify,
1689 	.find		=	cbq_find,
1690 	.change		=	cbq_change_class,
1691 	.delete		=	cbq_delete,
1692 	.walk		=	cbq_walk,
1693 	.tcf_block	=	cbq_tcf_block,
1694 	.bind_tcf	=	cbq_bind_filter,
1695 	.unbind_tcf	=	cbq_unbind_filter,
1696 	.dump		=	cbq_dump_class,
1697 	.dump_stats	=	cbq_dump_class_stats,
1698 };
1699 
1700 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1701 	.next		=	NULL,
1702 	.cl_ops		=	&cbq_class_ops,
1703 	.id		=	"cbq",
1704 	.priv_size	=	sizeof(struct cbq_sched_data),
1705 	.enqueue	=	cbq_enqueue,
1706 	.dequeue	=	cbq_dequeue,
1707 	.peek		=	qdisc_peek_dequeued,
1708 	.init		=	cbq_init,
1709 	.reset		=	cbq_reset,
1710 	.destroy	=	cbq_destroy,
1711 	.change		=	NULL,
1712 	.dump		=	cbq_dump,
1713 	.dump_stats	=	cbq_dump_stats,
1714 	.owner		=	THIS_MODULE,
1715 };
1716 
1717 static int __init cbq_module_init(void)
1718 {
1719 	return register_qdisc(&cbq_qdisc_ops);
1720 }
1721 static void __exit cbq_module_exit(void)
1722 {
1723 	unregister_qdisc(&cbq_qdisc_ops);
1724 }
1725 module_init(cbq_module_init)
1726 module_exit(cbq_module_exit)
1727 MODULE_LICENSE("GPL");
1728