1 /*
2  * net/sched/sch_cbq.c	Class-Based Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 
24 
25 /*	Class-Based Queueing (CBQ) algorithm.
26 	=======================================
27 
28 	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
29 		 Management Models for Packet Networks",
30 		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
31 
32 		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
33 
34 		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
35 		 Parameters", 1996
36 
37 		 [4] Sally Floyd and Michael Speer, "Experimental Results
38 		 for Class-Based Queueing", 1998, not published.
39 
40 	-----------------------------------------------------------------------
41 
42 	Algorithm skeleton was taken from NS simulator cbq.cc.
43 	If someone wants to check this code against the LBL version,
44 	they should take into account that ONLY the skeleton was borrowed;
45 	the implementation is different. In particular:
46 
47 	--- The WRR algorithm is different. Our version looks more
48 	reasonable (I hope) and works when quanta are allowed to be
49 	less than MTU, which is always the case when real-time classes
50 	have small rates. Note that the statement of [3] is
51 	incomplete: delay may actually be estimated even if the class
52 	per-round allotment is less than MTU. Namely, if the per-round
53 	allotment is W*r_i, and r_1+...+r_k = r < 1, then
54 
55 	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
56 
57 	In the worst case we have IntServ estimate with D = W*r+k*MTU
58 	and C = MTU*r. The proof (if correct at all) is trivial.
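
	A purely illustrative plug-in of numbers (made up, not from the
	papers): with MTU = 1000 bytes, W = 10000 bytes, r_i = 0.1 (so the
	per-round allotment W*r_i equals one MTU), r = 0.5, k = 4 classes
	and B = 10^6 bytes/sec, the bound gives
	delay_i <= (1*5000 + 5000 + 4000)/B = 14000 bytes/B = 14 ms.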
59 
60 
61 	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
62 	interpret some places, which look like wrong translations
63 	from NS. Anyone is advised to find these differences
64 	and explain to me why I am wrong 8).
65 
66 	--- Linux has no EOI event, so we cannot estimate true class
67 	idle time. The workaround is to consider the next dequeue event
68 	as a sign that the previous packet has finished. This is wrong because of
69 	internal device queueing, but on a permanently loaded link it is true.
70 	Moreover, combined with clock integrator, this scheme looks
71 	very close to an ideal solution.  */
72 
73 struct cbq_sched_data;
74 
75 
76 struct cbq_class {
77 	struct Qdisc_class_common common;
78 	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
79 
80 /* Parameters */
81 	unsigned char		priority;	/* class priority */
82 	unsigned char		priority2;	/* priority to be used after overlimit */
83 	unsigned char		ewma_log;	/* time constant for idle time calculation */
84 
85 	u32			defmap;
86 
87 	/* Link-sharing scheduler parameters */
88 	long			maxidle;	/* Class parameters: see below. */
89 	long			offtime;
90 	long			minidle;
91 	u32			avpkt;
92 	struct qdisc_rate_table	*R_tab;
93 
94 	/* General scheduler (WRR) parameters */
95 	long			allot;
96 	long			quantum;	/* Allotment per WRR round */
97 	long			weight;		/* Relative allotment: see below */
98 
99 	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
100 	struct cbq_class	*split;		/* Ptr to split node */
101 	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
102 	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
103 	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
104 						   parent otherwise */
105 	struct cbq_class	*sibling;	/* Sibling chain */
106 	struct cbq_class	*children;	/* Pointer to children chain */
107 
108 	struct Qdisc		*q;		/* Elementary queueing discipline */
109 
110 
111 /* Variables */
112 	unsigned char		cpriority;	/* Effective priority */
113 	unsigned char		delayed;
114 	unsigned char		level;		/* level of the class in hierarchy:
115 						   0 for leaf classes, and maximal
116 						   level of children + 1 for nodes.
117 						 */
118 
119 	psched_time_t		last;		/* Last end of service */
120 	psched_time_t		undertime;
121 	long			avgidle;
122 	long			deficit;	/* Saved deficit for WRR */
123 	psched_time_t		penalized;
124 	struct gnet_stats_basic_packed bstats;
125 	struct gnet_stats_queue qstats;
126 	struct net_rate_estimator __rcu *rate_est;
127 	struct tc_cbq_xstats	xstats;
128 
129 	struct tcf_proto __rcu	*filter_list;
130 	struct tcf_block	*block;
131 
132 	int			filters;
133 
134 	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
135 };
136 
137 struct cbq_sched_data {
138 	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
139 	int			nclasses[TC_CBQ_MAXPRIO + 1];
140 	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
141 
142 	struct cbq_class	link;
143 
144 	unsigned int		activemask;
145 	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
146 								   with backlog */
147 
148 #ifdef CONFIG_NET_CLS_ACT
149 	struct cbq_class	*rx_class;
150 #endif
151 	struct cbq_class	*tx_class;
152 	struct cbq_class	*tx_borrowed;
153 	int			tx_len;
154 	psched_time_t		now;		/* Cached timestamp */
155 	unsigned int		pmask;
156 
157 	struct hrtimer		delay_timer;
158 	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
159 						   started when CBQ has
160 						   backlog, but cannot
161 						   transmit just now */
162 	psched_tdiff_t		wd_expires;
163 	int			toplevel;
164 	u32			hgenerator;
165 };
166 
167 
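/* L2T ("length to time"): convert a packet length into its transmission
 * time at the class's configured rate, using the rate table supplied
 * from user space.
 */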
168 #define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
169 
170 static inline struct cbq_class *
171 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
172 {
173 	struct Qdisc_class_common *clc;
174 
175 	clc = qdisc_class_find(&q->clhash, classid);
176 	if (clc == NULL)
177 		return NULL;
178 	return container_of(clc, struct cbq_class, common);
179 }
180 
181 #ifdef CONFIG_NET_CLS_ACT
182 
183 static struct cbq_class *
184 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
185 {
186 	struct cbq_class *cl;
187 
188 	for (cl = this->tparent; cl; cl = cl->tparent) {
189 		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
190 
191 		if (new != NULL && new != this)
192 			return new;
193 	}
194 	return NULL;
195 }
196 
197 #endif
198 
199 /* Classify packet. The procedure is pretty complicated, but
200  * it allows us to combine link sharing and priority scheduling
201  * transparently.
202  *
203  * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
204  * so that it resolves to split nodes. Then packets are classified
205  * by logical priority, or a more specific classifier may be attached
206  * to the split node.
207  */
208 
209 static struct cbq_class *
210 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
211 {
212 	struct cbq_sched_data *q = qdisc_priv(sch);
213 	struct cbq_class *head = &q->link;
214 	struct cbq_class **defmap;
215 	struct cbq_class *cl = NULL;
216 	u32 prio = skb->priority;
217 	struct tcf_proto *fl;
218 	struct tcf_result res;
219 
220 	/*
221 	 *  Step 1. If skb->priority points to one of our classes, use it.
222 	 */
223 	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
224 	    (cl = cbq_class_lookup(q, prio)) != NULL)
225 		return cl;
226 
227 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
228 	for (;;) {
229 		int result = 0;
230 		defmap = head->defaults;
231 
232 		fl = rcu_dereference_bh(head->filter_list);
233 		/*
234 		 * Step 2+n. Apply classifier.
235 		 */
236 		result = tcf_classify(skb, fl, &res, true);
237 		if (!fl || result < 0)
238 			goto fallback;
239 
240 		cl = (void *)res.class;
241 		if (!cl) {
242 			if (TC_H_MAJ(res.classid))
243 				cl = cbq_class_lookup(q, res.classid);
244 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
245 				cl = defmap[TC_PRIO_BESTEFFORT];
246 
247 			if (cl == NULL)
248 				goto fallback;
249 		}
250 		if (cl->level >= head->level)
251 			goto fallback;
252 #ifdef CONFIG_NET_CLS_ACT
253 		switch (result) {
254 		case TC_ACT_QUEUED:
255 		case TC_ACT_STOLEN:
256 		case TC_ACT_TRAP:
257 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
258 			/* fall through */
259 		case TC_ACT_SHOT:
260 			return NULL;
261 		case TC_ACT_RECLASSIFY:
262 			return cbq_reclassify(skb, cl);
263 		}
264 #endif
265 		if (cl->level == 0)
266 			return cl;
267 
268 		/*
269 		 * Step 3+n. If the classifier selected a link-sharing class,
270 		 *	   apply the agency-specific classifier.
271 		 *	   Repeat this procedure until we hit a leaf node.
272 		 */
273 		head = cl;
274 	}
275 
276 fallback:
277 	cl = head;
278 
279 	/*
280 	 * Step 4. No success...
281 	 */
282 	if (TC_H_MAJ(prio) == 0 &&
283 	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
284 	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
285 		return head;
286 
287 	return cl;
288 }
289 
290 /*
291  * A packet has just been enqueued on an empty class.
292  * cbq_activate_class adds the class to the tail of the active class list
293  * of its priority band.
294  */
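
/*
 * Note: each per-priority active list is circular and singly linked;
 * q->active[prio] points at the most recently activated class (the
 * "tail"), and tail->next_alive is the head from which the weighted
 * round-robin scan starts.
 */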
295 
296 static inline void cbq_activate_class(struct cbq_class *cl)
297 {
298 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
299 	int prio = cl->cpriority;
300 	struct cbq_class *cl_tail;
301 
302 	cl_tail = q->active[prio];
303 	q->active[prio] = cl;
304 
305 	if (cl_tail != NULL) {
306 		cl->next_alive = cl_tail->next_alive;
307 		cl_tail->next_alive = cl;
308 	} else {
309 		cl->next_alive = cl;
310 		q->activemask |= (1<<prio);
311 	}
312 }
313 
314 /*
315  * Unlink class from active chain.
316  * Note that this same procedure is done directly in cbq_dequeue*
317  * during the round-robin procedure.
318  */
319 
320 static void cbq_deactivate_class(struct cbq_class *this)
321 {
322 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
323 	int prio = this->cpriority;
324 	struct cbq_class *cl;
325 	struct cbq_class *cl_prev = q->active[prio];
326 
327 	do {
328 		cl = cl_prev->next_alive;
329 		if (cl == this) {
330 			cl_prev->next_alive = cl->next_alive;
331 			cl->next_alive = NULL;
332 
333 			if (cl == q->active[prio]) {
334 				q->active[prio] = cl_prev;
335 				if (cl == q->active[prio]) {
336 					q->active[prio] = NULL;
337 					q->activemask &= ~(1<<prio);
338 					return;
339 				}
340 			}
341 			return;
342 		}
343 	} while ((cl_prev = cl) != q->active[prio]);
344 }
345 
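/*
 * A rough summary: cbq_mark_toplevel() implements the "top level"
 * link-sharing guideline from [1]. q->toplevel caches the level of an
 * underlimit class on the borrow chain of the class being enqueued to,
 * and cbq_under_limit() refuses to borrow from ancestors above that
 * level.
 */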
346 static void
347 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
348 {
349 	int toplevel = q->toplevel;
350 
351 	if (toplevel > cl->level) {
352 		psched_time_t now = psched_get_time();
353 
354 		do {
355 			if (cl->undertime < now) {
356 				q->toplevel = cl->level;
357 				return;
358 			}
359 		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
360 	}
361 }
362 
363 static int
364 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
365 	    struct sk_buff **to_free)
366 {
367 	struct cbq_sched_data *q = qdisc_priv(sch);
368 	int uninitialized_var(ret);
369 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
370 
371 #ifdef CONFIG_NET_CLS_ACT
372 	q->rx_class = cl;
373 #endif
374 	if (cl == NULL) {
375 		if (ret & __NET_XMIT_BYPASS)
376 			qdisc_qstats_drop(sch);
377 		__qdisc_drop(skb, to_free);
378 		return ret;
379 	}
380 
381 	ret = qdisc_enqueue(skb, cl->q, to_free);
382 	if (ret == NET_XMIT_SUCCESS) {
383 		sch->q.qlen++;
384 		cbq_mark_toplevel(q, cl);
385 		if (!cl->next_alive)
386 			cbq_activate_class(cl);
387 		return ret;
388 	}
389 
390 	if (net_xmit_drop_count(ret)) {
391 		qdisc_qstats_drop(sch);
392 		cbq_mark_toplevel(q, cl);
393 		cl->qstats.drops++;
394 	}
395 	return ret;
396 }
397 
398 /* Overlimit action: penalize leaf class by adding offtime */
399 static void cbq_overlimit(struct cbq_class *cl)
400 {
401 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
402 	psched_tdiff_t delay = cl->undertime - q->now;
403 
404 	if (!cl->delayed) {
405 		delay += cl->offtime;
406 
407 		/*
408 		 * Class goes to sleep, so that it will have no
409 		 * chance to work off its avgidle. Let's forgive it 8)
410 		 *
411 		 * BTW cbq-2.0 has a bug in this
412 		 * place; apparently they forgot to shift it by cl->ewma_log.
413 		 */
414 		if (cl->avgidle < 0)
415 			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
416 		if (cl->avgidle < cl->minidle)
417 			cl->avgidle = cl->minidle;
418 		if (delay <= 0)
419 			delay = 1;
420 		cl->undertime = q->now + delay;
421 
422 		cl->xstats.overactions++;
423 		cl->delayed = 1;
424 	}
425 	if (q->wd_expires == 0 || q->wd_expires > delay)
426 		q->wd_expires = delay;
427 
428 	/* Dirty work! We must schedule wakeups based on
429 	 * real available rate, rather than leaf rate,
430 	 * which may be tiny (even zero).
431 	 */
432 	if (q->toplevel == TC_CBQ_MAXLEVEL) {
433 		struct cbq_class *b;
434 		psched_tdiff_t base_delay = q->wd_expires;
435 
436 		for (b = cl->borrow; b; b = b->borrow) {
437 			delay = b->undertime - q->now;
438 			if (delay < base_delay) {
439 				if (delay <= 0)
440 					delay = 1;
441 				base_delay = delay;
442 			}
443 		}
444 
445 		q->wd_expires = base_delay;
446 	}
447 }
448 
449 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
450 				       psched_time_t now)
451 {
452 	struct cbq_class *cl;
453 	struct cbq_class *cl_prev = q->active[prio];
454 	psched_time_t sched = now;
455 
456 	if (cl_prev == NULL)
457 		return 0;
458 
459 	do {
460 		cl = cl_prev->next_alive;
461 		if (now - cl->penalized > 0) {
462 			cl_prev->next_alive = cl->next_alive;
463 			cl->next_alive = NULL;
464 			cl->cpriority = cl->priority;
465 			cl->delayed = 0;
466 			cbq_activate_class(cl);
467 
468 			if (cl == q->active[prio]) {
469 				q->active[prio] = cl_prev;
470 				if (cl == q->active[prio]) {
471 					q->active[prio] = NULL;
472 					return 0;
473 				}
474 			}
475 
476 			cl = cl_prev->next_alive;
477 		} else if (sched - cl->penalized > 0)
478 			sched = cl->penalized;
479 	} while ((cl_prev = cl) != q->active[prio]);
480 
481 	return sched - now;
482 }
483 
484 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
485 {
486 	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
487 						delay_timer);
488 	struct Qdisc *sch = q->watchdog.qdisc;
489 	psched_time_t now;
490 	psched_tdiff_t delay = 0;
491 	unsigned int pmask;
492 
493 	now = psched_get_time();
494 
495 	pmask = q->pmask;
496 	q->pmask = 0;
497 
498 	while (pmask) {
499 		int prio = ffz(~pmask);
500 		psched_tdiff_t tmp;
501 
502 		pmask &= ~(1<<prio);
503 
504 		tmp = cbq_undelay_prio(q, prio, now);
505 		if (tmp > 0) {
506 			q->pmask |= 1<<prio;
507 			if (tmp < delay || delay == 0)
508 				delay = tmp;
509 		}
510 	}
511 
512 	if (delay) {
513 		ktime_t time;
514 
515 		time = 0;
516 		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
517 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
518 	}
519 
520 	__netif_schedule(qdisc_root(sch));
521 	return HRTIMER_NORESTART;
522 }
523 
524 /*
525  * This is a mission-critical procedure.
526  *
527  * We "regenerate" the toplevel cutoff if the transmitting class
528  * has backlog and is not regulated. This is not part of the
529  * original CBQ description, but looks more reasonable.
530  * Probably, it is wrong. This question needs further investigation.
531  */
532 
533 static inline void
534 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
535 		    struct cbq_class *borrowed)
536 {
537 	if (cl && q->toplevel >= borrowed->level) {
538 		if (cl->q->q.qlen > 1) {
539 			do {
540 				if (borrowed->undertime == PSCHED_PASTPERFECT) {
541 					q->toplevel = borrowed->level;
542 					return;
543 				}
544 			} while ((borrowed = borrowed->borrow) != NULL);
545 		}
546 #if 0
547 	/* It is not necessary now. Uncommenting it
548 	   will save CPU cycles, but decrease fairness.
549 	 */
550 		q->toplevel = TC_CBQ_MAXLEVEL;
551 #endif
552 	}
553 }
554 
555 static void
556 cbq_update(struct cbq_sched_data *q)
557 {
558 	struct cbq_class *this = q->tx_class;
559 	struct cbq_class *cl = this;
560 	int len = q->tx_len;
561 	psched_time_t now;
562 
563 	q->tx_class = NULL;
564 	/* Time integrator. We calculate EOS time
565 	 * by adding expected packet transmission time.
566 	 */
567 	now = q->now + L2T(&q->link, len);
568 
569 	for ( ; cl; cl = cl->share) {
570 		long avgidle = cl->avgidle;
571 		long idle;
572 
573 		cl->bstats.packets++;
574 		cl->bstats.bytes += len;
575 
576 		/*
577 		 * (now - last) is total time between packet right edges.
578 		 * (last_pktlen/rate) is "virtual" busy time, so that
579 		 *
580 		 *	idle = (now - last) - last_pktlen/rate
581 		 */
582 
583 		idle = now - cl->last;
584 		if ((unsigned long)idle > 128*1024*1024) {
585 			avgidle = cl->maxidle;
586 		} else {
587 			idle -= L2T(cl, len);
588 
589 		/* true_avgidle := (1-W)*true_avgidle + W*idle,
590 		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
591 		 * cl->avgidle == true_avgidle/W,
592 		 * hence:
593 		 */
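		/* Illustrative check (made-up numbers): with ewma_log = 5,
		 * i.e. W = 1/32, a scaled avgidle of 3200 and a new idle
		 * sample of 100 (== 3200 >> 5) leave avgidle unchanged,
		 * because the sample equals the current average idle time.
		 */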
594 			avgidle += idle - (avgidle>>cl->ewma_log);
595 		}
596 
597 		if (avgidle <= 0) {
598 			/* Overlimit or at-limit */
599 
600 			if (avgidle < cl->minidle)
601 				avgidle = cl->minidle;
602 
603 			cl->avgidle = avgidle;
604 
605 			/* Calculate the expected time when this class
606 			 * will be allowed to send.
607 			 * It will occur when:
608 			 * (1-W)*true_avgidle + W*delay = 0, i.e.
609 			 * idle = (1/W - 1)*(-true_avgidle)
610 			 * or
611 			 * idle = (1 - W)*(-cl->avgidle);
612 			 */
613 			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
614 
615 			/*
616 			 * That is not all.
617 			 * To maintain the rate allocated to the class,
618 			 * we add to undertime the virtual clock time
619 			 * necessary to complete the transmitted packet.
620 			 * (len/phys_bandwidth has already elapsed by
621 			 * the time cbq_update is called.)
622 			 */
623 
624 			idle -= L2T(&q->link, len);
625 			idle += L2T(cl, len);
626 
627 			cl->undertime = now + idle;
628 		} else {
629 			/* Underlimit */
630 
631 			cl->undertime = PSCHED_PASTPERFECT;
632 			if (avgidle > cl->maxidle)
633 				cl->avgidle = cl->maxidle;
634 			else
635 				cl->avgidle = avgidle;
636 		}
637 		if ((s64)(now - cl->last) > 0)
638 			cl->last = now;
639 	}
640 
641 	cbq_update_toplevel(q, this, q->tx_borrowed);
642 }
643 
644 static inline struct cbq_class *
645 cbq_under_limit(struct cbq_class *cl)
646 {
647 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
648 	struct cbq_class *this_cl = cl;
649 
650 	if (cl->tparent == NULL)
651 		return cl;
652 
653 	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
654 		cl->delayed = 0;
655 		return cl;
656 	}
657 
658 	do {
659 		/* This is a very suspicious place. Currently the overlimit
660 		 * action is generated for non-bounded classes
661 		 * only if the link is completely congested.
662 		 * Though this agrees with the ancestor-only paradigm,
663 		 * it looks very stupid. In particular,
664 		 * it means that this chunk of code will either
665 		 * never be called or result in strong amplification
666 		 * of burstiness. Dangerous, silly, and yet
667 		 * no other solution exists.
668 		 */
669 		cl = cl->borrow;
670 		if (!cl) {
671 			this_cl->qstats.overlimits++;
672 			cbq_overlimit(this_cl);
673 			return NULL;
674 		}
675 		if (cl->level > q->toplevel)
676 			return NULL;
677 	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
678 
679 	cl->delayed = 0;
680 	return cl;
681 }
682 
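/*
 * One weighted round-robin pass over the active classes of a single
 * priority band: each class may send until its per-round deficit is
 * exhausted (it is then refilled by cl->quantum), and classes that are
 * overlimit and unable to borrow are skipped.
 */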
683 static inline struct sk_buff *
684 cbq_dequeue_prio(struct Qdisc *sch, int prio)
685 {
686 	struct cbq_sched_data *q = qdisc_priv(sch);
687 	struct cbq_class *cl_tail, *cl_prev, *cl;
688 	struct sk_buff *skb;
689 	int deficit;
690 
691 	cl_tail = cl_prev = q->active[prio];
692 	cl = cl_prev->next_alive;
693 
694 	do {
695 		deficit = 0;
696 
697 		/* Start round */
698 		do {
699 			struct cbq_class *borrow = cl;
700 
701 			if (cl->q->q.qlen &&
702 			    (borrow = cbq_under_limit(cl)) == NULL)
703 				goto skip_class;
704 
705 			if (cl->deficit <= 0) {
706 				/* Class exhausted its allotment per
707 				 * this round. Switch to the next one.
708 				 */
709 				deficit = 1;
710 				cl->deficit += cl->quantum;
711 				goto next_class;
712 			}
713 
714 			skb = cl->q->dequeue(cl->q);
715 
716 			/* Class did not give us any skb :-(
717 			 * This can occur even if cl->q->q.qlen != 0,
718 			 * e.g. if cl->q == "tbf"
719 			 */
720 			if (skb == NULL)
721 				goto skip_class;
722 
723 			cl->deficit -= qdisc_pkt_len(skb);
724 			q->tx_class = cl;
725 			q->tx_borrowed = borrow;
726 			if (borrow != cl) {
727 #ifndef CBQ_XSTATS_BORROWS_BYTES
728 				borrow->xstats.borrows++;
729 				cl->xstats.borrows++;
730 #else
731 				borrow->xstats.borrows += qdisc_pkt_len(skb);
732 				cl->xstats.borrows += qdisc_pkt_len(skb);
733 #endif
734 			}
735 			q->tx_len = qdisc_pkt_len(skb);
736 
737 			if (cl->deficit <= 0) {
738 				q->active[prio] = cl;
739 				cl = cl->next_alive;
740 				cl->deficit += cl->quantum;
741 			}
742 			return skb;
743 
744 skip_class:
745 			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
746 				/* Class is empty or penalized.
747 				 * Unlink it from active chain.
748 				 */
749 				cl_prev->next_alive = cl->next_alive;
750 				cl->next_alive = NULL;
751 
752 				/* Did cl_tail point to it? */
753 				if (cl == cl_tail) {
754 					/* Repair it! */
755 					cl_tail = cl_prev;
756 
757 					/* Was it the last class in this band? */
758 					if (cl == cl_tail) {
759 						/* Kill the band! */
760 						q->active[prio] = NULL;
761 						q->activemask &= ~(1<<prio);
762 						if (cl->q->q.qlen)
763 							cbq_activate_class(cl);
764 						return NULL;
765 					}
766 
767 					q->active[prio] = cl_tail;
768 				}
769 				if (cl->q->q.qlen)
770 					cbq_activate_class(cl);
771 
772 				cl = cl_prev;
773 			}
774 
775 next_class:
776 			cl_prev = cl;
777 			cl = cl->next_alive;
778 		} while (cl_prev != cl_tail);
779 	} while (deficit);
780 
781 	q->active[prio] = cl_prev;
782 
783 	return NULL;
784 }
785 
786 static inline struct sk_buff *
787 cbq_dequeue_1(struct Qdisc *sch)
788 {
789 	struct cbq_sched_data *q = qdisc_priv(sch);
790 	struct sk_buff *skb;
791 	unsigned int activemask;
792 
793 	activemask = q->activemask & 0xFF;
794 	while (activemask) {
795 		int prio = ffz(~activemask);
796 		activemask &= ~(1<<prio);
797 		skb = cbq_dequeue_prio(sch, prio);
798 		if (skb)
799 			return skb;
800 	}
801 	return NULL;
802 }
803 
804 static struct sk_buff *
805 cbq_dequeue(struct Qdisc *sch)
806 {
807 	struct sk_buff *skb;
808 	struct cbq_sched_data *q = qdisc_priv(sch);
809 	psched_time_t now;
810 
811 	now = psched_get_time();
812 
813 	if (q->tx_class)
814 		cbq_update(q);
815 
816 	q->now = now;
817 
818 	for (;;) {
819 		q->wd_expires = 0;
820 
821 		skb = cbq_dequeue_1(sch);
822 		if (skb) {
823 			qdisc_bstats_update(sch, skb);
824 			sch->q.qlen--;
825 			return skb;
826 		}
827 
828 		/* All the classes are overlimit.
829 		 *
830 		 * This is possible if:
831 		 *
832 		 * 1. Scheduler is empty.
833 		 * 2. Toplevel cutoff inhibited borrowing.
834 		 * 3. Root class is overlimit.
835 		 *
836 		 * Reset the 2nd and 3rd conditions and retry.
837 		 *
838 		 * Note that NS and cbq-2.0 are buggy: peeking
839 		 * an arbitrary class is appropriate for ancestor-only
840 		 * sharing, but not for the top-level algorithm.
841 		 *
842 		 * Our version is better but slower, because it requires
843 		 * two passes; that is unavoidable with top-level sharing.
844 		 */
845 
846 		if (q->toplevel == TC_CBQ_MAXLEVEL &&
847 		    q->link.undertime == PSCHED_PASTPERFECT)
848 			break;
849 
850 		q->toplevel = TC_CBQ_MAXLEVEL;
851 		q->link.undertime = PSCHED_PASTPERFECT;
852 	}
853 
854 	/* No packets in scheduler or nobody wants to give them to us :-(
855 	 * Sigh... start the watchdog timer in the latter case.
856 	 */
857 
858 	if (sch->q.qlen) {
859 		qdisc_qstats_overlimit(sch);
860 		if (q->wd_expires)
861 			qdisc_watchdog_schedule(&q->watchdog,
862 						now + q->wd_expires);
863 	}
864 	return NULL;
865 }
866 
867 /* CBQ class maintenance routines */
868 
869 static void cbq_adjust_levels(struct cbq_class *this)
870 {
871 	if (this == NULL)
872 		return;
873 
874 	do {
875 		int level = 0;
876 		struct cbq_class *cl;
877 
878 		cl = this->children;
879 		if (cl) {
880 			do {
881 				if (cl->level > level)
882 					level = cl->level;
883 			} while ((cl = cl->sibling) != this->children);
884 		}
885 		this->level = level + 1;
886 	} while ((this = this->tparent) != NULL);
887 }
888 
889 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
890 {
891 	struct cbq_class *cl;
892 	unsigned int h;
893 
894 	if (q->quanta[prio] == 0)
895 		return;
896 
897 	for (h = 0; h < q->clhash.hashsize; h++) {
898 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
899 			/* BUGGGG... Beware! This expression suffers from
900 			 * arithmetic overflows!
901 			 */
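			/* Illustrative example (made-up numbers): two classes
			 * at this prio with weights 1 and 3 and allot 1500
			 * each give nclasses = 2 and quanta = 4, so the
			 * expression below yields 1*1500*2/4 = 750 and
			 * 3*1500*2/4 = 2250; the mean quantum equals allot.
			 */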
902 			if (cl->priority == prio) {
903 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
904 					q->quanta[prio];
905 			}
906 			if (cl->quantum <= 0 ||
907 			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
908 				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
909 					cl->common.classid, cl->quantum);
910 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
911 			}
912 		}
913 	}
914 }
915 
916 static void cbq_sync_defmap(struct cbq_class *cl)
917 {
918 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
919 	struct cbq_class *split = cl->split;
920 	unsigned int h;
921 	int i;
922 
923 	if (split == NULL)
924 		return;
925 
926 	for (i = 0; i <= TC_PRIO_MAX; i++) {
927 		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
928 			split->defaults[i] = NULL;
929 	}
930 
931 	for (i = 0; i <= TC_PRIO_MAX; i++) {
932 		int level = split->level;
933 
934 		if (split->defaults[i])
935 			continue;
936 
937 		for (h = 0; h < q->clhash.hashsize; h++) {
938 			struct cbq_class *c;
939 
940 			hlist_for_each_entry(c, &q->clhash.hash[h],
941 					     common.hnode) {
942 				if (c->split == split && c->level < level &&
943 				    c->defmap & (1<<i)) {
944 					split->defaults[i] = c;
945 					level = c->level;
946 				}
947 			}
948 		}
949 	}
950 }
951 
952 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
953 {
954 	struct cbq_class *split = NULL;
955 
956 	if (splitid == 0) {
957 		split = cl->split;
958 		if (!split)
959 			return;
960 		splitid = split->common.classid;
961 	}
962 
963 	if (split == NULL || split->common.classid != splitid) {
964 		for (split = cl->tparent; split; split = split->tparent)
965 			if (split->common.classid == splitid)
966 				break;
967 	}
968 
969 	if (split == NULL)
970 		return;
971 
972 	if (cl->split != split) {
973 		cl->defmap = 0;
974 		cbq_sync_defmap(cl);
975 		cl->split = split;
976 		cl->defmap = def & mask;
977 	} else
978 		cl->defmap = (cl->defmap & ~mask) | (def & mask);
979 
980 	cbq_sync_defmap(cl);
981 }
982 
983 static void cbq_unlink_class(struct cbq_class *this)
984 {
985 	struct cbq_class *cl, **clp;
986 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
987 
988 	qdisc_class_hash_remove(&q->clhash, &this->common);
989 
990 	if (this->tparent) {
991 		clp = &this->sibling;
992 		cl = *clp;
993 		do {
994 			if (cl == this) {
995 				*clp = cl->sibling;
996 				break;
997 			}
998 			clp = &cl->sibling;
999 		} while ((cl = *clp) != this->sibling);
1000 
1001 		if (this->tparent->children == this) {
1002 			this->tparent->children = this->sibling;
1003 			if (this->sibling == this)
1004 				this->tparent->children = NULL;
1005 		}
1006 	} else {
1007 		WARN_ON(this->sibling != this);
1008 	}
1009 }
1010 
1011 static void cbq_link_class(struct cbq_class *this)
1012 {
1013 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1014 	struct cbq_class *parent = this->tparent;
1015 
1016 	this->sibling = this;
1017 	qdisc_class_hash_insert(&q->clhash, &this->common);
1018 
1019 	if (parent == NULL)
1020 		return;
1021 
1022 	if (parent->children == NULL) {
1023 		parent->children = this;
1024 	} else {
1025 		this->sibling = parent->children->sibling;
1026 		parent->children->sibling = this;
1027 	}
1028 }
1029 
1030 static void
1031 cbq_reset(struct Qdisc *sch)
1032 {
1033 	struct cbq_sched_data *q = qdisc_priv(sch);
1034 	struct cbq_class *cl;
1035 	int prio;
1036 	unsigned int h;
1037 
1038 	q->activemask = 0;
1039 	q->pmask = 0;
1040 	q->tx_class = NULL;
1041 	q->tx_borrowed = NULL;
1042 	qdisc_watchdog_cancel(&q->watchdog);
1043 	hrtimer_cancel(&q->delay_timer);
1044 	q->toplevel = TC_CBQ_MAXLEVEL;
1045 	q->now = psched_get_time();
1046 
1047 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1048 		q->active[prio] = NULL;
1049 
1050 	for (h = 0; h < q->clhash.hashsize; h++) {
1051 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1052 			qdisc_reset(cl->q);
1053 
1054 			cl->next_alive = NULL;
1055 			cl->undertime = PSCHED_PASTPERFECT;
1056 			cl->avgidle = cl->maxidle;
1057 			cl->deficit = cl->quantum;
1058 			cl->cpriority = cl->priority;
1059 		}
1060 	}
1061 	sch->q.qlen = 0;
1062 }
1063 
1064 
1065 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1066 {
1067 	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1068 		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1069 		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1070 	}
1071 	if (lss->change & TCF_CBQ_LSS_EWMA)
1072 		cl->ewma_log = lss->ewma_log;
1073 	if (lss->change & TCF_CBQ_LSS_AVPKT)
1074 		cl->avpkt = lss->avpkt;
1075 	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1076 		cl->minidle = -(long)lss->minidle;
1077 	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1078 		cl->maxidle = lss->maxidle;
1079 		cl->avgidle = lss->maxidle;
1080 	}
1081 	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1082 		cl->offtime = lss->offtime;
1083 	return 0;
1084 }
1085 
1086 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1087 {
1088 	q->nclasses[cl->priority]--;
1089 	q->quanta[cl->priority] -= cl->weight;
1090 	cbq_normalize_quanta(q, cl->priority);
1091 }
1092 
1093 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1094 {
1095 	q->nclasses[cl->priority]++;
1096 	q->quanta[cl->priority] += cl->weight;
1097 	cbq_normalize_quanta(q, cl->priority);
1098 }
1099 
1100 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1101 {
1102 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1103 
1104 	if (wrr->allot)
1105 		cl->allot = wrr->allot;
1106 	if (wrr->weight)
1107 		cl->weight = wrr->weight;
1108 	if (wrr->priority) {
1109 		cl->priority = wrr->priority - 1;
1110 		cl->cpriority = cl->priority;
1111 		if (cl->priority >= cl->priority2)
1112 			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1113 	}
1114 
1115 	cbq_addprio(q, cl);
1116 	return 0;
1117 }
1118 
1119 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1120 {
1121 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1122 	return 0;
1123 }
1124 
1125 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1126 	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1127 	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1128 	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1129 	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1130 	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1131 	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1132 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1133 };
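
/* Illustrative example (not part of this file) of how these attributes
 * are typically produced from user space with the tc utility:
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit \
 *      avpkt 1000 cell 8
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *      rate 6Mbit weight 0.6Mbit prio 8 allot 1514 cell 8 maxburst 20 \
 *      avpkt 1000 bounded
 *
 * tc translates such commands into TCA_CBQ_RATE/TCA_CBQ_RTAB,
 * TCA_CBQ_WRROPT and TCA_CBQ_LSSOPT attributes, which are parsed by
 * cbq_init() and cbq_change_class() below.
 */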
1134 
1135 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1136 		    struct netlink_ext_ack *extack)
1137 {
1138 	struct cbq_sched_data *q = qdisc_priv(sch);
1139 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1140 	struct tc_ratespec *r;
1141 	int err;
1142 
1143 	qdisc_watchdog_init(&q->watchdog, sch);
1144 	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1145 	q->delay_timer.function = cbq_undelay;
1146 
1147 	if (!opt) {
1148 		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1149 		return -EINVAL;
1150 	}
1151 
1152 	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
1153 	if (err < 0)
1154 		return err;
1155 
1156 	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1157 		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1158 		return -EINVAL;
1159 	}
1160 
1161 	r = nla_data(tb[TCA_CBQ_RATE]);
1162 
1163 	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1164 	if (!q->link.R_tab)
1165 		return -EINVAL;
1166 
1167 	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1168 	if (err)
1169 		goto put_rtab;
1170 
1171 	err = qdisc_class_hash_init(&q->clhash);
1172 	if (err < 0)
1173 		goto put_block;
1174 
1175 	q->link.sibling = &q->link;
1176 	q->link.common.classid = sch->handle;
1177 	q->link.qdisc = sch;
1178 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1179 				      sch->handle, NULL);
1180 	if (!q->link.q)
1181 		q->link.q = &noop_qdisc;
1182 	else
1183 		qdisc_hash_add(q->link.q, true);
1184 
1185 	q->link.priority = TC_CBQ_MAXPRIO - 1;
1186 	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1187 	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1188 	q->link.allot = psched_mtu(qdisc_dev(sch));
1189 	q->link.quantum = q->link.allot;
1190 	q->link.weight = q->link.R_tab->rate.rate;
1191 
1192 	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1193 	q->link.avpkt = q->link.allot/2;
1194 	q->link.minidle = -0x7FFFFFFF;
1195 
1196 	q->toplevel = TC_CBQ_MAXLEVEL;
1197 	q->now = psched_get_time();
1198 
1199 	cbq_link_class(&q->link);
1200 
1201 	if (tb[TCA_CBQ_LSSOPT])
1202 		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1203 
1204 	cbq_addprio(q, &q->link);
1205 	return 0;
1206 
1207 put_block:
1208 	tcf_block_put(q->link.block);
1209 
1210 put_rtab:
1211 	qdisc_put_rtab(q->link.R_tab);
1212 	return err;
1213 }
1214 
1215 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1216 {
1217 	unsigned char *b = skb_tail_pointer(skb);
1218 
1219 	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1220 		goto nla_put_failure;
1221 	return skb->len;
1222 
1223 nla_put_failure:
1224 	nlmsg_trim(skb, b);
1225 	return -1;
1226 }
1227 
1228 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1229 {
1230 	unsigned char *b = skb_tail_pointer(skb);
1231 	struct tc_cbq_lssopt opt;
1232 
1233 	opt.flags = 0;
1234 	if (cl->borrow == NULL)
1235 		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1236 	if (cl->share == NULL)
1237 		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1238 	opt.ewma_log = cl->ewma_log;
1239 	opt.level = cl->level;
1240 	opt.avpkt = cl->avpkt;
1241 	opt.maxidle = cl->maxidle;
1242 	opt.minidle = (u32)(-cl->minidle);
1243 	opt.offtime = cl->offtime;
1244 	opt.change = ~0;
1245 	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1246 		goto nla_put_failure;
1247 	return skb->len;
1248 
1249 nla_put_failure:
1250 	nlmsg_trim(skb, b);
1251 	return -1;
1252 }
1253 
1254 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1255 {
1256 	unsigned char *b = skb_tail_pointer(skb);
1257 	struct tc_cbq_wrropt opt;
1258 
1259 	memset(&opt, 0, sizeof(opt));
1260 	opt.flags = 0;
1261 	opt.allot = cl->allot;
1262 	opt.priority = cl->priority + 1;
1263 	opt.cpriority = cl->cpriority + 1;
1264 	opt.weight = cl->weight;
1265 	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1266 		goto nla_put_failure;
1267 	return skb->len;
1268 
1269 nla_put_failure:
1270 	nlmsg_trim(skb, b);
1271 	return -1;
1272 }
1273 
1274 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1275 {
1276 	unsigned char *b = skb_tail_pointer(skb);
1277 	struct tc_cbq_fopt opt;
1278 
1279 	if (cl->split || cl->defmap) {
1280 		opt.split = cl->split ? cl->split->common.classid : 0;
1281 		opt.defmap = cl->defmap;
1282 		opt.defchange = ~0;
1283 		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1284 			goto nla_put_failure;
1285 	}
1286 	return skb->len;
1287 
1288 nla_put_failure:
1289 	nlmsg_trim(skb, b);
1290 	return -1;
1291 }
1292 
1293 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1294 {
1295 	if (cbq_dump_lss(skb, cl) < 0 ||
1296 	    cbq_dump_rate(skb, cl) < 0 ||
1297 	    cbq_dump_wrr(skb, cl) < 0 ||
1298 	    cbq_dump_fopt(skb, cl) < 0)
1299 		return -1;
1300 	return 0;
1301 }
1302 
1303 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1304 {
1305 	struct cbq_sched_data *q = qdisc_priv(sch);
1306 	struct nlattr *nest;
1307 
1308 	nest = nla_nest_start(skb, TCA_OPTIONS);
1309 	if (nest == NULL)
1310 		goto nla_put_failure;
1311 	if (cbq_dump_attr(skb, &q->link) < 0)
1312 		goto nla_put_failure;
1313 	return nla_nest_end(skb, nest);
1314 
1315 nla_put_failure:
1316 	nla_nest_cancel(skb, nest);
1317 	return -1;
1318 }
1319 
1320 static int
1321 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1322 {
1323 	struct cbq_sched_data *q = qdisc_priv(sch);
1324 
1325 	q->link.xstats.avgidle = q->link.avgidle;
1326 	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1327 }
1328 
1329 static int
1330 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1331 	       struct sk_buff *skb, struct tcmsg *tcm)
1332 {
1333 	struct cbq_class *cl = (struct cbq_class *)arg;
1334 	struct nlattr *nest;
1335 
1336 	if (cl->tparent)
1337 		tcm->tcm_parent = cl->tparent->common.classid;
1338 	else
1339 		tcm->tcm_parent = TC_H_ROOT;
1340 	tcm->tcm_handle = cl->common.classid;
1341 	tcm->tcm_info = cl->q->handle;
1342 
1343 	nest = nla_nest_start(skb, TCA_OPTIONS);
1344 	if (nest == NULL)
1345 		goto nla_put_failure;
1346 	if (cbq_dump_attr(skb, cl) < 0)
1347 		goto nla_put_failure;
1348 	return nla_nest_end(skb, nest);
1349 
1350 nla_put_failure:
1351 	nla_nest_cancel(skb, nest);
1352 	return -1;
1353 }
1354 
1355 static int
1356 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1357 	struct gnet_dump *d)
1358 {
1359 	struct cbq_sched_data *q = qdisc_priv(sch);
1360 	struct cbq_class *cl = (struct cbq_class *)arg;
1361 
1362 	cl->xstats.avgidle = cl->avgidle;
1363 	cl->xstats.undertime = 0;
1364 
1365 	if (cl->undertime != PSCHED_PASTPERFECT)
1366 		cl->xstats.undertime = cl->undertime - q->now;
1367 
1368 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1369 				  d, NULL, &cl->bstats) < 0 ||
1370 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1371 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1372 		return -1;
1373 
1374 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1375 }
1376 
1377 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1378 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1379 {
1380 	struct cbq_class *cl = (struct cbq_class *)arg;
1381 
1382 	if (new == NULL) {
1383 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1384 					cl->common.classid, extack);
1385 		if (new == NULL)
1386 			return -ENOBUFS;
1387 	}
1388 
1389 	*old = qdisc_replace(sch, new, &cl->q);
1390 	return 0;
1391 }
1392 
1393 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1394 {
1395 	struct cbq_class *cl = (struct cbq_class *)arg;
1396 
1397 	return cl->q;
1398 }
1399 
1400 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1401 {
1402 	struct cbq_class *cl = (struct cbq_class *)arg;
1403 
1404 	cbq_deactivate_class(cl);
1405 }
1406 
1407 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1408 {
1409 	struct cbq_sched_data *q = qdisc_priv(sch);
1410 
1411 	return (unsigned long)cbq_class_lookup(q, classid);
1412 }
1413 
1414 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1415 {
1416 	struct cbq_sched_data *q = qdisc_priv(sch);
1417 
1418 	WARN_ON(cl->filters);
1419 
1420 	tcf_block_put(cl->block);
1421 	qdisc_destroy(cl->q);
1422 	qdisc_put_rtab(cl->R_tab);
1423 	gen_kill_estimator(&cl->rate_est);
1424 	if (cl != &q->link)
1425 		kfree(cl);
1426 }
1427 
1428 static void cbq_destroy(struct Qdisc *sch)
1429 {
1430 	struct cbq_sched_data *q = qdisc_priv(sch);
1431 	struct hlist_node *next;
1432 	struct cbq_class *cl;
1433 	unsigned int h;
1434 
1435 #ifdef CONFIG_NET_CLS_ACT
1436 	q->rx_class = NULL;
1437 #endif
1438 	/*
1439 	 * Filters must be destroyed first because we don't destroy the
1440 	 * classes from root to leaves, which means that filters can still
1441 	 * be bound to classes which have been destroyed already. --TGR '04
1442 	 */
1443 	for (h = 0; h < q->clhash.hashsize; h++) {
1444 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1445 			tcf_block_put(cl->block);
1446 			cl->block = NULL;
1447 		}
1448 	}
1449 	for (h = 0; h < q->clhash.hashsize; h++) {
1450 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1451 					  common.hnode)
1452 			cbq_destroy_class(sch, cl);
1453 	}
1454 	qdisc_class_hash_destroy(&q->clhash);
1455 }
1456 
1457 static int
1458 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1459 		 unsigned long *arg, struct netlink_ext_ack *extack)
1460 {
1461 	int err;
1462 	struct cbq_sched_data *q = qdisc_priv(sch);
1463 	struct cbq_class *cl = (struct cbq_class *)*arg;
1464 	struct nlattr *opt = tca[TCA_OPTIONS];
1465 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1466 	struct cbq_class *parent;
1467 	struct qdisc_rate_table *rtab = NULL;
1468 
1469 	if (!opt) {
1470 		NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
1471 		return -EINVAL;
1472 	}
1473 
1474 	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
1475 	if (err < 0)
1476 		return err;
1477 
1478 	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1479 		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1480 		return -EOPNOTSUPP;
1481 	}
1482 
1483 	if (cl) {
1484 		/* Check parent */
1485 		if (parentid) {
1486 			if (cl->tparent &&
1487 			    cl->tparent->common.classid != parentid) {
1488 				NL_SET_ERR_MSG(extack, "Invalid parent id");
1489 				return -EINVAL;
1490 			}
1491 			if (!cl->tparent && parentid != TC_H_ROOT) {
1492 				NL_SET_ERR_MSG(extack, "Parent must be root");
1493 				return -EINVAL;
1494 			}
1495 		}
1496 
1497 		if (tb[TCA_CBQ_RATE]) {
1498 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1499 					      tb[TCA_CBQ_RTAB], extack);
1500 			if (rtab == NULL)
1501 				return -EINVAL;
1502 		}
1503 
1504 		if (tca[TCA_RATE]) {
1505 			err = gen_replace_estimator(&cl->bstats, NULL,
1506 						    &cl->rate_est,
1507 						    NULL,
1508 						    qdisc_root_sleeping_running(sch),
1509 						    tca[TCA_RATE]);
1510 			if (err) {
1511 				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1512 				qdisc_put_rtab(rtab);
1513 				return err;
1514 			}
1515 		}
1516 
1517 		/* Change class parameters */
1518 		sch_tree_lock(sch);
1519 
1520 		if (cl->next_alive != NULL)
1521 			cbq_deactivate_class(cl);
1522 
1523 		if (rtab) {
1524 			qdisc_put_rtab(cl->R_tab);
1525 			cl->R_tab = rtab;
1526 		}
1527 
1528 		if (tb[TCA_CBQ_LSSOPT])
1529 			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1530 
1531 		if (tb[TCA_CBQ_WRROPT]) {
1532 			cbq_rmprio(q, cl);
1533 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1534 		}
1535 
1536 		if (tb[TCA_CBQ_FOPT])
1537 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1538 
1539 		if (cl->q->q.qlen)
1540 			cbq_activate_class(cl);
1541 
1542 		sch_tree_unlock(sch);
1543 
1544 		return 0;
1545 	}
1546 
1547 	if (parentid == TC_H_ROOT)
1548 		return -EINVAL;
1549 
1550 	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1551 		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1552 		return -EINVAL;
1553 	}
1554 
1555 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1556 			      extack);
1557 	if (rtab == NULL)
1558 		return -EINVAL;
1559 
1560 	if (classid) {
1561 		err = -EINVAL;
1562 		if (TC_H_MAJ(classid ^ sch->handle) ||
1563 		    cbq_class_lookup(q, classid)) {
1564 			NL_SET_ERR_MSG(extack, "Specified class not found");
1565 			goto failure;
1566 		}
1567 	} else {
1568 		int i;
1569 		classid = TC_H_MAKE(sch->handle, 0x8000);
1570 
1571 		for (i = 0; i < 0x8000; i++) {
1572 			if (++q->hgenerator >= 0x8000)
1573 				q->hgenerator = 1;
1574 			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1575 				break;
1576 		}
1577 		err = -ENOSR;
1578 		if (i >= 0x8000) {
1579 			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1580 			goto failure;
1581 		}
1582 		classid = classid|q->hgenerator;
1583 	}
1584 
1585 	parent = &q->link;
1586 	if (parentid) {
1587 		parent = cbq_class_lookup(q, parentid);
1588 		err = -EINVAL;
1589 		if (!parent) {
1590 			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1591 			goto failure;
1592 		}
1593 	}
1594 
1595 	err = -ENOBUFS;
1596 	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1597 	if (cl == NULL)
1598 		goto failure;
1599 
1600 	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1601 	if (err) {
1602 		kfree(cl);
1603 		return err;
1604 	}
1605 
1606 	if (tca[TCA_RATE]) {
1607 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1608 					NULL,
1609 					qdisc_root_sleeping_running(sch),
1610 					tca[TCA_RATE]);
1611 		if (err) {
1612 			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1613 			tcf_block_put(cl->block);
1614 			kfree(cl);
1615 			goto failure;
1616 		}
1617 	}
1618 
1619 	cl->R_tab = rtab;
1620 	rtab = NULL;
1621 	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1622 				  NULL);
1623 	if (!cl->q)
1624 		cl->q = &noop_qdisc;
1625 	else
1626 		qdisc_hash_add(cl->q, true);
1627 
1628 	cl->common.classid = classid;
1629 	cl->tparent = parent;
1630 	cl->qdisc = sch;
1631 	cl->allot = parent->allot;
1632 	cl->quantum = cl->allot;
1633 	cl->weight = cl->R_tab->rate.rate;
1634 
1635 	sch_tree_lock(sch);
1636 	cbq_link_class(cl);
1637 	cl->borrow = cl->tparent;
1638 	if (cl->tparent != &q->link)
1639 		cl->share = cl->tparent;
1640 	cbq_adjust_levels(parent);
1641 	cl->minidle = -0x7FFFFFFF;
1642 	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1643 	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1644 	if (cl->ewma_log == 0)
1645 		cl->ewma_log = q->link.ewma_log;
1646 	if (cl->maxidle == 0)
1647 		cl->maxidle = q->link.maxidle;
1648 	if (cl->avpkt == 0)
1649 		cl->avpkt = q->link.avpkt;
1650 	if (tb[TCA_CBQ_FOPT])
1651 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1652 	sch_tree_unlock(sch);
1653 
1654 	qdisc_class_hash_grow(sch, &q->clhash);
1655 
1656 	*arg = (unsigned long)cl;
1657 	return 0;
1658 
1659 failure:
1660 	qdisc_put_rtab(rtab);
1661 	return err;
1662 }
1663 
1664 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1665 {
1666 	struct cbq_sched_data *q = qdisc_priv(sch);
1667 	struct cbq_class *cl = (struct cbq_class *)arg;
1668 	unsigned int qlen, backlog;
1669 
1670 	if (cl->filters || cl->children || cl == &q->link)
1671 		return -EBUSY;
1672 
1673 	sch_tree_lock(sch);
1674 
1675 	qlen = cl->q->q.qlen;
1676 	backlog = cl->q->qstats.backlog;
1677 	qdisc_reset(cl->q);
1678 	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1679 
1680 	if (cl->next_alive)
1681 		cbq_deactivate_class(cl);
1682 
1683 	if (q->tx_borrowed == cl)
1684 		q->tx_borrowed = q->tx_class;
1685 	if (q->tx_class == cl) {
1686 		q->tx_class = NULL;
1687 		q->tx_borrowed = NULL;
1688 	}
1689 #ifdef CONFIG_NET_CLS_ACT
1690 	if (q->rx_class == cl)
1691 		q->rx_class = NULL;
1692 #endif
1693 
1694 	cbq_unlink_class(cl);
1695 	cbq_adjust_levels(cl->tparent);
1696 	cl->defmap = 0;
1697 	cbq_sync_defmap(cl);
1698 
1699 	cbq_rmprio(q, cl);
1700 	sch_tree_unlock(sch);
1701 
1702 	cbq_destroy_class(sch, cl);
1703 	return 0;
1704 }
1705 
1706 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1707 				       struct netlink_ext_ack *extack)
1708 {
1709 	struct cbq_sched_data *q = qdisc_priv(sch);
1710 	struct cbq_class *cl = (struct cbq_class *)arg;
1711 
1712 	if (cl == NULL)
1713 		cl = &q->link;
1714 
1715 	return cl->block;
1716 }
1717 
1718 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1719 				     u32 classid)
1720 {
1721 	struct cbq_sched_data *q = qdisc_priv(sch);
1722 	struct cbq_class *p = (struct cbq_class *)parent;
1723 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1724 
1725 	if (cl) {
1726 		if (p && p->level <= cl->level)
1727 			return 0;
1728 		cl->filters++;
1729 		return (unsigned long)cl;
1730 	}
1731 	return 0;
1732 }
1733 
1734 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1735 {
1736 	struct cbq_class *cl = (struct cbq_class *)arg;
1737 
1738 	cl->filters--;
1739 }
1740 
1741 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1742 {
1743 	struct cbq_sched_data *q = qdisc_priv(sch);
1744 	struct cbq_class *cl;
1745 	unsigned int h;
1746 
1747 	if (arg->stop)
1748 		return;
1749 
1750 	for (h = 0; h < q->clhash.hashsize; h++) {
1751 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1752 			if (arg->count < arg->skip) {
1753 				arg->count++;
1754 				continue;
1755 			}
1756 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1757 				arg->stop = 1;
1758 				return;
1759 			}
1760 			arg->count++;
1761 		}
1762 	}
1763 }
1764 
1765 static const struct Qdisc_class_ops cbq_class_ops = {
1766 	.graft		=	cbq_graft,
1767 	.leaf		=	cbq_leaf,
1768 	.qlen_notify	=	cbq_qlen_notify,
1769 	.find		=	cbq_find,
1770 	.change		=	cbq_change_class,
1771 	.delete		=	cbq_delete,
1772 	.walk		=	cbq_walk,
1773 	.tcf_block	=	cbq_tcf_block,
1774 	.bind_tcf	=	cbq_bind_filter,
1775 	.unbind_tcf	=	cbq_unbind_filter,
1776 	.dump		=	cbq_dump_class,
1777 	.dump_stats	=	cbq_dump_class_stats,
1778 };
1779 
1780 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1781 	.next		=	NULL,
1782 	.cl_ops		=	&cbq_class_ops,
1783 	.id		=	"cbq",
1784 	.priv_size	=	sizeof(struct cbq_sched_data),
1785 	.enqueue	=	cbq_enqueue,
1786 	.dequeue	=	cbq_dequeue,
1787 	.peek		=	qdisc_peek_dequeued,
1788 	.init		=	cbq_init,
1789 	.reset		=	cbq_reset,
1790 	.destroy	=	cbq_destroy,
1791 	.change		=	NULL,
1792 	.dump		=	cbq_dump,
1793 	.dump_stats	=	cbq_dump_stats,
1794 	.owner		=	THIS_MODULE,
1795 };
1796 
1797 static int __init cbq_module_init(void)
1798 {
1799 	return register_qdisc(&cbq_qdisc_ops);
1800 }
1801 static void __exit cbq_module_exit(void)
1802 {
1803 	unregister_qdisc(&cbq_qdisc_ops);
1804 }
1805 module_init(cbq_module_init)
1806 module_exit(cbq_module_exit)
1807 MODULE_LICENSE("GPL");
1808