/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing disciplines manager frontend.
   2. The traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something), in an order and at times determined by
   the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue, but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
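
/* A minimal sketch (illustration only, not part of this file) of the ops
 * vector described above; "example_enqueue" and friends are hypothetical
 * names, modelled on the simple FIFO qdiscs.  A real qdisc lives in its
 * own module and is registered with register_qdisc() below.
 *
 *	static int example_init(struct Qdisc *sch, struct nlattr *opt,
 *				struct netlink_ext_ack *extack)
 *	{
 *		sch->limit = qdisc_dev(sch)->tx_queue_len;
 *		return 0;
 *	}
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				   struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < sch->limit))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_drop(skb, sch, to_free);	// NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);	// NULL == nothing to send now
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.init		= example_init,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 */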

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
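
/* Note: besides the boot-time initcall below, this is also the path a
 * write to the net.core.default_qdisc sysctl ends up taking (via the
 * proc handler in net/core/sysctl_net_core.c), so the module-loading
 * fallback above matters for runtime reconfiguration as well.
 */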

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* Older versions of iproute2 did not transfer the linklayer setting,
 * and the rate table lookup system has been dropped from the kernel.
 * To stay backward compatible with older iproute2 tc utilities, we
 * detect the linklayer setting by checking whether the rate table was
 * modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu up to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
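
/* Worked example (illustrative numbers only): with mpu = 0 and
 * cell_log = 3 (8-byte cells), low = roundup(0, 48) = 0 and
 * high = roundup(1, 48) = 48, so cell_low = 0 and
 * cell_high = (48 >> 3) - 1 = 5.  On an ATM-aligned table every size
 * within the first 48-byte cell maps to the same per-cell cost, hence
 * rtab[0] == rtab[5]; a plain Ethernet table grows roughly linearly
 * with size, so those entries differ and we return
 * TC_LINKLAYER_ETHERNET.
 */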
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
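
/* Worked example (illustrative numbers only): with overhead = 0,
 * cell_align = 0, cell_log = 6, size_log = 0, tsize = 8 and
 * stab->data[] = {1, 2, ..., 8} (one 64-byte "cell" per entry), a
 * 1000-byte skb maps to slot = 1000 >> 6 = 15, which is past the end
 * of the table, so pkt_len = data[7] * (15 / 8) + data[15 % 8]
 *                          = 8 * 1 + 8 = 16 cells.
 */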

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel.
 * The possible range is [8000-FFFF]:0000 (0x8000 values).
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}
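
/* Example progression (illustrative): successive calls hand out
 * 8001:0000, 8002:0000, ...; on reaching FFFF:0000 the counter wraps
 * around to 8000:0000.  Any handle already in use on the device is
 * skipped, and after 0x8000 probes without success we return 0.
 */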

void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify the parent qdisc only if the child qdisc becomes
		 * empty.
		 *
		 * If the child was empty even before the update, then the
		 * backlog counter is screwed and we skip the notification
		 * because the parent class is already passive.
		 *
		 * If the original child was offloaded, then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
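
/* Typical caller pattern (sketch): a classful qdisc that purges packets
 * from a child must tell its ancestors, e.g. when replacing or resetting
 * a backlogged child:
 *
 *	unsigned int qlen = child->q.qlen;
 *	unsigned int backlog = child->qstats.backlog;
 *
 *	qdisc_reset(child);
 *	qdisc_tree_reduce_backlog(child, qlen, backlog);
 *
 * so that every parent's qlen/backlog counters stay consistent.
 */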

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old,
						  extack);
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_OPTIONS]		= { .type = NLA_NESTED },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted
				 *   to say that the qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are
				 *   set.  Alas, it is a sort of hole in the
				 *   API; we cannot decide what to do
				 *   unambiguously.  For now we select
				 *   create/graft if the user gave a KIND
				 *   which does not match the existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the
	 * global qdisc hashtable, we don't want to hit it again.
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, NULL);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	u32 classid;
	unsigned long cl;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl);
		sch_tree_unlock(q);
	}
	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif
1861 
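/* Control handler for RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS.
 * Resolves the qdisc and class addressed by the request, then either
 * returns an error, deletes, dumps, or creates/changes the class via
 * the qdisc's class ops.
 */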
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */
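	/* Worked example (illustrative only): class handles are 32-bit
	 * major:minor values composed with the TC_H_* macros from
	 * <linux/pkt_sched.h>:
	 *
	 *   TC_H_MAKE(0x00010000, 0x20) == 0x00010020  ("1:20" in tc,
	 *                                               both parts hex)
	 *   TC_H_MAJ(0x00010020)        == 0x00010000  (owning qdisc, 1:)
	 *   TC_H_MIN(0x00010020)        == 0x00000020  (the minor, :20)
	 *
	 * So "tc class add dev eth0 parent 1: classid 1:20 ..." arrives
	 * here with tcm_parent == 1:0 and tcm_handle == 1:20, and the
	 * code below reconciles the two majors into the qdisc handle X:0.
	 */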

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the deleted class from its filters by
			 * rebinding them to class 0.
			 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

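/* Class dump machinery.  RTM_GETTCLASS dumps are resumable: netlink
 * may deliver the result across several skbs, so the cursor lives in
 * cb->args[] between calls -- args[0] remembers how many qdiscs have
 * been handled, args[1] how many classes within the current qdisc.
 * qdisc_class_dump() is the walker callback that serializes one
 * class per invocation.
 */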
struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

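/* Dump the classes of the root qdisc and of every qdisc hashed on
 * the device, or of just one child qdisc when tcm_parent narrows
 * the request.
 */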
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

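/* Entry point for RTM_GETTCLASS dumps: walk both the egress hierarchy
 * rooted at dev->qdisc and, if present, the ingress qdisc.
 */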
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
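/* /proc/net/psched exposes four hex words that userspace tc uses to
 * convert between time units and scheduler ticks: ns per usec, ns per
 * psched tick, the historical constant 1000000, and NSEC_PER_SEC
 * divided by the hrtimer resolution.  On a machine with
 * high-resolution timers the file might read (illustrative values,
 * not taken from the source):
 *
 *   000003e8 00000040 000f4240 3b9aca00
 */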
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

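/* Subsystem init: register the per-netns /proc entry, the built-in
 * qdiscs, and the rtnetlink handlers that route RTM_*QDISC and
 * RTM_*TCLASS requests to the functions above.
 */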
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);