1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 			 struct fib6_info *rt, struct dst_entry *dst,
109 			 struct in6_addr *dest, struct in6_addr *src,
110 			 int iif, int type, u32 portid, u32 seq,
111 			 unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 					   struct in6_addr *daddr,
114 					   struct in6_addr *saddr);
115 
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 					   const struct in6_addr *prefix, int prefixlen,
119 					   const struct in6_addr *gwaddr,
120 					   struct net_device *dev,
121 					   unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 					   const struct in6_addr *prefix, int prefixlen,
124 					   const struct in6_addr *gwaddr,
125 					   struct net_device *dev);
126 #endif
127 
128 struct uncached_list {
129 	spinlock_t		lock;
130 	struct list_head	head;
131 };
132 
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134 
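/* rt6_uncached_list is a per-cpu list of rt6_info entries that are not
 * owned by the fib6 tree.  Tracking them here lets
 * rt6_uncached_list_flush_dev() retarget them at the loopback device
 * when their original device is unregistered.
 */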
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138 
139 	rt->rt6i_uncached_list = ul;
140 
141 	spin_lock_bh(&ul->lock);
142 	list_add_tail(&rt->rt6i_uncached, &ul->head);
143 	spin_unlock_bh(&ul->lock);
144 }
145 
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148 	if (!list_empty(&rt->rt6i_uncached)) {
149 		struct uncached_list *ul = rt->rt6i_uncached_list;
150 		struct net *net = dev_net(rt->dst.dev);
151 
152 		spin_lock_bh(&ul->lock);
153 		list_del(&rt->rt6i_uncached);
154 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 		spin_unlock_bh(&ul->lock);
156 	}
157 }
158 
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161 	struct net_device *loopback_dev = net->loopback_dev;
162 	int cpu;
163 
164 	if (dev == loopback_dev)
165 		return;
166 
167 	for_each_possible_cpu(cpu) {
168 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169 		struct rt6_info *rt;
170 
171 		spin_lock_bh(&ul->lock);
172 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 			struct inet6_dev *rt_idev = rt->rt6i_idev;
174 			struct net_device *rt_dev = rt->dst.dev;
175 
176 			if (rt_idev->dev == dev) {
177 				rt->rt6i_idev = in6_dev_get(loopback_dev);
178 				in6_dev_put(rt_idev);
179 			}
180 
181 			if (rt_dev == dev) {
182 				rt->dst.dev = loopback_dev;
183 				dev_hold(rt->dst.dev);
184 				dev_put(rt_dev);
185 			}
186 		}
187 		spin_unlock_bh(&ul->lock);
188 	}
189 }
190 
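/* Pick the address used for the neighbour lookup: the configured
 * gateway when one is set, else the destination from the skb header,
 * else the caller-supplied daddr.
 */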
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	if (!ipv6_addr_any(p))
196 		return (const void *) p;
197 	else if (skb)
198 		return &ipv6_hdr(skb)->daddr;
199 	return daddr;
200 }
201 
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 				   struct net_device *dev,
204 				   struct sk_buff *skb,
205 				   const void *daddr)
206 {
207 	struct neighbour *n;
208 
209 	daddr = choose_neigh_daddr(gw, skb, daddr);
210 	n = __ipv6_neigh_lookup(dev, daddr);
211 	if (n)
212 		return n;
213 	return neigh_create(&nd_tbl, daddr, dev);
214 }
215 
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217 					      struct sk_buff *skb,
218 					      const void *daddr)
219 {
220 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221 
222 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224 
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227 	struct net_device *dev = dst->dev;
228 	struct rt6_info *rt = (struct rt6_info *)dst;
229 
230 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231 	if (!daddr)
232 		return;
233 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234 		return;
235 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236 		return;
237 	__ipv6_confirm_neigh(dev, daddr);
238 }
239 
240 static struct dst_ops ip6_dst_ops_template = {
241 	.family			=	AF_INET6,
242 	.gc			=	ip6_dst_gc,
243 	.gc_thresh		=	1024,
244 	.check			=	ip6_dst_check,
245 	.default_advmss		=	ip6_default_advmss,
246 	.mtu			=	ip6_mtu,
247 	.cow_metrics		=	dst_cow_metrics_generic,
248 	.destroy		=	ip6_dst_destroy,
249 	.ifdown			=	ip6_dst_ifdown,
250 	.negative_advice	=	ip6_negative_advice,
251 	.link_failure		=	ip6_link_failure,
252 	.update_pmtu		=	ip6_rt_update_pmtu,
253 	.redirect		=	rt6_do_redirect,
254 	.local_out		=	__ip6_local_out,
255 	.neigh_lookup		=	ip6_dst_neigh_lookup,
256 	.confirm_neigh		=	ip6_confirm_neigh,
257 };
258 
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262 
263 	return mtu ? : dst->dev->mtu;
264 }
265 
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267 					 struct sk_buff *skb, u32 mtu)
268 {
269 }
270 
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
272 				      struct sk_buff *skb)
273 {
274 }
275 
276 static struct dst_ops ip6_dst_blackhole_ops = {
277 	.family			=	AF_INET6,
278 	.destroy		=	ip6_dst_destroy,
279 	.check			=	ip6_dst_check,
280 	.mtu			=	ip6_blackhole_mtu,
281 	.default_advmss		=	ip6_default_advmss,
282 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
283 	.redirect		=	ip6_rt_blackhole_redirect,
284 	.cow_metrics		=	dst_cow_metrics_generic,
285 	.neigh_lookup		=	ip6_dst_neigh_lookup,
286 };
287 
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289 	[RTAX_HOPLIMIT - 1] = 0,
290 };
291 
292 static const struct fib6_info fib6_null_entry_template = {
293 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
294 	.fib6_protocol  = RTPROT_KERNEL,
295 	.fib6_metric	= ~(u32)0,
296 	.fib6_ref	= ATOMIC_INIT(1),
297 	.fib6_type	= RTN_UNREACHABLE,
298 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
299 };
300 
301 static const struct rt6_info ip6_null_entry_template = {
302 	.dst = {
303 		.__refcnt	= ATOMIC_INIT(1),
304 		.__use		= 1,
305 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
306 		.error		= -ENETUNREACH,
307 		.input		= ip6_pkt_discard,
308 		.output		= ip6_pkt_discard_out,
309 	},
310 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 };
326 
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328 	.dst = {
329 		.__refcnt	= ATOMIC_INIT(1),
330 		.__use		= 1,
331 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
332 		.error		= -EINVAL,
333 		.input		= dst_discard,
334 		.output		= dst_discard_out,
335 	},
336 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
337 };
338 
339 #endif
340 
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343 	struct dst_entry *dst = &rt->dst;
344 
345 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346 	INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348 
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351 			       int flags)
352 {
353 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354 					1, DST_OBSOLETE_FORCE_CHK, flags);
355 
356 	if (rt) {
357 		rt6_info_init(rt);
358 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359 	}
360 
361 	return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364 
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
368 	struct rt6_info *rt = (struct rt6_info *)dst;
369 	struct fib6_info *from;
370 	struct inet6_dev *idev;
371 
372 	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
373 		kfree(p);
374 
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	rcu_read_lock();
384 	from = rcu_dereference(rt->from);
385 	rcu_assign_pointer(rt->from, NULL);
386 	fib6_info_release(from);
387 	rcu_read_unlock();
388 }
389 
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
391 			   int how)
392 {
393 	struct rt6_info *rt = (struct rt6_info *)dst;
394 	struct inet6_dev *idev = rt->rt6i_idev;
395 	struct net_device *loopback_dev =
396 		dev_net(dev)->loopback_dev;
397 
398 	if (idev && idev->dev != loopback_dev) {
399 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400 		if (loopback_idev) {
401 			rt->rt6i_idev = loopback_idev;
402 			in6_dev_put(idev);
403 		}
404 	}
405 }
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417 	struct fib6_info *from;
418 
419 	from = rcu_dereference(rt->from);
420 
421 	if (rt->rt6i_flags & RTF_EXPIRES) {
422 		if (time_after(jiffies, rt->dst.expires))
423 			return true;
424 	} else if (from) {
425 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426 			fib6_check_expired(from);
427 	}
428 	return false;
429 }
430 
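/* Hash-threshold multipath selection (in the style of RFC 2992): each
 * sibling route owns a slice of the hash space bounded by
 * nh_upper_bound, and the first sibling whose bound covers the flow
 * hash is chosen, unless its next hop scores negatively, in which case
 * the original match is kept.
 */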
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432 					struct fib6_info *match,
433 					struct flowi6 *fl6, int oif,
434 					const struct sk_buff *skb,
435 					int strict)
436 {
437 	struct fib6_info *sibling, *next_sibling;
438 
439 	/* We might have already computed the hash for ICMPv6 errors. In such
440 	 * a case it will always be non-zero. Otherwise now is the time to do it.
441 	 */
442 	if (!fl6->mp_hash)
443 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444 
445 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
446 		return match;
447 
448 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449 				 fib6_siblings) {
450 		int nh_upper_bound;
451 
452 		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
453 		if (fl6->mp_hash > nh_upper_bound)
454 			continue;
455 		if (rt6_score_route(sibling, oif, strict) < 0)
456 			break;
457 		match = sibling;
458 		break;
459 	}
460 
461 	return match;
462 }
463 
464 /*
465  *	Route lookup. rcu_read_lock() should be held.
466  */
467 
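/* Restrict @rt to entries whose nexthop device matches @oif (or whose
 * device owns @saddr when no oif is given).  With RT6_LOOKUP_F_IFACE an
 * oif mismatch is fatal and the null entry is returned.
 */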
468 static inline struct fib6_info *rt6_device_match(struct net *net,
469 						 struct fib6_info *rt,
470 						    const struct in6_addr *saddr,
471 						    int oif,
472 						    int flags)
473 {
474 	struct fib6_info *sprt;
475 
476 	if (!oif && ipv6_addr_any(saddr) &&
477 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
478 		return rt;
479 
480 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
481 		const struct net_device *dev = sprt->fib6_nh.nh_dev;
482 
483 		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
484 			continue;
485 
486 		if (oif) {
487 			if (dev->ifindex == oif)
488 				return sprt;
489 		} else {
490 			if (ipv6_chk_addr(net, saddr, dev,
491 					  flags & RT6_LOOKUP_F_IFACE))
492 				return sprt;
493 		}
494 	}
495 
496 	if (oif && flags & RT6_LOOKUP_F_IFACE)
497 		return net->ipv6.fib6_null_entry;
498 
499 	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
500 }
501 
502 #ifdef CONFIG_IPV6_ROUTER_PREF
503 struct __rt6_probe_work {
504 	struct work_struct work;
505 	struct in6_addr target;
506 	struct net_device *dev;
507 };
508 
509 static void rt6_probe_deferred(struct work_struct *w)
510 {
511 	struct in6_addr mcaddr;
512 	struct __rt6_probe_work *work =
513 		container_of(w, struct __rt6_probe_work, work);
514 
515 	addrconf_addr_solict_mult(&work->target, &mcaddr);
516 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
517 	dev_put(work->dev);
518 	kfree(work);
519 }
520 
521 static void rt6_probe(struct fib6_info *rt)
522 {
523 	struct __rt6_probe_work *work = NULL;
524 	const struct in6_addr *nh_gw;
525 	struct neighbour *neigh;
526 	struct net_device *dev;
527 	struct inet6_dev *idev;
528 
529 	/*
530 	 * Okay, this does not seem to be appropriate
531 	 * for now; however, we need to check whether it
532 	 * really is so, aka Router Reachability Probing.
533 	 *
534 	 * Router Reachability Probe MUST be rate-limited
535 	 * to no more than one per minute.
536 	 */
537 	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
538 		return;
539 
540 	nh_gw = &rt->fib6_nh.nh_gw;
541 	dev = rt->fib6_nh.nh_dev;
542 	rcu_read_lock_bh();
543 	idev = __in6_dev_get(dev);
544 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
545 	if (neigh) {
546 		if (neigh->nud_state & NUD_VALID)
547 			goto out;
548 
549 		write_lock(&neigh->lock);
550 		if (!(neigh->nud_state & NUD_VALID) &&
551 		    time_after(jiffies,
552 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
553 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
554 			if (work)
555 				__neigh_set_probe_once(neigh);
556 		}
557 		write_unlock(&neigh->lock);
558 	} else if (time_after(jiffies, rt->last_probe +
559 				       idev->cnf.rtr_probe_interval)) {
560 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
561 	}
562 
563 	if (work) {
564 		rt->last_probe = jiffies;
565 		INIT_WORK(&work->work, rt6_probe_deferred);
566 		work->target = *nh_gw;
567 		dev_hold(dev);
568 		work->dev = dev;
569 		schedule_work(&work->work);
570 	}
571 
572 out:
573 	rcu_read_unlock_bh();
574 }
575 #else
576 static inline void rt6_probe(struct fib6_info *rt)
577 {
578 }
579 #endif
580 
581 /*
582  * Default Router Selection (RFC 2461 6.3.6)
583  */
584 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
585 {
586 	const struct net_device *dev = rt->fib6_nh.nh_dev;
587 
588 	if (!oif || dev->ifindex == oif)
589 		return 2;
590 	return 0;
591 }
592 
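/* Map the gateway's NUD state onto an rt6_nud_state score.  A valid
 * neighbour entry always succeeds.  With CONFIG_IPV6_ROUTER_PREF, a
 * failed entry requests a probe and a missing one still succeeds;
 * without it, a missing entry asks for round-robin fallback instead.
 */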
593 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
594 {
595 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
596 	struct neighbour *neigh;
597 
598 	if (rt->fib6_flags & RTF_NONEXTHOP ||
599 	    !(rt->fib6_flags & RTF_GATEWAY))
600 		return RT6_NUD_SUCCEED;
601 
602 	rcu_read_lock_bh();
603 	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
604 					  &rt->fib6_nh.nh_gw);
605 	if (neigh) {
606 		read_lock(&neigh->lock);
607 		if (neigh->nud_state & NUD_VALID)
608 			ret = RT6_NUD_SUCCEED;
609 #ifdef CONFIG_IPV6_ROUTER_PREF
610 		else if (!(neigh->nud_state & NUD_FAILED))
611 			ret = RT6_NUD_SUCCEED;
612 		else
613 			ret = RT6_NUD_FAIL_PROBE;
614 #endif
615 		read_unlock(&neigh->lock);
616 	} else {
617 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
618 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619 	}
620 	rcu_read_unlock_bh();
621 
622 	return ret;
623 }
624 
625 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
626 {
627 	int m;
628 
629 	m = rt6_check_dev(rt, oif);
630 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
631 		return RT6_NUD_FAIL_HARD;
632 #ifdef CONFIG_IPV6_ROUTER_PREF
633 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
634 #endif
635 	if (strict & RT6_LOOKUP_F_REACHABLE) {
636 		int n = rt6_check_neigh(rt);
637 		if (n < 0)
638 			return n;
639 	}
640 	return m;
641 }
642 
643 /* called with rcu_read_lock held */
644 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
645 {
646 	const struct net_device *dev = fib6_info_nh_dev(f6i);
647 	bool rc = false;
648 
649 	if (dev) {
650 		const struct inet6_dev *idev = __in6_dev_get(dev);
651 
652 		rc = !!idev->cnf.ignore_routes_with_linkdown;
653 	}
654 
655 	return rc;
656 }
657 
658 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
659 				   int *mpri, struct fib6_info *match,
660 				   bool *do_rr)
661 {
662 	int m;
663 	bool match_do_rr = false;
664 
665 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
666 		goto out;
667 
668 	if (fib6_ignore_linkdown(rt) &&
669 	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
670 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
671 		goto out;
672 
673 	if (fib6_check_expired(rt))
674 		goto out;
675 
676 	m = rt6_score_route(rt, oif, strict);
677 	if (m == RT6_NUD_FAIL_DO_RR) {
678 		match_do_rr = true;
679 		m = 0; /* lowest valid score */
680 	} else if (m == RT6_NUD_FAIL_HARD) {
681 		goto out;
682 	}
683 
684 	if (strict & RT6_LOOKUP_F_REACHABLE)
685 		rt6_probe(rt);
686 
687 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
688 	if (m > *mpri) {
689 		*do_rr = match_do_rr;
690 		*mpri = m;
691 		match = rt;
692 	}
693 out:
694 	return match;
695 }
696 
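/* Scan the routes sharing @metric for the best-scoring match, starting
 * at @rr_head and wrapping around from @leaf, so that rotating
 * fn->rr_ptr changes which route wins ties.  Routes with a different
 * metric (collected in 'cont') are only considered if nothing matched.
 */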
697 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
698 				     struct fib6_info *leaf,
699 				     struct fib6_info *rr_head,
700 				     u32 metric, int oif, int strict,
701 				     bool *do_rr)
702 {
703 	struct fib6_info *rt, *match, *cont;
704 	int mpri = -1;
705 
706 	match = NULL;
707 	cont = NULL;
708 	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
709 		if (rt->fib6_metric != metric) {
710 			cont = rt;
711 			break;
712 		}
713 
714 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
715 	}
716 
717 	for (rt = leaf; rt && rt != rr_head;
718 	     rt = rcu_dereference(rt->fib6_next)) {
719 		if (rt->fib6_metric != metric) {
720 			cont = rt;
721 			break;
722 		}
723 
724 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
725 	}
726 
727 	if (match || !cont)
728 		return match;
729 
730 	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
731 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
732 
733 	return match;
734 }
735 
736 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
737 				   int oif, int strict)
738 {
739 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
740 	struct fib6_info *match, *rt0;
741 	bool do_rr = false;
742 	int key_plen;
743 
744 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
745 		return net->ipv6.fib6_null_entry;
746 
747 	rt0 = rcu_dereference(fn->rr_ptr);
748 	if (!rt0)
749 		rt0 = leaf;
750 
751 	/* Double check to make sure fn is not an intermediate node
752 	 * and fn->leaf does not point to its child's leaf
753 	 * (This might happen if all routes under fn are deleted from
754 	 * the tree and fib6_repair_tree() is called on the node.)
755 	 */
756 	key_plen = rt0->fib6_dst.plen;
757 #ifdef CONFIG_IPV6_SUBTREES
758 	if (rt0->fib6_src.plen)
759 		key_plen = rt0->fib6_src.plen;
760 #endif
761 	if (fn->fn_bit != key_plen)
762 		return net->ipv6.fib6_null_entry;
763 
764 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
765 			     &do_rr);
766 
767 	if (do_rr) {
768 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
769 
770 		/* no entries matched; do round-robin */
771 		if (!next || next->fib6_metric != rt0->fib6_metric)
772 			next = leaf;
773 
774 		if (next != rt0) {
775 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
776 			/* make sure next is not being deleted from the tree */
777 			if (next->fib6_node)
778 				rcu_assign_pointer(fn->rr_ptr, next);
779 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
780 		}
781 	}
782 
783 	return match ? match : net->ipv6.fib6_null_entry;
784 }
785 
786 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
787 {
788 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
789 }
790 
791 #ifdef CONFIG_IPV6_ROUTE_INFO
792 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
793 		  const struct in6_addr *gwaddr)
794 {
795 	struct net *net = dev_net(dev);
796 	struct route_info *rinfo = (struct route_info *) opt;
797 	struct in6_addr prefix_buf, *prefix;
798 	unsigned int pref;
799 	unsigned long lifetime;
800 	struct fib6_info *rt;
801 
802 	if (len < sizeof(struct route_info)) {
803 		return -EINVAL;
804 	}
805 
806 	/* Sanity check for prefix_len and length */
807 	if (rinfo->length > 3) {
808 		return -EINVAL;
809 	} else if (rinfo->prefix_len > 128) {
810 		return -EINVAL;
811 	} else if (rinfo->prefix_len > 64) {
812 		if (rinfo->length < 2) {
813 			return -EINVAL;
814 		}
815 	} else if (rinfo->prefix_len > 0) {
816 		if (rinfo->length < 1) {
817 			return -EINVAL;
818 		}
819 	}
820 
821 	pref = rinfo->route_pref;
822 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
823 		return -EINVAL;
824 
825 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
826 
827 	if (rinfo->length == 3)
828 		prefix = (struct in6_addr *)rinfo->prefix;
829 	else {
830 		/* ipv6_addr_prefix() safely copies only prefix_len bits */
831 		ipv6_addr_prefix(&prefix_buf,
832 				 (struct in6_addr *)rinfo->prefix,
833 				 rinfo->prefix_len);
834 		prefix = &prefix_buf;
835 	}
836 
837 	if (rinfo->prefix_len == 0)
838 		rt = rt6_get_dflt_router(net, gwaddr, dev);
839 	else
840 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
841 					gwaddr, dev);
842 
843 	if (rt && !lifetime) {
844 		ip6_del_rt(net, rt);
845 		rt = NULL;
846 	}
847 
848 	if (!rt && lifetime)
849 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
850 					dev, pref);
851 	else if (rt)
852 		rt->fib6_flags = RTF_ROUTEINFO |
853 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
854 
855 	if (rt) {
856 		if (!addrconf_finite_timeout(lifetime))
857 			fib6_clean_expires(rt);
858 		else
859 			fib6_set_expires(rt, jiffies + HZ * lifetime);
860 
861 		fib6_info_release(rt);
862 	}
863 	return 0;
864 }
865 #endif
866 
867 /*
868  *	Misc support functions
869  */
870 
871 /* called with rcu_lock held */
872 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
873 {
874 	struct net_device *dev = rt->fib6_nh.nh_dev;
875 
876 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
877 		/* for copies of local routes, dst->dev needs to be the
878 		 * device if it is a master device, the master device if
879 		 * device is enslaved, and the loopback as the default
880 		 */
881 		if (netif_is_l3_slave(dev) &&
882 		    !rt6_need_strict(&rt->fib6_dst.addr))
883 			dev = l3mdev_master_dev_rcu(dev);
884 		else if (!netif_is_l3_master(dev))
885 			dev = dev_net(dev)->loopback_dev;
886 		/* in the remaining case netif_is_l3_master(dev) is true,
887 		 * and dev itself is the device we want returned
888 		 */
889 	}
890 
891 	return dev;
892 }
893 
894 static const int fib6_prop[RTN_MAX + 1] = {
895 	[RTN_UNSPEC]	= 0,
896 	[RTN_UNICAST]	= 0,
897 	[RTN_LOCAL]	= 0,
898 	[RTN_BROADCAST]	= 0,
899 	[RTN_ANYCAST]	= 0,
900 	[RTN_MULTICAST]	= 0,
901 	[RTN_BLACKHOLE]	= -EINVAL,
902 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
903 	[RTN_PROHIBIT]	= -EACCES,
904 	[RTN_THROW]	= -EAGAIN,
905 	[RTN_NAT]	= -EINVAL,
906 	[RTN_XRESOLVE]	= -EINVAL,
907 };
908 
909 static int ip6_rt_type_to_error(u8 fib6_type)
910 {
911 	return fib6_prop[fib6_type];
912 }
913 
914 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
915 {
916 	unsigned short flags = 0;
917 
918 	if (rt->dst_nocount)
919 		flags |= DST_NOCOUNT;
920 	if (rt->dst_nopolicy)
921 		flags |= DST_NOPOLICY;
922 	if (rt->dst_host)
923 		flags |= DST_HOST;
924 
925 	return flags;
926 }
927 
928 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
929 {
930 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
931 
932 	switch (ort->fib6_type) {
933 	case RTN_BLACKHOLE:
934 		rt->dst.output = dst_discard_out;
935 		rt->dst.input = dst_discard;
936 		break;
937 	case RTN_PROHIBIT:
938 		rt->dst.output = ip6_pkt_prohibit_out;
939 		rt->dst.input = ip6_pkt_prohibit;
940 		break;
941 	case RTN_THROW:
942 	case RTN_UNREACHABLE:
943 	default:
944 		rt->dst.output = ip6_pkt_discard_out;
945 		rt->dst.input = ip6_pkt_discard;
946 		break;
947 	}
948 }
949 
950 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
951 {
952 	if (ort->fib6_flags & RTF_REJECT) {
953 		ip6_rt_init_dst_reject(rt, ort);
954 		return;
955 	}
956 
957 	rt->dst.error = 0;
958 	rt->dst.output = ip6_output;
959 
960 	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
961 		rt->dst.input = ip6_input;
962 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
963 		rt->dst.input = ip6_mc_input;
964 	} else {
965 		rt->dst.input = ip6_forward;
966 	}
967 
968 	if (ort->fib6_nh.nh_lwtstate) {
969 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
970 		lwtunnel_set_redirect(&rt->dst);
971 	}
972 
973 	rt->dst.lastuse = jiffies;
974 }
975 
976 /* Caller must already hold reference to @from */
977 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
978 {
979 	rt->rt6i_flags &= ~RTF_EXPIRES;
980 	rcu_assign_pointer(rt->from, from);
981 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
982 	if (from->fib6_metrics != &dst_default_metrics) {
983 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
984 		refcount_inc(&from->fib6_metrics->refcnt);
985 	}
986 }
987 
988 /* Caller must already hold reference to @ort */
989 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
990 {
991 	struct net_device *dev = fib6_info_nh_dev(ort);
992 
993 	ip6_rt_init_dst(rt, ort);
994 
995 	rt->rt6i_dst = ort->fib6_dst;
996 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
997 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
998 	rt->rt6i_flags = ort->fib6_flags;
999 	rt6_set_from(rt, ort);
1000 #ifdef CONFIG_IPV6_SUBTREES
1001 	rt->rt6i_src = ort->fib6_src;
1002 #endif
1003 	rt->rt6i_prefsrc = ort->fib6_prefsrc;
1004 }
1005 
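/* Walk back up the tree after a failed lookup: climb to the parent,
 * descending into its source-routing subtree when one exists, until a
 * node carrying route info (RTN_RTINFO) or the top-level root is hit.
 */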
1006 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1007 					struct in6_addr *saddr)
1008 {
1009 	struct fib6_node *pn, *sn;
1010 	while (1) {
1011 		if (fn->fn_flags & RTN_TL_ROOT)
1012 			return NULL;
1013 		pn = rcu_dereference(fn->parent);
1014 		sn = FIB6_SUBTREE(pn);
1015 		if (sn && sn != fn)
1016 			fn = fib6_node_lookup(sn, NULL, saddr);
1017 		else
1018 			fn = pn;
1019 		if (fn->fn_flags & RTN_RTINFO)
1020 			return fn;
1021 	}
1022 }
1023 
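/* Try to take a reference on *prt.  If the dst is already being
 * destroyed, *prt is replaced by the null entry (with a fresh hold) or
 * by NULL depending on @null_fallback, and false is returned.
 */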
1024 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1025 			  bool null_fallback)
1026 {
1027 	struct rt6_info *rt = *prt;
1028 
1029 	if (dst_hold_safe(&rt->dst))
1030 		return true;
1031 	if (null_fallback) {
1032 		rt = net->ipv6.ip6_null_entry;
1033 		dst_hold(&rt->dst);
1034 	} else {
1035 		rt = NULL;
1036 	}
1037 	*prt = rt;
1038 	return false;
1039 }
1040 
1041 /* called with rcu_lock held */
1042 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1043 {
1044 	unsigned short flags = fib6_info_dst_flags(rt);
1045 	struct net_device *dev = rt->fib6_nh.nh_dev;
1046 	struct rt6_info *nrt;
1047 
1048 	if (!fib6_info_hold_safe(rt))
1049 		return NULL;
1050 
1051 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1052 	if (nrt)
1053 		ip6_rt_copy_init(nrt, rt);
1054 	else
1055 		fib6_info_release(rt);
1056 
1057 	return nrt;
1058 }
1059 
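/* Flow-based lookup without per-cpu caching: walk the table with
 * backtracking, prefer a matching cached exception route, and otherwise
 * allocate a one-off rt6_info clone, falling back to the null entry's
 * dst on failure.
 */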
1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1061 					     struct fib6_table *table,
1062 					     struct flowi6 *fl6,
1063 					     const struct sk_buff *skb,
1064 					     int flags)
1065 {
1066 	struct fib6_info *f6i;
1067 	struct fib6_node *fn;
1068 	struct rt6_info *rt;
1069 
1070 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1071 		flags &= ~RT6_LOOKUP_F_IFACE;
1072 
1073 	rcu_read_lock();
1074 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1075 restart:
1076 	f6i = rcu_dereference(fn->leaf);
1077 	if (!f6i) {
1078 		f6i = net->ipv6.fib6_null_entry;
1079 	} else {
1080 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081 				      fl6->flowi6_oif, flags);
1082 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1083 			f6i = fib6_multipath_select(net, f6i, fl6,
1084 						    fl6->flowi6_oif, skb,
1085 						    flags);
1086 	}
1087 	if (f6i == net->ipv6.fib6_null_entry) {
1088 		fn = fib6_backtrack(fn, &fl6->saddr);
1089 		if (fn)
1090 			goto restart;
1091 	}
1092 
1093 	trace_fib6_table_lookup(net, f6i, table, fl6);
1094 
1095 	/* Search through exception table */
1096 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1097 	if (rt) {
1098 		if (ip6_hold_safe(net, &rt, true))
1099 			dst_use_noref(&rt->dst, jiffies);
1100 	} else if (f6i == net->ipv6.fib6_null_entry) {
1101 		rt = net->ipv6.ip6_null_entry;
1102 		dst_hold(&rt->dst);
1103 	} else {
1104 		rt = ip6_create_rt_rcu(f6i);
1105 		if (!rt) {
1106 			rt = net->ipv6.ip6_null_entry;
1107 			dst_hold(&rt->dst);
1108 		}
1109 	}
1110 
1111 	rcu_read_unlock();
1112 
1113 	return rt;
1114 }
1115 
1116 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1117 				   const struct sk_buff *skb, int flags)
1118 {
1119 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1120 }
1121 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1122 
1123 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1124 			    const struct in6_addr *saddr, int oif,
1125 			    const struct sk_buff *skb, int strict)
1126 {
1127 	struct flowi6 fl6 = {
1128 		.flowi6_oif = oif,
1129 		.daddr = *daddr,
1130 	};
1131 	struct dst_entry *dst;
1132 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1133 
1134 	if (saddr) {
1135 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1136 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1137 	}
1138 
1139 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1140 	if (dst->error == 0)
1141 		return (struct rt6_info *) dst;
1142 
1143 	dst_release(dst);
1144 
1145 	return NULL;
1146 }
1147 EXPORT_SYMBOL(rt6_lookup);
1148 
1149 /* ip6_ins_rt is called with table->tb6_lock NOT held; the lock is
1150  * taken internally. It takes a new route entry; if the addition fails
1151  * for any reason, the route is released.
1152  * Caller must hold a dst reference before calling it.
1153  */
1154 
1155 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1156 			struct netlink_ext_ack *extack)
1157 {
1158 	int err;
1159 	struct fib6_table *table;
1160 
1161 	table = rt->fib6_table;
1162 	spin_lock_bh(&table->tb6_lock);
1163 	err = fib6_add(&table->tb6_root, rt, info, extack);
1164 	spin_unlock_bh(&table->tb6_lock);
1165 
1166 	return err;
1167 }
1168 
1169 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1170 {
1171 	struct nl_info info = {	.nl_net = net, };
1172 
1173 	return __ip6_ins_rt(rt, &info, NULL);
1174 }
1175 
1176 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1177 					   const struct in6_addr *daddr,
1178 					   const struct in6_addr *saddr)
1179 {
1180 	struct net_device *dev;
1181 	struct rt6_info *rt;
1182 
1183 	/*
1184 	 *	Clone the route.
1185 	 */
1186 
1187 	if (!fib6_info_hold_safe(ort))
1188 		return NULL;
1189 
1190 	dev = ip6_rt_get_dev_rcu(ort);
1191 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1192 	if (!rt) {
1193 		fib6_info_release(ort);
1194 		return NULL;
1195 	}
1196 
1197 	ip6_rt_copy_init(rt, ort);
1198 	rt->rt6i_flags |= RTF_CACHE;
1199 	rt->dst.flags |= DST_HOST;
1200 	rt->rt6i_dst.addr = *daddr;
1201 	rt->rt6i_dst.plen = 128;
1202 
1203 	if (!rt6_is_gw_or_nonexthop(ort)) {
1204 		if (ort->fib6_dst.plen != 128 &&
1205 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1206 			rt->rt6i_flags |= RTF_ANYCAST;
1207 #ifdef CONFIG_IPV6_SUBTREES
1208 		if (rt->rt6i_src.plen && saddr) {
1209 			rt->rt6i_src.addr = *saddr;
1210 			rt->rt6i_src.plen = 128;
1211 		}
1212 #endif
1213 	}
1214 
1215 	return rt;
1216 }
1217 
1218 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1219 {
1220 	unsigned short flags = fib6_info_dst_flags(rt);
1221 	struct net_device *dev;
1222 	struct rt6_info *pcpu_rt;
1223 
1224 	if (!fib6_info_hold_safe(rt))
1225 		return NULL;
1226 
1227 	rcu_read_lock();
1228 	dev = ip6_rt_get_dev_rcu(rt);
1229 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1230 	rcu_read_unlock();
1231 	if (!pcpu_rt) {
1232 		fib6_info_release(rt);
1233 		return NULL;
1234 	}
1235 	ip6_rt_copy_init(pcpu_rt, rt);
1236 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1237 	return pcpu_rt;
1238 }
1239 
1240 /* It should be called with rcu_read_lock() acquired */
1241 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1242 {
1243 	struct rt6_info *pcpu_rt, **p;
1244 
1245 	p = this_cpu_ptr(rt->rt6i_pcpu);
1246 	pcpu_rt = *p;
1247 
1248 	if (pcpu_rt)
1249 		ip6_hold_safe(NULL, &pcpu_rt, false);
1250 
1251 	return pcpu_rt;
1252 }
1253 
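/* Allocate and publish the per-cpu copy of @rt for this CPU.  The slot
 * is expected to be empty, hence the BUG_ON() if the cmpxchg() finds a
 * previous entry.  Falls back to the null entry when allocation fails.
 */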
1254 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1255 					    struct fib6_info *rt)
1256 {
1257 	struct rt6_info *pcpu_rt, *prev, **p;
1258 
1259 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1260 	if (!pcpu_rt) {
1261 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1262 		return net->ipv6.ip6_null_entry;
1263 	}
1264 
1265 	dst_hold(&pcpu_rt->dst);
1266 	p = this_cpu_ptr(rt->rt6i_pcpu);
1267 	prev = cmpxchg(p, NULL, pcpu_rt);
1268 	BUG_ON(prev);
1269 
1270 	return pcpu_rt;
1271 }
1272 
1273 /* exception hash table implementation
1274  */
1275 static DEFINE_SPINLOCK(rt6_exception_lock);
1276 
1277 /* Remove rt6_ex from hash table and free the memory
1278  * Caller must hold rt6_exception_lock
1279  */
1280 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1281 				 struct rt6_exception *rt6_ex)
1282 {
1283 	struct net *net;
1284 
1285 	if (!bucket || !rt6_ex)
1286 		return;
1287 
1288 	net = dev_net(rt6_ex->rt6i->dst.dev);
1289 	hlist_del_rcu(&rt6_ex->hlist);
1290 	dst_release(&rt6_ex->rt6i->dst);
1291 	kfree_rcu(rt6_ex, rcu);
1292 	WARN_ON_ONCE(!bucket->depth);
1293 	bucket->depth--;
1294 	net->ipv6.rt6_stats->fib_rt_cache--;
1295 }
1296 
1297 /* Remove oldest rt6_ex in bucket and free the memory
1298  * Caller must hold rt6_exception_lock
1299  */
1300 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1301 {
1302 	struct rt6_exception *rt6_ex, *oldest = NULL;
1303 
1304 	if (!bucket)
1305 		return;
1306 
1307 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1308 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1309 			oldest = rt6_ex;
1310 	}
1311 	rt6_remove_exception(bucket, oldest);
1312 }
1313 
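/* Hash (dst, src) into one of FIB6_EXCEPTION_BUCKET_SIZE buckets; the
 * source address only contributes when subtrees are configured.
 */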
1314 static u32 rt6_exception_hash(const struct in6_addr *dst,
1315 			      const struct in6_addr *src)
1316 {
1317 	static u32 seed __read_mostly;
1318 	u32 val;
1319 
1320 	net_get_random_once(&seed, sizeof(seed));
1321 	val = jhash(dst, sizeof(*dst), seed);
1322 
1323 #ifdef CONFIG_IPV6_SUBTREES
1324 	if (src)
1325 		val = jhash(src, sizeof(*src), val);
1326 #endif
1327 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1328 }
1329 
1330 /* Helper function to find the cached rt in the hash table
1331  * and update bucket pointer to point to the bucket for this
1332  * (daddr, saddr) pair
1333  * Caller must hold rt6_exception_lock
1334  */
1335 static struct rt6_exception *
1336 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1337 			      const struct in6_addr *daddr,
1338 			      const struct in6_addr *saddr)
1339 {
1340 	struct rt6_exception *rt6_ex;
1341 	u32 hval;
1342 
1343 	if (!(*bucket) || !daddr)
1344 		return NULL;
1345 
1346 	hval = rt6_exception_hash(daddr, saddr);
1347 	*bucket += hval;
1348 
1349 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1350 		struct rt6_info *rt6 = rt6_ex->rt6i;
1351 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1352 
1353 #ifdef CONFIG_IPV6_SUBTREES
1354 		if (matched && saddr)
1355 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1356 #endif
1357 		if (matched)
1358 			return rt6_ex;
1359 	}
1360 	return NULL;
1361 }
1362 
1363 /* Helper function to find the cached rt in the hash table
1364  * and update bucket pointer to point to the bucket for this
1365  * (daddr, saddr) pair
1366  * Caller must hold rcu_read_lock()
1367  */
1368 static struct rt6_exception *
1369 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1370 			 const struct in6_addr *daddr,
1371 			 const struct in6_addr *saddr)
1372 {
1373 	struct rt6_exception *rt6_ex;
1374 	u32 hval;
1375 
1376 	WARN_ON_ONCE(!rcu_read_lock_held());
1377 
1378 	if (!(*bucket) || !daddr)
1379 		return NULL;
1380 
1381 	hval = rt6_exception_hash(daddr, saddr);
1382 	*bucket += hval;
1383 
1384 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1385 		struct rt6_info *rt6 = rt6_ex->rt6i;
1386 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1387 
1388 #ifdef CONFIG_IPV6_SUBTREES
1389 		if (matched && saddr)
1390 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1391 #endif
1392 		if (matched)
1393 			return rt6_ex;
1394 	}
1395 	return NULL;
1396 }
1397 
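/* Effective MTU for @rt: the stored path MTU when set, else the
 * device's IPv6 MTU, capped at IP6_MAX_MTU and reduced by any
 * lightweight-tunnel encapsulation headroom.
 */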
1398 static unsigned int fib6_mtu(const struct fib6_info *rt)
1399 {
1400 	unsigned int mtu;
1401 
1402 	if (rt->fib6_pmtu) {
1403 		mtu = rt->fib6_pmtu;
1404 	} else {
1405 		struct net_device *dev = fib6_info_nh_dev(rt);
1406 		struct inet6_dev *idev;
1407 
1408 		rcu_read_lock();
1409 		idev = __in6_dev_get(dev);
1410 		mtu = idev->cnf.mtu6;
1411 		rcu_read_unlock();
1412 	}
1413 
1414 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1415 
1416 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1417 }
1418 
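/* Insert @nrt as a cached exception route (e.g. a PMTU or redirect
 * clone) keyed under @ort.  Any existing exception for the same
 * (daddr, saddr) key is replaced, bucket depth is capped at
 * FIB6_MAX_DEPTH, and the tree sernum is bumped so stale cached dsts
 * get revalidated.
 */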
1419 static int rt6_insert_exception(struct rt6_info *nrt,
1420 				struct fib6_info *ort)
1421 {
1422 	struct net *net = dev_net(nrt->dst.dev);
1423 	struct rt6_exception_bucket *bucket;
1424 	struct in6_addr *src_key = NULL;
1425 	struct rt6_exception *rt6_ex;
1426 	int err = 0;
1427 
1428 	spin_lock_bh(&rt6_exception_lock);
1429 
1430 	if (ort->exception_bucket_flushed) {
1431 		err = -EINVAL;
1432 		goto out;
1433 	}
1434 
1435 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1436 					lockdep_is_held(&rt6_exception_lock));
1437 	if (!bucket) {
1438 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1439 				 GFP_ATOMIC);
1440 		if (!bucket) {
1441 			err = -ENOMEM;
1442 			goto out;
1443 		}
1444 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1445 	}
1446 
1447 #ifdef CONFIG_IPV6_SUBTREES
1448 	/* rt6i_src.plen != 0 indicates ort is in subtree
1449 	 * and exception table is indexed by a hash of
1450 	 * both rt6i_dst and rt6i_src.
1451 	 * Otherwise, the exception table is indexed by
1452 	 * a hash of only rt6i_dst.
1453 	 */
1454 	if (ort->fib6_src.plen)
1455 		src_key = &nrt->rt6i_src.addr;
1456 #endif
1457 
1458 	/* Update rt6i_prefsrc as it could be changed
1459 	 * in rt6_remove_prefsrc()
1460 	 */
1461 	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1462 	/* rt6_mtu_change() might lower mtu on ort.
1463 	 * Only insert this exception route if its mtu
1464 	 * is less than ort's mtu value.
1465 	 */
1466 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1467 		err = -EINVAL;
1468 		goto out;
1469 	}
1470 
1471 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1472 					       src_key);
1473 	if (rt6_ex)
1474 		rt6_remove_exception(bucket, rt6_ex);
1475 
1476 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1477 	if (!rt6_ex) {
1478 		err = -ENOMEM;
1479 		goto out;
1480 	}
1481 	rt6_ex->rt6i = nrt;
1482 	rt6_ex->stamp = jiffies;
1483 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1484 	bucket->depth++;
1485 	net->ipv6.rt6_stats->fib_rt_cache++;
1486 
1487 	if (bucket->depth > FIB6_MAX_DEPTH)
1488 		rt6_exception_remove_oldest(bucket);
1489 
1490 out:
1491 	spin_unlock_bh(&rt6_exception_lock);
1492 
1493 	/* Update fn->fn_sernum to invalidate all cached dst */
1494 	if (!err) {
1495 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1496 		fib6_update_sernum(net, ort);
1497 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1498 		fib6_force_start_gc(net);
1499 	}
1500 
1501 	return err;
1502 }
1503 
1504 void rt6_flush_exceptions(struct fib6_info *rt)
1505 {
1506 	struct rt6_exception_bucket *bucket;
1507 	struct rt6_exception *rt6_ex;
1508 	struct hlist_node *tmp;
1509 	int i;
1510 
1511 	spin_lock_bh(&rt6_exception_lock);
1512 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1513 	rt->exception_bucket_flushed = 1;
1514 
1515 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1516 				    lockdep_is_held(&rt6_exception_lock));
1517 	if (!bucket)
1518 		goto out;
1519 
1520 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1521 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1522 			rt6_remove_exception(bucket, rt6_ex);
1523 		WARN_ON_ONCE(bucket->depth);
1524 		bucket++;
1525 	}
1526 
1527 out:
1528 	spin_unlock_bh(&rt6_exception_lock);
1529 }
1530 
1531 /* Find cached rt in the hash table inside passed in rt
1532  * Caller has to hold rcu_read_lock()
1533  */
1534 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1535 					   struct in6_addr *daddr,
1536 					   struct in6_addr *saddr)
1537 {
1538 	struct rt6_exception_bucket *bucket;
1539 	struct in6_addr *src_key = NULL;
1540 	struct rt6_exception *rt6_ex;
1541 	struct rt6_info *res = NULL;
1542 
1543 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1544 
1545 #ifdef CONFIG_IPV6_SUBTREES
1546 	/* rt6i_src.plen != 0 indicates rt is in subtree
1547 	 * and exception table is indexed by a hash of
1548 	 * both rt6i_dst and rt6i_src.
1549 	 * Otherwise, the exception table is indexed by
1550 	 * a hash of only rt6i_dst.
1551 	 */
1552 	if (rt->fib6_src.plen)
1553 		src_key = saddr;
1554 #endif
1555 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1556 
1557 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1558 		res = rt6_ex->rt6i;
1559 
1560 	return res;
1561 }
1562 
1563 /* Remove the passed in cached rt from the hash table that contains it */
1564 static int rt6_remove_exception_rt(struct rt6_info *rt)
1565 {
1566 	struct rt6_exception_bucket *bucket;
1567 	struct in6_addr *src_key = NULL;
1568 	struct rt6_exception *rt6_ex;
1569 	struct fib6_info *from;
1570 	int err;
1571 
1572 	from = rcu_dereference(rt->from);
1573 	if (!from ||
1574 	    !(rt->rt6i_flags & RTF_CACHE))
1575 		return -EINVAL;
1576 
1577 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1578 		return -ENOENT;
1579 
1580 	spin_lock_bh(&rt6_exception_lock);
1581 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1582 				    lockdep_is_held(&rt6_exception_lock));
1583 #ifdef CONFIG_IPV6_SUBTREES
1584 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1585 	 * and exception table is indexed by a hash of
1586 	 * both rt6i_dst and rt6i_src.
1587 	 * Otherwise, the exception table is indexed by
1588 	 * a hash of only rt6i_dst.
1589 	 */
1590 	if (from->fib6_src.plen)
1591 		src_key = &rt->rt6i_src.addr;
1592 #endif
1593 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1594 					       &rt->rt6i_dst.addr,
1595 					       src_key);
1596 	if (rt6_ex) {
1597 		rt6_remove_exception(bucket, rt6_ex);
1598 		err = 0;
1599 	} else {
1600 		err = -ENOENT;
1601 	}
1602 
1603 	spin_unlock_bh(&rt6_exception_lock);
1604 	return err;
1605 }
1606 
1607 /* Find rt6_ex which contains the passed in rt cache and
1608  * refresh its stamp
1609  */
1610 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1611 {
1612 	struct rt6_exception_bucket *bucket;
1613 	struct fib6_info *from = rt->from;
1614 	struct in6_addr *src_key = NULL;
1615 	struct rt6_exception *rt6_ex;
1616 
1617 	if (!from ||
1618 	    !(rt->rt6i_flags & RTF_CACHE))
1619 		return;
1620 
1621 	rcu_read_lock();
1622 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1623 
1624 #ifdef CONFIG_IPV6_SUBTREES
1625 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1626 	 * and exception table is indexed by a hash of
1627 	 * both rt6i_dst and rt6i_src.
1628 	 * Otherwise, the exception table is indexed by
1629 	 * a hash of only rt6i_dst.
1630 	 */
1631 	if (from->fib6_src.plen)
1632 		src_key = &rt->rt6i_src.addr;
1633 #endif
1634 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1635 					  &rt->rt6i_dst.addr,
1636 					  src_key);
1637 	if (rt6_ex)
1638 		rt6_ex->stamp = jiffies;
1639 
1640 	rcu_read_unlock();
1641 }
1642 
1643 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1644 {
1645 	struct rt6_exception_bucket *bucket;
1646 	struct rt6_exception *rt6_ex;
1647 	int i;
1648 
1649 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1650 					lockdep_is_held(&rt6_exception_lock));
1651 
1652 	if (bucket) {
1653 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1654 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1655 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1656 			}
1657 			bucket++;
1658 		}
1659 	}
1660 }
1661 
1662 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1663 					 struct rt6_info *rt, int mtu)
1664 {
1665 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1666 	 * lowest MTU in the path: always allow updating the route PMTU to
1667 	 * reflect PMTU decreases.
1668 	 *
1669 	 * If the new MTU is higher, and the route PMTU is equal to the local
1670 	 * MTU, this means the old MTU is the lowest in the path, so allow
1671 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1672 	 * handle this.
1673 	 */
1674 
1675 	if (dst_mtu(&rt->dst) >= mtu)
1676 		return true;
1677 
1678 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1679 		return true;
1680 
1681 	return false;
1682 }
1683 
1684 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1685 				       struct fib6_info *rt, int mtu)
1686 {
1687 	struct rt6_exception_bucket *bucket;
1688 	struct rt6_exception *rt6_ex;
1689 	int i;
1690 
1691 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1692 					lockdep_is_held(&rt6_exception_lock));
1693 
1694 	if (!bucket)
1695 		return;
1696 
1697 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1698 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1699 			struct rt6_info *entry = rt6_ex->rt6i;
1700 
1701 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1702 			 * route), the metrics of its rt->from have already
1703 			 * been updated.
1704 			 */
1705 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1706 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1707 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1708 		}
1709 		bucket++;
1710 	}
1711 }
1712 
1713 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1714 
1715 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1716 					struct in6_addr *gateway)
1717 {
1718 	struct rt6_exception_bucket *bucket;
1719 	struct rt6_exception *rt6_ex;
1720 	struct hlist_node *tmp;
1721 	int i;
1722 
1723 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1724 		return;
1725 
1726 	spin_lock_bh(&rt6_exception_lock);
1727 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1728 				     lockdep_is_held(&rt6_exception_lock));
1729 
1730 	if (bucket) {
1731 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1732 			hlist_for_each_entry_safe(rt6_ex, tmp,
1733 						  &bucket->chain, hlist) {
1734 				struct rt6_info *entry = rt6_ex->rt6i;
1735 
1736 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1737 				    RTF_CACHE_GATEWAY &&
1738 				    ipv6_addr_equal(gateway,
1739 						    &entry->rt6i_gateway)) {
1740 					rt6_remove_exception(bucket, rt6_ex);
1741 				}
1742 			}
1743 			bucket++;
1744 		}
1745 	}
1746 
1747 	spin_unlock_bh(&rt6_exception_lock);
1748 }
1749 
1750 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1751 				      struct rt6_exception *rt6_ex,
1752 				      struct fib6_gc_args *gc_args,
1753 				      unsigned long now)
1754 {
1755 	struct rt6_info *rt = rt6_ex->rt6i;
1756 
1757 	/* We are pruning and obsoleting aged-out and non-gateway exceptions
1758 	 * even if others still hold references to them, so that on the next
1759 	 * dst_check() such references can be dropped.
1760 	 * EXPIRES exceptions, e.g. pmtu-generated ones, are pruned when
1761 	 * expired, independently of their aging, as per RFC 8201 section 4.
1762 	 */
1763 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1764 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1765 			RT6_TRACE("aging clone %p\n", rt);
1766 			rt6_remove_exception(bucket, rt6_ex);
1767 			return;
1768 		}
1769 	} else if (time_after(jiffies, rt->dst.expires)) {
1770 		RT6_TRACE("purging expired route %p\n", rt);
1771 		rt6_remove_exception(bucket, rt6_ex);
1772 		return;
1773 	}
1774 
1775 	if (rt->rt6i_flags & RTF_GATEWAY) {
1776 		struct neighbour *neigh;
1777 		__u8 neigh_flags = 0;
1778 
1779 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1780 		if (neigh)
1781 			neigh_flags = neigh->flags;
1782 
1783 		if (!(neigh_flags & NTF_ROUTER)) {
1784 			RT6_TRACE("purging route %p via non-router but gateway\n",
1785 				  rt);
1786 			rt6_remove_exception(bucket, rt6_ex);
1787 			return;
1788 		}
1789 	}
1790 
1791 	gc_args->more++;
1792 }
1793 
1794 void rt6_age_exceptions(struct fib6_info *rt,
1795 			struct fib6_gc_args *gc_args,
1796 			unsigned long now)
1797 {
1798 	struct rt6_exception_bucket *bucket;
1799 	struct rt6_exception *rt6_ex;
1800 	struct hlist_node *tmp;
1801 	int i;
1802 
1803 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1804 		return;
1805 
1806 	rcu_read_lock_bh();
1807 	spin_lock(&rt6_exception_lock);
1808 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1809 				    lockdep_is_held(&rt6_exception_lock));
1810 
1811 	if (bucket) {
1812 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1813 			hlist_for_each_entry_safe(rt6_ex, tmp,
1814 						  &bucket->chain, hlist) {
1815 				rt6_age_examine_exception(bucket, rt6_ex,
1816 							  gc_args, now);
1817 			}
1818 			bucket++;
1819 		}
1820 	}
1821 	spin_unlock(&rt6_exception_lock);
1822 	rcu_read_unlock_bh();
1823 }
1824 
1825 /* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
1828 {
1829 	struct fib6_node *fn, *saved_fn;
1830 	struct fib6_info *f6i;
1831 
1832 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1833 	saved_fn = fn;
1834 
1835 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1836 		oif = 0;
1837 
1838 redo_rt6_select:
1839 	f6i = rt6_select(net, fn, oif, strict);
1840 	if (f6i == net->ipv6.fib6_null_entry) {
1841 		fn = fib6_backtrack(fn, &fl6->saddr);
1842 		if (fn)
1843 			goto redo_rt6_select;
1844 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1845 			/* also consider unreachable route */
1846 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1847 			fn = saved_fn;
1848 			goto redo_rt6_select;
1849 		}
1850 	}
1851 
1852 	trace_fib6_table_lookup(net, f6i, table, fl6);
1853 
1854 	return f6i;
1855 }
1856 
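/* Core policy-routing resolver.  After fib6_table_lookup() (plus multipath
 * selection when the route has siblings) this returns, in this order: the
 * null entry on lookup failure, a matching entry from the exception table,
 * an uncached RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH lookups on
 * gatewayless routes, or a per-cpu rt6_info copy.
 */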
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
1860 {
1861 	struct fib6_info *f6i;
1862 	struct rt6_info *rt;
1863 	int strict = 0;
1864 
1865 	strict |= flags & RT6_LOOKUP_F_IFACE;
1866 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1867 	if (net->ipv6.devconf_all->forwarding == 0)
1868 		strict |= RT6_LOOKUP_F_REACHABLE;
1869 
1870 	rcu_read_lock();
1871 
1872 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1873 	if (f6i->fib6_nsiblings)
1874 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1875 
1876 	if (f6i == net->ipv6.fib6_null_entry) {
1877 		rt = net->ipv6.ip6_null_entry;
1878 		rcu_read_unlock();
1879 		dst_hold(&rt->dst);
1880 		return rt;
1881 	}
1882 
	/* Search through the exception table */
1884 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1885 	if (rt) {
1886 		if (ip6_hold_safe(net, &rt, true))
1887 			dst_use_noref(&rt->dst, jiffies);
1888 
1889 		rcu_read_unlock();
1890 		return rt;
1891 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1892 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create an RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbour look-up is
		 * different from the fl6->daddr used to look up the route
		 * here.
		 */
1898 		struct rt6_info *uncached_rt;
1899 
1900 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1901 
1902 		rcu_read_unlock();
1903 
1904 		if (uncached_rt) {
1905 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1906 			 * No need for another dst_hold()
1907 			 */
1908 			rt6_uncached_list_add(uncached_rt);
1909 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1910 		} else {
1911 			uncached_rt = net->ipv6.ip6_null_entry;
1912 			dst_hold(&uncached_rt->dst);
1913 		}
1914 
1915 		return uncached_rt;
1916 	} else {
1917 		/* Get a percpu copy */
1918 
1919 		struct rt6_info *pcpu_rt;
1920 
1921 		local_bh_disable();
1922 		pcpu_rt = rt6_get_pcpu_route(f6i);
1923 
1924 		if (!pcpu_rt)
1925 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1926 
1927 		local_bh_enable();
1928 		rcu_read_unlock();
1929 
1930 		return pcpu_rt;
1931 	}
1932 }
1933 EXPORT_SYMBOL_GPL(ip6_pol_route);
1934 
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
1940 {
1941 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1942 }
1943 
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
1949 {
1950 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1951 		flags |= RT6_LOOKUP_F_IFACE;
1952 
1953 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1954 }
1955 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1956 
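/* Extract the L3 keys used for multipath hashing.  For ICMPv6 errors the
 * addresses of the embedded (offending) packet are used instead of the
 * outer header, so errors can follow the same path as the flow that
 * triggered them; pre-dissected flow keys are only trusted when the outer
 * header is not an ICMPv6 error.
 */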
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
1960 {
1961 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1962 	const struct ipv6hdr *key_iph = outer_iph;
1963 	struct flow_keys *_flkeys = flkeys;
1964 	const struct ipv6hdr *inner_iph;
1965 	const struct icmp6hdr *icmph;
1966 	struct ipv6hdr _inner_iph;
1967 	struct icmp6hdr _icmph;
1968 
1969 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1970 		goto out;
1971 
1972 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1973 				   sizeof(_icmph), &_icmph);
1974 	if (!icmph)
1975 		goto out;
1976 
1977 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1978 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1979 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1980 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1981 		goto out;
1982 
1983 	inner_iph = skb_header_pointer(skb,
1984 				       skb_transport_offset(skb) + sizeof(*icmph),
1985 				       sizeof(_inner_iph), &_inner_iph);
1986 	if (!inner_iph)
1987 		goto out;
1988 
1989 	key_iph = inner_iph;
1990 	_flkeys = NULL;
1991 out:
1992 	if (_flkeys) {
1993 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1994 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1995 		keys->tags.flow_label = _flkeys->tags.flow_label;
1996 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1997 	} else {
1998 		keys->addrs.v6addrs.src = key_iph->saddr;
1999 		keys->addrs.v6addrs.dst = key_iph->daddr;
2000 		keys->tags.flow_label = ip6_flowlabel(key_iph);
2001 		keys->basic.ip_proto = key_iph->nexthdr;
2002 	}
2003 }
2004 
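/* Compute the multipath hash for a flow, per the
 * net.ipv6.fib_multipath_hash_policy setting: policy 0 hashes the L3
 * triple (addresses, flow label, protocol); policy 1 hashes the 5-tuple
 * from dissected flow keys, reusing the skb's own L4 hash when one is
 * already present.  The result is always shifted right by one bit,
 * keeping it within 31 bits.
 */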
2005 /* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
2008 {
2009 	struct flow_keys hash_keys;
2010 	u32 mhash;
2011 
2012 	switch (ip6_multipath_hash_policy(net)) {
2013 	case 0:
2014 		memset(&hash_keys, 0, sizeof(hash_keys));
2015 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2016 		if (skb) {
2017 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2018 		} else {
2019 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2020 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2022 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2023 		}
2024 		break;
2025 	case 1:
2026 		if (skb) {
2027 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2028 			struct flow_keys keys;
2029 
2030 			/* short-circuit if we already have L4 hash present */
2031 			if (skb->l4_hash)
2032 				return skb_get_hash_raw(skb) >> 1;
2033 
2034 			memset(&hash_keys, 0, sizeof(hash_keys));
2035 
			if (!flkeys) {
2037 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2038 				flkeys = &keys;
2039 			}
2040 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2041 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2042 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2043 			hash_keys.ports.src = flkeys->ports.src;
2044 			hash_keys.ports.dst = flkeys->ports.dst;
2045 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2046 		} else {
2047 			memset(&hash_keys, 0, sizeof(hash_keys));
2048 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2049 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2050 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2051 			hash_keys.ports.src = fl6->fl6_sport;
2052 			hash_keys.ports.dst = fl6->fl6_dport;
2053 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2054 		}
2055 		break;
2056 	}
2057 	mhash = flow_hash_from_keys(&hash_keys);
2058 
2059 	return mhash >> 1;
2060 }
2061 
void ip6_route_input(struct sk_buff *skb)
2063 {
2064 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2065 	struct net *net = dev_net(skb->dev);
2066 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2067 	struct ip_tunnel_info *tun_info;
2068 	struct flowi6 fl6 = {
2069 		.flowi6_iif = skb->dev->ifindex,
2070 		.daddr = iph->daddr,
2071 		.saddr = iph->saddr,
2072 		.flowlabel = ip6_flowinfo(iph),
2073 		.flowi6_mark = skb->mark,
2074 		.flowi6_proto = iph->nexthdr,
2075 	};
2076 	struct flow_keys *flkeys = NULL, _flkeys;
2077 
2078 	tun_info = skb_tunnel_info(skb);
2079 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2080 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2081 
2082 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2083 		flkeys = &_flkeys;
2084 
2085 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2086 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2087 	skb_dst_drop(skb);
2088 	skb_dst_set(skb,
2089 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2090 }
2091 
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
2097 {
2098 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2099 }
2100 
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
2103 {
2104 	bool any_src;
2105 
2106 	if (rt6_need_strict(&fl6->daddr)) {
2107 		struct dst_entry *dst;
2108 
2109 		dst = l3mdev_link_scope_lookup(net, fl6);
2110 		if (dst)
2111 			return dst;
2112 	}
2113 
2114 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2115 
2116 	any_src = ipv6_addr_any(&fl6->saddr);
2117 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2118 	    (fl6->flowi6_oif && any_src))
2119 		flags |= RT6_LOOKUP_F_IFACE;
2120 
2121 	if (!any_src)
2122 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2123 	else if (sk)
2124 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2125 
2126 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2127 }
2128 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2129 
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2131 {
2132 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2133 	struct net_device *loopback_dev = net->loopback_dev;
2134 	struct dst_entry *new = NULL;
2135 
2136 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2137 		       DST_OBSOLETE_DEAD, 0);
2138 	if (rt) {
2139 		rt6_info_init(rt);
2140 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2141 
2142 		new = &rt->dst;
2143 		new->__use = 1;
2144 		new->input = dst_discard;
2145 		new->output = dst_discard_out;
2146 
2147 		dst_copy_metrics(new, &ort->dst);
2148 
2149 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2150 		rt->rt6i_gateway = ort->rt6i_gateway;
2151 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2152 
2153 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2154 #ifdef CONFIG_IPV6_SUBTREES
2155 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2156 #endif
2157 	}
2158 
2159 	dst_release(dst_orig);
2160 	return new ? new : ERR_PTR(-ENOMEM);
2161 }
2162 
2163 /*
2164  *	Destination cache support functions
2165  */
2166 
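/* A dst cookie encodes the sernum of the fib6 node a route was learned
 * from; fib6_check() fails once the tree has been modified (sernum
 * bumped) or the fib6_info has expired, which forces callers to redo the
 * route lookup.
 */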
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2168 {
2169 	u32 rt_cookie = 0;
2170 
2171 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2172 		return false;
2173 
2174 	if (fib6_check_expired(f6i))
2175 		return false;
2176 
2177 	return true;
2178 }
2179 
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
2183 {
2184 	u32 rt_cookie = 0;
2185 
2186 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2187 	    rt_cookie != cookie)
2188 		return NULL;
2189 
2190 	if (rt6_check_expired(rt))
2191 		return NULL;
2192 
2193 	return &rt->dst;
2194 }
2195 
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
2199 {
2200 	if (!__rt6_check_expired(rt) &&
2201 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2202 	    fib6_check(from, cookie))
2203 		return &rt->dst;
2204 	else
2205 		return NULL;
2206 }
2207 
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2209 {
2210 	struct dst_entry *dst_ret;
2211 	struct fib6_info *from;
2212 	struct rt6_info *rt;
2213 
2214 	rt = container_of(dst, struct rt6_info, dst);
2215 
2216 	rcu_read_lock();
2217 
	/* All IPV6 dsts are created with ->obsolete set to
	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
	 * down into this function.
	 */
2222 
2223 	from = rcu_dereference(rt->from);
2224 
2225 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2226 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2227 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2228 	else
2229 		dst_ret = rt6_check(rt, from, cookie);
2230 
2231 	rcu_read_unlock();
2232 
2233 	return dst_ret;
2234 }
2235 
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2237 {
2238 	struct rt6_info *rt = (struct rt6_info *) dst;
2239 
2240 	if (rt) {
2241 		if (rt->rt6i_flags & RTF_CACHE) {
2242 			rcu_read_lock();
2243 			if (rt6_check_expired(rt)) {
2244 				rt6_remove_exception_rt(rt);
2245 				dst = NULL;
2246 			}
2247 			rcu_read_unlock();
2248 		} else {
2249 			dst_release(dst);
2250 			dst = NULL;
2251 		}
2252 	}
2253 	return dst;
2254 }
2255 
static void ip6_link_failure(struct sk_buff *skb)
2257 {
2258 	struct rt6_info *rt;
2259 
2260 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2261 
2262 	rt = (struct rt6_info *) skb_dst(skb);
2263 	if (rt) {
2264 		rcu_read_lock();
2265 		if (rt->rt6i_flags & RTF_CACHE) {
2266 			if (dst_hold_safe(&rt->dst))
2267 				rt6_remove_exception_rt(rt);
2268 		} else {
2269 			struct fib6_info *from;
2270 			struct fib6_node *fn;
2271 
2272 			from = rcu_dereference(rt->from);
2273 			if (from) {
2274 				fn = rcu_dereference(from->fib6_node);
2275 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2276 					fn->fn_sernum = -1;
2277 			}
2278 		}
2279 		rcu_read_unlock();
2280 	}
2281 }
2282 
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2284 {
2285 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2286 		struct fib6_info *from;
2287 
2288 		rcu_read_lock();
2289 		from = rcu_dereference(rt0->from);
2290 		if (from)
2291 			rt0->dst.expires = from->expires;
2292 		rcu_read_unlock();
2293 	}
2294 
2295 	dst_set_expires(&rt0->dst, timeout);
2296 	rt0->rt6i_flags |= RTF_EXPIRES;
2297 }
2298 
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2300 {
2301 	struct net *net = dev_net(rt->dst.dev);
2302 
2303 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2304 	rt->rt6i_flags |= RTF_MODIFIED;
2305 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2306 }
2307 
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2309 {
2310 	bool from_set;
2311 
2312 	rcu_read_lock();
2313 	from_set = !!rcu_dereference(rt->from);
2314 	rcu_read_unlock();
2315 
2316 	return !(rt->rt6i_flags & RTF_CACHE) &&
2317 		(rt->rt6i_flags & RTF_PCPU || from_set);
2318 }
2319 
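/* Apply a PMTU update to a dst.  The new value is clamped to at least
 * IPV6_MIN_MTU and ignored unless it shrinks the current MTU.  Dsts that
 * may spawn a cache entry (see rt6_cache_allowed_for_pmtu()) get an
 * RTF_CACHE clone inserted into the exception table so the learned PMTU
 * outlives this dst; others are updated in place.  Either way the entry
 * expires after ip6_rt_mtu_expires, in line with the aging recommended by
 * RFC 8201.
 */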
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
2322 {
2323 	const struct in6_addr *daddr, *saddr;
2324 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2325 
2326 	if (dst_metric_locked(dst, RTAX_MTU))
2327 		return;
2328 
2329 	if (iph) {
2330 		daddr = &iph->daddr;
2331 		saddr = &iph->saddr;
2332 	} else if (sk) {
2333 		daddr = &sk->sk_v6_daddr;
2334 		saddr = &inet6_sk(sk)->saddr;
2335 	} else {
2336 		daddr = NULL;
2337 		saddr = NULL;
2338 	}
2339 	dst_confirm_neigh(dst, daddr);
2340 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2341 	if (mtu >= dst_mtu(dst))
2342 		return;
2343 
2344 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2345 		rt6_do_update_pmtu(rt6, mtu);
2346 		/* update rt6_ex->stamp for cache */
2347 		if (rt6->rt6i_flags & RTF_CACHE)
2348 			rt6_update_exception_stamp_rt(rt6);
2349 	} else if (daddr) {
2350 		struct fib6_info *from;
2351 		struct rt6_info *nrt6;
2352 
2353 		rcu_read_lock();
2354 		from = rcu_dereference(rt6->from);
2355 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2356 		if (nrt6) {
2357 			rt6_do_update_pmtu(nrt6, mtu);
2358 			if (rt6_insert_exception(nrt6, from))
2359 				dst_release_immediate(&nrt6->dst);
2360 		}
2361 		rcu_read_unlock();
2362 	}
2363 }
2364 
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
2367 {
2368 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2369 }
2370 
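/* Update the PMTU towards the destination of the IPv6 header that sits at
 * skb->data.  A minimal usage sketch (hypothetical caller; new_mtu is
 * assumed to come from an ICMPV6_PKT_TOOBIG message, hence the network
 * byte order):
 *
 *	ip6_update_pmtu(skb, dev_net(skb->dev), htonl(new_mtu),
 *			0, skb->mark, sock_net_uid(dev_net(skb->dev), NULL));
 *
 * The route is looked up from the header addresses, updated and released.
 */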
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
2373 {
2374 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2375 	struct dst_entry *dst;
2376 	struct flowi6 fl6;
2377 
2378 	memset(&fl6, 0, sizeof(fl6));
2379 	fl6.flowi6_oif = oif;
2380 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2381 	fl6.daddr = iph->daddr;
2382 	fl6.saddr = iph->saddr;
2383 	fl6.flowlabel = ip6_flowinfo(iph);
2384 	fl6.flowi6_uid = uid;
2385 
2386 	dst = ip6_route_output(net, NULL, &fl6);
2387 	if (!dst->error)
2388 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2389 	dst_release(dst);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2392 
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2394 {
2395 	struct dst_entry *dst;
2396 
2397 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2398 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2399 
2400 	dst = __sk_dst_get(sk);
2401 	if (!dst || !dst->obsolete ||
2402 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2403 		return;
2404 
2405 	bh_lock_sock(sk);
2406 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2407 		ip6_datagram_dst_update(sk, false);
2408 	bh_unlock_sock(sk);
2409 }
2410 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2411 
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
2414 {
2415 #ifdef CONFIG_IPV6_SUBTREES
2416 	struct ipv6_pinfo *np = inet6_sk(sk);
2417 #endif
2418 
2419 	ip6_dst_store(sk, dst,
2420 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2421 		      &sk->sk_v6_daddr : NULL,
2422 #ifdef CONFIG_IPV6_SUBTREES
2423 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2424 		      &np->saddr :
2425 #endif
2426 		      NULL);
2427 }
2428 
2429 /* Handle redirects */
2430 struct ip6rd_flowi {
2431 	struct flowi6 fl6;
2432 	struct in6_addr gateway;
2433 };
2434 
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
2440 {
2441 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2442 	struct rt6_info *ret = NULL, *rt_cache;
2443 	struct fib6_info *rt;
2444 	struct fib6_node *fn;
2445 
	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */
2455 
2456 	rcu_read_lock();
2457 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2458 restart:
2459 	for_each_fib6_node_rt_rcu(fn) {
2460 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2461 			continue;
2462 		if (fib6_check_expired(rt))
2463 			continue;
2464 		if (rt->fib6_flags & RTF_REJECT)
2465 			break;
2466 		if (!(rt->fib6_flags & RTF_GATEWAY))
2467 			continue;
2468 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2469 			continue;
2470 		/* rt_cache's gateway might be different from its 'parent'
2471 		 * in the case of an ip redirect.
2472 		 * So we keep searching in the exception table if the gateway
2473 		 * is different.
2474 		 */
2475 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2476 			rt_cache = rt6_find_cached_rt(rt,
2477 						      &fl6->daddr,
2478 						      &fl6->saddr);
2479 			if (rt_cache &&
2480 			    ipv6_addr_equal(&rdfl->gateway,
2481 					    &rt_cache->rt6i_gateway)) {
2482 				ret = rt_cache;
2483 				break;
2484 			}
2485 			continue;
2486 		}
2487 		break;
2488 	}
2489 
2490 	if (!rt)
2491 		rt = net->ipv6.fib6_null_entry;
2492 	else if (rt->fib6_flags & RTF_REJECT) {
2493 		ret = net->ipv6.ip6_null_entry;
2494 		goto out;
2495 	}
2496 
2497 	if (rt == net->ipv6.fib6_null_entry) {
2498 		fn = fib6_backtrack(fn, &fl6->saddr);
2499 		if (fn)
2500 			goto restart;
2501 	}
2502 
2503 out:
2504 	if (ret)
2505 		ip6_hold_safe(net, &ret, true);
2506 	else
2507 		ret = ip6_create_rt_rcu(rt);
2508 
2509 	rcu_read_unlock();
2510 
2511 	trace_fib6_table_lookup(net, rt, table, fl6);
2512 	return ret;
}
2514 
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
2519 {
2520 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2521 	struct ip6rd_flowi rdfl;
2522 
2523 	rdfl.fl6 = *fl6;
2524 	rdfl.gateway = *gateway;
2525 
2526 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2527 				flags, __ip6_route_redirect);
2528 }
2529 
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
2532 {
2533 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2534 	struct dst_entry *dst;
2535 	struct flowi6 fl6;
2536 
2537 	memset(&fl6, 0, sizeof(fl6));
2538 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2539 	fl6.flowi6_oif = oif;
2540 	fl6.flowi6_mark = mark;
2541 	fl6.daddr = iph->daddr;
2542 	fl6.saddr = iph->saddr;
2543 	fl6.flowlabel = ip6_flowinfo(iph);
2544 	fl6.flowi6_uid = uid;
2545 
2546 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2547 	rt6_do_redirect(dst, NULL, skb);
2548 	dst_release(dst);
2549 }
2550 EXPORT_SYMBOL_GPL(ip6_redirect);
2551 
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
2554 {
2555 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2556 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2557 	struct dst_entry *dst;
2558 	struct flowi6 fl6;
2559 
2560 	memset(&fl6, 0, sizeof(fl6));
2561 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2562 	fl6.flowi6_oif = oif;
2563 	fl6.flowi6_mark = mark;
2564 	fl6.daddr = msg->dest;
2565 	fl6.saddr = iph->daddr;
2566 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2567 
2568 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2569 	rt6_do_redirect(dst, NULL, skb);
2570 	dst_release(dst);
2571 }
2572 
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2574 {
2575 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2576 		     sk->sk_uid);
2577 }
2578 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2579 
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2581 {
2582 	struct net_device *dev = dst->dev;
2583 	unsigned int mtu = dst_mtu(dst);
2584 	struct net *net = dev_net(dev);
2585 
2586 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2587 
2588 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2589 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2590 
2591 	/*
2592 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2593 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2594 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2595 	 * rely only on pmtu discovery"
2596 	 */
2597 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2598 		mtu = IPV6_MAXPLEN;
2599 	return mtu;
2600 }
2601 
static unsigned int ip6_mtu(const struct dst_entry *dst)
2603 {
2604 	struct inet6_dev *idev;
2605 	unsigned int mtu;
2606 
2607 	mtu = dst_metric_raw(dst, RTAX_MTU);
2608 	if (mtu)
2609 		goto out;
2610 
2611 	mtu = IPV6_MIN_MTU;
2612 
2613 	rcu_read_lock();
2614 	idev = __in6_dev_get(dst->dev);
2615 	if (idev)
2616 		mtu = idev->cnf.mtu6;
2617 	rcu_read_unlock();
2618 
2619 out:
2620 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2621 
2622 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2623 }
2624 
2625 /* MTU selection:
2626  * 1. mtu on route is locked - use it
2627  * 2. mtu from nexthop exception
2628  * 3. mtu from egress device
2629  *
2630  * based on ip6_dst_mtu_forward and exception logic of
2631  * rt6_find_cached_rt; called with rcu_read_lock
2632  */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
2635 {
2636 	struct rt6_exception_bucket *bucket;
2637 	struct rt6_exception *rt6_ex;
2638 	struct in6_addr *src_key;
2639 	struct inet6_dev *idev;
2640 	u32 mtu = 0;
2641 
2642 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2643 		mtu = f6i->fib6_pmtu;
2644 		if (mtu)
2645 			goto out;
2646 	}
2647 
2648 	src_key = NULL;
2649 #ifdef CONFIG_IPV6_SUBTREES
2650 	if (f6i->fib6_src.plen)
2651 		src_key = saddr;
2652 #endif
2653 
2654 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2655 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2656 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2657 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2658 
2659 	if (likely(!mtu)) {
2660 		struct net_device *dev = fib6_info_nh_dev(f6i);
2661 
2662 		mtu = IPV6_MIN_MTU;
2663 		idev = __in6_dev_get(dev);
2664 		if (idev && idev->cnf.mtu6 > mtu)
2665 			mtu = idev->cnf.mtu6;
2666 	}
2667 
2668 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2669 out:
2670 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2671 }
2672 
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
2675 {
2676 	struct dst_entry *dst;
2677 	struct rt6_info *rt;
2678 	struct inet6_dev *idev = in6_dev_get(dev);
2679 	struct net *net = dev_net(dev);
2680 
2681 	if (unlikely(!idev))
2682 		return ERR_PTR(-ENODEV);
2683 
2684 	rt = ip6_dst_alloc(net, dev, 0);
2685 	if (unlikely(!rt)) {
2686 		in6_dev_put(idev);
2687 		dst = ERR_PTR(-ENOMEM);
2688 		goto out;
2689 	}
2690 
2691 	rt->dst.flags |= DST_HOST;
2692 	rt->dst.input = ip6_input;
2693 	rt->dst.output  = ip6_output;
2694 	rt->rt6i_gateway  = fl6->daddr;
2695 	rt->rt6i_dst.addr = fl6->daddr;
2696 	rt->rt6i_dst.plen = 128;
2697 	rt->rt6i_idev     = idev;
2698 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2699 
2700 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2701 	 * do proper release of the net_device
2702 	 */
2703 	rt6_uncached_list_add(rt);
2704 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2705 
2706 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2707 
2708 out:
2709 	return dst;
2710 }
2711 
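/* dst_ops garbage collector.  A pass only runs when the minimum interval
 * has elapsed or the entry count exceeds ip6_rt_max_size; each consecutive
 * pass calls fib6_run_gc() with a growing ip6_rt_gc_expire argument, which
 * is reset once the entry count drops below gc_thresh and decays on every
 * call according to ip6_rt_gc_elasticity.
 */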
static int ip6_dst_gc(struct dst_ops *ops)
2713 {
2714 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2715 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2716 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2717 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2718 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2719 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2720 	int entries;
2721 
2722 	entries = dst_entries_get_fast(ops);
2723 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2724 	    entries <= rt_max_size)
2725 		goto out;
2726 
2727 	net->ipv6.ip6_rt_gc_expire++;
2728 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2729 	entries = dst_entries_get_slow(ops);
2730 	if (entries < ops->gc_thresh)
2731 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2732 out:
2733 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2734 	return entries > rt_max_size;
2735 }
2736 
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
2739 {
2740 	struct dst_metrics *p;
2741 
2742 	if (!cfg->fc_mx)
2743 		return 0;
2744 
2745 	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2746 	if (unlikely(!p))
2747 		return -ENOMEM;
2748 
2749 	refcount_set(&p->refcnt, 1);
2750 	rt->fib6_metrics = p;
2751 
2752 	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2753 }
2754 
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
2759 {
2760 	struct flowi6 fl6 = {
2761 		.flowi6_oif = cfg->fc_ifindex,
2762 		.daddr = *gw_addr,
2763 		.saddr = cfg->fc_prefsrc,
2764 	};
2765 	struct fib6_table *table;
2766 	struct rt6_info *rt;
2767 
2768 	table = fib6_get_table(net, tbid);
2769 	if (!table)
2770 		return NULL;
2771 
2772 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2773 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2774 
2775 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2776 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2777 
2778 	/* if table lookup failed, fall back to full lookup */
2779 	if (rt == net->ipv6.ip6_null_entry) {
2780 		ip6_rt_put(rt);
2781 		rt = NULL;
2782 	}
2783 
2784 	return rt;
2785 }
2786 
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
2791 {
2792 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2793 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2794 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2795 	struct rt6_info *grt;
2796 	int err;
2797 
2798 	err = 0;
2799 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2800 	if (grt) {
2801 		if (!grt->dst.error &&
2802 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2803 			NL_SET_ERR_MSG(extack,
2804 				       "Nexthop has invalid gateway or device mismatch");
2805 			err = -EINVAL;
2806 		}
2807 
2808 		ip6_rt_put(grt);
2809 	}
2810 
2811 	return err;
2812 }
2813 
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
2818 {
2819 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820 	struct net_device *dev = _dev ? *_dev : NULL;
2821 	struct rt6_info *grt = NULL;
2822 	int err = -EHOSTUNREACH;
2823 
2824 	if (cfg->fc_table) {
2825 		int flags = RT6_LOOKUP_F_IFACE;
2826 
2827 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2828 					  cfg->fc_table, flags);
2829 		if (grt) {
2830 			if (grt->rt6i_flags & RTF_GATEWAY ||
2831 			    (dev && dev != grt->dst.dev)) {
2832 				ip6_rt_put(grt);
2833 				grt = NULL;
2834 			}
2835 		}
2836 	}
2837 
2838 	if (!grt)
2839 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2840 
2841 	if (!grt)
2842 		goto out;
2843 
2844 	if (dev) {
2845 		if (dev != grt->dst.dev) {
2846 			ip6_rt_put(grt);
2847 			goto out;
2848 		}
2849 	} else {
2850 		*_dev = dev = grt->dst.dev;
2851 		*idev = grt->rt6i_idev;
2852 		dev_hold(dev);
2853 		in6_dev_hold(grt->rt6i_idev);
2854 	}
2855 
2856 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2857 		err = 0;
2858 
2859 	ip6_rt_put(grt);
2860 
2861 out:
2862 	return err;
2863 }
2864 
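/* Validate the gateway of a new route.  A gateway must not be one of our
 * own addresses, and a non-link-local gateway must additionally be
 * resolvable, either as an onlink nexthop (RTNH_F_ONLINK) or through a
 * recursive route lookup that also pins down the egress device.  The
 * egress device must exist and must not be the loopback device.
 */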
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
2868 {
2869 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2870 	int gwa_type = ipv6_addr_type(gw_addr);
2871 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2872 	const struct net_device *dev = *_dev;
2873 	bool need_addr_check = !dev;
2874 	int err = -EINVAL;
2875 
	/* If gw_addr is local we will fail to detect this in case the
	 * address is still TENTATIVE (DAD in progress).  rt6_lookup()
	 * will return the already-added prefix route via the interface
	 * the prefix route was assigned to, which might be non-loopback.
	 */
2881 	if (dev &&
2882 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2883 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2884 		goto out;
2885 	}
2886 
2887 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using non-link-local
		 * addresses as nexthop address.
		 * Otherwise, a router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC 4798-type
		 * addressing.
		 */
2897 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2898 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2899 			goto out;
2900 		}
2901 
2902 		if (cfg->fc_flags & RTNH_F_ONLINK)
2903 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2904 		else
2905 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2906 
2907 		if (err)
2908 			goto out;
2909 	}
2910 
2911 	/* reload in case device was changed */
2912 	dev = *_dev;
2913 
2914 	err = -EINVAL;
2915 	if (!dev) {
2916 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2917 		goto out;
2918 	} else if (dev->flags & IFF_LOOPBACK) {
2919 		NL_SET_ERR_MSG(extack,
2920 			       "Egress device can not be loopback device for this route");
2921 		goto out;
2922 	}
2923 
2924 	/* if we did not check gw_addr above, do so now that the
2925 	 * egress device has been resolved.
2926 	 */
2927 	if (need_addr_check &&
2928 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2929 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2930 		goto out;
2931 	}
2932 
2933 	err = 0;
2934 out:
2935 	return err;
2936 }
2937 
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
2941 {
2942 	struct net *net = cfg->fc_nlinfo.nl_net;
2943 	struct fib6_info *rt = NULL;
2944 	struct net_device *dev = NULL;
2945 	struct inet6_dev *idev = NULL;
2946 	struct fib6_table *table;
2947 	int addr_type;
2948 	int err = -EINVAL;
2949 
2950 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2951 	if (cfg->fc_flags & RTF_PCPU) {
2952 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2953 		goto out;
2954 	}
2955 
2956 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2957 	if (cfg->fc_flags & RTF_CACHE) {
2958 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2959 		goto out;
2960 	}
2961 
2962 	if (cfg->fc_type > RTN_MAX) {
2963 		NL_SET_ERR_MSG(extack, "Invalid route type");
2964 		goto out;
2965 	}
2966 
2967 	if (cfg->fc_dst_len > 128) {
2968 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2969 		goto out;
2970 	}
2971 	if (cfg->fc_src_len > 128) {
2972 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2973 		goto out;
2974 	}
2975 #ifndef CONFIG_IPV6_SUBTREES
2976 	if (cfg->fc_src_len) {
2977 		NL_SET_ERR_MSG(extack,
2978 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2979 		goto out;
2980 	}
2981 #endif
2982 	if (cfg->fc_ifindex) {
2983 		err = -ENODEV;
2984 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2985 		if (!dev)
2986 			goto out;
2987 		idev = in6_dev_get(dev);
2988 		if (!idev)
2989 			goto out;
2990 	}
2991 
2992 	if (cfg->fc_metric == 0)
2993 		cfg->fc_metric = IP6_RT_PRIO_USER;
2994 
2995 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2996 		if (!dev) {
2997 			NL_SET_ERR_MSG(extack,
2998 				       "Nexthop device required for onlink");
2999 			err = -ENODEV;
3000 			goto out;
3001 		}
3002 
3003 		if (!(dev->flags & IFF_UP)) {
3004 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3005 			err = -ENETDOWN;
3006 			goto out;
3007 		}
3008 	}
3009 
3010 	err = -ENOBUFS;
3011 	if (cfg->fc_nlinfo.nlh &&
3012 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3013 		table = fib6_get_table(net, cfg->fc_table);
3014 		if (!table) {
3015 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3016 			table = fib6_new_table(net, cfg->fc_table);
3017 		}
3018 	} else {
3019 		table = fib6_new_table(net, cfg->fc_table);
3020 	}
3021 
3022 	if (!table)
3023 		goto out;
3024 
3025 	err = -ENOMEM;
3026 	rt = fib6_info_alloc(gfp_flags);
3027 	if (!rt)
3028 		goto out;
3029 
3030 	if (cfg->fc_flags & RTF_ADDRCONF)
3031 		rt->dst_nocount = true;
3032 
3033 	err = ip6_convert_metrics(net, rt, cfg);
3034 	if (err < 0)
3035 		goto out;
3036 
3037 	if (cfg->fc_flags & RTF_EXPIRES)
3038 		fib6_set_expires(rt, jiffies +
3039 				clock_t_to_jiffies(cfg->fc_expires));
3040 	else
3041 		fib6_clean_expires(rt);
3042 
3043 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3044 		cfg->fc_protocol = RTPROT_BOOT;
3045 	rt->fib6_protocol = cfg->fc_protocol;
3046 
3047 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3048 
3049 	if (cfg->fc_encap) {
3050 		struct lwtunnel_state *lwtstate;
3051 
3052 		err = lwtunnel_build_state(cfg->fc_encap_type,
3053 					   cfg->fc_encap, AF_INET6, cfg,
3054 					   &lwtstate, extack);
3055 		if (err)
3056 			goto out;
3057 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3058 	}
3059 
3060 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3061 	rt->fib6_dst.plen = cfg->fc_dst_len;
3062 	if (rt->fib6_dst.plen == 128)
3063 		rt->dst_host = true;
3064 
3065 #ifdef CONFIG_IPV6_SUBTREES
3066 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3067 	rt->fib6_src.plen = cfg->fc_src_len;
3068 #endif
3069 
3070 	rt->fib6_metric = cfg->fc_metric;
3071 	rt->fib6_nh.nh_weight = 1;
3072 
3073 	rt->fib6_type = cfg->fc_type;
3074 
	/* We cannot add true routes via loopback here, they would
	 * result in kernel looping; promote them to reject routes.
	 */
3078 	if ((cfg->fc_flags & RTF_REJECT) ||
3079 	    (dev && (dev->flags & IFF_LOOPBACK) &&
3080 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3081 	     !(cfg->fc_flags & RTF_LOCAL))) {
3082 		/* hold loopback dev/idev if we haven't done so. */
3083 		if (dev != net->loopback_dev) {
3084 			if (dev) {
3085 				dev_put(dev);
3086 				in6_dev_put(idev);
3087 			}
3088 			dev = net->loopback_dev;
3089 			dev_hold(dev);
3090 			idev = in6_dev_get(dev);
3091 			if (!idev) {
3092 				err = -ENODEV;
3093 				goto out;
3094 			}
3095 		}
3096 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3097 		goto install_route;
3098 	}
3099 
3100 	if (cfg->fc_flags & RTF_GATEWAY) {
3101 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3102 		if (err)
3103 			goto out;
3104 
3105 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
3106 	}
3107 
3108 	err = -ENODEV;
3109 	if (!dev)
3110 		goto out;
3111 
3112 	if (idev->cnf.disable_ipv6) {
3113 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3114 		err = -EACCES;
3115 		goto out;
3116 	}
3117 
3118 	if (!(dev->flags & IFF_UP)) {
3119 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3120 		err = -ENETDOWN;
3121 		goto out;
3122 	}
3123 
3124 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3125 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3126 			NL_SET_ERR_MSG(extack, "Invalid source address");
3127 			err = -EINVAL;
3128 			goto out;
3129 		}
3130 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3131 		rt->fib6_prefsrc.plen = 128;
3132 	} else
3133 		rt->fib6_prefsrc.plen = 0;
3134 
3135 	rt->fib6_flags = cfg->fc_flags;
3136 
3137 install_route:
3138 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3139 	    !netif_carrier_ok(dev))
3140 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3141 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3142 	rt->fib6_nh.nh_dev = dev;
3143 	rt->fib6_table = table;
3144 
3145 	cfg->fc_nlinfo.nl_net = dev_net(dev);
3146 
3147 	if (idev)
3148 		in6_dev_put(idev);
3149 
3150 	return rt;
3151 out:
3152 	if (dev)
3153 		dev_put(dev);
3154 	if (idev)
3155 		in6_dev_put(idev);
3156 
3157 	fib6_info_release(rt);
3158 	return ERR_PTR(err);
3159 }
3160 
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
3163 {
3164 	struct fib6_info *rt;
3165 	int err;
3166 
3167 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3168 	if (IS_ERR(rt))
3169 		return PTR_ERR(rt);
3170 
3171 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3172 	fib6_info_release(rt);
3173 
3174 	return err;
3175 }
3176 
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3178 {
3179 	struct net *net = info->nl_net;
3180 	struct fib6_table *table;
3181 	int err;
3182 
3183 	if (rt == net->ipv6.fib6_null_entry) {
3184 		err = -ENOENT;
3185 		goto out;
3186 	}
3187 
3188 	table = rt->fib6_table;
3189 	spin_lock_bh(&table->tb6_lock);
3190 	err = fib6_del(rt, info);
3191 	spin_unlock_bh(&table->tb6_lock);
3192 
3193 out:
3194 	fib6_info_release(rt);
3195 	return err;
3196 }
3197 
int ip6_del_rt(struct net *net, struct fib6_info *rt)
3199 {
3200 	struct nl_info info = { .nl_net = net };
3201 
3202 	return __ip6_del_rt(rt, &info);
3203 }
3204 
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3206 {
3207 	struct nl_info *info = &cfg->fc_nlinfo;
3208 	struct net *net = info->nl_net;
3209 	struct sk_buff *skb = NULL;
3210 	struct fib6_table *table;
3211 	int err = -ENOENT;
3212 
3213 	if (rt == net->ipv6.fib6_null_entry)
3214 		goto out_put;
3215 	table = rt->fib6_table;
3216 	spin_lock_bh(&table->tb6_lock);
3217 
3218 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3219 		struct fib6_info *sibling, *next_sibling;
3220 
3221 		/* prefer to send a single notification with all hops */
3222 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3223 		if (skb) {
3224 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3225 
3226 			if (rt6_fill_node(net, skb, rt, NULL,
3227 					  NULL, NULL, 0, RTM_DELROUTE,
3228 					  info->portid, seq, 0) < 0) {
3229 				kfree_skb(skb);
3230 				skb = NULL;
3231 			} else
3232 				info->skip_notify = 1;
3233 		}
3234 
3235 		list_for_each_entry_safe(sibling, next_sibling,
3236 					 &rt->fib6_siblings,
3237 					 fib6_siblings) {
3238 			err = fib6_del(sibling, info);
3239 			if (err)
3240 				goto out_unlock;
3241 		}
3242 	}
3243 
3244 	err = fib6_del(rt, info);
3245 out_unlock:
3246 	spin_unlock_bh(&table->tb6_lock);
3247 out_put:
3248 	fib6_info_release(rt);
3249 
3250 	if (skb) {
3251 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3252 			    info->nlh, gfp_any());
3253 	}
3254 	return err;
3255 }
3256 
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3258 {
3259 	int rc = -ESRCH;
3260 
3261 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3262 		goto out;
3263 
3264 	if (cfg->fc_flags & RTF_GATEWAY &&
3265 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3266 		goto out;
3267 	if (dst_hold_safe(&rt->dst))
3268 		rc = rt6_remove_exception_rt(rt);
3269 out:
3270 	return rc;
3271 }
3272 
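/* Delete a route matching @cfg.  With RTF_CACHE set, only the matching
 * exception-table entry is removed.  Otherwise the first fib6_info that
 * matches device, gateway, metric and protocol is deleted - a single hop
 * when a gateway was given, or the whole sibling (multipath) group when
 * not.
 */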
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
3275 {
3276 	struct rt6_info *rt_cache;
3277 	struct fib6_table *table;
3278 	struct fib6_info *rt;
3279 	struct fib6_node *fn;
3280 	int err = -ESRCH;
3281 
3282 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3283 	if (!table) {
3284 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3285 		return err;
3286 	}
3287 
3288 	rcu_read_lock();
3289 
3290 	fn = fib6_locate(&table->tb6_root,
3291 			 &cfg->fc_dst, cfg->fc_dst_len,
3292 			 &cfg->fc_src, cfg->fc_src_len,
3293 			 !(cfg->fc_flags & RTF_CACHE));
3294 
3295 	if (fn) {
3296 		for_each_fib6_node_rt_rcu(fn) {
3297 			if (cfg->fc_flags & RTF_CACHE) {
3298 				int rc;
3299 
3300 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3301 							      &cfg->fc_src);
3302 				if (rt_cache) {
3303 					rc = ip6_del_cached_rt(rt_cache, cfg);
3304 					if (rc != -ESRCH) {
3305 						rcu_read_unlock();
3306 						return rc;
3307 					}
3308 				}
3309 				continue;
3310 			}
3311 			if (cfg->fc_ifindex &&
3312 			    (!rt->fib6_nh.nh_dev ||
3313 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3314 				continue;
3315 			if (cfg->fc_flags & RTF_GATEWAY &&
3316 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3317 				continue;
3318 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3319 				continue;
3320 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3321 				continue;
3322 			if (!fib6_info_hold_safe(rt))
3323 				continue;
3324 			rcu_read_unlock();
3325 
			/* if a gateway was specified, only delete the one hop */
3327 			if (cfg->fc_flags & RTF_GATEWAY)
3328 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3329 
3330 			return __ip6_del_rt_siblings(rt, cfg);
3331 		}
3332 	}
3333 	rcu_read_unlock();
3334 
3335 	return err;
3336 }
3337 
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3339 {
3340 	struct netevent_redirect netevent;
3341 	struct rt6_info *rt, *nrt = NULL;
3342 	struct ndisc_options ndopts;
3343 	struct inet6_dev *in6_dev;
3344 	struct neighbour *neigh;
3345 	struct fib6_info *from;
3346 	struct rd_msg *msg;
3347 	int optlen, on_link;
3348 	u8 *lladdr;
3349 
3350 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3351 	optlen -= sizeof(*msg);
3352 
3353 	if (optlen < 0) {
3354 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3355 		return;
3356 	}
3357 
3358 	msg = (struct rd_msg *)icmp6_hdr(skb);
3359 
3360 	if (ipv6_addr_is_multicast(&msg->dest)) {
3361 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3362 		return;
3363 	}
3364 
3365 	on_link = 0;
3366 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3367 		on_link = 1;
3368 	} else if (ipv6_addr_type(&msg->target) !=
3369 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3370 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3371 		return;
3372 	}
3373 
3374 	in6_dev = __in6_dev_get(skb->dev);
3375 	if (!in6_dev)
3376 		return;
3377 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3378 		return;
3379 
3380 	/* RFC2461 8.1:
3381 	 *	The IP source address of the Redirect MUST be the same as the current
3382 	 *	first-hop router for the specified ICMP Destination Address.
3383 	 */
3384 
3385 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3386 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3387 		return;
3388 	}
3389 
3390 	lladdr = NULL;
3391 	if (ndopts.nd_opts_tgt_lladdr) {
3392 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3393 					     skb->dev);
3394 		if (!lladdr) {
3395 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3396 			return;
3397 		}
3398 	}
3399 
3400 	rt = (struct rt6_info *) dst;
3401 	if (rt->rt6i_flags & RTF_REJECT) {
3402 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3403 		return;
3404 	}
3405 
3406 	/* Redirect received -> path was valid.
3407 	 * Look, redirects are sent only in response to data packets,
3408 	 * so that this nexthop apparently is reachable. --ANK
3409 	 */
3410 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3411 
3412 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3413 	if (!neigh)
3414 		return;
3415 
3416 	/*
3417 	 *	We have finally decided to accept it.
3418 	 */
3419 
3420 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3421 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3422 		     NEIGH_UPDATE_F_OVERRIDE|
3423 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3424 				     NEIGH_UPDATE_F_ISROUTER)),
3425 		     NDISC_REDIRECT, &ndopts);
3426 
3427 	rcu_read_lock();
3428 	from = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold a reference
	 * to rt, and rt already holds a reference to the fib6_info.
	 */
3432 	fib6_info_hold(from);
3433 	rcu_read_unlock();
3434 
3435 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3436 	if (!nrt)
3437 		goto out;
3438 
3439 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3440 	if (on_link)
3441 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3442 
3443 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3444 
	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it.
	 */
3449 	if (rt6_insert_exception(nrt, from)) {
3450 		dst_release_immediate(&nrt->dst);
3451 		goto out;
3452 	}
3453 
3454 	netevent.old = &rt->dst;
3455 	netevent.new = &nrt->dst;
3456 	netevent.daddr = &msg->dest;
3457 	netevent.neigh = neigh;
3458 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3459 
3460 out:
3461 	fib6_info_release(from);
3462 	neigh_release(neigh);
3463 }
3464 
3465 #ifdef CONFIG_IPV6_ROUTE_INFO
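/* Support for RFC 4191 Route Information options received in Router
 * Advertisements: such routes are installed with RTF_ROUTEINFO|RTF_GATEWAY
 * in the l3mdev table (or RT6_TABLE_INFO) and keyed by prefix, gateway and
 * incoming device.
 */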
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev)
3470 {
3471 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3472 	int ifindex = dev->ifindex;
3473 	struct fib6_node *fn;
3474 	struct fib6_info *rt = NULL;
3475 	struct fib6_table *table;
3476 
3477 	table = fib6_get_table(net, tb_id);
3478 	if (!table)
3479 		return NULL;
3480 
3481 	rcu_read_lock();
3482 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3483 	if (!fn)
3484 		goto out;
3485 
3486 	for_each_fib6_node_rt_rcu(fn) {
3487 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3488 			continue;
3489 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3490 			continue;
3491 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3492 			continue;
3493 		if (!fib6_info_hold_safe(rt))
3494 			continue;
3495 		break;
3496 	}
3497 out:
3498 	rcu_read_unlock();
3499 	return rt;
3500 }
3501 
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref)
3507 {
3508 	struct fib6_config cfg = {
3509 		.fc_metric	= IP6_RT_PRIO_USER,
3510 		.fc_ifindex	= dev->ifindex,
3511 		.fc_dst_len	= prefixlen,
3512 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3513 				  RTF_UP | RTF_PREF(pref),
3514 		.fc_protocol = RTPROT_RA,
3515 		.fc_type = RTN_UNICAST,
3516 		.fc_nlinfo.portid = 0,
3517 		.fc_nlinfo.nlh = NULL,
3518 		.fc_nlinfo.nl_net = net,
3519 	};
3520 
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3522 	cfg.fc_dst = *prefix;
3523 	cfg.fc_gateway = *gwaddr;
3524 
3525 	/* We should treat it as a default route if prefix length is 0. */
3526 	if (!prefixlen)
3527 		cfg.fc_flags |= RTF_DEFAULT;
3528 
3529 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3530 
3531 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3532 }
3533 #endif
3534 
struct fib6_info *rt6_get_dflt_router(struct net *net,
				      const struct in6_addr *addr,
				      struct net_device *dev)
3538 {
3539 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3540 	struct fib6_info *rt;
3541 	struct fib6_table *table;
3542 
3543 	table = fib6_get_table(net, tb_id);
3544 	if (!table)
3545 		return NULL;
3546 
3547 	rcu_read_lock();
3548 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3549 		if (dev == rt->fib6_nh.nh_dev &&
3550 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3551 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3552 			break;
3553 	}
3554 	if (rt && !fib6_info_hold_safe(rt))
3555 		rt = NULL;
3556 	rcu_read_unlock();
3557 	return rt;
3558 }
3559 
struct fib6_info *rt6_add_dflt_router(struct net *net,
				      const struct in6_addr *gwaddr,
				      struct net_device *dev,
				      unsigned int pref)
3564 {
3565 	struct fib6_config cfg = {
3566 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3567 		.fc_metric	= IP6_RT_PRIO_USER,
3568 		.fc_ifindex	= dev->ifindex,
3569 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3570 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3571 		.fc_protocol = RTPROT_RA,
3572 		.fc_type = RTN_UNICAST,
3573 		.fc_nlinfo.portid = 0,
3574 		.fc_nlinfo.nlh = NULL,
3575 		.fc_nlinfo.nl_net = net,
3576 	};
3577 
3578 	cfg.fc_gateway = *gwaddr;
3579 
3580 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3581 		struct fib6_table *table;
3582 
3583 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3584 		if (table)
3585 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3586 	}
3587 
3588 	return rt6_get_dflt_router(net, gwaddr, dev);
3589 }
3590 
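/* Routes cannot be unlinked while walking the tree under rcu_read_lock(),
 * so take a reference to a matching entry, drop the lock, delete it, and
 * restart the walk from the top until no candidate remains.
 */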
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
	cfg->fc_type = rtmsg->rtmsg_type;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}

int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */
3697 
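/* Common drop path for blackhole/prohibit dsts: bump the MIB counter that
 * matches the direction (unspecified destinations count as address errors),
 * send an ICMPv6 destination-unreachable with the given code, and free the
 * skb.
 */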
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when a gateway turns into a host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}
3900 
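/* Hash-threshold multipath selection: each live nexthop owns a slice of the
 * 31-bit flow-hash space proportional to its weight, encoded as a cumulative
 * upper bound.  For example, weights 1 and 3 (total 4) yield bounds
 * 2^29 - 1 and 2^31 - 1; dead nexthops get -1 so no hash can select them.
 */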
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update, PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4162 
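/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config.
 * Attributes are validated against rtm_ipv6_policy above; the nexthops
 * inside RTA_MULTIPATH are kept as a blob and parsed later by the
 * multipath add/del helpers, only their encap types are checked here.
 */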
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4350 
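/* Add a multipath route in two phases: first build one fib6_info per
 * rtnexthop in RTA_MULTIPATH, then insert them with per-route notifications
 * suppressed and send a single RTM_NEWROUTE for the whole set.  On a
 * partial failure the nexthops already inserted are deleted again so that
 * userspace sees a coherent add/delete sequence.
 */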
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4497 
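/* Deleting a multipath route is simpler: each rtnexthop in RTA_MULTIPATH is
 * deleted as an individual route, and the last error (if any) is returned
 * so that one bad nexthop does not stop the remaining deletions.
 */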
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
4569 
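/* Worst-case netlink message size for this route, used to size the skb in
 * inet6_rt_notify().  Undershooting here would make rt6_fill_node() fail
 * with -EMSGSIZE, hence the WARN_ON in the notifier.
 */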
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4666 
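/* Fill an RTM_NEWROUTE message.  The dst/dest/src/iif arguments are only
 * set when reporting the result of a route lookup (RTM_GETROUTE); for dumps
 * and notifications they are NULL/0 and the FIB entry fields are used
 * instead.
 */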
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL
5070 
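/* Handler for /proc/sys/net/ipv6/route/flush.  The file is write-only;
 * writing any integer triggers a garbage-collection pass over the routing
 * tables, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 */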
5071 static
ipv6_sysctl_rtcache_flush(struct ctl_table * ctl,int write,void __user * buffer,size_t * lenp,loff_t * ppos)5072 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5073 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5074 {
5075 	struct net *net;
5076 	int delay;
5077 	if (!write)
5078 		return -EINVAL;
5079 
5080 	net = (struct net *)ctl->extra1;
5081 	delay = net->ipv6.sysctl.flush_delay;
5082 	proc_dointvec(ctl, write, buffer, lenp, ppos);
5083 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5084 	return 0;
5085 }
5086 
5087 struct ctl_table ipv6_route_table_template[] = {
5088 	{
5089 		.procname	=	"flush",
5090 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5091 		.maxlen		=	sizeof(int),
5092 		.mode		=	0200,
5093 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5094 	},
5095 	{
5096 		.procname	=	"gc_thresh",
5097 		.data		=	&ip6_dst_ops_template.gc_thresh,
5098 		.maxlen		=	sizeof(int),
5099 		.mode		=	0644,
5100 		.proc_handler	=	proc_dointvec,
5101 	},
5102 	{
5103 		.procname	=	"max_size",
5104 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5105 		.maxlen		=	sizeof(int),
5106 		.mode		=	0644,
5107 		.proc_handler	=	proc_dointvec,
5108 	},
5109 	{
5110 		.procname	=	"gc_min_interval",
5111 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5112 		.maxlen		=	sizeof(int),
5113 		.mode		=	0644,
5114 		.proc_handler	=	proc_dointvec_jiffies,
5115 	},
5116 	{
5117 		.procname	=	"gc_timeout",
5118 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5119 		.maxlen		=	sizeof(int),
5120 		.mode		=	0644,
5121 		.proc_handler	=	proc_dointvec_jiffies,
5122 	},
5123 	{
5124 		.procname	=	"gc_interval",
5125 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5126 		.maxlen		=	sizeof(int),
5127 		.mode		=	0644,
5128 		.proc_handler	=	proc_dointvec_jiffies,
5129 	},
5130 	{
5131 		.procname	=	"gc_elasticity",
5132 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5133 		.maxlen		=	sizeof(int),
5134 		.mode		=	0644,
5135 		.proc_handler	=	proc_dointvec,
5136 	},
5137 	{
5138 		.procname	=	"mtu_expires",
5139 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5140 		.maxlen		=	sizeof(int),
5141 		.mode		=	0644,
5142 		.proc_handler	=	proc_dointvec_jiffies,
5143 	},
5144 	{
5145 		.procname	=	"min_adv_mss",
5146 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5147 		.maxlen		=	sizeof(int),
5148 		.mode		=	0644,
5149 		.proc_handler	=	proc_dointvec,
5150 	},
5151 	{
5152 		.procname	=	"gc_min_interval_ms",
5153 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5154 		.maxlen		=	sizeof(int),
5155 		.mode		=	0644,
5156 		.proc_handler	=	proc_dointvec_ms_jiffies,
5157 	},
5158 	{ }
5159 };
5160 
ipv6_route_sysctl_init(struct net * net)5161 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5162 {
5163 	struct ctl_table *table;
5164 
5165 	table = kmemdup(ipv6_route_table_template,
5166 			sizeof(ipv6_route_table_template),
5167 			GFP_KERNEL);
5168 
5169 	if (table) {
5170 		table[0].data = &net->ipv6.sysctl.flush_delay;
5171 		table[0].extra1 = net;
5172 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5173 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5174 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5175 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5176 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5177 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5178 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5179 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5180 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5181 
5182 		/* Don't export sysctls to unprivileged users */
5183 		if (net->user_ns != &init_user_ns)
5184 			table[0].procname = NULL;
5185 	}
5186 
5187 	return table;
5188 }
5189 #endif
5190 
ip6_route_net_init(struct net * net)5191 static int __net_init ip6_route_net_init(struct net *net)
5192 {
5193 	int ret = -ENOMEM;
5194 
5195 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5196 	       sizeof(net->ipv6.ip6_dst_ops));
5197 
5198 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5199 		goto out_ip6_dst_ops;
5200 
5201 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5202 					    sizeof(*net->ipv6.fib6_null_entry),
5203 					    GFP_KERNEL);
5204 	if (!net->ipv6.fib6_null_entry)
5205 		goto out_ip6_dst_entries;
5206 
5207 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5208 					   sizeof(*net->ipv6.ip6_null_entry),
5209 					   GFP_KERNEL);
5210 	if (!net->ipv6.ip6_null_entry)
5211 		goto out_fib6_null_entry;
5212 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5213 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5214 			 ip6_template_metrics, true);
5215 
5216 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5217 	net->ipv6.fib6_has_custom_rules = false;
5218 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5219 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5220 					       GFP_KERNEL);
5221 	if (!net->ipv6.ip6_prohibit_entry)
5222 		goto out_ip6_null_entry;
5223 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5224 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5225 			 ip6_template_metrics, true);
5226 
5227 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5228 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5229 					       GFP_KERNEL);
5230 	if (!net->ipv6.ip6_blk_hole_entry)
5231 		goto out_ip6_prohibit_entry;
5232 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5233 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5234 			 ip6_template_metrics, true);
5235 #endif
5236 
5237 	net->ipv6.sysctl.flush_delay = 0;
5238 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5239 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5240 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5241 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5242 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5243 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5244 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5245 
5246 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5247 
5248 	ret = 0;
5249 out:
5250 	return ret;
5251 
5252 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5253 out_ip6_prohibit_entry:
5254 	kfree(net->ipv6.ip6_prohibit_entry);
5255 out_ip6_null_entry:
5256 	kfree(net->ipv6.ip6_null_entry);
5257 #endif
5258 out_fib6_null_entry:
5259 	kfree(net->ipv6.fib6_null_entry);
5260 out_ip6_dst_entries:
5261 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5262 out_ip6_dst_ops:
5263 	goto out;
5264 }
5265 
ip6_route_net_exit(struct net * net)5266 static void __net_exit ip6_route_net_exit(struct net *net)
5267 {
5268 	kfree(net->ipv6.fib6_null_entry);
5269 	kfree(net->ipv6.ip6_null_entry);
5270 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5271 	kfree(net->ipv6.ip6_prohibit_entry);
5272 	kfree(net->ipv6.ip6_blk_hole_entry);
5273 #endif
5274 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5275 }
5276 
ip6_route_net_init_late(struct net * net)5277 static int __net_init ip6_route_net_init_late(struct net *net)
5278 {
5279 #ifdef CONFIG_PROC_FS
5280 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5281 			sizeof(struct ipv6_route_iter));
5282 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5283 			rt6_stats_seq_show, NULL);
5284 #endif
5285 	return 0;
5286 }
5287 
ip6_route_net_exit_late(struct net * net)5288 static void __net_exit ip6_route_net_exit_late(struct net *net)
5289 {
5290 #ifdef CONFIG_PROC_FS
5291 	remove_proc_entry("ipv6_route", net->proc_net);
5292 	remove_proc_entry("rt6_stats", net->proc_net);
5293 #endif
5294 }
5295 
5296 static struct pernet_operations ip6_route_net_ops = {
5297 	.init = ip6_route_net_init,
5298 	.exit = ip6_route_net_exit,
5299 };
5300 
ipv6_inetpeer_init(struct net * net)5301 static int __net_init ipv6_inetpeer_init(struct net *net)
5302 {
5303 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5304 
5305 	if (!bp)
5306 		return -ENOMEM;
5307 	inet_peer_base_init(bp);
5308 	net->ipv6.peers = bp;
5309 	return 0;
5310 }
5311 
ipv6_inetpeer_exit(struct net * net)5312 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5313 {
5314 	struct inet_peer_base *bp = net->ipv6.peers;
5315 
5316 	net->ipv6.peers = NULL;
5317 	inetpeer_invalidate_tree(bp);
5318 	kfree(bp);
5319 }
5320 
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

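/*
 * Runs after addrconf's netdevice notifier (higher priority values are
 * called first), so address configuration has already processed the
 * event by the time routes are updated.
 */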
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

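/*
 * Point init_net's template route entries at the loopback device.
 * Called later in IPv6 initialization, once init_net's loopback device
 * has been registered; the device does not yet exist when
 * ip6_route_net_init() runs for init_net.
 */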
void __init ip6_route_init_special_entries(void)
{
	/* Registration of the loopback device happens before this code
	 * runs, so the loopback references in the template rt6_info
	 * entries were not taken; set them up manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

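/*
 * Boot-time initialization of the IPv6 routing subsystem.  Order
 * matters: the dst slab cache and pernet state must exist before the
 * FIB, xfrm and policy-rule layers that use them, and the rtnetlink
 * handlers and netdevice notifier are only hooked up once everything
 * they may call into is ready.
 */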
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* The blackhole dst ops share the rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* Hook up the RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE handlers;
	 * RTM_GETROUTE lookups are allowed to run without holding RTNL.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists of routes not anchored in the FIB tree; these are
	 * walked when a device goes away so their references can be
	 * released.
	 */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

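/*
 * Mirror of ip6_route_init(): tear everything down in reverse order of
 * registration.
 */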
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}