1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * INET		An implementation of the TCP/IP protocol suite for the LINUX
4   *		operating system.  INET is implemented using the  BSD Socket
5   *		interface as the means of communication with the user level.
6   *
7   *		ROUTE - implementation of the IP router.
8   *
9   * Authors:	Ross Biro
10   *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11   *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12   *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13   *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14   *
15   * Fixes:
16   *		Alan Cox	:	Verify area fixes.
17   *		Alan Cox	:	cli() protects routing changes
18   *		Rui Oliveira	:	ICMP routing table updates
19   *		(rco@di.uminho.pt)	Routing table insertion and update
20   *		Linus Torvalds	:	Rewrote bits to be sensible
21   *		Alan Cox	:	Added BSD route gw semantics
22   *		Alan Cox	:	Super /proc >4K
23   *		Alan Cox	:	MTU in route table
24   *		Alan Cox	:	MSS actually. Also added the window
25   *					clamper.
26   *		Sam Lantinga	:	Fixed route matching in rt_del()
27   *		Alan Cox	:	Routing cache support.
28   *		Alan Cox	:	Removed compatibility cruft.
29   *		Alan Cox	:	RTF_REJECT support.
30   *		Alan Cox	:	TCP irtt support.
31   *		Jonathan Naylor	:	Added Metric support.
32   *	Miquel van Smoorenburg	:	BSD API fixes.
33   *	Miquel van Smoorenburg	:	Metrics.
34   *		Alan Cox	:	Use __u32 properly
35   *		Alan Cox	:	Aligned routing errors more closely with BSD;
36   *					our system is still very different.
37   *		Alan Cox	:	Faster /proc handling
38   *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
39   *					routing caches and better behaviour.
40   *
41   *		Olaf Erb	:	irtt wasn't being copied right.
42   *		Bjorn Ekwall	:	Kerneld route support.
43   *		Alan Cox	:	Multicast fixed (I hope)
44   *		Pavel Krauz	:	Limited broadcast fixed
45   *		Mike McLagan	:	Routing by source
46   *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
47   *					route.c and rewritten from scratch.
48   *		Andi Kleen	:	Load-limit warning messages.
49   *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
50   *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
51   *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
52   *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
53   *		Marc Boucher	:	routing by fwmark
54   *	Robert Olsson		:	Added rt_cache statistics
55   *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
56   *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
57   *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
58   *	Ilia Sotnikov		:	Removed TOS from hash calculations
59   */
60  
61  #define pr_fmt(fmt) "IPv4: " fmt
62  
63  #include <linux/module.h>
64  #include <linux/bitops.h>
65  #include <linux/kernel.h>
66  #include <linux/mm.h>
67  #include <linux/memblock.h>
68  #include <linux/socket.h>
69  #include <linux/errno.h>
70  #include <linux/in.h>
71  #include <linux/inet.h>
72  #include <linux/netdevice.h>
73  #include <linux/proc_fs.h>
74  #include <linux/init.h>
75  #include <linux/skbuff.h>
76  #include <linux/inetdevice.h>
77  #include <linux/igmp.h>
78  #include <linux/pkt_sched.h>
79  #include <linux/mroute.h>
80  #include <linux/netfilter_ipv4.h>
81  #include <linux/random.h>
82  #include <linux/rcupdate.h>
83  #include <linux/slab.h>
84  #include <linux/jhash.h>
85  #include <net/dst.h>
86  #include <net/dst_metadata.h>
87  #include <net/inet_dscp.h>
88  #include <net/net_namespace.h>
89  #include <net/ip.h>
90  #include <net/route.h>
91  #include <net/inetpeer.h>
92  #include <net/sock.h>
93  #include <net/ip_fib.h>
94  #include <net/nexthop.h>
95  #include <net/tcp.h>
96  #include <net/icmp.h>
97  #include <net/xfrm.h>
98  #include <net/lwtunnel.h>
99  #include <net/netevent.h>
100  #include <net/rtnetlink.h>
101  #ifdef CONFIG_SYSCTL
102  #include <linux/sysctl.h>
103  #endif
104  #include <net/secure_seq.h>
105  #include <net/ip_tunnels.h>
106  
107  #include "fib_lookup.h"
108  
109  #define RT_FL_TOS(oldflp4) \
110  	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
111  
112  #define RT_GC_TIMEOUT (300*HZ)
113  
114  #define DEFAULT_MIN_PMTU (512 + 20 + 20)
115  #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
116  #define DEFAULT_MIN_ADVMSS 256
117  static int ip_rt_max_size;
118  static int ip_rt_redirect_number __read_mostly	= 9;
119  static int ip_rt_redirect_load __read_mostly	= HZ / 50;
120  static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
121  static int ip_rt_error_cost __read_mostly	= HZ;
122  static int ip_rt_error_burst __read_mostly	= 5 * HZ;
123  
124  static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
125  
126  /*
127   *	Interface to generic destination cache.
128   */
129  
130  INDIRECT_CALLABLE_SCOPE
131  struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
132  static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
133  INDIRECT_CALLABLE_SCOPE
134  unsigned int		ipv4_mtu(const struct dst_entry *dst);
135  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
136  static void		 ipv4_link_failure(struct sk_buff *skb);
137  static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
138  					   struct sk_buff *skb, u32 mtu,
139  					   bool confirm_neigh);
140  static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141  					struct sk_buff *skb);
142  static void		ipv4_dst_destroy(struct dst_entry *dst);
143  
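/* IPv4 route metrics come from the FIB (fib_info) and are never
 * copied on write, so this callback should never run.
 */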
144  static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
145  {
146  	WARN_ON(1);
147  	return NULL;
148  }
149  
150  static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
151  					   struct sk_buff *skb,
152  					   const void *daddr);
153  static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
154  
155  static struct dst_ops ipv4_dst_ops = {
156  	.family =		AF_INET,
157  	.check =		ipv4_dst_check,
158  	.default_advmss =	ipv4_default_advmss,
159  	.mtu =			ipv4_mtu,
160  	.cow_metrics =		ipv4_cow_metrics,
161  	.destroy =		ipv4_dst_destroy,
162  	.negative_advice =	ipv4_negative_advice,
163  	.link_failure =		ipv4_link_failure,
164  	.update_pmtu =		ip_rt_update_pmtu,
165  	.redirect =		ip_do_redirect,
166  	.local_out =		__ip_local_out,
167  	.neigh_lookup =		ipv4_neigh_lookup,
168  	.confirm_neigh =	ipv4_confirm_neigh,
169  };
170  
171  #define ECN_OR_COST(class)	TC_PRIO_##class
172  
173  const __u8 ip_tos2prio[16] = {
174  	TC_PRIO_BESTEFFORT,
175  	ECN_OR_COST(BESTEFFORT),
176  	TC_PRIO_BESTEFFORT,
177  	ECN_OR_COST(BESTEFFORT),
178  	TC_PRIO_BULK,
179  	ECN_OR_COST(BULK),
180  	TC_PRIO_BULK,
181  	ECN_OR_COST(BULK),
182  	TC_PRIO_INTERACTIVE,
183  	ECN_OR_COST(INTERACTIVE),
184  	TC_PRIO_INTERACTIVE,
185  	ECN_OR_COST(INTERACTIVE),
186  	TC_PRIO_INTERACTIVE_BULK,
187  	ECN_OR_COST(INTERACTIVE_BULK),
188  	TC_PRIO_INTERACTIVE_BULK,
189  	ECN_OR_COST(INTERACTIVE_BULK)
190  };
191  EXPORT_SYMBOL(ip_tos2prio);
192  
193  static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
194  #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
195  
196  #ifdef CONFIG_PROC_FS
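/* /proc/net/rt_cache is kept only for backwards compatibility: the
 * per-flow routing cache is gone, so the seq_file emits the header
 * line and nothing else.
 */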
197  static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
198  {
199  	if (*pos)
200  		return NULL;
201  	return SEQ_START_TOKEN;
202  }
203  
204  static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
205  {
206  	++*pos;
207  	return NULL;
208  }
209  
210  static void rt_cache_seq_stop(struct seq_file *seq, void *v)
211  {
212  }
213  
214  static int rt_cache_seq_show(struct seq_file *seq, void *v)
215  {
216  	if (v == SEQ_START_TOKEN)
217  		seq_printf(seq, "%-127s\n",
218  			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
219  			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
220  			   "HHUptod\tSpecDst");
221  	return 0;
222  }
223  
224  static const struct seq_operations rt_cache_seq_ops = {
225  	.start  = rt_cache_seq_start,
226  	.next   = rt_cache_seq_next,
227  	.stop   = rt_cache_seq_stop,
228  	.show   = rt_cache_seq_show,
229  };
230  
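/* /proc/net/stat/rt_cache: one row of per-CPU rt_cache_stat
 * counters for every possible CPU.
 */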
231  static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
232  {
233  	int cpu;
234  
235  	if (*pos == 0)
236  		return SEQ_START_TOKEN;
237  
238  	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
239  		if (!cpu_possible(cpu))
240  			continue;
241  		*pos = cpu+1;
242  		return &per_cpu(rt_cache_stat, cpu);
243  	}
244  	return NULL;
245  }
246  
247  static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
248  {
249  	int cpu;
250  
251  	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
252  		if (!cpu_possible(cpu))
253  			continue;
254  		*pos = cpu+1;
255  		return &per_cpu(rt_cache_stat, cpu);
256  	}
257  	(*pos)++;
258  	return NULL;
259  
260  }
261  
262  static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
263  {
264  
265  }
266  
267  static int rt_cpu_seq_show(struct seq_file *seq, void *v)
268  {
269  	struct rt_cache_stat *st = v;
270  
271  	if (v == SEQ_START_TOKEN) {
272  		seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
273  		return 0;
274  	}
275  
276  	seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
277  			"%08x       %08x %08x     %08x    %08x %08x   "
278  			"%08x     %08x        %08x        %08x\n",
279  		   dst_entries_get_slow(&ipv4_dst_ops),
280  		   0, /* st->in_hit */
281  		   st->in_slow_tot,
282  		   st->in_slow_mc,
283  		   st->in_no_route,
284  		   st->in_brd,
285  		   st->in_martian_dst,
286  		   st->in_martian_src,
287  
288  		   0, /* st->out_hit */
289  		   st->out_slow_tot,
290  		   st->out_slow_mc,
291  
292  		   0, /* st->gc_total */
293  		   0, /* st->gc_ignored */
294  		   0, /* st->gc_goal_miss */
295  		   0, /* st->gc_dst_overflow */
296  		   0, /* st->in_hlist_search */
297  		   0  /* st->out_hlist_search */
298  		);
299  	return 0;
300  }
301  
302  static const struct seq_operations rt_cpu_seq_ops = {
303  	.start  = rt_cpu_seq_start,
304  	.next   = rt_cpu_seq_next,
305  	.stop   = rt_cpu_seq_stop,
306  	.show   = rt_cpu_seq_show,
307  };
308  
309  #ifdef CONFIG_IP_ROUTE_CLASSID
310  static int rt_acct_proc_show(struct seq_file *m, void *v)
311  {
312  	struct ip_rt_acct *dst, *src;
313  	unsigned int i, j;
314  
315  	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
316  	if (!dst)
317  		return -ENOMEM;
318  
319  	for_each_possible_cpu(i) {
320  		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
321  		for (j = 0; j < 256; j++) {
322  			dst[j].o_bytes   += src[j].o_bytes;
323  			dst[j].o_packets += src[j].o_packets;
324  			dst[j].i_bytes   += src[j].i_bytes;
325  			dst[j].i_packets += src[j].i_packets;
326  		}
327  	}
328  
329  	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
330  	kfree(dst);
331  	return 0;
332  }
333  #endif
334  
335  static int __net_init ip_rt_do_proc_init(struct net *net)
336  {
337  	struct proc_dir_entry *pde;
338  
339  	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
340  			      &rt_cache_seq_ops);
341  	if (!pde)
342  		goto err1;
343  
344  	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
345  			      &rt_cpu_seq_ops);
346  	if (!pde)
347  		goto err2;
348  
349  #ifdef CONFIG_IP_ROUTE_CLASSID
350  	pde = proc_create_single("rt_acct", 0, net->proc_net,
351  			rt_acct_proc_show);
352  	if (!pde)
353  		goto err3;
354  #endif
355  	return 0;
356  
357  #ifdef CONFIG_IP_ROUTE_CLASSID
358  err3:
359  	remove_proc_entry("rt_cache", net->proc_net_stat);
360  #endif
361  err2:
362  	remove_proc_entry("rt_cache", net->proc_net);
363  err1:
364  	return -ENOMEM;
365  }
366  
367  static void __net_exit ip_rt_do_proc_exit(struct net *net)
368  {
369  	remove_proc_entry("rt_cache", net->proc_net_stat);
370  	remove_proc_entry("rt_cache", net->proc_net);
371  #ifdef CONFIG_IP_ROUTE_CLASSID
372  	remove_proc_entry("rt_acct", net->proc_net);
373  #endif
374  }
375  
376  static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
377  	.init = ip_rt_do_proc_init,
378  	.exit = ip_rt_do_proc_exit,
379  };
380  
381  static int __init ip_rt_proc_init(void)
382  {
383  	return register_pernet_subsys(&ip_rt_proc_ops);
384  }
385  
386  #else
387  static inline int ip_rt_proc_init(void)
388  {
389  	return 0;
390  }
391  #endif /* CONFIG_PROC_FS */
392  
393  static inline bool rt_is_expired(const struct rtable *rth)
394  {
395  	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
396  }
397  
398  void rt_cache_flush(struct net *net)
399  {
400  	rt_genid_bump_ipv4(net);
401  }
402  
403  static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
404  					   struct sk_buff *skb,
405  					   const void *daddr)
406  {
407  	const struct rtable *rt = container_of(dst, struct rtable, dst);
408  	struct net_device *dev = dst->dev;
409  	struct neighbour *n;
410  
411  	rcu_read_lock_bh();
412  
413  	if (likely(rt->rt_gw_family == AF_INET)) {
414  		n = ip_neigh_gw4(dev, rt->rt_gw4);
415  	} else if (rt->rt_gw_family == AF_INET6) {
416  		n = ip_neigh_gw6(dev, &rt->rt_gw6);
417  	} else {
418  		__be32 pkey;
419  
420  		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
421  		n = ip_neigh_gw4(dev, pkey);
422  	}
423  
424  	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
425  		n = NULL;
426  
427  	rcu_read_unlock_bh();
428  
429  	return n;
430  }
431  
432  static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
433  {
434  	const struct rtable *rt = container_of(dst, struct rtable, dst);
435  	struct net_device *dev = dst->dev;
436  	const __be32 *pkey = daddr;
437  
438  	if (rt->rt_gw_family == AF_INET) {
439  		pkey = (const __be32 *)&rt->rt_gw4;
440  	} else if (rt->rt_gw_family == AF_INET6) {
441  		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
442  	} else if (!daddr ||
443  		 (rt->rt_flags &
444  		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
445  		return;
446  	}
447  	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
448  }
449  
450  /* Hash tables of size 2048..262144 depending on RAM size.
451   * Each bucket uses 8 bytes.
452   */
453  static u32 ip_idents_mask __read_mostly;
454  static atomic_t *ip_idents __read_mostly;
455  static u32 *ip_tstamps __read_mostly;
456  
457  /* In order to protect privacy, we add a perturbation to identifiers
458   * if one generator is seldom used. This makes it hard for an attacker
459   * to infer how many packets were sent between two points in time.
460   */
461  static u32 ip_idents_reserve(u32 hash, int segs)
462  {
463  	u32 bucket, old, now = (u32)jiffies;
464  	atomic_t *p_id;
465  	u32 *p_tstamp;
466  	u32 delta = 0;
467  
468  	bucket = hash & ip_idents_mask;
469  	p_tstamp = ip_tstamps + bucket;
470  	p_id = ip_idents + bucket;
471  	old = READ_ONCE(*p_tstamp);
472  
473  	if (old != now && cmpxchg(p_tstamp, old, now) == old)
474  		delta = prandom_u32_max(now - old);
475  
476  	/* If UBSAN reports an error there, please make sure your compiler
477  	 * supports -fno-strict-overflow before reporting it; that was a bug
478  	 * in UBSAN, and it has been fixed in GCC-8.
479  	 */
480  	return atomic_add_return(segs + delta, p_id) - segs;
481  }
482  
483  void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
484  {
485  	u32 hash, id;
486  
487  	/* Note the following code is not safe, but this is okay. */
488  	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
489  		get_random_bytes(&net->ipv4.ip_id_key,
490  				 sizeof(net->ipv4.ip_id_key));
491  
492  	hash = siphash_3u32((__force u32)iph->daddr,
493  			    (__force u32)iph->saddr,
494  			    iph->protocol,
495  			    &net->ipv4.ip_id_key);
496  	id = ip_idents_reserve(hash, segs);
497  	iph->id = htons(id);
498  }
499  EXPORT_SYMBOL(__ip_select_ident);
500  
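/* Fold the legacy RTO_ONLINK flag into the flow: keep only the TOS
 * bits relevant for routing and force link scope when RTO_ONLINK is
 * set.
 */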
501  static void ip_rt_fix_tos(struct flowi4 *fl4)
502  {
503  	__u8 tos = RT_FL_TOS(fl4);
504  
505  	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
506  	if (tos & RTO_ONLINK)
507  		fl4->flowi4_scope = RT_SCOPE_LINK;
508  }
509  
510  static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
511  			     const struct sock *sk, const struct iphdr *iph,
512  			     int oif, __u8 tos, u8 prot, u32 mark,
513  			     int flow_flags)
514  {
515  	__u8 scope = RT_SCOPE_UNIVERSE;
516  
517  	if (sk) {
518  		const struct inet_sock *inet = inet_sk(sk);
519  
520  		oif = sk->sk_bound_dev_if;
521  		mark = sk->sk_mark;
522  		tos = ip_sock_rt_tos(sk);
523  		scope = ip_sock_rt_scope(sk);
524  		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
525  	}
526  
527  	flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
528  			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
529  			   sock_net_uid(net, sk));
530  }
531  
532  static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
533  			       const struct sock *sk)
534  {
535  	const struct net *net = dev_net(skb->dev);
536  	const struct iphdr *iph = ip_hdr(skb);
537  	int oif = skb->dev->ifindex;
538  	u8 prot = iph->protocol;
539  	u32 mark = skb->mark;
540  	__u8 tos = iph->tos;
541  
542  	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
543  }
544  
545  static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
546  {
547  	const struct inet_sock *inet = inet_sk(sk);
548  	const struct ip_options_rcu *inet_opt;
549  	__be32 daddr = inet->inet_daddr;
550  
551  	rcu_read_lock();
552  	inet_opt = rcu_dereference(inet->inet_opt);
553  	if (inet_opt && inet_opt->opt.srr)
554  		daddr = inet_opt->opt.faddr;
555  	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
556  			   ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
557  			   ip_sock_rt_scope(sk),
558  			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
559  			   inet_sk_flowi_flags(sk),
560  			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
561  	rcu_read_unlock();
562  }
563  
564  static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
565  				 const struct sk_buff *skb)
566  {
567  	if (skb)
568  		build_skb_flow_key(fl4, skb, sk);
569  	else
570  		build_sk_flow_key(fl4, sk);
571  }
572  
573  static DEFINE_SPINLOCK(fnhe_lock);
574  
575  static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
576  {
577  	struct rtable *rt;
578  
579  	rt = rcu_dereference(fnhe->fnhe_rth_input);
580  	if (rt) {
581  		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
582  		dst_dev_put(&rt->dst);
583  		dst_release(&rt->dst);
584  	}
585  	rt = rcu_dereference(fnhe->fnhe_rth_output);
586  	if (rt) {
587  		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
588  		dst_dev_put(&rt->dst);
589  		dst_release(&rt->dst);
590  	}
591  }
592  
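/* Evict the exception with the oldest fnhe_stamp from this bucket.
 * Called with fnhe_lock held.
 */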
593  static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
594  {
595  	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
596  	struct fib_nh_exception *fnhe, *oldest = NULL;
597  
598  	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
599  		fnhe = rcu_dereference_protected(*fnhe_p,
600  						 lockdep_is_held(&fnhe_lock));
601  		if (!fnhe)
602  			break;
603  		if (!oldest ||
604  		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
605  			oldest = fnhe;
606  			oldest_p = fnhe_p;
607  		}
608  	}
609  	fnhe_flush_routes(oldest);
610  	*oldest_p = oldest->fnhe_next;
611  	kfree_rcu(oldest, rcu);
612  }
613  
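/* Hash the destination address into the nexthop exception table
 * using a random, boot-time siphash key.
 */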
614  static u32 fnhe_hashfun(__be32 daddr)
615  {
616  	static siphash_aligned_key_t fnhe_hash_key;
617  	u64 hval;
618  
619  	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
620  	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
621  	return hash_64(hval, FNHE_HASH_SHIFT);
622  }
623  
624  static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
625  {
626  	rt->rt_pmtu = fnhe->fnhe_pmtu;
627  	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
628  	rt->dst.expires = fnhe->fnhe_expires;
629  
630  	if (fnhe->fnhe_gw) {
631  		rt->rt_flags |= RTCF_REDIRECTED;
632  		rt->rt_uses_gateway = 1;
633  		rt->rt_gw_family = AF_INET;
634  		rt->rt_gw4 = fnhe->fnhe_gw;
635  	}
636  }
637  
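/* Record redirect/PMTU state learned for @daddr on nexthop @nhc,
 * creating a new exception entry if none exists yet and marking the
 * routes already cached on the nexthop so they get revalidated.
 */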
638  static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
639  				  __be32 gw, u32 pmtu, bool lock,
640  				  unsigned long expires)
641  {
642  	struct fnhe_hash_bucket *hash;
643  	struct fib_nh_exception *fnhe;
644  	struct rtable *rt;
645  	u32 genid, hval;
646  	unsigned int i;
647  	int depth;
648  
649  	genid = fnhe_genid(dev_net(nhc->nhc_dev));
650  	hval = fnhe_hashfun(daddr);
651  
652  	spin_lock_bh(&fnhe_lock);
653  
654  	hash = rcu_dereference(nhc->nhc_exceptions);
655  	if (!hash) {
656  		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
657  		if (!hash)
658  			goto out_unlock;
659  		rcu_assign_pointer(nhc->nhc_exceptions, hash);
660  	}
661  
662  	hash += hval;
663  
664  	depth = 0;
665  	for (fnhe = rcu_dereference(hash->chain); fnhe;
666  	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
667  		if (fnhe->fnhe_daddr == daddr)
668  			break;
669  		depth++;
670  	}
671  
672  	if (fnhe) {
673  		if (fnhe->fnhe_genid != genid)
674  			fnhe->fnhe_genid = genid;
675  		if (gw)
676  			fnhe->fnhe_gw = gw;
677  		if (pmtu) {
678  			fnhe->fnhe_pmtu = pmtu;
679  			fnhe->fnhe_mtu_locked = lock;
680  		}
681  		fnhe->fnhe_expires = max(1UL, expires);
682  		/* Update all cached dsts too */
683  		rt = rcu_dereference(fnhe->fnhe_rth_input);
684  		if (rt)
685  			fill_route_from_fnhe(rt, fnhe);
686  		rt = rcu_dereference(fnhe->fnhe_rth_output);
687  		if (rt)
688  			fill_route_from_fnhe(rt, fnhe);
689  	} else {
690  		/* Randomize max depth to avoid some side-channel attacks. */
691  		int max_depth = FNHE_RECLAIM_DEPTH +
692  				prandom_u32_max(FNHE_RECLAIM_DEPTH);
693  
694  		while (depth > max_depth) {
695  			fnhe_remove_oldest(hash);
696  			depth--;
697  		}
698  
699  		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
700  		if (!fnhe)
701  			goto out_unlock;
702  
703  		fnhe->fnhe_next = hash->chain;
704  
705  		fnhe->fnhe_genid = genid;
706  		fnhe->fnhe_daddr = daddr;
707  		fnhe->fnhe_gw = gw;
708  		fnhe->fnhe_pmtu = pmtu;
709  		fnhe->fnhe_mtu_locked = lock;
710  		fnhe->fnhe_expires = max(1UL, expires);
711  
712  		rcu_assign_pointer(hash->chain, fnhe);
713  
714  		/* Exception created; mark the cached routes for the nexthop
715  		 * stale, so anyone caching it rechecks if this exception
716  		 * applies to them.
717  		 */
718  		rt = rcu_dereference(nhc->nhc_rth_input);
719  		if (rt)
720  			rt->dst.obsolete = DST_OBSOLETE_KILL;
721  
722  		for_each_possible_cpu(i) {
723  			struct rtable __rcu **prt;
724  
725  			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
726  			rt = rcu_dereference(*prt);
727  			if (rt)
728  				rt->dst.obsolete = DST_OBSOLETE_KILL;
729  		}
730  	}
731  
732  	fnhe->fnhe_stamp = jiffies;
733  
734  out_unlock:
735  	spin_unlock_bh(&fnhe_lock);
736  }
737  
738  static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
739  			     bool kill_route)
740  {
741  	__be32 new_gw = icmp_hdr(skb)->un.gateway;
742  	__be32 old_gw = ip_hdr(skb)->saddr;
743  	struct net_device *dev = skb->dev;
744  	struct in_device *in_dev;
745  	struct fib_result res;
746  	struct neighbour *n;
747  	struct net *net;
748  
749  	switch (icmp_hdr(skb)->code & 7) {
750  	case ICMP_REDIR_NET:
751  	case ICMP_REDIR_NETTOS:
752  	case ICMP_REDIR_HOST:
753  	case ICMP_REDIR_HOSTTOS:
754  		break;
755  
756  	default:
757  		return;
758  	}
759  
760  	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
761  		return;
762  
763  	in_dev = __in_dev_get_rcu(dev);
764  	if (!in_dev)
765  		return;
766  
767  	net = dev_net(dev);
768  	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
769  	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
770  	    ipv4_is_zeronet(new_gw))
771  		goto reject_redirect;
772  
773  	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
774  		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
775  			goto reject_redirect;
776  		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
777  			goto reject_redirect;
778  	} else {
779  		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
780  			goto reject_redirect;
781  	}
782  
783  	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
784  	if (!n)
785  		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
786  	if (!IS_ERR(n)) {
787  		if (!(n->nud_state & NUD_VALID)) {
788  			neigh_event_send(n, NULL);
789  		} else {
790  			if (fib_lookup(net, fl4, &res, 0) == 0) {
791  				struct fib_nh_common *nhc;
792  
793  				fib_select_path(net, &res, fl4, skb);
794  				nhc = FIB_RES_NHC(res);
795  				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
796  						0, false,
797  						jiffies + ip_rt_gc_timeout);
798  			}
799  			if (kill_route)
800  				rt->dst.obsolete = DST_OBSOLETE_KILL;
801  			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
802  		}
803  		neigh_release(n);
804  	}
805  	return;
806  
807  reject_redirect:
808  #ifdef CONFIG_IP_ROUTE_VERBOSE
809  	if (IN_DEV_LOG_MARTIANS(in_dev)) {
810  		const struct iphdr *iph = (const struct iphdr *) skb->data;
811  		__be32 daddr = iph->daddr;
812  		__be32 saddr = iph->saddr;
813  
814  		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
815  				     "  Advised path = %pI4 -> %pI4\n",
816  				     &old_gw, dev->name, &new_gw,
817  				     &saddr, &daddr);
818  	}
819  #endif
820  	;
821  }
822  
823  static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
824  {
825  	struct rtable *rt;
826  	struct flowi4 fl4;
827  	const struct iphdr *iph = (const struct iphdr *) skb->data;
828  	struct net *net = dev_net(skb->dev);
829  	int oif = skb->dev->ifindex;
830  	u8 prot = iph->protocol;
831  	u32 mark = skb->mark;
832  	__u8 tos = iph->tos;
833  
834  	rt = (struct rtable *) dst;
835  
836  	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
837  	__ip_do_redirect(rt, skb, &fl4, true);
838  }
839  
840  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
841  {
842  	struct rtable *rt = (struct rtable *)dst;
843  	struct dst_entry *ret = dst;
844  
845  	if (rt) {
846  		if (dst->obsolete > 0) {
847  			ip_rt_put(rt);
848  			ret = NULL;
849  		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
850  			   rt->dst.expires) {
851  			ip_rt_put(rt);
852  			ret = NULL;
853  		}
854  	}
855  	return ret;
856  }
857  
858  /*
859   * Algorithm:
860   *	1. The first ip_rt_redirect_number redirects are sent
861   *	   with exponential backoff, then we stop sending them at all,
862   *	   assuming that the host ignores our redirects.
863   *	2. If we did not see packets requiring redirects
864   *	   during ip_rt_redirect_silence, we assume that the host
865   *	   forgot redirected route and start to send redirects again.
866   *
867   * This algorithm is much cheaper and more intelligent than dumb load limiting
868   * in icmp.c.
869   *
870   * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
871   * and "frag. need" (breaks PMTU discovery) in icmp.c.
872   */
873  
874  void ip_rt_send_redirect(struct sk_buff *skb)
875  {
876  	struct rtable *rt = skb_rtable(skb);
877  	struct in_device *in_dev;
878  	struct inet_peer *peer;
879  	struct net *net;
880  	int log_martians;
881  	int vif;
882  
883  	rcu_read_lock();
884  	in_dev = __in_dev_get_rcu(rt->dst.dev);
885  	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
886  		rcu_read_unlock();
887  		return;
888  	}
889  	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
890  	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
891  	rcu_read_unlock();
892  
893  	net = dev_net(rt->dst.dev);
894  	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
895  	if (!peer) {
896  		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
897  			  rt_nexthop(rt, ip_hdr(skb)->daddr));
898  		return;
899  	}
900  
901  	/* No redirected packets during ip_rt_redirect_silence;
902  	 * reset the algorithm.
903  	 */
904  	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
905  		peer->rate_tokens = 0;
906  		peer->n_redirects = 0;
907  	}
908  
909  	/* Too many ignored redirects; do not send anything and
910  	 * set dst.rate_last to the last seen redirected packet.
911  	 */
912  	if (peer->n_redirects >= ip_rt_redirect_number) {
913  		peer->rate_last = jiffies;
914  		goto out_put_peer;
915  	}
916  
917  	/* Check for load limit; set rate_last to the latest sent
918  	 * redirect.
919  	 */
920  	if (peer->n_redirects == 0 ||
921  	    time_after(jiffies,
922  		       (peer->rate_last +
923  			(ip_rt_redirect_load << peer->n_redirects)))) {
924  		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
925  
926  		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
927  		peer->rate_last = jiffies;
928  		++peer->n_redirects;
929  #ifdef CONFIG_IP_ROUTE_VERBOSE
930  		if (log_martians &&
931  		    peer->n_redirects == ip_rt_redirect_number)
932  			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
933  					     &ip_hdr(skb)->saddr, inet_iif(skb),
934  					     &ip_hdr(skb)->daddr, &gw);
935  #endif
936  	}
937  out_put_peer:
938  	inet_putpeer(peer);
939  }
940  
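/* Handle a packet that matched an error route: update the SNMP
 * counters and send a rate-limited ICMP destination unreachable,
 * using the inet_peer token bucket to bound the ICMP rate.
 */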
941  static int ip_error(struct sk_buff *skb)
942  {
943  	struct rtable *rt = skb_rtable(skb);
944  	struct net_device *dev = skb->dev;
945  	struct in_device *in_dev;
946  	struct inet_peer *peer;
947  	unsigned long now;
948  	struct net *net;
949  	SKB_DR(reason);
950  	bool send;
951  	int code;
952  
953  	if (netif_is_l3_master(skb->dev)) {
954  		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
955  		if (!dev)
956  			goto out;
957  	}
958  
959  	in_dev = __in_dev_get_rcu(dev);
960  
961  	/* IP on this device is disabled. */
962  	if (!in_dev)
963  		goto out;
964  
965  	net = dev_net(rt->dst.dev);
966  	if (!IN_DEV_FORWARD(in_dev)) {
967  		switch (rt->dst.error) {
968  		case EHOSTUNREACH:
969  			SKB_DR_SET(reason, IP_INADDRERRORS);
970  			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
971  			break;
972  
973  		case ENETUNREACH:
974  			SKB_DR_SET(reason, IP_INNOROUTES);
975  			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
976  			break;
977  		}
978  		goto out;
979  	}
980  
981  	switch (rt->dst.error) {
982  	case EINVAL:
983  	default:
984  		goto out;
985  	case EHOSTUNREACH:
986  		code = ICMP_HOST_UNREACH;
987  		break;
988  	case ENETUNREACH:
989  		code = ICMP_NET_UNREACH;
990  		SKB_DR_SET(reason, IP_INNOROUTES);
991  		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
992  		break;
993  	case EACCES:
994  		code = ICMP_PKT_FILTERED;
995  		break;
996  	}
997  
998  	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
999  			       l3mdev_master_ifindex(skb->dev), 1);
1000  
1001  	send = true;
1002  	if (peer) {
1003  		now = jiffies;
1004  		peer->rate_tokens += now - peer->rate_last;
1005  		if (peer->rate_tokens > ip_rt_error_burst)
1006  			peer->rate_tokens = ip_rt_error_burst;
1007  		peer->rate_last = now;
1008  		if (peer->rate_tokens >= ip_rt_error_cost)
1009  			peer->rate_tokens -= ip_rt_error_cost;
1010  		else
1011  			send = false;
1012  		inet_putpeer(peer);
1013  	}
1014  	if (send)
1015  		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1016  
1017  out:	kfree_skb_reason(skb, reason);
1018  	return 0;
1019  }
1020  
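/* Lower the path MTU towards fl4->daddr.  Values below
 * ip_rt_min_pmtu are clamped and locked; the result is stored as a
 * nexthop exception that expires after ip_rt_mtu_expires.
 */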
1021  static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1022  {
1023  	struct dst_entry *dst = &rt->dst;
1024  	struct net *net = dev_net(dst->dev);
1025  	struct fib_result res;
1026  	bool lock = false;
1027  	u32 old_mtu;
1028  
1029  	if (ip_mtu_locked(dst))
1030  		return;
1031  
1032  	old_mtu = ipv4_mtu(dst);
1033  	if (old_mtu < mtu)
1034  		return;
1035  
1036  	if (mtu < net->ipv4.ip_rt_min_pmtu) {
1037  		lock = true;
1038  		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
1039  	}
1040  
1041  	if (rt->rt_pmtu == mtu && !lock &&
1042  	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
1043  		return;
1044  
1045  	rcu_read_lock();
1046  	if (fib_lookup(net, fl4, &res, 0) == 0) {
1047  		struct fib_nh_common *nhc;
1048  
1049  		fib_select_path(net, &res, fl4, NULL);
1050  		nhc = FIB_RES_NHC(res);
1051  		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1052  				      jiffies + net->ipv4.ip_rt_mtu_expires);
1053  	}
1054  	rcu_read_unlock();
1055  }
1056  
1057  static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1058  			      struct sk_buff *skb, u32 mtu,
1059  			      bool confirm_neigh)
1060  {
1061  	struct rtable *rt = (struct rtable *) dst;
1062  	struct flowi4 fl4;
1063  
1064  	ip_rt_build_flow_key(&fl4, sk, skb);
1065  
1066  	/* Don't make lookup fail for bridged encapsulations */
1067  	if (skb && netif_is_any_bridge_port(skb->dev))
1068  		fl4.flowi4_oif = 0;
1069  
1070  	__ip_rt_update_pmtu(rt, &fl4, mtu);
1071  }
1072  
1073  void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1074  		      int oif, u8 protocol)
1075  {
1076  	const struct iphdr *iph = (const struct iphdr *)skb->data;
1077  	struct flowi4 fl4;
1078  	struct rtable *rt;
1079  	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1080  
1081  	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
1082  			 0);
1083  	rt = __ip_route_output_key(net, &fl4);
1084  	if (!IS_ERR(rt)) {
1085  		__ip_rt_update_pmtu(rt, &fl4, mtu);
1086  		ip_rt_put(rt);
1087  	}
1088  }
1089  EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1090  
1091  static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092  {
1093  	const struct iphdr *iph = (const struct iphdr *)skb->data;
1094  	struct flowi4 fl4;
1095  	struct rtable *rt;
1096  
1097  	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1098  
1099  	if (!fl4.flowi4_mark)
1100  		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1101  
1102  	rt = __ip_route_output_key(sock_net(sk), &fl4);
1103  	if (!IS_ERR(rt)) {
1104  		__ip_rt_update_pmtu(rt, &fl4, mtu);
1105  		ip_rt_put(rt);
1106  	}
1107  }
1108  
1109  void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1110  {
1111  	const struct iphdr *iph = (const struct iphdr *)skb->data;
1112  	struct flowi4 fl4;
1113  	struct rtable *rt;
1114  	struct dst_entry *odst = NULL;
1115  	bool new = false;
1116  	struct net *net = sock_net(sk);
1117  
1118  	bh_lock_sock(sk);
1119  
1120  	if (!ip_sk_accept_pmtu(sk))
1121  		goto out;
1122  
1123  	odst = sk_dst_get(sk);
1124  
1125  	if (sock_owned_by_user(sk) || !odst) {
1126  		__ipv4_sk_update_pmtu(skb, sk, mtu);
1127  		goto out;
1128  	}
1129  
1130  	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1131  
1132  	rt = (struct rtable *)odst;
1133  	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1134  		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1135  		if (IS_ERR(rt))
1136  			goto out;
1137  
1138  		new = true;
1139  	}
1140  
1141  	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1142  
1143  	if (!dst_check(&rt->dst, 0)) {
1144  		if (new)
1145  			dst_release(&rt->dst);
1146  
1147  		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1148  		if (IS_ERR(rt))
1149  			goto out;
1150  
1151  		new = true;
1152  	}
1153  
1154  	if (new)
1155  		sk_dst_set(sk, &rt->dst);
1156  
1157  out:
1158  	bh_unlock_sock(sk);
1159  	dst_release(odst);
1160  }
1161  EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1162  
1163  void ipv4_redirect(struct sk_buff *skb, struct net *net,
1164  		   int oif, u8 protocol)
1165  {
1166  	const struct iphdr *iph = (const struct iphdr *)skb->data;
1167  	struct flowi4 fl4;
1168  	struct rtable *rt;
1169  
1170  	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
1171  	rt = __ip_route_output_key(net, &fl4);
1172  	if (!IS_ERR(rt)) {
1173  		__ip_do_redirect(rt, skb, &fl4, false);
1174  		ip_rt_put(rt);
1175  	}
1176  }
1177  EXPORT_SYMBOL_GPL(ipv4_redirect);
1178  
1179  void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1180  {
1181  	const struct iphdr *iph = (const struct iphdr *)skb->data;
1182  	struct flowi4 fl4;
1183  	struct rtable *rt;
1184  	struct net *net = sock_net(sk);
1185  
1186  	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1187  	rt = __ip_route_output_key(net, &fl4);
1188  	if (!IS_ERR(rt)) {
1189  		__ip_do_redirect(rt, skb, &fl4, false);
1190  		ip_rt_put(rt);
1191  	}
1192  }
1193  EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1194  
1195  INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1196  							 u32 cookie)
1197  {
1198  	struct rtable *rt = (struct rtable *) dst;
1199  
1200  	/* All IPV4 dsts are created with ->obsolete set to the value
1201  	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1202  	 * into this function always.
1203  	 *
1204  	 * When a PMTU/redirect information update invalidates a route,
1205  	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1206  	 * DST_OBSOLETE_DEAD.
1207  	 */
1208  	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1209  		return NULL;
1210  	return dst;
1211  }
1212  EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1213  
1214  static void ipv4_send_dest_unreach(struct sk_buff *skb)
1215  {
1216  	struct ip_options opt;
1217  	int res;
1218  
1219  	/* Recompile ip options since IPCB may not be valid anymore.
1220  	 * Also check we have a reasonable ipv4 header.
1221  	 */
1222  	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1223  	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1224  		return;
1225  
1226  	memset(&opt, 0, sizeof(opt));
1227  	if (ip_hdr(skb)->ihl > 5) {
1228  		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1229  			return;
1230  		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1231  
1232  		rcu_read_lock();
1233  		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1234  		rcu_read_unlock();
1235  
1236  		if (res)
1237  			return;
1238  	}
1239  	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1240  }
1241  
1242  static void ipv4_link_failure(struct sk_buff *skb)
1243  {
1244  	struct rtable *rt;
1245  
1246  	ipv4_send_dest_unreach(skb);
1247  
1248  	rt = skb_rtable(skb);
1249  	if (rt)
1250  		dst_set_expires(&rt->dst, 0);
1251  }
1252  
1253  static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1254  {
1255  	pr_debug("%s: %pI4 -> %pI4, %s\n",
1256  		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1257  		 skb->dev ? skb->dev->name : "?");
1258  	kfree_skb(skb);
1259  	WARN_ON(1);
1260  	return 0;
1261  }
1262  
1263  /*
1264   * We do not cache the source address of the outgoing interface,
1265   * because it is used only by the IP RR, TS and SRR options,
1266   * so it stays out of the fast path.
1267   *
1268   * BTW remember: "addr" is allowed to be not aligned
1269   * in IP options!
1270   */
1271  
1272  void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1273  {
1274  	__be32 src;
1275  
1276  	if (rt_is_output_route(rt))
1277  		src = ip_hdr(skb)->saddr;
1278  	else {
1279  		struct fib_result res;
1280  		struct iphdr *iph = ip_hdr(skb);
1281  		struct flowi4 fl4 = {
1282  			.daddr = iph->daddr,
1283  			.saddr = iph->saddr,
1284  			.flowi4_tos = RT_TOS(iph->tos),
1285  			.flowi4_oif = rt->dst.dev->ifindex,
1286  			.flowi4_iif = skb->dev->ifindex,
1287  			.flowi4_mark = skb->mark,
1288  		};
1289  
1290  		rcu_read_lock();
1291  		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1292  			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1293  		else
1294  			src = inet_select_addr(rt->dst.dev,
1295  					       rt_nexthop(rt, iph->daddr),
1296  					       RT_SCOPE_UNIVERSE);
1297  		rcu_read_unlock();
1298  	}
1299  	memcpy(addr, &src, 4);
1300  }
1301  
1302  #ifdef CONFIG_IP_ROUTE_CLASSID
1303  static void set_class_tag(struct rtable *rt, u32 tag)
1304  {
1305  	if (!(rt->dst.tclassid & 0xFFFF))
1306  		rt->dst.tclassid |= tag & 0xFFFF;
1307  	if (!(rt->dst.tclassid & 0xFFFF0000))
1308  		rt->dst.tclassid |= tag & 0xFFFF0000;
1309  }
1310  #endif
1311  
1312  static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1313  {
1314  	struct net *net = dev_net(dst->dev);
1315  	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1316  	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1317  				    net->ipv4.ip_rt_min_advmss);
1318  
1319  	return min(advmss, IPV4_MAX_PMTU - header_size);
1320  }
1321  
1322  INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1323  {
1324  	return ip_dst_mtu_maybe_forward(dst, false);
1325  }
1326  EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1327  
1328  static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1329  {
1330  	struct fnhe_hash_bucket *hash;
1331  	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1332  	u32 hval = fnhe_hashfun(daddr);
1333  
1334  	spin_lock_bh(&fnhe_lock);
1335  
1336  	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1337  					 lockdep_is_held(&fnhe_lock));
1338  	hash += hval;
1339  
1340  	fnhe_p = &hash->chain;
1341  	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1342  	while (fnhe) {
1343  		if (fnhe->fnhe_daddr == daddr) {
1344  			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1345  				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1346  			/* set fnhe_daddr to 0 to ensure it won't bind with
1347  			 * new dsts in rt_bind_exception().
1348  			 */
1349  			fnhe->fnhe_daddr = 0;
1350  			fnhe_flush_routes(fnhe);
1351  			kfree_rcu(fnhe, rcu);
1352  			break;
1353  		}
1354  		fnhe_p = &fnhe->fnhe_next;
1355  		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1356  						 lockdep_is_held(&fnhe_lock));
1357  	}
1358  
1359  	spin_unlock_bh(&fnhe_lock);
1360  }
1361  
1362  static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1363  					       __be32 daddr)
1364  {
1365  	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1366  	struct fib_nh_exception *fnhe;
1367  	u32 hval;
1368  
1369  	if (!hash)
1370  		return NULL;
1371  
1372  	hval = fnhe_hashfun(daddr);
1373  
1374  	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1375  	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1376  		if (fnhe->fnhe_daddr == daddr) {
1377  			if (fnhe->fnhe_expires &&
1378  			    time_after(jiffies, fnhe->fnhe_expires)) {
1379  				ip_del_fnhe(nhc, daddr);
1380  				break;
1381  			}
1382  			return fnhe;
1383  		}
1384  	}
1385  	return NULL;
1386  }
1387  
1388  /* MTU selection:
1389   * 1. mtu on route is locked - use it
1390   * 2. mtu from nexthop exception
1391   * 3. mtu from egress device
1392   */
1393  
1394  u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1395  {
1396  	struct fib_nh_common *nhc = res->nhc;
1397  	struct net_device *dev = nhc->nhc_dev;
1398  	struct fib_info *fi = res->fi;
1399  	u32 mtu = 0;
1400  
1401  	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1402  	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1403  		mtu = fi->fib_mtu;
1404  
1405  	if (likely(!mtu)) {
1406  		struct fib_nh_exception *fnhe;
1407  
1408  		fnhe = find_exception(nhc, daddr);
1409  		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1410  			mtu = fnhe->fnhe_pmtu;
1411  	}
1412  
1413  	if (likely(!mtu))
1414  		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1415  
1416  	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1417  }
1418  
1419  static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1420  			      __be32 daddr, const bool do_cache)
1421  {
1422  	bool ret = false;
1423  
1424  	spin_lock_bh(&fnhe_lock);
1425  
1426  	if (daddr == fnhe->fnhe_daddr) {
1427  		struct rtable __rcu **porig;
1428  		struct rtable *orig;
1429  		int genid = fnhe_genid(dev_net(rt->dst.dev));
1430  
1431  		if (rt_is_input_route(rt))
1432  			porig = &fnhe->fnhe_rth_input;
1433  		else
1434  			porig = &fnhe->fnhe_rth_output;
1435  		orig = rcu_dereference(*porig);
1436  
1437  		if (fnhe->fnhe_genid != genid) {
1438  			fnhe->fnhe_genid = genid;
1439  			fnhe->fnhe_gw = 0;
1440  			fnhe->fnhe_pmtu = 0;
1441  			fnhe->fnhe_expires = 0;
1442  			fnhe->fnhe_mtu_locked = false;
1443  			fnhe_flush_routes(fnhe);
1444  			orig = NULL;
1445  		}
1446  		fill_route_from_fnhe(rt, fnhe);
1447  		if (!rt->rt_gw4) {
1448  			rt->rt_gw4 = daddr;
1449  			rt->rt_gw_family = AF_INET;
1450  		}
1451  
1452  		if (do_cache) {
1453  			dst_hold(&rt->dst);
1454  			rcu_assign_pointer(*porig, rt);
1455  			if (orig) {
1456  				dst_dev_put(&orig->dst);
1457  				dst_release(&orig->dst);
1458  			}
1459  			ret = true;
1460  		}
1461  
1462  		fnhe->fnhe_stamp = jiffies;
1463  	}
1464  	spin_unlock_bh(&fnhe_lock);
1465  
1466  	return ret;
1467  }
1468  
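/* Cache @rt as the nexthop's input route or per-CPU output route.
 * If another CPU wins the cmpxchg race, drop our reference and
 * report that the route was not cached.
 */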
1469  static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1470  {
1471  	struct rtable *orig, *prev, **p;
1472  	bool ret = true;
1473  
1474  	if (rt_is_input_route(rt)) {
1475  		p = (struct rtable **)&nhc->nhc_rth_input;
1476  	} else {
1477  		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1478  	}
1479  	orig = *p;
1480  
1481  	/* hold dst before doing cmpxchg() to avoid race condition
1482  	 * on this dst
1483  	 */
1484  	dst_hold(&rt->dst);
1485  	prev = cmpxchg(p, orig, rt);
1486  	if (prev == orig) {
1487  		if (orig) {
1488  			rt_add_uncached_list(orig);
1489  			dst_release(&orig->dst);
1490  		}
1491  	} else {
1492  		dst_release(&rt->dst);
1493  		ret = false;
1494  	}
1495  
1496  	return ret;
1497  }
1498  
1499  struct uncached_list {
1500  	spinlock_t		lock;
1501  	struct list_head	head;
1502  	struct list_head	quarantine;
1503  };
1504  
1505  static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1506  
1507  void rt_add_uncached_list(struct rtable *rt)
1508  {
1509  	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1510  
1511  	rt->rt_uncached_list = ul;
1512  
1513  	spin_lock_bh(&ul->lock);
1514  	list_add_tail(&rt->rt_uncached, &ul->head);
1515  	spin_unlock_bh(&ul->lock);
1516  }
1517  
1518  void rt_del_uncached_list(struct rtable *rt)
1519  {
1520  	if (!list_empty(&rt->rt_uncached)) {
1521  		struct uncached_list *ul = rt->rt_uncached_list;
1522  
1523  		spin_lock_bh(&ul->lock);
1524  		list_del_init(&rt->rt_uncached);
1525  		spin_unlock_bh(&ul->lock);
1526  	}
1527  }
1528  
1529  static void ipv4_dst_destroy(struct dst_entry *dst)
1530  {
1531  	struct rtable *rt = (struct rtable *)dst;
1532  
1533  	ip_dst_metrics_put(dst);
1534  	rt_del_uncached_list(rt);
1535  }
1536  
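/* A device is going away: re-point its uncached routes at the
 * blackhole device and move them to the per-CPU quarantine list.
 */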
1537  void rt_flush_dev(struct net_device *dev)
1538  {
1539  	struct rtable *rt, *safe;
1540  	int cpu;
1541  
1542  	for_each_possible_cpu(cpu) {
1543  		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1544  
1545  		if (list_empty(&ul->head))
1546  			continue;
1547  
1548  		spin_lock_bh(&ul->lock);
1549  		list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
1550  			if (rt->dst.dev != dev)
1551  				continue;
1552  			rt->dst.dev = blackhole_netdev;
1553  			netdev_ref_replace(dev, blackhole_netdev,
1554  					   &rt->dst.dev_tracker, GFP_ATOMIC);
1555  			list_move(&rt->rt_uncached, &ul->quarantine);
1556  		}
1557  		spin_unlock_bh(&ul->lock);
1558  	}
1559  }
1560  
1561  static bool rt_cache_valid(const struct rtable *rt)
1562  {
1563  	return	rt &&
1564  		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1565  		!rt_is_expired(rt);
1566  }
1567  
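/* Copy nexthop attributes (gateway, metrics, classid, lwtunnel
 * state) from the FIB result into the new route and try to cache it
 * in the exception or nexthop; fall back to the uncached list.
 */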
1568  static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1569  			   const struct fib_result *res,
1570  			   struct fib_nh_exception *fnhe,
1571  			   struct fib_info *fi, u16 type, u32 itag,
1572  			   const bool do_cache)
1573  {
1574  	bool cached = false;
1575  
1576  	if (fi) {
1577  		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1578  
1579  		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1580  			rt->rt_uses_gateway = 1;
1581  			rt->rt_gw_family = nhc->nhc_gw_family;
1582  			/* only INET and INET6 are supported */
1583  			if (likely(nhc->nhc_gw_family == AF_INET))
1584  				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1585  			else
1586  				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1587  		}
1588  
1589  		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1590  
1591  #ifdef CONFIG_IP_ROUTE_CLASSID
1592  		if (nhc->nhc_family == AF_INET) {
1593  			struct fib_nh *nh;
1594  
1595  			nh = container_of(nhc, struct fib_nh, nh_common);
1596  			rt->dst.tclassid = nh->nh_tclassid;
1597  		}
1598  #endif
1599  		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1600  		if (unlikely(fnhe))
1601  			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1602  		else if (do_cache)
1603  			cached = rt_cache_route(nhc, rt);
1604  		if (unlikely(!cached)) {
1605  			/* Routes we intend to cache in nexthop exception or
1606  			 * FIB nexthop have the DST_NOCACHE bit clear.
1607  			 * However, if we are unsuccessful at storing this
1608  			 * route into the cache we really need to set it.
1609  			 */
1610  			if (!rt->rt_gw4) {
1611  				rt->rt_gw_family = AF_INET;
1612  				rt->rt_gw4 = daddr;
1613  			}
1614  			rt_add_uncached_list(rt);
1615  		}
1616  	} else
1617  		rt_add_uncached_list(rt);
1618  
1619  #ifdef CONFIG_IP_ROUTE_CLASSID
1620  #ifdef CONFIG_IP_MULTIPLE_TABLES
1621  	set_class_tag(rt, res->tclassid);
1622  #endif
1623  	set_class_tag(rt, itag);
1624  #endif
1625  }
1626  
1627  struct rtable *rt_dst_alloc(struct net_device *dev,
1628  			    unsigned int flags, u16 type,
1629  			    bool noxfrm)
1630  {
1631  	struct rtable *rt;
1632  
1633  	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1634  		       (noxfrm ? DST_NOXFRM : 0));
1635  
1636  	if (rt) {
1637  		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1638  		rt->rt_flags = flags;
1639  		rt->rt_type = type;
1640  		rt->rt_is_input = 0;
1641  		rt->rt_iif = 0;
1642  		rt->rt_pmtu = 0;
1643  		rt->rt_mtu_locked = 0;
1644  		rt->rt_uses_gateway = 0;
1645  		rt->rt_gw_family = 0;
1646  		rt->rt_gw4 = 0;
1647  		INIT_LIST_HEAD(&rt->rt_uncached);
1648  
1649  		rt->dst.output = ip_output;
1650  		if (flags & RTCF_LOCAL)
1651  			rt->dst.input = ip_local_deliver;
1652  	}
1653  
1654  	return rt;
1655  }
1656  EXPORT_SYMBOL(rt_dst_alloc);
1657  
1658  struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1659  {
1660  	struct rtable *new_rt;
1661  
1662  	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1663  			   rt->dst.flags);
1664  
1665  	if (new_rt) {
1666  		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1667  		new_rt->rt_flags = rt->rt_flags;
1668  		new_rt->rt_type = rt->rt_type;
1669  		new_rt->rt_is_input = rt->rt_is_input;
1670  		new_rt->rt_iif = rt->rt_iif;
1671  		new_rt->rt_pmtu = rt->rt_pmtu;
1672  		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1673  		new_rt->rt_gw_family = rt->rt_gw_family;
1674  		if (rt->rt_gw_family == AF_INET)
1675  			new_rt->rt_gw4 = rt->rt_gw4;
1676  		else if (rt->rt_gw_family == AF_INET6)
1677  			new_rt->rt_gw6 = rt->rt_gw6;
1678  		INIT_LIST_HEAD(&new_rt->rt_uncached);
1679  
1680  		new_rt->dst.input = rt->dst.input;
1681  		new_rt->dst.output = rt->dst.output;
1682  		new_rt->dst.error = rt->dst.error;
1683  		new_rt->dst.lastuse = jiffies;
1684  		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685  	}
1686  	return new_rt;
1687  }
1688  EXPORT_SYMBOL(rt_dst_clone);
1689  
1690  /* called in rcu_read_lock() section */
1691  int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692  			  u8 tos, struct net_device *dev,
1693  			  struct in_device *in_dev, u32 *itag)
1694  {
1695  	int err;
1696  
1697  	/* Primary sanity checks. */
1698  	if (!in_dev)
1699  		return -EINVAL;
1700  
1701  	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702  	    skb->protocol != htons(ETH_P_IP))
1703  		return -EINVAL;
1704  
1705  	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706  		return -EINVAL;
1707  
1708  	if (ipv4_is_zeronet(saddr)) {
1709  		if (!ipv4_is_local_multicast(daddr) &&
1710  		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711  			return -EINVAL;
1712  	} else {
1713  		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714  					  in_dev, itag);
1715  		if (err < 0)
1716  			return err;
1717  	}
1718  	return 0;
1719  }
1720  
1721  /* called in rcu_read_lock() section */
1722  static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723  			     u8 tos, struct net_device *dev, int our)
1724  {
1725  	struct in_device *in_dev = __in_dev_get_rcu(dev);
1726  	unsigned int flags = RTCF_MULTICAST;
1727  	struct rtable *rth;
1728  	u32 itag = 0;
1729  	int err;
1730  
1731  	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732  	if (err)
1733  		return err;
1734  
1735  	if (our)
1736  		flags |= RTCF_LOCAL;
1737  
1738  	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1739  		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1740  
1741  	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1742  			   false);
1743  	if (!rth)
1744  		return -ENOBUFS;
1745  
1746  #ifdef CONFIG_IP_ROUTE_CLASSID
1747  	rth->dst.tclassid = itag;
1748  #endif
1749  	rth->dst.output = ip_rt_bug;
1750  	rth->rt_is_input = 1;
1751  
1752  #ifdef CONFIG_IP_MROUTE
1753  	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1754  		rth->dst.input = ip_mr_input;
1755  #endif
1756  	RT_CACHE_STAT_INC(in_slow_mc);
1757  
1758  	skb_dst_drop(skb);
1759  	skb_dst_set(skb, &rth->dst);
1760  	return 0;
1761  }
1762  
1763  
1764  static void ip_handle_martian_source(struct net_device *dev,
1765  				     struct in_device *in_dev,
1766  				     struct sk_buff *skb,
1767  				     __be32 daddr,
1768  				     __be32 saddr)
1769  {
1770  	RT_CACHE_STAT_INC(in_martian_src);
1771  #ifdef CONFIG_IP_ROUTE_VERBOSE
1772  	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773  		/*
1774  		 *	RFC 1812 recommendation: if the source is martian,
1775  		 *	the only hint we can log is the MAC header.
1776  		 */
1777  		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778  			&daddr, &saddr, dev->name);
1779  		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780  			print_hex_dump(KERN_WARNING, "ll header: ",
1781  				       DUMP_PREFIX_OFFSET, 16, 1,
1782  				       skb_mac_header(skb),
1783  				       dev->hard_header_len, false);
1784  		}
1785  	}
1786  #endif
1787  }
1788  
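      /* Create the forwarding route for an input packet: validate the source,
       * decide whether an ICMP redirect should be sent, reuse a cached nexthop
       * route when one is still valid, or allocate a new one and attach it to
       * the skb.
       */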
1789  /* called in rcu_read_lock() section */
1790  static int __mkroute_input(struct sk_buff *skb,
1791  			   const struct fib_result *res,
1792  			   struct in_device *in_dev,
1793  			   __be32 daddr, __be32 saddr, u32 tos)
1794  {
1795  	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796  	struct net_device *dev = nhc->nhc_dev;
1797  	struct fib_nh_exception *fnhe;
1798  	struct rtable *rth;
1799  	int err;
1800  	struct in_device *out_dev;
1801  	bool do_cache;
1802  	u32 itag = 0;
1803  
1804  	/* get a working reference to the output device */
1805  	out_dev = __in_dev_get_rcu(dev);
1806  	if (!out_dev) {
1807  		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808  		return -EINVAL;
1809  	}
1810  
1811  	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812  				  in_dev->dev, in_dev, &itag);
1813  	if (err < 0) {
1814  		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815  					 saddr);
1816  
1817  		goto cleanup;
1818  	}
1819  
1820  	do_cache = res->fi && !itag;
1821  	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822  	    skb->protocol == htons(ETH_P_IP)) {
1823  		__be32 gw;
1824  
1825  		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826  		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827  		    inet_addr_onlink(out_dev, saddr, gw))
1828  			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829  	}
1830  
1831  	if (skb->protocol != htons(ETH_P_IP)) {
1832  		/* Not IP (i.e. ARP). Do not create a route if it is
1833  		 * invalid for proxy arp. DNAT routes are always valid.
1834  		 *
1835  		 * The proxy arp feature has been extended to allow ARP
1836  		 * replies back to the same interface, to support
1837  		 * Private VLAN switch technologies. See arp.c.
1838  		 */
1839  		if (out_dev == in_dev &&
1840  		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841  			err = -EINVAL;
1842  			goto cleanup;
1843  		}
1844  	}
1845  
1846  	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1847  		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1848  
1849  	fnhe = find_exception(nhc, daddr);
1850  	if (do_cache) {
1851  		if (fnhe)
1852  			rth = rcu_dereference(fnhe->fnhe_rth_input);
1853  		else
1854  			rth = rcu_dereference(nhc->nhc_rth_input);
1855  		if (rt_cache_valid(rth)) {
1856  			skb_dst_set_noref(skb, &rth->dst);
1857  			goto out;
1858  		}
1859  	}
1860  
1861  	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1862  			   IN_DEV_ORCONF(out_dev, NOXFRM));
1863  	if (!rth) {
1864  		err = -ENOBUFS;
1865  		goto cleanup;
1866  	}
1867  
1868  	rth->rt_is_input = 1;
1869  	RT_CACHE_STAT_INC(in_slow_tot);
1870  
1871  	rth->dst.input = ip_forward;
1872  
1873  	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1874  		       do_cache);
1875  	lwtunnel_set_redirect(&rth->dst);
1876  	skb_dst_set(skb, &rth->dst);
1877  out:
1878  	err = 0;
1879   cleanup:
1880  	return err;
1881  }
1882  
1883  #ifdef CONFIG_IP_ROUTE_MULTIPATH
1884  /* To make ICMP packets follow the right flow, the multipath hash is
1885   * calculated from the inner IP addresses.
1886   */
1887  static void ip_multipath_l3_keys(const struct sk_buff *skb,
1888  				 struct flow_keys *hash_keys)
1889  {
1890  	const struct iphdr *outer_iph = ip_hdr(skb);
1891  	const struct iphdr *key_iph = outer_iph;
1892  	const struct iphdr *inner_iph;
1893  	const struct icmphdr *icmph;
1894  	struct iphdr _inner_iph;
1895  	struct icmphdr _icmph;
1896  
1897  	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1898  		goto out;
1899  
1900  	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1901  		goto out;
1902  
1903  	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1904  				   &_icmph);
1905  	if (!icmph)
1906  		goto out;
1907  
1908  	if (!icmp_is_err(icmph->type))
1909  		goto out;
1910  
1911  	inner_iph = skb_header_pointer(skb,
1912  				       outer_iph->ihl * 4 + sizeof(_icmph),
1913  				       sizeof(_inner_iph), &_inner_iph);
1914  	if (!inner_iph)
1915  		goto out;
1916  
1917  	key_iph = inner_iph;
1918  out:
1919  	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1920  	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1921  }
1922  
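      /* Hash the outer headers of @skb using only the fields selected in the
       * fib_multipath_hash_fields sysctl; *p_has_inner reports whether the
       * flow dissector encountered an encapsulation.
       */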
1923  static u32 fib_multipath_custom_hash_outer(const struct net *net,
1924  					   const struct sk_buff *skb,
1925  					   bool *p_has_inner)
1926  {
1927  	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1928  	struct flow_keys keys, hash_keys;
1929  
1930  	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1931  		return 0;
1932  
1933  	memset(&hash_keys, 0, sizeof(hash_keys));
1934  	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1935  
1936  	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1937  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1938  		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1939  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1940  		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1941  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1942  		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1943  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1944  		hash_keys.ports.src = keys.ports.src;
1945  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1946  		hash_keys.ports.dst = keys.ports.dst;
1947  
1948  	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1949  	return flow_hash_from_keys(&hash_keys);
1950  }
1951  
1952  static u32 fib_multipath_custom_hash_inner(const struct net *net,
1953  					   const struct sk_buff *skb,
1954  					   bool has_inner)
1955  {
1956  	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1957  	struct flow_keys keys, hash_keys;
1958  
1959  	/* We assume the packet carries an encapsulation, but if none was
1960  	 * encountered during dissection of the outer flow, then there is no
1961  	 * point in calling the flow dissector again.
1962  	 */
1963  	if (!has_inner)
1964  		return 0;
1965  
1966  	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1967  		return 0;
1968  
1969  	memset(&hash_keys, 0, sizeof(hash_keys));
1970  	skb_flow_dissect_flow_keys(skb, &keys, 0);
1971  
1972  	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1973  		return 0;
1974  
1975  	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976  		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977  		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1978  			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979  		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1980  			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981  	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982  		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983  		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1984  			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1985  		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1986  			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1987  		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1988  			hash_keys.tags.flow_label = keys.tags.flow_label;
1989  	}
1990  
1991  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1992  		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1993  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1994  		hash_keys.ports.src = keys.ports.src;
1995  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1996  		hash_keys.ports.dst = keys.ports.dst;
1997  
1998  	return flow_hash_from_keys(&hash_keys);
1999  }
2000  
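      /* Combine the custom outer- and inner-header hashes of @skb into a
       * single multipath hash value.
       */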
2001  static u32 fib_multipath_custom_hash_skb(const struct net *net,
2002  					 const struct sk_buff *skb)
2003  {
2004  	u32 mhash, mhash_inner;
2005  	bool has_inner = true;
2006  
2007  	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2008  	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2009  
2010  	return jhash_2words(mhash, mhash_inner, 0);
2011  }
2012  
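      /* Same idea as fib_multipath_custom_hash_skb(), but for locally
       * generated traffic where only the flow key (@fl4) is available, so no
       * inner headers can be dissected.
       */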
2013  static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2014  					 const struct flowi4 *fl4)
2015  {
2016  	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2017  	struct flow_keys hash_keys;
2018  
2019  	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2020  		return 0;
2021  
2022  	memset(&hash_keys, 0, sizeof(hash_keys));
2023  	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2025  		hash_keys.addrs.v4addrs.src = fl4->saddr;
2026  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2027  		hash_keys.addrs.v4addrs.dst = fl4->daddr;
2028  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2029  		hash_keys.basic.ip_proto = fl4->flowi4_proto;
2030  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2031  		hash_keys.ports.src = fl4->fl4_sport;
2032  	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2033  		hash_keys.ports.dst = fl4->fl4_dport;
2034  
2035  	return flow_hash_from_keys(&hash_keys);
2036  }
2037  
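      /* Compute the multipath hash according to the fib_multipath_hash_policy
       * sysctl: 0 - L3 addresses (inner addresses for ICMP errors), 1 - L4
       * five-tuple, 2 - L3 addresses of the inner packet when encapsulated,
       * otherwise of the outer one, 3 - custom field set selected by the
       * fib_multipath_hash_fields sysctl.  The result is reduced to 31 bits.
       */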
2038  /* if skb is set it will be used and fl4 can be NULL */
2039  int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2040  		       const struct sk_buff *skb, struct flow_keys *flkeys)
2041  {
2042  	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2043  	struct flow_keys hash_keys;
2044  	u32 mhash = 0;
2045  
2046  	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2047  	case 0:
2048  		memset(&hash_keys, 0, sizeof(hash_keys));
2049  		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2050  		if (skb) {
2051  			ip_multipath_l3_keys(skb, &hash_keys);
2052  		} else {
2053  			hash_keys.addrs.v4addrs.src = fl4->saddr;
2054  			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2055  		}
2056  		mhash = flow_hash_from_keys(&hash_keys);
2057  		break;
2058  	case 1:
2059  		/* skb is currently provided only when forwarding */
2060  		if (skb) {
2061  			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062  			struct flow_keys keys;
2063  
2064  			/* short-circuit if we already have L4 hash present */
2065  			if (skb->l4_hash)
2066  				return skb_get_hash_raw(skb) >> 1;
2067  
2068  			memset(&hash_keys, 0, sizeof(hash_keys));
2069  
2070  			if (!flkeys) {
2071  				skb_flow_dissect_flow_keys(skb, &keys, flag);
2072  				flkeys = &keys;
2073  			}
2074  
2075  			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2076  			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2077  			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2078  			hash_keys.ports.src = flkeys->ports.src;
2079  			hash_keys.ports.dst = flkeys->ports.dst;
2080  			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2081  		} else {
2082  			memset(&hash_keys, 0, sizeof(hash_keys));
2083  			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2084  			hash_keys.addrs.v4addrs.src = fl4->saddr;
2085  			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2086  			hash_keys.ports.src = fl4->fl4_sport;
2087  			hash_keys.ports.dst = fl4->fl4_dport;
2088  			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2089  		}
2090  		mhash = flow_hash_from_keys(&hash_keys);
2091  		break;
2092  	case 2:
2093  		memset(&hash_keys, 0, sizeof(hash_keys));
2094  		/* skb is currently provided only when forwarding */
2095  		if (skb) {
2096  			struct flow_keys keys;
2097  
2098  			skb_flow_dissect_flow_keys(skb, &keys, 0);
2099  			/* Inner can be v4 or v6 */
2100  			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2101  				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2102  				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2103  				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2104  			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2105  				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2106  				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2107  				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2108  				hash_keys.tags.flow_label = keys.tags.flow_label;
2109  				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2110  			} else {
2111  				/* Same as case 0 */
2112  				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2113  				ip_multipath_l3_keys(skb, &hash_keys);
2114  			}
2115  		} else {
2116  			/* Same as case 0 */
2117  			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2118  			hash_keys.addrs.v4addrs.src = fl4->saddr;
2119  			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2120  		}
2121  		mhash = flow_hash_from_keys(&hash_keys);
2122  		break;
2123  	case 3:
2124  		if (skb)
2125  			mhash = fib_multipath_custom_hash_skb(net, skb);
2126  		else
2127  			mhash = fib_multipath_custom_hash_fl4(net, fl4);
2128  		break;
2129  	}
2130  
2131  	if (multipath_hash)
2132  		mhash = jhash_2words(mhash, multipath_hash, 0);
2133  
2134  	return mhash >> 1;
2135  }
2136  #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2137  
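      /* Select a nexthop for multipath routes (when enabled) and then create
       * the input route via __mkroute_input().
       */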
2138  static int ip_mkroute_input(struct sk_buff *skb,
2139  			    struct fib_result *res,
2140  			    struct in_device *in_dev,
2141  			    __be32 daddr, __be32 saddr, u32 tos,
2142  			    struct flow_keys *hkeys)
2143  {
2144  #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145  	if (res->fi && fib_info_num_path(res->fi) > 1) {
2146  		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2147  
2148  		fib_select_multipath(res, h);
2149  	}
2150  #endif
2151  
2152  	/* create a routing cache entry */
2153  	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2154  }
2155  
2156  /* Implements the same saddr-related checks as ip_route_input_slow(),
2157   * assuming daddr is valid and the destination is not a local broadcast
2158   * address. Uses the provided hint instead of performing a route lookup.
2159   */
2160  int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2161  		      u8 tos, struct net_device *dev,
2162  		      const struct sk_buff *hint)
2163  {
2164  	struct in_device *in_dev = __in_dev_get_rcu(dev);
2165  	struct rtable *rt = skb_rtable(hint);
2166  	struct net *net = dev_net(dev);
2167  	int err = -EINVAL;
2168  	u32 tag = 0;
2169  
2170  	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2171  		goto martian_source;
2172  
2173  	if (ipv4_is_zeronet(saddr))
2174  		goto martian_source;
2175  
2176  	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2177  		goto martian_source;
2178  
2179  	if (rt->rt_type != RTN_LOCAL)
2180  		goto skip_validate_source;
2181  
2182  	tos &= IPTOS_RT_MASK;
2183  	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2184  	if (err < 0)
2185  		goto martian_source;
2186  
2187  skip_validate_source:
2188  	skb_dst_copy(skb, hint);
2189  	return 0;
2190  
2191  martian_source:
2192  	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2193  	return err;
2194  }
2195  
2196  /* get device for dst_alloc with local routes */
2197  static struct net_device *ip_rt_get_dev(struct net *net,
2198  					const struct fib_result *res)
2199  {
2200  	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2201  	struct net_device *dev = NULL;
2202  
2203  	if (nhc)
2204  		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2205  
2206  	return dev ? : net->loopback_dev;
2207  }
2208  
2209  /*
2210   *	NOTE. We drop all packets that have a local source
2211   *	address, because every properly looped-back packet
2212   *	must already have the correct destination attached by the output routine.
2213   *	Changes in the enforced policies must also be applied to
2214   *	ip_route_use_hint().
2215   *
2216   *	This approach solves two big problems:
2217   *	1. Non-simplex devices are handled properly.
2218   *	2. IP spoofing attempts are filtered with a 100% guarantee.
2219   *	called with rcu_read_lock()
2220   */
2221  
2222  static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2223  			       u8 tos, struct net_device *dev,
2224  			       struct fib_result *res)
2225  {
2226  	struct in_device *in_dev = __in_dev_get_rcu(dev);
2227  	struct flow_keys *flkeys = NULL, _flkeys;
2228  	struct net    *net = dev_net(dev);
2229  	struct ip_tunnel_info *tun_info;
2230  	int		err = -EINVAL;
2231  	unsigned int	flags = 0;
2232  	u32		itag = 0;
2233  	struct rtable	*rth;
2234  	struct flowi4	fl4;
2235  	bool do_cache = true;
2236  
2237  	/* IP on this device is disabled. */
2238  
2239  	if (!in_dev)
2240  		goto out;
2241  
2242  	/* Check for the weirdest martians, which cannot be detected
2243  	 * by fib_lookup.
2244  	 */
2245  
2246  	tun_info = skb_tunnel_info(skb);
2247  	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2248  		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2249  	else
2250  		fl4.flowi4_tun_key.tun_id = 0;
2251  	skb_dst_drop(skb);
2252  
2253  	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2254  		goto martian_source;
2255  
2256  	res->fi = NULL;
2257  	res->table = NULL;
2258  	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2259  		goto brd_input;
2260  
2261  	/* Accept zero addresses only for limited broadcast;
2262  	 * it is unclear whether this should be fixed. Waiting for complaints :-)
2263  	 */
2264  	if (ipv4_is_zeronet(saddr))
2265  		goto martian_source;
2266  
2267  	if (ipv4_is_zeronet(daddr))
2268  		goto martian_destination;
2269  
2270  	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2271  	 * and calls it at most once, when daddr and/or saddr is a loopback address.
2272  	 */
2273  	if (ipv4_is_loopback(daddr)) {
2274  		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2275  			goto martian_destination;
2276  	} else if (ipv4_is_loopback(saddr)) {
2277  		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2278  			goto martian_source;
2279  	}
2280  
2281  	/*
2282  	 *	Now we are ready to route the packet.
2283  	 */
2284  	fl4.flowi4_l3mdev = 0;
2285  	fl4.flowi4_oif = 0;
2286  	fl4.flowi4_iif = dev->ifindex;
2287  	fl4.flowi4_mark = skb->mark;
2288  	fl4.flowi4_tos = tos;
2289  	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2290  	fl4.flowi4_flags = 0;
2291  	fl4.daddr = daddr;
2292  	fl4.saddr = saddr;
2293  	fl4.flowi4_uid = sock_net_uid(net, NULL);
2294  	fl4.flowi4_multipath_hash = 0;
2295  
2296  	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2297  		flkeys = &_flkeys;
2298  	} else {
2299  		fl4.flowi4_proto = 0;
2300  		fl4.fl4_sport = 0;
2301  		fl4.fl4_dport = 0;
2302  	}
2303  
2304  	err = fib_lookup(net, &fl4, res, 0);
2305  	if (err != 0) {
2306  		if (!IN_DEV_FORWARD(in_dev))
2307  			err = -EHOSTUNREACH;
2308  		goto no_route;
2309  	}
2310  
2311  	if (res->type == RTN_BROADCAST) {
2312  		if (IN_DEV_BFORWARD(in_dev))
2313  			goto make_route;
2314  		/* do not cache if bc_forwarding is enabled */
2315  		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2316  			do_cache = false;
2317  		goto brd_input;
2318  	}
2319  
2320  	if (res->type == RTN_LOCAL) {
2321  		err = fib_validate_source(skb, saddr, daddr, tos,
2322  					  0, dev, in_dev, &itag);
2323  		if (err < 0)
2324  			goto martian_source;
2325  		goto local_input;
2326  	}
2327  
2328  	if (!IN_DEV_FORWARD(in_dev)) {
2329  		err = -EHOSTUNREACH;
2330  		goto no_route;
2331  	}
2332  	if (res->type != RTN_UNICAST)
2333  		goto martian_destination;
2334  
2335  make_route:
2336  	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2337  out:	return err;
2338  
2339  brd_input:
2340  	if (skb->protocol != htons(ETH_P_IP))
2341  		goto e_inval;
2342  
2343  	if (!ipv4_is_zeronet(saddr)) {
2344  		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2345  					  in_dev, &itag);
2346  		if (err < 0)
2347  			goto martian_source;
2348  	}
2349  	flags |= RTCF_BROADCAST;
2350  	res->type = RTN_BROADCAST;
2351  	RT_CACHE_STAT_INC(in_brd);
2352  
2353  local_input:
2354  	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2355  		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2356  
2357  	do_cache &= res->fi && !itag;
2358  	if (do_cache) {
2359  		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2360  
2361  		rth = rcu_dereference(nhc->nhc_rth_input);
2362  		if (rt_cache_valid(rth)) {
2363  			skb_dst_set_noref(skb, &rth->dst);
2364  			err = 0;
2365  			goto out;
2366  		}
2367  	}
2368  
2369  	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2370  			   flags | RTCF_LOCAL, res->type, false);
2371  	if (!rth)
2372  		goto e_nobufs;
2373  
2374  	rth->dst.output = ip_rt_bug;
2375  #ifdef CONFIG_IP_ROUTE_CLASSID
2376  	rth->dst.tclassid = itag;
2377  #endif
2378  	rth->rt_is_input = 1;
2379  
2380  	RT_CACHE_STAT_INC(in_slow_tot);
2381  	if (res->type == RTN_UNREACHABLE) {
2382  		rth->dst.input = ip_error;
2383  		rth->dst.error = -err;
2384  		rth->rt_flags &= ~RTCF_LOCAL;
2385  	}
2386  
2387  	if (do_cache) {
2388  		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2389  
2390  		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2391  		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2392  			WARN_ON(rth->dst.input == lwtunnel_input);
2393  			rth->dst.lwtstate->orig_input = rth->dst.input;
2394  			rth->dst.input = lwtunnel_input;
2395  		}
2396  
2397  		if (unlikely(!rt_cache_route(nhc, rth)))
2398  			rt_add_uncached_list(rth);
2399  	}
2400  	skb_dst_set(skb, &rth->dst);
2401  	err = 0;
2402  	goto out;
2403  
2404  no_route:
2405  	RT_CACHE_STAT_INC(in_no_route);
2406  	res->type = RTN_UNREACHABLE;
2407  	res->fi = NULL;
2408  	res->table = NULL;
2409  	goto local_input;
2410  
2411  	/*
2412  	 *	Do not cache martian addresses: they should be logged (RFC1812)
2413  	 */
2414  martian_destination:
2415  	RT_CACHE_STAT_INC(in_martian_dst);
2416  #ifdef CONFIG_IP_ROUTE_VERBOSE
2417  	if (IN_DEV_LOG_MARTIANS(in_dev))
2418  		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2419  				     &daddr, &saddr, dev->name);
2420  #endif
2421  
2422  e_inval:
2423  	err = -EINVAL;
2424  	goto out;
2425  
2426  e_nobufs:
2427  	err = -ENOBUFS;
2428  	goto out;
2429  
2430  martian_source:
2431  	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2432  	goto out;
2433  }
2434  
2435  /* called with rcu_read_lock held */
2436  static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2437  			      u8 tos, struct net_device *dev, struct fib_result *res)
2438  {
2439  	/* Multicast recognition logic was moved from the route cache to here.
2440  	 * The problem was that too many Ethernet cards have broken/missing
2441  	 * hardware multicast filters :-( As a result, a host on a multicast
2442  	 * network acquires a lot of useless route cache entries, sort of
2443  	 * SDR messages from all over the world. Now we try to get rid of them.
2444  	 * Really, provided the software IP multicast filter is organized
2445  	 * reasonably (at least, hashed), it does not result in a slowdown
2446  	 * compared with route cache reject entries.
2447  	 * Note that multicast routers are not affected, because a
2448  	 * route cache entry is created eventually.
2449  	 */
2450  	if (ipv4_is_multicast(daddr)) {
2451  		struct in_device *in_dev = __in_dev_get_rcu(dev);
2452  		int our = 0;
2453  		int err = -EINVAL;
2454  
2455  		if (!in_dev)
2456  			return err;
2457  		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2458  				      ip_hdr(skb)->protocol);
2459  
2460  		/* check l3 master if no match yet */
2461  		if (!our && netif_is_l3_slave(dev)) {
2462  			struct in_device *l3_in_dev;
2463  
2464  			l3_in_dev = __in_dev_get_rcu(skb->dev);
2465  			if (l3_in_dev)
2466  				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2467  						      ip_hdr(skb)->protocol);
2468  		}
2469  
2470  		if (our
2471  #ifdef CONFIG_IP_MROUTE
2472  			||
2473  		    (!ipv4_is_local_multicast(daddr) &&
2474  		     IN_DEV_MFORWARD(in_dev))
2475  #endif
2476  		   ) {
2477  			err = ip_route_input_mc(skb, daddr, saddr,
2478  						tos, dev, our);
2479  		}
2480  		return err;
2481  	}
2482  
2483  	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2484  }
2485  
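      /* Main entry point for input route resolution: mask the TOS down to the
       * bits used for routing (IPTOS_RT_MASK) and perform the lookup under
       * rcu_read_lock().
       */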
2486  int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2487  			 u8 tos, struct net_device *dev)
2488  {
2489  	struct fib_result res;
2490  	int err;
2491  
2492  	tos &= IPTOS_RT_MASK;
2493  	rcu_read_lock();
2494  	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2495  	rcu_read_unlock();
2496  
2497  	return err;
2498  }
2499  EXPORT_SYMBOL(ip_route_input_noref);
2500  
2501  /* called with rcu_read_lock() */
2502  static struct rtable *__mkroute_output(const struct fib_result *res,
2503  				       const struct flowi4 *fl4, int orig_oif,
2504  				       struct net_device *dev_out,
2505  				       unsigned int flags)
2506  {
2507  	struct fib_info *fi = res->fi;
2508  	struct fib_nh_exception *fnhe;
2509  	struct in_device *in_dev;
2510  	u16 type = res->type;
2511  	struct rtable *rth;
2512  	bool do_cache;
2513  
2514  	in_dev = __in_dev_get_rcu(dev_out);
2515  	if (!in_dev)
2516  		return ERR_PTR(-EINVAL);
2517  
2518  	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2519  		if (ipv4_is_loopback(fl4->saddr) &&
2520  		    !(dev_out->flags & IFF_LOOPBACK) &&
2521  		    !netif_is_l3_master(dev_out))
2522  			return ERR_PTR(-EINVAL);
2523  
2524  	if (ipv4_is_lbcast(fl4->daddr))
2525  		type = RTN_BROADCAST;
2526  	else if (ipv4_is_multicast(fl4->daddr))
2527  		type = RTN_MULTICAST;
2528  	else if (ipv4_is_zeronet(fl4->daddr))
2529  		return ERR_PTR(-EINVAL);
2530  
2531  	if (dev_out->flags & IFF_LOOPBACK)
2532  		flags |= RTCF_LOCAL;
2533  
2534  	do_cache = true;
2535  	if (type == RTN_BROADCAST) {
2536  		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2537  		fi = NULL;
2538  	} else if (type == RTN_MULTICAST) {
2539  		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2540  		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2541  				     fl4->flowi4_proto))
2542  			flags &= ~RTCF_LOCAL;
2543  		else
2544  			do_cache = false;
2545  		/* If a multicast route does not exist, use the
2546  		 * default one, but do not use a gateway in this case.
2547  		 * Yes, it is a hack.
2548  		 */
2549  		if (fi && res->prefixlen < 4)
2550  			fi = NULL;
2551  	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2552  		   (orig_oif != dev_out->ifindex)) {
2553  		/* For local routes that require a particular output interface
2554  		 * we do not want to cache the result.  Caching the result
2555  		 * causes incorrect behaviour when there are multiple source
2556  		 * addresses on the interface, the end result being that if the
2557  		 * intended recipient is waiting on that interface for the
2558  		 * packet he won't receive it because it will be delivered on
2559  		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2560  		 * be set to the loopback interface as well.
2561  		 */
2562  		do_cache = false;
2563  	}
2564  
2565  	fnhe = NULL;
2566  	do_cache &= fi != NULL;
2567  	if (fi) {
2568  		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2569  		struct rtable __rcu **prth;
2570  
2571  		fnhe = find_exception(nhc, fl4->daddr);
2572  		if (!do_cache)
2573  			goto add;
2574  		if (fnhe) {
2575  			prth = &fnhe->fnhe_rth_output;
2576  		} else {
2577  			if (unlikely(fl4->flowi4_flags &
2578  				     FLOWI_FLAG_KNOWN_NH &&
2579  				     !(nhc->nhc_gw_family &&
2580  				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2581  				do_cache = false;
2582  				goto add;
2583  			}
2584  			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2585  		}
2586  		rth = rcu_dereference(*prth);
2587  		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2588  			return rth;
2589  	}
2590  
2591  add:
2592  	rth = rt_dst_alloc(dev_out, flags, type,
2593  			   IN_DEV_ORCONF(in_dev, NOXFRM));
2594  	if (!rth)
2595  		return ERR_PTR(-ENOBUFS);
2596  
2597  	rth->rt_iif = orig_oif;
2598  
2599  	RT_CACHE_STAT_INC(out_slow_tot);
2600  
2601  	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2602  		if (flags & RTCF_LOCAL &&
2603  		    !(dev_out->flags & IFF_LOOPBACK)) {
2604  			rth->dst.output = ip_mc_output;
2605  			RT_CACHE_STAT_INC(out_slow_mc);
2606  		}
2607  #ifdef CONFIG_IP_MROUTE
2608  		if (type == RTN_MULTICAST) {
2609  			if (IN_DEV_MFORWARD(in_dev) &&
2610  			    !ipv4_is_local_multicast(fl4->daddr)) {
2611  				rth->dst.input = ip_mr_input;
2612  				rth->dst.output = ip_mc_output;
2613  			}
2614  		}
2615  #endif
2616  	}
2617  
2618  	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2619  	lwtunnel_set_redirect(&rth->dst);
2620  
2621  	return rth;
2622  }
2623  
2624  /*
2625   * Major route resolver routine.
2626   */
2627  
2628  struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2629  					const struct sk_buff *skb)
2630  {
2631  	struct fib_result res = {
2632  		.type		= RTN_UNSPEC,
2633  		.fi		= NULL,
2634  		.table		= NULL,
2635  		.tclassid	= 0,
2636  	};
2637  	struct rtable *rth;
2638  
2639  	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2640  	ip_rt_fix_tos(fl4);
2641  
2642  	rcu_read_lock();
2643  	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2644  	rcu_read_unlock();
2645  
2646  	return rth;
2647  }
2648  EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2649  
2650  struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2651  					    struct fib_result *res,
2652  					    const struct sk_buff *skb)
2653  {
2654  	struct net_device *dev_out = NULL;
2655  	int orig_oif = fl4->flowi4_oif;
2656  	unsigned int flags = 0;
2657  	struct rtable *rth;
2658  	int err;
2659  
2660  	if (fl4->saddr) {
2661  		if (ipv4_is_multicast(fl4->saddr) ||
2662  		    ipv4_is_lbcast(fl4->saddr) ||
2663  		    ipv4_is_zeronet(fl4->saddr)) {
2664  			rth = ERR_PTR(-EINVAL);
2665  			goto out;
2666  		}
2667  
2668  		rth = ERR_PTR(-ENETUNREACH);
2669  
2670  		/* I removed check for oif == dev_out->oif here.
2671  		 * It was wrong for two reasons:
2672  		 * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2673  		 *    is assigned to multiple interfaces.
2674  		 * 2. Moreover, we are allowed to send packets with saddr
2675  		 *    of another iface. --ANK
2676  		 */
2677  
2678  		if (fl4->flowi4_oif == 0 &&
2679  		    (ipv4_is_multicast(fl4->daddr) ||
2680  		     ipv4_is_lbcast(fl4->daddr))) {
2681  			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2682  			dev_out = __ip_dev_find(net, fl4->saddr, false);
2683  			if (!dev_out)
2684  				goto out;
2685  
2686  			/* Special hack: the user can direct multicasts
2687  			 * and limited broadcast via the necessary interface
2688  			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2689  			 * This hack is not just for fun, it allows
2690  			 * vic, vat and friends to work.
2691  			 * They bind the socket to loopback, set the ttl to zero
2692  			 * and expect that it will work.
2693  			 * From the viewpoint of the routing cache they are broken,
2694  			 * because we are not allowed to build a multicast path
2695  			 * with a loopback source addr (look, the routing cache
2696  			 * cannot know that the ttl is zero, so that the packet
2697  			 * will not leave this host and the route is valid).
2698  			 * Luckily, this hack is a good workaround.
2699  			 */
2700  
2701  			fl4->flowi4_oif = dev_out->ifindex;
2702  			goto make_route;
2703  		}
2704  
2705  		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2706  			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2707  			if (!__ip_dev_find(net, fl4->saddr, false))
2708  				goto out;
2709  		}
2710  	}
2711  
2712  
2713  	if (fl4->flowi4_oif) {
2714  		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2715  		rth = ERR_PTR(-ENODEV);
2716  		if (!dev_out)
2717  			goto out;
2718  
2719  		/* RACE: Check return value of inet_select_addr instead. */
2720  		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2721  			rth = ERR_PTR(-ENETUNREACH);
2722  			goto out;
2723  		}
2724  		if (ipv4_is_local_multicast(fl4->daddr) ||
2725  		    ipv4_is_lbcast(fl4->daddr) ||
2726  		    fl4->flowi4_proto == IPPROTO_IGMP) {
2727  			if (!fl4->saddr)
2728  				fl4->saddr = inet_select_addr(dev_out, 0,
2729  							      RT_SCOPE_LINK);
2730  			goto make_route;
2731  		}
2732  		if (!fl4->saddr) {
2733  			if (ipv4_is_multicast(fl4->daddr))
2734  				fl4->saddr = inet_select_addr(dev_out, 0,
2735  							      fl4->flowi4_scope);
2736  			else if (!fl4->daddr)
2737  				fl4->saddr = inet_select_addr(dev_out, 0,
2738  							      RT_SCOPE_HOST);
2739  		}
2740  	}
2741  
2742  	if (!fl4->daddr) {
2743  		fl4->daddr = fl4->saddr;
2744  		if (!fl4->daddr)
2745  			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2746  		dev_out = net->loopback_dev;
2747  		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2748  		res->type = RTN_LOCAL;
2749  		flags |= RTCF_LOCAL;
2750  		goto make_route;
2751  	}
2752  
2753  	err = fib_lookup(net, fl4, res, 0);
2754  	if (err) {
2755  		res->fi = NULL;
2756  		res->table = NULL;
2757  		if (fl4->flowi4_oif &&
2758  		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2759  			/* Apparently, the routing tables are wrong. Assume
2760  			 * that the destination is on link.
2761  			 *
2762  			 * WHY? DW.
2763  			 * Because we are allowed to send to iface
2764  			 * even if it has NO routes and NO assigned
2765  			 * addresses. When oif is specified, routing
2766  			 * tables are looked up with only one purpose:
2767  			 * to catch if destination is gatewayed, rather than
2768  			 * direct. Moreover, if MSG_DONTROUTE is set,
2769  			 * we send packet, ignoring both routing tables
2770  			 * and ifaddr state. --ANK
2771  			 *
2772  			 *
2773  			 * We could make it even if oif is unknown,
2774  			 * likely IPv6, but we do not.
2775  			 */
2776  
2777  			if (fl4->saddr == 0)
2778  				fl4->saddr = inet_select_addr(dev_out, 0,
2779  							      RT_SCOPE_LINK);
2780  			res->type = RTN_UNICAST;
2781  			goto make_route;
2782  		}
2783  		rth = ERR_PTR(err);
2784  		goto out;
2785  	}
2786  
2787  	if (res->type == RTN_LOCAL) {
2788  		if (!fl4->saddr) {
2789  			if (res->fi->fib_prefsrc)
2790  				fl4->saddr = res->fi->fib_prefsrc;
2791  			else
2792  				fl4->saddr = fl4->daddr;
2793  		}
2794  
2795  		/* L3 master device is the loopback for that domain */
2796  		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2797  			net->loopback_dev;
2798  
2799  		/* make sure orig_oif points to fib result device even
2800  		 * though packet rx/tx happens over loopback or l3mdev
2801  		 */
2802  		orig_oif = FIB_RES_OIF(*res);
2803  
2804  		fl4->flowi4_oif = dev_out->ifindex;
2805  		flags |= RTCF_LOCAL;
2806  		goto make_route;
2807  	}
2808  
2809  	fib_select_path(net, res, fl4, skb);
2810  
2811  	dev_out = FIB_RES_DEV(*res);
2812  
2813  make_route:
2814  	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2815  
2816  out:
2817  	return rth;
2818  }
2819  
2820  static struct dst_ops ipv4_dst_blackhole_ops = {
2821  	.family			= AF_INET,
2822  	.default_advmss		= ipv4_default_advmss,
2823  	.neigh_lookup		= ipv4_neigh_lookup,
2824  	.check			= dst_blackhole_check,
2825  	.cow_metrics		= dst_blackhole_cow_metrics,
2826  	.update_pmtu		= dst_blackhole_update_pmtu,
2827  	.redirect		= dst_blackhole_redirect,
2828  	.mtu			= dst_blackhole_mtu,
2829  };
2830  
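      /* Replace @dst_orig with a "blackhole" dst entry that silently discards
       * anything sent through it, while preserving the routing attributes of
       * the original route; the reference on @dst_orig is released.
       */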
2831  struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2832  {
2833  	struct rtable *ort = (struct rtable *) dst_orig;
2834  	struct rtable *rt;
2835  
2836  	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2837  	if (rt) {
2838  		struct dst_entry *new = &rt->dst;
2839  
2840  		new->__use = 1;
2841  		new->input = dst_discard;
2842  		new->output = dst_discard_out;
2843  
2844  		new->dev = net->loopback_dev;
2845  		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2846  
2847  		rt->rt_is_input = ort->rt_is_input;
2848  		rt->rt_iif = ort->rt_iif;
2849  		rt->rt_pmtu = ort->rt_pmtu;
2850  		rt->rt_mtu_locked = ort->rt_mtu_locked;
2851  
2852  		rt->rt_genid = rt_genid_ipv4(net);
2853  		rt->rt_flags = ort->rt_flags;
2854  		rt->rt_type = ort->rt_type;
2855  		rt->rt_uses_gateway = ort->rt_uses_gateway;
2856  		rt->rt_gw_family = ort->rt_gw_family;
2857  		if (rt->rt_gw_family == AF_INET)
2858  			rt->rt_gw4 = ort->rt_gw4;
2859  		else if (rt->rt_gw_family == AF_INET6)
2860  			rt->rt_gw6 = ort->rt_gw6;
2861  
2862  		INIT_LIST_HEAD(&rt->rt_uncached);
2863  	}
2864  
2865  	dst_release(dst_orig);
2866  
2867  	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2868  }
2869  
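      /* Resolve an output route for @flp4 and, when a transport protocol is
       * set, run the result through xfrm_lookup_route() so that IPsec policy
       * is applied to the returned route.
       */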
2870  struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2871  				    const struct sock *sk)
2872  {
2873  	struct rtable *rt = __ip_route_output_key(net, flp4);
2874  
2875  	if (IS_ERR(rt))
2876  		return rt;
2877  
2878  	if (flp4->flowi4_proto) {
2879  		flp4->flowi4_oif = rt->dst.dev->ifindex;
2880  		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2881  							flowi4_to_flowi(flp4),
2882  							sk, 0);
2883  	}
2884  
2885  	return rt;
2886  }
2887  EXPORT_SYMBOL_GPL(ip_route_output_flow);
2888  
2889  struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2890  				      struct net_device *dev,
2891  				      struct net *net, __be32 *saddr,
2892  				      const struct ip_tunnel_info *info,
2893  				      u8 protocol, bool use_cache)
2894  {
2895  #ifdef CONFIG_DST_CACHE
2896  	struct dst_cache *dst_cache;
2897  #endif
2898  	struct rtable *rt = NULL;
2899  	struct flowi4 fl4;
2900  	__u8 tos;
2901  
2902  #ifdef CONFIG_DST_CACHE
2903  	dst_cache = (struct dst_cache *)&info->dst_cache;
2904  	if (use_cache) {
2905  		rt = dst_cache_get_ip4(dst_cache, saddr);
2906  		if (rt)
2907  			return rt;
2908  	}
2909  #endif
2910  	memset(&fl4, 0, sizeof(fl4));
2911  	fl4.flowi4_mark = skb->mark;
2912  	fl4.flowi4_proto = protocol;
2913  	fl4.daddr = info->key.u.ipv4.dst;
2914  	fl4.saddr = info->key.u.ipv4.src;
2915  	tos = info->key.tos;
2916  	fl4.flowi4_tos = RT_TOS(tos);
2917  
2918  	rt = ip_route_output_key(net, &fl4);
2919  	if (IS_ERR(rt)) {
2920  		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2921  		return ERR_PTR(-ENETUNREACH);
2922  	}
2923  	if (rt->dst.dev == dev) { /* is this necessary? */
2924  		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2925  		ip_rt_put(rt);
2926  		return ERR_PTR(-ELOOP);
2927  	}
2928  #ifdef CONFIG_DST_CACHE
2929  	if (use_cache)
2930  		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2931  #endif
2932  	*saddr = fl4.saddr;
2933  	return rt;
2934  }
2935  EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2936  
2937  /* called with rcu_read_lock held */
2938  static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2939  			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2940  			struct sk_buff *skb, u32 portid, u32 seq,
2941  			unsigned int flags)
2942  {
2943  	struct rtmsg *r;
2944  	struct nlmsghdr *nlh;
2945  	unsigned long expires = 0;
2946  	u32 error;
2947  	u32 metrics[RTAX_MAX];
2948  
2949  	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2950  	if (!nlh)
2951  		return -EMSGSIZE;
2952  
2953  	r = nlmsg_data(nlh);
2954  	r->rtm_family	 = AF_INET;
2955  	r->rtm_dst_len	= 32;
2956  	r->rtm_src_len	= 0;
2957  	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2958  	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2959  	if (nla_put_u32(skb, RTA_TABLE, table_id))
2960  		goto nla_put_failure;
2961  	r->rtm_type	= rt->rt_type;
2962  	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2963  	r->rtm_protocol = RTPROT_UNSPEC;
2964  	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2965  	if (rt->rt_flags & RTCF_NOTIFY)
2966  		r->rtm_flags |= RTM_F_NOTIFY;
2967  	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2968  		r->rtm_flags |= RTCF_DOREDIRECT;
2969  
2970  	if (nla_put_in_addr(skb, RTA_DST, dst))
2971  		goto nla_put_failure;
2972  	if (src) {
2973  		r->rtm_src_len = 32;
2974  		if (nla_put_in_addr(skb, RTA_SRC, src))
2975  			goto nla_put_failure;
2976  	}
2977  	if (rt->dst.dev &&
2978  	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2979  		goto nla_put_failure;
2980  	if (rt->dst.lwtstate &&
2981  	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2982  		goto nla_put_failure;
2983  #ifdef CONFIG_IP_ROUTE_CLASSID
2984  	if (rt->dst.tclassid &&
2985  	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2986  		goto nla_put_failure;
2987  #endif
2988  	if (fl4 && !rt_is_input_route(rt) &&
2989  	    fl4->saddr != src) {
2990  		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2991  			goto nla_put_failure;
2992  	}
2993  	if (rt->rt_uses_gateway) {
2994  		if (rt->rt_gw_family == AF_INET &&
2995  		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2996  			goto nla_put_failure;
2997  		} else if (rt->rt_gw_family == AF_INET6) {
2998  			int alen = sizeof(struct in6_addr);
2999  			struct nlattr *nla;
3000  			struct rtvia *via;
3001  
3002  			nla = nla_reserve(skb, RTA_VIA, alen + 2);
3003  			if (!nla)
3004  				goto nla_put_failure;
3005  
3006  			via = nla_data(nla);
3007  			via->rtvia_family = AF_INET6;
3008  			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
3009  		}
3010  	}
3011  
3012  	expires = rt->dst.expires;
3013  	if (expires) {
3014  		unsigned long now = jiffies;
3015  
3016  		if (time_before(now, expires))
3017  			expires -= now;
3018  		else
3019  			expires = 0;
3020  	}
3021  
3022  	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3023  	if (rt->rt_pmtu && expires)
3024  		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
3025  	if (rt->rt_mtu_locked && expires)
3026  		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
3027  	if (rtnetlink_put_metrics(skb, metrics) < 0)
3028  		goto nla_put_failure;
3029  
3030  	if (fl4) {
3031  		if (fl4->flowi4_mark &&
3032  		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
3033  			goto nla_put_failure;
3034  
3035  		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
3036  		    nla_put_u32(skb, RTA_UID,
3037  				from_kuid_munged(current_user_ns(),
3038  						 fl4->flowi4_uid)))
3039  			goto nla_put_failure;
3040  
3041  		if (rt_is_input_route(rt)) {
3042  #ifdef CONFIG_IP_MROUTE
3043  			if (ipv4_is_multicast(dst) &&
3044  			    !ipv4_is_local_multicast(dst) &&
3045  			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3046  				int err = ipmr_get_route(net, skb,
3047  							 fl4->saddr, fl4->daddr,
3048  							 r, portid);
3049  
3050  				if (err <= 0) {
3051  					if (err == 0)
3052  						return 0;
3053  					goto nla_put_failure;
3054  				}
3055  			} else
3056  #endif
3057  				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3058  					goto nla_put_failure;
3059  		}
3060  	}
3061  
3062  	error = rt->dst.error;
3063  
3064  	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3065  		goto nla_put_failure;
3066  
3067  	nlmsg_end(skb, nlh);
3068  	return 0;
3069  
3070  nla_put_failure:
3071  	nlmsg_cancel(skb, nlh);
3072  	return -EMSGSIZE;
3073  }
3074  
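      /* Dump one hash bucket of nexthop exceptions: emit an RTM_NEWROUTE
       * message for every non-expired exception of the current genid that
       * still has a cached input or output route, honouring the dump start
       * index *fa_index.
       */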
3075  static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3076  			    struct netlink_callback *cb, u32 table_id,
3077  			    struct fnhe_hash_bucket *bucket, int genid,
3078  			    int *fa_index, int fa_start, unsigned int flags)
3079  {
3080  	int i;
3081  
3082  	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3083  		struct fib_nh_exception *fnhe;
3084  
3085  		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3086  		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3087  			struct rtable *rt;
3088  			int err;
3089  
3090  			if (*fa_index < fa_start)
3091  				goto next;
3092  
3093  			if (fnhe->fnhe_genid != genid)
3094  				goto next;
3095  
3096  			if (fnhe->fnhe_expires &&
3097  			    time_after(jiffies, fnhe->fnhe_expires))
3098  				goto next;
3099  
3100  			rt = rcu_dereference(fnhe->fnhe_rth_input);
3101  			if (!rt)
3102  				rt = rcu_dereference(fnhe->fnhe_rth_output);
3103  			if (!rt)
3104  				goto next;
3105  
3106  			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3107  					   table_id, NULL, skb,
3108  					   NETLINK_CB(cb->skb).portid,
3109  					   cb->nlh->nlmsg_seq, flags);
3110  			if (err)
3111  				return err;
3112  next:
3113  			(*fa_index)++;
3114  		}
3115  	}
3116  
3117  	return 0;
3118  }
3119  
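      /* Dump the cached exception routes (PMTU exceptions, redirects) of
       * every live nexthop of @fi as part of a FIB dump.
       */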
3120  int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3121  		       u32 table_id, struct fib_info *fi,
3122  		       int *fa_index, int fa_start, unsigned int flags)
3123  {
3124  	struct net *net = sock_net(cb->skb->sk);
3125  	int nhsel, genid = fnhe_genid(net);
3126  
3127  	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3128  		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3129  		struct fnhe_hash_bucket *bucket;
3130  		int err;
3131  
3132  		if (nhc->nhc_flags & RTNH_F_DEAD)
3133  			continue;
3134  
3135  		rcu_read_lock();
3136  		bucket = rcu_dereference(nhc->nhc_exceptions);
3137  		err = 0;
3138  		if (bucket)
3139  			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3140  					       genid, fa_index, fa_start,
3141  					       flags);
3142  		rcu_read_unlock();
3143  		if (err)
3144  			return err;
3145  	}
3146  
3147  	return 0;
3148  }
3149  
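      /* Build a minimal dummy skb (IPv4 header plus an optional UDP, TCP or
       * ICMP header) so that an RTM_GETROUTE request can be pushed through
       * the real input path and the flow dissector.
       */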
3150  static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3151  						   u8 ip_proto, __be16 sport,
3152  						   __be16 dport)
3153  {
3154  	struct sk_buff *skb;
3155  	struct iphdr *iph;
3156  
3157  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3158  	if (!skb)
3159  		return NULL;
3160  
3161  	/* Reserve room for dummy headers; this skb can pass
3162  	 * through a good chunk of the routing engine.
3163  	 */
3164  	skb_reset_mac_header(skb);
3165  	skb_reset_network_header(skb);
3166  	skb->protocol = htons(ETH_P_IP);
3167  	iph = skb_put(skb, sizeof(struct iphdr));
3168  	iph->protocol = ip_proto;
3169  	iph->saddr = src;
3170  	iph->daddr = dst;
3171  	iph->version = 0x4;
3172  	iph->frag_off = 0;
3173  	iph->ihl = 0x5;
3174  	skb_set_transport_header(skb, skb->len);
3175  
3176  	switch (iph->protocol) {
3177  	case IPPROTO_UDP: {
3178  		struct udphdr *udph;
3179  
3180  		udph = skb_put_zero(skb, sizeof(struct udphdr));
3181  		udph->source = sport;
3182  		udph->dest = dport;
3183  		udph->len = htons(sizeof(struct udphdr));
3184  		udph->check = 0;
3185  		break;
3186  	}
3187  	case IPPROTO_TCP: {
3188  		struct tcphdr *tcph;
3189  
3190  		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3191  		tcph->source	= sport;
3192  		tcph->dest	= dport;
3193  		tcph->doff	= sizeof(struct tcphdr) / 4;
3194  		tcph->rst = 1;
3195  		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3196  					    src, dst, 0);
3197  		break;
3198  	}
3199  	case IPPROTO_ICMP: {
3200  		struct icmphdr *icmph;
3201  
3202  		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3203  		icmph->type = ICMP_ECHO;
3204  		icmph->code = 0;
3205  	}
3206  	}
3207  
3208  	return skb;
3209  }
3210  
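      /* Validate an RTM_GETROUTE request: check the rtmsg header and, under
       * strict netlink checking, accept only the attributes this handler
       * actually uses.
       */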
3211  static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3212  				       const struct nlmsghdr *nlh,
3213  				       struct nlattr **tb,
3214  				       struct netlink_ext_ack *extack)
3215  {
3216  	struct rtmsg *rtm;
3217  	int i, err;
3218  
3219  	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3220  		NL_SET_ERR_MSG(extack,
3221  			       "ipv4: Invalid header for route get request");
3222  		return -EINVAL;
3223  	}
3224  
3225  	if (!netlink_strict_get_check(skb))
3226  		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3227  					      rtm_ipv4_policy, extack);
3228  
3229  	rtm = nlmsg_data(nlh);
3230  	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3231  	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3232  	    rtm->rtm_table || rtm->rtm_protocol ||
3233  	    rtm->rtm_scope || rtm->rtm_type) {
3234  		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3235  		return -EINVAL;
3236  	}
3237  
3238  	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3239  			       RTM_F_LOOKUP_TABLE |
3240  			       RTM_F_FIB_MATCH)) {
3241  		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3242  		return -EINVAL;
3243  	}
3244  
3245  	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3246  					    rtm_ipv4_policy, extack);
3247  	if (err)
3248  		return err;
3249  
3250  	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3251  	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3252  		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3253  		return -EINVAL;
3254  	}
3255  
3256  	for (i = 0; i <= RTA_MAX; i++) {
3257  		if (!tb[i])
3258  			continue;
3259  
3260  		switch (i) {
3261  		case RTA_IIF:
3262  		case RTA_OIF:
3263  		case RTA_SRC:
3264  		case RTA_DST:
3265  		case RTA_IP_PROTO:
3266  		case RTA_SPORT:
3267  		case RTA_DPORT:
3268  		case RTA_MARK:
3269  		case RTA_UID:
3270  			break;
3271  		default:
3272  			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3273  			return -EINVAL;
3274  		}
3275  	}
3276  
3277  	return 0;
3278  }
3279  
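      /* RTM_GETROUTE handler: build a dummy packet from the request, resolve
       * the route through the input path (when RTA_IIF is given) or the
       * output path otherwise, and return either the matching FIB entry
       * (RTM_F_FIB_MATCH) or the resolved route to the requester.
       */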
3280  static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3281  			     struct netlink_ext_ack *extack)
3282  {
3283  	struct net *net = sock_net(in_skb->sk);
3284  	struct nlattr *tb[RTA_MAX+1];
3285  	u32 table_id = RT_TABLE_MAIN;
3286  	__be16 sport = 0, dport = 0;
3287  	struct fib_result res = {};
3288  	u8 ip_proto = IPPROTO_UDP;
3289  	struct rtable *rt = NULL;
3290  	struct sk_buff *skb;
3291  	struct rtmsg *rtm;
3292  	struct flowi4 fl4 = {};
3293  	__be32 dst = 0;
3294  	__be32 src = 0;
3295  	kuid_t uid;
3296  	u32 iif;
3297  	int err;
3298  	int mark;
3299  
3300  	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3301  	if (err < 0)
3302  		return err;
3303  
3304  	rtm = nlmsg_data(nlh);
3305  	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3306  	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3307  	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3308  	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3309  	if (tb[RTA_UID])
3310  		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3311  	else
3312  		uid = (iif ? INVALID_UID : current_uid());
3313  
3314  	if (tb[RTA_IP_PROTO]) {
3315  		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3316  						  &ip_proto, AF_INET, extack);
3317  		if (err)
3318  			return err;
3319  	}
3320  
3321  	if (tb[RTA_SPORT])
3322  		sport = nla_get_be16(tb[RTA_SPORT]);
3323  
3324  	if (tb[RTA_DPORT])
3325  		dport = nla_get_be16(tb[RTA_DPORT]);
3326  
3327  	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3328  	if (!skb)
3329  		return -ENOBUFS;
3330  
3331  	fl4.daddr = dst;
3332  	fl4.saddr = src;
3333  	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3334  	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3335  	fl4.flowi4_mark = mark;
3336  	fl4.flowi4_uid = uid;
3337  	if (sport)
3338  		fl4.fl4_sport = sport;
3339  	if (dport)
3340  		fl4.fl4_dport = dport;
3341  	fl4.flowi4_proto = ip_proto;
3342  
3343  	rcu_read_lock();
3344  
3345  	if (iif) {
3346  		struct net_device *dev;
3347  
3348  		dev = dev_get_by_index_rcu(net, iif);
3349  		if (!dev) {
3350  			err = -ENODEV;
3351  			goto errout_rcu;
3352  		}
3353  
3354  		fl4.flowi4_iif = iif; /* for rt_fill_info */
3355  		skb->dev	= dev;
3356  		skb->mark	= mark;
3357  		err = ip_route_input_rcu(skb, dst, src,
3358  					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3359  					 &res);
3360  
3361  		rt = skb_rtable(skb);
3362  		if (err == 0 && rt->dst.error)
3363  			err = -rt->dst.error;
3364  	} else {
3365  		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3366  		skb->dev = net->loopback_dev;
3367  		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3368  		err = 0;
3369  		if (IS_ERR(rt))
3370  			err = PTR_ERR(rt);
3371  		else
3372  			skb_dst_set(skb, &rt->dst);
3373  	}
3374  
3375  	if (err)
3376  		goto errout_rcu;
3377  
3378  	if (rtm->rtm_flags & RTM_F_NOTIFY)
3379  		rt->rt_flags |= RTCF_NOTIFY;
3380  
3381  	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3382  		table_id = res.table ? res.table->tb_id : 0;
3383  
3384  	/* reset skb for netlink reply msg */
3385  	skb_trim(skb, 0);
3386  	skb_reset_network_header(skb);
3387  	skb_reset_transport_header(skb);
3388  	skb_reset_mac_header(skb);
3389  
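	/* RTM_F_FIB_MATCH ("ip route get ... fibmatch"): report the matching
	 * FIB entry rather than the dst the lookup produced.
	 */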
3390  	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3391  		struct fib_rt_info fri;
3392  
3393  		if (!res.fi) {
3394  			err = fib_props[res.type].error;
3395  			if (!err)
3396  				err = -EHOSTUNREACH;
3397  			goto errout_rcu;
3398  		}
3399  		fri.fi = res.fi;
3400  		fri.tb_id = table_id;
3401  		fri.dst = res.prefix;
3402  		fri.dst_len = res.prefixlen;
3403  		fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
3404  		fri.type = rt->rt_type;
3405  		fri.offload = 0;
3406  		fri.trap = 0;
3407  		fri.offload_failed = 0;
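		/* Find the fib_alias backing this result so its hardware
		 * offload/trap state can be reported.
		 */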
3408  		if (res.fa_head) {
3409  			struct fib_alias *fa;
3410  
3411  			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3412  				u8 slen = 32 - fri.dst_len;
3413  
3414  				if (fa->fa_slen == slen &&
3415  				    fa->tb_id == fri.tb_id &&
3416  				    fa->fa_dscp == fri.dscp &&
3417  				    fa->fa_info == res.fi &&
3418  				    fa->fa_type == fri.type) {
3419  					fri.offload = READ_ONCE(fa->offload);
3420  					fri.trap = READ_ONCE(fa->trap);
3421  					break;
3422  				}
3423  			}
3424  		}
3425  		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3426  				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3427  	} else {
3428  		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3429  				   NETLINK_CB(in_skb).portid,
3430  				   nlh->nlmsg_seq, 0);
3431  	}
3432  	if (err < 0)
3433  		goto errout_rcu;
3434  
3435  	rcu_read_unlock();
3436  
3437  	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3438  
3439  errout_free:
3440  	return err;
3441  errout_rcu:
3442  	rcu_read_unlock();
3443  	kfree_skb(skb);
3444  	goto errout_free;
3445  }
3446  
3447  void ip_rt_multicast_event(struct in_device *in_dev)
3448  {
3449  	rt_cache_flush(dev_net(in_dev->dev));
3450  }
3451  
3452  #ifdef CONFIG_SYSCTL
3453  static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3454  static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3455  static int ip_rt_gc_elasticity __read_mostly	= 8;
3456  static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3457  
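/* Write-only "flush" sysctl: any write flushes the namespace's route cache
 * and invalidates its next-hop exceptions.
 */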
3458  static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3459  		void *buffer, size_t *lenp, loff_t *ppos)
3460  {
3461  	struct net *net = (struct net *)__ctl->extra1;
3462  
3463  	if (write) {
3464  		rt_cache_flush(net);
3465  		fnhe_genid_bump(net);
3466  		return 0;
3467  	}
3468  
3469  	return -EINVAL;
3470  }
3471  
3472  static struct ctl_table ipv4_route_table[] = {
3473  	{
3474  		.procname	= "gc_thresh",
3475  		.data		= &ipv4_dst_ops.gc_thresh,
3476  		.maxlen		= sizeof(int),
3477  		.mode		= 0644,
3478  		.proc_handler	= proc_dointvec,
3479  	},
3480  	{
3481  		.procname	= "max_size",
3482  		.data		= &ip_rt_max_size,
3483  		.maxlen		= sizeof(int),
3484  		.mode		= 0644,
3485  		.proc_handler	= proc_dointvec,
3486  	},
3487  	{
3488  		/*  Deprecated. Use gc_min_interval_ms */
3489  
3490  		.procname	= "gc_min_interval",
3491  		.data		= &ip_rt_gc_min_interval,
3492  		.maxlen		= sizeof(int),
3493  		.mode		= 0644,
3494  		.proc_handler	= proc_dointvec_jiffies,
3495  	},
3496  	{
3497  		.procname	= "gc_min_interval_ms",
3498  		.data		= &ip_rt_gc_min_interval,
3499  		.maxlen		= sizeof(int),
3500  		.mode		= 0644,
3501  		.proc_handler	= proc_dointvec_ms_jiffies,
3502  	},
3503  	{
3504  		.procname	= "gc_timeout",
3505  		.data		= &ip_rt_gc_timeout,
3506  		.maxlen		= sizeof(int),
3507  		.mode		= 0644,
3508  		.proc_handler	= proc_dointvec_jiffies,
3509  	},
3510  	{
3511  		.procname	= "gc_interval",
3512  		.data		= &ip_rt_gc_interval,
3513  		.maxlen		= sizeof(int),
3514  		.mode		= 0644,
3515  		.proc_handler	= proc_dointvec_jiffies,
3516  	},
3517  	{
3518  		.procname	= "redirect_load",
3519  		.data		= &ip_rt_redirect_load,
3520  		.maxlen		= sizeof(int),
3521  		.mode		= 0644,
3522  		.proc_handler	= proc_dointvec,
3523  	},
3524  	{
3525  		.procname	= "redirect_number",
3526  		.data		= &ip_rt_redirect_number,
3527  		.maxlen		= sizeof(int),
3528  		.mode		= 0644,
3529  		.proc_handler	= proc_dointvec,
3530  	},
3531  	{
3532  		.procname	= "redirect_silence",
3533  		.data		= &ip_rt_redirect_silence,
3534  		.maxlen		= sizeof(int),
3535  		.mode		= 0644,
3536  		.proc_handler	= proc_dointvec,
3537  	},
3538  	{
3539  		.procname	= "error_cost",
3540  		.data		= &ip_rt_error_cost,
3541  		.maxlen		= sizeof(int),
3542  		.mode		= 0644,
3543  		.proc_handler	= proc_dointvec,
3544  	},
3545  	{
3546  		.procname	= "error_burst",
3547  		.data		= &ip_rt_error_burst,
3548  		.maxlen		= sizeof(int),
3549  		.mode		= 0644,
3550  		.proc_handler	= proc_dointvec,
3551  	},
3552  	{
3553  		.procname	= "gc_elasticity",
3554  		.data		= &ip_rt_gc_elasticity,
3555  		.maxlen		= sizeof(int),
3556  		.mode		= 0644,
3557  		.proc_handler	= proc_dointvec,
3558  	},
3559  	{ }
3560  };
3561  
3562  static const char ipv4_route_flush_procname[] = "flush";
3563  
3564  static struct ctl_table ipv4_route_netns_table[] = {
3565  	{
3566  		.procname	= ipv4_route_flush_procname,
3567  		.maxlen		= sizeof(int),
3568  		.mode		= 0200,
3569  		.proc_handler	= ipv4_sysctl_rtcache_flush,
3570  	},
3571  	{
3572  		.procname       = "min_pmtu",
3573  		.data           = &init_net.ipv4.ip_rt_min_pmtu,
3574  		.maxlen         = sizeof(int),
3575  		.mode           = 0644,
3576  		.proc_handler   = proc_dointvec_minmax,
3577  		.extra1         = &ip_min_valid_pmtu,
3578  	},
3579  	{
3580  		.procname       = "mtu_expires",
3581  		.data           = &init_net.ipv4.ip_rt_mtu_expires,
3582  		.maxlen         = sizeof(int),
3583  		.mode           = 0644,
3584  		.proc_handler   = proc_dointvec_jiffies,
3585  	},
3586  	{
3587  		.procname   = "min_adv_mss",
3588  		.data       = &init_net.ipv4.ip_rt_min_advmss,
3589  		.maxlen     = sizeof(int),
3590  		.mode       = 0644,
3591  		.proc_handler   = proc_dointvec,
3592  	},
3593  	{ },
3594  };
3595  
3596  static __net_init int sysctl_route_net_init(struct net *net)
3597  {
3598  	struct ctl_table *tbl;
3599  
3600  	tbl = ipv4_route_netns_table;
3601  	if (!net_eq(net, &init_net)) {
3602  		int i;
3603  
3604  		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3605  		if (!tbl)
3606  			goto err_dup;
3607  
3608  		/* Don't export non-whitelisted sysctls to unprivileged users */
3609  		if (net->user_ns != &init_user_ns) {
3610  			if (tbl[0].procname != ipv4_route_flush_procname)
3611  				tbl[0].procname = NULL;
3612  		}
3613  
3614  		/* Update the data pointers to point into the current struct net,
3615  		 * except for the first element, "flush", which has no data.
3616  		 */
3617  		for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
3618  			tbl[i].data += (void *)net - (void *)&init_net;
3619  	}
3620  	tbl[0].extra1 = net;
3621  
3622  	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3623  	if (!net->ipv4.route_hdr)
3624  		goto err_reg;
3625  	return 0;
3626  
3627  err_reg:
3628  	if (tbl != ipv4_route_netns_table)
3629  		kfree(tbl);
3630  err_dup:
3631  	return -ENOMEM;
3632  }
3633  
3634  static __net_exit void sysctl_route_net_exit(struct net *net)
3635  {
3636  	struct ctl_table *tbl;
3637  
3638  	tbl = net->ipv4.route_hdr->ctl_table_arg;
3639  	unregister_net_sysctl_table(net->ipv4.route_hdr);
3640  	BUG_ON(tbl == ipv4_route_netns_table);
3641  	kfree(tbl);
3642  }
3643  
3644  static __net_initdata struct pernet_operations sysctl_route_ops = {
3645  	.init = sysctl_route_net_init,
3646  	.exit = sysctl_route_net_exit,
3647  };
3648  #endif
3649  
3650  static __net_init int netns_ip_rt_init(struct net *net)
3651  {
3652  	/* Set default value for namespaceified sysctls */
3653  	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3654  	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3655  	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3656  	return 0;
3657  }
3658  
3659  static struct pernet_operations __net_initdata ip_rt_ops = {
3660  	.init = netns_ip_rt_init,
3661  };
3662  
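/* Per-namespace generation counters: bumping rt_genid or fnhe_genid
 * invalidates cached routes and next-hop exceptions respectively.
 */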
3663  static __net_init int rt_genid_init(struct net *net)
3664  {
3665  	atomic_set(&net->ipv4.rt_genid, 0);
3666  	atomic_set(&net->fnhe_genid, 0);
3667  	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
3668  	return 0;
3669  }
3670  
3671  static __net_initdata struct pernet_operations rt_genid_ops = {
3672  	.init = rt_genid_init,
3673  };
3674  
3675  static int __net_init ipv4_inetpeer_init(struct net *net)
3676  {
3677  	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3678  
3679  	if (!bp)
3680  		return -ENOMEM;
3681  	inet_peer_base_init(bp);
3682  	net->ipv4.peers = bp;
3683  	return 0;
3684  }
3685  
3686  static void __net_exit ipv4_inetpeer_exit(struct net *net)
3687  {
3688  	struct inet_peer_base *bp = net->ipv4.peers;
3689  
3690  	net->ipv4.peers = NULL;
3691  	inetpeer_invalidate_tree(bp);
3692  	kfree(bp);
3693  }
3694  
3695  static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3696  	.init	=	ipv4_inetpeer_init,
3697  	.exit	=	ipv4_inetpeer_exit,
3698  };
3699  
3700  #ifdef CONFIG_IP_ROUTE_CLASSID
3701  struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3702  #endif /* CONFIG_IP_ROUTE_CLASSID */
3703  
3704  int __init ip_rt_init(void)
3705  {
3706  	void *idents_hash;
3707  	int cpu;
3708  
3709  	/* For modern hosts, this will use 2 MB of memory */
3710  	idents_hash = alloc_large_system_hash("IP idents",
3711  					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3712  					      0,
3713  					      16, /* one bucket per 64 KB */
3714  					      HASH_ZERO,
3715  					      NULL,
3716  					      &ip_idents_mask,
3717  					      2048,
3718  					      256*1024);
3719  
3720  	ip_idents = idents_hash;
3721  
3722  	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3723  
3724  	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3725  
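	/* Per-cpu lists tracking uncached rtables so they can be detached from
	 * a network device that is going away.
	 */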
3726  	for_each_possible_cpu(cpu) {
3727  		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3728  
3729  		INIT_LIST_HEAD(&ul->head);
3730  		INIT_LIST_HEAD(&ul->quarantine);
3731  		spin_lock_init(&ul->lock);
3732  	}
3733  #ifdef CONFIG_IP_ROUTE_CLASSID
3734  	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3735  	if (!ip_rt_acct)
3736  		panic("IP: failed to allocate ip_rt_acct\n");
3737  #endif
3738  
3739  	ipv4_dst_ops.kmem_cachep =
3740  		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3741  				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3742  
3743  	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3744  
3745  	if (dst_entries_init(&ipv4_dst_ops) < 0)
3746  		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3747  
3748  	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3749  		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3750  
3751  	ipv4_dst_ops.gc_thresh = ~0;
3752  	ip_rt_max_size = INT_MAX;
3753  
3754  	devinet_init();
3755  	ip_fib_init();
3756  
3757  	if (ip_rt_proc_init())
3758  		pr_err("Unable to create route proc files\n");
3759  #ifdef CONFIG_XFRM
3760  	xfrm_init();
3761  	xfrm4_init();
3762  #endif
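	/* Register the RTM_GETROUTE handler; it runs without taking RTNL. */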
3763  	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3764  		      RTNL_FLAG_DOIT_UNLOCKED);
3765  
3766  #ifdef CONFIG_SYSCTL
3767  	register_pernet_subsys(&sysctl_route_ops);
3768  #endif
3769  	register_pernet_subsys(&ip_rt_ops);
3770  	register_pernet_subsys(&rt_genid_ops);
3771  	register_pernet_subsys(&ipv4_inetpeer_ops);
3772  	return 0;
3773  }
3774  
3775  #ifdef CONFIG_SYSCTL
3776  /*
3777   * We really need to sanitize the damn ipv4 init order, then all
3778   * this nonsense will go away.
3779   */
3780  void __init ip_static_sysctl_init(void)
3781  {
3782  	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3783  }
3784  #endif
3785