1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 
118 	if (reuse == 2) {
119 		/* Still does not detect *everything* that goes through
120 		 * lo, since we require a loopback src or dst address
121 		 * or direct binding to 'lo' interface.
122 		 */
123 		bool loopback = false;
124 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 			loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127 		if (tw->tw_family == AF_INET6) {
128 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 				loopback = true;
135 		} else
136 #endif
137 		{
138 			if (ipv4_is_loopback(tw->tw_daddr) ||
139 			    ipv4_is_loopback(tw->tw_rcv_saddr))
140 				loopback = true;
141 		}
142 		if (!loopback)
143 			reuse = 0;
144 	}
145 
146 	/* With PAWS, it is safe from the viewpoint
147 	   of data integrity. Even without PAWS it is safe provided sequence
148 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149 
150 	   Actually, the idea is close to VJ's: only the timestamp cache is
151 	   held not per host but per port pair, and the TW bucket is used as
152 	   the state holder.
153 
154 	   If the TW bucket has already been destroyed, we fall back to VJ's
155 	   scheme and use the initial timestamp retrieved from the peer table.
156 	 */
157 	if (tcptw->tw_ts_recent_stamp &&
158 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
159 					    tcptw->tw_ts_recent_stamp)))) {
160 		/* In case of repair and re-using TIME-WAIT sockets we still
161 		 * want to be sure that it is safe as above but honor the
162 		 * sequence numbers and time stamps set as part of the repair
163 		 * process.
164 		 *
165 		 * Without this check re-using a TIME-WAIT socket with TCP
166 		 * repair would accumulate a -1 on the repair assigned
167 		 * sequence number. The first time it is reused the sequence
168 		 * is -1, the second time -2, etc. This fixes that issue
169 		 * without appearing to create any others.
170 		 */
171 		if (likely(!tp->repair)) {
172 			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173 			if (tp->write_seq == 0)
174 				tp->write_seq = 1;
175 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
176 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177 		}
178 		sock_hold(sktw);
179 		return 1;
180 	}
181 
182 	return 0;
183 }
184 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
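
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): the reuse test above depends on time_after32(), which
 * compares two 32-bit second counters in a wraparound-safe way by looking at
 * the sign of their difference. A minimal userspace equivalent, assuming
 * plain 32-bit counters, could look like this.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

/* true if 'a' is later than 'b', even across a 32-bit wraparound */
static bool example_time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}
#endif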
185 
186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 			      int addr_len)
188 {
189 	/* This check is replicated from tcp_v4_connect() and intended to
190 	 * prevent BPF program called below from accessing bytes that are out
191 	 * of the bound specified by user in addr_len.
192 	 */
193 	if (addr_len < sizeof(struct sockaddr_in))
194 		return -EINVAL;
195 
196 	sock_owned_by_me(sk);
197 
198 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 }
200 
201 /* This will initiate an outgoing connection. */
202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 {
204 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
205 	struct inet_sock *inet = inet_sk(sk);
206 	struct tcp_sock *tp = tcp_sk(sk);
207 	__be16 orig_sport, orig_dport;
208 	__be32 daddr, nexthop;
209 	struct flowi4 *fl4;
210 	struct rtable *rt;
211 	int err;
212 	struct ip_options_rcu *inet_opt;
213 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214 
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	if (usin->sin_family != AF_INET)
219 		return -EAFNOSUPPORT;
220 
221 	nexthop = daddr = usin->sin_addr.s_addr;
222 	inet_opt = rcu_dereference_protected(inet->inet_opt,
223 					     lockdep_sock_is_held(sk));
224 	if (inet_opt && inet_opt->opt.srr) {
225 		if (!daddr)
226 			return -EINVAL;
227 		nexthop = inet_opt->opt.faddr;
228 	}
229 
230 	orig_sport = inet->inet_sport;
231 	orig_dport = usin->sin_port;
232 	fl4 = &inet->cork.fl.u.ip4;
233 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235 			      IPPROTO_TCP,
236 			      orig_sport, orig_dport, sk);
237 	if (IS_ERR(rt)) {
238 		err = PTR_ERR(rt);
239 		if (err == -ENETUNREACH)
240 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
241 		return err;
242 	}
243 
244 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 		ip_rt_put(rt);
246 		return -ENETUNREACH;
247 	}
248 
249 	if (!inet_opt || !inet_opt->opt.srr)
250 		daddr = fl4->daddr;
251 
252 	if (!inet->inet_saddr)
253 		inet->inet_saddr = fl4->saddr;
254 	sk_rcv_saddr_set(sk, inet->inet_saddr);
255 
256 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
257 		/* Reset inherited state */
258 		tp->rx_opt.ts_recent	   = 0;
259 		tp->rx_opt.ts_recent_stamp = 0;
260 		if (likely(!tp->repair))
261 			tp->write_seq	   = 0;
262 	}
263 
264 	inet->inet_dport = usin->sin_port;
265 	sk_daddr_set(sk, daddr);
266 
267 	inet_csk(sk)->icsk_ext_hdr_len = 0;
268 	if (inet_opt)
269 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
270 
271 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272 
273 	/* Socket identity is still unknown (sport may be zero).
274 	 * However we set state to SYN-SENT and, without releasing the socket
275 	 * lock, select a source port, enter ourselves into the hash tables and
276 	 * complete initialization after this.
277 	 */
278 	tcp_set_state(sk, TCP_SYN_SENT);
279 	err = inet_hash_connect(tcp_death_row, sk);
280 	if (err)
281 		goto failure;
282 
283 	sk_set_txhash(sk);
284 
285 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
286 			       inet->inet_sport, inet->inet_dport, sk);
287 	if (IS_ERR(rt)) {
288 		err = PTR_ERR(rt);
289 		rt = NULL;
290 		goto failure;
291 	}
292 	/* OK, now commit destination to socket.  */
293 	sk->sk_gso_type = SKB_GSO_TCPV4;
294 	sk_setup_caps(sk, &rt->dst);
295 	rt = NULL;
296 
297 	if (likely(!tp->repair)) {
298 		if (!tp->write_seq)
299 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
300 						       inet->inet_daddr,
301 						       inet->inet_sport,
302 						       usin->sin_port);
303 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304 						 inet->inet_saddr,
305 						 inet->inet_daddr);
306 	}
307 
308 	inet->inet_id = tp->write_seq ^ jiffies;
309 
310 	if (tcp_fastopen_defer_connect(sk, &err))
311 		return err;
312 	if (err)
313 		goto failure;
314 
315 	err = tcp_connect(sk);
316 
317 	if (err)
318 		goto failure;
319 
320 	return 0;
321 
322 failure:
323 	/*
324 	 * This unhashes the socket and releases the local port,
325 	 * if necessary.
326 	 */
327 	tcp_set_state(sk, TCP_CLOSE);
328 	ip_rt_put(rt);
329 	sk->sk_route_caps = 0;
330 	inet->inet_dport = 0;
331 	return err;
332 }
333 EXPORT_SYMBOL(tcp_v4_connect);
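
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): the userspace connect() call that ends up in
 * tcp_v4_connect(). The kernel requires addr_len >= sizeof(struct sockaddr_in)
 * and sin_family == AF_INET, otherwise it fails with -EINVAL or -EAFNOSUPPORT
 * before any route lookup is done. The address and port below are arbitrary
 * examples.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;			/* must be AF_INET */
	dst.sin_port = htons(80);			/* example port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	/* addr_len must cover the whole sockaddr_in */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	close(fd);
	return 0;
}
#endif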
334 
335 /*
336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337  * It can be called through tcp_release_cb() if socket was owned by user
338  * at the time tcp_v4_err() was called to handle ICMP message.
339  */
340 void tcp_v4_mtu_reduced(struct sock *sk)
341 {
342 	struct inet_sock *inet = inet_sk(sk);
343 	struct dst_entry *dst;
344 	u32 mtu;
345 
346 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 		return;
348 	mtu = tcp_sk(sk)->mtu_info;
349 	dst = inet_csk_update_pmtu(sk, mtu);
350 	if (!dst)
351 		return;
352 
353 	/* Something is about to go wrong... Remember the soft error
354 	 * in case this connection will not be able to recover.
355 	 */
356 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 		sk->sk_err_soft = EMSGSIZE;
358 
359 	mtu = dst_mtu(dst);
360 
361 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362 	    ip_sk_accept_pmtu(sk) &&
363 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364 		tcp_sync_mss(sk, mtu);
365 
366 		/* Resend the TCP packet because it's
367 		 * clear that the old packet has been
368 		 * dropped. This is the new "fast" path mtu
369 		 * discovery.
370 		 */
371 		tcp_simple_retransmit(sk);
372 	} /* else let the usual retransmit timer handle it */
373 }
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375 
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 {
378 	struct dst_entry *dst = __sk_dst_check(sk, 0);
379 
380 	if (dst)
381 		dst->ops->redirect(dst, sk, skb);
382 }
383 
384 
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 {
388 	struct request_sock *req = inet_reqsk(sk);
389 	struct net *net = sock_net(sk);
390 
391 	/* ICMPs are not backlogged, hence we cannot get
392 	 * an established socket here.
393 	 */
394 	if (seq != tcp_rsk(req)->snt_isn) {
395 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396 	} else if (abort) {
397 		/*
398 		 * Still in SYN_RECV, just remove it silently.
399 		 * There is no good way to pass the error to the newly
400 		 * created socket, and POSIX does not want network
401 		 * errors returned from accept().
402 		 */
403 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404 		tcp_listendrop(req->rsk_listener);
405 	}
406 	reqsk_put(req);
407 }
408 EXPORT_SYMBOL(tcp_req_err);
409 
410 /*
411  * This routine is called by the ICMP module when it gets some
412  * sort of error condition.  If err < 0 then the socket should
413  * be closed and the error returned to the user.  If err > 0
414  * it's just the icmp type << 8 | icmp code.  After adjustment
415  * header points to the first 8 bytes of the tcp header.  We need
416  * to find the appropriate port.
417  *
418  * The locking strategy used here is very "optimistic". When
419  * someone else accesses the socket the ICMP is just dropped
420  * and for some paths there is no check at all.
421  * A more general error queue to queue errors for later handling
422  * is probably better.
423  *
424  */
425 
426 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 {
428 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430 	struct inet_connection_sock *icsk;
431 	struct tcp_sock *tp;
432 	struct inet_sock *inet;
433 	const int type = icmp_hdr(icmp_skb)->type;
434 	const int code = icmp_hdr(icmp_skb)->code;
435 	struct sock *sk;
436 	struct sk_buff *skb;
437 	struct request_sock *fastopen;
438 	u32 seq, snd_una;
439 	s32 remaining;
440 	u32 delta_us;
441 	int err;
442 	struct net *net = dev_net(icmp_skb->dev);
443 
444 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 				       th->dest, iph->saddr, ntohs(th->source),
446 				       inet_iif(icmp_skb), 0);
447 	if (!sk) {
448 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
449 		return;
450 	}
451 	if (sk->sk_state == TCP_TIME_WAIT) {
452 		inet_twsk_put(inet_twsk(sk));
453 		return;
454 	}
455 	seq = ntohl(th->seq);
456 	if (sk->sk_state == TCP_NEW_SYN_RECV)
457 		return tcp_req_err(sk, seq,
458 				  type == ICMP_PARAMETERPROB ||
459 				  type == ICMP_TIME_EXCEEDED ||
460 				  (type == ICMP_DEST_UNREACH &&
461 				   (code == ICMP_NET_UNREACH ||
462 				    code == ICMP_HOST_UNREACH)));
463 
464 	bh_lock_sock(sk);
465 	/* If too many ICMPs get dropped on busy
466 	 * servers this needs to be solved differently.
467 	 * We do take care of PMTU discovery (RFC1191) special case :
468 	 * we can receive locally generated ICMP messages while socket is held.
469 	 */
470 	if (sock_owned_by_user(sk)) {
471 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
472 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
473 	}
474 	if (sk->sk_state == TCP_CLOSE)
475 		goto out;
476 
477 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
478 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
479 		goto out;
480 	}
481 
482 	icsk = inet_csk(sk);
483 	tp = tcp_sk(sk);
484 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
485 	fastopen = tp->fastopen_rsk;
486 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
487 	if (sk->sk_state != TCP_LISTEN &&
488 	    !between(seq, snd_una, tp->snd_nxt)) {
489 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
490 		goto out;
491 	}
492 
493 	switch (type) {
494 	case ICMP_REDIRECT:
495 		if (!sock_owned_by_user(sk))
496 			do_redirect(icmp_skb, sk);
497 		goto out;
498 	case ICMP_SOURCE_QUENCH:
499 		/* Just silently ignore these. */
500 		goto out;
501 	case ICMP_PARAMETERPROB:
502 		err = EPROTO;
503 		break;
504 	case ICMP_DEST_UNREACH:
505 		if (code > NR_ICMP_UNREACH)
506 			goto out;
507 
508 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
509 			/* We are not interested in TCP_LISTEN and open_requests
510 			 * (SYN-ACKs sent out by Linux are always <576 bytes so
511 			 * they should go through unfragmented).
512 			 */
513 			if (sk->sk_state == TCP_LISTEN)
514 				goto out;
515 
516 			tp->mtu_info = info;
517 			if (!sock_owned_by_user(sk)) {
518 				tcp_v4_mtu_reduced(sk);
519 			} else {
520 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
521 					sock_hold(sk);
522 			}
523 			goto out;
524 		}
525 
526 		err = icmp_err_convert[code].errno;
527 		/* check if icmp_skb allows revert of backoff
528 		 * (see draft-zimmermann-tcp-lcd) */
529 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
530 			break;
531 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
532 		    !icsk->icsk_backoff || fastopen)
533 			break;
534 
535 		if (sock_owned_by_user(sk))
536 			break;
537 
538 		icsk->icsk_backoff--;
539 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
540 					       TCP_TIMEOUT_INIT;
541 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
542 
543 		skb = tcp_rtx_queue_head(sk);
544 		BUG_ON(!skb);
545 
546 		tcp_mstamp_refresh(tp);
547 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
548 		remaining = icsk->icsk_rto -
549 			    usecs_to_jiffies(delta_us);
550 
551 		if (remaining > 0) {
552 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
553 						  remaining, TCP_RTO_MAX);
554 		} else {
555 			/* RTO revert clocked out retransmission.
556 			 * Will retransmit now */
557 			tcp_retransmit_timer(sk);
558 		}
559 
560 		break;
561 	case ICMP_TIME_EXCEEDED:
562 		err = EHOSTUNREACH;
563 		break;
564 	default:
565 		goto out;
566 	}
567 
568 	switch (sk->sk_state) {
569 	case TCP_SYN_SENT:
570 	case TCP_SYN_RECV:
571 		/* Only in fast or simultaneous open. If a fast open socket is
572 		 * already accepted it is treated as a connected one below.
573 		 */
574 		if (fastopen && !fastopen->sk)
575 			break;
576 
577 		if (!sock_owned_by_user(sk)) {
578 			sk->sk_err = err;
579 
580 			sk->sk_error_report(sk);
581 
582 			tcp_done(sk);
583 		} else {
584 			sk->sk_err_soft = err;
585 		}
586 		goto out;
587 	}
588 
589 	/* If we've already connected we will keep trying
590 	 * until we time out, or the user gives up.
591 	 *
592 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
593 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
594 	 * but it is obsoleted by pmtu discovery).
595 	 *
596 	 * Note that in the modern internet, where routing is unreliable
597 	 * and broken firewalls sit in every dark corner, sending random
598 	 * errors ordered by their masters, even these two messages finally lose
599 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
600 	 *
601 	 * Now we are in compliance with RFCs.
602 	 *							--ANK (980905)
603 	 */
604 
605 	inet = inet_sk(sk);
606 	if (!sock_owned_by_user(sk) && inet->recverr) {
607 		sk->sk_err = err;
608 		sk->sk_error_report(sk);
609 	} else	{ /* Only an error on timeout */
610 		sk->sk_err_soft = err;
611 	}
612 
613 out:
614 	bh_unlock_sock(sk);
615 	sock_put(sk);
616 }
617 
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 	struct tcphdr *th = tcp_hdr(skb);
621 
622 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 	skb->csum_start = skb_transport_header(skb) - skb->head;
624 	skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626 
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 	const struct inet_sock *inet = inet_sk(sk);
631 
632 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
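
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): __tcp_v4_send_check() above only seeds the pseudo-header
 * sum and points csum_start/csum_offset at the checksum field, leaving the
 * rest to checksum offload. A plain software computation of the full TCP
 * checksum over the IPv4 pseudo-header plus the segment, assuming the
 * addresses are given in host byte order and the checksum field inside 'seg'
 * is already zeroed, could look like this; the result is written into the
 * header with htons().
 */
#if 0
#include <stddef.h>
#include <stdint.h>

#define EXAMPLE_IPPROTO_TCP	6

static uint16_t example_tcp_v4_csum(uint32_t saddr, uint32_t daddr,
				    const uint8_t *seg, size_t len)
{
	uint64_t sum = 0;
	size_t i;

	/* pseudo-header: source, destination, zero, protocol, TCP length */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += EXAMPLE_IPPROTO_TCP;
	sum += len;

	/* TCP header + payload as big-endian 16-bit words, odd byte zero-padded */
	for (i = 0; i + 1 < len; i += 2)
		sum += (seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += seg[len - 1] << 8;

	/* fold the carries and take the one's complement */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif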
635 
636 /*
637  *	This routine will send an RST to the other tcp.
638  *
639  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
640  *		      for the reset?
641  *	Answer: if a packet caused the RST, it is not for a socket
642  *		existing in our system; if it is matched to a socket,
643  *		it is just a duplicate segment or a bug in the other side's TCP.
644  *		So we build the reply based only on parameters
645  *		that arrived with the segment.
646  *	Exception: precedence violation. We do not implement it in any case.
647  */
648 
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 	const struct tcphdr *th = tcp_hdr(skb);
652 	struct {
653 		struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 	} rep;
658 	struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 	struct tcp_md5sig_key *key = NULL;
661 	const __u8 *hash_location = NULL;
662 	unsigned char newhash[16];
663 	int genhash;
664 	struct sock *sk1 = NULL;
665 #endif
666 	struct net *net;
667 	struct sock *ctl_sk;
668 
669 	/* Never send a reset in response to a reset. */
670 	if (th->rst)
671 		return;
672 
673 	/* If sk is not NULL, it means we did a successful lookup and the incoming
674 	 * route had to be correct. prequeue might have dropped our dst.
675 	 */
676 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
677 		return;
678 
679 	/* Swap the send and the receive. */
680 	memset(&rep, 0, sizeof(rep));
681 	rep.th.dest   = th->source;
682 	rep.th.source = th->dest;
683 	rep.th.doff   = sizeof(struct tcphdr) / 4;
684 	rep.th.rst    = 1;
685 
686 	if (th->ack) {
687 		rep.th.seq = th->ack_seq;
688 	} else {
689 		rep.th.ack = 1;
690 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
691 				       skb->len - (th->doff << 2));
692 	}
693 
694 	memset(&arg, 0, sizeof(arg));
695 	arg.iov[0].iov_base = (unsigned char *)&rep;
696 	arg.iov[0].iov_len  = sizeof(rep.th);
697 
698 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
699 #ifdef CONFIG_TCP_MD5SIG
700 	rcu_read_lock();
701 	hash_location = tcp_parse_md5sig_option(th);
702 	if (sk && sk_fullsock(sk)) {
703 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
704 					&ip_hdr(skb)->saddr, AF_INET);
705 	} else if (hash_location) {
706 		/*
707 		 * The active side is lost. Try to find the listening socket through
708 		 * the source port, and then find the md5 key through the listening socket.
709 		 * We do not lose security here:
710 		 * the incoming packet is checked against the md5 hash of the found key,
711 		 * and no RST is generated if the md5 hash doesn't match.
712 		 */
713 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
714 					     ip_hdr(skb)->saddr,
715 					     th->source, ip_hdr(skb)->daddr,
716 					     ntohs(th->source), inet_iif(skb),
717 					     tcp_v4_sdif(skb));
718 		/* don't send rst if it can't find key */
719 		if (!sk1)
720 			goto out;
721 
722 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
723 					&ip_hdr(skb)->saddr, AF_INET);
724 		if (!key)
725 			goto out;
726 
727 
728 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
729 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
730 			goto out;
731 
732 	}
733 
734 	if (key) {
735 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
736 				   (TCPOPT_NOP << 16) |
737 				   (TCPOPT_MD5SIG << 8) |
738 				   TCPOLEN_MD5SIG);
739 		/* Update length and the length the header thinks exists */
740 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
741 		rep.th.doff = arg.iov[0].iov_len / 4;
742 
743 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
744 				     key, ip_hdr(skb)->saddr,
745 				     ip_hdr(skb)->daddr, &rep.th);
746 	}
747 #endif
748 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
749 				      ip_hdr(skb)->saddr, /* XXX */
750 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
751 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
752 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
753 
754 	/* When the socket is gone, all binding information is lost, and
755 	 * routing might fail in this case. No choice here: if we choose to force
756 	 * the input interface, we will misroute in case of asymmetric route.
757 	 */
758 	if (sk) {
759 		arg.bound_dev_if = sk->sk_bound_dev_if;
760 		if (sk_fullsock(sk))
761 			trace_tcp_send_reset(sk, skb);
762 	}
763 
764 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
765 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
766 
767 	arg.tos = ip_hdr(skb)->tos;
768 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
769 	local_bh_disable();
770 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
771 	if (sk)
772 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
773 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
774 	ip_send_unicast_reply(ctl_sk,
775 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
776 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
777 			      &arg, arg.iov[0].iov_len);
778 
779 	ctl_sk->sk_mark = 0;
780 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
781 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
782 	local_bh_enable();
783 
784 #ifdef CONFIG_TCP_MD5SIG
785 out:
786 	rcu_read_unlock();
787 #endif
788 }
789 
790 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
791    outside of socket context, is ugly, certainly. What can I do?
792  */
793 
794 static void tcp_v4_send_ack(const struct sock *sk,
795 			    struct sk_buff *skb, u32 seq, u32 ack,
796 			    u32 win, u32 tsval, u32 tsecr, int oif,
797 			    struct tcp_md5sig_key *key,
798 			    int reply_flags, u8 tos)
799 {
800 	const struct tcphdr *th = tcp_hdr(skb);
801 	struct {
802 		struct tcphdr th;
803 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
804 #ifdef CONFIG_TCP_MD5SIG
805 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
806 #endif
807 			];
808 	} rep;
809 	struct net *net = sock_net(sk);
810 	struct ip_reply_arg arg;
811 	struct sock *ctl_sk;
812 
813 	memset(&rep.th, 0, sizeof(struct tcphdr));
814 	memset(&arg, 0, sizeof(arg));
815 
816 	arg.iov[0].iov_base = (unsigned char *)&rep;
817 	arg.iov[0].iov_len  = sizeof(rep.th);
818 	if (tsecr) {
819 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
820 				   (TCPOPT_TIMESTAMP << 8) |
821 				   TCPOLEN_TIMESTAMP);
822 		rep.opt[1] = htonl(tsval);
823 		rep.opt[2] = htonl(tsecr);
824 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
825 	}
826 
827 	/* Swap the send and the receive. */
828 	rep.th.dest    = th->source;
829 	rep.th.source  = th->dest;
830 	rep.th.doff    = arg.iov[0].iov_len / 4;
831 	rep.th.seq     = htonl(seq);
832 	rep.th.ack_seq = htonl(ack);
833 	rep.th.ack     = 1;
834 	rep.th.window  = htons(win);
835 
836 #ifdef CONFIG_TCP_MD5SIG
837 	if (key) {
838 		int offset = (tsecr) ? 3 : 0;
839 
840 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
841 					  (TCPOPT_NOP << 16) |
842 					  (TCPOPT_MD5SIG << 8) |
843 					  TCPOLEN_MD5SIG);
844 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
845 		rep.th.doff = arg.iov[0].iov_len/4;
846 
847 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
848 				    key, ip_hdr(skb)->saddr,
849 				    ip_hdr(skb)->daddr, &rep.th);
850 	}
851 #endif
852 	arg.flags = reply_flags;
853 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
854 				      ip_hdr(skb)->saddr, /* XXX */
855 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
856 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
857 	if (oif)
858 		arg.bound_dev_if = oif;
859 	arg.tos = tos;
860 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
861 	local_bh_disable();
862 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
863 	if (sk)
864 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
865 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
866 	ip_send_unicast_reply(ctl_sk,
867 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
868 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
869 			      &arg, arg.iov[0].iov_len);
870 
871 	ctl_sk->sk_mark = 0;
872 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
873 	local_bh_enable();
874 }
875 
876 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
877 {
878 	struct inet_timewait_sock *tw = inet_twsk(sk);
879 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
880 
881 	tcp_v4_send_ack(sk, skb,
882 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
883 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
884 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
885 			tcptw->tw_ts_recent,
886 			tw->tw_bound_dev_if,
887 			tcp_twsk_md5_key(tcptw),
888 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
889 			tw->tw_tos
890 			);
891 
892 	inet_twsk_put(tw);
893 }
894 
895 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
896 				  struct request_sock *req)
897 {
898 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
899 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
900 	 */
901 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
902 					     tcp_sk(sk)->snd_nxt;
903 
904 	/* RFC 7323 2.3
905 	 * The window field (SEG.WND) of every outgoing segment, with the
906 	 * exception of <SYN> segments, MUST be right-shifted by
907 	 * Rcv.Wind.Shift bits:
908 	 */
909 	tcp_v4_send_ack(sk, skb, seq,
910 			tcp_rsk(req)->rcv_nxt,
911 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
912 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
913 			req->ts_recent,
914 			0,
915 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
916 					  AF_INET),
917 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
918 			ip_hdr(skb)->tos);
919 }
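
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): the RFC 7323 rule applied above. With a receive window
 * of 1048576 bytes and Rcv.Wind.Shift == 7, the 16-bit window field carries
 * 1048576 >> 7 == 8192, and the peer multiplies it back up by the same shift.
 */
#if 0
#include <stdint.h>

/* what goes into the window field of every non-SYN segment we send */
static uint16_t example_advertised_window(uint32_t rcv_wnd, uint8_t rcv_wscale)
{
	return (uint16_t)(rcv_wnd >> rcv_wscale);
}

/* what the peer reconstructs from that field */
static uint32_t example_effective_window(uint16_t seg_wnd, uint8_t snd_wscale)
{
	return (uint32_t)seg_wnd << snd_wscale;
}
#endif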
920 
921 /*
922  *	Send a SYN-ACK after having received a SYN.
923  *	This still operates on a request_sock only, not on a big
924  *	socket.
925  */
926 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
927 			      struct flowi *fl,
928 			      struct request_sock *req,
929 			      struct tcp_fastopen_cookie *foc,
930 			      enum tcp_synack_type synack_type)
931 {
932 	const struct inet_request_sock *ireq = inet_rsk(req);
933 	struct flowi4 fl4;
934 	int err = -1;
935 	struct sk_buff *skb;
936 
937 	/* First, grab a route. */
938 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
939 		return -1;
940 
941 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
942 
943 	if (skb) {
944 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
945 
946 		rcu_read_lock();
947 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
948 					    ireq->ir_rmt_addr,
949 					    rcu_dereference(ireq->ireq_opt));
950 		rcu_read_unlock();
951 		err = net_xmit_eval(err);
952 	}
953 
954 	return err;
955 }
956 
957 /*
958  *	IPv4 request_sock destructor.
959  */
960 static void tcp_v4_reqsk_destructor(struct request_sock *req)
961 {
962 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
963 }
964 
965 #ifdef CONFIG_TCP_MD5SIG
966 /*
967  * RFC2385 MD5 checksumming requires a mapping of
968  * IP address->MD5 Key.
969  * We need to maintain these in the sk structure.
970  */
971 
972 /* Find the Key structure for an address.  */
973 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
974 					 const union tcp_md5_addr *addr,
975 					 int family)
976 {
977 	const struct tcp_sock *tp = tcp_sk(sk);
978 	struct tcp_md5sig_key *key;
979 	const struct tcp_md5sig_info *md5sig;
980 	__be32 mask;
981 	struct tcp_md5sig_key *best_match = NULL;
982 	bool match;
983 
984 	/* caller either holds rcu_read_lock() or socket lock */
985 	md5sig = rcu_dereference_check(tp->md5sig_info,
986 				       lockdep_sock_is_held(sk));
987 	if (!md5sig)
988 		return NULL;
989 
990 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
991 		if (key->family != family)
992 			continue;
993 
994 		if (family == AF_INET) {
995 			mask = inet_make_mask(key->prefixlen);
996 			match = (key->addr.a4.s_addr & mask) ==
997 				(addr->a4.s_addr & mask);
998 #if IS_ENABLED(CONFIG_IPV6)
999 		} else if (family == AF_INET6) {
1000 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1001 						  key->prefixlen);
1002 #endif
1003 		} else {
1004 			match = false;
1005 		}
1006 
1007 		if (match && (!best_match ||
1008 			      key->prefixlen > best_match->prefixlen))
1009 			best_match = key;
1010 	}
1011 	return best_match;
1012 }
1013 EXPORT_SYMBOL(tcp_md5_do_lookup);
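
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): the IPv4 prefix comparison performed above, where
 * inet_make_mask(prefixlen) builds the netmask. A userspace equivalent,
 * assuming both addresses are given in host byte order, could look like this.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool example_prefix_match(uint32_t key_addr, uint32_t addr,
				 uint8_t prefixlen)
{
	/* prefixlen == 0 matches everything; avoid the undefined 32-bit shift */
	uint32_t mask = prefixlen ? ~0u << (32 - prefixlen) : 0;

	return (key_addr & mask) == (addr & mask);
}
#endif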
1014 
1015 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1016 						      const union tcp_md5_addr *addr,
1017 						      int family, u8 prefixlen)
1018 {
1019 	const struct tcp_sock *tp = tcp_sk(sk);
1020 	struct tcp_md5sig_key *key;
1021 	unsigned int size = sizeof(struct in_addr);
1022 	const struct tcp_md5sig_info *md5sig;
1023 
1024 	/* caller either holds rcu_read_lock() or socket lock */
1025 	md5sig = rcu_dereference_check(tp->md5sig_info,
1026 				       lockdep_sock_is_held(sk));
1027 	if (!md5sig)
1028 		return NULL;
1029 #if IS_ENABLED(CONFIG_IPV6)
1030 	if (family == AF_INET6)
1031 		size = sizeof(struct in6_addr);
1032 #endif
1033 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1034 		if (key->family != family)
1035 			continue;
1036 		if (!memcmp(&key->addr, addr, size) &&
1037 		    key->prefixlen == prefixlen)
1038 			return key;
1039 	}
1040 	return NULL;
1041 }
1042 
1043 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1044 					 const struct sock *addr_sk)
1045 {
1046 	const union tcp_md5_addr *addr;
1047 
1048 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1049 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1050 }
1051 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1052 
1053 /* This can be called on a newly created socket, from other files */
1054 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1055 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1056 		   gfp_t gfp)
1057 {
1058 	/* Add Key to the list */
1059 	struct tcp_md5sig_key *key;
1060 	struct tcp_sock *tp = tcp_sk(sk);
1061 	struct tcp_md5sig_info *md5sig;
1062 
1063 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1064 	if (key) {
1065 		/* Pre-existing entry - just update that one. */
1066 		memcpy(key->key, newkey, newkeylen);
1067 		key->keylen = newkeylen;
1068 		return 0;
1069 	}
1070 
1071 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1072 					   lockdep_sock_is_held(sk));
1073 	if (!md5sig) {
1074 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1075 		if (!md5sig)
1076 			return -ENOMEM;
1077 
1078 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1079 		INIT_HLIST_HEAD(&md5sig->head);
1080 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1081 	}
1082 
1083 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1084 	if (!key)
1085 		return -ENOMEM;
1086 	if (!tcp_alloc_md5sig_pool()) {
1087 		sock_kfree_s(sk, key, sizeof(*key));
1088 		return -ENOMEM;
1089 	}
1090 
1091 	memcpy(key->key, newkey, newkeylen);
1092 	key->keylen = newkeylen;
1093 	key->family = family;
1094 	key->prefixlen = prefixlen;
1095 	memcpy(&key->addr, addr,
1096 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1097 				      sizeof(struct in_addr));
1098 	hlist_add_head_rcu(&key->node, &md5sig->head);
1099 	return 0;
1100 }
1101 EXPORT_SYMBOL(tcp_md5_do_add);
1102 
1103 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1104 		   u8 prefixlen)
1105 {
1106 	struct tcp_md5sig_key *key;
1107 
1108 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1109 	if (!key)
1110 		return -ENOENT;
1111 	hlist_del_rcu(&key->node);
1112 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1113 	kfree_rcu(key, rcu);
1114 	return 0;
1115 }
1116 EXPORT_SYMBOL(tcp_md5_do_del);
1117 
1118 static void tcp_clear_md5_list(struct sock *sk)
1119 {
1120 	struct tcp_sock *tp = tcp_sk(sk);
1121 	struct tcp_md5sig_key *key;
1122 	struct hlist_node *n;
1123 	struct tcp_md5sig_info *md5sig;
1124 
1125 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1126 
1127 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1128 		hlist_del_rcu(&key->node);
1129 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1130 		kfree_rcu(key, rcu);
1131 	}
1132 }
1133 
1134 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1135 				 char __user *optval, int optlen)
1136 {
1137 	struct tcp_md5sig cmd;
1138 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1139 	u8 prefixlen = 32;
1140 
1141 	if (optlen < sizeof(cmd))
1142 		return -EINVAL;
1143 
1144 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1145 		return -EFAULT;
1146 
1147 	if (sin->sin_family != AF_INET)
1148 		return -EINVAL;
1149 
1150 	if (optname == TCP_MD5SIG_EXT &&
1151 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1152 		prefixlen = cmd.tcpm_prefixlen;
1153 		if (prefixlen > 32)
1154 			return -EINVAL;
1155 	}
1156 
1157 	if (!cmd.tcpm_keylen)
1158 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1159 				      AF_INET, prefixlen);
1160 
1161 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1162 		return -EINVAL;
1163 
1164 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1165 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1166 			      GFP_KERNEL);
1167 }
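
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): the userspace side of the TCP_MD5SIG socket option
 * parsed above, assuming the UAPI definitions from <linux/tcp.h>. Attaching a
 * key for a peer before connect()/listen() makes segments to/from that peer
 * carry an RFC 2385 MD5 option; a zero tcpm_keylen would delete the key
 * instead.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int example_set_tcp_md5(int fd, const struct sockaddr_in *peer,
			       const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));	/* AF_INET peer address */
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif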
1168 
1169 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1170 				   __be32 daddr, __be32 saddr,
1171 				   const struct tcphdr *th, int nbytes)
1172 {
1173 	struct tcp4_pseudohdr *bp;
1174 	struct scatterlist sg;
1175 	struct tcphdr *_th;
1176 
1177 	bp = hp->scratch;
1178 	bp->saddr = saddr;
1179 	bp->daddr = daddr;
1180 	bp->pad = 0;
1181 	bp->protocol = IPPROTO_TCP;
1182 	bp->len = cpu_to_be16(nbytes);
1183 
1184 	_th = (struct tcphdr *)(bp + 1);
1185 	memcpy(_th, th, sizeof(*th));
1186 	_th->check = 0;
1187 
1188 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1189 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1190 				sizeof(*bp) + sizeof(*th));
1191 	return crypto_ahash_update(hp->md5_req);
1192 }
1193 
1194 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1195 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1196 {
1197 	struct tcp_md5sig_pool *hp;
1198 	struct ahash_request *req;
1199 
1200 	hp = tcp_get_md5sig_pool();
1201 	if (!hp)
1202 		goto clear_hash_noput;
1203 	req = hp->md5_req;
1204 
1205 	if (crypto_ahash_init(req))
1206 		goto clear_hash;
1207 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1208 		goto clear_hash;
1209 	if (tcp_md5_hash_key(hp, key))
1210 		goto clear_hash;
1211 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1212 	if (crypto_ahash_final(req))
1213 		goto clear_hash;
1214 
1215 	tcp_put_md5sig_pool();
1216 	return 0;
1217 
1218 clear_hash:
1219 	tcp_put_md5sig_pool();
1220 clear_hash_noput:
1221 	memset(md5_hash, 0, 16);
1222 	return 1;
1223 }
1224 
1225 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1226 			const struct sock *sk,
1227 			const struct sk_buff *skb)
1228 {
1229 	struct tcp_md5sig_pool *hp;
1230 	struct ahash_request *req;
1231 	const struct tcphdr *th = tcp_hdr(skb);
1232 	__be32 saddr, daddr;
1233 
1234 	if (sk) { /* valid for establish/request sockets */
1235 		saddr = sk->sk_rcv_saddr;
1236 		daddr = sk->sk_daddr;
1237 	} else {
1238 		const struct iphdr *iph = ip_hdr(skb);
1239 		saddr = iph->saddr;
1240 		daddr = iph->daddr;
1241 	}
1242 
1243 	hp = tcp_get_md5sig_pool();
1244 	if (!hp)
1245 		goto clear_hash_noput;
1246 	req = hp->md5_req;
1247 
1248 	if (crypto_ahash_init(req))
1249 		goto clear_hash;
1250 
1251 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1252 		goto clear_hash;
1253 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1254 		goto clear_hash;
1255 	if (tcp_md5_hash_key(hp, key))
1256 		goto clear_hash;
1257 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1258 	if (crypto_ahash_final(req))
1259 		goto clear_hash;
1260 
1261 	tcp_put_md5sig_pool();
1262 	return 0;
1263 
1264 clear_hash:
1265 	tcp_put_md5sig_pool();
1266 clear_hash_noput:
1267 	memset(md5_hash, 0, 16);
1268 	return 1;
1269 }
1270 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1271 
1272 #endif
1273 
1274 /* Called with rcu_read_lock() */
1275 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1276 				    const struct sk_buff *skb)
1277 {
1278 #ifdef CONFIG_TCP_MD5SIG
1279 	/*
1280 	 * This gets called for each TCP segment that arrives
1281 	 * so we want to be efficient.
1282 	 * We have 3 drop cases:
1283 	 * o No MD5 hash and one expected.
1284 	 * o MD5 hash and we're not expecting one.
1285 	 * o MD5 hash and it's wrong.
1286 	 */
1287 	const __u8 *hash_location = NULL;
1288 	struct tcp_md5sig_key *hash_expected;
1289 	const struct iphdr *iph = ip_hdr(skb);
1290 	const struct tcphdr *th = tcp_hdr(skb);
1291 	int genhash;
1292 	unsigned char newhash[16];
1293 
1294 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1295 					  AF_INET);
1296 	hash_location = tcp_parse_md5sig_option(th);
1297 
1298 	/* We've parsed the options - do we have a hash? */
1299 	if (!hash_expected && !hash_location)
1300 		return false;
1301 
1302 	if (hash_expected && !hash_location) {
1303 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1304 		return true;
1305 	}
1306 
1307 	if (!hash_expected && hash_location) {
1308 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1309 		return true;
1310 	}
1311 
1312 	/* Okay, so this is hash_expected and hash_location -
1313 	 * so we need to calculate the checksum.
1314 	 */
1315 	genhash = tcp_v4_md5_hash_skb(newhash,
1316 				      hash_expected,
1317 				      NULL, skb);
1318 
1319 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1320 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1321 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1322 				     &iph->saddr, ntohs(th->source),
1323 				     &iph->daddr, ntohs(th->dest),
1324 				     genhash ? " tcp_v4_calc_md5_hash failed"
1325 				     : "");
1326 		return true;
1327 	}
1328 	return false;
1329 #endif
1330 	return false;
1331 }
1332 
1333 static void tcp_v4_init_req(struct request_sock *req,
1334 			    const struct sock *sk_listener,
1335 			    struct sk_buff *skb)
1336 {
1337 	struct inet_request_sock *ireq = inet_rsk(req);
1338 	struct net *net = sock_net(sk_listener);
1339 
1340 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1341 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1342 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1343 }
1344 
1345 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1346 					  struct flowi *fl,
1347 					  const struct request_sock *req)
1348 {
1349 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1350 }
1351 
1352 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1353 	.family		=	PF_INET,
1354 	.obj_size	=	sizeof(struct tcp_request_sock),
1355 	.rtx_syn_ack	=	tcp_rtx_synack,
1356 	.send_ack	=	tcp_v4_reqsk_send_ack,
1357 	.destructor	=	tcp_v4_reqsk_destructor,
1358 	.send_reset	=	tcp_v4_send_reset,
1359 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1360 };
1361 
1362 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1363 	.mss_clamp	=	TCP_MSS_DEFAULT,
1364 #ifdef CONFIG_TCP_MD5SIG
1365 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1366 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1367 #endif
1368 	.init_req	=	tcp_v4_init_req,
1369 #ifdef CONFIG_SYN_COOKIES
1370 	.cookie_init_seq =	cookie_v4_init_sequence,
1371 #endif
1372 	.route_req	=	tcp_v4_route_req,
1373 	.init_seq	=	tcp_v4_init_seq,
1374 	.init_ts_off	=	tcp_v4_init_ts_off,
1375 	.send_synack	=	tcp_v4_send_synack,
1376 };
1377 
1378 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1379 {
1380 	/* Never answer SYNs sent to broadcast or multicast */
1381 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1382 		goto drop;
1383 
1384 	return tcp_conn_request(&tcp_request_sock_ops,
1385 				&tcp_request_sock_ipv4_ops, sk, skb);
1386 
1387 drop:
1388 	tcp_listendrop(sk);
1389 	return 0;
1390 }
1391 EXPORT_SYMBOL(tcp_v4_conn_request);
1392 
1393 
1394 /*
1395  * The three way handshake has completed - we got a valid synack -
1396  * now create the new socket.
1397  */
1398 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1399 				  struct request_sock *req,
1400 				  struct dst_entry *dst,
1401 				  struct request_sock *req_unhash,
1402 				  bool *own_req)
1403 {
1404 	struct inet_request_sock *ireq;
1405 	struct inet_sock *newinet;
1406 	struct tcp_sock *newtp;
1407 	struct sock *newsk;
1408 #ifdef CONFIG_TCP_MD5SIG
1409 	struct tcp_md5sig_key *key;
1410 #endif
1411 	struct ip_options_rcu *inet_opt;
1412 
1413 	if (sk_acceptq_is_full(sk))
1414 		goto exit_overflow;
1415 
1416 	newsk = tcp_create_openreq_child(sk, req, skb);
1417 	if (!newsk)
1418 		goto exit_nonewsk;
1419 
1420 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1421 	inet_sk_rx_dst_set(newsk, skb);
1422 
1423 	newtp		      = tcp_sk(newsk);
1424 	newinet		      = inet_sk(newsk);
1425 	ireq		      = inet_rsk(req);
1426 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1427 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1428 	newsk->sk_bound_dev_if = ireq->ir_iif;
1429 	newinet->inet_saddr   = ireq->ir_loc_addr;
1430 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1431 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1432 	newinet->mc_index     = inet_iif(skb);
1433 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1434 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1435 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1436 	if (inet_opt)
1437 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1438 	newinet->inet_id = newtp->write_seq ^ jiffies;
1439 
1440 	if (!dst) {
1441 		dst = inet_csk_route_child_sock(sk, newsk, req);
1442 		if (!dst)
1443 			goto put_and_exit;
1444 	} else {
1445 		/* syncookie case : see end of cookie_v4_check() */
1446 	}
1447 	sk_setup_caps(newsk, dst);
1448 
1449 	tcp_ca_openreq_child(newsk, dst);
1450 
1451 	tcp_sync_mss(newsk, dst_mtu(dst));
1452 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1453 
1454 	tcp_initialize_rcv_mss(newsk);
1455 
1456 #ifdef CONFIG_TCP_MD5SIG
1457 	/* Copy over the MD5 key from the original socket */
1458 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1459 				AF_INET);
1460 	if (key) {
1461 		/*
1462 		 * We're using one, so create a matching key
1463 		 * on the newsk structure. If we fail to get
1464 		 * memory, then we end up not copying the key
1465 		 * across. Shucks.
1466 		 */
1467 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1468 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1469 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1470 	}
1471 #endif
1472 
1473 	if (__inet_inherit_port(sk, newsk) < 0)
1474 		goto put_and_exit;
1475 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1476 	if (likely(*own_req)) {
1477 		tcp_move_syn(newtp, req);
1478 		ireq->ireq_opt = NULL;
1479 	} else {
1480 		newinet->inet_opt = NULL;
1481 	}
1482 	return newsk;
1483 
1484 exit_overflow:
1485 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1486 exit_nonewsk:
1487 	dst_release(dst);
1488 exit:
1489 	tcp_listendrop(sk);
1490 	return NULL;
1491 put_and_exit:
1492 	newinet->inet_opt = NULL;
1493 	inet_csk_prepare_forced_close(newsk);
1494 	tcp_done(newsk);
1495 	goto exit;
1496 }
1497 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1498 
1499 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1500 {
1501 #ifdef CONFIG_SYN_COOKIES
1502 	const struct tcphdr *th = tcp_hdr(skb);
1503 
1504 	if (!th->syn)
1505 		sk = cookie_v4_check(sk, skb);
1506 #endif
1507 	return sk;
1508 }
1509 
1510 /* The socket must have its spinlock held when we get
1511  * here, unless it is a TCP_LISTEN socket.
1512  *
1513  * We have a potential double-lock case here, so even when
1514  * doing backlog processing we use the BH locking scheme.
1515  * This is because we cannot sleep with the original spinlock
1516  * held.
1517  */
1518 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1519 {
1520 	struct sock *rsk;
1521 
1522 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1523 		struct dst_entry *dst = sk->sk_rx_dst;
1524 
1525 		sock_rps_save_rxhash(sk, skb);
1526 		sk_mark_napi_id(sk, skb);
1527 		if (dst) {
1528 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1529 			    !dst->ops->check(dst, 0)) {
1530 				dst_release(dst);
1531 				sk->sk_rx_dst = NULL;
1532 			}
1533 		}
1534 		tcp_rcv_established(sk, skb);
1535 		return 0;
1536 	}
1537 
1538 	if (tcp_checksum_complete(skb))
1539 		goto csum_err;
1540 
1541 	if (sk->sk_state == TCP_LISTEN) {
1542 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1543 
1544 		if (!nsk)
1545 			goto discard;
1546 		if (nsk != sk) {
1547 			if (tcp_child_process(sk, nsk, skb)) {
1548 				rsk = nsk;
1549 				goto reset;
1550 			}
1551 			return 0;
1552 		}
1553 	} else
1554 		sock_rps_save_rxhash(sk, skb);
1555 
1556 	if (tcp_rcv_state_process(sk, skb)) {
1557 		rsk = sk;
1558 		goto reset;
1559 	}
1560 	return 0;
1561 
1562 reset:
1563 	tcp_v4_send_reset(rsk, skb);
1564 discard:
1565 	kfree_skb(skb);
1566 	/* Be careful here. If this function gets more complicated and
1567 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1568 	 * might be destroyed here. This current version compiles correctly,
1569 	 * but you have been warned.
1570 	 */
1571 	return 0;
1572 
1573 csum_err:
1574 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1575 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1576 	goto discard;
1577 }
1578 EXPORT_SYMBOL(tcp_v4_do_rcv);
1579 
1580 int tcp_v4_early_demux(struct sk_buff *skb)
1581 {
1582 	const struct iphdr *iph;
1583 	const struct tcphdr *th;
1584 	struct sock *sk;
1585 
1586 	if (skb->pkt_type != PACKET_HOST)
1587 		return 0;
1588 
1589 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1590 		return 0;
1591 
1592 	iph = ip_hdr(skb);
1593 	th = tcp_hdr(skb);
1594 
1595 	if (th->doff < sizeof(struct tcphdr) / 4)
1596 		return 0;
1597 
1598 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1599 				       iph->saddr, th->source,
1600 				       iph->daddr, ntohs(th->dest),
1601 				       skb->skb_iif, inet_sdif(skb));
1602 	if (sk) {
1603 		skb->sk = sk;
1604 		skb->destructor = sock_edemux;
1605 		if (sk_fullsock(sk)) {
1606 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1607 
1608 			if (dst)
1609 				dst = dst_check(dst, 0);
1610 			if (dst &&
1611 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1612 				skb_dst_set_noref(skb, dst);
1613 		}
1614 	}
1615 	return 0;
1616 }
1617 
1618 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1619 {
1620 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1621 
1622 	/* Only socket owner can try to collapse/prune rx queues
1623 	 * to reduce memory overhead, so add a little headroom here.
1624 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1625 	 */
1626 	limit += 64*1024;
1627 
1628 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1629 	 * we can fix skb->truesize to its real value to avoid future drops.
1630 	 * This is valid because skb is not yet charged to the socket.
1631 	 * It has been noticed pure SACK packets were sometimes dropped
1632 	 * (if cooked by drivers without copybreak feature).
1633 	 */
1634 	skb_condense(skb);
1635 
1636 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1637 		bh_unlock_sock(sk);
1638 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1639 		return true;
1640 	}
1641 	return false;
1642 }
1643 EXPORT_SYMBOL(tcp_add_backlog);
1644 
1645 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1646 {
1647 	struct tcphdr *th = (struct tcphdr *)skb->data;
1648 	unsigned int eaten = skb->len;
1649 	int err;
1650 
1651 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1652 	if (!err) {
1653 		eaten -= skb->len;
1654 		TCP_SKB_CB(skb)->end_seq -= eaten;
1655 	}
1656 	return err;
1657 }
1658 EXPORT_SYMBOL(tcp_filter);
1659 
1660 static void tcp_v4_restore_cb(struct sk_buff *skb)
1661 {
1662 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1663 		sizeof(struct inet_skb_parm));
1664 }
1665 
1666 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1667 			   const struct tcphdr *th)
1668 {
1669 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1670 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1671 	 */
1672 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1673 		sizeof(struct inet_skb_parm));
1674 	barrier();
1675 
1676 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1677 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1678 				    skb->len - th->doff * 4);
1679 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1680 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1681 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1682 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1683 	TCP_SKB_CB(skb)->sacked	 = 0;
1684 	TCP_SKB_CB(skb)->has_rxtstamp =
1685 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1686 }
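
/*
 * Illustrative sketch (not part of the original file, kept under "#if 0" so it
 * is never compiled): the end_seq arithmetic above. SYN and FIN each consume
 * one unit of sequence space, so a data-less SYN has end_seq == seq + 1 while
 * a pure ACK has end_seq == seq.
 */
#if 0
#include <stdint.h>

static uint32_t example_end_seq(uint32_t seq, int syn, int fin,
				uint32_t payload_len)
{
	return seq + syn + fin + payload_len;
}
#endif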
1687 
1688 /*
1689  *	From tcp_input.c
1690  */
1691 
1692 int tcp_v4_rcv(struct sk_buff *skb)
1693 {
1694 	struct net *net = dev_net(skb->dev);
1695 	int sdif = inet_sdif(skb);
1696 	const struct iphdr *iph;
1697 	const struct tcphdr *th;
1698 	bool refcounted;
1699 	struct sock *sk;
1700 	int ret;
1701 
1702 	if (skb->pkt_type != PACKET_HOST)
1703 		goto discard_it;
1704 
1705 	/* Count it even if it's bad */
1706 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1707 
1708 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1709 		goto discard_it;
1710 
1711 	th = (const struct tcphdr *)skb->data;
1712 
1713 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1714 		goto bad_packet;
1715 	if (!pskb_may_pull(skb, th->doff * 4))
1716 		goto discard_it;
1717 
1718 	/* An explanation is required here, I think.
1719 	 * Packet length and doff are validated by header prediction,
1720 	 * provided the case of th->doff==0 is eliminated.
1721 	 * So, we defer the checks. */
1722 
1723 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1724 		goto csum_error;
1725 
1726 	th = (const struct tcphdr *)skb->data;
1727 	iph = ip_hdr(skb);
1728 lookup:
1729 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1730 			       th->dest, sdif, &refcounted);
1731 	if (!sk)
1732 		goto no_tcp_socket;
1733 
1734 process:
1735 	if (sk->sk_state == TCP_TIME_WAIT)
1736 		goto do_time_wait;
1737 
1738 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1739 		struct request_sock *req = inet_reqsk(sk);
1740 		bool req_stolen = false;
1741 		struct sock *nsk;
1742 
1743 		sk = req->rsk_listener;
1744 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1745 			sk_drops_add(sk, skb);
1746 			reqsk_put(req);
1747 			goto discard_it;
1748 		}
1749 		if (tcp_checksum_complete(skb)) {
1750 			reqsk_put(req);
1751 			goto csum_error;
1752 		}
1753 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1754 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1755 			goto lookup;
1756 		}
1757 		/* We own a reference on the listener, increase it again
1758 		 * as we might lose it too soon.
1759 		 */
1760 		sock_hold(sk);
1761 		refcounted = true;
1762 		nsk = NULL;
1763 		if (!tcp_filter(sk, skb)) {
1764 			th = (const struct tcphdr *)skb->data;
1765 			iph = ip_hdr(skb);
1766 			tcp_v4_fill_cb(skb, iph, th);
1767 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1768 		}
1769 		if (!nsk) {
1770 			reqsk_put(req);
1771 			if (req_stolen) {
1772 			/* Another cpu got exclusive access to req
1773 			 * and created a full-blown socket.
1774 			 * Try to feed this packet to that socket
1775 			 * instead of discarding it.
1776 			 */
1777 				tcp_v4_restore_cb(skb);
1778 				sock_put(sk);
1779 				goto lookup;
1780 			}
1781 			goto discard_and_relse;
1782 		}
1783 		if (nsk == sk) {
1784 			reqsk_put(req);
1785 			tcp_v4_restore_cb(skb);
1786 		} else if (tcp_child_process(sk, nsk, skb)) {
1787 			tcp_v4_send_reset(nsk, skb);
1788 			goto discard_and_relse;
1789 		} else {
1790 			sock_put(sk);
1791 			return 0;
1792 		}
1793 	}
1794 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1795 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1796 		goto discard_and_relse;
1797 	}
1798 
1799 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1800 		goto discard_and_relse;
1801 
1802 	if (tcp_v4_inbound_md5_hash(sk, skb))
1803 		goto discard_and_relse;
1804 
1805 	nf_reset(skb);
1806 
1807 	if (tcp_filter(sk, skb))
1808 		goto discard_and_relse;
1809 	th = (const struct tcphdr *)skb->data;
1810 	iph = ip_hdr(skb);
1811 	tcp_v4_fill_cb(skb, iph, th);
1812 
1813 	skb->dev = NULL;
1814 
1815 	if (sk->sk_state == TCP_LISTEN) {
1816 		ret = tcp_v4_do_rcv(sk, skb);
1817 		goto put_and_return;
1818 	}
1819 
1820 	sk_incoming_cpu_update(sk);
1821 
1822 	bh_lock_sock_nested(sk);
1823 	tcp_segs_in(tcp_sk(sk), skb);
1824 	ret = 0;
1825 	if (!sock_owned_by_user(sk)) {
1826 		ret = tcp_v4_do_rcv(sk, skb);
1827 	} else if (tcp_add_backlog(sk, skb)) {
1828 		goto discard_and_relse;
1829 	}
1830 	bh_unlock_sock(sk);
1831 
1832 put_and_return:
1833 	if (refcounted)
1834 		sock_put(sk);
1835 
1836 	return ret;
1837 
1838 no_tcp_socket:
1839 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1840 		goto discard_it;
1841 
1842 	tcp_v4_fill_cb(skb, iph, th);
1843 
1844 	if (tcp_checksum_complete(skb)) {
1845 csum_error:
1846 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1847 bad_packet:
1848 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1849 	} else {
1850 		tcp_v4_send_reset(NULL, skb);
1851 	}
1852 
1853 discard_it:
1854 	/* Discard frame. */
1855 	kfree_skb(skb);
1856 	return 0;
1857 
1858 discard_and_relse:
1859 	sk_drops_add(sk, skb);
1860 	if (refcounted)
1861 		sock_put(sk);
1862 	goto discard_it;
1863 
1864 do_time_wait:
1865 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1866 		inet_twsk_put(inet_twsk(sk));
1867 		goto discard_it;
1868 	}
1869 
1870 	tcp_v4_fill_cb(skb, iph, th);
1871 
1872 	if (tcp_checksum_complete(skb)) {
1873 		inet_twsk_put(inet_twsk(sk));
1874 		goto csum_error;
1875 	}
1876 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1877 	case TCP_TW_SYN: {
1878 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1879 							&tcp_hashinfo, skb,
1880 							__tcp_hdrlen(th),
1881 							iph->saddr, th->source,
1882 							iph->daddr, th->dest,
1883 							inet_iif(skb),
1884 							sdif);
1885 		if (sk2) {
1886 			inet_twsk_deschedule_put(inet_twsk(sk));
1887 			sk = sk2;
1888 			tcp_v4_restore_cb(skb);
1889 			refcounted = false;
1890 			goto process;
1891 		}
1892 	}
1893 		/* to ACK */
1894 		/* fall through */
1895 	case TCP_TW_ACK:
1896 		tcp_v4_timewait_ack(sk, skb);
1897 		break;
1898 	case TCP_TW_RST:
1899 		tcp_v4_send_reset(sk, skb);
1900 		inet_twsk_deschedule_put(inet_twsk(sk));
1901 		goto discard_it;
1902 	case TCP_TW_SUCCESS:;
1903 	}
1904 	goto discard_it;
1905 }
1906 
1907 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1908 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1909 	.twsk_unique	= tcp_twsk_unique,
1910 	.twsk_destructor= tcp_twsk_destructor,
1911 };
1912 
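/* Cache the input route and the incoming interface on the socket, so the
 * receive fast path can reuse the dst without a fresh route lookup while
 * rx_dst_ifindex still matches the interface the packet arrived on.
 */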
1913 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1914 {
1915 	struct dst_entry *dst = skb_dst(skb);
1916 
1917 	if (dst && dst_hold_safe(dst)) {
1918 		sk->sk_rx_dst = dst;
1919 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1920 	}
1921 }
1922 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1923 
1924 const struct inet_connection_sock_af_ops ipv4_specific = {
1925 	.queue_xmit	   = ip_queue_xmit,
1926 	.send_check	   = tcp_v4_send_check,
1927 	.rebuild_header	   = inet_sk_rebuild_header,
1928 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1929 	.conn_request	   = tcp_v4_conn_request,
1930 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1931 	.net_header_len	   = sizeof(struct iphdr),
1932 	.setsockopt	   = ip_setsockopt,
1933 	.getsockopt	   = ip_getsockopt,
1934 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1935 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1936 #ifdef CONFIG_COMPAT
1937 	.compat_setsockopt = compat_ip_setsockopt,
1938 	.compat_getsockopt = compat_ip_getsockopt,
1939 #endif
1940 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1941 };
1942 EXPORT_SYMBOL(ipv4_specific);
1943 
1944 #ifdef CONFIG_TCP_MD5SIG
1945 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1946 	.md5_lookup		= tcp_v4_md5_lookup,
1947 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1948 	.md5_parse		= tcp_v4_parse_md5_keys,
1949 };
1950 #endif
1951 
1952 /* NOTE: A lot of fields are set to zero explicitly by the call to
1953  *       sk_alloc(), so they need not be initialized here.
1954  */
1955 static int tcp_v4_init_sock(struct sock *sk)
1956 {
1957 	struct inet_connection_sock *icsk = inet_csk(sk);
1958 
1959 	tcp_init_sock(sk);
1960 
1961 	icsk->icsk_af_ops = &ipv4_specific;
1962 
1963 #ifdef CONFIG_TCP_MD5SIG
1964 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1965 #endif
1966 
1967 	return 0;
1968 }
1969 
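/* Release all protocol-private state attached to the socket: timers,
 * congestion control, ULP, write and out-of-order queues, MD5 keys,
 * the bound port and any TCP Fast Open context.
 */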
1970 void tcp_v4_destroy_sock(struct sock *sk)
1971 {
1972 	struct tcp_sock *tp = tcp_sk(sk);
1973 
1974 	trace_tcp_destroy_sock(sk);
1975 
1976 	tcp_clear_xmit_timers(sk);
1977 
1978 	tcp_cleanup_congestion_control(sk);
1979 
1980 	tcp_cleanup_ulp(sk);
1981 
1982 	/* Clean up the write buffer. */
1983 	tcp_write_queue_purge(sk);
1984 
1985 	/* Check if we want to disable active TFO */
1986 	tcp_fastopen_active_disable_ofo_check(sk);
1987 
1988 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1989 	skb_rbtree_purge(&tp->out_of_order_queue);
1990 
1991 #ifdef CONFIG_TCP_MD5SIG
1992 	/* Clean up the MD5 key list, if any */
1993 	if (tp->md5sig_info) {
1994 		tcp_clear_md5_list(sk);
1995 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1996 		tp->md5sig_info = NULL;
1997 	}
1998 #endif
1999 
2000 	/* Clean up a referenced TCP bind bucket. */
2001 	if (inet_csk(sk)->icsk_bind_hash)
2002 		inet_put_port(sk);
2003 
2004 	BUG_ON(tp->fastopen_rsk);
2005 
2006 	/* If socket is aborted during connect operation */
2007 	tcp_free_fastopen_req(tp);
2008 	tcp_fastopen_destroy_cipher(sk);
2009 	tcp_saved_syn_free(tp);
2010 
2011 	sk_sockets_allocated_dec(sk);
2012 }
2013 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2014 
2015 #ifdef CONFIG_PROC_FS
2016 /* Proc filesystem TCP sock list dumping. */
2017 
2018 /*
2019  * Get the next listener socket following cur.  If cur is NULL, get the first
2020  * socket starting from the bucket given in st->bucket; when st->bucket is
2021  * zero, the very first socket in the hash table is returned.
2022  */
2023 static void *listening_get_next(struct seq_file *seq, void *cur)
2024 {
2025 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2026 	struct tcp_iter_state *st = seq->private;
2027 	struct net *net = seq_file_net(seq);
2028 	struct inet_listen_hashbucket *ilb;
2029 	struct sock *sk = cur;
2030 
2031 	if (!sk) {
2032 get_head:
2033 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2034 		spin_lock(&ilb->lock);
2035 		sk = sk_head(&ilb->head);
2036 		st->offset = 0;
2037 		goto get_sk;
2038 	}
2039 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2040 	++st->num;
2041 	++st->offset;
2042 
2043 	sk = sk_next(sk);
2044 get_sk:
2045 	sk_for_each_from(sk) {
2046 		if (!net_eq(sock_net(sk), net))
2047 			continue;
2048 		if (sk->sk_family == afinfo->family)
2049 			return sk;
2050 	}
2051 	spin_unlock(&ilb->lock);
2052 	st->offset = 0;
2053 	if (++st->bucket < INET_LHTABLE_SIZE)
2054 		goto get_head;
2055 	return NULL;
2056 }
2057 
2058 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059 {
2060 	struct tcp_iter_state *st = seq->private;
2061 	void *rc;
2062 
2063 	st->bucket = 0;
2064 	st->offset = 0;
2065 	rc = listening_get_next(seq, NULL);
2066 
2067 	while (rc && *pos) {
2068 		rc = listening_get_next(seq, rc);
2069 		--*pos;
2070 	}
2071 	return rc;
2072 }
2073 
2074 static inline bool empty_bucket(const struct tcp_iter_state *st)
2075 {
2076 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2077 }
2078 
2079 /*
2080  * Get the first established socket starting from the bucket given in st->bucket.
2081  * If st->bucket is zero, the very first socket in the hash table is returned.
2082  */
2083 static void *established_get_first(struct seq_file *seq)
2084 {
2085 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2086 	struct tcp_iter_state *st = seq->private;
2087 	struct net *net = seq_file_net(seq);
2088 	void *rc = NULL;
2089 
2090 	st->offset = 0;
2091 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2092 		struct sock *sk;
2093 		struct hlist_nulls_node *node;
2094 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2095 
2096 		/* Lockless fast path for the common case of empty buckets */
2097 		if (empty_bucket(st))
2098 			continue;
2099 
2100 		spin_lock_bh(lock);
2101 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2102 			if (sk->sk_family != afinfo->family ||
2103 			    !net_eq(sock_net(sk), net)) {
2104 				continue;
2105 			}
2106 			rc = sk;
2107 			goto out;
2108 		}
2109 		spin_unlock_bh(lock);
2110 	}
2111 out:
2112 	return rc;
2113 }
2114 
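/* Advance to the next established socket in the current ehash bucket;
 * when the chain is exhausted, drop the bucket lock and continue with
 * the first socket of the following bucket.
 */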
2115 static void *established_get_next(struct seq_file *seq, void *cur)
2116 {
2117 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2118 	struct sock *sk = cur;
2119 	struct hlist_nulls_node *node;
2120 	struct tcp_iter_state *st = seq->private;
2121 	struct net *net = seq_file_net(seq);
2122 
2123 	++st->num;
2124 	++st->offset;
2125 
2126 	sk = sk_nulls_next(sk);
2127 
2128 	sk_nulls_for_each_from(sk, node) {
2129 		if (sk->sk_family == afinfo->family &&
2130 		    net_eq(sock_net(sk), net))
2131 			return sk;
2132 	}
2133 
2134 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2135 	++st->bucket;
2136 	return established_get_first(seq);
2137 }
2138 
2139 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2140 {
2141 	struct tcp_iter_state *st = seq->private;
2142 	void *rc;
2143 
2144 	st->bucket = 0;
2145 	rc = established_get_first(seq);
2146 
2147 	while (rc && pos) {
2148 		rc = established_get_next(seq, rc);
2149 		--pos;
2150 	}
2151 	return rc;
2152 }
2153 
2154 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2155 {
2156 	void *rc;
2157 	struct tcp_iter_state *st = seq->private;
2158 
2159 	st->state = TCP_SEQ_STATE_LISTENING;
2160 	rc	  = listening_get_idx(seq, &pos);
2161 
2162 	if (!rc) {
2163 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2164 		rc	  = established_get_idx(seq, pos);
2165 	}
2166 
2167 	return rc;
2168 }
2169 
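/* Resume the walk at the bucket and offset recorded in the iterator state,
 * so a reader continuing at st->last_pos does not have to rescan the hash
 * tables from the very first bucket.
 */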
2170 static void *tcp_seek_last_pos(struct seq_file *seq)
2171 {
2172 	struct tcp_iter_state *st = seq->private;
2173 	int offset = st->offset;
2174 	int orig_num = st->num;
2175 	void *rc = NULL;
2176 
2177 	switch (st->state) {
2178 	case TCP_SEQ_STATE_LISTENING:
2179 		if (st->bucket >= INET_LHTABLE_SIZE)
2180 			break;
2181 		st->state = TCP_SEQ_STATE_LISTENING;
2182 		rc = listening_get_next(seq, NULL);
2183 		while (offset-- && rc)
2184 			rc = listening_get_next(seq, rc);
2185 		if (rc)
2186 			break;
2187 		st->bucket = 0;
2188 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2189 		/* Fallthrough */
2190 	case TCP_SEQ_STATE_ESTABLISHED:
2191 		if (st->bucket > tcp_hashinfo.ehash_mask)
2192 			break;
2193 		rc = established_get_first(seq);
2194 		while (offset-- && rc)
2195 			rc = established_get_next(seq, rc);
2196 	}
2197 
2198 	st->num = orig_num;
2199 
2200 	return rc;
2201 }
2202 
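/* seq_file start callback: try to resume cheaply at the previously reported
 * position; if that fails, or the file is read from the start, reset the
 * iterator and walk forward to the requested offset.
 */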
2203 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2204 {
2205 	struct tcp_iter_state *st = seq->private;
2206 	void *rc;
2207 
2208 	if (*pos && *pos == st->last_pos) {
2209 		rc = tcp_seek_last_pos(seq);
2210 		if (rc)
2211 			goto out;
2212 	}
2213 
2214 	st->state = TCP_SEQ_STATE_LISTENING;
2215 	st->num = 0;
2216 	st->bucket = 0;
2217 	st->offset = 0;
2218 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2219 
2220 out:
2221 	st->last_pos = *pos;
2222 	return rc;
2223 }
2224 EXPORT_SYMBOL(tcp_seq_start);
2225 
2226 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2227 {
2228 	struct tcp_iter_state *st = seq->private;
2229 	void *rc = NULL;
2230 
2231 	if (v == SEQ_START_TOKEN) {
2232 		rc = tcp_get_idx(seq, 0);
2233 		goto out;
2234 	}
2235 
2236 	switch (st->state) {
2237 	case TCP_SEQ_STATE_LISTENING:
2238 		rc = listening_get_next(seq, v);
2239 		if (!rc) {
2240 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2241 			st->bucket = 0;
2242 			st->offset = 0;
2243 			rc	  = established_get_first(seq);
2244 		}
2245 		break;
2246 	case TCP_SEQ_STATE_ESTABLISHED:
2247 		rc = established_get_next(seq, v);
2248 		break;
2249 	}
2250 out:
2251 	++*pos;
2252 	st->last_pos = *pos;
2253 	return rc;
2254 }
2255 EXPORT_SYMBOL(tcp_seq_next);
2256 
2257 void tcp_seq_stop(struct seq_file *seq, void *v)
2258 {
2259 	struct tcp_iter_state *st = seq->private;
2260 
2261 	switch (st->state) {
2262 	case TCP_SEQ_STATE_LISTENING:
2263 		if (v != SEQ_START_TOKEN)
2264 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2265 		break;
2266 	case TCP_SEQ_STATE_ESTABLISHED:
2267 		if (v)
2268 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269 		break;
2270 	}
2271 }
2272 EXPORT_SYMBOL(tcp_seq_stop);
2273 
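/* Format one request socket (a pending connection in SYN_RECV state)
 * as a /proc/net/tcp line.
 */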
2274 static void get_openreq4(const struct request_sock *req,
2275 			 struct seq_file *f, int i)
2276 {
2277 	const struct inet_request_sock *ireq = inet_rsk(req);
2278 	long delta = req->rsk_timer.expires - jiffies;
2279 
2280 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2281 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2282 		i,
2283 		ireq->ir_loc_addr,
2284 		ireq->ir_num,
2285 		ireq->ir_rmt_addr,
2286 		ntohs(ireq->ir_rmt_port),
2287 		TCP_SYN_RECV,
2288 		0, 0, /* could print option size, but that is af dependent. */
2289 		1,    /* timers active (only the expire timer) */
2290 		jiffies_delta_to_clock_t(delta),
2291 		req->num_timeout,
2292 		from_kuid_munged(seq_user_ns(f),
2293 				 sock_i_uid(req->rsk_listener)),
2294 		0,  /* non standard timer */
2295 		0, /* open_requests have no inode */
2296 		0,
2297 		req);
2298 }
2299 
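/* Format one full TCP socket as a /proc/net/tcp line.  timer_active encodes
 * which timer is pending: 1 retransmit/loss probe, 2 keepalive, 4 zero
 * window probe, 0 none.
 */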
2300 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2301 {
2302 	int timer_active;
2303 	unsigned long timer_expires;
2304 	const struct tcp_sock *tp = tcp_sk(sk);
2305 	const struct inet_connection_sock *icsk = inet_csk(sk);
2306 	const struct inet_sock *inet = inet_sk(sk);
2307 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2308 	__be32 dest = inet->inet_daddr;
2309 	__be32 src = inet->inet_rcv_saddr;
2310 	__u16 destp = ntohs(inet->inet_dport);
2311 	__u16 srcp = ntohs(inet->inet_sport);
2312 	int rx_queue;
2313 	int state;
2314 
2315 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2316 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2317 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2318 		timer_active	= 1;
2319 		timer_expires	= icsk->icsk_timeout;
2320 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2321 		timer_active	= 4;
2322 		timer_expires	= icsk->icsk_timeout;
2323 	} else if (timer_pending(&sk->sk_timer)) {
2324 		timer_active	= 2;
2325 		timer_expires	= sk->sk_timer.expires;
2326 	} else {
2327 		timer_active	= 0;
2328 		timer_expires = jiffies;
2329 	}
2330 
2331 	state = inet_sk_state_load(sk);
2332 	if (state == TCP_LISTEN)
2333 		rx_queue = sk->sk_ack_backlog;
2334 	else
2335 		/* Because we don't lock the socket,
2336 		 * we might find a transient negative value.
2337 		 */
2338 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2339 
2340 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2341 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2342 		i, src, srcp, dest, destp, state,
2343 		tp->write_seq - tp->snd_una,
2344 		rx_queue,
2345 		timer_active,
2346 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2347 		icsk->icsk_retransmits,
2348 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2349 		icsk->icsk_probes_out,
2350 		sock_i_ino(sk),
2351 		refcount_read(&sk->sk_refcnt), sk,
2352 		jiffies_to_clock_t(icsk->icsk_rto),
2353 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2354 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2355 		tp->snd_cwnd,
2356 		state == TCP_LISTEN ?
2357 		    fastopenq->max_qlen :
2358 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2359 }
2360 
2361 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2362 			       struct seq_file *f, int i)
2363 {
2364 	long delta = tw->tw_timer.expires - jiffies;
2365 	__be32 dest, src;
2366 	__u16 destp, srcp;
2367 
2368 	dest  = tw->tw_daddr;
2369 	src   = tw->tw_rcv_saddr;
2370 	destp = ntohs(tw->tw_dport);
2371 	srcp  = ntohs(tw->tw_sport);
2372 
2373 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2374 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2375 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2376 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2377 		refcount_read(&tw->tw_refcnt), tw);
2378 }
2379 
2380 #define TMPSZ 150
2381 
2382 static int tcp4_seq_show(struct seq_file *seq, void *v)
2383 {
2384 	struct tcp_iter_state *st;
2385 	struct sock *sk = v;
2386 
2387 	seq_setwidth(seq, TMPSZ - 1);
2388 	if (v == SEQ_START_TOKEN) {
2389 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2390 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2391 			   "inode");
2392 		goto out;
2393 	}
2394 	st = seq->private;
2395 
2396 	if (sk->sk_state == TCP_TIME_WAIT)
2397 		get_timewait4_sock(v, seq, st->num);
2398 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2399 		get_openreq4(v, seq, st->num);
2400 	else
2401 		get_tcp4_sock(v, seq, st->num);
2402 out:
2403 	seq_pad(seq, '\n');
2404 	return 0;
2405 }
2406 
2407 static const struct seq_operations tcp4_seq_ops = {
2408 	.show		= tcp4_seq_show,
2409 	.start		= tcp_seq_start,
2410 	.next		= tcp_seq_next,
2411 	.stop		= tcp_seq_stop,
2412 };
2413 
2414 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2415 	.family		= AF_INET,
2416 };
2417 
2418 static int __net_init tcp4_proc_init_net(struct net *net)
2419 {
2420 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2421 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2422 		return -ENOMEM;
2423 	return 0;
2424 }
2425 
2426 static void __net_exit tcp4_proc_exit_net(struct net *net)
2427 {
2428 	remove_proc_entry("tcp", net->proc_net);
2429 }
2430 
2431 static struct pernet_operations tcp4_net_ops = {
2432 	.init = tcp4_proc_init_net,
2433 	.exit = tcp4_proc_exit_net,
2434 };
2435 
2436 int __init tcp4_proc_init(void)
2437 {
2438 	return register_pernet_subsys(&tcp4_net_ops);
2439 }
2440 
2441 void tcp4_proc_exit(void)
2442 {
2443 	unregister_pernet_subsys(&tcp4_net_ops);
2444 }
2445 #endif /* CONFIG_PROC_FS */
2446 
2447 struct proto tcp_prot = {
2448 	.name			= "TCP",
2449 	.owner			= THIS_MODULE,
2450 	.close			= tcp_close,
2451 	.pre_connect		= tcp_v4_pre_connect,
2452 	.connect		= tcp_v4_connect,
2453 	.disconnect		= tcp_disconnect,
2454 	.accept			= inet_csk_accept,
2455 	.ioctl			= tcp_ioctl,
2456 	.init			= tcp_v4_init_sock,
2457 	.destroy		= tcp_v4_destroy_sock,
2458 	.shutdown		= tcp_shutdown,
2459 	.setsockopt		= tcp_setsockopt,
2460 	.getsockopt		= tcp_getsockopt,
2461 	.keepalive		= tcp_set_keepalive,
2462 	.recvmsg		= tcp_recvmsg,
2463 	.sendmsg		= tcp_sendmsg,
2464 	.sendpage		= tcp_sendpage,
2465 	.backlog_rcv		= tcp_v4_do_rcv,
2466 	.release_cb		= tcp_release_cb,
2467 	.hash			= inet_hash,
2468 	.unhash			= inet_unhash,
2469 	.get_port		= inet_csk_get_port,
2470 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2471 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2472 	.stream_memory_free	= tcp_stream_memory_free,
2473 	.sockets_allocated	= &tcp_sockets_allocated,
2474 	.orphan_count		= &tcp_orphan_count,
2475 	.memory_allocated	= &tcp_memory_allocated,
2476 	.memory_pressure	= &tcp_memory_pressure,
2477 	.sysctl_mem		= sysctl_tcp_mem,
2478 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2479 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2480 	.max_header		= MAX_TCP_HEADER,
2481 	.obj_size		= sizeof(struct tcp_sock),
2482 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2483 	.twsk_prot		= &tcp_timewait_sock_ops,
2484 	.rsk_prot		= &tcp_request_sock_ops,
2485 	.h.hashinfo		= &tcp_hashinfo,
2486 	.no_autobind		= true,
2487 #ifdef CONFIG_COMPAT
2488 	.compat_setsockopt	= compat_tcp_setsockopt,
2489 	.compat_getsockopt	= compat_tcp_getsockopt,
2490 #endif
2491 	.diag_destroy		= tcp_abort,
2492 };
2493 EXPORT_SYMBOL(tcp_prot);
2494 
2495 static void __net_exit tcp_sk_exit(struct net *net)
2496 {
2497 	int cpu;
2498 
2499 	module_put(net->ipv4.tcp_congestion_control->owner);
2500 
2501 	for_each_possible_cpu(cpu)
2502 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2503 	free_percpu(net->ipv4.tcp_sk);
2504 }
2505 
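/* Per-netns initialization: create one control socket per possible CPU
 * (used to send resets and ACKs on behalf of no particular socket) and
 * set the namespace defaults for all TCP sysctls.
 */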
2506 static int __net_init tcp_sk_init(struct net *net)
2507 {
2508 	int res, cpu, cnt;
2509 
2510 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2511 	if (!net->ipv4.tcp_sk)
2512 		return -ENOMEM;
2513 
2514 	for_each_possible_cpu(cpu) {
2515 		struct sock *sk;
2516 
2517 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2518 					   IPPROTO_TCP, net);
2519 		if (res)
2520 			goto fail;
2521 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2522 
2523 		/* Please enforce IP_DF and IPID==0 for RST and
2524 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2525 		 */
2526 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2527 
2528 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2529 	}
2530 
2531 	net->ipv4.sysctl_tcp_ecn = 2;
2532 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2533 
2534 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2535 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2536 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2537 
2538 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2539 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2540 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2541 
2542 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2543 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2544 	net->ipv4.sysctl_tcp_syncookies = 1;
2545 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2546 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2547 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2548 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2549 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2550 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2551 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2552 
2553 	cnt = tcp_hashinfo.ehash_mask + 1;
2554 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2555 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2556 
2557 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2558 	net->ipv4.sysctl_tcp_sack = 1;
2559 	net->ipv4.sysctl_tcp_window_scaling = 1;
2560 	net->ipv4.sysctl_tcp_timestamps = 1;
2561 	net->ipv4.sysctl_tcp_early_retrans = 3;
2562 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2563 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2564 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2565 	net->ipv4.sysctl_tcp_max_reordering = 300;
2566 	net->ipv4.sysctl_tcp_dsack = 1;
2567 	net->ipv4.sysctl_tcp_app_win = 31;
2568 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2569 	net->ipv4.sysctl_tcp_frto = 2;
2570 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2571 	/* This limits the percentage of the congestion window which we
2572 	 * will allow a single TSO frame to consume.  Building TSO frames
2573 	 * which are too large can cause TCP streams to be bursty.
2574 	 */
2575 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2576 	/* Default TSQ limit of four TSO segments */
2577 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2578 	/* rfc5961 challenge ack rate limiting */
2579 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2580 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2581 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2582 	net->ipv4.sysctl_tcp_autocorking = 1;
2583 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2584 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2585 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2586 	if (net != &init_net) {
2587 		memcpy(net->ipv4.sysctl_tcp_rmem,
2588 		       init_net.ipv4.sysctl_tcp_rmem,
2589 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2590 		memcpy(net->ipv4.sysctl_tcp_wmem,
2591 		       init_net.ipv4.sysctl_tcp_wmem,
2592 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2593 	}
2594 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2595 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2596 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2597 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2598 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2599 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2600 
2601 	/* Reno is always built in */
2602 	if (!net_eq(net, &init_net) &&
2603 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2604 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2605 	else
2606 		net->ipv4.tcp_congestion_control = &tcp_reno;
2607 
2608 	return 0;
2609 fail:
2610 	tcp_sk_exit(net);
2611 
2612 	return res;
2613 }
2614 
2615 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616 {
2617 	struct net *net;
2618 
2619 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2620 
2621 	list_for_each_entry(net, net_exit_list, exit_list)
2622 		tcp_fastopen_ctx_destroy(net);
2623 }
2624 
2625 static struct pernet_operations __net_initdata tcp_sk_ops = {
2626        .init	   = tcp_sk_init,
2627        .exit	   = tcp_sk_exit,
2628        .exit_batch = tcp_sk_exit_batch,
2629 };
2630 
2631 void __init tcp_v4_init(void)
2632 {
2633 	if (register_pernet_subsys(&tcp_sk_ops))
2634 		panic("Failed to create the TCP control socket.\n");
2635 }
2636