1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
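/*
 * For context (hedged, based on the documented net.ipv4.tcp_tw_reuse
 * semantics rather than anything defined in this file): tcp_twsk_unique()
 * below consults sysctl_tcp_tw_reuse, where 0 disables TIME-WAIT reuse,
 * 1 enables it when timestamps make it safe, and 2 (the branch checked
 * first below) restricts reuse to loopback traffic.  Illustrative admin
 * sketch:
 *
 *	# reuse TIME-WAIT sockets for loopback connections only
 *	sysctl -w net.ipv4.tcp_tw_reuse=2
 */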
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and, without releasing the socket
271 	 * lock, select a source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = tcp_sk(sk)->mtu_info;
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to go wrong... Remember the soft error
351 	 * in case this connection is not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of PMTU discovery (RFC1191) special case :
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			tp->mtu_info = info;
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* check if this ICMP message allows revert of backoff.
561 		 * (see RFC 6069)
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk->sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * rfc1122 4.2.3.9 allows to consider as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by pmtu discovery).
603 	 *
604 	 * Note that in the modern internet, where routing is unreliable
605 	 * and broken firewalls sit in every dark corner sending random
606 	 * errors ordered by their masters, even these two messages finally lose
607 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk->sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649  *		      for the reset?
650  *	Answer: if a packet caused the RST, it is not for a socket
651  *		existing in our system; if it is matched to a socket,
652  *		it is just a duplicate segment or a bug in the other side's TCP.
653  *		So we build the reply based only on the parameters
654  *		that arrived with the segment.
655  *	Exception: precedence violation. We do not implement it in any case.
656  */
657 
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660 	const struct tcphdr *th = tcp_hdr(skb);
661 	struct {
662 		struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666 	} rep;
667 	struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 	struct tcp_md5sig_key *key = NULL;
670 	const __u8 *hash_location = NULL;
671 	unsigned char newhash[16];
672 	int genhash;
673 	struct sock *sk1 = NULL;
674 #endif
675 	u64 transmit_time = 0;
676 	struct sock *ctl_sk;
677 	struct net *net;
678 
679 	/* Never send a reset in response to a reset. */
680 	if (th->rst)
681 		return;
682 
683 	/* If sk is not NULL, it means we did a successful lookup and the incoming
684 	 * route had to be correct. prequeue might have dropped our dst.
685 	 */
686 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 		return;
688 
689 	/* Swap the send and the receive. */
690 	memset(&rep, 0, sizeof(rep));
691 	rep.th.dest   = th->source;
692 	rep.th.source = th->dest;
693 	rep.th.doff   = sizeof(struct tcphdr) / 4;
694 	rep.th.rst    = 1;
695 
696 	if (th->ack) {
697 		rep.th.seq = th->ack_seq;
698 	} else {
699 		rep.th.ack = 1;
700 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 				       skb->len - (th->doff << 2));
702 	}
703 
704 	memset(&arg, 0, sizeof(arg));
705 	arg.iov[0].iov_base = (unsigned char *)&rep;
706 	arg.iov[0].iov_len  = sizeof(rep.th);
707 
708 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710 	rcu_read_lock();
711 	hash_location = tcp_parse_md5sig_option(th);
712 	if (sk && sk_fullsock(sk)) {
713 		const union tcp_md5_addr *addr;
714 		int l3index;
715 
716 		/* sdif set, means packet ingressed via a device
717 		 * in an L3 domain and inet_iif is set to it.
718 		 */
719 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 	} else if (hash_location) {
723 		const union tcp_md5_addr *addr;
724 		int sdif = tcp_v4_sdif(skb);
725 		int dif = inet_iif(skb);
726 		int l3index;
727 
728 		/*
729 		 * The active side is lost. Try to find the listening socket through
730 		 * the source port, and then find the md5 key through the listening socket.
731 		 * We do not loosen security here:
732 		 * the incoming packet is checked with the md5 hash of the found key;
733 		 * no RST is generated if the md5 hash doesn't match.
734 		 */
735 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 					     ip_hdr(skb)->saddr,
737 					     th->source, ip_hdr(skb)->daddr,
738 					     ntohs(th->source), dif, sdif);
739 		/* don't send rst if it can't find key */
740 		if (!sk1)
741 			goto out;
742 
743 		/* sdif set, means packet ingressed via a device
744 		 * in an L3 domain and dif is set to it.
745 		 */
746 		l3index = sdif ? dif : 0;
747 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 		if (!key)
750 			goto out;
751 
752 
753 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 			goto out;
756 
757 	}
758 
759 	if (key) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 				   (TCPOPT_NOP << 16) |
762 				   (TCPOPT_MD5SIG << 8) |
763 				   TCPOLEN_MD5SIG);
764 		/* Update length and the length the header thinks exists */
765 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 		rep.th.doff = arg.iov[0].iov_len / 4;
767 
768 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 				     key, ip_hdr(skb)->saddr,
770 				     ip_hdr(skb)->daddr, &rep.th);
771 	}
772 #endif
773 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 				      ip_hdr(skb)->saddr, /* XXX */
775 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778 
779 	/* When the socket is gone, all binding information is lost and
780 	 * routing might fail in this case. No choice here: if we chose to force
781 	 * the input interface, we would misroute in case of an asymmetric route.
782 	 */
783 	if (sk) {
784 		arg.bound_dev_if = sk->sk_bound_dev_if;
785 		if (sk_fullsock(sk))
786 			trace_tcp_send_reset(sk, skb);
787 	}
788 
789 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791 
792 	arg.tos = ip_hdr(skb)->tos;
793 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 	local_bh_disable();
795 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 	if (sk) {
797 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
799 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
801 		transmit_time = tcp_transmit_time(sk);
802 	}
803 	ip_send_unicast_reply(ctl_sk,
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len,
807 			      transmit_time);
808 
809 	ctl_sk->sk_mark = 0;
810 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 	local_bh_enable();
813 
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816 	rcu_read_unlock();
817 #endif
818 }
819 
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context, is certainly ugly. What can I do?
822  */
823 
824 static void tcp_v4_send_ack(const struct sock *sk,
825 			    struct sk_buff *skb, u32 seq, u32 ack,
826 			    u32 win, u32 tsval, u32 tsecr, int oif,
827 			    struct tcp_md5sig_key *key,
828 			    int reply_flags, u8 tos)
829 {
830 	const struct tcphdr *th = tcp_hdr(skb);
831 	struct {
832 		struct tcphdr th;
833 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837 			];
838 	} rep;
839 	struct net *net = sock_net(sk);
840 	struct ip_reply_arg arg;
841 	struct sock *ctl_sk;
842 	u64 transmit_time;
843 
844 	memset(&rep.th, 0, sizeof(struct tcphdr));
845 	memset(&arg, 0, sizeof(arg));
846 
847 	arg.iov[0].iov_base = (unsigned char *)&rep;
848 	arg.iov[0].iov_len  = sizeof(rep.th);
849 	if (tsecr) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 				   (TCPOPT_TIMESTAMP << 8) |
852 				   TCPOLEN_TIMESTAMP);
853 		rep.opt[1] = htonl(tsval);
854 		rep.opt[2] = htonl(tsecr);
855 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 	}
857 
858 	/* Swap the send and the receive. */
859 	rep.th.dest    = th->source;
860 	rep.th.source  = th->dest;
861 	rep.th.doff    = arg.iov[0].iov_len / 4;
862 	rep.th.seq     = htonl(seq);
863 	rep.th.ack_seq = htonl(ack);
864 	rep.th.ack     = 1;
865 	rep.th.window  = htons(win);
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 	if (key) {
869 		int offset = (tsecr) ? 3 : 0;
870 
871 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 					  (TCPOPT_NOP << 16) |
873 					  (TCPOPT_MD5SIG << 8) |
874 					  TCPOLEN_MD5SIG);
875 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 		rep.th.doff = arg.iov[0].iov_len/4;
877 
878 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 				    key, ip_hdr(skb)->saddr,
880 				    ip_hdr(skb)->daddr, &rep.th);
881 	}
882 #endif
883 	arg.flags = reply_flags;
884 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 				      ip_hdr(skb)->saddr, /* XXX */
886 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 	if (oif)
889 		arg.bound_dev_if = oif;
890 	arg.tos = tos;
891 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
896 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
898 	transmit_time = tcp_transmit_time(sk);
899 	ip_send_unicast_reply(ctl_sk,
900 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 			      &arg, arg.iov[0].iov_len,
903 			      transmit_time);
904 
905 	ctl_sk->sk_mark = 0;
906 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 	local_bh_enable();
908 }
909 
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 	struct inet_timewait_sock *tw = inet_twsk(sk);
913 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 
915 	tcp_v4_send_ack(sk, skb,
916 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 			tcptw->tw_ts_recent,
920 			tw->tw_bound_dev_if,
921 			tcp_twsk_md5_key(tcptw),
922 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 			tw->tw_tos
924 			);
925 
926 	inet_twsk_put(tw);
927 }
928 
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 				  struct request_sock *req)
931 {
932 	const union tcp_md5_addr *addr;
933 	int l3index;
934 
935 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 	 */
938 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 					     tcp_sk(sk)->snd_nxt;
940 
941 	/* RFC 7323 2.3
942 	 * The window field (SEG.WND) of every outgoing segment, with the
943 	 * exception of <SYN> segments, MUST be right-shifted by
944 	 * Rcv.Wind.Shift bits:
945 	 */
946 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 	tcp_v4_send_ack(sk, skb, seq,
949 			tcp_rsk(req)->rcv_nxt,
950 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 			req->ts_recent,
953 			0,
954 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 			ip_hdr(skb)->tos);
957 }
958 
959 /*
960  *	Send a SYN-ACK after having received a SYN.
961  *	This still operates on a request_sock only, not on a big
962  *	socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 			      struct flowi *fl,
966 			      struct request_sock *req,
967 			      struct tcp_fastopen_cookie *foc,
968 			      enum tcp_synack_type synack_type,
969 			      struct sk_buff *syn_skb)
970 {
971 	const struct inet_request_sock *ireq = inet_rsk(req);
972 	struct flowi4 fl4;
973 	int err = -1;
974 	struct sk_buff *skb;
975 	u8 tos;
976 
977 	/* First, grab a route. */
978 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979 		return -1;
980 
981 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982 
983 	if (skb) {
984 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985 
986 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
988 				(inet_sk(sk)->tos & INET_ECN_MASK) :
989 				inet_sk(sk)->tos;
990 
991 		if (!INET_ECN_is_capable(tos) &&
992 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
993 			tos |= INET_ECN_ECT_0;
994 
995 		rcu_read_lock();
996 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997 					    ireq->ir_rmt_addr,
998 					    rcu_dereference(ireq->ireq_opt),
999 					    tos);
1000 		rcu_read_unlock();
1001 		err = net_xmit_eval(err);
1002 	}
1003 
1004 	return err;
1005 }
1006 
1007 /*
1008  *	IPv4 request_sock destructor.
1009  */
1010 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 {
1012 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013 }
1014 
1015 #ifdef CONFIG_TCP_MD5SIG
1016 /*
1017  * RFC2385 MD5 checksumming requires a mapping of
1018  * IP address->MD5 Key.
1019  * We need to maintain these in the sk structure.
1020  */
1021 
1022 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1023 EXPORT_SYMBOL(tcp_md5_needed);
1024 
1025 /* Find the Key structure for an address.  */
1026 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1027 					   const union tcp_md5_addr *addr,
1028 					   int family)
1029 {
1030 	const struct tcp_sock *tp = tcp_sk(sk);
1031 	struct tcp_md5sig_key *key;
1032 	const struct tcp_md5sig_info *md5sig;
1033 	__be32 mask;
1034 	struct tcp_md5sig_key *best_match = NULL;
1035 	bool match;
1036 
1037 	/* caller either holds rcu_read_lock() or socket lock */
1038 	md5sig = rcu_dereference_check(tp->md5sig_info,
1039 				       lockdep_sock_is_held(sk));
1040 	if (!md5sig)
1041 		return NULL;
1042 
1043 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1044 				 lockdep_sock_is_held(sk)) {
1045 		if (key->family != family)
1046 			continue;
1047 		if (key->l3index && key->l3index != l3index)
1048 			continue;
1049 		if (family == AF_INET) {
1050 			mask = inet_make_mask(key->prefixlen);
1051 			match = (key->addr.a4.s_addr & mask) ==
1052 				(addr->a4.s_addr & mask);
1053 #if IS_ENABLED(CONFIG_IPV6)
1054 		} else if (family == AF_INET6) {
1055 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1056 						  key->prefixlen);
1057 #endif
1058 		} else {
1059 			match = false;
1060 		}
1061 
1062 		if (match && (!best_match ||
1063 			      key->prefixlen > best_match->prefixlen))
1064 			best_match = key;
1065 	}
1066 	return best_match;
1067 }
1068 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1069 
1070 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1071 						      const union tcp_md5_addr *addr,
1072 						      int family, u8 prefixlen,
1073 						      int l3index)
1074 {
1075 	const struct tcp_sock *tp = tcp_sk(sk);
1076 	struct tcp_md5sig_key *key;
1077 	unsigned int size = sizeof(struct in_addr);
1078 	const struct tcp_md5sig_info *md5sig;
1079 
1080 	/* caller either holds rcu_read_lock() or socket lock */
1081 	md5sig = rcu_dereference_check(tp->md5sig_info,
1082 				       lockdep_sock_is_held(sk));
1083 	if (!md5sig)
1084 		return NULL;
1085 #if IS_ENABLED(CONFIG_IPV6)
1086 	if (family == AF_INET6)
1087 		size = sizeof(struct in6_addr);
1088 #endif
1089 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1090 				 lockdep_sock_is_held(sk)) {
1091 		if (key->family != family)
1092 			continue;
1093 		if (key->l3index && key->l3index != l3index)
1094 			continue;
1095 		if (!memcmp(&key->addr, addr, size) &&
1096 		    key->prefixlen == prefixlen)
1097 			return key;
1098 	}
1099 	return NULL;
1100 }
1101 
1102 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1103 					 const struct sock *addr_sk)
1104 {
1105 	const union tcp_md5_addr *addr;
1106 	int l3index;
1107 
1108 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1109 						 addr_sk->sk_bound_dev_if);
1110 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1111 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1112 }
1113 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1114 
1115 /* This can be called on a newly created socket, from other files */
1116 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1117 		   int family, u8 prefixlen, int l3index,
1118 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1119 {
1120 	/* Add Key to the list */
1121 	struct tcp_md5sig_key *key;
1122 	struct tcp_sock *tp = tcp_sk(sk);
1123 	struct tcp_md5sig_info *md5sig;
1124 
1125 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1126 	if (key) {
1127 		/* Pre-existing entry - just update that one.
1128 		 * Note that the key might be used concurrently.
1129 		 * data_race() is telling kcsan that we do not care about
1130 		 * key mismatches, since changing the MD5 key on live flows
1131 		 * can lead to packet drops.
1132 		 */
1133 		data_race(memcpy(key->key, newkey, newkeylen));
1134 
1135 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1136 		 * Also note that a reader could catch new key->keylen value
1137 		 * but old key->key[], this is the reason we use __GFP_ZERO
1138 		 * at sock_kmalloc() time below these lines.
1139 		 */
1140 		WRITE_ONCE(key->keylen, newkeylen);
1141 
1142 		return 0;
1143 	}
1144 
1145 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1146 					   lockdep_sock_is_held(sk));
1147 	if (!md5sig) {
1148 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1149 		if (!md5sig)
1150 			return -ENOMEM;
1151 
1152 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1153 		INIT_HLIST_HEAD(&md5sig->head);
1154 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1155 	}
1156 
1157 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1158 	if (!key)
1159 		return -ENOMEM;
1160 	if (!tcp_alloc_md5sig_pool()) {
1161 		sock_kfree_s(sk, key, sizeof(*key));
1162 		return -ENOMEM;
1163 	}
1164 
1165 	memcpy(key->key, newkey, newkeylen);
1166 	key->keylen = newkeylen;
1167 	key->family = family;
1168 	key->prefixlen = prefixlen;
1169 	key->l3index = l3index;
1170 	memcpy(&key->addr, addr,
1171 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1172 				      sizeof(struct in_addr));
1173 	hlist_add_head_rcu(&key->node, &md5sig->head);
1174 	return 0;
1175 }
1176 EXPORT_SYMBOL(tcp_md5_do_add);
1177 
1178 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1179 		   u8 prefixlen, int l3index)
1180 {
1181 	struct tcp_md5sig_key *key;
1182 
1183 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1184 	if (!key)
1185 		return -ENOENT;
1186 	hlist_del_rcu(&key->node);
1187 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1188 	kfree_rcu(key, rcu);
1189 	return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_del);
1192 
1193 static void tcp_clear_md5_list(struct sock *sk)
1194 {
1195 	struct tcp_sock *tp = tcp_sk(sk);
1196 	struct tcp_md5sig_key *key;
1197 	struct hlist_node *n;
1198 	struct tcp_md5sig_info *md5sig;
1199 
1200 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1201 
1202 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1203 		hlist_del_rcu(&key->node);
1204 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1205 		kfree_rcu(key, rcu);
1206 	}
1207 }
1208 
1209 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1210 				 sockptr_t optval, int optlen)
1211 {
1212 	struct tcp_md5sig cmd;
1213 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1214 	const union tcp_md5_addr *addr;
1215 	u8 prefixlen = 32;
1216 	int l3index = 0;
1217 
1218 	if (optlen < sizeof(cmd))
1219 		return -EINVAL;
1220 
1221 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1222 		return -EFAULT;
1223 
1224 	if (sin->sin_family != AF_INET)
1225 		return -EINVAL;
1226 
1227 	if (optname == TCP_MD5SIG_EXT &&
1228 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1229 		prefixlen = cmd.tcpm_prefixlen;
1230 		if (prefixlen > 32)
1231 			return -EINVAL;
1232 	}
1233 
1234 	if (optname == TCP_MD5SIG_EXT &&
1235 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1236 		struct net_device *dev;
1237 
1238 		rcu_read_lock();
1239 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1240 		if (dev && netif_is_l3_master(dev))
1241 			l3index = dev->ifindex;
1242 
1243 		rcu_read_unlock();
1244 
1245 		/* ok to reference set/not set outside of rcu;
1246 		 * right now device MUST be an L3 master
1247 		 */
1248 		if (!dev || !l3index)
1249 			return -EINVAL;
1250 	}
1251 
1252 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1253 
1254 	if (!cmd.tcpm_keylen)
1255 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1256 
1257 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1258 		return -EINVAL;
1259 
1260 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1261 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1262 }
1263 
1264 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1265 				   __be32 daddr, __be32 saddr,
1266 				   const struct tcphdr *th, int nbytes)
1267 {
1268 	struct tcp4_pseudohdr *bp;
1269 	struct scatterlist sg;
1270 	struct tcphdr *_th;
1271 
1272 	bp = hp->scratch;
1273 	bp->saddr = saddr;
1274 	bp->daddr = daddr;
1275 	bp->pad = 0;
1276 	bp->protocol = IPPROTO_TCP;
1277 	bp->len = cpu_to_be16(nbytes);
1278 
1279 	_th = (struct tcphdr *)(bp + 1);
1280 	memcpy(_th, th, sizeof(*th));
1281 	_th->check = 0;
1282 
1283 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1284 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1285 				sizeof(*bp) + sizeof(*th));
1286 	return crypto_ahash_update(hp->md5_req);
1287 }
1288 
1289 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1290 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1291 {
1292 	struct tcp_md5sig_pool *hp;
1293 	struct ahash_request *req;
1294 
1295 	hp = tcp_get_md5sig_pool();
1296 	if (!hp)
1297 		goto clear_hash_noput;
1298 	req = hp->md5_req;
1299 
1300 	if (crypto_ahash_init(req))
1301 		goto clear_hash;
1302 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1303 		goto clear_hash;
1304 	if (tcp_md5_hash_key(hp, key))
1305 		goto clear_hash;
1306 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1307 	if (crypto_ahash_final(req))
1308 		goto clear_hash;
1309 
1310 	tcp_put_md5sig_pool();
1311 	return 0;
1312 
1313 clear_hash:
1314 	tcp_put_md5sig_pool();
1315 clear_hash_noput:
1316 	memset(md5_hash, 0, 16);
1317 	return 1;
1318 }
1319 
1320 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1321 			const struct sock *sk,
1322 			const struct sk_buff *skb)
1323 {
1324 	struct tcp_md5sig_pool *hp;
1325 	struct ahash_request *req;
1326 	const struct tcphdr *th = tcp_hdr(skb);
1327 	__be32 saddr, daddr;
1328 
1329 	if (sk) { /* valid for establish/request sockets */
1330 		saddr = sk->sk_rcv_saddr;
1331 		daddr = sk->sk_daddr;
1332 	} else {
1333 		const struct iphdr *iph = ip_hdr(skb);
1334 		saddr = iph->saddr;
1335 		daddr = iph->daddr;
1336 	}
1337 
1338 	hp = tcp_get_md5sig_pool();
1339 	if (!hp)
1340 		goto clear_hash_noput;
1341 	req = hp->md5_req;
1342 
1343 	if (crypto_ahash_init(req))
1344 		goto clear_hash;
1345 
1346 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1347 		goto clear_hash;
1348 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1349 		goto clear_hash;
1350 	if (tcp_md5_hash_key(hp, key))
1351 		goto clear_hash;
1352 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1353 	if (crypto_ahash_final(req))
1354 		goto clear_hash;
1355 
1356 	tcp_put_md5sig_pool();
1357 	return 0;
1358 
1359 clear_hash:
1360 	tcp_put_md5sig_pool();
1361 clear_hash_noput:
1362 	memset(md5_hash, 0, 16);
1363 	return 1;
1364 }
1365 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1366 
1367 #endif
1368 
1369 /* Called with rcu_read_lock() */
1370 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1371 				    const struct sk_buff *skb,
1372 				    int dif, int sdif)
1373 {
1374 #ifdef CONFIG_TCP_MD5SIG
1375 	/*
1376 	 * This gets called for each TCP segment that arrives
1377 	 * so we want to be efficient.
1378 	 * We have 3 drop cases:
1379 	 * o No MD5 hash and one expected.
1380 	 * o MD5 hash and we're not expecting one.
1381 	 * o MD5 hash and it's wrong.
1382 	 */
1383 	const __u8 *hash_location = NULL;
1384 	struct tcp_md5sig_key *hash_expected;
1385 	const struct iphdr *iph = ip_hdr(skb);
1386 	const struct tcphdr *th = tcp_hdr(skb);
1387 	const union tcp_md5_addr *addr;
1388 	unsigned char newhash[16];
1389 	int genhash, l3index;
1390 
1391 	/* sdif set, means packet ingressed via a device
1392 	 * in an L3 domain and dif is set to the l3mdev
1393 	 */
1394 	l3index = sdif ? dif : 0;
1395 
1396 	addr = (union tcp_md5_addr *)&iph->saddr;
1397 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1398 	hash_location = tcp_parse_md5sig_option(th);
1399 
1400 	/* We've parsed the options - do we have a hash? */
1401 	if (!hash_expected && !hash_location)
1402 		return false;
1403 
1404 	if (hash_expected && !hash_location) {
1405 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1406 		return true;
1407 	}
1408 
1409 	if (!hash_expected && hash_location) {
1410 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1411 		return true;
1412 	}
1413 
1414 	/* Okay, so this is hash_expected and hash_location -
1415 	 * so we need to calculate the checksum.
1416 	 */
1417 	genhash = tcp_v4_md5_hash_skb(newhash,
1418 				      hash_expected,
1419 				      NULL, skb);
1420 
1421 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1422 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1423 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1424 				     &iph->saddr, ntohs(th->source),
1425 				     &iph->daddr, ntohs(th->dest),
1426 				     genhash ? " tcp_v4_calc_md5_hash failed"
1427 				     : "", l3index);
1428 		return true;
1429 	}
1430 	return false;
1431 #endif
1432 	return false;
1433 }
1434 
1435 static void tcp_v4_init_req(struct request_sock *req,
1436 			    const struct sock *sk_listener,
1437 			    struct sk_buff *skb)
1438 {
1439 	struct inet_request_sock *ireq = inet_rsk(req);
1440 	struct net *net = sock_net(sk_listener);
1441 
1442 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1443 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1444 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1445 }
1446 
1447 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448 					  struct flowi *fl,
1449 					  const struct request_sock *req)
1450 {
1451 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1452 }
1453 
1454 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1455 	.family		=	PF_INET,
1456 	.obj_size	=	sizeof(struct tcp_request_sock),
1457 	.rtx_syn_ack	=	tcp_rtx_synack,
1458 	.send_ack	=	tcp_v4_reqsk_send_ack,
1459 	.destructor	=	tcp_v4_reqsk_destructor,
1460 	.send_reset	=	tcp_v4_send_reset,
1461 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1462 };
1463 
1464 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1465 	.mss_clamp	=	TCP_MSS_DEFAULT,
1466 #ifdef CONFIG_TCP_MD5SIG
1467 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1468 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1469 #endif
1470 	.init_req	=	tcp_v4_init_req,
1471 #ifdef CONFIG_SYN_COOKIES
1472 	.cookie_init_seq =	cookie_v4_init_sequence,
1473 #endif
1474 	.route_req	=	tcp_v4_route_req,
1475 	.init_seq	=	tcp_v4_init_seq,
1476 	.init_ts_off	=	tcp_v4_init_ts_off,
1477 	.send_synack	=	tcp_v4_send_synack,
1478 };
1479 
1480 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1481 {
1482 	/* Never answer to SYNs sent to broadcast or multicast */
1483 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1484 		goto drop;
1485 
1486 	return tcp_conn_request(&tcp_request_sock_ops,
1487 				&tcp_request_sock_ipv4_ops, sk, skb);
1488 
1489 drop:
1490 	tcp_listendrop(sk);
1491 	return 0;
1492 }
1493 EXPORT_SYMBOL(tcp_v4_conn_request);
1494 
1495 
1496 /*
1497  * The three way handshake has completed - we got a valid synack -
1498  * now create the new socket.
1499  */
1500 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1501 				  struct request_sock *req,
1502 				  struct dst_entry *dst,
1503 				  struct request_sock *req_unhash,
1504 				  bool *own_req)
1505 {
1506 	struct inet_request_sock *ireq;
1507 	bool found_dup_sk = false;
1508 	struct inet_sock *newinet;
1509 	struct tcp_sock *newtp;
1510 	struct sock *newsk;
1511 #ifdef CONFIG_TCP_MD5SIG
1512 	const union tcp_md5_addr *addr;
1513 	struct tcp_md5sig_key *key;
1514 	int l3index;
1515 #endif
1516 	struct ip_options_rcu *inet_opt;
1517 
1518 	if (sk_acceptq_is_full(sk))
1519 		goto exit_overflow;
1520 
1521 	newsk = tcp_create_openreq_child(sk, req, skb);
1522 	if (!newsk)
1523 		goto exit_nonewsk;
1524 
1525 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1526 	inet_sk_rx_dst_set(newsk, skb);
1527 
1528 	newtp		      = tcp_sk(newsk);
1529 	newinet		      = inet_sk(newsk);
1530 	ireq		      = inet_rsk(req);
1531 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1532 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1533 	newsk->sk_bound_dev_if = ireq->ir_iif;
1534 	newinet->inet_saddr   = ireq->ir_loc_addr;
1535 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1536 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1537 	newinet->mc_index     = inet_iif(skb);
1538 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1539 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1540 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1541 	if (inet_opt)
1542 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1543 	newinet->inet_id = prandom_u32();
1544 
1545 	/* Set ToS of the new socket based upon the value of incoming SYN.
1546 	 * ECT bits are set later in tcp_init_transfer().
1547 	 */
1548 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1549 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1550 
1551 	if (!dst) {
1552 		dst = inet_csk_route_child_sock(sk, newsk, req);
1553 		if (!dst)
1554 			goto put_and_exit;
1555 	} else {
1556 		/* syncookie case : see end of cookie_v4_check() */
1557 	}
1558 	sk_setup_caps(newsk, dst);
1559 
1560 	tcp_ca_openreq_child(newsk, dst);
1561 
1562 	tcp_sync_mss(newsk, dst_mtu(dst));
1563 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1564 
1565 	tcp_initialize_rcv_mss(newsk);
1566 
1567 #ifdef CONFIG_TCP_MD5SIG
1568 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1569 	/* Copy over the MD5 key from the original socket */
1570 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1571 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1572 	if (key) {
1573 		/*
1574 		 * We're using one, so create a matching key
1575 		 * on the newsk structure. If we fail to get
1576 		 * memory, then we end up not copying the key
1577 		 * across. Shucks.
1578 		 */
1579 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1580 			       key->key, key->keylen, GFP_ATOMIC);
1581 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1582 	}
1583 #endif
1584 
1585 	if (__inet_inherit_port(sk, newsk) < 0)
1586 		goto put_and_exit;
1587 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1588 				       &found_dup_sk);
1589 	if (likely(*own_req)) {
1590 		tcp_move_syn(newtp, req);
1591 		ireq->ireq_opt = NULL;
1592 	} else {
1593 		if (!req_unhash && found_dup_sk) {
1594 			/* This code path should only be executed in the
1595 			 * syncookie case
1596 			 */
1597 			bh_unlock_sock(newsk);
1598 			sock_put(newsk);
1599 			newsk = NULL;
1600 		} else {
1601 			newinet->inet_opt = NULL;
1602 		}
1603 	}
1604 	return newsk;
1605 
1606 exit_overflow:
1607 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1608 exit_nonewsk:
1609 	dst_release(dst);
1610 exit:
1611 	tcp_listendrop(sk);
1612 	return NULL;
1613 put_and_exit:
1614 	newinet->inet_opt = NULL;
1615 	inet_csk_prepare_forced_close(newsk);
1616 	tcp_done(newsk);
1617 	goto exit;
1618 }
1619 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1620 
1621 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1622 {
1623 #ifdef CONFIG_SYN_COOKIES
1624 	const struct tcphdr *th = tcp_hdr(skb);
1625 
1626 	if (!th->syn)
1627 		sk = cookie_v4_check(sk, skb);
1628 #endif
1629 	return sk;
1630 }
1631 
1632 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1633 			 struct tcphdr *th, u32 *cookie)
1634 {
1635 	u16 mss = 0;
1636 #ifdef CONFIG_SYN_COOKIES
1637 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1638 				    &tcp_request_sock_ipv4_ops, sk, th);
1639 	if (mss) {
1640 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1641 		tcp_synq_overflow(sk);
1642 	}
1643 #endif
1644 	return mss;
1645 }
1646 
1647 /* The socket must have its spinlock held when we get
1648  * here, unless it is a TCP_LISTEN socket.
1649  *
1650  * We have a potential double-lock case here, so even when
1651  * doing backlog processing we use the BH locking scheme.
1652  * This is because we cannot sleep with the original spinlock
1653  * held.
1654  */
1655 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1656 {
1657 	struct sock *rsk;
1658 
1659 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1660 		struct dst_entry *dst = sk->sk_rx_dst;
1661 
1662 		sock_rps_save_rxhash(sk, skb);
1663 		sk_mark_napi_id(sk, skb);
1664 		if (dst) {
1665 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1666 			    !dst->ops->check(dst, 0)) {
1667 				dst_release(dst);
1668 				sk->sk_rx_dst = NULL;
1669 			}
1670 		}
1671 		tcp_rcv_established(sk, skb);
1672 		return 0;
1673 	}
1674 
1675 	if (tcp_checksum_complete(skb))
1676 		goto csum_err;
1677 
1678 	if (sk->sk_state == TCP_LISTEN) {
1679 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1680 
1681 		if (!nsk)
1682 			goto discard;
1683 		if (nsk != sk) {
1684 			if (tcp_child_process(sk, nsk, skb)) {
1685 				rsk = nsk;
1686 				goto reset;
1687 			}
1688 			return 0;
1689 		}
1690 	} else
1691 		sock_rps_save_rxhash(sk, skb);
1692 
1693 	if (tcp_rcv_state_process(sk, skb)) {
1694 		rsk = sk;
1695 		goto reset;
1696 	}
1697 	return 0;
1698 
1699 reset:
1700 	tcp_v4_send_reset(rsk, skb);
1701 discard:
1702 	kfree_skb(skb);
1703 	/* Be careful here. If this function gets more complicated and
1704 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1705 	 * might be destroyed here. This current version compiles correctly,
1706 	 * but you have been warned.
1707 	 */
1708 	return 0;
1709 
1710 csum_err:
1711 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1712 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1713 	goto discard;
1714 }
1715 EXPORT_SYMBOL(tcp_v4_do_rcv);
1716 
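/* Early demux: at IP receive time, look up an established socket for this
 * segment so that the dst cached on the socket can be attached to the skb,
 * saving a full route lookup on the hot path.  Purely an optimization; on
 * any mismatch the packet simply goes through the normal lookup later.
 */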
1717 int tcp_v4_early_demux(struct sk_buff *skb)
1718 {
1719 	const struct iphdr *iph;
1720 	const struct tcphdr *th;
1721 	struct sock *sk;
1722 
1723 	if (skb->pkt_type != PACKET_HOST)
1724 		return 0;
1725 
1726 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1727 		return 0;
1728 
1729 	iph = ip_hdr(skb);
1730 	th = tcp_hdr(skb);
1731 
1732 	if (th->doff < sizeof(struct tcphdr) / 4)
1733 		return 0;
1734 
1735 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1736 				       iph->saddr, th->source,
1737 				       iph->daddr, ntohs(th->dest),
1738 				       skb->skb_iif, inet_sdif(skb));
1739 	if (sk) {
1740 		skb->sk = sk;
1741 		skb->destructor = sock_edemux;
1742 		if (sk_fullsock(sk)) {
1743 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1744 
1745 			if (dst)
1746 				dst = dst_check(dst, 0);
1747 			if (dst &&
1748 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1749 				skb_dst_set_noref(skb, dst);
1750 		}
1751 	}
1752 	return 0;
1753 }
1754 
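/* Queue a segment on the backlog of a socket currently owned by user space.
 * We first try to coalesce it with the backlog tail to keep memory usage and
 * later processing cost low; otherwise it is queued as-is, subject to the
 * rcvbuf+sndbuf based limit below.  Returns true if the packet was dropped,
 * in which case the socket has already been unlocked.
 */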
1755 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1756 {
1757 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1758 	struct skb_shared_info *shinfo;
1759 	const struct tcphdr *th;
1760 	struct tcphdr *thtail;
1761 	struct sk_buff *tail;
1762 	unsigned int hdrlen;
1763 	bool fragstolen;
1764 	u32 gso_segs;
1765 	int delta;
1766 
1767 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1768 	 * we can fix skb->truesize to its real value to avoid future drops.
1769 	 * This is valid because skb is not yet charged to the socket.
1770 	 * It has been noticed that pure SACK packets were sometimes dropped
1771 	 * (when cooked by drivers without the copybreak feature).
1772 	 */
1773 	skb_condense(skb);
1774 
1775 	skb_dst_drop(skb);
1776 
1777 	if (unlikely(tcp_checksum_complete(skb))) {
1778 		bh_unlock_sock(sk);
1779 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1780 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1781 		return true;
1782 	}
1783 
1784 	/* Attempt coalescing to last skb in backlog, even if we are
1785 	 * above the limits.
1786 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1787 	 */
1788 	th = (const struct tcphdr *)skb->data;
1789 	hdrlen = th->doff * 4;
1790 	shinfo = skb_shinfo(skb);
1791 
1792 	if (!shinfo->gso_size)
1793 		shinfo->gso_size = skb->len - hdrlen;
1794 
1795 	if (!shinfo->gso_segs)
1796 		shinfo->gso_segs = 1;
1797 
1798 	tail = sk->sk_backlog.tail;
1799 	if (!tail)
1800 		goto no_coalesce;
1801 	thtail = (struct tcphdr *)tail->data;
1802 
1803 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1804 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1805 	    ((TCP_SKB_CB(tail)->tcp_flags |
1806 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1807 	    !((TCP_SKB_CB(tail)->tcp_flags &
1808 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1809 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1810 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1811 #ifdef CONFIG_TLS_DEVICE
1812 	    tail->decrypted != skb->decrypted ||
1813 #endif
1814 	    thtail->doff != th->doff ||
1815 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1816 		goto no_coalesce;
1817 
1818 	__skb_pull(skb, hdrlen);
1819 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1820 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1821 
1822 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1823 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1824 			thtail->window = th->window;
1825 		}
1826 
1827 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1828 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1829 		 * is not entered if we append a packet with a FIN.
1830 		 * SYN, RST, URG are not present.
1831 		 * ACK is set on both packets.
1832 		 * PSH : we do not really care in TCP stack,
1833 		 *       at least for 'GRO' packets.
1834 		 */
1835 		thtail->fin |= th->fin;
1836 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1837 
1838 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1839 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1840 			tail->tstamp = skb->tstamp;
1841 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1842 		}
1843 
1844 		/* Not as strict as GRO. We only need to carry mss max value */
1845 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1846 						 skb_shinfo(tail)->gso_size);
1847 
1848 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1849 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1850 
1851 		sk->sk_backlog.len += delta;
1852 		__NET_INC_STATS(sock_net(sk),
1853 				LINUX_MIB_TCPBACKLOGCOALESCE);
1854 		kfree_skb_partial(skb, fragstolen);
1855 		return false;
1856 	}
1857 	__skb_push(skb, hdrlen);
1858 
1859 no_coalesce:
1860 	/* Only the socket owner can try to collapse/prune rx queues
1861 	 * to reduce memory overhead, so add a little headroom here.
1862 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1863 	 */
1864 	limit += 64*1024;
1865 
1866 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1867 		bh_unlock_sock(sk);
1868 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1869 		return true;
1870 	}
1871 	return false;
1872 }
1873 EXPORT_SYMBOL(tcp_add_backlog);
1874 
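/* Run the attached socket filter (if any) on the segment.  The trim cap of
 * th->doff * 4 guarantees a filter can never truncate the TCP header itself.
 */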
1875 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1876 {
1877 	struct tcphdr *th = (struct tcphdr *)skb->data;
1878 
1879 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1880 }
1881 EXPORT_SYMBOL(tcp_filter);
1882 
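/* Undo tcp_v4_fill_cb(): move the IP control block back to its normal
 * location before the skb is fed again to code that expects IPCB().
 */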
1883 static void tcp_v4_restore_cb(struct sk_buff *skb)
1884 {
1885 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1886 		sizeof(struct inet_skb_parm));
1887 }
1888 
1889 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1890 			   const struct tcphdr *th)
1891 {
1892 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1893 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1894 	 */
1895 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1896 		sizeof(struct inet_skb_parm));
1897 	barrier();
1898 
1899 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1900 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1901 				    skb->len - th->doff * 4);
1902 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1903 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1904 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1905 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1906 	TCP_SKB_CB(skb)->sacked	 = 0;
1907 	TCP_SKB_CB(skb)->has_rxtstamp =
1908 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1909 }
1910 
1911 /*
1912  *	From tcp_input.c
1913  */
1914 
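/* Main receive entry point for IPv4 TCP segments handed up by the IP layer.
 * It validates the header, performs the socket lookup, and dispatches to the
 * listener, request sock, established or TIME_WAIT handling paths below.
 */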
1915 int tcp_v4_rcv(struct sk_buff *skb)
1916 {
1917 	struct net *net = dev_net(skb->dev);
1918 	struct sk_buff *skb_to_free;
1919 	int sdif = inet_sdif(skb);
1920 	int dif = inet_iif(skb);
1921 	const struct iphdr *iph;
1922 	const struct tcphdr *th;
1923 	bool refcounted;
1924 	struct sock *sk;
1925 	int ret;
1926 
1927 	if (skb->pkt_type != PACKET_HOST)
1928 		goto discard_it;
1929 
1930 	/* Count it even if it's bad */
1931 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1932 
1933 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1934 		goto discard_it;
1935 
1936 	th = (const struct tcphdr *)skb->data;
1937 
1938 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1939 		goto bad_packet;
1940 	if (!pskb_may_pull(skb, th->doff * 4))
1941 		goto discard_it;
1942 
1943 	/* An explanation is required here, I think.
1944 	 * Packet length and doff are validated by header prediction,
1945 	 * provided the case of th->doff==0 is eliminated.
1946 	 * So, we defer the checks. */
1947 
1948 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949 		goto csum_error;
1950 
1951 	th = (const struct tcphdr *)skb->data;
1952 	iph = ip_hdr(skb);
1953 lookup:
1954 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1955 			       th->dest, sdif, &refcounted);
1956 	if (!sk)
1957 		goto no_tcp_socket;
1958 
1959 process:
1960 	if (sk->sk_state == TCP_TIME_WAIT)
1961 		goto do_time_wait;
1962 
1963 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1964 		struct request_sock *req = inet_reqsk(sk);
1965 		bool req_stolen = false;
1966 		struct sock *nsk;
1967 
1968 		sk = req->rsk_listener;
1969 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1970 			sk_drops_add(sk, skb);
1971 			reqsk_put(req);
1972 			goto discard_it;
1973 		}
1974 		if (tcp_checksum_complete(skb)) {
1975 			reqsk_put(req);
1976 			goto csum_error;
1977 		}
1978 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1979 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1980 			goto lookup;
1981 		}
1982 		/* We own a reference on the listener; increase it again
1983 		 * as we might lose it too soon.
1984 		 */
1985 		sock_hold(sk);
1986 		refcounted = true;
1987 		nsk = NULL;
1988 		if (!tcp_filter(sk, skb)) {
1989 			th = (const struct tcphdr *)skb->data;
1990 			iph = ip_hdr(skb);
1991 			tcp_v4_fill_cb(skb, iph, th);
1992 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1993 		}
1994 		if (!nsk) {
1995 			reqsk_put(req);
1996 			if (req_stolen) {
1997 			/* Another CPU got exclusive access to req
1998 			 * and created a full-blown socket.
1999 				 * Try to feed this packet to this socket
2000 				 * instead of discarding it.
2001 				 */
2002 				tcp_v4_restore_cb(skb);
2003 				sock_put(sk);
2004 				goto lookup;
2005 			}
2006 			goto discard_and_relse;
2007 		}
2008 		if (nsk == sk) {
2009 			reqsk_put(req);
2010 			tcp_v4_restore_cb(skb);
2011 		} else if (tcp_child_process(sk, nsk, skb)) {
2012 			tcp_v4_send_reset(nsk, skb);
2013 			goto discard_and_relse;
2014 		} else {
2015 			sock_put(sk);
2016 			return 0;
2017 		}
2018 	}
2019 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2020 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2021 		goto discard_and_relse;
2022 	}
2023 
2024 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2025 		goto discard_and_relse;
2026 
2027 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2028 		goto discard_and_relse;
2029 
2030 	nf_reset_ct(skb);
2031 
2032 	if (tcp_filter(sk, skb))
2033 		goto discard_and_relse;
2034 	th = (const struct tcphdr *)skb->data;
2035 	iph = ip_hdr(skb);
2036 	tcp_v4_fill_cb(skb, iph, th);
2037 
2038 	skb->dev = NULL;
2039 
2040 	if (sk->sk_state == TCP_LISTEN) {
2041 		ret = tcp_v4_do_rcv(sk, skb);
2042 		goto put_and_return;
2043 	}
2044 
2045 	sk_incoming_cpu_update(sk);
2046 
2047 	bh_lock_sock_nested(sk);
2048 	tcp_segs_in(tcp_sk(sk), skb);
2049 	ret = 0;
2050 	if (!sock_owned_by_user(sk)) {
2051 		skb_to_free = sk->sk_rx_skb_cache;
2052 		sk->sk_rx_skb_cache = NULL;
2053 		ret = tcp_v4_do_rcv(sk, skb);
2054 	} else {
2055 		if (tcp_add_backlog(sk, skb))
2056 			goto discard_and_relse;
2057 		skb_to_free = NULL;
2058 	}
2059 	bh_unlock_sock(sk);
2060 	if (skb_to_free)
2061 		__kfree_skb(skb_to_free);
2062 
2063 put_and_return:
2064 	if (refcounted)
2065 		sock_put(sk);
2066 
2067 	return ret;
2068 
2069 no_tcp_socket:
2070 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2071 		goto discard_it;
2072 
2073 	tcp_v4_fill_cb(skb, iph, th);
2074 
2075 	if (tcp_checksum_complete(skb)) {
2076 csum_error:
2077 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2078 bad_packet:
2079 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2080 	} else {
2081 		tcp_v4_send_reset(NULL, skb);
2082 	}
2083 
2084 discard_it:
2085 	/* Discard frame. */
2086 	kfree_skb(skb);
2087 	return 0;
2088 
2089 discard_and_relse:
2090 	sk_drops_add(sk, skb);
2091 	if (refcounted)
2092 		sock_put(sk);
2093 	goto discard_it;
2094 
2095 do_time_wait:
2096 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2097 		inet_twsk_put(inet_twsk(sk));
2098 		goto discard_it;
2099 	}
2100 
2101 	tcp_v4_fill_cb(skb, iph, th);
2102 
2103 	if (tcp_checksum_complete(skb)) {
2104 		inet_twsk_put(inet_twsk(sk));
2105 		goto csum_error;
2106 	}
2107 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2108 	case TCP_TW_SYN: {
2109 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2110 							&tcp_hashinfo, skb,
2111 							__tcp_hdrlen(th),
2112 							iph->saddr, th->source,
2113 							iph->daddr, th->dest,
2114 							inet_iif(skb),
2115 							sdif);
2116 		if (sk2) {
2117 			inet_twsk_deschedule_put(inet_twsk(sk));
2118 			sk = sk2;
2119 			tcp_v4_restore_cb(skb);
2120 			refcounted = false;
2121 			goto process;
2122 		}
2123 	}
2124 		/* to ACK */
2125 		fallthrough;
2126 	case TCP_TW_ACK:
2127 		tcp_v4_timewait_ack(sk, skb);
2128 		break;
2129 	case TCP_TW_RST:
2130 		tcp_v4_send_reset(sk, skb);
2131 		inet_twsk_deschedule_put(inet_twsk(sk));
2132 		goto discard_it;
2133 	case TCP_TW_SUCCESS:;
2134 	}
2135 	goto discard_it;
2136 }
2137 
2138 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2139 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2140 	.twsk_unique	= tcp_twsk_unique,
2141 	.twsk_destructor= tcp_twsk_destructor,
2142 };
2143 
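/* Cache the input route of this skb on the socket so that subsequent
 * segments can reuse it in the established fast path (see tcp_v4_do_rcv()).
 */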
2144 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2145 {
2146 	struct dst_entry *dst = skb_dst(skb);
2147 
2148 	if (dst && dst_hold_safe(dst)) {
2149 		sk->sk_rx_dst = dst;
2150 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2151 	}
2152 }
2153 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2154 
2155 const struct inet_connection_sock_af_ops ipv4_specific = {
2156 	.queue_xmit	   = ip_queue_xmit,
2157 	.send_check	   = tcp_v4_send_check,
2158 	.rebuild_header	   = inet_sk_rebuild_header,
2159 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2160 	.conn_request	   = tcp_v4_conn_request,
2161 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2162 	.net_header_len	   = sizeof(struct iphdr),
2163 	.setsockopt	   = ip_setsockopt,
2164 	.getsockopt	   = ip_getsockopt,
2165 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2166 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2167 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2168 };
2169 EXPORT_SYMBOL(ipv4_specific);
2170 
2171 #ifdef CONFIG_TCP_MD5SIG
2172 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2173 	.md5_lookup		= tcp_v4_md5_lookup,
2174 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2175 	.md5_parse		= tcp_v4_parse_md5_keys,
2176 };
2177 #endif
2178 
2179 /* NOTE: A lot of things are set to zero explicitly by the call to
2180  *       sk_alloc(), so they need not be done here.
2181  */
2182 static int tcp_v4_init_sock(struct sock *sk)
2183 {
2184 	struct inet_connection_sock *icsk = inet_csk(sk);
2185 
2186 	tcp_init_sock(sk);
2187 
2188 	icsk->icsk_af_ops = &ipv4_specific;
2189 
2190 #ifdef CONFIG_TCP_MD5SIG
2191 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2192 #endif
2193 
2194 	return 0;
2195 }
2196 
2197 void tcp_v4_destroy_sock(struct sock *sk)
2198 {
2199 	struct tcp_sock *tp = tcp_sk(sk);
2200 
2201 	trace_tcp_destroy_sock(sk);
2202 
2203 	tcp_clear_xmit_timers(sk);
2204 
2205 	tcp_cleanup_congestion_control(sk);
2206 
2207 	tcp_cleanup_ulp(sk);
2208 
2209 	/* Clean up the write buffer. */
2210 	tcp_write_queue_purge(sk);
2211 
2212 	/* Check if we want to disable active TFO */
2213 	tcp_fastopen_active_disable_ofo_check(sk);
2214 
2215 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2216 	skb_rbtree_purge(&tp->out_of_order_queue);
2217 
2218 #ifdef CONFIG_TCP_MD5SIG
2219 	/* Clean up the MD5 key list, if any */
2220 	if (tp->md5sig_info) {
2221 		tcp_clear_md5_list(sk);
2222 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2223 		tp->md5sig_info = NULL;
2224 	}
2225 #endif
2226 
2227 	/* Clean up a referenced TCP bind bucket. */
2228 	if (inet_csk(sk)->icsk_bind_hash)
2229 		inet_put_port(sk);
2230 
2231 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2232 
2233 	/* If socket is aborted during connect operation */
2234 	tcp_free_fastopen_req(tp);
2235 	tcp_fastopen_destroy_cipher(sk);
2236 	tcp_saved_syn_free(tp);
2237 
2238 	sk_sockets_allocated_dec(sk);
2239 }
2240 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2241 
2242 #ifdef CONFIG_PROC_FS
2243 /* Proc filesystem TCP sock list dumping. */
2244 
2245 /*
2246  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2247  * starting from the bucket given in st->bucket; when st->bucket is zero the
2248  * very first socket in the hash table is returned.
2249  */
2250 static void *listening_get_next(struct seq_file *seq, void *cur)
2251 {
2252 	struct tcp_seq_afinfo *afinfo;
2253 	struct tcp_iter_state *st = seq->private;
2254 	struct net *net = seq_file_net(seq);
2255 	struct inet_listen_hashbucket *ilb;
2256 	struct hlist_nulls_node *node;
2257 	struct sock *sk = cur;
2258 
2259 	if (st->bpf_seq_afinfo)
2260 		afinfo = st->bpf_seq_afinfo;
2261 	else
2262 		afinfo = PDE_DATA(file_inode(seq->file));
2263 
2264 	if (!sk) {
2265 get_head:
2266 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2267 		spin_lock(&ilb->lock);
2268 		sk = sk_nulls_head(&ilb->nulls_head);
2269 		st->offset = 0;
2270 		goto get_sk;
2271 	}
2272 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2273 	++st->num;
2274 	++st->offset;
2275 
2276 	sk = sk_nulls_next(sk);
2277 get_sk:
2278 	sk_nulls_for_each_from(sk, node) {
2279 		if (!net_eq(sock_net(sk), net))
2280 			continue;
2281 		if (afinfo->family == AF_UNSPEC ||
2282 		    sk->sk_family == afinfo->family)
2283 			return sk;
2284 	}
2285 	spin_unlock(&ilb->lock);
2286 	st->offset = 0;
2287 	if (++st->bucket < INET_LHTABLE_SIZE)
2288 		goto get_head;
2289 	return NULL;
2290 }
2291 
2292 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2293 {
2294 	struct tcp_iter_state *st = seq->private;
2295 	void *rc;
2296 
2297 	st->bucket = 0;
2298 	st->offset = 0;
2299 	rc = listening_get_next(seq, NULL);
2300 
2301 	while (rc && *pos) {
2302 		rc = listening_get_next(seq, rc);
2303 		--*pos;
2304 	}
2305 	return rc;
2306 }
2307 
2308 static inline bool empty_bucket(const struct tcp_iter_state *st)
2309 {
2310 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2311 }
2312 
2313 /*
2314  * Get first established socket starting from bucket given in st->bucket.
2315  * If st->bucket is zero, the very first socket in the hash is returned.
2316  */
2317 static void *established_get_first(struct seq_file *seq)
2318 {
2319 	struct tcp_seq_afinfo *afinfo;
2320 	struct tcp_iter_state *st = seq->private;
2321 	struct net *net = seq_file_net(seq);
2322 	void *rc = NULL;
2323 
2324 	if (st->bpf_seq_afinfo)
2325 		afinfo = st->bpf_seq_afinfo;
2326 	else
2327 		afinfo = PDE_DATA(file_inode(seq->file));
2328 
2329 	st->offset = 0;
2330 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2331 		struct sock *sk;
2332 		struct hlist_nulls_node *node;
2333 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2334 
2335 		/* Lockless fast path for the common case of empty buckets */
2336 		if (empty_bucket(st))
2337 			continue;
2338 
2339 		spin_lock_bh(lock);
2340 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2341 			if ((afinfo->family != AF_UNSPEC &&
2342 			     sk->sk_family != afinfo->family) ||
2343 			    !net_eq(sock_net(sk), net)) {
2344 				continue;
2345 			}
2346 			rc = sk;
2347 			goto out;
2348 		}
2349 		spin_unlock_bh(lock);
2350 	}
2351 out:
2352 	return rc;
2353 }
2354 
2355 static void *established_get_next(struct seq_file *seq, void *cur)
2356 {
2357 	struct tcp_seq_afinfo *afinfo;
2358 	struct sock *sk = cur;
2359 	struct hlist_nulls_node *node;
2360 	struct tcp_iter_state *st = seq->private;
2361 	struct net *net = seq_file_net(seq);
2362 
2363 	if (st->bpf_seq_afinfo)
2364 		afinfo = st->bpf_seq_afinfo;
2365 	else
2366 		afinfo = PDE_DATA(file_inode(seq->file));
2367 
2368 	++st->num;
2369 	++st->offset;
2370 
2371 	sk = sk_nulls_next(sk);
2372 
2373 	sk_nulls_for_each_from(sk, node) {
2374 		if ((afinfo->family == AF_UNSPEC ||
2375 		     sk->sk_family == afinfo->family) &&
2376 		    net_eq(sock_net(sk), net))
2377 			return sk;
2378 	}
2379 
2380 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2381 	++st->bucket;
2382 	return established_get_first(seq);
2383 }
2384 
2385 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2386 {
2387 	struct tcp_iter_state *st = seq->private;
2388 	void *rc;
2389 
2390 	st->bucket = 0;
2391 	rc = established_get_first(seq);
2392 
2393 	while (rc && pos) {
2394 		rc = established_get_next(seq, rc);
2395 		--pos;
2396 	}
2397 	return rc;
2398 }
2399 
2400 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2401 {
2402 	void *rc;
2403 	struct tcp_iter_state *st = seq->private;
2404 
2405 	st->state = TCP_SEQ_STATE_LISTENING;
2406 	rc	  = listening_get_idx(seq, &pos);
2407 
2408 	if (!rc) {
2409 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2410 		rc	  = established_get_idx(seq, pos);
2411 	}
2412 
2413 	return rc;
2414 }
2415 
2416 static void *tcp_seek_last_pos(struct seq_file *seq)
2417 {
2418 	struct tcp_iter_state *st = seq->private;
2419 	int offset = st->offset;
2420 	int orig_num = st->num;
2421 	void *rc = NULL;
2422 
2423 	switch (st->state) {
2424 	case TCP_SEQ_STATE_LISTENING:
2425 		if (st->bucket >= INET_LHTABLE_SIZE)
2426 			break;
2427 		st->state = TCP_SEQ_STATE_LISTENING;
2428 		rc = listening_get_next(seq, NULL);
2429 		while (offset-- && rc)
2430 			rc = listening_get_next(seq, rc);
2431 		if (rc)
2432 			break;
2433 		st->bucket = 0;
2434 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2435 		fallthrough;
2436 	case TCP_SEQ_STATE_ESTABLISHED:
2437 		if (st->bucket > tcp_hashinfo.ehash_mask)
2438 			break;
2439 		rc = established_get_first(seq);
2440 		while (offset-- && rc)
2441 			rc = established_get_next(seq, rc);
2442 	}
2443 
2444 	st->num = orig_num;
2445 
2446 	return rc;
2447 }
2448 
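/* seq_file ->start(): if the requested position matches the position we
 * stopped at last time, resume from the remembered bucket/offset instead of
 * rescanning the hash tables from the beginning.
 */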
2449 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2450 {
2451 	struct tcp_iter_state *st = seq->private;
2452 	void *rc;
2453 
2454 	if (*pos && *pos == st->last_pos) {
2455 		rc = tcp_seek_last_pos(seq);
2456 		if (rc)
2457 			goto out;
2458 	}
2459 
2460 	st->state = TCP_SEQ_STATE_LISTENING;
2461 	st->num = 0;
2462 	st->bucket = 0;
2463 	st->offset = 0;
2464 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2465 
2466 out:
2467 	st->last_pos = *pos;
2468 	return rc;
2469 }
2470 EXPORT_SYMBOL(tcp_seq_start);
2471 
2472 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2473 {
2474 	struct tcp_iter_state *st = seq->private;
2475 	void *rc = NULL;
2476 
2477 	if (v == SEQ_START_TOKEN) {
2478 		rc = tcp_get_idx(seq, 0);
2479 		goto out;
2480 	}
2481 
2482 	switch (st->state) {
2483 	case TCP_SEQ_STATE_LISTENING:
2484 		rc = listening_get_next(seq, v);
2485 		if (!rc) {
2486 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2487 			st->bucket = 0;
2488 			st->offset = 0;
2489 			rc	  = established_get_first(seq);
2490 		}
2491 		break;
2492 	case TCP_SEQ_STATE_ESTABLISHED:
2493 		rc = established_get_next(seq, v);
2494 		break;
2495 	}
2496 out:
2497 	++*pos;
2498 	st->last_pos = *pos;
2499 	return rc;
2500 }
2501 EXPORT_SYMBOL(tcp_seq_next);
2502 
2503 void tcp_seq_stop(struct seq_file *seq, void *v)
2504 {
2505 	struct tcp_iter_state *st = seq->private;
2506 
2507 	switch (st->state) {
2508 	case TCP_SEQ_STATE_LISTENING:
2509 		if (v != SEQ_START_TOKEN)
2510 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2511 		break;
2512 	case TCP_SEQ_STATE_ESTABLISHED:
2513 		if (v)
2514 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2515 		break;
2516 	}
2517 }
2518 EXPORT_SYMBOL(tcp_seq_stop);
2519 
2520 static void get_openreq4(const struct request_sock *req,
2521 			 struct seq_file *f, int i)
2522 {
2523 	const struct inet_request_sock *ireq = inet_rsk(req);
2524 	long delta = req->rsk_timer.expires - jiffies;
2525 
2526 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2527 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2528 		i,
2529 		ireq->ir_loc_addr,
2530 		ireq->ir_num,
2531 		ireq->ir_rmt_addr,
2532 		ntohs(ireq->ir_rmt_port),
2533 		TCP_SYN_RECV,
2534 		0, 0, /* could print option size, but that is af dependent. */
2535 		1,    /* timers active (only the expire timer) */
2536 		jiffies_delta_to_clock_t(delta),
2537 		req->num_timeout,
2538 		from_kuid_munged(seq_user_ns(f),
2539 				 sock_i_uid(req->rsk_listener)),
2540 		0,  /* non standard timer */
2541 		0, /* open_requests have no inode */
2542 		0,
2543 		req);
2544 }
2545 
2546 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2547 {
2548 	int timer_active;
2549 	unsigned long timer_expires;
2550 	const struct tcp_sock *tp = tcp_sk(sk);
2551 	const struct inet_connection_sock *icsk = inet_csk(sk);
2552 	const struct inet_sock *inet = inet_sk(sk);
2553 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2554 	__be32 dest = inet->inet_daddr;
2555 	__be32 src = inet->inet_rcv_saddr;
2556 	__u16 destp = ntohs(inet->inet_dport);
2557 	__u16 srcp = ntohs(inet->inet_sport);
2558 	int rx_queue;
2559 	int state;
2560 
2561 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2562 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2563 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2564 		timer_active	= 1;
2565 		timer_expires	= icsk->icsk_timeout;
2566 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2567 		timer_active	= 4;
2568 		timer_expires	= icsk->icsk_timeout;
2569 	} else if (timer_pending(&sk->sk_timer)) {
2570 		timer_active	= 2;
2571 		timer_expires	= sk->sk_timer.expires;
2572 	} else {
2573 		timer_active	= 0;
2574 		timer_expires = jiffies;
2575 	}
2576 
2577 	state = inet_sk_state_load(sk);
2578 	if (state == TCP_LISTEN)
2579 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2580 	else
2581 		/* Because we don't lock the socket,
2582 		 * we might find a transient negative value.
2583 		 */
2584 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2585 				      READ_ONCE(tp->copied_seq), 0);
2586 
2587 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2588 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2589 		i, src, srcp, dest, destp, state,
2590 		READ_ONCE(tp->write_seq) - tp->snd_una,
2591 		rx_queue,
2592 		timer_active,
2593 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2594 		icsk->icsk_retransmits,
2595 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2596 		icsk->icsk_probes_out,
2597 		sock_i_ino(sk),
2598 		refcount_read(&sk->sk_refcnt), sk,
2599 		jiffies_to_clock_t(icsk->icsk_rto),
2600 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2601 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2602 		tp->snd_cwnd,
2603 		state == TCP_LISTEN ?
2604 		    fastopenq->max_qlen :
2605 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2606 }
2607 
2608 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2609 			       struct seq_file *f, int i)
2610 {
2611 	long delta = tw->tw_timer.expires - jiffies;
2612 	__be32 dest, src;
2613 	__u16 destp, srcp;
2614 
2615 	dest  = tw->tw_daddr;
2616 	src   = tw->tw_rcv_saddr;
2617 	destp = ntohs(tw->tw_dport);
2618 	srcp  = ntohs(tw->tw_sport);
2619 
2620 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2621 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2622 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2623 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2624 		refcount_read(&tw->tw_refcnt), tw);
2625 }
2626 
2627 #define TMPSZ 150
2628 
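/* Emit one line of /proc/net/tcp.  A line looks roughly like this
 * (illustrative values only: a listening socket on 127.0.0.1:8080, uid 1000):
 *
 *   0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses and ports are hex; the address is the raw network-byte-order
 * word, so 127.0.0.1 shows up as 0100007F on little-endian hosts.  The
 * trailing fields come from get_tcp4_sock() above.
 */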
2629 static int tcp4_seq_show(struct seq_file *seq, void *v)
2630 {
2631 	struct tcp_iter_state *st;
2632 	struct sock *sk = v;
2633 
2634 	seq_setwidth(seq, TMPSZ - 1);
2635 	if (v == SEQ_START_TOKEN) {
2636 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2637 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2638 			   "inode");
2639 		goto out;
2640 	}
2641 	st = seq->private;
2642 
2643 	if (sk->sk_state == TCP_TIME_WAIT)
2644 		get_timewait4_sock(v, seq, st->num);
2645 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2646 		get_openreq4(v, seq, st->num);
2647 	else
2648 		get_tcp4_sock(v, seq, st->num);
2649 out:
2650 	seq_pad(seq, '\n');
2651 	return 0;
2652 }
2653 
2654 #ifdef CONFIG_BPF_SYSCALL
2655 struct bpf_iter__tcp {
2656 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2657 	__bpf_md_ptr(struct sock_common *, sk_common);
2658 	uid_t uid __aligned(8);
2659 };
2660 
2661 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2662 			     struct sock_common *sk_common, uid_t uid)
2663 {
2664 	struct bpf_iter__tcp ctx;
2665 
2666 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2667 	ctx.meta = meta;
2668 	ctx.sk_common = sk_common;
2669 	ctx.uid = uid;
2670 	return bpf_iter_run_prog(prog, &ctx);
2671 }
2672 
2673 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2674 {
2675 	struct bpf_iter_meta meta;
2676 	struct bpf_prog *prog;
2677 	struct sock *sk = v;
2678 	uid_t uid;
2679 
2680 	if (v == SEQ_START_TOKEN)
2681 		return 0;
2682 
2683 	if (sk->sk_state == TCP_TIME_WAIT) {
2684 		uid = 0;
2685 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2686 		const struct request_sock *req = v;
2687 
2688 		uid = from_kuid_munged(seq_user_ns(seq),
2689 				       sock_i_uid(req->rsk_listener));
2690 	} else {
2691 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2692 	}
2693 
2694 	meta.seq = seq;
2695 	prog = bpf_iter_get_info(&meta, false);
2696 	return tcp_prog_seq_show(prog, &meta, v, uid);
2697 }
2698 
2699 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2700 {
2701 	struct bpf_iter_meta meta;
2702 	struct bpf_prog *prog;
2703 
2704 	if (!v) {
2705 		meta.seq = seq;
2706 		prog = bpf_iter_get_info(&meta, true);
2707 		if (prog)
2708 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2709 	}
2710 
2711 	tcp_seq_stop(seq, v);
2712 }
2713 
2714 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2715 	.show		= bpf_iter_tcp_seq_show,
2716 	.start		= tcp_seq_start,
2717 	.next		= tcp_seq_next,
2718 	.stop		= bpf_iter_tcp_seq_stop,
2719 };
2720 #endif
2721 
2722 static const struct seq_operations tcp4_seq_ops = {
2723 	.show		= tcp4_seq_show,
2724 	.start		= tcp_seq_start,
2725 	.next		= tcp_seq_next,
2726 	.stop		= tcp_seq_stop,
2727 };
2728 
2729 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2730 	.family		= AF_INET,
2731 };
2732 
2733 static int __net_init tcp4_proc_init_net(struct net *net)
2734 {
2735 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2736 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2737 		return -ENOMEM;
2738 	return 0;
2739 }
2740 
2741 static void __net_exit tcp4_proc_exit_net(struct net *net)
2742 {
2743 	remove_proc_entry("tcp", net->proc_net);
2744 }
2745 
2746 static struct pernet_operations tcp4_net_ops = {
2747 	.init = tcp4_proc_init_net,
2748 	.exit = tcp4_proc_exit_net,
2749 };
2750 
2751 int __init tcp4_proc_init(void)
2752 {
2753 	return register_pernet_subsys(&tcp4_net_ops);
2754 }
2755 
2756 void tcp4_proc_exit(void)
2757 {
2758 	unregister_pernet_subsys(&tcp4_net_ops);
2759 }
2760 #endif /* CONFIG_PROC_FS */
2761 
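/* Protocol method table wired up for AF_INET/SOCK_STREAM sockets; inet
 * socket creation resolves "TCP" to this struct proto.
 */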
2762 struct proto tcp_prot = {
2763 	.name			= "TCP",
2764 	.owner			= THIS_MODULE,
2765 	.close			= tcp_close,
2766 	.pre_connect		= tcp_v4_pre_connect,
2767 	.connect		= tcp_v4_connect,
2768 	.disconnect		= tcp_disconnect,
2769 	.accept			= inet_csk_accept,
2770 	.ioctl			= tcp_ioctl,
2771 	.init			= tcp_v4_init_sock,
2772 	.destroy		= tcp_v4_destroy_sock,
2773 	.shutdown		= tcp_shutdown,
2774 	.setsockopt		= tcp_setsockopt,
2775 	.getsockopt		= tcp_getsockopt,
2776 	.keepalive		= tcp_set_keepalive,
2777 	.recvmsg		= tcp_recvmsg,
2778 	.sendmsg		= tcp_sendmsg,
2779 	.sendpage		= tcp_sendpage,
2780 	.backlog_rcv		= tcp_v4_do_rcv,
2781 	.release_cb		= tcp_release_cb,
2782 	.hash			= inet_hash,
2783 	.unhash			= inet_unhash,
2784 	.get_port		= inet_csk_get_port,
2785 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2786 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2787 	.stream_memory_free	= tcp_stream_memory_free,
2788 	.sockets_allocated	= &tcp_sockets_allocated,
2789 	.orphan_count		= &tcp_orphan_count,
2790 	.memory_allocated	= &tcp_memory_allocated,
2791 	.memory_pressure	= &tcp_memory_pressure,
2792 	.sysctl_mem		= sysctl_tcp_mem,
2793 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2794 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2795 	.max_header		= MAX_TCP_HEADER,
2796 	.obj_size		= sizeof(struct tcp_sock),
2797 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2798 	.twsk_prot		= &tcp_timewait_sock_ops,
2799 	.rsk_prot		= &tcp_request_sock_ops,
2800 	.h.hashinfo		= &tcp_hashinfo,
2801 	.no_autobind		= true,
2802 	.diag_destroy		= tcp_abort,
2803 };
2804 EXPORT_SYMBOL(tcp_prot);
2805 
2806 static void __net_exit tcp_sk_exit(struct net *net)
2807 {
2808 	int cpu;
2809 
2810 	if (net->ipv4.tcp_congestion_control)
2811 		bpf_module_put(net->ipv4.tcp_congestion_control,
2812 			       net->ipv4.tcp_congestion_control->owner);
2813 
2814 	for_each_possible_cpu(cpu)
2815 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2816 	free_percpu(net->ipv4.tcp_sk);
2817 }
2818 
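/* Per-netns TCP setup: create one control socket per possible CPU (used for
 * sending RSTs and ACKs on behalf of no-socket and TIME_WAIT paths) and
 * install the default values for this namespace's tcp_* sysctls.
 */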
2819 static int __net_init tcp_sk_init(struct net *net)
2820 {
2821 	int res, cpu, cnt;
2822 
2823 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2824 	if (!net->ipv4.tcp_sk)
2825 		return -ENOMEM;
2826 
2827 	for_each_possible_cpu(cpu) {
2828 		struct sock *sk;
2829 
2830 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2831 					   IPPROTO_TCP, net);
2832 		if (res)
2833 			goto fail;
2834 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2835 
2836 		/* Please enforce IP_DF and IPID==0 for RST and
2837 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2838 		 */
2839 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2840 
2841 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2842 	}
2843 
2844 	net->ipv4.sysctl_tcp_ecn = 2;
2845 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2846 
2847 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2848 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2849 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2850 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2851 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2852 
2853 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2854 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2855 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2856 
2857 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2858 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2859 	net->ipv4.sysctl_tcp_syncookies = 1;
2860 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2861 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2862 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2863 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2864 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2865 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2866 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2867 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2868 
2869 	cnt = tcp_hashinfo.ehash_mask + 1;
2870 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2871 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2872 
2873 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2874 	net->ipv4.sysctl_tcp_sack = 1;
2875 	net->ipv4.sysctl_tcp_window_scaling = 1;
2876 	net->ipv4.sysctl_tcp_timestamps = 1;
2877 	net->ipv4.sysctl_tcp_early_retrans = 3;
2878 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2879 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2880 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2881 	net->ipv4.sysctl_tcp_max_reordering = 300;
2882 	net->ipv4.sysctl_tcp_dsack = 1;
2883 	net->ipv4.sysctl_tcp_app_win = 31;
2884 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2885 	net->ipv4.sysctl_tcp_frto = 2;
2886 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2887 	/* This limits the percentage of the congestion window which we
2888 	 * will allow a single TSO frame to consume.  Building TSO frames
2889 	 * which are too large can cause TCP streams to be bursty.
2890 	 */
2891 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2892 	/* Default TSQ limit of 16 TSO segments */
2893 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2894 	/* rfc5961 challenge ack rate limiting */
2895 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2896 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2897 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2898 	net->ipv4.sysctl_tcp_autocorking = 1;
2899 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2900 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2901 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2902 	if (net != &init_net) {
2903 		memcpy(net->ipv4.sysctl_tcp_rmem,
2904 		       init_net.ipv4.sysctl_tcp_rmem,
2905 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2906 		memcpy(net->ipv4.sysctl_tcp_wmem,
2907 		       init_net.ipv4.sysctl_tcp_wmem,
2908 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2909 	}
2910 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2911 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2912 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2913 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2914 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2915 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2916 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2917 
2918 	/* Reno is always built in */
2919 	if (!net_eq(net, &init_net) &&
2920 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2921 			       init_net.ipv4.tcp_congestion_control->owner))
2922 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2923 	else
2924 		net->ipv4.tcp_congestion_control = &tcp_reno;
2925 
2926 	return 0;
2927 fail:
2928 	tcp_sk_exit(net);
2929 
2930 	return res;
2931 }
2932 
2933 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2934 {
2935 	struct net *net;
2936 
2937 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2938 
2939 	list_for_each_entry(net, net_exit_list, exit_list)
2940 		tcp_fastopen_ctx_destroy(net);
2941 }
2942 
2943 static struct pernet_operations __net_initdata tcp_sk_ops = {
2944        .init	   = tcp_sk_init,
2945        .exit	   = tcp_sk_exit,
2946        .exit_batch = tcp_sk_exit_batch,
2947 };
2948 
2949 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2950 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2951 		     struct sock_common *sk_common, uid_t uid)
2952 
2953 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2954 {
2955 	struct tcp_iter_state *st = priv_data;
2956 	struct tcp_seq_afinfo *afinfo;
2957 	int ret;
2958 
2959 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2960 	if (!afinfo)
2961 		return -ENOMEM;
2962 
2963 	afinfo->family = AF_UNSPEC;
2964 	st->bpf_seq_afinfo = afinfo;
2965 	ret = bpf_iter_init_seq_net(priv_data, aux);
2966 	if (ret)
2967 		kfree(afinfo);
2968 	return ret;
2969 }
2970 
2971 static void bpf_iter_fini_tcp(void *priv_data)
2972 {
2973 	struct tcp_iter_state *st = priv_data;
2974 
2975 	kfree(st->bpf_seq_afinfo);
2976 	bpf_iter_fini_seq_net(priv_data);
2977 }
2978 
2979 static const struct bpf_iter_seq_info tcp_seq_info = {
2980 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2981 	.init_seq_private	= bpf_iter_init_tcp,
2982 	.fini_seq_private	= bpf_iter_fini_tcp,
2983 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2984 };
2985 
2986 static struct bpf_iter_reg tcp_reg_info = {
2987 	.target			= "tcp",
2988 	.ctx_arg_info_size	= 1,
2989 	.ctx_arg_info		= {
2990 		{ offsetof(struct bpf_iter__tcp, sk_common),
2991 		  PTR_TO_BTF_ID_OR_NULL },
2992 	},
2993 	.seq_info		= &tcp_seq_info,
2994 };
2995 
2996 static void __init bpf_iter_register(void)
2997 {
2998 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2999 	if (bpf_iter_reg_target(&tcp_reg_info))
3000 		pr_warn("Warning: could not register bpf iterator tcp\n");
3001 }
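
/* A minimal sketch of a BPF iterator program using this "tcp" target
 * (illustrative only; it assumes current libbpf conventions and a
 * hypothetical dump_tcp() program name):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (skc)
 *			BPF_SEQ_PRINTF(ctx->meta->seq, "family %d uid %u\n",
 *				       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */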
3002 
3003 #endif
3004 
3005 void __init tcp_v4_init(void)
3006 {
3007 	if (register_pernet_subsys(&tcp_sk_ops))
3008 		panic("Failed to create the TCP control socket.\n");
3009 
3010 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3011 	bpf_iter_register();
3012 #endif
3013 }
3014