1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
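/* Generate the initial sequence number for our SYN-ACK, keyed on the
 * address/port 4-tuple of the incoming SYN (see secure_tcp_seq()).
 */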
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
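/* Per-connection timestamp offset derived from the address pair, so TCP
 * timestamps do not expose a single global clock (see secure_tcp_ts_off()).
 */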
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
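/* Decide whether a TIME-WAIT socket may be safely reused for a new outgoing
 * connection with the same 4-tuple (controlled by sysctl_tcp_tw_reuse).
 */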
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
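			/* Start well past the old connection's send window so
			 * segments of the two incarnations cannot be confused.
			 */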
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and, without releasing the socket
271 	 * lock, select a source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
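	/* Randomize the starting IP identification value for this socket. */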
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to go wrong... Remember the soft error
351 	 * in case this connection is not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
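/* Propagate an ICMP redirect to the cached route of this socket. */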
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of the PMTU discovery (RFC1191) special case:
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			WRITE_ONCE(tp->mtu_info, info);
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* check if this ICMP message allows revert of backoff.
561 		 * (see RFC 6069)
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by pmtu discovery).
603 	 *
604 	 * Note that in the modern internet, where routing is unreliable
605 	 * and broken firewalls sit in every dark corner sending random
606 	 * errors ordered by their masters, even these two messages finally
607 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
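/* Set up a partial checksum: seed th->check with the pseudo-header sum and
 * record where the full checksum must later be folded in (CHECKSUM_PARTIAL).
 */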
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649  *		      for the reset?
650  *	Answer: if a packet caused the RST, it is not for a socket
651  *		existing in our system; if it is matched to a socket,
652  *		it is just a duplicate segment or a bug in the other side's TCP.
653  *	So we build the reply based only on the parameters that
654  *		arrived with the segment.
655  *	Exception: precedence violation. We do not implement it in any case.
656  */
657 
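/* Room reserved in the RST reply for either an MD5 signature option or an
 * MPTCP reset option.
 */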
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660 #else
661 #define OPTION_BYTES sizeof(__be32)
662 #endif
663 
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665 {
666 	const struct tcphdr *th = tcp_hdr(skb);
667 	struct {
668 		struct tcphdr th;
669 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
670 	} rep;
671 	struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673 	struct tcp_md5sig_key *key = NULL;
674 	const __u8 *hash_location = NULL;
675 	unsigned char newhash[16];
676 	int genhash;
677 	struct sock *sk1 = NULL;
678 #endif
679 	u64 transmit_time = 0;
680 	struct sock *ctl_sk;
681 	struct net *net;
682 
683 	/* Never send a reset in response to a reset. */
684 	if (th->rst)
685 		return;
686 
687 	/* If sk is not NULL, it means we did a successful lookup and the
688 	 * incoming route had to be correct. prequeue might have dropped our dst.
689 	 */
690 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 		return;
692 
693 	/* Swap the send and the receive. */
694 	memset(&rep, 0, sizeof(rep));
695 	rep.th.dest   = th->source;
696 	rep.th.source = th->dest;
697 	rep.th.doff   = sizeof(struct tcphdr) / 4;
698 	rep.th.rst    = 1;
699 
700 	if (th->ack) {
701 		rep.th.seq = th->ack_seq;
702 	} else {
703 		rep.th.ack = 1;
704 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 				       skb->len - (th->doff << 2));
706 	}
707 
708 	memset(&arg, 0, sizeof(arg));
709 	arg.iov[0].iov_base = (unsigned char *)&rep;
710 	arg.iov[0].iov_len  = sizeof(rep.th);
711 
712 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714 	rcu_read_lock();
715 	hash_location = tcp_parse_md5sig_option(th);
716 	if (sk && sk_fullsock(sk)) {
717 		const union tcp_md5_addr *addr;
718 		int l3index;
719 
720 		/* sdif set, means packet ingressed via a device
721 		 * in an L3 domain and inet_iif is set to it.
722 		 */
723 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 	} else if (hash_location) {
727 		const union tcp_md5_addr *addr;
728 		int sdif = tcp_v4_sdif(skb);
729 		int dif = inet_iif(skb);
730 		int l3index;
731 
732 		/*
733 		 * The active side is lost. Try to find the listening socket through
734 		 * the source port, and then find the md5 key through the listening socket.
735 		 * We do not lose security here:
736 		 * the incoming packet is checked against the md5 hash of the found key;
737 		 * no RST is generated if the md5 hash doesn't match.
738 		 */
739 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 					     ip_hdr(skb)->saddr,
741 					     th->source, ip_hdr(skb)->daddr,
742 					     ntohs(th->source), dif, sdif);
743 		/* don't send rst if it can't find key */
744 		if (!sk1)
745 			goto out;
746 
747 		/* sdif set, means packet ingressed via a device
748 		 * in an L3 domain and dif is set to it.
749 		 */
750 		l3index = sdif ? dif : 0;
751 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 		if (!key)
754 			goto out;
755 
756 
757 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
759 			goto out;
760 
761 	}
762 
763 	if (key) {
764 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 				   (TCPOPT_NOP << 16) |
766 				   (TCPOPT_MD5SIG << 8) |
767 				   TCPOLEN_MD5SIG);
768 		/* Update length and the length the header thinks exists */
769 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 		rep.th.doff = arg.iov[0].iov_len / 4;
771 
772 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 				     key, ip_hdr(skb)->saddr,
774 				     ip_hdr(skb)->daddr, &rep.th);
775 	}
776 #endif
777 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 	if (rep.opt[0] == 0) {
779 		__be32 mrst = mptcp_reset_option(skb);
780 
781 		if (mrst) {
782 			rep.opt[0] = mrst;
783 			arg.iov[0].iov_len += sizeof(mrst);
784 			rep.th.doff = arg.iov[0].iov_len / 4;
785 		}
786 	}
787 
788 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 				      ip_hdr(skb)->saddr, /* XXX */
790 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793 
794 	/* When the socket is gone, all binding information is lost and
795 	 * routing might fail in this case. No choice here: if we choose to force
796 	 * the input interface, we will misroute in case of an asymmetric route.
797 	 */
798 	if (sk) {
799 		arg.bound_dev_if = sk->sk_bound_dev_if;
800 		if (sk_fullsock(sk))
801 			trace_tcp_send_reset(sk, skb);
802 	}
803 
804 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806 
807 	arg.tos = ip_hdr(skb)->tos;
808 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809 	local_bh_disable();
810 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811 	if (sk) {
812 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
814 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
816 		transmit_time = tcp_transmit_time(sk);
817 	}
818 	ip_send_unicast_reply(ctl_sk,
819 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 			      &arg, arg.iov[0].iov_len,
822 			      transmit_time);
823 
824 	ctl_sk->sk_mark = 0;
825 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827 	local_bh_enable();
828 
829 #ifdef CONFIG_TCP_MD5SIG
830 out:
831 	rcu_read_unlock();
832 #endif
833 }
834 
835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836    outside of socket context, is admittedly ugly. What can I do?
837  */
838 
839 static void tcp_v4_send_ack(const struct sock *sk,
840 			    struct sk_buff *skb, u32 seq, u32 ack,
841 			    u32 win, u32 tsval, u32 tsecr, int oif,
842 			    struct tcp_md5sig_key *key,
843 			    int reply_flags, u8 tos)
844 {
845 	const struct tcphdr *th = tcp_hdr(skb);
846 	struct {
847 		struct tcphdr th;
848 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851 #endif
852 			];
853 	} rep;
854 	struct net *net = sock_net(sk);
855 	struct ip_reply_arg arg;
856 	struct sock *ctl_sk;
857 	u64 transmit_time;
858 
859 	memset(&rep.th, 0, sizeof(struct tcphdr));
860 	memset(&arg, 0, sizeof(arg));
861 
862 	arg.iov[0].iov_base = (unsigned char *)&rep;
863 	arg.iov[0].iov_len  = sizeof(rep.th);
864 	if (tsecr) {
865 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 				   (TCPOPT_TIMESTAMP << 8) |
867 				   TCPOLEN_TIMESTAMP);
868 		rep.opt[1] = htonl(tsval);
869 		rep.opt[2] = htonl(tsecr);
870 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871 	}
872 
873 	/* Swap the send and the receive. */
874 	rep.th.dest    = th->source;
875 	rep.th.source  = th->dest;
876 	rep.th.doff    = arg.iov[0].iov_len / 4;
877 	rep.th.seq     = htonl(seq);
878 	rep.th.ack_seq = htonl(ack);
879 	rep.th.ack     = 1;
880 	rep.th.window  = htons(win);
881 
882 #ifdef CONFIG_TCP_MD5SIG
883 	if (key) {
884 		int offset = (tsecr) ? 3 : 0;
885 
886 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887 					  (TCPOPT_NOP << 16) |
888 					  (TCPOPT_MD5SIG << 8) |
889 					  TCPOLEN_MD5SIG);
890 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 		rep.th.doff = arg.iov[0].iov_len/4;
892 
893 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894 				    key, ip_hdr(skb)->saddr,
895 				    ip_hdr(skb)->daddr, &rep.th);
896 	}
897 #endif
898 	arg.flags = reply_flags;
899 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 				      ip_hdr(skb)->saddr, /* XXX */
901 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903 	if (oif)
904 		arg.bound_dev_if = oif;
905 	arg.tos = tos;
906 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907 	local_bh_disable();
908 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
911 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
913 	transmit_time = tcp_transmit_time(sk);
914 	ip_send_unicast_reply(ctl_sk,
915 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 			      &arg, arg.iov[0].iov_len,
918 			      transmit_time);
919 
920 	ctl_sk->sk_mark = 0;
921 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922 	local_bh_enable();
923 }
924 
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926 {
927 	struct inet_timewait_sock *tw = inet_twsk(sk);
928 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929 
930 	tcp_v4_send_ack(sk, skb,
931 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934 			tcptw->tw_ts_recent,
935 			tw->tw_bound_dev_if,
936 			tcp_twsk_md5_key(tcptw),
937 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938 			tw->tw_tos
939 			);
940 
941 	inet_twsk_put(tw);
942 }
943 
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945 				  struct request_sock *req)
946 {
947 	const union tcp_md5_addr *addr;
948 	int l3index;
949 
950 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952 	 */
953 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954 					     tcp_sk(sk)->snd_nxt;
955 
956 	/* RFC 7323 2.3
957 	 * The window field (SEG.WND) of every outgoing segment, with the
958 	 * exception of <SYN> segments, MUST be right-shifted by
959 	 * Rcv.Wind.Shift bits:
960 	 */
961 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963 	tcp_v4_send_ack(sk, skb, seq,
964 			tcp_rsk(req)->rcv_nxt,
965 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967 			req->ts_recent,
968 			0,
969 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971 			ip_hdr(skb)->tos);
972 }
973 
974 /*
975  *	Send a SYN-ACK after having received a SYN.
976  *	This still operates on a request_sock only, not on a big
977  *	socket.
978  */
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980 			      struct flowi *fl,
981 			      struct request_sock *req,
982 			      struct tcp_fastopen_cookie *foc,
983 			      enum tcp_synack_type synack_type,
984 			      struct sk_buff *syn_skb)
985 {
986 	const struct inet_request_sock *ireq = inet_rsk(req);
987 	struct flowi4 fl4;
988 	int err = -1;
989 	struct sk_buff *skb;
990 	u8 tos;
991 
992 	/* First, grab a route. */
993 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994 		return -1;
995 
996 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997 
998 	if (skb) {
999 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000 
1001 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1004 				inet_sk(sk)->tos;
1005 
1006 		if (!INET_ECN_is_capable(tos) &&
1007 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 			tos |= INET_ECN_ECT_0;
1009 
1010 		rcu_read_lock();
1011 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012 					    ireq->ir_rmt_addr,
1013 					    rcu_dereference(ireq->ireq_opt),
1014 					    tos);
1015 		rcu_read_unlock();
1016 		err = net_xmit_eval(err);
1017 	}
1018 
1019 	return err;
1020 }
1021 
1022 /*
1023  *	IPv4 request_sock destructor.
1024  */
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026 {
1027 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028 }
1029 
1030 #ifdef CONFIG_TCP_MD5SIG
1031 /*
1032  * RFC2385 MD5 checksumming requires a mapping of
1033  * IP address->MD5 Key.
1034  * We need to maintain these in the sk structure.
1035  */
1036 
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1039 
1040 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1041 {
1042 	if (!old)
1043 		return true;
1044 
1045 	/* l3index always overrides non-l3index */
1046 	if (old->l3index && new->l3index == 0)
1047 		return false;
1048 	if (old->l3index == 0 && new->l3index)
1049 		return true;
1050 
1051 	return old->prefixlen < new->prefixlen;
1052 }
1053 
1054 /* Find the Key structure for an address.  */
1055 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056 					   const union tcp_md5_addr *addr,
1057 					   int family)
1058 {
1059 	const struct tcp_sock *tp = tcp_sk(sk);
1060 	struct tcp_md5sig_key *key;
1061 	const struct tcp_md5sig_info *md5sig;
1062 	__be32 mask;
1063 	struct tcp_md5sig_key *best_match = NULL;
1064 	bool match;
1065 
1066 	/* caller either holds rcu_read_lock() or socket lock */
1067 	md5sig = rcu_dereference_check(tp->md5sig_info,
1068 				       lockdep_sock_is_held(sk));
1069 	if (!md5sig)
1070 		return NULL;
1071 
1072 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073 				 lockdep_sock_is_held(sk)) {
1074 		if (key->family != family)
1075 			continue;
1076 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1077 			continue;
1078 		if (family == AF_INET) {
1079 			mask = inet_make_mask(key->prefixlen);
1080 			match = (key->addr.a4.s_addr & mask) ==
1081 				(addr->a4.s_addr & mask);
1082 #if IS_ENABLED(CONFIG_IPV6)
1083 		} else if (family == AF_INET6) {
1084 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1085 						  key->prefixlen);
1086 #endif
1087 		} else {
1088 			match = false;
1089 		}
1090 
1091 		if (match && better_md5_match(best_match, key))
1092 			best_match = key;
1093 	}
1094 	return best_match;
1095 }
1096 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1097 
1098 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099 						      const union tcp_md5_addr *addr,
1100 						      int family, u8 prefixlen,
1101 						      int l3index, u8 flags)
1102 {
1103 	const struct tcp_sock *tp = tcp_sk(sk);
1104 	struct tcp_md5sig_key *key;
1105 	unsigned int size = sizeof(struct in_addr);
1106 	const struct tcp_md5sig_info *md5sig;
1107 
1108 	/* caller either holds rcu_read_lock() or socket lock */
1109 	md5sig = rcu_dereference_check(tp->md5sig_info,
1110 				       lockdep_sock_is_held(sk));
1111 	if (!md5sig)
1112 		return NULL;
1113 #if IS_ENABLED(CONFIG_IPV6)
1114 	if (family == AF_INET6)
1115 		size = sizeof(struct in6_addr);
1116 #endif
1117 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118 				 lockdep_sock_is_held(sk)) {
1119 		if (key->family != family)
1120 			continue;
1121 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1122 			continue;
1123 		if (key->l3index != l3index)
1124 			continue;
1125 		if (!memcmp(&key->addr, addr, size) &&
1126 		    key->prefixlen == prefixlen)
1127 			return key;
1128 	}
1129 	return NULL;
1130 }
1131 
1132 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1133 					 const struct sock *addr_sk)
1134 {
1135 	const union tcp_md5_addr *addr;
1136 	int l3index;
1137 
1138 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139 						 addr_sk->sk_bound_dev_if);
1140 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1141 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142 }
1143 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1144 
1145 /* This can be called on a newly created socket, from other files */
1146 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1147 		   int family, u8 prefixlen, int l3index, u8 flags,
1148 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1149 {
1150 	/* Add Key to the list */
1151 	struct tcp_md5sig_key *key;
1152 	struct tcp_sock *tp = tcp_sk(sk);
1153 	struct tcp_md5sig_info *md5sig;
1154 
1155 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1156 	if (key) {
1157 		/* Pre-existing entry - just update that one.
1158 		 * Note that the key might be used concurrently.
1159 		 * data_race() is telling kcsan that we do not care about
1160 		 * key mismatches, since changing MD5 key on live flows
1161 		 * can lead to packet drops.
1162 		 */
1163 		data_race(memcpy(key->key, newkey, newkeylen));
1164 
1165 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1166 		 * Also note that a reader could catch new key->keylen value
1167 		 * but old key->key[], this is the reason we use __GFP_ZERO
1168 		 * at sock_kmalloc() time below these lines.
1169 		 */
1170 		WRITE_ONCE(key->keylen, newkeylen);
1171 
1172 		return 0;
1173 	}
1174 
1175 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1176 					   lockdep_sock_is_held(sk));
1177 	if (!md5sig) {
1178 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1179 		if (!md5sig)
1180 			return -ENOMEM;
1181 
1182 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183 		INIT_HLIST_HEAD(&md5sig->head);
1184 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1185 	}
1186 
1187 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1188 	if (!key)
1189 		return -ENOMEM;
1190 	if (!tcp_alloc_md5sig_pool()) {
1191 		sock_kfree_s(sk, key, sizeof(*key));
1192 		return -ENOMEM;
1193 	}
1194 
1195 	memcpy(key->key, newkey, newkeylen);
1196 	key->keylen = newkeylen;
1197 	key->family = family;
1198 	key->prefixlen = prefixlen;
1199 	key->l3index = l3index;
1200 	key->flags = flags;
1201 	memcpy(&key->addr, addr,
1202 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1203 				      sizeof(struct in_addr));
1204 	hlist_add_head_rcu(&key->node, &md5sig->head);
1205 	return 0;
1206 }
1207 EXPORT_SYMBOL(tcp_md5_do_add);
1208 
1209 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1210 		   u8 prefixlen, int l3index, u8 flags)
1211 {
1212 	struct tcp_md5sig_key *key;
1213 
1214 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1215 	if (!key)
1216 		return -ENOENT;
1217 	hlist_del_rcu(&key->node);
1218 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1219 	kfree_rcu(key, rcu);
1220 	return 0;
1221 }
1222 EXPORT_SYMBOL(tcp_md5_do_del);
1223 
1224 static void tcp_clear_md5_list(struct sock *sk)
1225 {
1226 	struct tcp_sock *tp = tcp_sk(sk);
1227 	struct tcp_md5sig_key *key;
1228 	struct hlist_node *n;
1229 	struct tcp_md5sig_info *md5sig;
1230 
1231 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1232 
1233 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1234 		hlist_del_rcu(&key->node);
1235 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1236 		kfree_rcu(key, rcu);
1237 	}
1238 }
1239 
1240 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1241 				 sockptr_t optval, int optlen)
1242 {
1243 	struct tcp_md5sig cmd;
1244 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1245 	const union tcp_md5_addr *addr;
1246 	u8 prefixlen = 32;
1247 	int l3index = 0;
1248 	u8 flags;
1249 
1250 	if (optlen < sizeof(cmd))
1251 		return -EINVAL;
1252 
1253 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1254 		return -EFAULT;
1255 
1256 	if (sin->sin_family != AF_INET)
1257 		return -EINVAL;
1258 
1259 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1260 
1261 	if (optname == TCP_MD5SIG_EXT &&
1262 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1263 		prefixlen = cmd.tcpm_prefixlen;
1264 		if (prefixlen > 32)
1265 			return -EINVAL;
1266 	}
1267 
1268 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1269 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1270 		struct net_device *dev;
1271 
1272 		rcu_read_lock();
1273 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1274 		if (dev && netif_is_l3_master(dev))
1275 			l3index = dev->ifindex;
1276 
1277 		rcu_read_unlock();
1278 
1279 		/* ok to reference set/not set outside of rcu;
1280 		 * right now device MUST be an L3 master
1281 		 */
1282 		if (!dev || !l3index)
1283 			return -EINVAL;
1284 	}
1285 
1286 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1287 
1288 	if (!cmd.tcpm_keylen)
1289 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1290 
1291 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1292 		return -EINVAL;
1293 
1294 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1295 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1296 }
1297 
1298 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1299 				   __be32 daddr, __be32 saddr,
1300 				   const struct tcphdr *th, int nbytes)
1301 {
1302 	struct tcp4_pseudohdr *bp;
1303 	struct scatterlist sg;
1304 	struct tcphdr *_th;
1305 
1306 	bp = hp->scratch;
1307 	bp->saddr = saddr;
1308 	bp->daddr = daddr;
1309 	bp->pad = 0;
1310 	bp->protocol = IPPROTO_TCP;
1311 	bp->len = cpu_to_be16(nbytes);
1312 
1313 	_th = (struct tcphdr *)(bp + 1);
1314 	memcpy(_th, th, sizeof(*th));
1315 	_th->check = 0;
1316 
1317 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1318 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1319 				sizeof(*bp) + sizeof(*th));
1320 	return crypto_ahash_update(hp->md5_req);
1321 }
1322 
1323 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1324 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1325 {
1326 	struct tcp_md5sig_pool *hp;
1327 	struct ahash_request *req;
1328 
1329 	hp = tcp_get_md5sig_pool();
1330 	if (!hp)
1331 		goto clear_hash_noput;
1332 	req = hp->md5_req;
1333 
1334 	if (crypto_ahash_init(req))
1335 		goto clear_hash;
1336 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1337 		goto clear_hash;
1338 	if (tcp_md5_hash_key(hp, key))
1339 		goto clear_hash;
1340 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341 	if (crypto_ahash_final(req))
1342 		goto clear_hash;
1343 
1344 	tcp_put_md5sig_pool();
1345 	return 0;
1346 
1347 clear_hash:
1348 	tcp_put_md5sig_pool();
1349 clear_hash_noput:
1350 	memset(md5_hash, 0, 16);
1351 	return 1;
1352 }
1353 
1354 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1355 			const struct sock *sk,
1356 			const struct sk_buff *skb)
1357 {
1358 	struct tcp_md5sig_pool *hp;
1359 	struct ahash_request *req;
1360 	const struct tcphdr *th = tcp_hdr(skb);
1361 	__be32 saddr, daddr;
1362 
1363 	if (sk) { /* valid for establish/request sockets */
1364 		saddr = sk->sk_rcv_saddr;
1365 		daddr = sk->sk_daddr;
1366 	} else {
1367 		const struct iphdr *iph = ip_hdr(skb);
1368 		saddr = iph->saddr;
1369 		daddr = iph->daddr;
1370 	}
1371 
1372 	hp = tcp_get_md5sig_pool();
1373 	if (!hp)
1374 		goto clear_hash_noput;
1375 	req = hp->md5_req;
1376 
1377 	if (crypto_ahash_init(req))
1378 		goto clear_hash;
1379 
1380 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1381 		goto clear_hash;
1382 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1383 		goto clear_hash;
1384 	if (tcp_md5_hash_key(hp, key))
1385 		goto clear_hash;
1386 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1387 	if (crypto_ahash_final(req))
1388 		goto clear_hash;
1389 
1390 	tcp_put_md5sig_pool();
1391 	return 0;
1392 
1393 clear_hash:
1394 	tcp_put_md5sig_pool();
1395 clear_hash_noput:
1396 	memset(md5_hash, 0, 16);
1397 	return 1;
1398 }
1399 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1400 
1401 #endif
1402 
1403 /* Called with rcu_read_lock() */
1404 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1405 				    const struct sk_buff *skb,
1406 				    int dif, int sdif)
1407 {
1408 #ifdef CONFIG_TCP_MD5SIG
1409 	/*
1410 	 * This gets called for each TCP segment that arrives
1411 	 * so we want to be efficient.
1412 	 * We have 3 drop cases:
1413 	 * o No MD5 hash and one expected.
1414 	 * o MD5 hash and we're not expecting one.
1415 	 * o MD5 hash and it's wrong.
1416 	 */
1417 	const __u8 *hash_location = NULL;
1418 	struct tcp_md5sig_key *hash_expected;
1419 	const struct iphdr *iph = ip_hdr(skb);
1420 	const struct tcphdr *th = tcp_hdr(skb);
1421 	const union tcp_md5_addr *addr;
1422 	unsigned char newhash[16];
1423 	int genhash, l3index;
1424 
1425 	/* sdif set, means packet ingressed via a device
1426 	 * in an L3 domain and dif is set to the l3mdev
1427 	 */
1428 	l3index = sdif ? dif : 0;
1429 
1430 	addr = (union tcp_md5_addr *)&iph->saddr;
1431 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1432 	hash_location = tcp_parse_md5sig_option(th);
1433 
1434 	/* We've parsed the options - do we have a hash? */
1435 	if (!hash_expected && !hash_location)
1436 		return false;
1437 
1438 	if (hash_expected && !hash_location) {
1439 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1440 		return true;
1441 	}
1442 
1443 	if (!hash_expected && hash_location) {
1444 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1445 		return true;
1446 	}
1447 
1448 	/* Okay, so this is hash_expected and hash_location -
1449 	 * so we need to calculate the checksum.
1450 	 */
1451 	genhash = tcp_v4_md5_hash_skb(newhash,
1452 				      hash_expected,
1453 				      NULL, skb);
1454 
1455 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1456 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1457 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1458 				     &iph->saddr, ntohs(th->source),
1459 				     &iph->daddr, ntohs(th->dest),
1460 				     genhash ? " tcp_v4_calc_md5_hash failed"
1461 				     : "", l3index);
1462 		return true;
1463 	}
1464 	return false;
1465 #endif
1466 	return false;
1467 }
1468 
1469 static void tcp_v4_init_req(struct request_sock *req,
1470 			    const struct sock *sk_listener,
1471 			    struct sk_buff *skb)
1472 {
1473 	struct inet_request_sock *ireq = inet_rsk(req);
1474 	struct net *net = sock_net(sk_listener);
1475 
1476 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1477 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1478 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1479 }
1480 
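/* Fill the request from the incoming SYN, run the LSM hook, then compute the
 * route the SYN-ACK will use.
 */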
1481 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1482 					  struct sk_buff *skb,
1483 					  struct flowi *fl,
1484 					  struct request_sock *req)
1485 {
1486 	tcp_v4_init_req(req, sk, skb);
1487 
1488 	if (security_inet_conn_request(sk, skb, req))
1489 		return NULL;
1490 
1491 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1492 }
1493 
1494 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1495 	.family		=	PF_INET,
1496 	.obj_size	=	sizeof(struct tcp_request_sock),
1497 	.rtx_syn_ack	=	tcp_rtx_synack,
1498 	.send_ack	=	tcp_v4_reqsk_send_ack,
1499 	.destructor	=	tcp_v4_reqsk_destructor,
1500 	.send_reset	=	tcp_v4_send_reset,
1501 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1502 };
1503 
1504 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1505 	.mss_clamp	=	TCP_MSS_DEFAULT,
1506 #ifdef CONFIG_TCP_MD5SIG
1507 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1508 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1509 #endif
1510 #ifdef CONFIG_SYN_COOKIES
1511 	.cookie_init_seq =	cookie_v4_init_sequence,
1512 #endif
1513 	.route_req	=	tcp_v4_route_req,
1514 	.init_seq	=	tcp_v4_init_seq,
1515 	.init_ts_off	=	tcp_v4_init_ts_off,
1516 	.send_synack	=	tcp_v4_send_synack,
1517 };
1518 
1519 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1520 {
1521 	/* Never answer SYNs sent to broadcast or multicast */
1522 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1523 		goto drop;
1524 
1525 	return tcp_conn_request(&tcp_request_sock_ops,
1526 				&tcp_request_sock_ipv4_ops, sk, skb);
1527 
1528 drop:
1529 	tcp_listendrop(sk);
1530 	return 0;
1531 }
1532 EXPORT_SYMBOL(tcp_v4_conn_request);
1533 
1534 
1535 /*
1536  * The three way handshake has completed - we got a valid synack -
1537  * now create the new socket.
1538  */
1539 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1540 				  struct request_sock *req,
1541 				  struct dst_entry *dst,
1542 				  struct request_sock *req_unhash,
1543 				  bool *own_req)
1544 {
1545 	struct inet_request_sock *ireq;
1546 	bool found_dup_sk = false;
1547 	struct inet_sock *newinet;
1548 	struct tcp_sock *newtp;
1549 	struct sock *newsk;
1550 #ifdef CONFIG_TCP_MD5SIG
1551 	const union tcp_md5_addr *addr;
1552 	struct tcp_md5sig_key *key;
1553 	int l3index;
1554 #endif
1555 	struct ip_options_rcu *inet_opt;
1556 
1557 	if (sk_acceptq_is_full(sk))
1558 		goto exit_overflow;
1559 
1560 	newsk = tcp_create_openreq_child(sk, req, skb);
1561 	if (!newsk)
1562 		goto exit_nonewsk;
1563 
1564 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1565 	inet_sk_rx_dst_set(newsk, skb);
1566 
1567 	newtp		      = tcp_sk(newsk);
1568 	newinet		      = inet_sk(newsk);
1569 	ireq		      = inet_rsk(req);
1570 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1571 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1572 	newsk->sk_bound_dev_if = ireq->ir_iif;
1573 	newinet->inet_saddr   = ireq->ir_loc_addr;
1574 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1575 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1576 	newinet->mc_index     = inet_iif(skb);
1577 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1578 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1579 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1580 	if (inet_opt)
1581 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1582 	newinet->inet_id = prandom_u32();
1583 
1584 	/* Set ToS of the new socket based upon the value of incoming SYN.
1585 	 * ECT bits are set later in tcp_init_transfer().
1586 	 */
1587 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1588 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1589 
1590 	if (!dst) {
1591 		dst = inet_csk_route_child_sock(sk, newsk, req);
1592 		if (!dst)
1593 			goto put_and_exit;
1594 	} else {
1595 		/* syncookie case : see end of cookie_v4_check() */
1596 	}
1597 	sk_setup_caps(newsk, dst);
1598 
1599 	tcp_ca_openreq_child(newsk, dst);
1600 
1601 	tcp_sync_mss(newsk, dst_mtu(dst));
1602 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1603 
1604 	tcp_initialize_rcv_mss(newsk);
1605 
1606 #ifdef CONFIG_TCP_MD5SIG
1607 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1608 	/* Copy over the MD5 key from the original socket */
1609 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1610 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1611 	if (key) {
1612 		/*
1613 		 * We're using one, so create a matching key
1614 		 * on the newsk structure. If we fail to get
1615 		 * memory, then we end up not copying the key
1616 		 * across. Shucks.
1617 		 */
1618 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1619 			       key->key, key->keylen, GFP_ATOMIC);
1620 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1621 	}
1622 #endif
1623 
1624 	if (__inet_inherit_port(sk, newsk) < 0)
1625 		goto put_and_exit;
1626 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1627 				       &found_dup_sk);
1628 	if (likely(*own_req)) {
1629 		tcp_move_syn(newtp, req);
1630 		ireq->ireq_opt = NULL;
1631 	} else {
1632 		newinet->inet_opt = NULL;
1633 
1634 		if (!req_unhash && found_dup_sk) {
1635 			/* This code path should only be executed in the
1636 			 * syncookie case.
1637 			 */
1638 			bh_unlock_sock(newsk);
1639 			sock_put(newsk);
1640 			newsk = NULL;
1641 		}
1642 	}
1643 	return newsk;
1644 
1645 exit_overflow:
1646 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1647 exit_nonewsk:
1648 	dst_release(dst);
1649 exit:
1650 	tcp_listendrop(sk);
1651 	return NULL;
1652 put_and_exit:
1653 	newinet->inet_opt = NULL;
1654 	inet_csk_prepare_forced_close(newsk);
1655 	tcp_done(newsk);
1656 	goto exit;
1657 }
1658 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1659 
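/* Only a non-SYN segment can complete a syncookie handshake: try to rebuild
 * the dropped request from the cookie carried in its ACK.
 */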
1660 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1661 {
1662 #ifdef CONFIG_SYN_COOKIES
1663 	const struct tcphdr *th = tcp_hdr(skb);
1664 
1665 	if (!th->syn)
1666 		sk = cookie_v4_check(sk, skb);
1667 #endif
1668 	return sk;
1669 }
1670 
1671 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1672 			 struct tcphdr *th, u32 *cookie)
1673 {
1674 	u16 mss = 0;
1675 #ifdef CONFIG_SYN_COOKIES
1676 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1677 				    &tcp_request_sock_ipv4_ops, sk, th);
1678 	if (mss) {
1679 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1680 		tcp_synq_overflow(sk);
1681 	}
1682 #endif
1683 	return mss;
1684 }
1685 
1686 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1687 							   u32));
1688 /* The socket must have its spinlock held when we get
1689  * here, unless it is a TCP_LISTEN socket.
1690  *
1691  * We have a potential double-lock case here, so even when
1692  * doing backlog processing we use the BH locking scheme.
1693  * This is because we cannot sleep with the original spinlock
1694  * held.
1695  */
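/* Caller pattern (a sketch of what tcp_v4_rcv() below does; see the
 * real call site for the exact details):
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb))
 *		goto discard_and_relse;
 *	bh_unlock_sock(sk);
 */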
1696 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1697 {
1698 	struct sock *rsk;
1699 
1700 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1701 		struct dst_entry *dst = sk->sk_rx_dst;
1702 
1703 		sock_rps_save_rxhash(sk, skb);
1704 		sk_mark_napi_id(sk, skb);
1705 		if (dst) {
1706 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1707 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1708 					     dst, 0)) {
1709 				dst_release(dst);
1710 				sk->sk_rx_dst = NULL;
1711 			}
1712 		}
1713 		tcp_rcv_established(sk, skb);
1714 		return 0;
1715 	}
1716 
1717 	if (tcp_checksum_complete(skb))
1718 		goto csum_err;
1719 
1720 	if (sk->sk_state == TCP_LISTEN) {
1721 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1722 
1723 		if (!nsk)
1724 			goto discard;
1725 		if (nsk != sk) {
1726 			if (tcp_child_process(sk, nsk, skb)) {
1727 				rsk = nsk;
1728 				goto reset;
1729 			}
1730 			return 0;
1731 		}
1732 	} else
1733 		sock_rps_save_rxhash(sk, skb);
1734 
1735 	if (tcp_rcv_state_process(sk, skb)) {
1736 		rsk = sk;
1737 		goto reset;
1738 	}
1739 	return 0;
1740 
1741 reset:
1742 	tcp_v4_send_reset(rsk, skb);
1743 discard:
1744 	kfree_skb(skb);
1745 	/* Be careful here. If this function gets more complicated and
1746 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1747 	 * might be destroyed here. This current version compiles correctly,
1748 	 * but you have been warned.
1749 	 */
1750 	return 0;
1751 
1752 csum_err:
1753 	trace_tcp_bad_csum(skb);
1754 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1755 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1756 	goto discard;
1757 }
1758 EXPORT_SYMBOL(tcp_v4_do_rcv);
1759 
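/* Early demux: invoked from the IPv4 input path before the routing
 * decision.  If an ESTABLISHED socket matches the 4-tuple, attach it
 * (and its cached rx dst, when still valid) to the skb so the full
 * lookup in tcp_v4_rcv() and the route lookup can be skipped.
 */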
1760 int tcp_v4_early_demux(struct sk_buff *skb)
1761 {
1762 	const struct iphdr *iph;
1763 	const struct tcphdr *th;
1764 	struct sock *sk;
1765 
1766 	if (skb->pkt_type != PACKET_HOST)
1767 		return 0;
1768 
1769 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1770 		return 0;
1771 
1772 	iph = ip_hdr(skb);
1773 	th = tcp_hdr(skb);
1774 
1775 	if (th->doff < sizeof(struct tcphdr) / 4)
1776 		return 0;
1777 
1778 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1779 				       iph->saddr, th->source,
1780 				       iph->daddr, ntohs(th->dest),
1781 				       skb->skb_iif, inet_sdif(skb));
1782 	if (sk) {
1783 		skb->sk = sk;
1784 		skb->destructor = sock_edemux;
1785 		if (sk_fullsock(sk)) {
1786 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1787 
1788 			if (dst)
1789 				dst = dst_check(dst, 0);
1790 			if (dst &&
1791 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1792 				skb_dst_set_noref(skb, dst);
1793 		}
1794 	}
1795 	return 0;
1796 }
1797 
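/* Queue an skb onto the backlog of a socket currently owned by user
 * context.  Returns true when the skb could not be queued (checksum
 * error or backlog limit exceeded); in that case the socket has already
 * been unlocked and the caller is expected to drop the skb.  Returns
 * false when the skb was queued or coalesced into the backlog tail.
 */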
1798 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1799 {
1800 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1801 	u32 tail_gso_size, tail_gso_segs;
1802 	struct skb_shared_info *shinfo;
1803 	const struct tcphdr *th;
1804 	struct tcphdr *thtail;
1805 	struct sk_buff *tail;
1806 	unsigned int hdrlen;
1807 	bool fragstolen;
1808 	u32 gso_segs;
1809 	u32 gso_size;
1810 	int delta;
1811 
1812 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1813 	 * we can fix skb->truesize to its real value to avoid future drops.
1814 	 * This is valid because skb is not yet charged to the socket.
1815 	 * It has been observed that pure SACK packets were sometimes dropped
1816 	 * (when produced by drivers without the copybreak feature).
1817 	 */
1818 	skb_condense(skb);
1819 
1820 	skb_dst_drop(skb);
1821 
1822 	if (unlikely(tcp_checksum_complete(skb))) {
1823 		bh_unlock_sock(sk);
1824 		trace_tcp_bad_csum(skb);
1825 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1826 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1827 		return true;
1828 	}
1829 
1830 	/* Attempt coalescing to last skb in backlog, even if we are
1831 	 * above the limits.
1832 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1833 	 */
1834 	th = (const struct tcphdr *)skb->data;
1835 	hdrlen = th->doff * 4;
1836 
1837 	tail = sk->sk_backlog.tail;
1838 	if (!tail)
1839 		goto no_coalesce;
1840 	thtail = (struct tcphdr *)tail->data;
1841 
1842 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1843 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1844 	    ((TCP_SKB_CB(tail)->tcp_flags |
1845 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1846 	    !((TCP_SKB_CB(tail)->tcp_flags &
1847 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1848 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1849 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1850 #ifdef CONFIG_TLS_DEVICE
1851 	    tail->decrypted != skb->decrypted ||
1852 #endif
1853 	    thtail->doff != th->doff ||
1854 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1855 		goto no_coalesce;
1856 
1857 	__skb_pull(skb, hdrlen);
1858 
1859 	shinfo = skb_shinfo(skb);
1860 	gso_size = shinfo->gso_size ?: skb->len;
1861 	gso_segs = shinfo->gso_segs ?: 1;
1862 
1863 	shinfo = skb_shinfo(tail);
1864 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1865 	tail_gso_segs = shinfo->gso_segs ?: 1;
1866 
1867 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1868 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1869 
1870 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1871 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1872 			thtail->window = th->window;
1873 		}
1874 
1875 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1876 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1877 		 * is not entered if we append a packet with a FIN.
1878 		 * SYN, RST, URG are not present.
1879 		 * ACK is set on both packets.
1880 		 * PSH : we do not really care in TCP stack,
1881 		 *       at least for 'GRO' packets.
1882 		 */
1883 		thtail->fin |= th->fin;
1884 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1885 
1886 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1887 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1888 			tail->tstamp = skb->tstamp;
1889 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1890 		}
1891 
1892 		/* Not as strict as GRO. We only need to carry mss max value */
1893 		shinfo->gso_size = max(gso_size, tail_gso_size);
1894 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1895 
1896 		sk->sk_backlog.len += delta;
1897 		__NET_INC_STATS(sock_net(sk),
1898 				LINUX_MIB_TCPBACKLOGCOALESCE);
1899 		kfree_skb_partial(skb, fragstolen);
1900 		return false;
1901 	}
1902 	__skb_push(skb, hdrlen);
1903 
1904 no_coalesce:
1905 	/* Only the socket owner can try to collapse/prune rx queues
1906 	 * to reduce memory overhead, so add a little headroom here.
1907 	 * Only a few socket backlogs are likely to be non-empty at any time.
1908 	 */
1909 	limit += 64*1024;
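	/* Illustrative numbers: with sk_rcvbuf = sk_sndbuf = 1 MB, the
	 * backlog may grow to roughly 2 MB + 64 KB before segments are
	 * dropped and LINUX_MIB_TCPBACKLOGDROP is incremented below.
	 */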
1910 
1911 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1912 		bh_unlock_sock(sk);
1913 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1914 		return true;
1915 	}
1916 	return false;
1917 }
1918 EXPORT_SYMBOL(tcp_add_backlog);
1919 
1920 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1921 {
1922 	struct tcphdr *th = (struct tcphdr *)skb->data;
1923 
1924 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1925 }
1926 EXPORT_SYMBOL(tcp_filter);
1927 
1928 static void tcp_v4_restore_cb(struct sk_buff *skb)
1929 {
1930 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1931 		sizeof(struct inet_skb_parm));
1932 }
1933 
1934 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1935 			   const struct tcphdr *th)
1936 {
1937 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1938 	 * barrier() makes sure the compiler won't play aliasing games.
1939 	 */
1940 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1941 		sizeof(struct inet_skb_parm));
1942 	barrier();
1943 
1944 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1945 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1946 				    skb->len - th->doff * 4);
1947 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1948 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1949 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1950 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1951 	TCP_SKB_CB(skb)->sacked	 = 0;
1952 	TCP_SKB_CB(skb)->has_rxtstamp =
1953 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1954 }
1955 
1956 /*
1957  *	From tcp_input.c
1958  */
1959 
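/* Main IPv4 receive entry point, registered as the IPPROTO_TCP handler
 * in af_inet.c.  Runs in softirq context with no socket lock held on
 * entry.
 */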
1960 int tcp_v4_rcv(struct sk_buff *skb)
1961 {
1962 	struct net *net = dev_net(skb->dev);
1963 	struct sk_buff *skb_to_free;
1964 	int sdif = inet_sdif(skb);
1965 	int dif = inet_iif(skb);
1966 	const struct iphdr *iph;
1967 	const struct tcphdr *th;
1968 	bool refcounted;
1969 	struct sock *sk;
1970 	int ret;
1971 
1972 	if (skb->pkt_type != PACKET_HOST)
1973 		goto discard_it;
1974 
1975 	/* Count it even if it's bad */
1976 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1977 
1978 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1979 		goto discard_it;
1980 
1981 	th = (const struct tcphdr *)skb->data;
1982 
1983 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1984 		goto bad_packet;
1985 	if (!pskb_may_pull(skb, th->doff * 4))
1986 		goto discard_it;
1987 
1988 	/* An explanation is required here, I think.
1989 	 * Packet length and doff are validated by header prediction,
1990 	 * provided the case of th->doff == 0 is eliminated.
1991 	 * So, we defer the checks. */
1992 
1993 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1994 		goto csum_error;
1995 
1996 	th = (const struct tcphdr *)skb->data;
1997 	iph = ip_hdr(skb);
1998 lookup:
1999 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2000 			       th->dest, sdif, &refcounted);
2001 	if (!sk)
2002 		goto no_tcp_socket;
2003 
2004 process:
2005 	if (sk->sk_state == TCP_TIME_WAIT)
2006 		goto do_time_wait;
2007 
2008 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2009 		struct request_sock *req = inet_reqsk(sk);
2010 		bool req_stolen = false;
2011 		struct sock *nsk;
2012 
2013 		sk = req->rsk_listener;
2014 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2015 			sk_drops_add(sk, skb);
2016 			reqsk_put(req);
2017 			goto discard_it;
2018 		}
2019 		if (tcp_checksum_complete(skb)) {
2020 			reqsk_put(req);
2021 			goto csum_error;
2022 		}
2023 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2024 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2025 			if (!nsk) {
2026 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2027 				goto lookup;
2028 			}
2029 			sk = nsk;
2030 			/* reuseport_migrate_sock() has already held one sk_refcnt
2031 			 * before returning.
2032 			 */
2033 		} else {
2034 			/* We own a reference on the listener, increase it again
2035 			 * as we might lose it too soon.
2036 			 */
2037 			sock_hold(sk);
2038 		}
2039 		refcounted = true;
2040 		nsk = NULL;
2041 		if (!tcp_filter(sk, skb)) {
2042 			th = (const struct tcphdr *)skb->data;
2043 			iph = ip_hdr(skb);
2044 			tcp_v4_fill_cb(skb, iph, th);
2045 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2046 		}
2047 		if (!nsk) {
2048 			reqsk_put(req);
2049 			if (req_stolen) {
2050 				/* Another cpu got exclusive access to req
2051 				 * and created a full blown socket.
2052 				 * Try to feed this packet to this socket
2053 				 * instead of discarding it.
2054 				 */
2055 				tcp_v4_restore_cb(skb);
2056 				sock_put(sk);
2057 				goto lookup;
2058 			}
2059 			goto discard_and_relse;
2060 		}
2061 		if (nsk == sk) {
2062 			reqsk_put(req);
2063 			tcp_v4_restore_cb(skb);
2064 		} else if (tcp_child_process(sk, nsk, skb)) {
2065 			tcp_v4_send_reset(nsk, skb);
2066 			goto discard_and_relse;
2067 		} else {
2068 			sock_put(sk);
2069 			return 0;
2070 		}
2071 	}
2072 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2073 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2074 		goto discard_and_relse;
2075 	}
2076 
2077 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2078 		goto discard_and_relse;
2079 
2080 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2081 		goto discard_and_relse;
2082 
2083 	nf_reset_ct(skb);
2084 
2085 	if (tcp_filter(sk, skb))
2086 		goto discard_and_relse;
2087 	th = (const struct tcphdr *)skb->data;
2088 	iph = ip_hdr(skb);
2089 	tcp_v4_fill_cb(skb, iph, th);
2090 
2091 	skb->dev = NULL;
2092 
2093 	if (sk->sk_state == TCP_LISTEN) {
2094 		ret = tcp_v4_do_rcv(sk, skb);
2095 		goto put_and_return;
2096 	}
2097 
2098 	sk_incoming_cpu_update(sk);
2099 
2100 	bh_lock_sock_nested(sk);
2101 	tcp_segs_in(tcp_sk(sk), skb);
2102 	ret = 0;
2103 	if (!sock_owned_by_user(sk)) {
2104 		skb_to_free = sk->sk_rx_skb_cache;
2105 		sk->sk_rx_skb_cache = NULL;
2106 		ret = tcp_v4_do_rcv(sk, skb);
2107 	} else {
2108 		if (tcp_add_backlog(sk, skb))
2109 			goto discard_and_relse;
2110 		skb_to_free = NULL;
2111 	}
2112 	bh_unlock_sock(sk);
2113 	if (skb_to_free)
2114 		__kfree_skb(skb_to_free);
2115 
2116 put_and_return:
2117 	if (refcounted)
2118 		sock_put(sk);
2119 
2120 	return ret;
2121 
2122 no_tcp_socket:
2123 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2124 		goto discard_it;
2125 
2126 	tcp_v4_fill_cb(skb, iph, th);
2127 
2128 	if (tcp_checksum_complete(skb)) {
2129 csum_error:
2130 		trace_tcp_bad_csum(skb);
2131 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2132 bad_packet:
2133 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2134 	} else {
2135 		tcp_v4_send_reset(NULL, skb);
2136 	}
2137 
2138 discard_it:
2139 	/* Discard frame. */
2140 	kfree_skb(skb);
2141 	return 0;
2142 
2143 discard_and_relse:
2144 	sk_drops_add(sk, skb);
2145 	if (refcounted)
2146 		sock_put(sk);
2147 	goto discard_it;
2148 
2149 do_time_wait:
2150 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2151 		inet_twsk_put(inet_twsk(sk));
2152 		goto discard_it;
2153 	}
2154 
2155 	tcp_v4_fill_cb(skb, iph, th);
2156 
2157 	if (tcp_checksum_complete(skb)) {
2158 		inet_twsk_put(inet_twsk(sk));
2159 		goto csum_error;
2160 	}
2161 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2162 	case TCP_TW_SYN: {
2163 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2164 							&tcp_hashinfo, skb,
2165 							__tcp_hdrlen(th),
2166 							iph->saddr, th->source,
2167 							iph->daddr, th->dest,
2168 							inet_iif(skb),
2169 							sdif);
2170 		if (sk2) {
2171 			inet_twsk_deschedule_put(inet_twsk(sk));
2172 			sk = sk2;
2173 			tcp_v4_restore_cb(skb);
2174 			refcounted = false;
2175 			goto process;
2176 		}
2177 	}
2178 		/* to ACK */
2179 		fallthrough;
2180 	case TCP_TW_ACK:
2181 		tcp_v4_timewait_ack(sk, skb);
2182 		break;
2183 	case TCP_TW_RST:
2184 		tcp_v4_send_reset(sk, skb);
2185 		inet_twsk_deschedule_put(inet_twsk(sk));
2186 		goto discard_it;
2187 	case TCP_TW_SUCCESS:;
2188 	}
2189 	goto discard_it;
2190 }
2191 
2192 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2193 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2194 	.twsk_unique	= tcp_twsk_unique,
2195 	.twsk_destructor= tcp_twsk_destructor,
2196 };
2197 
2198 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2199 {
2200 	struct dst_entry *dst = skb_dst(skb);
2201 
2202 	if (dst && dst_hold_safe(dst)) {
2203 		sk->sk_rx_dst = dst;
2204 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2205 	}
2206 }
2207 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2208 
2209 const struct inet_connection_sock_af_ops ipv4_specific = {
2210 	.queue_xmit	   = ip_queue_xmit,
2211 	.send_check	   = tcp_v4_send_check,
2212 	.rebuild_header	   = inet_sk_rebuild_header,
2213 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2214 	.conn_request	   = tcp_v4_conn_request,
2215 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2216 	.net_header_len	   = sizeof(struct iphdr),
2217 	.setsockopt	   = ip_setsockopt,
2218 	.getsockopt	   = ip_getsockopt,
2219 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2220 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2221 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2222 };
2223 EXPORT_SYMBOL(ipv4_specific);
2224 
2225 #ifdef CONFIG_TCP_MD5SIG
2226 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2227 	.md5_lookup		= tcp_v4_md5_lookup,
2228 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2229 	.md5_parse		= tcp_v4_parse_md5_keys,
2230 };
2231 #endif
2232 
2233 /* NOTE: A lot of things are set to zero explicitly by the call to
2234  *       sk_alloc(), so they need not be done here.
2235  */
2236 static int tcp_v4_init_sock(struct sock *sk)
2237 {
2238 	struct inet_connection_sock *icsk = inet_csk(sk);
2239 
2240 	tcp_init_sock(sk);
2241 
2242 	icsk->icsk_af_ops = &ipv4_specific;
2243 
2244 #ifdef CONFIG_TCP_MD5SIG
2245 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2246 #endif
2247 
2248 	return 0;
2249 }
2250 
2251 void tcp_v4_destroy_sock(struct sock *sk)
2252 {
2253 	struct tcp_sock *tp = tcp_sk(sk);
2254 
2255 	trace_tcp_destroy_sock(sk);
2256 
2257 	tcp_clear_xmit_timers(sk);
2258 
2259 	tcp_cleanup_congestion_control(sk);
2260 
2261 	tcp_cleanup_ulp(sk);
2262 
2263 	/* Clean up the write buffer. */
2264 	tcp_write_queue_purge(sk);
2265 
2266 	/* Check if we want to disable active TFO */
2267 	tcp_fastopen_active_disable_ofo_check(sk);
2268 
2269 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2270 	skb_rbtree_purge(&tp->out_of_order_queue);
2271 
2272 #ifdef CONFIG_TCP_MD5SIG
2273 	/* Clean up the MD5 key list, if any */
2274 	if (tp->md5sig_info) {
2275 		tcp_clear_md5_list(sk);
2276 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2277 		tp->md5sig_info = NULL;
2278 	}
2279 #endif
2280 
2281 	/* Clean up a referenced TCP bind bucket. */
2282 	if (inet_csk(sk)->icsk_bind_hash)
2283 		inet_put_port(sk);
2284 
2285 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2286 
2287 	/* If socket is aborted during connect operation */
2288 	tcp_free_fastopen_req(tp);
2289 	tcp_fastopen_destroy_cipher(sk);
2290 	tcp_saved_syn_free(tp);
2291 
2292 	sk_sockets_allocated_dec(sk);
2293 }
2294 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2295 
2296 #ifdef CONFIG_PROC_FS
2297 /* Proc filesystem TCP sock list dumping. */
2298 
2299 static unsigned short seq_file_family(const struct seq_file *seq);
2300 
2301 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2302 {
2303 	unsigned short family = seq_file_family(seq);
2304 
2305 	/* AF_UNSPEC is used as a match all */
2306 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2307 		net_eq(sock_net(sk), seq_file_net(seq)));
2308 }
2309 
2310 /* Find a non-empty bucket (starting from st->bucket)
2311  * and return the first sk from it.
2312  */
2313 static void *listening_get_first(struct seq_file *seq)
2314 {
2315 	struct tcp_iter_state *st = seq->private;
2316 
2317 	st->offset = 0;
2318 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2319 		struct inet_listen_hashbucket *ilb2;
2320 		struct inet_connection_sock *icsk;
2321 		struct sock *sk;
2322 
2323 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2324 		if (hlist_empty(&ilb2->head))
2325 			continue;
2326 
2327 		spin_lock(&ilb2->lock);
2328 		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2329 			sk = (struct sock *)icsk;
2330 			if (seq_sk_match(seq, sk))
2331 				return sk;
2332 		}
2333 		spin_unlock(&ilb2->lock);
2334 	}
2335 
2336 	return NULL;
2337 }
2338 
2339 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2340  * If "cur" is the last one in the st->bucket,
2341  * call listening_get_first() to return the first sk of the next
2342  * non-empty bucket.
2343  */
2344 static void *listening_get_next(struct seq_file *seq, void *cur)
2345 {
2346 	struct tcp_iter_state *st = seq->private;
2347 	struct inet_listen_hashbucket *ilb2;
2348 	struct inet_connection_sock *icsk;
2349 	struct sock *sk = cur;
2350 
2351 	++st->num;
2352 	++st->offset;
2353 
2354 	icsk = inet_csk(sk);
2355 	inet_lhash2_for_each_icsk_continue(icsk) {
2356 		sk = (struct sock *)icsk;
2357 		if (seq_sk_match(seq, sk))
2358 			return sk;
2359 	}
2360 
2361 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2362 	spin_unlock(&ilb2->lock);
2363 	++st->bucket;
2364 	return listening_get_first(seq);
2365 }
2366 
2367 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2368 {
2369 	struct tcp_iter_state *st = seq->private;
2370 	void *rc;
2371 
2372 	st->bucket = 0;
2373 	st->offset = 0;
2374 	rc = listening_get_first(seq);
2375 
2376 	while (rc && *pos) {
2377 		rc = listening_get_next(seq, rc);
2378 		--*pos;
2379 	}
2380 	return rc;
2381 }
2382 
2383 static inline bool empty_bucket(const struct tcp_iter_state *st)
2384 {
2385 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2386 }
2387 
2388 /*
2389  * Get first established socket starting from bucket given in st->bucket.
2390  * If st->bucket is zero, the very first socket in the hash is returned.
2391  */
2392 static void *established_get_first(struct seq_file *seq)
2393 {
2394 	struct tcp_iter_state *st = seq->private;
2395 
2396 	st->offset = 0;
2397 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2398 		struct sock *sk;
2399 		struct hlist_nulls_node *node;
2400 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2401 
2402 		/* Lockless fast path for the common case of empty buckets */
2403 		if (empty_bucket(st))
2404 			continue;
2405 
2406 		spin_lock_bh(lock);
2407 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2408 			if (seq_sk_match(seq, sk))
2409 				return sk;
2410 		}
2411 		spin_unlock_bh(lock);
2412 	}
2413 
2414 	return NULL;
2415 }
2416 
2417 static void *established_get_next(struct seq_file *seq, void *cur)
2418 {
2419 	struct sock *sk = cur;
2420 	struct hlist_nulls_node *node;
2421 	struct tcp_iter_state *st = seq->private;
2422 
2423 	++st->num;
2424 	++st->offset;
2425 
2426 	sk = sk_nulls_next(sk);
2427 
2428 	sk_nulls_for_each_from(sk, node) {
2429 		if (seq_sk_match(seq, sk))
2430 			return sk;
2431 	}
2432 
2433 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2434 	++st->bucket;
2435 	return established_get_first(seq);
2436 }
2437 
2438 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2439 {
2440 	struct tcp_iter_state *st = seq->private;
2441 	void *rc;
2442 
2443 	st->bucket = 0;
2444 	rc = established_get_first(seq);
2445 
2446 	while (rc && pos) {
2447 		rc = established_get_next(seq, rc);
2448 		--pos;
2449 	}
2450 	return rc;
2451 }
2452 
2453 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2454 {
2455 	void *rc;
2456 	struct tcp_iter_state *st = seq->private;
2457 
2458 	st->state = TCP_SEQ_STATE_LISTENING;
2459 	rc	  = listening_get_idx(seq, &pos);
2460 
2461 	if (!rc) {
2462 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2463 		rc	  = established_get_idx(seq, pos);
2464 	}
2465 
2466 	return rc;
2467 }
2468 
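/* Resume a previous walk: re-locate the socket at st->offset within
 * st->bucket (both saved by earlier start()/next() calls), or return
 * NULL when that position no longer exists so the caller restarts the
 * scan from the beginning.
 */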
2469 static void *tcp_seek_last_pos(struct seq_file *seq)
2470 {
2471 	struct tcp_iter_state *st = seq->private;
2472 	int bucket = st->bucket;
2473 	int offset = st->offset;
2474 	int orig_num = st->num;
2475 	void *rc = NULL;
2476 
2477 	switch (st->state) {
2478 	case TCP_SEQ_STATE_LISTENING:
2479 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2480 			break;
2481 		st->state = TCP_SEQ_STATE_LISTENING;
2482 		rc = listening_get_first(seq);
2483 		while (offset-- && rc && bucket == st->bucket)
2484 			rc = listening_get_next(seq, rc);
2485 		if (rc)
2486 			break;
2487 		st->bucket = 0;
2488 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2489 		fallthrough;
2490 	case TCP_SEQ_STATE_ESTABLISHED:
2491 		if (st->bucket > tcp_hashinfo.ehash_mask)
2492 			break;
2493 		rc = established_get_first(seq);
2494 		while (offset-- && rc && bucket == st->bucket)
2495 			rc = established_get_next(seq, rc);
2496 	}
2497 
2498 	st->num = orig_num;
2499 
2500 	return rc;
2501 }
2502 
2503 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2504 {
2505 	struct tcp_iter_state *st = seq->private;
2506 	void *rc;
2507 
2508 	if (*pos && *pos == st->last_pos) {
2509 		rc = tcp_seek_last_pos(seq);
2510 		if (rc)
2511 			goto out;
2512 	}
2513 
2514 	st->state = TCP_SEQ_STATE_LISTENING;
2515 	st->num = 0;
2516 	st->bucket = 0;
2517 	st->offset = 0;
2518 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2519 
2520 out:
2521 	st->last_pos = *pos;
2522 	return rc;
2523 }
2524 EXPORT_SYMBOL(tcp_seq_start);
2525 
2526 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527 {
2528 	struct tcp_iter_state *st = seq->private;
2529 	void *rc = NULL;
2530 
2531 	if (v == SEQ_START_TOKEN) {
2532 		rc = tcp_get_idx(seq, 0);
2533 		goto out;
2534 	}
2535 
2536 	switch (st->state) {
2537 	case TCP_SEQ_STATE_LISTENING:
2538 		rc = listening_get_next(seq, v);
2539 		if (!rc) {
2540 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2541 			st->bucket = 0;
2542 			st->offset = 0;
2543 			rc	  = established_get_first(seq);
2544 		}
2545 		break;
2546 	case TCP_SEQ_STATE_ESTABLISHED:
2547 		rc = established_get_next(seq, v);
2548 		break;
2549 	}
2550 out:
2551 	++*pos;
2552 	st->last_pos = *pos;
2553 	return rc;
2554 }
2555 EXPORT_SYMBOL(tcp_seq_next);
2556 
2557 void tcp_seq_stop(struct seq_file *seq, void *v)
2558 {
2559 	struct tcp_iter_state *st = seq->private;
2560 
2561 	switch (st->state) {
2562 	case TCP_SEQ_STATE_LISTENING:
2563 		if (v != SEQ_START_TOKEN)
2564 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2565 		break;
2566 	case TCP_SEQ_STATE_ESTABLISHED:
2567 		if (v)
2568 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2569 		break;
2570 	}
2571 }
2572 EXPORT_SYMBOL(tcp_seq_stop);
2573 
2574 static void get_openreq4(const struct request_sock *req,
2575 			 struct seq_file *f, int i)
2576 {
2577 	const struct inet_request_sock *ireq = inet_rsk(req);
2578 	long delta = req->rsk_timer.expires - jiffies;
2579 
2580 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2581 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2582 		i,
2583 		ireq->ir_loc_addr,
2584 		ireq->ir_num,
2585 		ireq->ir_rmt_addr,
2586 		ntohs(ireq->ir_rmt_port),
2587 		TCP_SYN_RECV,
2588 		0, 0, /* could print option size, but that is af dependent. */
2589 		1,    /* timers active (only the expire timer) */
2590 		jiffies_delta_to_clock_t(delta),
2591 		req->num_timeout,
2592 		from_kuid_munged(seq_user_ns(f),
2593 				 sock_i_uid(req->rsk_listener)),
2594 		0,  /* non standard timer */
2595 		0, /* open_requests have no inode */
2596 		0,
2597 		req);
2598 }
2599 
2600 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2601 {
2602 	int timer_active;
2603 	unsigned long timer_expires;
2604 	const struct tcp_sock *tp = tcp_sk(sk);
2605 	const struct inet_connection_sock *icsk = inet_csk(sk);
2606 	const struct inet_sock *inet = inet_sk(sk);
2607 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2608 	__be32 dest = inet->inet_daddr;
2609 	__be32 src = inet->inet_rcv_saddr;
2610 	__u16 destp = ntohs(inet->inet_dport);
2611 	__u16 srcp = ntohs(inet->inet_sport);
2612 	int rx_queue;
2613 	int state;
2614 
2615 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2616 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2617 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2618 		timer_active	= 1;
2619 		timer_expires	= icsk->icsk_timeout;
2620 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2621 		timer_active	= 4;
2622 		timer_expires	= icsk->icsk_timeout;
2623 	} else if (timer_pending(&sk->sk_timer)) {
2624 		timer_active	= 2;
2625 		timer_expires	= sk->sk_timer.expires;
2626 	} else {
2627 		timer_active	= 0;
2628 		timer_expires = jiffies;
2629 	}
2630 
2631 	state = inet_sk_state_load(sk);
2632 	if (state == TCP_LISTEN)
2633 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2634 	else
2635 		/* Because we don't lock the socket,
2636 		 * we might find a transient negative value.
2637 		 */
2638 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2639 				      READ_ONCE(tp->copied_seq), 0);
2640 
2641 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2642 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2643 		i, src, srcp, dest, destp, state,
2644 		READ_ONCE(tp->write_seq) - tp->snd_una,
2645 		rx_queue,
2646 		timer_active,
2647 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2648 		icsk->icsk_retransmits,
2649 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2650 		icsk->icsk_probes_out,
2651 		sock_i_ino(sk),
2652 		refcount_read(&sk->sk_refcnt), sk,
2653 		jiffies_to_clock_t(icsk->icsk_rto),
2654 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2655 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2656 		tp->snd_cwnd,
2657 		state == TCP_LISTEN ?
2658 		    fastopenq->max_qlen :
2659 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2660 }
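/* In the output above, addresses are the raw __be32 values printed as
 * hex and ports are printed in host byte order, so e.g. 127.0.0.1:3306
 * appears as "0100007F:0CEA" in /proc/net/tcp on a little-endian
 * machine (illustrative example, not captured output).
 */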
2661 
2662 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2663 			       struct seq_file *f, int i)
2664 {
2665 	long delta = tw->tw_timer.expires - jiffies;
2666 	__be32 dest, src;
2667 	__u16 destp, srcp;
2668 
2669 	dest  = tw->tw_daddr;
2670 	src   = tw->tw_rcv_saddr;
2671 	destp = ntohs(tw->tw_dport);
2672 	srcp  = ntohs(tw->tw_sport);
2673 
2674 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2675 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2676 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2677 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2678 		refcount_read(&tw->tw_refcnt), tw);
2679 }
2680 
2681 #define TMPSZ 150
2682 
2683 static int tcp4_seq_show(struct seq_file *seq, void *v)
2684 {
2685 	struct tcp_iter_state *st;
2686 	struct sock *sk = v;
2687 
2688 	seq_setwidth(seq, TMPSZ - 1);
2689 	if (v == SEQ_START_TOKEN) {
2690 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2691 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2692 			   "inode");
2693 		goto out;
2694 	}
2695 	st = seq->private;
2696 
2697 	if (sk->sk_state == TCP_TIME_WAIT)
2698 		get_timewait4_sock(v, seq, st->num);
2699 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2700 		get_openreq4(v, seq, st->num);
2701 	else
2702 		get_tcp4_sock(v, seq, st->num);
2703 out:
2704 	seq_pad(seq, '\n');
2705 	return 0;
2706 }
2707 
2708 #ifdef CONFIG_BPF_SYSCALL
2709 struct bpf_tcp_iter_state {
2710 	struct tcp_iter_state state;
2711 	unsigned int cur_sk;
2712 	unsigned int end_sk;
2713 	unsigned int max_sk;
2714 	struct sock **batch;
2715 	bool st_bucket_done;
2716 };
2717 
2718 struct bpf_iter__tcp {
2719 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2720 	__bpf_md_ptr(struct sock_common *, sk_common);
2721 	uid_t uid __aligned(8);
2722 };
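/* A minimal BPF-side consumer of this context (sketch, assuming the
 * usual libbpf conventions; not part of this file):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (skc)
 *			BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u\n", ctx->uid);
 *		return 0;
 *	}
 */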
2723 
2724 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2725 			     struct sock_common *sk_common, uid_t uid)
2726 {
2727 	struct bpf_iter__tcp ctx;
2728 
2729 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2730 	ctx.meta = meta;
2731 	ctx.sk_common = sk_common;
2732 	ctx.uid = uid;
2733 	return bpf_iter_run_prog(prog, &ctx);
2734 }
2735 
2736 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2737 {
2738 	while (iter->cur_sk < iter->end_sk)
2739 		sock_put(iter->batch[iter->cur_sk++]);
2740 }
2741 
2742 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2743 				      unsigned int new_batch_sz)
2744 {
2745 	struct sock **new_batch;
2746 
2747 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2748 			     GFP_USER | __GFP_NOWARN);
2749 	if (!new_batch)
2750 		return -ENOMEM;
2751 
2752 	bpf_iter_tcp_put_batch(iter);
2753 	kvfree(iter->batch);
2754 	iter->batch = new_batch;
2755 	iter->max_sk = new_batch_sz;
2756 
2757 	return 0;
2758 }
2759 
2760 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2761 						 struct sock *start_sk)
2762 {
2763 	struct bpf_tcp_iter_state *iter = seq->private;
2764 	struct tcp_iter_state *st = &iter->state;
2765 	struct inet_connection_sock *icsk;
2766 	unsigned int expected = 1;
2767 	struct sock *sk;
2768 
2769 	sock_hold(start_sk);
2770 	iter->batch[iter->end_sk++] = start_sk;
2771 
2772 	icsk = inet_csk(start_sk);
2773 	inet_lhash2_for_each_icsk_continue(icsk) {
2774 		sk = (struct sock *)icsk;
2775 		if (seq_sk_match(seq, sk)) {
2776 			if (iter->end_sk < iter->max_sk) {
2777 				sock_hold(sk);
2778 				iter->batch[iter->end_sk++] = sk;
2779 			}
2780 			expected++;
2781 		}
2782 	}
2783 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2784 
2785 	return expected;
2786 }
2787 
2788 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2789 						   struct sock *start_sk)
2790 {
2791 	struct bpf_tcp_iter_state *iter = seq->private;
2792 	struct tcp_iter_state *st = &iter->state;
2793 	struct hlist_nulls_node *node;
2794 	unsigned int expected = 1;
2795 	struct sock *sk;
2796 
2797 	sock_hold(start_sk);
2798 	iter->batch[iter->end_sk++] = start_sk;
2799 
2800 	sk = sk_nulls_next(start_sk);
2801 	sk_nulls_for_each_from(sk, node) {
2802 		if (seq_sk_match(seq, sk)) {
2803 			if (iter->end_sk < iter->max_sk) {
2804 				sock_hold(sk);
2805 				iter->batch[iter->end_sk++] = sk;
2806 			}
2807 			expected++;
2808 		}
2809 	}
2810 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2811 
2812 	return expected;
2813 }
2814 
2815 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2816 {
2817 	struct bpf_tcp_iter_state *iter = seq->private;
2818 	struct tcp_iter_state *st = &iter->state;
2819 	unsigned int expected;
2820 	bool resized = false;
2821 	struct sock *sk;
2822 
2823 	/* The st->bucket is done.  Directly advance to the next
2824 	 * bucket instead of having tcp_seek_last_pos() skip sockets
2825 	 * one by one in the current bucket, only to find out that it
2826 	 * has to advance to the next bucket anyway.
2827 	 */
2828 	if (iter->st_bucket_done) {
2829 		st->offset = 0;
2830 		st->bucket++;
2831 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2832 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2833 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2834 			st->bucket = 0;
2835 		}
2836 	}
2837 
2838 again:
2839 	/* Get a new batch */
2840 	iter->cur_sk = 0;
2841 	iter->end_sk = 0;
2842 	iter->st_bucket_done = false;
2843 
2844 	sk = tcp_seek_last_pos(seq);
2845 	if (!sk)
2846 		return NULL; /* Done */
2847 
2848 	if (st->state == TCP_SEQ_STATE_LISTENING)
2849 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2850 	else
2851 		expected = bpf_iter_tcp_established_batch(seq, sk);
2852 
2853 	if (iter->end_sk == expected) {
2854 		iter->st_bucket_done = true;
2855 		return sk;
2856 	}
2857 
2858 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2859 		resized = true;
2860 		goto again;
2861 	}
2862 
2863 	return sk;
2864 }
2865 
2866 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2867 {
2868 	/* bpf iter does not support lseek, so it always
2869 	 * continues from where it was stop()-ped.
2870 	 */
2871 	if (*pos)
2872 		return bpf_iter_tcp_batch(seq);
2873 
2874 	return SEQ_START_TOKEN;
2875 }
2876 
2877 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2878 {
2879 	struct bpf_tcp_iter_state *iter = seq->private;
2880 	struct tcp_iter_state *st = &iter->state;
2881 	struct sock *sk;
2882 
2883 	/* Whenever seq_next() is called, iter->cur_sk has already
2884 	 * been shown by seq_show(), so advance to the next sk in
2885 	 * the batch.
2886 	 */
2887 	if (iter->cur_sk < iter->end_sk) {
2888 		/* Keeping st->num consistent in tcp_iter_state.
2889 		 * bpf_iter_tcp does not use st->num.
2890 		 * meta.seq_num is used instead.
2891 		 */
2892 		st->num++;
2893 		/* Move st->offset to the next sk in the bucket such that
2894 		 * the future start() will resume at st->offset in
2895 		 * st->bucket.  See tcp_seek_last_pos().
2896 		 */
2897 		st->offset++;
2898 		sock_put(iter->batch[iter->cur_sk++]);
2899 	}
2900 
2901 	if (iter->cur_sk < iter->end_sk)
2902 		sk = iter->batch[iter->cur_sk];
2903 	else
2904 		sk = bpf_iter_tcp_batch(seq);
2905 
2906 	++*pos;
2907 	/* Keeping st->last_pos consistent in tcp_iter_state.
2908 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2909 	 */
2910 	st->last_pos = *pos;
2911 	return sk;
2912 }
2913 
2914 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2915 {
2916 	struct bpf_iter_meta meta;
2917 	struct bpf_prog *prog;
2918 	struct sock *sk = v;
2919 	bool slow;
2920 	uid_t uid;
2921 	int ret;
2922 
2923 	if (v == SEQ_START_TOKEN)
2924 		return 0;
2925 
2926 	if (sk_fullsock(sk))
2927 		slow = lock_sock_fast(sk);
2928 
2929 	if (unlikely(sk_unhashed(sk))) {
2930 		ret = SEQ_SKIP;
2931 		goto unlock;
2932 	}
2933 
2934 	if (sk->sk_state == TCP_TIME_WAIT) {
2935 		uid = 0;
2936 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2937 		const struct request_sock *req = v;
2938 
2939 		uid = from_kuid_munged(seq_user_ns(seq),
2940 				       sock_i_uid(req->rsk_listener));
2941 	} else {
2942 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2943 	}
2944 
2945 	meta.seq = seq;
2946 	prog = bpf_iter_get_info(&meta, false);
2947 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2948 
2949 unlock:
2950 	if (sk_fullsock(sk))
2951 		unlock_sock_fast(sk, slow);
2952 	return ret;
2953 
2954 }
2955 
2956 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2957 {
2958 	struct bpf_tcp_iter_state *iter = seq->private;
2959 	struct bpf_iter_meta meta;
2960 	struct bpf_prog *prog;
2961 
2962 	if (!v) {
2963 		meta.seq = seq;
2964 		prog = bpf_iter_get_info(&meta, true);
2965 		if (prog)
2966 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2967 	}
2968 
2969 	if (iter->cur_sk < iter->end_sk) {
2970 		bpf_iter_tcp_put_batch(iter);
2971 		iter->st_bucket_done = false;
2972 	}
2973 }
2974 
2975 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2976 	.show		= bpf_iter_tcp_seq_show,
2977 	.start		= bpf_iter_tcp_seq_start,
2978 	.next		= bpf_iter_tcp_seq_next,
2979 	.stop		= bpf_iter_tcp_seq_stop,
2980 };
2981 #endif
2982 static unsigned short seq_file_family(const struct seq_file *seq)
2983 {
2984 	const struct tcp_seq_afinfo *afinfo;
2985 
2986 #ifdef CONFIG_BPF_SYSCALL
2987 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2988 	if (seq->op == &bpf_iter_tcp_seq_ops)
2989 		return AF_UNSPEC;
2990 #endif
2991 
2992 	/* Iterated from proc fs */
2993 	afinfo = PDE_DATA(file_inode(seq->file));
2994 	return afinfo->family;
2995 }
2996 
2997 static const struct seq_operations tcp4_seq_ops = {
2998 	.show		= tcp4_seq_show,
2999 	.start		= tcp_seq_start,
3000 	.next		= tcp_seq_next,
3001 	.stop		= tcp_seq_stop,
3002 };
3003 
3004 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3005 	.family		= AF_INET,
3006 };
3007 
3008 static int __net_init tcp4_proc_init_net(struct net *net)
3009 {
3010 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3011 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3012 		return -ENOMEM;
3013 	return 0;
3014 }
3015 
3016 static void __net_exit tcp4_proc_exit_net(struct net *net)
3017 {
3018 	remove_proc_entry("tcp", net->proc_net);
3019 }
3020 
3021 static struct pernet_operations tcp4_net_ops = {
3022 	.init = tcp4_proc_init_net,
3023 	.exit = tcp4_proc_exit_net,
3024 };
3025 
3026 int __init tcp4_proc_init(void)
3027 {
3028 	return register_pernet_subsys(&tcp4_net_ops);
3029 }
3030 
3031 void tcp4_proc_exit(void)
3032 {
3033 	unregister_pernet_subsys(&tcp4_net_ops);
3034 }
3035 #endif /* CONFIG_PROC_FS */
3036 
3037 /* @wake is one when sk_stream_write_space() calls us.
3038  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3039  * This mimics the strategy used in sock_def_write_space().
3040  */
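/* Worked example (illustrative values): with tcp_notsent_lowat set to
 * 128 KB and wake == 1, writable space is signalled once fewer than
 * 64 KB remain unsent, because the byte count is doubled before the
 * comparison below.
 */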
3041 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3042 {
3043 	const struct tcp_sock *tp = tcp_sk(sk);
3044 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3045 			    READ_ONCE(tp->snd_nxt);
3046 
3047 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3048 }
3049 EXPORT_SYMBOL(tcp_stream_memory_free);
3050 
3051 struct proto tcp_prot = {
3052 	.name			= "TCP",
3053 	.owner			= THIS_MODULE,
3054 	.close			= tcp_close,
3055 	.pre_connect		= tcp_v4_pre_connect,
3056 	.connect		= tcp_v4_connect,
3057 	.disconnect		= tcp_disconnect,
3058 	.accept			= inet_csk_accept,
3059 	.ioctl			= tcp_ioctl,
3060 	.init			= tcp_v4_init_sock,
3061 	.destroy		= tcp_v4_destroy_sock,
3062 	.shutdown		= tcp_shutdown,
3063 	.setsockopt		= tcp_setsockopt,
3064 	.getsockopt		= tcp_getsockopt,
3065 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3066 	.keepalive		= tcp_set_keepalive,
3067 	.recvmsg		= tcp_recvmsg,
3068 	.sendmsg		= tcp_sendmsg,
3069 	.sendpage		= tcp_sendpage,
3070 	.backlog_rcv		= tcp_v4_do_rcv,
3071 	.release_cb		= tcp_release_cb,
3072 	.hash			= inet_hash,
3073 	.unhash			= inet_unhash,
3074 	.get_port		= inet_csk_get_port,
3075 #ifdef CONFIG_BPF_SYSCALL
3076 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3077 #endif
3078 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3079 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3080 	.stream_memory_free	= tcp_stream_memory_free,
3081 	.sockets_allocated	= &tcp_sockets_allocated,
3082 	.orphan_count		= &tcp_orphan_count,
3083 	.memory_allocated	= &tcp_memory_allocated,
3084 	.memory_pressure	= &tcp_memory_pressure,
3085 	.sysctl_mem		= sysctl_tcp_mem,
3086 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3087 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3088 	.max_header		= MAX_TCP_HEADER,
3089 	.obj_size		= sizeof(struct tcp_sock),
3090 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3091 	.twsk_prot		= &tcp_timewait_sock_ops,
3092 	.rsk_prot		= &tcp_request_sock_ops,
3093 	.h.hashinfo		= &tcp_hashinfo,
3094 	.no_autobind		= true,
3095 	.diag_destroy		= tcp_abort,
3096 };
3097 EXPORT_SYMBOL(tcp_prot);
3098 
3099 static void __net_exit tcp_sk_exit(struct net *net)
3100 {
3101 	int cpu;
3102 
3103 	if (net->ipv4.tcp_congestion_control)
3104 		bpf_module_put(net->ipv4.tcp_congestion_control,
3105 			       net->ipv4.tcp_congestion_control->owner);
3106 
3107 	for_each_possible_cpu(cpu)
3108 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3109 	free_percpu(net->ipv4.tcp_sk);
3110 }
3111 
3112 static int __net_init tcp_sk_init(struct net *net)
3113 {
3114 	int res, cpu, cnt;
3115 
3116 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3117 	if (!net->ipv4.tcp_sk)
3118 		return -ENOMEM;
3119 
3120 	for_each_possible_cpu(cpu) {
3121 		struct sock *sk;
3122 
3123 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3124 					   IPPROTO_TCP, net);
3125 		if (res)
3126 			goto fail;
3127 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3128 
3129 		/* Please enforce IP_DF and IPID==0 for RST and
3130 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3131 		 */
3132 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3133 
3134 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3135 	}
3136 
3137 	net->ipv4.sysctl_tcp_ecn = 2;
3138 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3139 
3140 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3141 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3142 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3143 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3144 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3145 
3146 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3147 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3148 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3149 
3150 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3151 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3152 	net->ipv4.sysctl_tcp_syncookies = 1;
3153 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3154 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3155 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3156 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3157 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3158 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3159 	net->ipv4.sysctl_tcp_tw_reuse = 2;
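	/* tw_reuse == 2: reuse TIME-WAIT sockets for new outgoing connections
	 * on loopback addresses only (see Documentation/networking/ip-sysctl.rst).
	 */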
3160 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3161 
3162 	cnt = tcp_hashinfo.ehash_mask + 1;
3163 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3164 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3165 
3166 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3167 	net->ipv4.sysctl_tcp_sack = 1;
3168 	net->ipv4.sysctl_tcp_window_scaling = 1;
3169 	net->ipv4.sysctl_tcp_timestamps = 1;
3170 	net->ipv4.sysctl_tcp_early_retrans = 3;
3171 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3172 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3173 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3174 	net->ipv4.sysctl_tcp_max_reordering = 300;
3175 	net->ipv4.sysctl_tcp_dsack = 1;
3176 	net->ipv4.sysctl_tcp_app_win = 31;
3177 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3178 	net->ipv4.sysctl_tcp_frto = 2;
3179 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3180 	/* This limits the percentage of the congestion window which we
3181 	 * will allow a single TSO frame to consume.  Building TSO frames
3182 	 * which are too large can cause TCP streams to be bursty.
3183 	 */
3184 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
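	/* e.g. with the default of 3 set above, a single TSO frame may
	 * consume at most roughly one third of the current congestion
	 * window (illustrative).
	 */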
3185 	/* Default TSQ limit of 16 TSO segments */
3186 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3187 	/* rfc5961 challenge ack rate limiting */
3188 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3189 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3190 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3191 	net->ipv4.sysctl_tcp_autocorking = 1;
3192 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3193 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3194 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3195 	if (net != &init_net) {
3196 		memcpy(net->ipv4.sysctl_tcp_rmem,
3197 		       init_net.ipv4.sysctl_tcp_rmem,
3198 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3199 		memcpy(net->ipv4.sysctl_tcp_wmem,
3200 		       init_net.ipv4.sysctl_tcp_wmem,
3201 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202 	}
3203 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3204 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3205 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3206 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3207 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3208 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3209 
3210 	/* Reno is always built in */
3211 	if (!net_eq(net, &init_net) &&
3212 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3213 			       init_net.ipv4.tcp_congestion_control->owner))
3214 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215 	else
3216 		net->ipv4.tcp_congestion_control = &tcp_reno;
3217 
3218 	return 0;
3219 fail:
3220 	tcp_sk_exit(net);
3221 
3222 	return res;
3223 }
3224 
3225 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3226 {
3227 	struct net *net;
3228 
3229 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3230 
3231 	list_for_each_entry(net, net_exit_list, exit_list)
3232 		tcp_fastopen_ctx_destroy(net);
3233 }
3234 
3235 static struct pernet_operations __net_initdata tcp_sk_ops = {
3236        .init	   = tcp_sk_init,
3237        .exit	   = tcp_sk_exit,
3238        .exit_batch = tcp_sk_exit_batch,
3239 };
3240 
3241 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3242 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3243 		     struct sock_common *sk_common, uid_t uid)
3244 
3245 #define INIT_BATCH_SZ 16
3246 
3247 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3248 {
3249 	struct bpf_tcp_iter_state *iter = priv_data;
3250 	int err;
3251 
3252 	err = bpf_iter_init_seq_net(priv_data, aux);
3253 	if (err)
3254 		return err;
3255 
3256 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3257 	if (err) {
3258 		bpf_iter_fini_seq_net(priv_data);
3259 		return err;
3260 	}
3261 
3262 	return 0;
3263 }
3264 
3265 static void bpf_iter_fini_tcp(void *priv_data)
3266 {
3267 	struct bpf_tcp_iter_state *iter = priv_data;
3268 
3269 	bpf_iter_fini_seq_net(priv_data);
3270 	kvfree(iter->batch);
3271 }
3272 
3273 static const struct bpf_iter_seq_info tcp_seq_info = {
3274 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3275 	.init_seq_private	= bpf_iter_init_tcp,
3276 	.fini_seq_private	= bpf_iter_fini_tcp,
3277 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3278 };
3279 
3280 static const struct bpf_func_proto *
3281 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3282 			    const struct bpf_prog *prog)
3283 {
3284 	switch (func_id) {
3285 	case BPF_FUNC_setsockopt:
3286 		return &bpf_sk_setsockopt_proto;
3287 	case BPF_FUNC_getsockopt:
3288 		return &bpf_sk_getsockopt_proto;
3289 	default:
3290 		return NULL;
3291 	}
3292 }
3293 
3294 static struct bpf_iter_reg tcp_reg_info = {
3295 	.target			= "tcp",
3296 	.ctx_arg_info_size	= 1,
3297 	.ctx_arg_info		= {
3298 		{ offsetof(struct bpf_iter__tcp, sk_common),
3299 		  PTR_TO_BTF_ID_OR_NULL },
3300 	},
3301 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3302 	.seq_info		= &tcp_seq_info,
3303 };
3304 
3305 static void __init bpf_iter_register(void)
3306 {
3307 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3308 	if (bpf_iter_reg_target(&tcp_reg_info))
3309 		pr_warn("Warning: could not register bpf iterator tcp\n");
3310 }
3311 
3312 #endif
3313 
3314 void __init tcp_v4_init(void)
3315 {
3316 	if (register_pernet_subsys(&tcp_sk_ops))
3317 		panic("Failed to create the TCP control socket.\n");
3318 
3319 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3320 	bpf_iter_register();
3321 #endif
3322 }
3323